Use native CRC instructions on 64-bit LoongArch
authorJohn Naylor <john.naylor@postgresql.org>
Thu, 10 Aug 2023 04:36:15 +0000 (11:36 +0700)
committerJohn Naylor <john.naylor@postgresql.org>
Thu, 10 Aug 2023 04:36:15 +0000 (11:36 +0700)
As with the Intel and Arm CRC instructions, compiler intrinsics for
them must be supported by the compiler. In contrast, no runtime check
is needed. Aligned memory access is faster, so use the Arm coding as
a model.

YANG Xudong

Discussion: https://postgr.es/m/b522a0c5-e3b2-99cc-6387-58134fb88cbe%40ymatrix.cn

config/c-compiler.m4
configure
configure.ac
meson.build
src/include/pg_config.h.in
src/include/port/pg_crc32c.h
src/port/meson.build
src/port/pg_crc32c_loongarch.c [new file with mode: 0644]

index 5be8f0f08dcb53ae50bd6b9abe22cb42210680ed..5db02b2ab75de609ccfb888f96be0dc68e40a808 100644 (file)
@@ -661,3 +661,36 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
+
+# PGAC_LOONGARCH_CRC32C_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the LoongArch CRCC instructions, using
+# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
+# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
+# intrinsic functions.
+#
+# We test for the 8-byte variant since platforms capable of running
+# Postgres are 64-bit only (as of PG17), and we know CRC instructions
+# are available there without a runtime check.
+#
+# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics.
+AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics])])dnl
+AC_CACHE_CHECK(
+  [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w],
+  [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+  [unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_LOONGARCH_CRC32C_INTRINSICS
index 963fbbcf1e75ff06712b3ab5648591a3fbaecca3..86ffccb1ee18bdd20838a5d41bdef7010175fc55 100755 (executable)
--- a/configure
+++ b/configure
 
 fi
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w" >&5
+$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w... " >&6; }
+if ${pgac_cv_loongarch_crc32c_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_loongarch_crc32c_intrinsics=yes
+else
+  pgac_cv_loongarch_crc32c_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics" >&5
+$as_echo "$pgac_cv_loongarch_crc32c_intrinsics" >&6; }
+if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+
+
 
 
 # Select CRC-32C implementation.
 # we're not targeting such a processor, but can nevertheless produce code that
 # uses the CRC instructions, compile both, and select at runtime.
 #
-# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
+# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+#
+# If we are targeting a LoongArch processor, CRC instructions are
+# always available (at least on 64 bit), so no runtime check is needed.
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
     USE_SSE42_CRC32C=1
@@ -18083,10 +18127,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-   fi
+          # LoongArch CRCC instructions.
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -18127,12 +18176,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
         { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
 $as_echo "ARMv8 CRC instructions with runtime check" >&6; }
       else
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+
+$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
+
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
+$as_echo "LoongArch CRCC instructions" >&6; }
+        else
 
 $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
 
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
 $as_echo "slicing-by-8" >&6; }
+        fi
       fi
     fi
   fi
index 5153b8b3fdc25ee3eaf90141952176b153abe093..116c5a1f68b078e48cff860e1fd78d45f025fad6 100644 (file)
@@ -2099,6 +2099,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then
   PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc])
 fi
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+PGAC_LOONGARCH_CRC32C_INTRINSICS()
+
 AC_SUBST(CFLAGS_CRC)
 
 # Select CRC-32C implementation.
@@ -2115,9 +2121,12 @@ AC_SUBST(CFLAGS_CRC)
 # we're not targeting such a processor, but can nevertheless produce code that
 # uses the CRC instructions, compile both, and select at runtime.
 #
-# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
+# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+#
+# If we are targeting a LoongArch processor, CRC instructions are
+# always available (at least on 64 bit), so no runtime check is needed.
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
     USE_SSE42_CRC32C=1
@@ -2135,10 +2144,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-   fi
+          # LoongArch CRCC instructions.
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -2166,9 +2180,15 @@ else
         PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
         AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
       else
-        AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        AC_MSG_RESULT(slicing-by-8)
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+          AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          AC_MSG_RESULT(LoongArch CRCC instructions)
+        else
+          AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          AC_MSG_RESULT(slicing-by-8)
+        fi
       fi
     fi
   fi
index 0a11efc97a1ac740c3ec1b67418389b14b1b769d..2acb2040037f5f7c0cbfb8ec358136f479c606d2 100644 (file)
@@ -2065,6 +2065,30 @@ int main(void)
     cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
     have_optimized_crc = true
   endif
+
+elif host_cpu == 'loongarch64'
+
+  prog = '''
+int main(void)
+{
+    unsigned int crc = 0;
+    crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+
+    /* return computed value, to prevent the above being optimized away */
+    return crc == 0;
+}
+'''
+
+  if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w',
+      args: test_c_args)
+    # Use LoongArch CRC instruction unconditionally
+    cdata.set('USE_LOONGARCH_CRC32C', 1)
+    have_optimized_crc = true
+  endif
+
 endif
 
 if not have_optimized_crc
index ee209d6d7023148d8f69a40f5e14a9f494fbd525..d8a2985567fbf19c0a4a46c19946c6369b143887 100644 (file)
 /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */
 #undef USE_LLVM
 
+/* Define to 1 to use LoongArch CRCC instructions. */
+#undef USE_LOONGARCH_CRC32C
+
 /* Define to 1 to build with LZ4 support. (--with-lz4) */
 #undef USE_LZ4
 
index 7f8779261c3c98949413c1a9ad7679f666bb8e52..d085f1dc00b578b4d7201a1ff399bb81a132e645 100644 (file)
@@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
 
+#elif defined(USE_LOONGARCH_CRC32C)
+/* Use LoongArch CRCC instructions. */
+
+#define COMP_CRC32C(crc, data, len)                            \
+   ((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
+
 #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
index 9d0cd93c438c7457ee713634cfdcacb2093933b5..deb354418dbb48df0dee5b4c52ae0ac170338a73 100644 (file)
@@ -92,6 +92,9 @@ replace_funcs_pos = [
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
+  # loongarch
+  ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
+
   # generic fallback
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c
new file mode 100644 (file)
index 0000000..db9da80
--- /dev/null
@@ -0,0 +1,73 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_loongarch.c
+ *   Compute CRC-32C checksum using LoongArch CRCC instructions
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *   src/port/pg_crc32c_loongarch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_crc32c.h"
+
+pg_crc32c
+pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
+{
+   const unsigned char *p = data;
+   const unsigned char *pend = p + len;
+
+   /*
+    * LoongArch doesn't require alignment, but aligned memory access is
+    * significantly faster. Process leading bytes so that the loop below
+    * starts with a pointer aligned to eight bytes.
+    */
+   if (!PointerIsAligned(p, uint16) &&
+       p + 1 <= pend)
+   {
+       crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+       p += 1;
+   }
+   if (!PointerIsAligned(p, uint32) &&
+       p + 2 <= pend)
+   {
+       crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+       p += 2;
+   }
+   if (!PointerIsAligned(p, uint64) &&
+       p + 4 <= pend)
+   {
+       crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+       p += 4;
+   }
+
+   /* Process eight bytes at a time, as far as we can. */
+   while (p + 8 <= pend)
+   {
+       crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc);
+       p += 8;
+   }
+
+   /* Process remaining 0-7 bytes. */
+   if (p + 4 <= pend)
+   {
+       crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+       p += 4;
+   }
+   if (p + 2 <= pend)
+   {
+       crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+       p += 2;
+   }
+   if (p < pend)
+   {
+       crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+   }
+
+   return crc;
+}