Compute CRC32C on ARM using the Crypto Extension where available

In a similar vein to commit 3c6e8c123, the ARMv8 cryptography extension
has 64x64 -> 128-bit carryless multiplication instructions suitable
for computing CRC. This was tested to be around twice as fast as
scalar CRC instructions for longer inputs.

We now do a runtime check, even for builds that target "armv8-a+crc",
but those builds can still use a direct call for constant inputs,
which we assume are short.

As for x86, the MIT-licensed implementation was generated with the
"generate" program from

https://github.com/corsix/fast-crc32/

Reviewed-by: Nathan Bossart <nathandbossart@gmail.com>
Discussion: https://postgr.es/m/CANWCAZaKhE+RD5KKouUFoxx1EbUNrNhcduM1VQ=DkSDadNEFng@mail.gmail.com
This commit is contained in:
John Naylor 2026-04-04 20:47:01 +07:00
parent 5e13b0f240
commit fbc57f2bc2
9 changed files with 329 additions and 11 deletions

View file

@ -784,6 +784,44 @@ fi
undefine([Ac_cachevar])dnl
])# PGAC_ARMV8_CRC32C_INTRINSICS
# PGAC_ARM_PLMULL
# ---------------------------
# Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication)
# instructions used for vectorized CRC.
#
# (NOTE(review): the macro name spells "PLMULL" but the feature tested is
# PMULL — consider renaming for consistency with the cache variable.)
#
# The test program uses inline asm (pmull/pmull2 + eor) rather than the
# vmull_p64 intrinsics, matching the generated CRC implementation that
# this check gates.
#
# If the instructions are supported, sets pgac_arm_pmull.
AC_DEFUN([PGAC_ARM_PLMULL],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_pmull_$1])])dnl
AC_CACHE_CHECK([for pmull and pmull2], [Ac_cachevar],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_acle.h>
#include <arm_neon.h>
uint64x2_t a;
uint64x2_t b;
uint64x2_t c;
uint64x2_t r1;
uint64x2_t r2;
#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("+crypto")))
#endif
static int pmull_test(void)
{
__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
r1 = veorq_u64(r1, r2);
/* return computed value, to prevent the above being optimized away */
return (int) vgetq_lane_u64(r1, 0);
}],
[return pmull_test();])],
[Ac_cachevar=yes],
[Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
pgac_arm_pmull=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_ARM_PLMULL
# PGAC_LOONGARCH_CRC32C_INTRINSICS
# ---------------------------
# Check if the compiler supports the LoongArch CRCC instructions, using

64
configure vendored
View file

@ -18358,7 +18358,7 @@ $as_echo "SSE 4.2 with runtime check" >&6; }
$as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h
PG_CRC32C_OBJS="pg_crc32c_armv8.o"
PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5
$as_echo "ARMv8 CRC instructions" >&6; }
else
@ -18443,6 +18443,58 @@ if test x"$pgac_cv_avx512_pclmul_intrinsics" = x"yes"; then
pgac_avx512_pclmul_intrinsics=yes
fi
else
if test x"$host_cpu" = x"aarch64"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for pmull and pmull2" >&5
$as_echo_n "checking for pmull and pmull2... " >&6; }
if ${pgac_cv_arm_pmull_+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <arm_acle.h>
#include <arm_neon.h>
uint64x2_t a;
uint64x2_t b;
uint64x2_t c;
uint64x2_t r1;
uint64x2_t r2;
#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("+crypto")))
#endif
static int pmull_test(void)
{
__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
r1 = veorq_u64(r1, r2);
/* return computed value, to prevent the above being optimized away */
return (int) vgetq_lane_u64(r1, 0);
}
int
main ()
{
return pmull_test();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
pgac_cv_arm_pmull_=yes
else
pgac_cv_arm_pmull_=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_pmull_" >&5
$as_echo "$pgac_cv_arm_pmull_" >&6; }
if test x"$pgac_cv_arm_pmull_" = x"yes"; then
pgac_arm_pmull=yes
fi
fi
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5
@ -18454,8 +18506,16 @@ $as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5
$as_echo "AVX-512 with runtime check" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
if test x"$pgac_arm_pmull" = x"yes"; then
$as_echo "#define USE_PMULL_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: CRYPTO PMULL with runtime check" >&5
$as_echo "CRYPTO PMULL with runtime check" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
$as_echo "none" >&6; }
fi
fi
# Select semaphore implementation type.

View file

@ -2277,7 +2277,7 @@ else
else
if test x"$USE_ARMV8_CRC32C" = x"1"; then
AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
PG_CRC32C_OBJS="pg_crc32c_armv8.o"
PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
AC_MSG_RESULT(ARMv8 CRC instructions)
else
if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
@ -2304,6 +2304,10 @@ AC_SUBST(PG_CRC32C_OBJS)
#
if test x"$host_cpu" = x"x86_64"; then
PGAC_AVX512_PCLMUL_INTRINSICS()
else
if test x"$host_cpu" = x"aarch64"; then
PGAC_ARM_PLMULL()
fi
fi
AC_MSG_CHECKING([for vectorized CRC-32C])
@ -2311,7 +2315,12 @@ if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then
AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.])
AC_MSG_RESULT(AVX-512 with runtime check)
else
AC_MSG_RESULT(none)
if test x"$pgac_arm_pmull" = x"yes"; then
AC_DEFINE(USE_PMULL_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Arm PMULL CRC algorithms with a runtime check.])
AC_MSG_RESULT(CRYPTO PMULL with runtime check)
else
AC_MSG_RESULT(none)
fi
fi
# Select semaphore implementation type.

View file

@ -2747,6 +2747,39 @@ int main(void)
have_optimized_crc = true
endif
# Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication)
# instructions used for vectorized CRC.
prog = '''
#include <arm_acle.h>
#include <arm_neon.h>
uint64x2_t a;
uint64x2_t b;
uint64x2_t c;
#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("+crypto")))
#endif
int main(void)
{
uint64x2_t r1;
uint64x2_t r2;
__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
r1 = veorq_u64(r1, r2);
/* return computed value, to prevent the above being optimized away */
return (int) vgetq_lane_u64(r1, 0);
}
'''
if cc.links(prog,
name: 'CRYPTO CRC32C',
args: test_c_args)
# Use ARM CRYPTO Extension, with runtime check
cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1)
endif
elif host_cpu == 'loongarch64'
prog = '''

View file

@ -732,6 +732,9 @@
/* Define to 1 to build with PAM support. (--with-pam) */
#undef USE_PAM
/* Define to 1 to use Arm PMULL CRC algorithms with a runtime check. */
#undef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
/* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
#undef USE_SLICING_BY_8_CRC32C

View file

@ -111,13 +111,22 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l
#endif
#elif defined(USE_ARMV8_CRC32C)
/* Use ARMv8 CRC Extension instructions. */
/*
* Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions.
* We don't need a runtime check for CRC, so for constant inputs, where
* we assume the input is small, we can avoid an indirect function call.
*/
#define COMP_CRC32C(crc, data, len) \
((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
((crc) = __builtin_constant_p(len) ? \
pg_comp_crc32c_armv8((crc), (data), (len)) : \
pg_comp_crc32c((crc), (data), (len)))
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
#endif
#elif defined(USE_LOONGARCH_CRC32C)
/* Use LoongArch CRCC instructions. */
@ -131,8 +140,8 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
/*
* Use ARMv8 instructions, but perform a runtime check first
* to check that they are available.
* Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions,
* but perform a runtime check first to check that they are available.
*/
#define COMP_CRC32C(crc, data, len) \
((crc) = pg_comp_crc32c((crc), (data), (len)))
@ -141,6 +150,9 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
#endif
#else
/*

View file

@ -93,6 +93,7 @@ replace_funcs_pos = [
# arm / aarch64
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'],
['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],

View file

@ -20,6 +20,10 @@
#include <arm_acle.h>
#endif
#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
#include <arm_neon.h>
#endif
#include "port/pg_crc32c.h"
pg_crc32c
@ -77,3 +81,127 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
return crc;
}
#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
/*
* Note: There is no copyright notice in the following generated code.
*
* We have modified the output to
* - match our function declaration
* - match whitespace to our project style
* - be more friendly for pgindent
*/
/* Generated by https://github.com/corsix/fast-crc32/ using: */
/* ./generate -i neon -p crc32c -a v4e */
/* MIT licensed */
/*
 * clmul_lo_e: carryless-multiply the low 64-bit lanes of a and b (pmull),
 * then XOR the 128-bit product with c (eor).  Emitted as a single asm block
 * so the multiply and the fold-in stay together, as in the generated code.
 */
pg_attribute_target("+crypto")
static inline
uint64x2_t
clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
uint64x2_t r;
__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
return r;
}
/*
 * clmul_hi_e: carryless-multiply the high 64-bit lanes of a and b (pmull2),
 * then XOR the 128-bit product with c (eor).  Counterpart of clmul_lo_e for
 * the upper lanes.
 */
pg_attribute_target("+crypto")
static inline
uint64x2_t
clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
uint64x2_t r;
__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
return r;
}
/*
 * Compute CRC-32C using the Arm CRYPTO Extension's carryless multiply
 * (PMULL).  The main loop folds 64 bytes per iteration across four
 * independent 128-bit streams; leading bytes up to a 16-byte boundary and
 * any tail shorter than 64 bytes are handled with the scalar __crc32c*
 * instructions (via pg_comp_crc32c_armv8 for the tail).
 *
 * Generated code (see header comment above); changes kept minimal.
 */
pg_attribute_target("+crypto")
pg_crc32c
pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
{
/* adjust names to match generated code */
pg_crc32c crc0 = crc;
const char *buf = data;
/* align to 16 bytes */
/* first to an 8-byte boundary, one byte at a time ... */
for (; len && ((uintptr_t) buf & 7); --len)
{
crc0 = __crc32cb(crc0, *buf++);
}
/* ... then consume one 8-byte word if needed to reach 16-byte alignment */
if (((uintptr_t) buf & 8) && len >= 8)
{
crc0 = __crc32cd(crc0, *(const uint64_t *) buf);
buf += 8;
len -= 8;
}
if (len >= 64)
{
const char *end = buf + len;
const char *limit = buf + len - 64;
/* First vector chunk. */
uint64x2_t x0 = vld1q_u64((const uint64_t *) buf),
y0;
uint64x2_t x1 = vld1q_u64((const uint64_t *) (buf + 16)),
y1;
uint64x2_t x2 = vld1q_u64((const uint64_t *) (buf + 32)),
y2;
uint64x2_t x3 = vld1q_u64((const uint64_t *) (buf + 48)),
y3;
uint64x2_t k;
{
/* folding constants produced by the generator */
static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8};
k = vld1q_u64(k_);
}
/*
 * pgindent complained of unmatched parens, so the following has
 * been re-written with intrinsics:
 *
 * x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0);
 */
/* XOR the incoming CRC into the low lane of the first stream */
x0 = veorq_u64((uint64x2_t) vsetq_lane_u64(crc0, vdupq_n_u64(0), 0), x0);
buf += 64;
/* Main loop. */
/* fold each of the four streams forward by 64 bytes per iteration */
while (buf <= limit)
{
y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0);
y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1);
y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2);
y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3);
buf += 64;
}
/* Reduce x0 ... x3 to just x0. */
{
static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27};
k = vld1q_u64(k_);
}
y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0);
y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2);
{
static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e};
k = vld1q_u64(k_);
}
y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0);
/* Reduce 128 bits to 32 bits, and multiply by x^32. */
crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1));
len = end - buf;
}
/* scalar CRC instructions finish the remaining (< 64-byte) tail */
return pg_comp_crc32c_armv8(crc0, buf, len);
}
#endif

View file

@ -108,6 +108,27 @@ pg_crc32c_armv8_available(void)
#endif
}
/*
 * Return true if the CPU advertises the PMULL (carryless multiply)
 * instructions via the AT_HWCAP auxiliary vector.  Uses elf_aux_info()
 * (FreeBSD) or getauxval() (glibc/musl); on platforms with neither, or
 * where HWCAP_PMULL is not defined, we conservatively report false.
 */
static inline bool
pg_pmull_available(void)
{
#if defined(__aarch64__) && defined(HWCAP_PMULL)
#ifdef HAVE_ELF_AUX_INFO
unsigned long value;
return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
(value & HWCAP_PMULL) != 0;
#elif defined(HAVE_GETAUXVAL)
return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
#else
return false;
#endif
#else
return false;
#endif
}
/*
* This gets called on the first call. It replaces the function pointer
* so that subsequent calls are routed directly to the chosen implementation.
@ -115,10 +136,23 @@ pg_crc32c_armv8_available(void)
/*
 * Pick the best available CRC-32C implementation, install it as the
 * pg_comp_crc32c function pointer, and compute this first call's CRC with
 * it.  Subsequent calls go through the pointer directly.
 *
 * Fix: the scraped diff had retained the *deleted* lines of the hunk,
 * leaving a dangling "else" inside an unclosed brace (a syntax error);
 * the fallback assignments below belong in the #ifdef block, and the
 * runtime checks upgrade the pointer from there.
 */
static pg_crc32c
pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
{
	/* set fallbacks */
#ifdef USE_ARMV8_CRC32C
	/* On e.g. MacOS, our runtime feature detection doesn't work */
	pg_comp_crc32c = pg_comp_crc32c_armv8;
#else
	pg_comp_crc32c = pg_comp_crc32c_sb8;
#endif

	if (pg_crc32c_armv8_available())
	{
		pg_comp_crc32c = pg_comp_crc32c_armv8;

#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
		/*
		 * PMULL is only worthwhile (and its implementation only falls back
		 * to) the CRC extension, so this check nests inside the armv8 one.
		 */
		if (pg_pmull_available())
			pg_comp_crc32c = pg_comp_crc32c_pmull;
#endif
	}

	return pg_comp_crc32c(crc, data, len);
}