Compute CRC32C on ARM using the Crypto Extension where available

In a similar vein to commit 3c6e8c123, the ARMv8 cryptography extension
has 64x64 -> 128-bit carryless multiplication instructions suitable
for computing CRC. This was tested to be around twice as fast as
scalar CRC instructions for longer inputs.

We now do a runtime check, even for builds that target "armv8-a+crc",
but those builds can still use a direct call for constant inputs,
which we assume are short.

As for x86, the MIT-licensed implementation was generated with the
"generate" program from

https://github.com/corsix/fast-crc32/

Reviewed-by: Nathan Bossart <nathandbossart@gmail.com>
Discussion: https://postgr.es/m/CANWCAZaKhE+RD5KKouUFoxx1EbUNrNhcduM1VQ=DkSDadNEFng@mail.gmail.com
This commit is contained in:
John Naylor 2026-04-04 20:47:01 +07:00
parent 5e13b0f240
commit fbc57f2bc2
9 changed files with 329 additions and 11 deletions

View file

@ -784,6 +784,44 @@ fi
undefine([Ac_cachevar])dnl
])# PGAC_ARMV8_CRC32C_INTRINSICS
# PGAC_ARM_PLMULL
# ---------------------------
# Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication)
# instructions used for vectorized CRC.
#
# (NOTE(review): the macro name spells "PLMULL" but the feature tested is
# PMULL — consider renaming for consistency with the cache variable.)
#
# The test program uses inline asm (pmull/pmull2 + eor) rather than the
# vmull_p64 intrinsics, matching the generated CRC implementation that
# this check gates.
#
# If the instructions are supported, sets pgac_arm_pmull.
AC_DEFUN([PGAC_ARM_PLMULL],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_pmull_$1])])dnl
AC_CACHE_CHECK([for pmull and pmull2], [Ac_cachevar],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_acle.h>
#include <arm_neon.h>
uint64x2_t a;
uint64x2_t b;
uint64x2_t c;
uint64x2_t r1;
uint64x2_t r2;
#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("+crypto")))
#endif
static int pmull_test(void)
{
__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
r1 = veorq_u64(r1, r2);
/* return computed value, to prevent the above being optimized away */
return (int) vgetq_lane_u64(r1, 0);
}],
[return pmull_test();])],
[Ac_cachevar=yes],
[Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
pgac_arm_pmull=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_ARM_PLMULL
# PGAC_LOONGARCH_CRC32C_INTRINSICS
# ---------------------------
# Check if the compiler supports the LoongArch CRCC instructions, using

64
configure vendored
View file

@ -18358,7 +18358,7 @@ $as_echo "SSE 4.2 with runtime check" >&6; }
$as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h
PG_CRC32C_OBJS="pg_crc32c_armv8.o"
PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5
$as_echo "ARMv8 CRC instructions" >&6; }
else
@ -18443,6 +18443,58 @@ if test x"$pgac_cv_avx512_pclmul_intrinsics" = x"yes"; then
pgac_avx512_pclmul_intrinsics=yes
fi
else
if test x"$host_cpu" = x"aarch64"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for pmull and pmull2" >&5
$as_echo_n "checking for pmull and pmull2... " >&6; }
if ${pgac_cv_arm_pmull_+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <arm_acle.h>
#include <arm_neon.h>
uint64x2_t a;
uint64x2_t b;
uint64x2_t c;
uint64x2_t r1;
uint64x2_t r2;
#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("+crypto")))
#endif
static int pmull_test(void)
{
__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
r1 = veorq_u64(r1, r2);
/* return computed value, to prevent the above being optimized away */
return (int) vgetq_lane_u64(r1, 0);
}
int
main ()
{
return pmull_test();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
pgac_cv_arm_pmull_=yes
else
pgac_cv_arm_pmull_=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_pmull_" >&5
$as_echo "$pgac_cv_arm_pmull_" >&6; }
if test x"$pgac_cv_arm_pmull_" = x"yes"; then
pgac_arm_pmull=yes
fi
fi
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5
@ -18454,8 +18506,16 @@ $as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5
$as_echo "AVX-512 with runtime check" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
if test x"$pgac_arm_pmull" = x"yes"; then
$as_echo "#define USE_PMULL_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: CRYPTO PMULL with runtime check" >&5
$as_echo "CRYPTO PMULL with runtime check" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
$as_echo "none" >&6; }
fi
fi
# Select semaphore implementation type.

View file

@ -2277,7 +2277,7 @@ else
else
if test x"$USE_ARMV8_CRC32C" = x"1"; then
AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
PG_CRC32C_OBJS="pg_crc32c_armv8.o"
PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
AC_MSG_RESULT(ARMv8 CRC instructions)
else
if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
@ -2304,6 +2304,10 @@ AC_SUBST(PG_CRC32C_OBJS)
#
if test x"$host_cpu" = x"x86_64"; then
PGAC_AVX512_PCLMUL_INTRINSICS()
else
if test x"$host_cpu" = x"aarch64"; then
PGAC_ARM_PLMULL()
fi
fi
AC_MSG_CHECKING([for vectorized CRC-32C])
@ -2311,7 +2315,12 @@ if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then
AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.])
AC_MSG_RESULT(AVX-512 with runtime check)
else
AC_MSG_RESULT(none)
if test x"$pgac_arm_pmull" = x"yes"; then
AC_DEFINE(USE_PMULL_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Arm PMULL CRC algorithms with a runtime check.])
AC_MSG_RESULT(CRYPTO PMULL with runtime check)
else
AC_MSG_RESULT(none)
fi
fi
# Select semaphore implementation type.

View file

@ -2747,6 +2747,39 @@ int main(void)
have_optimized_crc = true
endif
# Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication)
# instructions used for vectorized CRC.
prog = '''
#include <arm_acle.h>
#include <arm_neon.h>
uint64x2_t a;
uint64x2_t b;
uint64x2_t c;
#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("+crypto")))
#endif
int main(void)
{
uint64x2_t r1;
uint64x2_t r2;
__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
r1 = veorq_u64(r1, r2);
/* return computed value, to prevent the above being optimized away */
return (int) vgetq_lane_u64(r1, 0);
}
'''
if cc.links(prog,
name: 'CRYPTO CRC32C',
args: test_c_args)
# Use ARM CRYPTO Extension, with runtime check
cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1)
endif
elif host_cpu == 'loongarch64'
prog = '''

View file

@ -732,6 +732,9 @@
/* Define to 1 to build with PAM support. (--with-pam) */
#undef USE_PAM
/* Define to 1 to use Arm PMULL CRC algorithms with a runtime check. */
#undef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
/* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
#undef USE_SLICING_BY_8_CRC32C

View file

@ -111,13 +111,22 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l
#endif
#elif defined(USE_ARMV8_CRC32C)
/* Use ARMv8 CRC Extension instructions. */
/*
* Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions.
* We don't need a runtime check for CRC, so for constant inputs, where
* we assume the input is small, we can avoid an indirect function call.
*/
#define COMP_CRC32C(crc, data, len) \
((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
((crc) = __builtin_constant_p(len) ? \
pg_comp_crc32c_armv8((crc), (data), (len)) : \
pg_comp_crc32c((crc), (data), (len)))
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
#endif
#elif defined(USE_LOONGARCH_CRC32C)
/* Use LoongArch CRCC instructions. */
@ -131,8 +140,8 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
/*
* Use ARMv8 instructions, but perform a runtime check first
* to check that they are available.
* Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions,
* but perform a runtime check first to check that they are available.
*/
#define COMP_CRC32C(crc, data, len) \
((crc) = pg_comp_crc32c((crc), (data), (len)))
@ -141,6 +150,9 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
#endif
#else
/*

View file

@ -93,6 +93,7 @@ replace_funcs_pos = [
# arm / aarch64
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'],
['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],

View file

@ -20,6 +20,10 @@
#include <arm_acle.h>
#endif
#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
#include <arm_neon.h>
#endif
#include "port/pg_crc32c.h"
pg_crc32c
@ -77,3 +81,127 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
return crc;
}
#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
/*
* Note: There is no copyright notice in the following generated code.
*
* We have modified the output to
* - match our function declaration
* - match whitespace to our project style
* - be more friendly for pgindent
*/
/* Generated by https://github.com/corsix/fast-crc32/ using: */
/* ./generate -i neon -p crc32c -a v4e */
/* MIT licensed */
/*
 * clmul_lo_e: carryless-multiply the low 64-bit lanes of a and b (pmull),
 * then XOR the 128-bit product with c (eor).  Emitted as a single asm block
 * so the multiply and the fold-in stay together, as in the generated code.
 */
pg_attribute_target("+crypto")
static inline
uint64x2_t
clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
uint64x2_t r;
__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
return r;
}
/*
 * clmul_hi_e: carryless-multiply the high 64-bit lanes of a and b (pmull2),
 * then XOR the 128-bit product with c (eor).  Counterpart of clmul_lo_e for
 * the upper lanes.
 */
pg_attribute_target("+crypto")
static inline
uint64x2_t
clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
uint64x2_t r;
__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
return r;
}
/*
 * Compute CRC-32C using the Arm CRYPTO Extension's carryless multiply
 * (PMULL).  The main loop folds 64 bytes per iteration across four
 * independent 128-bit streams; leading bytes up to a 16-byte boundary and
 * any tail shorter than 64 bytes are handled with the scalar __crc32c*
 * instructions (via pg_comp_crc32c_armv8 for the tail).
 *
 * Generated code (see header comment above); changes kept minimal.
 */
pg_attribute_target("+crypto")
pg_crc32c
pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
{
/* adjust names to match generated code */
pg_crc32c crc0 = crc;
const char *buf = data;
/* align to 16 bytes */
/* first to an 8-byte boundary, one byte at a time ... */
for (; len && ((uintptr_t) buf & 7); --len)
{
crc0 = __crc32cb(crc0, *buf++);
}
/* ... then consume one 8-byte word if needed to reach 16-byte alignment */
if (((uintptr_t) buf & 8) && len >= 8)
{
crc0 = __crc32cd(crc0, *(const uint64_t *) buf);
buf += 8;
len -= 8;
}
if (len >= 64)
{
const char *end = buf + len;
const char *limit = buf + len - 64;
/* First vector chunk. */
uint64x2_t x0 = vld1q_u64((const uint64_t *) buf),
y0;
uint64x2_t x1 = vld1q_u64((const uint64_t *) (buf + 16)),
y1;
uint64x2_t x2 = vld1q_u64((const uint64_t *) (buf + 32)),
y2;
uint64x2_t x3 = vld1q_u64((const uint64_t *) (buf + 48)),
y3;
uint64x2_t k;
{
/* folding constants produced by the generator */
static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8};
k = vld1q_u64(k_);
}
/*
 * pgindent complained of unmatched parens, so the following has
 * been re-written with intrinsics:
 *
 * x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0);
 */
/* XOR the incoming CRC into the low lane of the first stream */
x0 = veorq_u64((uint64x2_t) vsetq_lane_u64(crc0, vdupq_n_u64(0), 0), x0);
buf += 64;
/* Main loop. */
/* fold each of the four streams forward by 64 bytes per iteration */
while (buf <= limit)
{
y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0);
y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1);
y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2);
y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3);
buf += 64;
}
/* Reduce x0 ... x3 to just x0. */
{
static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27};
k = vld1q_u64(k_);
}
y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0);
y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2);
{
static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e};
k = vld1q_u64(k_);
}
y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0);
/* Reduce 128 bits to 32 bits, and multiply by x^32. */
crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1));
len = end - buf;
}
/* scalar CRC instructions finish the remaining (< 64-byte) tail */
return pg_comp_crc32c_armv8(crc0, buf, len);
}
#endif

View file

@ -108,6 +108,27 @@ pg_crc32c_armv8_available(void)
#endif
}
/*
 * Return true if the CPU advertises the PMULL (carryless multiply)
 * instructions via the AT_HWCAP auxiliary vector.  Uses elf_aux_info()
 * (FreeBSD) or getauxval() (glibc/musl); on platforms with neither, or
 * where HWCAP_PMULL is not defined, we conservatively report false.
 */
static inline bool
pg_pmull_available(void)
{
#if defined(__aarch64__) && defined(HWCAP_PMULL)
#ifdef HAVE_ELF_AUX_INFO
unsigned long value;
return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
(value & HWCAP_PMULL) != 0;
#elif defined(HAVE_GETAUXVAL)
return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
#else
return false;
#endif
#else
return false;
#endif
}
/*
* This gets called on the first call. It replaces the function pointer
* so that subsequent calls are routed directly to the chosen implementation.
@ -115,10 +136,23 @@ pg_crc32c_armv8_available(void)
/*
 * Pick the best available CRC-32C implementation, install it as the
 * pg_comp_crc32c function pointer, and compute this first call's CRC with
 * it.  Subsequent calls go through the pointer directly.
 *
 * Fix: the scraped diff had retained the *deleted* lines of the hunk,
 * leaving a dangling "else" inside an unclosed brace (a syntax error);
 * the fallback assignments below belong in the #ifdef block, and the
 * runtime checks upgrade the pointer from there.
 */
static pg_crc32c
pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
{
	/* set fallbacks */
#ifdef USE_ARMV8_CRC32C
	/* On e.g. MacOS, our runtime feature detection doesn't work */
	pg_comp_crc32c = pg_comp_crc32c_armv8;
#else
	pg_comp_crc32c = pg_comp_crc32c_sb8;
#endif

	if (pg_crc32c_armv8_available())
	{
		pg_comp_crc32c = pg_comp_crc32c_armv8;

#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
		/*
		 * PMULL is only worthwhile (and its implementation only falls back
		 * to) the CRC extension, so this check nests inside the armv8 one.
		 */
		if (pg_pmull_available())
			pg_comp_crc32c = pg_comp_crc32c_pmull;
#endif
	}

	return pg_comp_crc32c(crc, data, len);
}