diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index e2e7015f1bb..3eab0da9cb6 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -784,6 +784,44 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_ARMV8_CRC32C_INTRINSICS +# PGAC_ARM_PLMULL +# --------------------------- +# Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication) +# instructions used for vectorized CRC. +# +# If the instructions are supported, sets pgac_arm_pmull. +AC_DEFUN([PGAC_ARM_PLMULL], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_pmull_$1])])dnl +AC_CACHE_CHECK([for pmull and pmull2], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_neon.h> +#include <arm_acle.h> +uint64x2_t a; +uint64x2_t b; +uint64x2_t c; +uint64x2_t r1; +uint64x2_t r2; + + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("+crypto"))) + #endif + static int pmull_test(void) + { + __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b)); + __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b)); + + r1 = veorq_u64(r1, r2); + /* return computed value, to prevent the above being optimized away */ + return (int) vgetq_lane_u64(r1, 0); + }], + [return pmull_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_arm_pmull=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_ARM_PLMULL + # PGAC_LOONGARCH_CRC32C_INTRINSICS # --------------------------- # Check if the compiler supports the LoongArch CRCC instructions, using diff --git a/configure b/configure index 1182c3dc92e..c56ef60226d 100755 --- a/configure +++ b/configure @@ -18358,7 +18358,7 @@ $as_echo "SSE 4.2 with runtime check" >&6; } $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o" + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5 $as_echo "ARMv8 CRC instructions" >&6; }
else @@ -18443,6 +18443,58 @@ if test x"$pgac_cv_avx512_pclmul_intrinsics" = x"yes"; then pgac_avx512_pclmul_intrinsics=yes fi +else + if test x"$host_cpu" = x"aarch64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pmull and pmull2" >&5 +$as_echo_n "checking for pmull and pmull2... " >&6; } +if ${pgac_cv_arm_pmull_+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <arm_neon.h> +#include <arm_acle.h> +uint64x2_t a; +uint64x2_t b; +uint64x2_t c; +uint64x2_t r1; +uint64x2_t r2; + + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("+crypto"))) + #endif + static int pmull_test(void) + { + __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b)); + __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b)); + + r1 = veorq_u64(r1, r2); + /* return computed value, to prevent the above being optimized away */ + return (int) vgetq_lane_u64(r1, 0); + } +int +main () +{ +return pmull_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_arm_pmull_=yes +else + pgac_cv_arm_pmull_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_pmull_" >&5 +$as_echo "$pgac_cv_arm_pmull_" >&6; } +if test x"$pgac_cv_arm_pmull_" = x"yes"; then + pgac_arm_pmull=yes +fi + + fi fi { $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5 @@ -18454,8 +18506,16 @@ $as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h { $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5 $as_echo "AVX-512 with runtime check" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5 + if test x"$pgac_arm_pmull" = x"yes"; then + +$as_echo "#define USE_PMULL_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h + + { $as_echo
"$as_me:${as_lineno-$LINENO}: result: CRYPTO PMULL with runtime check" >&5 +$as_echo "CRYPTO PMULL with runtime check" >&6; } + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5 $as_echo "none" >&6; } + fi fi # Select semaphore implementation type. diff --git a/configure.ac b/configure.ac index 39d8fe0e77b..ff5dd64468e 100644 --- a/configure.ac +++ b/configure.ac @@ -2277,7 +2277,7 @@ else else if test x"$USE_ARMV8_CRC32C" = x"1"; then AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o" + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o" AC_MSG_RESULT(ARMv8 CRC instructions) else if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then @@ -2304,6 +2304,10 @@ AC_SUBST(PG_CRC32C_OBJS) # if test x"$host_cpu" = x"x86_64"; then PGAC_AVX512_PCLMUL_INTRINSICS() +else + if test x"$host_cpu" = x"aarch64"; then + PGAC_ARM_PLMULL() + fi fi AC_MSG_CHECKING([for vectorized CRC-32C]) @@ -2311,7 +2315,12 @@ if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.]) AC_MSG_RESULT(AVX-512 with runtime check) else - AC_MSG_RESULT(none) + if test x"$pgac_arm_pmull" = x"yes"; then + AC_DEFINE(USE_PMULL_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Arm PMULL CRC algorithms with a runtime check.]) + AC_MSG_RESULT(CRYPTO PMULL with runtime check) + else + AC_MSG_RESULT(none) + fi fi # Select semaphore implementation type. diff --git a/meson.build b/meson.build index 1cecd7d1b84..43d5ffc30b1 100644 --- a/meson.build +++ b/meson.build @@ -2747,6 +2747,39 @@ int main(void) have_optimized_crc = true endif + # Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication) + # instructions used for vectorized CRC. 
+ prog = ''' +#include <arm_neon.h> +#include <arm_acle.h> +uint64x2_t a; +uint64x2_t b; +uint64x2_t c; + +#if defined(__has_attribute) && __has_attribute (target) +__attribute__((target("+crypto"))) +#endif +int main(void) +{ + uint64x2_t r1; + uint64x2_t r2; + + __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b)); + __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b)); + + r1 = veorq_u64(r1, r2); + /* return computed value, to prevent the above being optimized away */ + return (int) vgetq_lane_u64(r1, 0); +} +''' + + if cc.links(prog, + name: 'CRYPTO CRC32C', + args: test_c_args) + # Use ARM CRYPTO Extension, with runtime check + cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1) + endif + elif host_cpu == 'loongarch64' prog = ''' diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index f624bda32b4..9f6d512347e 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -732,6 +732,9 @@ /* Define to 1 to build with PAM support. (--with-pam) */ #undef USE_PAM +/* Define to 1 to use Arm PMULL CRC algorithms with a runtime check. */ +#undef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK + /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */ #undef USE_SLICING_BY_8_CRC32C diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 1f8e837d119..10518614664 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -111,13 +111,22 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l #endif #elif defined(USE_ARMV8_CRC32C) -/* Use ARMv8 CRC Extension instructions. */ - +/* + * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions. + * We don't need a runtime check for CRC, so for constant inputs, where + * we assume the input is small, we can avoid an indirect function call.
+ */ #define COMP_CRC32C(crc, data, len) \ - ((crc) = pg_comp_crc32c_armv8((crc), (data), (len))) + ((crc) = __builtin_constant_p(len) ? \ + pg_comp_crc32c_armv8((crc), (data), (len)) : \ + pg_comp_crc32c((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len); +#endif #elif defined(USE_LOONGARCH_CRC32C) /* Use LoongArch CRCC instructions. */ @@ -131,8 +140,8 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ #elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* - * Use ARMv8 instructions, but perform a runtime check first - * to check that they are available. + * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions, + * but perform a runtime check first to check that they are available. 
*/ #define COMP_CRC32C(crc, data, len) \ ((crc) = pg_comp_crc32c((crc), (data), (len))) @@ -141,6 +150,9 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len); +#endif #else /* diff --git a/src/port/meson.build b/src/port/meson.build index d55cb0424f3..922b3f64676 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -93,6 +93,7 @@ replace_funcs_pos = [ # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'], + ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c index 9ca0f728d39..b404e6c373e 100644 --- a/src/port/pg_crc32c_armv8.c +++ b/src/port/pg_crc32c_armv8.c @@ -20,6 +20,10 @@ #include <arm_acle.h> #endif +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK +#include <arm_neon.h> +#endif + #include "port/pg_crc32c.h" pg_crc32c @@ -77,3 +81,127 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) return crc; } + +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK + +/* + * Note: There is no copyright notice in the following generated code.
+ * + * We have modified the output to + * - match our function declaration + * - match whitespace to our project style + * - be more friendly for pgindent + */ + +/* Generated by https://github.com/corsix/fast-crc32/ using: */ +/* ./generate -i neon -p crc32c -a v4e */ +/* MIT licensed */ + +pg_attribute_target("+crypto") +static inline +uint64x2_t +clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) +{ + uint64x2_t r; + +__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b)); + return r; +} + +pg_attribute_target("+crypto") +static inline +uint64x2_t +clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) +{ + uint64x2_t r; + +__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b)); + return r; +} + +pg_attribute_target("+crypto") +pg_crc32c +pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len) +{ + /* adjust names to match generated code */ + pg_crc32c crc0 = crc; + const char *buf = data; + + /* align to 16 bytes */ + for (; len && ((uintptr_t) buf & 7); --len) + { + crc0 = __crc32cb(crc0, *buf++); + } + if (((uintptr_t) buf & 8) && len >= 8) + { + crc0 = __crc32cd(crc0, *(const uint64_t *) buf); + buf += 8; + len -= 8; + } + + if (len >= 64) + { + const char *end = buf + len; + const char *limit = buf + len - 64; + + /* First vector chunk. 
*/ + uint64x2_t x0 = vld1q_u64((const uint64_t *) buf), + y0; + uint64x2_t x1 = vld1q_u64((const uint64_t *) (buf + 16)), + y1; + uint64x2_t x2 = vld1q_u64((const uint64_t *) (buf + 32)), + y2; + uint64x2_t x3 = vld1q_u64((const uint64_t *) (buf + 48)), + y3; + uint64x2_t k; + + { + static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8}; + + k = vld1q_u64(k_); + } + + /* + * pgindent complained of unmatched parens, so the following has + * been re-written with intrinsics: + * + * x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0); + */ + x0 = veorq_u64((uint64x2_t) vsetq_lane_u64(crc0, vdupq_n_u64(0), 0), x0); + buf += 64; + + /* Main loop. */ + while (buf <= limit) + { + y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0); + y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1); + y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2); + y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3); + buf += 64; + } + + /* Reduce x0 ... x3 to just x0. */ + { + static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27}; + + k = vld1q_u64(k_); + } + y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); + y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2); + { + static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; + + k = vld1q_u64(k_); + } + y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0); + + /* Reduce 128 bits to 32 bits, and multiply by x^32. 
*/ + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + len = end - buf; + } + + return pg_comp_crc32c_armv8(crc0, buf, len); +} + +#endif diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c index a1f0e540c6b..72d70aea1e1 100644 --- a/src/port/pg_crc32c_armv8_choose.c +++ b/src/port/pg_crc32c_armv8_choose.c @@ -108,6 +108,27 @@ pg_crc32c_armv8_available(void) #endif } +static inline bool +pg_pmull_available(void) +{ +#if defined(__aarch64__) && defined(HWCAP_PMULL) + +#ifdef HAVE_ELF_AUX_INFO + unsigned long value; + + return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 && + (value & HWCAP_PMULL) != 0; +#elif defined(HAVE_GETAUXVAL) + return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0; +#else + return false; +#endif + +#else + return false; +#endif +} + /* * This gets called on the first call. It replaces the function pointer * so that subsequent calls are routed directly to the chosen implementation. @@ -115,10 +136,23 @@ pg_crc32c_armv8_available(void) static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) { + /* set fallbacks */ +#ifdef USE_ARMV8_CRC32C + /* On e.g. MacOS, our runtime feature detection doesn't work */ + pg_comp_crc32c = pg_comp_crc32c_armv8; +#else + pg_comp_crc32c = pg_comp_crc32c_sb8; +#endif + if (pg_crc32c_armv8_available()) + { pg_comp_crc32c = pg_comp_crc32c_armv8; - else - pg_comp_crc32c = pg_comp_crc32c_sb8; + +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK + if (pg_pmull_available()) + pg_comp_crc32c = pg_comp_crc32c_pmull; +#endif + } return pg_comp_crc32c(crc, data, len); }