[PATCH] sha512: add AArch64 crypto/SHA512 extension implementation
* cipher/Makefile.am: Add 'sha512-armv8-aarch64-ce.S'.
* cipher/sha512-armv8-aarch64-ce.S: New.
* cipher/sha512.c (ATTR_ALIGNED_64, USE_ARM64_SHA512): New.
(k): Make array aligned to 64 bytes.
[USE_ARM64_SHA512] (_gcry_sha512_transform_armv8_ce): New.
[USE_ARM64_SHA512] (do_sha512_transform_armv8_ce): New.
(sha512_init_common) [USE_ARM64_SHA512]: Use ARMv8-SHA512 accelerated
implementation if HW feature is available.
* configure.ac: Add 'sha512-armv8-aarch64-ce.lo'.
(gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4)
(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4): New.
--

Benchmark on AWS Graviton3:

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SHA512         |      2.36 ns/B     404.2 MiB/s      6.13 c/B      2600

After (2.4x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SHA512         |     0.977 ns/B     976.6 MiB/s      2.54 c/B      2600
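
Not part of the patch itself, but as a reading aid: the new transform
computes two SHA-512 rounds per sha512h/sha512h2 pair and expands the
message schedule with sha512su0/sha512su1. The C sketch below
transliterates the do_round2 and schedule1/schedule2 macros into the ACLE
intrinsics from <arm_neon.h> (built with e.g. -march=armv8.2-a+sha3); the
function names are illustrative only and not libgcrypt API.

  #include <arm_neon.h>

  /* Two SHA-512 rounds, following the patch's do_round2 macro.  ab/cd/ef/gh
   * each hold a pair of 64-bit state words; k and w hold the two round
   * constants and the two message words for this round pair. */
  static inline void
  sha512_2rounds (uint64x2_t ab, uint64x2_t cd, uint64x2_t ef,
                  uint64x2_t *gh, uint64x2_t *cd_out,
                  uint64x2_t k, uint64x2_t w)
  {
    uint64x2_t kw = vaddq_u64 (k, w);
    uint64x2_t t1 = vextq_u64 (ef, *gh, 1);     /* ext ef.16b,gh.16b,#8 */
    uint64x2_t t0 = vextq_u64 (cd, ef, 1);      /* ext cd.16b,ef.16b,#8 */
    kw = vextq_u64 (kw, kw, 1);                 /* swap 64-bit lanes    */
    *gh = vaddq_u64 (*gh, kw);
    *gh = vsha512hq_u64 (*gh, t1, t0);          /* sha512h              */
    *cd_out = vaddq_u64 (*gh, cd);
    *gh = vsha512h2q_u64 (*gh, cd, ab);         /* sha512h2             */
  }

  /* Message expansion, following the schedule1/schedule2 macros. */
  static inline uint64x2_t
  sha512_schedule (uint64x2_t w0, uint64x2_t w1, uint64x2_t w4,
                   uint64x2_t w5, uint64x2_t w7)
  {
    w0 = vsha512su0q_u64 (w0, w1);                           /* sha512su0 */
    return vsha512su1q_u64 (w0, w7, vextq_u64 (w4, w5, 1));  /* sha512su1 */
  }

At runtime the new path is only selected when the HW feature detection
reports both HWF_ARM_NEON and HWF_ARM_SHA512 (see sha512_init_common below).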

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/Makefile.am               |   2 +-
 cipher/sha512-armv8-aarch64-ce.S | 383 +++++++++++++++++++++++++++++++
 cipher/sha512.c                  |  40 +++-
 configure.ac                     |  54 +++++
 4 files changed, 477 insertions(+), 2 deletions(-)
 create mode 100644 cipher/sha512-armv8-aarch64-ce.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 97823cb4..e27bb0bc 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -131,7 +131,7 @@ EXTRA_libcipher_la_SOURCES = \
sha256-intel-shaext.c sha256-ppc.c \
sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \
- sha512-armv7-neon.S sha512-arm.S \
+ sha512-armv7-neon.S sha512-armv8-aarch64-ce.S sha512-arm.S \
sha512-ppc.c sha512-ssse3-i386.c \
sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
diff --git a/cipher/sha512-armv8-aarch64-ce.S b/cipher/sha512-armv8-aarch64-ce.S
new file mode 100644
index 00000000..73fe7ced
--- /dev/null
+++ b/cipher/sha512-armv8-aarch64-ce.S
@@ -0,0 +1,383 @@
+/* sha512-armv8-aarch64-ce.S - ARM/CE accelerated SHA-512 transform function
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4) && \
+ defined(USE_SHA512)
+
+.arch armv8.2-a+sha3+sm4
+
+.text
+
+
+/* Register macros */
+
+#define Qv0 q0
+#define Qv1 q1
+#define Qv2 q2
+#define Qv3 q3
+#define Qv4 q4
+
+#define vT0 v5
+#define vT1 v6
+#define QvT1 q6
+#define vT2 v7
+#define vT3 v16
+
+#define vH01 v17
+#define vH23 v18
+#define vH45 v19
+#define vH67 v20
+
+#define vW0 v21
+#define vW1 v22
+#define vW2 v23
+#define vW3 v24
+#define vW4 v25
+#define vW5 v26
+#define vW6 v27
+#define vW7 v28
+
+#define vK0 v29
+#define vK1 v30
+#define vK2 v31
+
+
+/* Round macros */
+
+#define _(...) /*_*/
+
+#define do_add(a, b) add a.2d, a.2d, b.2d;
+
+#define load_k_3() ld1 {vK0.2d-vK2.2d}, [x3], #48;
+#define load_k_last() ld1 {vK0.2d}, [x3];
+
+#define load_msg1(...) \
+ ld1 {vW0.16b-vW3.16b}, [x1], #64;
+
+#define load_msg2(...) \
+ rev64 vW0.16b, vW0.16b;
+
+#define load_msg3(...) \
+ rev64 vW1.16b, vW1.16b;
+
+#define load_msg4(...) \
+ ld1 {vW4.16b-vW7.16b}, [x1], #64;
+
+#define load_msg5(...) \
+ rev64 vW2.16b, vW2.16b;
+
+#define load_msg6(...) \
+ rev64 vW3.16b, vW3.16b;
+
+#define load_msg7(...) \
+ rev64 vW4.16b, vW4.16b;
+
+#define load_msg8(...) \
+ rev64 vW5.16b, vW5.16b;
+
+#define load_msg9(...) \
+ rev64 vW6.16b, vW6.16b;
+
+#define load_msg10(...) \
+ rev64 vW7.16b, vW7.16b;
+
+#define schedule1(w0, w1, w2, w3, w4, w5, w6, w7) \
+ sha512su0 w0.2d, w1.2d; \
+
+#define schedule2(w0, w1, w2, w3, w4, w5, w6, w7) \
+ ext vT2.16b, w4.16b, w5.16b, #8; \
+ sha512su1 w0.2d, w7.2d, vT2.2d;
+
+#define do_round2(ab, cd, ef, gh, cd_out, \
+ load_nextk_op, k, \
+ sched_op1, sched_op2, w0, w1, w2, w3, w4, w5, w6, w7) \
+ add vT3.2d, k.2d, w0.2d; \
+ load_nextk_op(); \
+ ext vT1.16b, ef.16b, gh.16b, #8; \
+ ext vT3.16b, vT3.16b, vT3.16b, #8; \
+ ext vT0.16b, cd.16b, ef.16b, #8; \
+ add gh.2d, gh.2d, vT3.2d; \
+ sched_op1(w0, w1, w2, w3, w4, w5, w6, w7); \
+ sha512h Q##gh, Q##vT1, vT0.2d; \
+ sched_op2(w0, w1, w2, w3, w4, w5, w6, w7); \
+ add cd_out.2d, gh.2d, cd.2d; \
+ sha512h2 Q##gh, Q##cd, ab.2d; \
+
+
+/* Other functional macros */
+
+#undef CLEAR_REG
+#define CLEAR_REG(reg, ...) movi reg.16b, #0;
+
+
+/*
+ * unsigned int
+ * _gcry_sha512_transform_armv8_ce (u64 state[8], const void *input_data,
+ * size_t num_blks, const u64 k[80])
+ */
+.align 3
+.globl _gcry_sha512_transform_armv8_ce
+ELF(.type _gcry_sha512_transform_armv8_ce,%function;)
+_gcry_sha512_transform_armv8_ce:
+ /* input:
+ * x0: ctx, CTX
+ * x1: data (128*nblks bytes)
+ * x2: nblks
+ * x3: k table
+ */
+ CFI_STARTPROC()
+
+ cbz x2, .Ldo_nothing
+
+ mov x4, x3
+
+ ld1 {vH01.2d-vH67.2d}, [x0] /* load state */
+
+ load_msg1()
+ mov v0.16b, vH01.16b
+ mov v1.16b, vH23.16b
+ load_k_3()
+ load_msg2()
+ load_msg3()
+ load_msg4()
+ mov v2.16b, vH45.16b
+ mov v3.16b, vH67.16b
+ load_msg5()
+ load_msg6()
+ load_msg7()
+ load_msg8()
+ load_msg9()
+ load_msg10()
+
+.Loop:
+ sub x2, x2, #1
+
+ # rounds 1-16
+ do_round2(v0, v1, v2, v3, v4,
+ _, vK0,
+ schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7)
+ do_round2(v3, v0, v4, v2, v1,
+ _, vK1,
+ schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0)
+ do_round2(v2, v3, v1, v4, v0,
+ load_k_3, vK2,
+ schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1)
+ do_round2(v4, v2, v0, v1, v3,
+ _, vK0,
+ schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2)
+ do_round2(v1, v4, v3, v0, v2,
+ _, vK1,
+ schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3)
+ do_round2(v0, v1, v2, v3, v4,
+ load_k_3, vK2,
+ schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4)
+ do_round2(v3, v0, v4, v2, v1,
+ _, vK0,
+ schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5)
+ do_round2(v2, v3, v1, v4, v0,
+ _, vK1,
+ schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6)
+
+ # rounds 17-32
+ do_round2(v4, v2, v0, v1, v3,
+ load_k_3, vK2,
+ schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7)
+ do_round2(v1, v4, v3, v0, v2,
+ _, vK0,
+ schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0)
+ do_round2(v0, v1, v2, v3, v4,
+ _, vK1,
+ schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1)
+ do_round2(v3, v0, v4, v2, v1,
+ load_k_3, vK2,
+ schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2)
+ do_round2(v2, v3, v1, v4, v0,
+ _, vK0,
+ schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3)
+ do_round2(v4, v2, v0, v1, v3,
+ _, vK1,
+ schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4)
+ do_round2(v1, v4, v3, v0, v2,
+ load_k_3, vK2,
+ schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5)
+ do_round2(v0, v1, v2, v3, v4,
+ _, vK0,
+ schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6)
+
+ # rounds 33-48
+ do_round2(v3, v0, v4, v2, v1,
+ _, vK1,
+ schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7)
+ do_round2(v2, v3, v1, v4, v0,
+ load_k_3, vK2,
+ schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0)
+ do_round2(v4, v2, v0, v1, v3,
+ _, vK0,
+ schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1)
+ do_round2(v1, v4, v3, v0, v2,
+ _, vK1,
+ schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2)
+ do_round2(v0, v1, v2, v3, v4,
+ load_k_3, vK2,
+ schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3)
+ do_round2(v3, v0, v4, v2, v1,
+ _, vK0,
+ schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4)
+ do_round2(v2, v3, v1, v4, v0,
+ _, vK1,
+ schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5)
+ do_round2(v4, v2, v0, v1, v3,
+ load_k_3, vK2,
+ schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6)
+
+ # rounds 49-64
+ do_round2(v1, v4, v3, v0, v2,
+ _, vK0,
+ schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7)
+ do_round2(v0, v1, v2, v3, v4,
+ _, vK1,
+ schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0)
+ do_round2(v3, v0, v4, v2, v1,
+ load_k_3, vK2,
+ schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1)
+ do_round2(v2, v3, v1, v4, v0,
+ _, vK0,
+ schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2)
+ do_round2(v4, v2, v0, v1, v3,
+ _, vK1,
+ schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3)
+ do_round2(v1, v4, v3, v0, v2,
+ load_k_3, vK2,
+ schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4)
+ do_round2(v0, v1, v2, v3, v4,
+ _, vK0,
+ schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5)
+ do_round2(v3, v0, v4, v2, v1,
+ _, vK1,
+ schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6)
+
+ cbz x2, .Lend
+
+ # rounds 65-80
+ do_round2(v2, v3, v1, v4, v0,
+ load_k_3, vK2,
+ _, _, vW0, , , , , , , )
+ do_round2(v4, v2, v0, v1, v3,
+ _, vK0,
+ _, _, vW1, , , , , , , )
+ do_round2(v1, v4, v3, v0, v2,
+ _, vK1,
+ _, _, vW2, , , , , , , )
+ do_round2(v0, v1, v2, v3, v4,
+ load_k_3, vK2,
+ _, _, vW3, , , , , , , )
+ do_round2(v3, v0, v4, v2, v1,
+ _, vK0,
+ load_msg1, _, vW4, , , , , , , )
+ do_round2(v2, v3, v1, v4, v0,
+ _, vK1,
+ load_msg2, _, vW5, , , , , , , )
+ do_round2(v4, v2, v0, v1, v3,
+ load_k_last, vK2,
+ load_msg3, _, vW6, , , , , , , )
+ mov x3, x4
+ do_round2(v1, v4, v3, v0, v2,
+ load_k_3, vK0,
+ load_msg4, load_msg5, vW7, , , , , , , )
+
+ load_msg6()
+ load_msg7()
+
+ add vH01.2d, vH01.2d, v0.2d
+ add vH23.2d, vH23.2d, v1.2d
+ add vH45.2d, vH45.2d, v2.2d
+ add vH67.2d, vH67.2d, v3.2d
+ load_msg8()
+ load_msg9()
+ load_msg10()
+ mov v0.16b, vH01.16b
+ mov v1.16b, vH23.16b
+ mov v2.16b, vH45.16b
+ mov v3.16b, vH67.16b
+
+ b .Loop
+
+.Lend:
+
+ # rounds 65-80
+ do_round2(v2, v3, v1, v4, v0,
+ load_k_3, vK2,
+ CLEAR_REG, _, vW0, , , , , , , )
+ do_round2(v4, v2, v0, v1, v3,
+ _, vK0,
+ CLEAR_REG, _, vW1, , , , , , , )
+ do_round2(v1, v4, v3, v0, v2,
+ _, vK1,
+ CLEAR_REG, _, vW2, , , , , , , )
+ do_round2(v0, v1, v2, v3, v4,
+ load_k_3, vK2,
+ CLEAR_REG, _, vW3, , , , , , , )
+ do_round2(v3, v0, v4, v2, v1,
+ _, vK0,
+ CLEAR_REG, _, vW4, , , , , , , )
+ do_round2(v2, v3, v1, v4, v0,
+ _, vK1,
+ CLEAR_REG, _, vW5, , , , , , , )
+ CLEAR_REG(vK1)
+ do_round2(v4, v2, v0, v1, v3,
+ load_k_last, vK2,
+ CLEAR_REG, _, vW6, , , , , , , )
+ CLEAR_REG(vK2)
+ do_round2(v1, v4, v3, v0, v2,
+ _, vK0,
+ CLEAR_REG, _, vW7, , , , , , , )
+ CLEAR_REG(vK0)
+
+ CLEAR_REG(v4)
+ add vH01.2d, vH01.2d, v0.2d
+ CLEAR_REG(v0)
+ add vH23.2d, vH23.2d, v1.2d
+ CLEAR_REG(v1)
+ add vH45.2d, vH45.2d, v2.2d
+ CLEAR_REG(v2)
+ add vH67.2d, vH67.2d, v3.2d
+ CLEAR_REG(v3)
+ CLEAR_REG(vT0)
+ CLEAR_REG(vT1)
+ CLEAR_REG(vT2)
+ CLEAR_REG(vT3)
+
+ st1 {vH01.2d-vH67.2d}, [x0] /* store state */
+
+ CLEAR_REG(vH01)
+ CLEAR_REG(vH23)
+ CLEAR_REG(vH45)
+ CLEAR_REG(vH67)
+
+.Ldo_nothing:
+ mov x0, #0
+ ret_spec_stop
+ CFI_ENDPROC()
+ELF(.size _gcry_sha512_transform_armv8_ce,.-_gcry_sha512_transform_armv8_ce;)
+
+#endif
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 42eaf1fe..9ac412b3 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -55,6 +55,14 @@
#include "hash-common.h"


+/* Helper macro to force alignment to 64 bytes. */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_64 __attribute__ ((aligned (64)))
+#else
+# define ATTR_ALIGNED_64
+#endif
+
+
/* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */
#undef USE_ARM_NEON_ASM
#ifdef ENABLE_NEON_SUPPORT
@@ -72,6 +80,17 @@
# define USE_ARM_ASM 1
#endif

+/* USE_ARM64_SHA512 indicates whether to enable ARMv8 SHA512 extension assembly
+ * code. */
+#undef USE_ARM64_SHA512
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+# if defined(__AARCH64EL__) \
+ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4)
+# define USE_ARM64_SHA512 1
+# endif
+#endif
+

/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
#undef USE_SSSE3
@@ -158,7 +177,7 @@ typedef struct
} SHA512_CONTEXT;


-static const u64 k[] =
+static ATTR_ALIGNED_64 const u64 k[] =
{
U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc),
@@ -219,6 +238,21 @@ static const u64 k[] =
#endif


+#ifdef USE_ARM64_SHA512
+unsigned int _gcry_sha512_transform_armv8_ce (u64 state[8],
+ const unsigned char *data,
+ size_t num_blks,
+ const u64 k[]);
+
+static unsigned int
+do_sha512_transform_armv8_ce(void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA512_CONTEXT *hd = ctx;
+ return _gcry_sha512_transform_armv8_ce (hd->state.h, data, nblks, k);
+}
+#endif
+
#ifdef USE_ARM_NEON_ASM
unsigned int _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd,
const unsigned char *data,
@@ -415,6 +449,10 @@ sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags)
if ((features & HWF_ARM_NEON) != 0)
ctx->bctx.bwrite = do_sha512_transform_armv7_neon;
#endif
+#ifdef USE_ARM64_SHA512
+ if ((features & HWF_ARM_NEON) && (features & HWF_ARM_SHA512))
+ ctx->bctx.bwrite = do_sha512_transform_armv8_ce;
+#endif
#ifdef USE_SSSE3
if ((features & HWF_INTEL_SSSE3) != 0)
ctx->bctx.bwrite = do_sha512_transform_amd64_ssse3;
diff --git a/configure.ac b/configure.ac
index b55510d8..ddba42c0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2054,6 +2054,56 @@ if test "$gcry_cv_gcc_inline_asm_aarch64_sve2" = "yes" ; then
fi


+#
+# Check whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions],
+ [gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4],
+ [if test "$mpi_cpu_arch" != "aarch64" ||
+ test "$try_asm_modules" != "yes" ; then
+ gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4="n/a"
+ else
+ gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4=no
+ AC_LINK_IFELSE([AC_LANG_PROGRAM(
+ [[__asm__(
+ ".arch armv8.2-a+sha3+sm4\n\t"
+ ".text\n\t"
+ "testfn:\n\t"
+
+ /* Test for SHA512 instructions */
+ "sha512h q0, q0, v0.2d;\n\t"
+ "sha512h2 q0, q0, v0.2d;\n\t"
+ "sha512su0 v0.2d, v0.2d;\n\t"
+ "sha512su1 v0.2d, v0.2d, v31.2d;\n\t"
+
+ /* Test for SHA3 instructions */
+ "bcax v0.16b, v1.16b, v2.16b, v3.16b;\n\t"
+ "eor3 v0.16b, v1.16b, v2.16b, v3.16b;\n\t"
+ "rax1 v0.2d, v1.2d, v2.2d;\n\t"
+ "xar v0.2d, v1.2d, v2.2d, \#1;\n\t"
+
+ /* Test for SM3 instructions */
+ "sm3partw1 v0.4s, v1.4s, v2.4s;\n\t"
+ "sm3partw2 v0.4s, v1.4s, v2.4s;\n\t"
+ "sm3ss1 v0.4s, v1.4s, v2.4s, v3.4s;\n\t"
+ "sm3tt1a v0.4s, v1.4s, v2.s[0];\n\t"
+ "sm3tt1b v0.4s, v1.4s, v2.s[0];\n\t"
+ "sm3tt2a v0.4s, v1.4s, v2.s[0];\n\t"
+ "sm3tt2b v0.4s, v1.4s, v2.s[0];\n\t"
+
+ /* Test for SM4 instructions */
+ "sm4e v0.4s, v1.4s;\n\t"
+ "sm4ekey v0.4s, v1.4s, v2.4s;\n\t"
+ );
+ ]], [ testfn(); ])],
+ [gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4=yes])
+ fi])
+if test "$gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4" = "yes" ; then
+ AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4,1,
+ [Defined if inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions])
+fi
+
+
#
# Check whether PowerPC AltiVec/VSX intrinsics
#
@@ -3123,6 +3173,10 @@ if test "$found" = "1" ; then
# Build with the assembly implementation
GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-arm.lo"
;;
+ aarch64-*-*)
+ # Build with the assembly implementation
+ GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-armv8-aarch64-ce.lo"
+ ;;
powerpc64le-*-*)
# Build with the crypto extension implementation
GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo"
--
2.34.1


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@lists.gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel