Mailing List Archive

[PATCH 2/2] camellia: add AArch64 crypto-extension implementation
* cipher/Makefile.am: Add 'camellia-aarch64-ce.(c|o|lo)'.
(aarch64_neon_cflags): New.
* cipher/camellia-aarch64-ce.c: New.
* cipher/camellia-glue.c (USE_AARCH64_CE): New.
(CAMELLIA_context): Add 'use_aarch64ce'.
(_gcry_camellia_aarch64ce_encrypt_blk16)
(_gcry_camellia_aarch64ce_decrypt_blk16)
(_gcry_camellia_aarch64ce_keygen, camellia_aarch64ce_enc_blk16)
(camellia_aarch64ce_dec_blk16, aarch64ce_burn_stack_depth): New.
(camellia_setkey) [USE_AARCH64_CE]: Set use_aarch64ce if HW has
HWF_ARM_AES; Use AArch64/CE key generation if supported by HW.
(camellia_encrypt_blk1_32, camellia_decrypt_blk1_32)
[USE_AARCH64_CE]: Add AArch64/CE code path.
--

This patch enables the 128-bit vector intrinsics implementation of the
Camellia cipher for AArch64.

Benchmark on AWS Graviton2:

Before:
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 5.99 ns/B 159.2 MiB/s 14.97 c/B 2500
ECB dec | 5.99 ns/B 159.1 MiB/s 14.98 c/B 2500
CBC enc | 6.16 ns/B 154.7 MiB/s 15.41 c/B 2500
CBC dec | 6.12 ns/B 155.8 MiB/s 15.29 c/B 2499
CFB enc | 6.49 ns/B 147.0 MiB/s 16.21 c/B 2500
CFB dec | 6.05 ns/B 157.6 MiB/s 15.13 c/B 2500
CTR enc | 6.09 ns/B 156.7 MiB/s 15.22 c/B 2500
CTR dec | 6.09 ns/B 156.6 MiB/s 15.22 c/B 2500
XTS enc | 6.16 ns/B 154.9 MiB/s 15.39 c/B 2500
XTS dec | 6.16 ns/B 154.8 MiB/s 15.40 c/B 2499
GCM enc | 6.31 ns/B 151.1 MiB/s 15.78 c/B 2500
GCM dec | 6.31 ns/B 151.1 MiB/s 15.78 c/B 2500
GCM auth | 0.206 ns/B 4635 MiB/s 0.514 c/B 2500
OCB enc | 6.63 ns/B 143.9 MiB/s 16.57 c/B 2499
OCB dec | 6.63 ns/B 143.9 MiB/s 16.56 c/B 2499
OCB auth | 6.55 ns/B 145.7 MiB/s 16.37 c/B 2499

After (ecb ~2.1x faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 2.77 ns/B 344.2 MiB/s 6.93 c/B 2499
ECB dec | 2.76 ns/B 345.3 MiB/s 6.90 c/B 2499
CBC enc | 6.17 ns/B 154.7 MiB/s 15.41 c/B 2499
CBC dec | 2.89 ns/B 330.3 MiB/s 7.22 c/B 2500
CFB enc | 6.48 ns/B 147.1 MiB/s 16.21 c/B 2499
CFB dec | 2.84 ns/B 336.1 MiB/s 7.09 c/B 2499
CTR enc | 2.90 ns/B 328.8 MiB/s 7.25 c/B 2499
CTR dec | 2.90 ns/B 328.9 MiB/s 7.25 c/B 2500
XTS enc | 2.93 ns/B 325.3 MiB/s 7.33 c/B 2500
XTS dec | 2.92 ns/B 326.2 MiB/s 7.31 c/B 2500
GCM enc | 3.10 ns/B 307.2 MiB/s 7.76 c/B 2500
GCM dec | 3.10 ns/B 307.2 MiB/s 7.76 c/B 2499
GCM auth | 0.206 ns/B 4635 MiB/s 0.514 c/B 2500

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/Makefile.am | 14 ++++-
cipher/camellia-aarch64-ce.c | 42 ++++++++++++++
cipher/camellia-glue.c | 70 +++++++++++++++++++++++
configure.ac | 106 +++++++++++++++++++++++++++++++++--
4 files changed, 227 insertions(+), 5 deletions(-)
create mode 100644 cipher/camellia-aarch64-ce.c

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 52435ed5..dcaa68bb 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -148,7 +148,7 @@ EXTRA_libcipher_la_SOURCES = \
camellia-aesni-avx2-amd64.h \
camellia-gfni-avx2-amd64.S camellia-gfni-avx512-amd64.S \
camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \
- camellia-arm.S camellia-aarch64.S \
+ camellia-arm.S camellia-aarch64.S camellia-aarch64-ce.c \
camellia-simd128.h camellia-ppc8le.c camellia-ppc9le.c \
blake2.c \
blake2b-amd64-avx2.S blake2b-amd64-avx512.S \
@@ -238,6 +238,12 @@ else
ppc_vcrypto_cflags =
endif

+if ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS
+aarch64_neon_cflags = -O2 -march=armv8-a+crypto
+else
+aarch64_neon_cflags =
+endif
+
rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile
`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `

@@ -297,3 +303,9 @@ camellia-ppc9le.o: $(srcdir)/camellia-ppc9le.c Makefile

camellia-ppc9le.lo: $(srcdir)/camellia-ppc9le.c Makefile
`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-aarch64-ce.o: $(srcdir)/camellia-aarch64-ce.c Makefile
+ `echo $(COMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-aarch64-ce.lo: $(srcdir)/camellia-aarch64-ce.c Makefile
+ `echo $(LTCOMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) `
diff --git a/cipher/camellia-aarch64-ce.c b/cipher/camellia-aarch64-ce.c
new file mode 100644
index 00000000..76813e94
--- /dev/null
+++ b/cipher/camellia-aarch64-ce.c
@@ -0,0 +1,42 @@
+/* camellia-aarch64-ce.c - ARMv8/CE Camellia implementation
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+ defined(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS) && \
+ (__GNUC__ >= 4)
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#define SIMD128_OPT_ATTR FUNC_ATTR_OPT
+
+#define FUNC_ENC_BLK16 _gcry_camellia_aarch64ce_encrypt_blk16
+#define FUNC_DEC_BLK16 _gcry_camellia_aarch64ce_decrypt_blk16
+#define FUNC_KEY_SETUP _gcry_camellia_aarch64ce_keygen
+
+#include "camellia-simd128.h"
+
+#endif /* __AARCH64EL__ */
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 46bbe182..0b07f2d1 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -119,6 +119,16 @@
# define USE_PPC_CRYPTO 1
#endif

+/* USE_AARCH64_CE indicates whether to enable ARMv8/CE accelerated code. */
+#undef USE_AARCH64_CE
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+ defined(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS) && \
+ (__GNUC__ >= 4)
+# define USE_AARCH64_CE 1
+#endif
+
typedef struct
{
KEY_TABLE_TYPE keytable;
@@ -138,6 +148,9 @@ typedef struct
unsigned int use_ppc8:1;
unsigned int use_ppc9:1;
#endif /*USE_PPC_CRYPTO*/
+#ifdef USE_AARCH64_CE
+ unsigned int use_aarch64ce:1;
+#endif /*USE_AARCH64_CE*/
} CAMELLIA_context;

/* Assembly implementations use SystemV ABI, ABI conversion and additional
@@ -472,6 +485,36 @@ static const int ppc_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 +
2 * sizeof(void *);
#endif /*USE_PPC_CRYPTO*/

+#ifdef USE_AARCH64_CE
+extern void _gcry_camellia_aarch64ce_encrypt_blk16(const void *key_table,
+ void *out, const void *in,
+ int key_length);
+
+extern void _gcry_camellia_aarch64ce_decrypt_blk16(const void *key_table,
+ void *out, const void *in,
+ int key_length);
+
+extern void _gcry_camellia_aarch64ce_keygen(void *key_table, const void *vkey,
+ unsigned int keylen);
+
+void camellia_aarch64ce_enc_blk16(const CAMELLIA_context *ctx,
+ unsigned char *out, const unsigned char *in)
+{
+ _gcry_camellia_aarch64ce_encrypt_blk16 (ctx->keytable, out, in,
+ ctx->keybitlength / 8);
+}
+
+void camellia_aarch64ce_dec_blk16(const CAMELLIA_context *ctx,
+ unsigned char *out, const unsigned char *in)
+{
+ _gcry_camellia_aarch64ce_decrypt_blk16 (ctx->keytable, out, in,
+ ctx->keybitlength / 8);
+}
+
+static const int aarch64ce_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 +
+ 2 * sizeof(void *);
+#endif /*USE_AARCH64_CE*/
+
static const char *selftest(void);

static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr,
@@ -549,6 +592,9 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
ctx->use_ppc9 = (hwf & HWF_PPC_VCRYPTO) && (hwf & HWF_PPC_ARCH_3_00);
ctx->use_ppc = ctx->use_ppc8 || ctx->use_ppc9;
#endif
+#ifdef USE_AARCH64_CE
+ ctx->use_aarch64ce = (hwf & HWF_ARM_AES) != 0;
+#endif

ctx->keybitlength=keylen*8;

@@ -574,6 +620,10 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
_gcry_camellia_ppc9_keygen(ctx->keytable, key, keylen);
else if (ctx->use_ppc8)
_gcry_camellia_ppc8_keygen(ctx->keytable, key, keylen);
+#endif
+#ifdef USE_AARCH64_CE
+ else if (ctx->use_aarch64ce)
+ _gcry_camellia_aarch64ce_keygen(ctx->keytable, key, keylen);
#endif
else
{
@@ -754,6 +804,16 @@ camellia_encrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf,
num_blks -= 16;
}
#endif
+#ifdef USE_AARCH64_CE
+ while (ctx->use_aarch64ce && num_blks >= 16)
+ {
+ camellia_aarch64ce_enc_blk16 (ctx, outbuf, inbuf);
+ stack_burn_size = aarch64ce_burn_stack_depth;
+ outbuf += CAMELLIA_BLOCK_SIZE * 16;
+ inbuf += CAMELLIA_BLOCK_SIZE * 16;
+ num_blks -= 16;
+ }
+#endif

while (num_blks)
{
@@ -855,6 +915,16 @@ camellia_decrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf,
num_blks -= 16;
}
#endif
+#ifdef USE_AARCH64_CE
+ while (ctx->use_aarch64ce && num_blks >= 16)
+ {
+ camellia_aarch64ce_dec_blk16 (ctx, outbuf, inbuf);
+ stack_burn_size = aarch64ce_burn_stack_depth;
+ outbuf += CAMELLIA_BLOCK_SIZE * 16;
+ inbuf += CAMELLIA_BLOCK_SIZE * 16;
+ num_blks -= 16;
+ }
+#endif

while (num_blks)
{
diff --git a/configure.ac b/configure.ac
index a40a8135..0d5c9160 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2136,7 +2136,103 @@ fi


#
-# Check whether PowerPC AltiVec/VSX intrinsics
+# Check whether compiler supports AArch64/NEON/crypto intrinsics
+#
+AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics],
+ [gcry_cv_cc_aarch64_neon_intrinsics],
+ [if test "$mpi_cpu_arch" != "aarch64" ||
+ test "$try_asm_modules" != "yes" ; then
+ gcry_cv_cc_aarch64_neon_intrinsics="n/a"
+ else
+ gcry_cv_cc_aarch64_neon_intrinsics=no
+ AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+ [[#include <arm_neon.h>
+ #define __m128i uint64x2_t
+ #define vpsrldq128(s, a, o) \
+ ({ uint64x2_t __tmp = { 0, 0 }; \
+ o = (__m128i)vextq_u8((uint8x16_t)a, \
+ (uint8x16_t)__tmp, (s) & 15); })
+ #define vaesenclast128(a, b, o) \
+ (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a))
+ #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory")
+ static inline __attribute__((always_inline)) __m128i
+ fn2(__m128i a)
+ {
+ vpsrldq128(2, a, a);
+ return a;
+ }
+ __m128i fn(__m128i in)
+ {
+ __m128i x;
+ memory_barrier_with_vec(in);
+ x = fn2(in);
+ memory_barrier_with_vec(x);
+ vaesenclast128(in, x, in);
+ memory_barrier_with_vec(in);
+ return in;
+ }
+ ]])],
+ [gcry_cv_cc_aarch64_neon_intrinsics=yes])
+ fi])
+if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "yes" ; then
+ AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1,
+ [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics])
+fi
+
+_gcc_cflags_save=$CFLAGS
+CFLAGS="$CFLAGS -O2 -march=armv8-a+crypto"
+
+if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "no" &&
+ test "$mpi_cpu_arch" = "aarch64" &&
+ test "$try_asm_modules" = "yes" ; then
+ AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags],
+ [gcry_cv_cc_aarch64_neon_intrinsics_cflags],
+ [gcry_cv_cc_aarch64_neon_intrinsics_cflags=no
+ AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+ [[#include <arm_neon.h>
+ #define __m128i uint64x2_t
+ #define vpsrldq128(s, a, o) \
+ ({ uint64x2_t __tmp = { 0, 0 }; \
+ o = (__m128i)vextq_u8((uint8x16_t)a, \
+ (uint8x16_t)__tmp, (s) & 15); })
+ #define vaesenclast128(a, b, o) \
+ (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a))
+ #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory")
+ static inline __attribute__((always_inline)) __m128i
+ fn2(__m128i a)
+ {
+ vpsrldq128(2, a, a);
+ return a;
+ }
+ __m128i fn(__m128i in)
+ {
+ __m128i x;
+ memory_barrier_with_vec(in);
+ x = fn2(in);
+ memory_barrier_with_vec(x);
+ vaesenclast128(in, x, in);
+ memory_barrier_with_vec(in);
+ return in;
+ }
+ ]])],
+ [gcry_cv_cc_aarch64_neon_intrinsics_cflags=yes])])
+ if test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes" ; then
+ AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1,
+ [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics])
+ AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS_WITH_CFLAGS,1,
+ [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags])
+ fi
+fi
+
+AM_CONDITIONAL(ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS,
+ test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes")
+
+# Restore flags.
+CFLAGS=$_gcc_cflags_save;
+
+
+#
+# Check whether compiler supports PowerPC AltiVec/VSX intrinsics
#
AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics],
[gcry_cv_cc_ppc_altivec],
@@ -2173,8 +2269,8 @@ _gcc_cflags_save=$CFLAGS
CFLAGS="$CFLAGS -O2 -maltivec -mvsx -mcrypto"

if test "$gcry_cv_cc_ppc_altivec" = "no" &&
- test "$mpi_cpu_arch" = "ppc" &&
- test "$try_asm_modules" == "yes" ; then
+ test "$mpi_cpu_arch" = "ppc" &&
+ test "$try_asm_modules" = "yes" ; then
AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags],
[gcry_cv_cc_ppc_altivec_cflags],
[gcry_cv_cc_ppc_altivec_cflags=no
@@ -2193,7 +2289,8 @@ if test "$gcry_cv_cc_ppc_altivec" = "no" &&
vecu32 y = vec_vsx_ld (0, (unsigned int*)0);
y = vec_sld_u32 (y, y, 3);
return vec_cipher_be (t, in) ^ (block)y;
- }]])],
+ }
+ ]])],
[gcry_cv_cc_ppc_altivec_cflags=yes])])
if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then
AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1,
@@ -2966,6 +3063,7 @@ if test "$found" = "1" ; then
aarch64-*-*)
# Build with the assembly implementation
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64.lo"
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64-ce.lo"
;;
powerpc64le-*-*)
# Build with the POWER vector implementations
--
2.37.2


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel