[PATCH] Add SM4 ARMv8/AArch64 assembly implementation
* cipher/Makefile.am: Add 'sm4-aarch64.S'.
* cipher/sm4-aarch64.S: New.
* cipher/sm4.c (USE_AARCH64_SIMD): New.
(SM4_context) [USE_AARCH64_SIMD]: Add 'use_aarch64_simd'.
[USE_AARCH64_SIMD] (_gcry_sm4_aarch64_crypt)
(_gcry_sm4_aarch64_cbc_dec, _gcry_sm4_aarch64_cfb_dec)
(_gcry_sm4_aarch64_ctr_enc): New.
(sm4_setkey): Enable ARMv8/AArch64 if supported by HW.
(_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec)
(_gcry_sm4_cfb_dec) [USE_AESNI_AVX2]: Add ARMv8/AArch64 bulk functions.
* configure.ac: Add 'sm4-aarch64.lo'.
--

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
---
cipher/Makefile.am   |   2 +-
cipher/sm4-aarch64.S | 390 +++++++++++++++++++++++++++++++++++++++++++
cipher/sm4.c         |  87 ++++++++++
configure.ac         |   3 +
4 files changed, 481 insertions(+), 1 deletion(-)
create mode 100644 cipher/sm4-aarch64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 264b3d30..6c1c7693 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -116,7 +116,7 @@ EXTRA_libcipher_la_SOURCES = \
scrypt.c \
seed.c \
serpent.c serpent-sse2-amd64.S \
- sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \
+ sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \
serpent-avx2-amd64.S serpent-armv7-neon.S \
sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
diff --git a/cipher/sm4-aarch64.S b/cipher/sm4-aarch64.S
new file mode 100644
index 00000000..f9c828be
--- /dev/null
+++ b/cipher/sm4-aarch64.S
@@ -0,0 +1,390 @@
+/* sm4-aarch64.S - ARMv8/AArch64 accelerated SM4 cipher
+ *
+ * Copyright (C) 2021 Alibaba Group.
+ * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \
+ defined(USE_SM4)
+
+.cpu generic+simd
+
+/* Constants */
+
+.text
+.align 4
+ELF(.type _gcry_sm4_aarch64_consts,@object)
+_gcry_sm4_aarch64_consts:
+.Lsm4_sbox:
+ .byte 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7
+ .byte 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05
+ .byte 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3
+ .byte 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99
+ .byte 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a
+ .byte 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62
+ .byte 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95
+ .byte 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6
+ .byte 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba
+ .byte 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8
+ .byte 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b
+ .byte 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35
+ .byte 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2
+ .byte 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87
+ .byte 0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52
+ .byte 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e
+ .byte 0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5
+ .byte 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1
+ .byte 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55
+ .byte 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3
+ .byte 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60
+ .byte 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f
+ .byte 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f
+ .byte 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51
+ .byte 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f
+ .byte 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8
+ .byte 0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd
+ .byte 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0
+ .byte 0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e
+ .byte 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84
+ .byte 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20
+ .byte 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48
+ELF(.size _gcry_sm4_aarch64_consts,.-_gcry_sm4_aarch64_consts)
+
+/* Register macros */
+
+#define RTMP0 v8
+#define RTMP1 v9
+#define RTMP2 v10
+#define RTMP3 v11
+#define RTMP4 v12
+
+#define RX0 v13
+#define RKEY v14
+#define RIDX v15
+
+/* Helper macros. */
+
+#define preload_sbox(ptr) \
+ GET_DATA_POINTER(ptr, .Lsm4_sbox); \
+ ld1 {v16.16b-v19.16b}, [ptr], #64; \
+ ld1 {v20.16b-v23.16b}, [ptr], #64; \
+ ld1 {v24.16b-v27.16b}, [ptr], #64; \
+ ld1 {v28.16b-v31.16b}, [ptr]; \
+ movi RIDX.16b, #64; /* sizeof(sbox) / 4 */
+
+#define transpose_4x4(s0, s1, s2, s3) \
+ zip1 RTMP0.4s, s0.4s, s1.4s; \
+ zip1 RTMP1.4s, s2.4s, s3.4s; \
+ zip2 RTMP2.4s, s0.4s, s1.4s; \
+ zip2 RTMP3.4s, s2.4s, s3.4s; \
+ zip1 s0.2d, RTMP0.2d, RTMP1.2d; \
+ zip2 s1.2d, RTMP0.2d, RTMP1.2d; \
+ zip1 s2.2d, RTMP2.2d, RTMP3.2d; \
+ zip2 s3.2d, RTMP2.2d, RTMP3.2d;
+
+#define rotate_clockwise_90(s0, s1, s2, s3) \
+ zip1 RTMP0.4s, s1.4s, s0.4s; \
+ zip2 RTMP1.4s, s1.4s, s0.4s; \
+ zip1 RTMP2.4s, s3.4s, s2.4s; \
+ zip2 RTMP3.4s, s3.4s, s2.4s; \
+ zip1 s0.2d, RTMP2.2d, RTMP0.2d; \
+ zip2 s1.2d, RTMP2.2d, RTMP0.2d; \
+ zip1 s2.2d, RTMP3.2d, RTMP1.2d; \
+ zip2 s3.2d, RTMP3.2d, RTMP1.2d;
+
+#define ROUND(round, s0, s1, s2, s3) \
+ dup RX0.4s, RKEY.s[round]; \
+ /* rk ^ s1 ^ s2 ^ s3 */ \
+ eor RTMP1.16b, s2.16b, s3.16b; \
+ eor RX0.16b, RX0.16b, s1.16b; \
+ eor RX0.16b, RX0.16b, RTMP1.16b; \
+ \
+ /* sbox, non-linear part */ \
+ tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
+ sub RX0.16b, RX0.16b, RIDX.16b; \
+ tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
+ sub RX0.16b, RX0.16b, RIDX.16b; \
+ tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
+ sub RX0.16b, RX0.16b, RIDX.16b; \
+ tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
+ \
+ /* linear part */ \
+ shl RTMP1.4s, RTMP0.4s, #8; \
+ shl RTMP2.4s, RTMP0.4s, #16; \
+ shl RTMP3.4s, RTMP0.4s, #24; \
+ sri RTMP1.4s, RTMP0.4s, #(32-8); \
+ sri RTMP2.4s, RTMP0.4s, #(32-16); \
+ sri RTMP3.4s, RTMP0.4s, #(32-24); \
+ /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
+ eor RTMP1.16b, RTMP1.16b, RTMP0.16b; \
+ eor RTMP1.16b, RTMP1.16b, RTMP2.16b; \
+ /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \
+ eor RTMP3.16b, RTMP3.16b, RTMP0.16b; \
+ shl RTMP2.4s, RTMP1.4s, 2; \
+ sri RTMP2.4s, RTMP1.4s, #(32-2); \
+ eor RTMP3.16b, RTMP3.16b, RTMP2.16b; \
+ /* s0 ^= RTMP3 */ \
+ eor s0.16b, s0.16b, RTMP3.16b;
+
+
+ELF(.type __sm4_crypt_blk4,%function;)
+__sm4_crypt_blk4:
+ /* input:
+ * x0: round key array, CTX
+ * v0 v1 v2 v3: four parallel plaintext blocks
+ * output:
+ * v0 v1 v2 v3: four parallel ciphertext blocks
+ */
+ CFI_STARTPROC();
+
+ rev32 v0.16b, v0.16b;
+ rev32 v1.16b, v1.16b;
+ rev32 v2.16b, v2.16b;
+ rev32 v3.16b, v3.16b;
+
+ transpose_4x4(v0, v1, v2, v3);
+
+ mov x6, 8;
+.Lroundloop:
+ ld1 {RKEY.4s}, [x0], #16;
+ ROUND(0, v0, v1, v2, v3);
+ ROUND(1, v1, v2, v3, v0);
+ ROUND(2, v2, v3, v0, v1);
+ ROUND(3, v3, v0, v1, v2);
+
+ subs x6, x6, #1;
+ bne .Lroundloop;
+
+ rotate_clockwise_90(v0, v1, v2, v3);
+ rev32 v0.16b, v0.16b;
+ rev32 v1.16b, v1.16b;
+ rev32 v2.16b, v2.16b;
+ rev32 v3.16b, v3.16b;
+
+ sub x0, x0, #128; /* repoint to rkey */
+ ret;
+ CFI_ENDPROC();
+ELF(.size __sm4_crypt_blk4,.-__sm4_crypt_blk4;)
+
+.global _gcry_sm4_aarch64_crypt
+ELF(.type _gcry_sm4_aarch64_crypt,%function;)
+_gcry_sm4_aarch64_crypt:
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: nblocks (multiples of 4)
+ */
+ CFI_STARTPROC();
+
+ stp x29, x30, [sp, #-16]!;
+ CFI_ADJUST_CFA_OFFSET(16);
+ CFI_REG_ON_STACK(29, 0);
+ CFI_REG_ON_STACK(30, 8);
+
+ preload_sbox(x5);
+
+.Lcrypt_loop_blk4:
+ subs x3, x3, #4;
+ bmi .Lcrypt_end;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ bl __sm4_crypt_blk4;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ b .Lcrypt_loop_blk4;
+
+.Lcrypt_end:
+ ldp x29, x30, [sp], #16;
+ CFI_ADJUST_CFA_OFFSET(-16);
+ CFI_RESTORE(x29);
+ CFI_RESTORE(x30);
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aarch64_crypt,.-_gcry_sm4_aarch64_crypt;)
+
+.global _gcry_sm4_aarch64_cbc_dec
+ELF(.type _gcry_sm4_aarch64_cbc_dec,%function;)
+_gcry_sm4_aarch64_cbc_dec:
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * x4: nblocks (multiples of 4)
+ */
+ CFI_STARTPROC();
+
+ stp x29, x30, [sp, #-16]!;
+ CFI_ADJUST_CFA_OFFSET(16);
+ CFI_REG_ON_STACK(29, 0);
+ CFI_REG_ON_STACK(30, 8);
+
+ preload_sbox(x5);
+ ld1 {RTMP4.16b}, [x3];
+
+.Lcbc_loop_blk4:
+ subs x4, x4, #4;
+ bmi .Lcbc_end;
+
+ ld1 {v0.16b-v3.16b}, [x2];
+
+ bl __sm4_crypt_blk4;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP4.16b;
+ eor v1.16b, v1.16b, RTMP0.16b;
+ eor v2.16b, v2.16b, RTMP1.16b;
+ eor v3.16b, v3.16b, RTMP2.16b;
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ mov RTMP4.16b, RTMP3.16b;
+
+ b .Lcbc_loop_blk4;
+
+.Lcbc_end:
+ /* store new IV */
+ st1 {RTMP4.16b}, [x3];
+
+ ldp x29, x30, [sp], #16;
+ CFI_ADJUST_CFA_OFFSET(-16);
+ CFI_RESTORE(x29);
+ CFI_RESTORE(x30);
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aarch64_cbc_dec,.-_gcry_sm4_aarch64_cbc_dec;)
+
+.global _gcry_sm4_aarch64_cfb_dec
+ELF(.type _gcry_sm4_aarch64_cfb_dec,%function;)
+_gcry_sm4_aarch64_cfb_dec:
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * x4: nblocks (multiples of 4)
+ */
+ CFI_STARTPROC();
+
+ stp x29, x30, [sp, #-16]!;
+ CFI_ADJUST_CFA_OFFSET(16);
+ CFI_REG_ON_STACK(29, 0);
+ CFI_REG_ON_STACK(30, 8);
+
+ preload_sbox(x5);
+ ld1 {v0.16b}, [x3];
+
+.Lcfb_loop_blk4:
+ subs x4, x4, #4;
+ bmi .Lcfb_end;
+
+ ld1 {v1.16b, v2.16b, v3.16b}, [x2];
+
+ bl __sm4_crypt_blk4;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ mov v0.16b, RTMP3.16b;
+
+ b .Lcfb_loop_blk4;
+
+.Lcfb_end:
+ /* store new IV */
+ st1 {v0.16b}, [x3];
+
+ ldp x29, x30, [sp], #16;
+ CFI_ADJUST_CFA_OFFSET(-16);
+ CFI_RESTORE(x29);
+ CFI_RESTORE(x30);
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aarch64_cfb_dec,.-_gcry_sm4_aarch64_cfb_dec;)
+
+.global _gcry_sm4_aarch64_ctr_enc
+ELF(.type _gcry_sm4_aarch64_ctr_enc,%function;)
+_gcry_sm4_aarch64_ctr_enc:
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * x4: nblocks (multiples of 4)
+ */
+ CFI_STARTPROC();
+
+ stp x29, x30, [sp, #-16]!;
+ CFI_ADJUST_CFA_OFFSET(16);
+ CFI_REG_ON_STACK(29, 0);
+ CFI_REG_ON_STACK(30, 8);
+
+ preload_sbox(x5);
+
+ ldp x7, x8, [x3];
+ rev x7, x7;
+ rev x8, x8;
+
+.Lctr_loop_blk4:
+ subs x4, x4, #4;
+ bmi .Lctr_end;
+
+#define inc_le128(vctr) \
+ mov vctr.d[1], x8; \
+ mov vctr.d[0], x7; \
+ adds x8, x8, #1; \
+ adc x7, x7, xzr; \
+ rev64 vctr.16b, vctr.16b;
+
+ /* construct CTRs */
+ inc_le128(v0); /* +0 */
+ inc_le128(v1); /* +1 */
+ inc_le128(v2); /* +2 */
+ inc_le128(v3); /* +3 */
+
+ bl __sm4_crypt_blk4;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ b .Lctr_loop_blk4;
+
+.Lctr_end:
+ /* store new CTR */
+ rev x7, x7;
+ rev x8, x8;
+ stp x7, x8, [x3];
+
+ ldp x29, x30, [sp], #16;
+ CFI_ADJUST_CFA_OFFSET(-16);
+ CFI_RESTORE(x29);
+ CFI_RESTORE(x30);
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aarch64_ctr_enc,.-_gcry_sm4_aarch64_ctr_enc;)
+
+#endif
diff --git a/cipher/sm4.c b/cipher/sm4.c
index 81662988..afcfd61b 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -67,6 +67,15 @@
# endif
#endif

+#undef USE_AARCH64_SIMD
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
+# define USE_AARCH64_SIMD 1
+# endif
+#endif
+
static const char *sm4_selftest (void);

static void _gcry_sm4_ctr_enc (void *context, unsigned char *ctr,
@@ -94,6 +103,9 @@ typedef struct
#ifdef USE_AESNI_AVX2
unsigned int use_aesni_avx2:1;
#endif
+#ifdef USE_AARCH64_SIMD
+ unsigned int use_aarch64_simd:1;
+#endif
} SM4_context;

static const u32 fk[4] =
@@ -241,6 +253,27 @@ extern void _gcry_sm4_aesni_avx2_ocb_auth(const u32 *rk_enc,
const u64 Ls[16]) ASM_FUNC_ABI;
#endif /* USE_AESNI_AVX2 */

+#ifdef USE_AARCH64_SIMD
+extern void _gcry_sm4_aarch64_crypt(const u32 *rk, byte *out,
+ const byte *in,
+ int nblocks);
+
+extern void _gcry_sm4_aarch64_cbc_dec(const u32 *rk_dec, byte *out,
+ const byte *in,
+ byte *iv,
+ int nblocks);
+
+extern void _gcry_sm4_aarch64_cfb_dec(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *iv,
+ int nblocks);
+
+extern void _gcry_sm4_aarch64_ctr_enc(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *ctr,
+ int nblocks);
+#endif /* USE_AARCH64_SIMD */
+
static inline void prefetch_sbox_table(void)
{
const volatile byte *vtab = (void *)&sbox_table;
@@ -372,6 +405,9 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
#ifdef USE_AESNI_AVX2
ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
#endif
+#ifdef USE_AARCH64_SIMD
+ ctx->use_aarch64_simd = !!(hwf & HWF_ARM_NEON);
+#endif

/* Setup bulk encryption routines. */
memset (bulk_ops, 0, sizeof(*bulk_ops));
@@ -553,6 +589,23 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
}
#endif

+#ifdef USE_AARCH64_SIMD
+ if (ctx->use_aarch64_simd)
+ {
+ /* Process multiples of 4 blocks at a time. */
+ if (nblocks >= 4)
+ {
+ size_t nblks = nblocks & ~(4 - 1);
+
+ _gcry_sm4_aarch64_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr, nblks);
+
+ nblocks -= nblks;
+ outbuf += nblks * 16;
+ inbuf += nblks * 16;
+ }
+ }
+#endif
+
/* Process remaining blocks. */
if (nblocks)
{
@@ -654,6 +707,23 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
}
#endif

+#ifdef USE_AARCH64_SIMD
+ if (ctx->use_aarch64_simd)
+ {
+ /* Process multiples of 4 blocks at a time. */
+ if (nblocks >= 4)
+ {
+ size_t nblks = nblocks & ~(4 - 1);
+
+ _gcry_sm4_aarch64_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv, nblks);
+
+ nblocks -= nblks;
+ outbuf += nblks * 16;
+ inbuf += nblks * 16;
+ }
+ }
+#endif
+
/* Process remaining blocks. */
if (nblocks)
{
@@ -748,6 +818,23 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
}
#endif

+#ifdef USE_AARCH64_SIMD
+ if (ctx->use_aarch64_simd)
+ {
+ /* Process multiples of 4 blocks at a time. */
+ if (nblocks >= 4)
+ {
+ size_t nblks = nblocks & ~(4 - 1);
+
+ _gcry_sm4_aarch64_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv, nblks);
+
+ nblocks -= nblks;
+ outbuf += nblks * 16;
+ inbuf += nblks * 16;
+ }
+ }
+#endif
+
/* Process remaining blocks. */
if (nblocks)
{
diff --git a/configure.ac b/configure.ac
index ea01f5a6..89df9434 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2740,6 +2740,9 @@ if test "$found" = "1" ; then
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx-amd64.lo"
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx2-amd64.lo"
;;
+ aarch64-*-*)
+ # Build with the assembly implementation
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aarch64.lo"
esac
fi

--
2.34.1


Re: [PATCH] Add SM4 ARMv8/AArch64 assembly implementation
Hello,

Looks good, just a few comments below...

On 16.2.2022 15.12, Tianjia Zhang wrote:
> * cipher/Makefile.am: Add 'sm4-aarch64.S'.
> * cipher/sm4-aarch64.S: New.
> * cipher/sm4.c (USE_AARCH64_SIMD): New.
> (SM4_context) [USE_AARCH64_SIMD]: Add 'use_aarch64_simd'.
> [USE_AARCH64_SIMD] (_gcry_sm4_aarch64_crypt)
> (_gcry_sm4_aarch64_cbc_dec, _gcry_sm4_aarch64_cfb_dec)
> (_gcry_sm4_aarch64_ctr_enc): New.
> (sm4_setkey): Enable ARMv8/AArch64 if supported by HW.
> (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec)
> (_gcry_sm4_cfb_dec) [USE_AESNI_AVX2]: Add ARMv8/AArch64 bulk functions.

Should be USE_AARCH64_SIMD here.

> * configure.ac: Add 'sm4-aarch64.lo'.
> --
>
> Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
> ---
> cipher/Makefile.am   |   2 +-
> cipher/sm4-aarch64.S | 390 +++++++++++++++++++++++++++++++++++++++++++
> cipher/sm4.c         |  87 ++++++++++
> configure.ac         |   3 +
> 4 files changed, 481 insertions(+), 1 deletion(-)
> create mode 100644 cipher/sm4-aarch64.S
>
> diff --git a/cipher/Makefile.am b/cipher/Makefile.am
> index 264b3d30..6c1c7693 100644
> --- a/cipher/Makefile.am
> +++ b/cipher/Makefile.am
> @@ -116,7 +116,7 @@ EXTRA_libcipher_la_SOURCES = \
> scrypt.c \
> seed.c \
> serpent.c serpent-sse2-amd64.S \
> - sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \
> + sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \
> serpent-avx2-amd64.S serpent-armv7-neon.S \
> sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
> sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
> diff --git a/cipher/sm4-aarch64.S b/cipher/sm4-aarch64.S
> new file mode 100644
> index 00000000..f9c828be
> --- /dev/null
> +++ b/cipher/sm4-aarch64.S
> @@ -0,0 +1,390 @@
> +/* sm4-aarch64.S - ARMv8/AArch64 accelerated SM4 cipher
> + *
> + * Copyright (C) 2021 Alibaba Group.
> + * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
> + *
> + * This file is part of Libgcrypt.
> + *
> + * Libgcrypt is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU Lesser General Public License as
> + * published by the Free Software Foundation; either version 2.1 of
> + * the License, or (at your option) any later version.
> + *
> + * Libgcrypt is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "asm-common-aarch64.h"
> +
> +#if defined(__AARCH64EL__) && \
> + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
> + defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \
> + defined(USE_SM4)
> +
> +.cpu generic+simd
> +
> +/* Constants */
> +
> +.text
> +.align 4
> +ELF(.type _gcry_sm4_aarch64_consts,@object)
> +_gcry_sm4_aarch64_consts:
> +.Lsm4_sbox:
> + .byte 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7
> + .byte 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05
> + .byte 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3
> + .byte 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99
> + .byte 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a
> + .byte 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62
> + .byte 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95
> + .byte 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6
> + .byte 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba
> + .byte 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8
> + .byte 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b
> + .byte 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35
> + .byte 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2
> + .byte 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87
> + .byte 0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52
> + .byte 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e
> + .byte 0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5
> + .byte 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1
> + .byte 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55
> + .byte 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3
> + .byte 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60
> + .byte 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f
> + .byte 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f
> + .byte 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51
> + .byte 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f
> + .byte 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8
> + .byte 0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd
> + .byte 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0
> + .byte 0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e
> + .byte 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84
> + .byte 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20
> + .byte 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48
> +ELF(.size _gcry_sm4_aarch64_consts,.-_gcry_sm4_aarch64_consts)
> +
> +/* Register macros */
> +
> +#define RTMP0 v8
> +#define RTMP1 v9
> +#define RTMP2 v10
> +#define RTMP3 v11
> +#define RTMP4 v12
> +
> +#define RX0 v13
> +#define RKEY v14
> +#define RIDX v15

Vector registers v8 to v15 are being used, so the functions need
to store and restore the d8-d15 registers, as they are ABI
callee-saved. Check the "VPUSH_ABI" and "VPOP_ABI" macros in
"cipher-gcm-armv8-aarch64-ce.S". Those could be moved to
"asm-common-aarch64.h" so that the macros can be shared between
different files.
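
For reference, a minimal sketch of what such save/restore macros
look like (modelled on the existing VPUSH_ABI/VPOP_ABI; the exact
definitions in "cipher-gcm-armv8-aarch64-ce.S" may differ in detail):

#define VPUSH_ABI \
        stp d8, d9, [sp, #-16]!; \
        CFI_ADJUST_CFA_OFFSET(16); \
        stp d10, d11, [sp, #-16]!; \
        CFI_ADJUST_CFA_OFFSET(16); \
        stp d12, d13, [sp, #-16]!; \
        CFI_ADJUST_CFA_OFFSET(16); \
        stp d14, d15, [sp, #-16]!; \
        CFI_ADJUST_CFA_OFFSET(16);

#define VPOP_ABI \
        ldp d14, d15, [sp], #16; \
        CFI_ADJUST_CFA_OFFSET(-16); \
        ldp d12, d13, [sp], #16; \
        CFI_ADJUST_CFA_OFFSET(-16); \
        ldp d10, d11, [sp], #16; \
        CFI_ADJUST_CFA_OFFSET(-16); \
        ldp d8, d9, [sp], #16; \
        CFI_ADJUST_CFA_OFFSET(-16);

VPUSH_ABI would go right after the "stp x29, x30" prologue and
VPOP_ABI right before the matching "ldp x29, x30" in each entry
point that touches v8-v15.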

> +
> +/* Helper macros. */
> +
> +#define preload_sbox(ptr) \
> + GET_DATA_POINTER(ptr, .Lsm4_sbox); \
> + ld1 {v16.16b-v19.16b}, [ptr], #64; \
> + ld1 {v20.16b-v23.16b}, [ptr], #64; \
> + ld1 {v24.16b-v27.16b}, [ptr], #64; \
> + ld1 {v28.16b-v31.16b}, [ptr]; \
> + movi RIDX.16b, #64; /* sizeof(sbox) / 4 */
> +
> +#define transpose_4x4(s0, s1, s2, s3) \
> + zip1 RTMP0.4s, s0.4s, s1.4s; \
> + zip1 RTMP1.4s, s2.4s, s3.4s; \
> + zip2 RTMP2.4s, s0.4s, s1.4s; \
> + zip2 RTMP3.4s, s2.4s, s3.4s; \
> + zip1 s0.2d, RTMP0.2d, RTMP1.2d; \
> + zip2 s1.2d, RTMP0.2d, RTMP1.2d; \
> + zip1 s2.2d, RTMP2.2d, RTMP3.2d; \
> + zip2 s3.2d, RTMP2.2d, RTMP3.2d;
> +
> +#define rotate_clockwise_90(s0, s1, s2, s3) \
> + zip1 RTMP0.4s, s1.4s, s0.4s; \
> + zip2 RTMP1.4s, s1.4s, s0.4s; \
> + zip1 RTMP2.4s, s3.4s, s2.4s; \
> + zip2 RTMP3.4s, s3.4s, s2.4s; \
> + zip1 s0.2d, RTMP2.2d, RTMP0.2d; \
> + zip2 s1.2d, RTMP2.2d, RTMP0.2d; \
> + zip1 s2.2d, RTMP3.2d, RTMP1.2d; \
> + zip2 s3.2d, RTMP3.2d, RTMP1.2d;
> +
> +#define ROUND(round, s0, s1, s2, s3) \
> + dup RX0.4s, RKEY.s[round]; \
> + /* rk ^ s1 ^ s2 ^ s3 */ \
> + eor RTMP1.16b, s2.16b, s3.16b; \
> + eor RX0.16b, RX0.16b, s1.16b; \
> + eor RX0.16b, RX0.16b, RTMP1.16b; \
> + \
> + /* sbox, non-linear part */ \
> + tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
> + sub RX0.16b, RX0.16b, RIDX.16b; \
> + tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
> + sub RX0.16b, RX0.16b, RIDX.16b; \
> + tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
> + sub RX0.16b, RX0.16b, RIDX.16b; \
> + tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
> + \
> + /* linear part */ \
> + shl RTMP1.4s, RTMP0.4s, #8; \
> + shl RTMP2.4s, RTMP0.4s, #16; \
> + shl RTMP3.4s, RTMP0.4s, #24; \
> + sri RTMP1.4s, RTMP0.4s, #(32-8); \
> + sri RTMP2.4s, RTMP0.4s, #(32-16); \
> + sri RTMP3.4s, RTMP0.4s, #(32-24); \
> + /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
> + eor RTMP1.16b, RTMP1.16b, RTMP0.16b; \
> + eor RTMP1.16b, RTMP1.16b, RTMP2.16b; \
> + /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \
> + eor RTMP3.16b, RTMP3.16b, RTMP0.16b; \
> + shl RTMP2.4s, RTMP1.4s, 2; \
> + sri RTMP2.4s, RTMP1.4s, #(32-2); \
> + eor RTMP3.16b, RTMP3.16b, RTMP2.16b; \
> + /* s0 ^= RTMP3 */ \
> + eor s0.16b, s0.16b, RTMP3.16b;
> +
> +
> +ELF(.type __sm4_crypt_blk4,%function;)
> +__sm4_crypt_blk4:
> + /* input:
> + * x0: round key array, CTX
> + * v0 v1 v2 v3: four parallel plaintext blocks
> + * output:
> + * v0 v1 v2 v3: four parallel ciphertext blocks
> + */
> + CFI_STARTPROC();
> +
> + rev32 v0.16b, v0.16b;
> + rev32 v1.16b, v1.16b;
> + rev32 v2.16b, v2.16b;
> + rev32 v3.16b, v3.16b;
> +
> + transpose_4x4(v0, v1, v2, v3);
> +
> + mov x6, 8;
> +.Lroundloop:
> + ld1 {RKEY.4s}, [x0], #16;
> + ROUND(0, v0, v1, v2, v3);
> + ROUND(1, v1, v2, v3, v0);
> + ROUND(2, v2, v3, v0, v1);
> + ROUND(3, v3, v0, v1, v2);
> +
> + subs x6, x6, #1;

A bit of micro-optimization, but this could be moved right after the
"ld1 {RKEY.4s}" above, so that the loop-counter update overlaps with
the vector rounds instead of executing right before the branch.

> + bne .Lroundloop;
> +
> + rotate_clockwise_90(v0, v1, v2, v3);
> + rev32 v0.16b, v0.16b;
> + rev32 v1.16b, v1.16b;
> + rev32 v2.16b, v2.16b;
> + rev32 v3.16b, v3.16b;
> +
> + sub x0, x0, #128; /* repoint to rkey */
> + ret;
> + CFI_ENDPROC();
> +ELF(.size __sm4_crypt_blk4,.-__sm4_crypt_blk4;)
> +
> +.global _gcry_sm4_aarch64_crypt
> +ELF(.type _gcry_sm4_aarch64_crypt,%function;)
> +_gcry_sm4_aarch64_crypt:
> + /* input:
> + * x0: round key array, CTX
> + * x1: dst
> + * x2: src
> + * x3: nblocks (multiples of 4)
> + */
> + CFI_STARTPROC();
> +
> + stp x29, x30, [sp, #-16]!;
> + CFI_ADJUST_CFA_OFFSET(16);
> + CFI_REG_ON_STACK(29, 0);
> + CFI_REG_ON_STACK(30, 8);
> +
> + preload_sbox(x5);
> +
> +.Lcrypt_loop_blk4:
> + subs x3, x3, #4;
> + bmi .Lcrypt_end;
> +
> + ld1 {v0.16b-v3.16b}, [x2], #64;
> + bl __sm4_crypt_blk4;
> + st1 {v0.16b-v3.16b}, [x1], #64;
> + b .Lcrypt_loop_blk4;
> +
> +.Lcrypt_end:
> + ldp x29, x30, [sp], #16;
> + CFI_ADJUST_CFA_OFFSET(-16);
> + CFI_RESTORE(x29);
> + CFI_RESTORE(x30);
> + ret_spec_stop;
> + CFI_ENDPROC();
> +ELF(.size _gcry_sm4_aarch64_crypt,.-_gcry_sm4_aarch64_crypt;)
> +
> +.global _gcry_sm4_aarch64_cbc_dec
> +ELF(.type _gcry_sm4_aarch64_cbc_dec,%function;)
> +_gcry_sm4_aarch64_cbc_dec:
> + /* input:
> + * x0: round key array, CTX
> + * x1: dst
> + * x2: src
> + * x3: iv (big endian, 128 bit)
> + * x4: nblocks (multiples of 4)
> + */
> + CFI_STARTPROC();
> +
> + stp x29, x30, [sp, #-16]!;
> + CFI_ADJUST_CFA_OFFSET(16);
> + CFI_REG_ON_STACK(29, 0);
> + CFI_REG_ON_STACK(30, 8);
> +
> + preload_sbox(x5);
> + ld1 {RTMP4.16b}, [x3];
> +
> +.Lcbc_loop_blk4:
> + subs x4, x4, #4;
> + bmi .Lcbc_end;
> +
> + ld1 {v0.16b-v3.16b}, [x2];
> +
> + bl __sm4_crypt_blk4;
> +
> + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
> + eor v0.16b, v0.16b, RTMP4.16b;
> + eor v1.16b, v1.16b, RTMP0.16b;
> + eor v2.16b, v2.16b, RTMP1.16b;
> + eor v3.16b, v3.16b, RTMP2.16b;
> +
> + st1 {v0.16b-v3.16b}, [x1], #64;
> + mov RTMP4.16b, RTMP3.16b;
> +
> + b .Lcbc_loop_blk4;
> +
> +.Lcbc_end:
> + /* store new IV */
> + st1 {RTMP4.16b}, [x3];
> +
> + ldp x29, x30, [sp], #16;
> + CFI_ADJUST_CFA_OFFSET(-16);
> + CFI_RESTORE(x29);
> + CFI_RESTORE(x30);
> + ret_spec_stop;
> + CFI_ENDPROC();
> +ELF(.size _gcry_sm4_aarch64_cbc_dec,.-_gcry_sm4_aarch64_cbc_dec;)
> +
> +.global _gcry_sm4_aarch64_cfb_dec
> +ELF(.type _gcry_sm4_aarch64_cfb_dec,%function;)
> +_gcry_sm4_aarch64_cfb_dec:
> + /* input:
> + * x0: round key array, CTX
> + * x1: dst
> + * x2: src
> + * x3: iv (big endian, 128 bit)
> + * x4: nblocks (multiples of 4)
> + */
> + CFI_STARTPROC();
> +
> + stp x29, x30, [sp, #-16]!;
> + CFI_ADJUST_CFA_OFFSET(16);
> + CFI_REG_ON_STACK(29, 0);
> + CFI_REG_ON_STACK(30, 8);
> +
> + preload_sbox(x5);
> + ld1 {v0.16b}, [x3];
> +
> +.Lcfb_loop_blk4:
> + subs x4, x4, #4;
> + bmi .Lcfb_end;
> +
> + ld1 {v1.16b, v2.16b, v3.16b}, [x2];
> +
> + bl __sm4_crypt_blk4;
> +
> + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
> + eor v0.16b, v0.16b, RTMP0.16b;
> + eor v1.16b, v1.16b, RTMP1.16b;
> + eor v2.16b, v2.16b, RTMP2.16b;
> + eor v3.16b, v3.16b, RTMP3.16b;
> +
> + st1 {v0.16b-v3.16b}, [x1], #64;
> + mov v0.16b, RTMP3.16b;
> +
> + b .Lcfb_loop_blk4;
> +
> +.Lcfb_end:
> + /* store new IV */
> + st1 {v0.16b}, [x3];
> +
> + ldp x29, x30, [sp], #16;
> + CFI_ADJUST_CFA_OFFSET(-16);
> + CFI_RESTORE(x29);
> + CFI_RESTORE(x30);
> + ret_spec_stop;
> + CFI_ENDPROC();
> +ELF(.size _gcry_sm4_aarch64_cfb_dec,.-_gcry_sm4_aarch64_cfb_dec;)
> +
> +.global _gcry_sm4_aarch64_ctr_enc
> +ELF(.type _gcry_sm4_aarch64_ctr_enc,%function;)
> +_gcry_sm4_aarch64_ctr_enc:
> + /* input:
> + * x0: round key array, CTX
> + * x1: dst
> + * x2: src
> + * x3: ctr (big endian, 128 bit)
> + * x4: nblocks (multiples of 4)
> + */
> + CFI_STARTPROC();
> +
> + stp x29, x30, [sp, #-16]!;
> + CFI_ADJUST_CFA_OFFSET(16);
> + CFI_REG_ON_STACK(29, 0);
> + CFI_REG_ON_STACK(30, 8);
> +
> + preload_sbox(x5);
> +
> + ldp x7, x8, [x3];
> + rev x7, x7;
> + rev x8, x8;
> +
> +.Lctr_loop_blk4:
> + subs x4, x4, #4;
> + bmi .Lctr_end;
> +
> +#define inc_le128(vctr) \
> + mov vctr.d[1], x8; \
> + mov vctr.d[0], x7; \
> + adds x8, x8, #1; \
> + adc x7, x7, xzr; \
> + rev64 vctr.16b, vctr.16b;
> +
> + /* construct CTRs */
> + inc_le128(v0); /* +0 */
> + inc_le128(v1); /* +1 */
> + inc_le128(v2); /* +2 */
> + inc_le128(v3); /* +3 */
> +
> + bl __sm4_crypt_blk4;
> +
> + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
> + eor v0.16b, v0.16b, RTMP0.16b;
> + eor v1.16b, v1.16b, RTMP1.16b;
> + eor v2.16b, v2.16b, RTMP2.16b;
> + eor v3.16b, v3.16b, RTMP3.16b;
> + st1 {v0.16b-v3.16b}, [x1], #64;
> + b .Lctr_loop_blk4;
> +
> +.Lctr_end:
> + /* store new CTR */
> + rev x7, x7;
> + rev x8, x8;
> + stp x7, x8, [x3];
> +
> + ldp x29, x30, [sp], #16;
> + CFI_ADJUST_CFA_OFFSET(-16);
> + CFI_RESTORE(x29);
> + CFI_RESTORE(x30);
> + ret_spec_stop;
> + CFI_ENDPROC();
> +ELF(.size _gcry_sm4_aarch64_ctr_enc,.-_gcry_sm4_aarch64_ctr_enc;)
> +
> +#endif
> diff --git a/cipher/sm4.c b/cipher/sm4.c
> index 81662988..afcfd61b 100644
> --- a/cipher/sm4.c
> +++ b/cipher/sm4.c
> @@ -67,6 +67,15 @@
> # endif
> #endif
>
> +#undef USE_AARCH64_SIMD
> +#ifdef ENABLE_NEON_SUPPORT
> +# if defined(__AARCH64EL__) && \
> + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
> + defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
> +# define USE_AARCH64_SIMD 1
> +# endif
> +#endif
> +
> static const char *sm4_selftest (void);
>
> static void _gcry_sm4_ctr_enc (void *context, unsigned char *ctr,
> @@ -94,6 +103,9 @@ typedef struct
> #ifdef USE_AESNI_AVX2
> unsigned int use_aesni_avx2:1;
> #endif
> +#ifdef USE_AARCH64_SIMD
> + unsigned int use_aarch64_simd:1;
> +#endif
> } SM4_context;
>
> static const u32 fk[4] =
> @@ -241,6 +253,27 @@ extern void _gcry_sm4_aesni_avx2_ocb_auth(const u32 *rk_enc,
> const u64 Ls[16]) ASM_FUNC_ABI;
> #endif /* USE_AESNI_AVX2 */
>
> +#ifdef USE_AARCH64_SIMD
> +extern void _gcry_sm4_aarch64_crypt(const u32 *rk, byte *out,
> + const byte *in,
> +                    int nblocks);
> +
> +extern void _gcry_sm4_aarch64_cbc_dec(const u32 *rk_dec, byte *out,
> + const byte *in,
> + byte *iv,
> + int nblocks);
> +
> +extern void _gcry_sm4_aarch64_cfb_dec(const u32 *rk_enc, byte *out,
> + const byte *in,
> + byte *iv,
> + int nblocks);
> +
> +extern void _gcry_sm4_aarch64_ctr_enc(const u32 *rk_enc, byte *out,
> + const byte *in,
> + byte *ctr,
> + int nblocks);

Use 'size_t' for nblocks. With 'int', Clang can assume that the
target function reads only the low 32 bits of 'nblocks' (that is,
accesses it only through the W3 register) and leave garbage in the
upper 32 bits of the X3 register here, while the assembly loops on
the full 64-bit register.
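
That is, something along these lines (sketch of the intended
declarations only; the assembly side keeps using the full register):

extern void _gcry_sm4_aarch64_crypt(const u32 *rk, byte *out,
                                    const byte *in,
                                    size_t nblocks);

extern void _gcry_sm4_aarch64_ctr_enc(const u32 *rk_enc, byte *out,
                                      const byte *in,
                                      byte *ctr,
                                      size_t nblocks);

/* ... and likewise 'size_t nblocks' for _gcry_sm4_aarch64_cbc_dec
   and _gcry_sm4_aarch64_cfb_dec. */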

-Jussi

Re: [PATCH] Add SM4 ARMv8/AArch64 assembly implementation
Hi Jussi,

On 2/18/22 2:12 AM, Jussi Kivilinna wrote:
> Hello,
>
> Looks good, just a few comments below...
>
> On 16.2.2022 15.12, Tianjia Zhang wrote:
>> * cipher/Makefile.am: Add 'sm4-aarch64.S'.
>> * cipher/sm4-aarch64.S: New.
>> * cipher/sm4.c (USE_AARCH64_SIMD): New.
>> (SM4_context) [USE_AARCH64_SIMD]: Add 'use_aarch64_simd'.
>> [USE_AARCH64_SIMD] (_gcry_sm4_aarch64_crypt)
>> (_gcry_sm4_aarch64_cbc_dec, _gcry_sm4_aarch64_cfb_dec)
>> (_gcry_sm4_aarch64_ctr_enc): New.
>> (sm4_setkey): Enable ARMv8/AArch64 if supported by HW.
>> (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec)
>> (_gcry_sm4_cfb_dec) [USE_AESNI_AVX2]: Add ARMv8/AArch64 bulk functions.
>
> Should be USE_AARCH64_SIMD here.
>
>> * configure.ac: Add 'sm4-aarch64.lo'.

>> +/* Register macros */
>> +
>> +#define RTMP0 v8
>> +#define RTMP1 v9
>> +#define RTMP2 v10
>> +#define RTMP3 v11
>> +#define RTMP4 v12
>> +
>> +#define RX0   v13
>> +#define RKEY  v14
>> +#define RIDX  v15
>
> Vector registers v8 to v15 are being used, so the functions need
> to store and restore the d8-d15 registers, as they are ABI
> callee-saved. Check the "VPUSH_ABI" and "VPOP_ABI" macros in
> "cipher-gcm-armv8-aarch64-ce.S". Those could be moved to
> "asm-common-aarch64.h" so that the macros can be shared between
> different files.
>
>> +

>> +    mov x6, 8;
>> +.Lroundloop:
>> +    ld1 {RKEY.4s}, [x0], #16;
>> +    ROUND(0, v0, v1, v2, v3);
>> +    ROUND(1, v1, v2, v3, v0);
>> +    ROUND(2, v2, v3, v0, v1);
>> +    ROUND(3, v3, v0, v1, v2);
>> +
>> +    subs x6, x6, #1;
>
> A bit of micro-optimization, but this could be moved right after the
> "ld1 {RKEY.4s}" above, so that the loop-counter update overlaps with
> the vector rounds instead of executing right before the branch.
>
>> +    bne .Lroundloop;
>> +
>> +    rotate_clockwise_90(v0, v1, v2, v3);
>> +    rev32 v0.16b, v0.16b;
>> +    rev32 v1.16b, v1.16b;
>> +    rev32 v2.16b, v2.16b;
>> +    rev32 v3.16b, v3.16b;

>>   #endif /* USE_AESNI_AVX2 */
>> +#ifdef USE_AARCH64_SIMD
>> +extern void _gcry_sm4_aarch64_crypt(const u32 *rk, byte *out,
>> +                    const byte *in,
>> +                    int nblocks);
>> +
>> +extern void _gcry_sm4_aarch64_cbc_dec(const u32 *rk_dec, byte *out,
>> +                      const byte *in,
>> +                      byte *iv,
>> +                      int nblocks);
>> +
>> +extern void _gcry_sm4_aarch64_cfb_dec(const u32 *rk_enc, byte *out,
>> +                      const byte *in,
>> +                      byte *iv,
>> +                      int nblocks);
>> +
>> +extern void _gcry_sm4_aarch64_ctr_enc(const u32 *rk_enc, byte *out,
>> +                      const byte *in,
>> +                      byte *ctr,
>> +                      int nblocks);
>
> Use 'size_t' for nblocks. With 'int', Clang can assume that the
> target function reads only the low 32 bits of 'nblocks' (that is,
> accesses it only through the W3 register) and leave garbage in the
> upper 32 bits of the X3 register here, while the assembly loops on
> the full 64-bit register.
>
> -Jussi

Thanks for the suggestions. I will fix the bugs you mentioned,
introduce 8-way acceleration support in the v2 patch, and add a
separate patch to move VPUSH_ABI/VPOP_ABI into the
asm-common-aarch64.h header file.

Best regards,
Tianjia
