Mailing List Archive

[PATCH 2/2] Add SM4 ARMv8/AArch64/CE assembly implementation
* cipher/Makefile.am: Add 'sm4-armv8-aarch64-ce.S'.
* cipher/sm4-armv8-aarch64-ce.S: New.
* cipher/sm4.c (USE_ARM_CE): New.
(SM4_context) [USE_ARM_CE]: Add 'use_arm_ce'.
[USE_ARM_CE] (_gcry_sm4_armv8_ce_expand_key)
(_gcry_sm4_armv8_ce_crypt, _gcry_sm4_armv8_ce_ctr_enc)
(_gcry_sm4_armv8_ce_cbc_dec, _gcry_sm4_armv8_ce_cfb_dec)
(_gcry_sm4_armv8_ce_crypt_blk1_8, sm4_armv8_ce_crypt_blk1_8): New.
(sm4_expand_key) [USE_ARM_CE]: Use ARMv8/AArch64/CE key setup.
(sm4_setkey): Enable ARMv8/AArch64/CE if supported by HW.
(_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec)
(_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth) [USE_ARM_CE]:
Add ARMv8/AArch64/CE bulk functions.
* configure.ac: Add 'sm4-armv8-aarch64-ce.lo'.
--

This patch adds ARMv8/AArch64/CE bulk encryption/decryption. Bulk
functions process eight blocks in parallel.

Benchmark on T-Head Yitian-710 2.75 GHz:

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        CBC enc |     12.10 ns/B     78.79 MiB/s     33.28 c/B      2750
        CBC dec |      4.63 ns/B     205.9 MiB/s     12.74 c/B      2749
        CFB enc |     12.14 ns/B     78.58 MiB/s     33.37 c/B      2750
        CFB dec |      4.64 ns/B     205.5 MiB/s     12.76 c/B      2750
        CTR enc |      4.69 ns/B     203.3 MiB/s     12.90 c/B      2750
        CTR dec |      4.69 ns/B     203.3 MiB/s     12.90 c/B      2750
        GCM enc |      4.88 ns/B     195.4 MiB/s     13.42 c/B      2750
        GCM dec |      4.88 ns/B     195.5 MiB/s     13.42 c/B      2750
       GCM auth |     0.189 ns/B      5048 MiB/s     0.520 c/B      2750
        OCB enc |      4.86 ns/B     196.0 MiB/s     13.38 c/B      2750
        OCB dec |      4.90 ns/B     194.7 MiB/s     13.47 c/B      2750
       OCB auth |      4.79 ns/B     199.0 MiB/s     13.18 c/B      2750

After (16x - 19x faster than ARMv8/AArch64 impl):
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        CBC enc |     12.10 ns/B     78.81 MiB/s     33.27 c/B      2750
        CBC dec |     0.243 ns/B      3921 MiB/s     0.669 c/B      2750
        CFB enc |     12.14 ns/B     78.52 MiB/s     33.39 c/B      2750
        CFB dec |     0.241 ns/B      3963 MiB/s     0.662 c/B      2750
        CTR enc |     0.298 ns/B      3201 MiB/s     0.819 c/B      2749
        CTR dec |     0.298 ns/B      3197 MiB/s     0.820 c/B      2750
        GCM enc |     0.488 ns/B      1956 MiB/s      1.34 c/B      2749
        GCM dec |     0.487 ns/B      1959 MiB/s      1.34 c/B      2750
       GCM auth |     0.189 ns/B      5049 MiB/s     0.519 c/B      2749
        OCB enc |     0.461 ns/B      2069 MiB/s      1.27 c/B      2750
        OCB dec |     0.495 ns/B      1928 MiB/s      1.36 c/B      2750
       OCB auth |     0.385 ns/B      2479 MiB/s      1.06 c/B      2750

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
---
cipher/Makefile.am | 1 +
cipher/sm4-armv8-aarch64-ce.S | 614 ++++++++++++++++++++++++++++++++++
cipher/sm4.c | 142 ++++++++
configure.ac | 1 +
4 files changed, 758 insertions(+)
create mode 100644 cipher/sm4-armv8-aarch64-ce.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index a7cbf3fc..3339c463 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -117,6 +117,7 @@ EXTRA_libcipher_la_SOURCES = \
seed.c \
serpent.c serpent-sse2-amd64.S \
sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \
+ sm4-armv8-aarch64-ce.S \
serpent-avx2-amd64.S serpent-armv7-neon.S \
sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
diff --git a/cipher/sm4-armv8-aarch64-ce.S b/cipher/sm4-armv8-aarch64-ce.S
new file mode 100644
index 00000000..943f0143
--- /dev/null
+++ b/cipher/sm4-armv8-aarch64-ce.S
@@ -0,0 +1,614 @@
+/* sm4-armv8-aarch64-ce.S - ARMv8/AArch64/CE accelerated SM4 cipher
+ *
+ * Copyright (C) 2022 Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+ defined(USE_SM4)
+
+.cpu generic+simd+crypto
+
+.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 24, 25, 26, 27, 28, 29, 30, 31
+ .set .Lv\b\().4s, \b
+.endr
+
+.macro sm4e, vd, vn
+ .inst 0xcec08400 | (.L\vn << 5) | .L\vd
+.endm
+
+.macro sm4ekey, vd, vn, vm
+ .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
+.endm
+
+.text
+
+/* Register macros */
+
+#define RTMP0 v16
+#define RTMP1 v17
+#define RTMP2 v18
+#define RTMP3 v19
+
+#define RIV v20
+
+/* Helper macros. */
+
+#define load_rkey(ptr) \
+ ld1 {v24.16b-v27.16b}, [ptr], #64; \
+ ld1 {v28.16b-v31.16b}, [ptr];
+
+#define crypt_blk4(b0, b1, b2, b3) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ sm4e b0.4s, v24.4s; \
+ sm4e b1.4s, v24.4s; \
+ sm4e b2.4s, v24.4s; \
+ sm4e b3.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b1.4s, v25.4s; \
+ sm4e b2.4s, v25.4s; \
+ sm4e b3.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b1.4s, v26.4s; \
+ sm4e b2.4s, v26.4s; \
+ sm4e b3.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b1.4s, v27.4s; \
+ sm4e b2.4s, v27.4s; \
+ sm4e b3.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b1.4s, v28.4s; \
+ sm4e b2.4s, v28.4s; \
+ sm4e b3.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b1.4s, v29.4s; \
+ sm4e b2.4s, v29.4s; \
+ sm4e b3.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b1.4s, v30.4s; \
+ sm4e b2.4s, v30.4s; \
+ sm4e b3.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ sm4e b1.4s, v31.4s; \
+ sm4e b2.4s, v31.4s; \
+ sm4e b3.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ rev64 b1.4s, b1.4s; \
+ rev64 b2.4s, b2.4s; \
+ rev64 b3.4s, b3.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ ext b1.16b, b1.16b, b1.16b, #8; \
+ ext b2.16b, b2.16b, b2.16b, #8; \
+ ext b3.16b, b3.16b, b3.16b, #8; \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b;
+
+#define crypt_blk8(b0, b1, b2, b3, b4, b5, b6, b7) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b; \
+ sm4e b0.4s, v24.4s; \
+ sm4e b1.4s, v24.4s; \
+ sm4e b2.4s, v24.4s; \
+ sm4e b3.4s, v24.4s; \
+ sm4e b4.4s, v24.4s; \
+ sm4e b5.4s, v24.4s; \
+ sm4e b6.4s, v24.4s; \
+ sm4e b7.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b1.4s, v25.4s; \
+ sm4e b2.4s, v25.4s; \
+ sm4e b3.4s, v25.4s; \
+ sm4e b4.4s, v25.4s; \
+ sm4e b5.4s, v25.4s; \
+ sm4e b6.4s, v25.4s; \
+ sm4e b7.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b1.4s, v26.4s; \
+ sm4e b2.4s, v26.4s; \
+ sm4e b3.4s, v26.4s; \
+ sm4e b4.4s, v26.4s; \
+ sm4e b5.4s, v26.4s; \
+ sm4e b6.4s, v26.4s; \
+ sm4e b7.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b1.4s, v27.4s; \
+ sm4e b2.4s, v27.4s; \
+ sm4e b3.4s, v27.4s; \
+ sm4e b4.4s, v27.4s; \
+ sm4e b5.4s, v27.4s; \
+ sm4e b6.4s, v27.4s; \
+ sm4e b7.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b1.4s, v28.4s; \
+ sm4e b2.4s, v28.4s; \
+ sm4e b3.4s, v28.4s; \
+ sm4e b4.4s, v28.4s; \
+ sm4e b5.4s, v28.4s; \
+ sm4e b6.4s, v28.4s; \
+ sm4e b7.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b1.4s, v29.4s; \
+ sm4e b2.4s, v29.4s; \
+ sm4e b3.4s, v29.4s; \
+ sm4e b4.4s, v29.4s; \
+ sm4e b5.4s, v29.4s; \
+ sm4e b6.4s, v29.4s; \
+ sm4e b7.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b1.4s, v30.4s; \
+ sm4e b2.4s, v30.4s; \
+ sm4e b3.4s, v30.4s; \
+ sm4e b4.4s, v30.4s; \
+ sm4e b5.4s, v30.4s; \
+ sm4e b6.4s, v30.4s; \
+ sm4e b7.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ sm4e b1.4s, v31.4s; \
+ sm4e b2.4s, v31.4s; \
+ sm4e b3.4s, v31.4s; \
+ sm4e b4.4s, v31.4s; \
+ sm4e b5.4s, v31.4s; \
+ sm4e b6.4s, v31.4s; \
+ sm4e b7.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ rev64 b1.4s, b1.4s; \
+ rev64 b2.4s, b2.4s; \
+ rev64 b3.4s, b3.4s; \
+ rev64 b4.4s, b4.4s; \
+ rev64 b5.4s, b5.4s; \
+ rev64 b6.4s, b6.4s; \
+ rev64 b7.4s, b7.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ ext b1.16b, b1.16b, b1.16b, #8; \
+ ext b2.16b, b2.16b, b2.16b, #8; \
+ ext b3.16b, b3.16b, b3.16b, #8; \
+ ext b4.16b, b4.16b, b4.16b, #8; \
+ ext b5.16b, b5.16b, b5.16b, #8; \
+ ext b6.16b, b6.16b, b6.16b, #8; \
+ ext b7.16b, b7.16b, b7.16b, #8; \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b;
+
+
+.align 3
+.global _gcry_sm4_armv8_ce_expand_key
+ELF(.type _gcry_sm4_armv8_ce_expand_key,%function;)
+_gcry_sm4_armv8_ce_expand_key:
+ /* input:
+ * x0: 128-bit key
+ * x1: rkey_enc
+ * x2: rkey_dec
+ * x3: fk array
+ * x4: ck array
+ */
+ CFI_STARTPROC();
+
+ ld1 {v0.16b}, [x0];
+ rev32 v0.16b, v0.16b;
+ ld1 {v1.16b}, [x3];
+ load_rkey(x4);
+
+ /* input ^ fk */
+ eor v0.16b, v0.16b, v1.16b;
+
+ sm4ekey v0.4s, v0.4s, v24.4s;
+ sm4ekey v1.4s, v0.4s, v25.4s;
+ sm4ekey v2.4s, v1.4s, v26.4s;
+ sm4ekey v3.4s, v2.4s, v27.4s;
+ sm4ekey v4.4s, v3.4s, v28.4s;
+ sm4ekey v5.4s, v4.4s, v29.4s;
+ sm4ekey v6.4s, v5.4s, v30.4s;
+ sm4ekey v7.4s, v6.4s, v31.4s;
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b-v7.16b}, [x1];
+ rev64 v7.4s, v7.4s;
+ rev64 v6.4s, v6.4s;
+ rev64 v5.4s, v5.4s;
+ rev64 v4.4s, v4.4s;
+ rev64 v3.4s, v3.4s;
+ rev64 v2.4s, v2.4s;
+ rev64 v1.4s, v1.4s;
+ rev64 v0.4s, v0.4s;
+ ext v7.16b, v7.16b, v7.16b, #8;
+ ext v6.16b, v6.16b, v6.16b, #8;
+ ext v5.16b, v5.16b, v5.16b, #8;
+ ext v4.16b, v4.16b, v4.16b, #8;
+ ext v3.16b, v3.16b, v3.16b, #8;
+ ext v2.16b, v2.16b, v2.16b, #8;
+ ext v1.16b, v1.16b, v1.16b, #8;
+ ext v0.16b, v0.16b, v0.16b, #8;
+ st1 {v7.16b}, [x2], #16;
+ st1 {v6.16b}, [x2], #16;
+ st1 {v5.16b}, [x2], #16;
+ st1 {v4.16b}, [x2], #16;
+ st1 {v3.16b}, [x2], #16;
+ st1 {v2.16b}, [x2], #16;
+ st1 {v1.16b}, [x2], #16;
+ st1 {v0.16b}, [x2];
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv8_ce_expand_key,.-_gcry_sm4_armv8_ce_expand_key;)
+
+.align 3
+ELF(.type sm4_armv8_ce_crypt_blk1_4,%function;)
+sm4_armv8_ce_crypt_blk1_4:
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: num blocks (1..4)
+ */
+ CFI_STARTPROC();
+ VPUSH_ABI;
+
+ load_rkey(x0);
+
+ ld1 {v0.16b}, [x2], #16;
+ mov v1.16b, v0.16b;
+ mov v2.16b, v0.16b;
+ mov v3.16b, v0.16b;
+ cmp x3, #2;
+ blt .Lblk4_load_input_done;
+ ld1 {v1.16b}, [x2], #16;
+ beq .Lblk4_load_input_done;
+ ld1 {v2.16b}, [x2], #16;
+ cmp x3, #3;
+ beq .Lblk4_load_input_done;
+ ld1 {v3.16b}, [x2];
+
+.Lblk4_load_input_done:
+ crypt_blk4(v0, v1, v2, v3);
+
+ st1 {v0.16b}, [x1], #16;
+ cmp x3, #2;
+ blt .Lblk4_store_output_done;
+ st1 {v1.16b}, [x1], #16;
+ beq .Lblk4_store_output_done;
+ st1 {v2.16b}, [x1], #16;
+ cmp x3, #3;
+ beq .Lblk4_store_output_done;
+ st1 {v3.16b}, [x1];
+
+.Lblk4_store_output_done:
+ VPOP_ABI;
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size sm4_armv8_ce_crypt_blk1_4,.-sm4_armv8_ce_crypt_blk1_4;)
+
+.align 3
+.global _gcry_sm4_armv8_ce_crypt_blk1_8
+ELF(.type _gcry_sm4_armv8_ce_crypt_blk1_8,%function;)
+_gcry_sm4_armv8_ce_crypt_blk1_8:
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: num blocks (1..8)
+ */
+ CFI_STARTPROC();
+
+ cmp x3, #5;
+ blt sm4_armv8_ce_crypt_blk1_4;
+
+ stp x29, x30, [sp, #-16]!;
+ CFI_ADJUST_CFA_OFFSET(16);
+ CFI_REG_ON_STACK(29, 0);
+ CFI_REG_ON_STACK(30, 8);
+ VPUSH_ABI;
+
+ load_rkey(x0);
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b}, [x2], #16;
+ mov v5.16b, v4.16b;
+ mov v6.16b, v4.16b;
+ mov v7.16b, v4.16b;
+ beq .Lblk8_load_input_done;
+ ld1 {v5.16b}, [x2], #16;
+ cmp x3, #7;
+ blt .Lblk8_load_input_done;
+ ld1 {v6.16b}, [x2], #16;
+ beq .Lblk8_load_input_done;
+ ld1 {v7.16b}, [x2];
+
+.Lblk8_load_input_done:
+ crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ cmp x3, #6;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b}, [x1], #16;
+ blt .Lblk8_store_output_done;
+ st1 {v5.16b}, [x1], #16;
+ beq .Lblk8_store_output_done;
+ st1 {v6.16b}, [x1], #16;
+ cmp x3, #7;
+ beq .Lblk8_store_output_done;
+ st1 {v7.16b}, [x1];
+
+.Lblk8_store_output_done:
+ VPOP_ABI;
+ ldp x29, x30, [sp], #16;
+ CFI_ADJUST_CFA_OFFSET(-16);
+ CFI_RESTORE(x29);
+ CFI_RESTORE(x30);
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv8_ce_crypt_blk1_8,.-_gcry_sm4_armv8_ce_crypt_blk1_8;)
+
+.align 3
+.global _gcry_sm4_armv8_ce_crypt
+ELF(.type _gcry_sm4_armv8_ce_crypt,%function;)
+_gcry_sm4_armv8_ce_crypt:
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: nblocks (multiples of 8)
+ */
+ CFI_STARTPROC();
+
+ load_rkey(x0);
+
+.Lcrypt_loop_blk:
+ subs x3, x3, #8;
+ bmi .Lcrypt_end;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2], #64;
+
+ crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ b .Lcrypt_loop_blk;
+
+.Lcrypt_end:
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv8_ce_crypt,.-_gcry_sm4_armv8_ce_crypt;)
+
+.align 3
+.global _gcry_sm4_armv8_ce_cbc_dec
+ELF(.type _gcry_sm4_armv8_ce_cbc_dec,%function;)
+_gcry_sm4_armv8_ce_cbc_dec:
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * x4: nblocks (multiples of 8)
+ */
+ CFI_STARTPROC();
+
+ stp x29, x30, [sp, #-16]!;
+ CFI_ADJUST_CFA_OFFSET(16);
+ CFI_REG_ON_STACK(29, 0);
+ CFI_REG_ON_STACK(30, 8);
+ VPUSH_ABI;
+
+ load_rkey(x0);
+ ld1 {RIV.16b}, [x3];
+
+.Lcbc_loop_blk:
+ subs x4, x4, #8;
+ bmi .Lcbc_end;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2];
+
+ crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ sub x2, x2, #64;
+ eor v0.16b, v0.16b, RIV.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v1.16b, v1.16b, RTMP0.16b;
+ eor v2.16b, v2.16b, RTMP1.16b;
+ eor v3.16b, v3.16b, RTMP2.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ eor v4.16b, v4.16b, RTMP3.16b;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v5.16b, v5.16b, RTMP0.16b;
+ eor v6.16b, v6.16b, RTMP1.16b;
+ eor v7.16b, v7.16b, RTMP2.16b;
+
+ mov RIV.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ b .Lcbc_loop_blk;
+
+.Lcbc_end:
+ /* store new IV */
+ st1 {RIV.16b}, [x3];
+
+ VPOP_ABI;
+ ldp x29, x30, [sp], #16;
+ CFI_ADJUST_CFA_OFFSET(-16);
+ CFI_RESTORE(x29);
+ CFI_RESTORE(x30);
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv8_ce_cbc_dec,.-_gcry_sm4_armv8_ce_cbc_dec;)
+
+.align 3
+.global _gcry_sm4_armv8_ce_cfb_dec
+ELF(.type _gcry_sm4_armv8_ce_cfb_dec,%function;)
+_gcry_sm4_armv8_ce_cfb_dec:
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * x4: nblocks (multiples of 8)
+ */
+ CFI_STARTPROC();
+
+ stp x29, x30, [sp, #-16]!;
+ CFI_ADJUST_CFA_OFFSET(16);
+ CFI_REG_ON_STACK(29, 0);
+ CFI_REG_ON_STACK(30, 8);
+ VPUSH_ABI;
+
+ load_rkey(x0);
+ ld1 {v0.16b}, [x3];
+
+.Lcfb_loop_blk:
+ subs x4, x4, #8;
+ bmi .Lcfb_end;
+
+ ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
+ ld1 {v4.16b-v7.16b}, [x2];
+
+ crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ sub x2, x2, #48;
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v4.16b, v4.16b, RTMP0.16b;
+ eor v5.16b, v5.16b, RTMP1.16b;
+ eor v6.16b, v6.16b, RTMP2.16b;
+ eor v7.16b, v7.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ mov v0.16b, RTMP3.16b;
+
+ b .Lcfb_loop_blk;
+
+.Lcfb_end:
+ /* store new IV */
+ st1 {v0.16b}, [x3];
+
+ VPOP_ABI;
+ ldp x29, x30, [sp], #16;
+ CFI_ADJUST_CFA_OFFSET(-16);
+ CFI_RESTORE(x29);
+ CFI_RESTORE(x30);
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv8_ce_cfb_dec,.-_gcry_sm4_armv8_ce_cfb_dec;)
+
+.align 3
+.global _gcry_sm4_armv8_ce_ctr_enc
+ELF(.type _gcry_sm4_armv8_ce_ctr_enc,%function;)
+_gcry_sm4_armv8_ce_ctr_enc:
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * x4: nblocks (multiples of 8)
+ */
+ CFI_STARTPROC();
+
+ stp x29, x30, [sp, #-16]!;
+ CFI_ADJUST_CFA_OFFSET(16);
+ CFI_REG_ON_STACK(29, 0);
+ CFI_REG_ON_STACK(30, 8);
+ VPUSH_ABI;
+
+ load_rkey(x0);
+
+ ldp x7, x8, [x3];
+ rev x7, x7;
+ rev x8, x8;
+
+.Lctr_loop_blk:
+ subs x4, x4, #8;
+ bmi .Lctr_end;
+
+#define inc_le128(vctr) \
+ mov vctr.d[1], x8; \
+ mov vctr.d[0], x7; \
+ adds x8, x8, #1; \
+ adc x7, x7, xzr; \
+ rev64 vctr.16b, vctr.16b;
+
+ /* construct CTRs */
+ inc_le128(v0); /* +0 */
+ inc_le128(v1); /* +1 */
+ inc_le128(v2); /* +2 */
+ inc_le128(v3); /* +3 */
+ inc_le128(v4); /* +4 */
+ inc_le128(v5); /* +5 */
+ inc_le128(v6); /* +6 */
+ inc_le128(v7); /* +7 */
+
+ crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v0.16b, v0.16b, RTMP0.16b;
+ eor v1.16b, v1.16b, RTMP1.16b;
+ eor v2.16b, v2.16b, RTMP2.16b;
+ eor v3.16b, v3.16b, RTMP3.16b;
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
+ eor v4.16b, v4.16b, RTMP0.16b;
+ eor v5.16b, v5.16b, RTMP1.16b;
+ eor v6.16b, v6.16b, RTMP2.16b;
+ eor v7.16b, v7.16b, RTMP3.16b;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ b .Lctr_loop_blk;
+
+.Lctr_end:
+ /* store new CTR */
+ rev x7, x7;
+ rev x8, x8;
+ stp x7, x8, [x3];
+
+ VPOP_ABI;
+ ldp x29, x30, [sp], #16;
+ CFI_ADJUST_CFA_OFFSET(-16);
+ CFI_RESTORE(x29);
+ CFI_RESTORE(x30);
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_armv8_ce_ctr_enc,.-_gcry_sm4_armv8_ce_ctr_enc;)
+
+#endif
diff --git a/cipher/sm4.c b/cipher/sm4.c
index ec2281b6..37b9e210 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -76,6 +76,15 @@
# endif
#endif

+#undef USE_ARM_CE
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+# if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+# define USE_ARM_CE 1
+# endif
+#endif
+
static const char *sm4_selftest (void);

static void _gcry_sm4_ctr_enc (void *context, unsigned char *ctr,
@@ -106,6 +115,9 @@ typedef struct
#ifdef USE_AARCH64_SIMD
unsigned int use_aarch64_simd:1;
#endif
+#ifdef USE_ARM_CE
+ unsigned int use_arm_ce:1;
+#endif
} SM4_context;

static const u32 fk[4] =
@@ -286,6 +298,43 @@ sm4_aarch64_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
}
#endif /* USE_AARCH64_SIMD */

+#ifdef USE_ARM_CE
+extern void _gcry_sm4_armv8_ce_expand_key(const byte *key,
+ u32 *rkey_enc, u32 *rkey_dec,
+ const u32 *fk, const u32 *ck);
+
+extern void _gcry_sm4_armv8_ce_crypt(const u32 *rk, byte *out,
+ const byte *in,
+ size_t num_blocks);
+
+extern void _gcry_sm4_armv8_ce_ctr_enc(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *ctr,
+ size_t nblocks);
+
+extern void _gcry_sm4_armv8_ce_cbc_dec(const u32 *rk_dec, byte *out,
+ const byte *in,
+ byte *iv,
+ size_t nblocks);
+
+extern void _gcry_sm4_armv8_ce_cfb_dec(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *iv,
+ size_t nblocks);
+
+extern void _gcry_sm4_armv8_ce_crypt_blk1_8(const u32 *rk, byte *out,
+ const byte *in,
+ size_t num_blocks);
+
+static inline unsigned int
+sm4_armv8_ce_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks)
+{
+ _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, (size_t)num_blks);
+ return 0;
+}
+#endif /* USE_ARM_CE */
+
static inline void prefetch_sbox_table(void)
{
const volatile byte *vtab = (void *)&sbox_table;
@@ -363,6 +412,15 @@ sm4_expand_key (SM4_context *ctx, const byte *key)
}
#endif

+#ifdef USE_ARM_CE
+ if (ctx->use_arm_ce)
+ {
+ _gcry_sm4_armv8_ce_expand_key (key, ctx->rkey_enc, ctx->rkey_dec,
+ fk, ck);
+ return;
+ }
+#endif
+
rk[0] = buf_get_be32(key + 4 * 0) ^ fk[0];
rk[1] = buf_get_be32(key + 4 * 1) ^ fk[1];
rk[2] = buf_get_be32(key + 4 * 2) ^ fk[2];
@@ -420,6 +478,9 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
#ifdef USE_AARCH64_SIMD
ctx->use_aarch64_simd = !!(hwf & HWF_ARM_NEON);
#endif
+#ifdef USE_ARM_CE
+ ctx->use_arm_ce = !!(hwf & HWF_ARM_SM4);
+#endif

/* Setup bulk encryption routines. */
memset (bulk_ops, 0, sizeof(*bulk_ops));
@@ -601,6 +662,23 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
}
#endif

+#ifdef USE_ARM_CE
+ if (ctx->use_arm_ce)
+ {
+ /* Process multiples of 8 blocks at a time. */
+ if (nblocks >= 8)
+ {
+ size_t nblks = nblocks & ~(8 - 1);
+
+ _gcry_sm4_armv8_ce_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr, nblks);
+
+ nblocks -= nblks;
+ outbuf += nblks * 16;
+ inbuf += nblks * 16;
+ }
+ }
+#endif
+
#ifdef USE_AARCH64_SIMD
if (ctx->use_aarch64_simd)
{
@@ -634,6 +712,12 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
}
#endif
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ crypt_blk1_8 = sm4_armv8_ce_crypt_blk1_8;
+ }
+#endif
#ifdef USE_AARCH64_SIMD
else if (ctx->use_aarch64_simd)
{
@@ -725,6 +809,23 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
}
#endif

+#ifdef USE_ARM_CE
+ if (ctx->use_arm_ce)
+ {
+ /* Process multiples of 8 blocks at a time. */
+ if (nblocks >= 8)
+ {
+ size_t nblks = nblocks & ~(8 - 1);
+
+ _gcry_sm4_armv8_ce_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv, nblks);
+
+ nblocks -= nblks;
+ outbuf += nblks * 16;
+ inbuf += nblks * 16;
+ }
+ }
+#endif
+
#ifdef USE_AARCH64_SIMD
if (ctx->use_aarch64_simd)
{
@@ -758,6 +859,12 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
}
#endif
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ crypt_blk1_8 = sm4_armv8_ce_crypt_blk1_8;
+ }
+#endif
#ifdef USE_AARCH64_SIMD
else if (ctx->use_aarch64_simd)
{
@@ -842,6 +949,23 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
}
#endif

+#ifdef USE_ARM_CE
+ if (ctx->use_arm_ce)
+ {
+ /* Process multiples of 8 blocks at a time. */
+ if (nblocks >= 8)
+ {
+ size_t nblks = nblocks & ~(8 - 1);
+
+ _gcry_sm4_armv8_ce_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv, nblks);
+
+ nblocks -= nblks;
+ outbuf += nblks * 16;
+ inbuf += nblks * 16;
+ }
+ }
+#endif
+
#ifdef USE_AARCH64_SIMD
if (ctx->use_aarch64_simd)
{
@@ -875,6 +999,12 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
}
#endif
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ crypt_blk1_8 = sm4_armv8_ce_crypt_blk1_8;
+ }
+#endif
#ifdef USE_AARCH64_SIMD
else if (ctx->use_aarch64_simd)
{
@@ -1037,6 +1167,12 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
}
#endif
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ crypt_blk1_8 = sm4_armv8_ce_crypt_blk1_8;
+ }
+#endif
#ifdef USE_AARCH64_SIMD
else if (ctx->use_aarch64_simd)
{
@@ -1203,6 +1339,12 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
}
#endif
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ crypt_blk1_8 = sm4_armv8_ce_crypt_blk1_8;
+ }
+#endif
#ifdef USE_AARCH64_SIMD
else if (ctx->use_aarch64_simd)
{
diff --git a/configure.ac b/configure.ac
index f5363f22..e20f9d13 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2755,6 +2755,7 @@ if test "$found" = "1" ; then
aarch64-*-*)
# Build with the assembly implementation
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aarch64.lo"
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv8-aarch64-ce.lo"
esac
fi

--
2.34.1


Re: [PATCH 2/2] Add SM4 ARMv8/AArch64/CE assembly implementation
On 25.2.2022 9.41, Tianjia Zhang wrote:
> * cipher/Makefile.am: Add 'sm4-armv8-aarch64-ce.S'.
> * cipher/sm4-armv8-aarch64-ce.S: New.
> * cipher/sm4.c (USE_ARM_CE): New.
> (SM4_context) [USE_ARM_CE]: Add 'use_arm_ce'.
> [USE_ARM_CE] (_gcry_sm4_armv8_ce_expand_key)
> (_gcry_sm4_armv8_ce_crypt, _gcry_sm4_armv8_ce_ctr_enc)
> (_gcry_sm4_armv8_ce_cbc_dec, _gcry_sm4_armv8_ce_cfb_dec)
> (_gcry_sm4_armv8_ce_crypt_blk1_8, sm4_armv8_ce_crypt_blk1_8): New.
> (sm4_expand_key) [USE_ARM_CE]: Use ARMv8/AArch64/CE key setup.
> (sm4_setkey): Enable ARMv8/AArch64/CE if supported by HW.
> (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec)
> (_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth) [USE_ARM_CE]:
> Add ARMv8/AArch64/CE bulk functions.
> * configure.ac: Add 'sm4-armv8-aarch64-ce.lo'.
> --
>
> This patch adds ARMv8/AArch64/CE bulk encryption/decryption. Bulk
> functions process eight blocks in parallel.
>
> Benchmark on T-Head Yitian-710 2.75 GHz:
>
> Before:
> SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
> CBC enc | 12.10 ns/B 78.79 MiB/s 33.28 c/B 2750
> CBC dec | 4.63 ns/B 205.9 MiB/s 12.74 c/B 2749
> CFB enc | 12.14 ns/B 78.58 MiB/s 33.37 c/B 2750
> CFB dec | 4.64 ns/B 205.5 MiB/s 12.76 c/B 2750
> CTR enc | 4.69 ns/B 203.3 MiB/s 12.90 c/B 2750
> CTR dec | 4.69 ns/B 203.3 MiB/s 12.90 c/B 2750
> GCM enc | 4.88 ns/B 195.4 MiB/s 13.42 c/B 2750
> GCM dec | 4.88 ns/B 195.5 MiB/s 13.42 c/B 2750
> GCM auth | 0.189 ns/B 5048 MiB/s 0.520 c/B 2750
> OCB enc | 4.86 ns/B 196.0 MiB/s 13.38 c/B 2750
> OCB dec | 4.90 ns/B 194.7 MiB/s 13.47 c/B 2750
> OCB auth | 4.79 ns/B 199.0 MiB/s 13.18 c/B 2750
>
> After (16x - 19x faster than ARMv8/AArch64 impl):
> SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
> CBC enc | 12.10 ns/B 78.81 MiB/s 33.27 c/B 2750
> CBC dec | 0.243 ns/B 3921 MiB/s 0.669 c/B 2750

This implementation is actually so much faster than the generic C code that `_gcry_sm4_armv8_ce_crypt_blk1_8` could be used in `sm4_encrypt` and `sm4_decrypt` to speed up single-block operations (CBC encryption, etc.) ...

  static unsigned int
  sm4_encrypt (void *context, byte *outbuf, const byte *inbuf)
  {
    SM4_context *ctx = context;

  #ifdef USE_ARM_CE
    if (ctx->use_arm_ce)
      return sm4_armv8_ce_crypt_blk1_8 (ctx->rkey_enc, outbuf, inbuf, 1);
  #endif
  ...

> CFB enc | 12.14 ns/B 78.52 MiB/s 33.39 c/B 2750
> CFB dec | 0.241 ns/B 3963 MiB/s 0.662 c/B 2750
> CTR enc | 0.298 ns/B 3201 MiB/s 0.819 c/B 2749
> CTR dec | 0.298 ns/B 3197 MiB/s 0.820 c/B 2750
> GCM enc | 0.488 ns/B 1956 MiB/s 1.34 c/B 2749
> GCM dec | 0.487 ns/B 1959 MiB/s 1.34 c/B 2750
> GCM auth | 0.189 ns/B 5049 MiB/s 0.519 c/B 2749
> OCB enc | 0.461 ns/B 2069 MiB/s 1.27 c/B 2750
> OCB dec | 0.495 ns/B 1928 MiB/s 1.36 c/B 2750
> OCB auth | 0.385 ns/B 2479 MiB/s 1.06 c/B 2750
>
> Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
> ---
> cipher/Makefile.am | 1 +
> cipher/sm4-armv8-aarch64-ce.S | 614 ++++++++++++++++++++++++++++++++++
> cipher/sm4.c | 142 ++++++++
> configure.ac | 1 +
> 4 files changed, 758 insertions(+)
> create mode 100644 cipher/sm4-armv8-aarch64-ce.S
>
> diff --git a/cipher/Makefile.am b/cipher/Makefile.am
> index a7cbf3fc..3339c463 100644
> --- a/cipher/Makefile.am
> +++ b/cipher/Makefile.am
> @@ -117,6 +117,7 @@ EXTRA_libcipher_la_SOURCES = \
> seed.c \
> serpent.c serpent-sse2-amd64.S \
> sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \
> + sm4-armv8-aarch64-ce.S \
> serpent-avx2-amd64.S serpent-armv7-neon.S \
> sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
> sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
> diff --git a/cipher/sm4-armv8-aarch64-ce.S b/cipher/sm4-armv8-aarch64-ce.S
> new file mode 100644
> index 00000000..943f0143
> --- /dev/null
> +++ b/cipher/sm4-armv8-aarch64-ce.S
> @@ -0,0 +1,614 @@
> +/* sm4-armv8-aarch64-ce.S - ARMv8/AArch64/CE accelerated SM4 cipher
> + *
> + * Copyright (C) 2022 Alibaba Group.
> + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
> + *
> + * This file is part of Libgcrypt.
> + *
> + * Libgcrypt is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU Lesser General Public License as
> + * published by the Free Software Foundation; either version 2.1 of
> + * the License, or (at your option) any later version.
> + *
> + * Libgcrypt is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "asm-common-aarch64.h"
> +
> +#if defined(__AARCH64EL__) && \
> + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
> + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
> + defined(USE_SM4)
> +
> +.cpu generic+simd+crypto
> +
> +.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 24, 25, 26, 27, 28, 29, 30, 31
> + .set .Lv\b\().4s, \b
> +.endr
> +
> +.macro sm4e, vd, vn
> + .inst 0xcec08400 | (.L\vn << 5) | .L\vd
> +.endm
> +
> +.macro sm4ekey, vd, vn, vm
> + .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
> +.endm

We have target architectures where the assembler does not support these macros (MacOSX, for example). It is better to detect whether these instructions are supported with a new check in `configure.ac`. For example, see how this is done for `HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO`.
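
Roughly, such a check only needs to verify that the toolchain accepts the
".inst" encodings that the new file emits through those macros. A minimal
compile-test sketch could look like the following; the function name and
HAVE_GCC_INLINE_ASM_AARCH64_SM_INSNS are illustrative only, not the actual
check:

  /* Assumed configure compile test: it only has to assemble, so it feeds
   * the assembler the same ".inst" encodings that sm4-armv8-aarch64-ce.S
   * relies on (sm4e v0.4s, v0.4s and sm4ekey v0.4s, v0.4s, v0.4s).  If it
   * builds, configure could define HAVE_GCC_INLINE_ASM_AARCH64_SM_INSNS
   * (illustrative name) for the new .S file to require. */
  static void
  try_aarch64_sm4_insns (void)
  {
    __asm__ volatile (".cpu generic+simd+crypto\n\t"
                      ".inst 0xcec08400\n\t"  /* sm4e    v0.4s, v0.4s */
                      ".inst 0xce60c800\n\t"  /* sm4ekey v0.4s, v0.4s, v0.4s */
                      :
                      :
                      : "v0", "memory");
  }

  int
  main (void)
  {
    try_aarch64_sm4_insns ();
    return 0;
  }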

-Jussi

Re: [PATCH 2/2] Add SM4 ARMv8/AArch64/CE assembly implementation
Hi Jussi,

On 2/26/22 3:10 PM, Jussi Kivilinna wrote:
> On 25.2.2022 9.41, Tianjia Zhang wrote:
>> * cipher/Makefile.am: Add 'sm4-armv8-aarch64-ce.S'.
>> * cipher/sm4-armv8-aarch64-ce.S: New.
>> * cipher/sm4.c (USE_ARM_CE): New.
>> (SM4_context) [USE_ARM_CE]: Add 'use_arm_ce'.
>> [USE_ARM_CE] (_gcry_sm4_armv8_ce_expand_key)
>> (_gcry_sm4_armv8_ce_crypt, _gcry_sm4_armv8_ce_ctr_enc)
>> (_gcry_sm4_armv8_ce_cbc_dec, _gcry_sm4_armv8_ce_cfb_dec)
>> (_gcry_sm4_armv8_ce_crypt_blk1_8, sm4_armv8_ce_crypt_blk1_8): New.
>> (sm4_expand_key) [USE_ARM_CE]: Use ARMv8/AArch64/CE key setup.
>> (sm4_setkey): Enable ARMv8/AArch64/CE if supported by HW.
>> (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec)
>> (_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth) [USE_ARM_CE]:
>> Add ARMv8/AArch64/CE bulk functions.
>> * configure.ac: Add 'sm4-armv8-aarch64-ce.lo'.
>> --
>>
>> This patch adds ARMv8/AArch64/CE bulk encryption/decryption. Bulk
>> functions process eight blocks in parallel.
>>
>> Benchmark on T-Head Yitian-710 2.75 GHz:
>>
>> Before:
>>   SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
>>          CBC enc |     12.10 ns/B     78.79 MiB/s     33.28 c/B      2750
>>          CBC dec |      4.63 ns/B     205.9 MiB/s     12.74 c/B      2749
>>          CFB enc |     12.14 ns/B     78.58 MiB/s     33.37 c/B      2750
>>          CFB dec |      4.64 ns/B     205.5 MiB/s     12.76 c/B      2750
>>          CTR enc |      4.69 ns/B     203.3 MiB/s     12.90 c/B      2750
>>          CTR dec |      4.69 ns/B     203.3 MiB/s     12.90 c/B      2750
>>          GCM enc |      4.88 ns/B     195.4 MiB/s     13.42 c/B      2750
>>          GCM dec |      4.88 ns/B     195.5 MiB/s     13.42 c/B      2750
>>         GCM auth |     0.189 ns/B      5048 MiB/s     0.520 c/B      2750
>>          OCB enc |      4.86 ns/B     196.0 MiB/s     13.38 c/B      2750
>>          OCB dec |      4.90 ns/B     194.7 MiB/s     13.47 c/B      2750
>>         OCB auth |      4.79 ns/B     199.0 MiB/s     13.18 c/B      2750
>>
>> After (16x - 19x faster than ARMv8/AArch64 impl):
>>   SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
>>          CBC enc |     12.10 ns/B     78.81 MiB/s     33.27 c/B      2750
>>          CBC dec |     0.243 ns/B      3921 MiB/s     0.669 c/B      2750
>
> This implementation is actually so much faster than generic C, that
> `_gcry_sm4_armv8_ce_crypt_blk1_8` could be used in `sm4_encrypt` and
> `sm4_decrypt` to speed up single block operations (CBC encryption, etc) ...
>
>   static unsigned int
>   sm4_encrypt (void *context, byte *outbuf, const byte *inbuf)
>   {
>     SM4_context *ctx = context;
>
>   #ifdef USE_ARM_CE
>     if (ctx->use_arm_ce)
>       return sm4_armv8_ce_crypt_blk1_8 (ctx->rkey_enc, outbuf, inbuf, 1);
>   #endif
>   ...
>

Great suggestion, I will do that.

>>          CFB enc |     12.14 ns/B     78.52 MiB/s     33.39 c/B      2750
>>          CFB dec |     0.241 ns/B      3963 MiB/s     0.662 c/B      2750
>>          CTR enc |     0.298 ns/B      3201 MiB/s     0.819 c/B      2749
>>          CTR dec |     0.298 ns/B      3197 MiB/s     0.820 c/B      2750
>>          GCM enc |     0.488 ns/B      1956 MiB/s      1.34 c/B      2749
>>          GCM dec |     0.487 ns/B      1959 MiB/s      1.34 c/B      2750
>>         GCM auth |     0.189 ns/B      5049 MiB/s     0.519 c/B      2749
>>          OCB enc |     0.461 ns/B      2069 MiB/s      1.27 c/B      2750
>>          OCB dec |     0.495 ns/B      1928 MiB/s      1.36 c/B      2750
>>         OCB auth |     0.385 ns/B      2479 MiB/s      1.06 c/B      2750
>>
>> Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
>> ---
>>   cipher/Makefile.am            |   1 +
>>   cipher/sm4-armv8-aarch64-ce.S | 614 ++++++++++++++++++++++++++++++++++
>>   cipher/sm4.c                  | 142 ++++++++
>>   configure.ac                  |   1 +
>>   4 files changed, 758 insertions(+)
>>   create mode 100644 cipher/sm4-armv8-aarch64-ce.S
>>
>> diff --git a/cipher/Makefile.am b/cipher/Makefile.am
>> index a7cbf3fc..3339c463 100644
>> --- a/cipher/Makefile.am
>> +++ b/cipher/Makefile.am
>> @@ -117,6 +117,7 @@ EXTRA_libcipher_la_SOURCES = \
>>       seed.c \
>>       serpent.c serpent-sse2-amd64.S \
>>       sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \
>> +    sm4-armv8-aarch64-ce.S \
>>       serpent-avx2-amd64.S serpent-armv7-neon.S \
>>       sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
>>       sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
>> diff --git a/cipher/sm4-armv8-aarch64-ce.S
>> b/cipher/sm4-armv8-aarch64-ce.S
>> new file mode 100644
>> index 00000000..943f0143
>> --- /dev/null
>> +++ b/cipher/sm4-armv8-aarch64-ce.S
>> @@ -0,0 +1,614 @@
>> +/* sm4-armv8-aarch64-ce.S  -  ARMv8/AArch64/CE accelerated SM4 cipher
>> + *
>> + * Copyright (C) 2022 Alibaba Group.
>> + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
>> + *
>> + * This file is part of Libgcrypt.
>> + *
>> + * Libgcrypt is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU Lesser General Public License as
>> + * published by the Free Software Foundation; either version 2.1 of
>> + * the License, or (at your option) any later version.
>> + *
>> + * Libgcrypt is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with this program; if not, see
>> <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include "asm-common-aarch64.h"
>> +
>> +#if defined(__AARCH64EL__) && \
>> +    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
>> +    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
>> +    defined(USE_SM4)
>> +
>> +.cpu generic+simd+crypto
>> +
>> +.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 24, 25, 26, 27, 28, 29, 30, 31
>> +    .set .Lv\b\().4s, \b
>> +.endr
>> +
>> +.macro sm4e, vd, vn
>> +    .inst 0xcec08400 | (.L\vn << 5) | .L\vd
>> +.endm
>> +
>> +.macro sm4ekey, vd, vn, vm
>> +    .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
>> +.endm
>
> We have target architectures where assembler does not support these
> macros (MacOSX for example). It's better to detect if these instructions
> are supported with new check in `configure.ac`. For example, see how
> this is done for `HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO`.
>
> -Jussi

The SM Crypto Extensions are an optional ARMv8 extension, so for various
reasons most current mainstream ARM CPUs do not implement them. I will add
a check in the next patch to detect whether the SM3/SM4 instruction
extensions are supported.
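
The idea is roughly to combine such a build-time assembler check with the
runtime HW-feature flag this patch already uses. A minimal sketch of the
combined gating follows; HAVE_GCC_INLINE_ASM_AARCH64_SM_INSNS is an
illustrative name for the planned configure check, and the helper function
below is not part of the patch:

  #include <config.h>
  #include "g10lib.h"  /* _gcry_get_hw_features(), HWF_ARM_SM4 */

  /* Build-time gate: enable the CE code only when the platform and the
   * assembler checks (including the assumed new SM-instruction check)
   * all pass, mirroring the USE_ARM_CE definition in sm4.c. */
  #if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(__AARCH64EL__) && \
      defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
      defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
      defined(HAVE_GCC_INLINE_ASM_AARCH64_SM_INSNS)
  # define USE_ARM_CE 1
  #endif

  /* Runtime gate: even when built in, take the CE path only when the
   * running CPU reports the SM4 extension. */
  static int
  sm4_have_arm_ce (void)
  {
  #ifdef USE_ARM_CE
    return !!(_gcry_get_hw_features () & HWF_ARM_SM4);
  #else
    return 0;
  #endif
  }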

Best regards,
Tianjia

_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@lists.gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel