[PATCH] sm4: add ARMv8 CE accelerated implementation for XTS mode
* cipher/sm4-armv8-aarch64-ce.S (_gcry_sm4_armv8_ce_xts_crypt): New.
* cipher/sm4.c (_gcry_sm4_armv8_ce_xts_crypt): New.
(_gcry_sm4_xts_crypt) [USE_ARM_CE]: Add ARMv8 CE implementation for XTS.
--

Benchmark on T-Head Yitian-710 2.75 GHz:

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |     0.373 ns/B      2560 MiB/s      1.02 c/B      2749
        XTS dec |     0.372 ns/B      2562 MiB/s      1.02 c/B      2750

After (1.18x faster):
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |     0.314 ns/B      3038 MiB/s     0.863 c/B      2749
        XTS dec |     0.314 ns/B      3037 MiB/s     0.863 c/B      2749
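
For reference, the tweak_next macro added below implements the standard XTS
tweak update: the 128-bit tweak is multiplied by x in GF(2^128) with the
reduction polynomial x^128 + x^7 + x^2 + x + 1. A minimal scalar sketch of
that update (hypothetical helper name, not code from this patch):

#include <stdint.h>

/* Multiply the 16-byte XTS tweak (IEEE P1619 little-endian byte order)
 * by x in GF(2^128).  The NEON macro does the equivalent per block with
 * a lane-wise shift left (add .2d), a carry mask from sshr #63 and the
 * {0x87, 0x1} constant kept in RMASK. */
static void
xts_tweak_double (uint8_t t[16])
{
  unsigned int carry = 0;
  int i;

  for (i = 0; i < 16; i++)
    {
      unsigned int b = t[i];

      t[i] = (uint8_t)((b << 1) | carry);  /* shift byte left by one bit */
      carry = b >> 7;                      /* top bit carries into the next byte */
    }

  if (carry)
    t[0] ^= 0x87;  /* reduce: x^128 = x^7 + x^2 + x + 1 */
}

Keeping this update in vector registers, eight tweaks ahead of each
crypt_blk8 call, lets the whole XTS loop (tweak generation, the pre- and
post-whitening XORs and the SM4 rounds) stay inside the assembly.

The new path is reached through the normal libgcrypt XTS interface; a
minimal usage sketch (error handling omitted, key and tweak buffers
assumed to be provided by the caller):

#include <gcrypt.h>

void
sm4_xts_encrypt_sketch (const unsigned char key[32],
                        const unsigned char tweak[16],
                        unsigned char *buf, size_t len)
{
  gcry_cipher_hd_t hd;

  gcry_cipher_open (&hd, GCRY_CIPHER_SM4, GCRY_CIPHER_MODE_XTS, 0);
  gcry_cipher_setkey (hd, key, 32);    /* XTS uses two SM4 keys, 2 * 16 bytes */
  gcry_cipher_setiv (hd, tweak, 16);   /* initial tweak / sector number */
  gcry_cipher_encrypt (hd, buf, len, NULL, 0);  /* in place, len >= 16 */
  gcry_cipher_close (hd);
}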

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
---
 cipher/sm4-armv8-aarch64-ce.S | 151 ++++++++++++++++++++++++++++++++++
 cipher/sm4.c                  |  18 +++-
 2 files changed, 168 insertions(+), 1 deletion(-)

diff --git a/cipher/sm4-armv8-aarch64-ce.S b/cipher/sm4-armv8-aarch64-ce.S
index 5fb55947edc1..1a4ff736ad27 100644
--- a/cipher/sm4-armv8-aarch64-ce.S
+++ b/cipher/sm4-armv8-aarch64-ce.S
@@ -62,6 +62,7 @@
#define RTMP3 v19

#define RIV v20
+#define RMASK v21

/* Helper macros. */

@@ -69,6 +70,20 @@
ld1 {v24.16b-v27.16b}, [ptr], #64; \
ld1 {v28.16b-v31.16b}, [ptr];

+#define SM4_CRYPT_BLK(b0) \
+ rev32 b0.16b, b0.16b; \
+ sm4e(b0, v24); \
+ sm4e(b0, v25); \
+ sm4e(b0, v26); \
+ sm4e(b0, v27); \
+ sm4e(b0, v28); \
+ sm4e(b0, v29); \
+ sm4e(b0, v30); \
+ sm4e(b0, v31); \
+ rev64 b0.4s, b0.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ rev32 b0.16b, b0.16b;
+
#define crypt_blk4(b0, b1, b2, b3) \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
@@ -577,4 +592,140 @@ _gcry_sm4_armv8_ce_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_ctr_enc,.-_gcry_sm4_armv8_ce_ctr_enc;)

+.align 3
+.global _gcry_sm4_armv8_ce_xts_crypt
+ELF(.type _gcry_sm4_armv8_ce_xts_crypt,%function;)
+_gcry_sm4_armv8_ce_xts_crypt:
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: tweak (big endian, 128 bit)
+ * x4: nblocks
+ */
+ CFI_STARTPROC()
+ VPUSH_ABI
+
+ load_rkey(x0)
+
+ mov x7, #0x87
+ mov x8, #0x1
+ mov RMASK.d[0], x7
+ mov RMASK.d[1], x8
+
+ ld1 {RIV.16b}, [x3]
+ mov v8.16b, RIV.16b
+ ext RIV.16b, RIV.16b, RIV.16b, #8
+
+.Lxts_loop_blk:
+ sub x4, x4, #8
+ tbnz x4, #63, .Lxts_tail8
+
+#define tweak_next(vt, vin, RTMP) \
+ sshr RTMP.2d, RIV.2d, #63; \
+ add vt.2d, vin.2d, vin.2d; \
+ and RTMP.16b, RTMP.16b, RMASK.16b; \
+ add RIV.2d, RIV.2d, RIV.2d; \
+ eor vt.16b, vt.16b, RTMP.16b;
+
+ tweak_next( v9, v8, RTMP0)
+ tweak_next(v10, v9, RTMP1)
+ tweak_next(v11, v10, RTMP2)
+ tweak_next(v12, v11, RTMP3)
+ tweak_next(v13, v12, RTMP0)
+ tweak_next(v14, v13, RTMP1)
+ tweak_next(v15, v14, RTMP2)
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ ld1 {v4.16b-v7.16b}, [x2], #64
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+
+ crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7)
+
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ st1 {v0.16b-v3.16b}, [x1], #64
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+ st1 {v4.16b-v7.16b}, [x1], #64
+
+ tweak_next(v8, v15, RTMP3)
+
+ cbz x4, .Lxts_end
+ b .Lxts_loop_blk
+
+.Lxts_tail8:
+ add x4, x4, #8
+ cmp x4, #4
+ blt .Lxts_tail4
+
+ sub x4, x4, #4
+
+ tweak_next( v9, v8, RTMP0)
+ tweak_next(v10, v9, RTMP1)
+ tweak_next(v11, v10, RTMP2)
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+
+ crypt_blk4(v0, v1, v2, v3);
+
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ st1 {v0.16b-v3.16b}, [x1], #64
+
+ tweak_next(v8, v11, RTMP3)
+
+ cbz x4, .Lxts_end
+
+.Lxts_tail4:
+ sub x4, x4, #1
+
+ ld1 {v0.16b}, [x2], #16
+ eor v0.16b, v0.16b, v8.16b
+
+ SM4_CRYPT_BLK(v0)
+
+ eor v0.16b, v0.16b, v8.16b
+ st1 {v0.16b}, [x1], #16
+
+ tweak_next(v8, v8, RTMP0)
+
+ cbnz x4, .Lxts_tail4
+
+.Lxts_end:
+ /* store new tweak */
+ st1 {v8.16b}, [x3]
+
+ CLEAR_REG(v8)
+ CLEAR_REG(v9)
+ CLEAR_REG(v10)
+ CLEAR_REG(v11)
+ CLEAR_REG(v12)
+ CLEAR_REG(v13)
+ CLEAR_REG(v14)
+ CLEAR_REG(v15)
+ CLEAR_REG(RIV)
+
+ VPOP_ABI
+ ret_spec_stop
+ CFI_ENDPROC()
+ELF(.size _gcry_sm4_armv8_ce_xts_crypt,.-_gcry_sm4_armv8_ce_xts_crypt;)
+
#endif
diff --git a/cipher/sm4.c b/cipher/sm4.c
index b5d4691ddbcb..4cac3b6c64b0 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -1,6 +1,6 @@
/* sm4.c - SM4 Cipher Algorithm
* Copyright (C) 2020 Alibaba Group.
- * Copyright (C) 2020 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ * Copyright (C) 2020-2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
* Copyright (C) 2020-2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
@@ -539,6 +539,11 @@ extern void _gcry_sm4_armv8_ce_cfb_dec(const u32 *rk_enc, byte *out,
byte *iv,
size_t nblocks);

+extern void _gcry_sm4_armv8_ce_xts_crypt(const u32 *rk, byte *out,
+ const byte *in,
+ byte *tweak,
+ size_t nblocks);
+
extern void _gcry_sm4_armv8_ce_crypt_blk1_8(const u32 *rk, byte *out,
const byte *in,
size_t num_blocks);
@@ -1510,6 +1515,17 @@ _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
const unsigned char *inbuf = inbuf_arg;
int burn_stack_depth = 0;

+#ifdef USE_ARM_CE
+ if (ctx->use_arm_ce)
+ {
+ /* Process all blocks at a time. */
+ _gcry_sm4_armv8_ce_xts_crypt(encrypt ? ctx->rkey_enc : ctx->rkey_dec,
+ outbuf, inbuf, tweak, nblocks);
+
+ nblocks = 0;
+ }
+#endif
+
/* Process remaining blocks. */
if (nblocks)
{
--
2.24.3 (Apple Git-128)


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@lists.gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel

Re: [PATCH] sm4: add ARMv8 CE accelerated implementation for XTS mode
Hello,

Patch applied to master, thanks.

-Jussi
