
[PATCH 3/3] sm4-aesni-avx2: add generic 1 to 16 block bulk processing function
* cipher/sm4-aesni-avx2-amd64.S: Remove unnecessary vzeroupper at
function entries.
(_gcry_sm4_aesni_avx2_crypt_blk1_16): New.
* cipher/sm4.c (_gcry_sm4_aesni_avx2_crypt_blk1_16)
(sm4_aesni_avx2_crypt_blk1_16): New.
(sm4_get_crypt_blk1_16_fn) [USE_AESNI_AVX2]: Add
'sm4_aesni_avx2_crypt_blk1_16'.
--

Benchmark AMD Ryzen 5800X:

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |      1.48 ns/B     643.2 MiB/s      7.19 c/B      4850
        XTS dec |      1.48 ns/B     644.3 MiB/s      7.18 c/B      4850

After (1.37x faster):
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |      1.07 ns/B     888.7 MiB/s      5.21 c/B      4850
        XTS dec |      1.07 ns/B     889.4 MiB/s      5.20 c/B      4850
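
The new function is wired into sm4.c so that requests of 1..8 blocks keep
using the existing 128-bit AVX implementation and only 9..16 block requests
take the 256-bit AVX2 path. A standalone C sketch of that policy (the names
below are placeholders for this note, not the real symbols):

  typedef unsigned int (*blk_fn) (const void *rk, unsigned char *out,
                                  const unsigned char *in, unsigned int n);

  /* Sketch of the short-input policy in sm4_aesni_avx2_crypt_blk1_16(). */
  static unsigned int
  crypt_blk1_16_sketch (const void *rk, unsigned char *out,
                        const unsigned char *in, unsigned int num_blks,
                        blk_fn avx_blk1_8, blk_fn avx2_blk1_16)
  {
    if (num_blks <= 8)
      return avx_blk1_8 (rk, out, in, num_blks);    /* 128-bit registers */
    return avx2_blk1_16 (rk, out, in, num_blks);    /* 256-bit registers */
  }

In the actual code the 1..8 block fallback is only taken when USE_AESNI_AVX
is compiled in; otherwise the AVX2 routine handles all counts itself.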

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/sm4-aesni-avx2-amd64.S | 82 +++++++++++++++++++++++++++++------
cipher/sm4.c | 26 +++++++++++
2 files changed, 95 insertions(+), 13 deletions(-)
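
For reference, the LOAD_INPUT/STORE_OUTPUT macros in the assembly below let a
single routine cover any count from 1 to 16 blocks: each 256-bit register
slot holds two 16-byte blocks, and per slot the code either stops, does a
128-bit (xmm) move for an odd trailing block, or does a full 256-bit (ymm)
move and continues. A minimal C sketch of that selection logic (illustrative
only; the function and array names are invented for this note):

  #include <string.h>

  /* Mirrors LOAD_INPUT: slot i covers blocks 2*i and 2*i+1 of the input. */
  static void
  load_blocks_sketch (unsigned char regs[8][32], const unsigned char *in,
                      unsigned int nblks /* 1..16 */)
  {
    unsigned int i;

    for (i = 0; i < 8; i++)
      {
        if (nblks < 1 + 2 * i)
          break;                                /* jb: nothing left to load */
        if (nblks == 1 + 2 * i)
          {
            memcpy (regs[i], in + i * 32, 16);  /* odd block: xmm load */
            break;
          }
        memcpy (regs[i], in + i * 32, 32);      /* two blocks: ymm load */
      }
  }

STORE_OUTPUT applies the same comparisons on the way out, so only the blocks
that were actually requested are written back to dst.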

diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S
index effe590b..e09fed8f 100644
--- a/cipher/sm4-aesni-avx2-amd64.S
+++ b/cipher/sm4-aesni-avx2-amd64.S
@@ -1,6 +1,6 @@
/* sm4-avx2-amd64.S - AVX2 implementation of SM4 cipher
*
- * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2020, 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -45,11 +45,19 @@
#define RA1 %ymm9
#define RA2 %ymm10
#define RA3 %ymm11
+#define RA0x %xmm8
+#define RA1x %xmm9
+#define RA2x %xmm10
+#define RA3x %xmm11

#define RB0 %ymm12
#define RB1 %ymm13
#define RB2 %ymm14
#define RB3 %ymm15
+#define RB0x %xmm12
+#define RB1x %xmm13
+#define RB2x %xmm14
+#define RB3x %xmm15

#define RNOT %ymm0
#define RBSWAP %ymm1
@@ -280,6 +288,66 @@ __sm4_crypt_blk16:
CFI_ENDPROC();
ELF(.size __sm4_crypt_blk16,.-__sm4_crypt_blk16;)

+.align 8
+.globl _gcry_sm4_aesni_avx2_crypt_blk1_16
+ELF(.type _gcry_sm4_aesni_avx2_crypt_blk1_16,@function;)
+_gcry_sm4_aesni_avx2_crypt_blk1_16:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..16 blocks)
+ * %rdx: src (1..16 blocks)
+ * %rcx: num blocks (1..16)
+ */
+ CFI_STARTPROC();
+
+#define LOAD_INPUT(offset, yreg) \
+ cmpq $(1 + 2 * (offset)), %rcx; \
+ jb .Lblk16_load_input_done; \
+ ja 1f; \
+ vmovdqu (offset) * 32(%rdx), yreg##x; \
+ jmp .Lblk16_load_input_done; \
+ 1: \
+ vmovdqu (offset) * 32(%rdx), yreg;
+
+ LOAD_INPUT(0, RA0);
+ LOAD_INPUT(1, RA1);
+ LOAD_INPUT(2, RA2);
+ LOAD_INPUT(3, RA3);
+ LOAD_INPUT(4, RB0);
+ LOAD_INPUT(5, RB1);
+ LOAD_INPUT(6, RB2);
+ LOAD_INPUT(7, RB3);
+#undef LOAD_INPUT
+
+.Lblk16_load_input_done:
+ call __sm4_crypt_blk16;
+
+#define STORE_OUTPUT(yreg, offset) \
+ cmpq $(1 + 2 * (offset)), %rcx; \
+ jb .Lblk16_store_output_done; \
+ ja 1f; \
+ vmovdqu yreg##x, (offset) * 32(%rsi); \
+ jmp .Lblk16_store_output_done; \
+ 1: \
+ vmovdqu yreg, (offset) * 32(%rsi);
+
+ STORE_OUTPUT(RA0, 0);
+ STORE_OUTPUT(RA1, 1);
+ STORE_OUTPUT(RA2, 2);
+ STORE_OUTPUT(RA3, 3);
+ STORE_OUTPUT(RB0, 4);
+ STORE_OUTPUT(RB1, 5);
+ STORE_OUTPUT(RB2, 6);
+ STORE_OUTPUT(RB3, 7);
+#undef STORE_OUTPUT
+
+.Lblk16_store_output_done:
+ vzeroall;
+ xorl %eax, %eax;
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx2_crypt_blk1_16,.-_gcry_sm4_aesni_avx2_crypt_blk1_16;)
+
#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
vpsubq minus_one, x, x; \
@@ -301,8 +369,6 @@ _gcry_sm4_aesni_avx2_ctr_enc:
movq 8(%rcx), %rax;
bswapq %rax;

- vzeroupper;
-
vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
vpcmpeqd RNOT, RNOT, RNOT;
vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
@@ -410,8 +476,6 @@ _gcry_sm4_aesni_avx2_cbc_dec:
*/
CFI_STARTPROC();

- vzeroupper;
-
vmovdqu (0 * 32)(%rdx), RA0;
vmovdqu (1 * 32)(%rdx), RA1;
vmovdqu (2 * 32)(%rdx), RA2;
@@ -463,8 +527,6 @@ _gcry_sm4_aesni_avx2_cfb_dec:
*/
CFI_STARTPROC();

- vzeroupper;
-
/* Load input */
vmovdqu (%rcx), RNOTx;
vinserti128 $1, (%rdx), RNOT, RA0;
@@ -521,8 +583,6 @@ _gcry_sm4_aesni_avx2_ocb_enc:
*/
CFI_STARTPROC();

- vzeroupper;
-
subq $(4 * 8), %rsp;
CFI_ADJUST_CFA_OFFSET(4 * 8);

@@ -635,8 +695,6 @@ _gcry_sm4_aesni_avx2_ocb_dec:
*/
CFI_STARTPROC();

- vzeroupper;
-
subq $(4 * 8), %rsp;
CFI_ADJUST_CFA_OFFSET(4 * 8);

@@ -758,8 +816,6 @@ _gcry_sm4_aesni_avx2_ocb_auth:
*/
CFI_STARTPROC();

- vzeroupper;
-
subq $(4 * 8), %rsp;
CFI_ADJUST_CFA_OFFSET(4 * 8);

diff --git a/cipher/sm4.c b/cipher/sm4.c
index 9d00ee05..1f27f508 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -291,6 +291,24 @@ extern void _gcry_sm4_aesni_avx2_ocb_auth(const u32 *rk_enc,
unsigned char *offset,
unsigned char *checksum,
const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_aesni_avx2_crypt_blk1_16(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_aesni_avx2_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+ unsigned int num_blks)
+{
+#ifdef USE_AESNI_AVX
+ /* Use 128-bit register implementation for short input. */
+ if (num_blks <= 8)
+ return _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, num_blks);
+#endif
+
+ return _gcry_sm4_aesni_avx2_crypt_blk1_16(rk, out, in, num_blks);
+}
+
#endif /* USE_AESNI_AVX2 */

#ifdef USE_GFNI_AVX2
@@ -382,6 +400,7 @@ sm4_aarch64_crypt_blk1_16(const void *rk, byte *out, const byte *in,
_gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, num_blks);
return 0;
}
+
#endif /* USE_AARCH64_SIMD */

#ifdef USE_ARM_CE
@@ -427,6 +446,7 @@ sm4_armv8_ce_crypt_blk1_16(const void *rk, byte *out, const byte *in,
_gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, num_blks);
return 0;
}
+
#endif /* USE_ARM_CE */

static inline void prefetch_sbox_table(void)
@@ -758,6 +778,12 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx)
return &sm4_gfni_avx2_crypt_blk1_16;
}
#endif
+#ifdef USE_AESNI_AVX2
+ else if (ctx->use_aesni_avx2)
+ {
+ return &sm4_aesni_avx2_crypt_blk1_16;
+ }
+#endif
#ifdef USE_AESNI_AVX
else if (ctx->use_aesni_avx)
{
--
2.34.1


Re: [PATCH 3/3] sm4-aesni-avx2: add generic 1 to 16 block bulk processing function
Hi Jussi,

On 4/25/22 2:47 AM, Jussi Kivilinna wrote:
> * cipher/sm4-aesni-avx2-amd64.S: Remove unnecessary vzeroupper at
> function entries.
> (_gcry_sm4_aesni_avx2_crypt_blk1_16): New.
> * cipher/sm4.c (_gcry_sm4_aesni_avx2_crypt_blk1_16)
> (sm4_aesni_avx2_crypt_blk1_16): New.
> (sm4_get_crypt_blk1_16_fn) [USE_AESNI_AVX2]: Add
> 'sm4_aesni_avx2_crypt_blk1_16'.
> --
>
> Benchmark AMD Ryzen 5800X:
>
> Before:
>  SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
>         XTS enc |      1.48 ns/B     643.2 MiB/s      7.19 c/B      4850
>         XTS dec |      1.48 ns/B     644.3 MiB/s      7.18 c/B      4850
>
> After (1.37x faster):
>  SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
>         XTS enc |      1.07 ns/B     888.7 MiB/s      5.21 c/B      4850
>         XTS dec |      1.07 ns/B     889.4 MiB/s      5.20 c/B      4850
>
> Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
> ---

Benchmark on Intel i5-6200U 2.30GHz:

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |      2.95 ns/B     323.0 MiB/s      8.25 c/B      2792
        XTS dec |      2.95 ns/B     323.0 MiB/s      8.24 c/B      2792

After (1.64x faster):
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |      1.79 ns/B     531.4 MiB/s      5.01 c/B      2791
        XTS dec |      1.79 ns/B     531.6 MiB/s      5.01 c/B      2791

Reviewed-and-tested-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>

Best regards,
Tianjia
