Mailing List Archive

[PATCH 1/3] sm4: add XTS bulk processing
* cipher/sm4.c (_gcry_sm4_xts_crypt): New.
(sm4_setkey): Set XTS bulk function.
--

Benchmark on Ryzen 5800X:

Before:
SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
XTS enc | 7.28 ns/B 131.0 MiB/s 35.31 c/B 4850
XTS dec | 7.29 ns/B 130.9 MiB/s 35.34 c/B 4850

After (4.8x faster):
SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
XTS enc | 1.49 ns/B 638.6 MiB/s 7.24 c/B 4850
XTS dec | 1.49 ns/B 639.3 MiB/s 7.24 c/B 4850

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/sm4.c | 35 +++++++++++++++++++++++++++++++++++
1 file changed, 35 insertions(+)

diff --git a/cipher/sm4.c b/cipher/sm4.c
index 4815b184..600850e2 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -97,6 +97,9 @@ static void _gcry_sm4_cbc_dec (void *context, unsigned char *iv,
static void _gcry_sm4_cfb_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks);
+static void _gcry_sm4_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt);
static size_t _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks,
int encrypt);
@@ -492,6 +495,7 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
bulk_ops->cbc_dec = _gcry_sm4_cbc_dec;
bulk_ops->cfb_dec = _gcry_sm4_cfb_dec;
bulk_ops->ctr_enc = _gcry_sm4_ctr_enc;
+ bulk_ops->xts_crypt = _gcry_sm4_xts_crypt;
bulk_ops->ocb_crypt = _gcry_sm4_ocb_crypt;
bulk_ops->ocb_auth = _gcry_sm4_ocb_auth;

@@ -954,6 +958,37 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
_gcry_burn_stack(burn_stack_depth);
}

+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ SM4_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ crypt_blk1_8_fn_t crypt_blk1_8 = sm4_get_crypt_blk1_8_fn(ctx);
+ u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
+ unsigned char tmpbuf[16 * 8];
+ unsigned int tmp_used = 16;
+ size_t nburn;
+
+ nburn = bulk_xts_crypt_128(rk, crypt_blk1_8, outbuf, inbuf, nblocks,
+ tweak, tmpbuf, sizeof(tmpbuf) / 16,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used);
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
+}
+
/* Bulk encryption/decryption of complete blocks in OCB mode. */
static size_t
_gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
--
2.34.1


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@lists.gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel
Re: [PATCH 1/3] sm4: add XTS bulk processing [ In reply to ]
Hi Jussi,

On 4/25/22 2:47 AM, Jussi Kivilinna wrote:
> * cipher/sm4.c (_gcry_sm4_xts_crypt): New.
> (sm4_setkey): Set XTS bulk function.
> --
>
> Benchmark on Ryzen 5800X:
>
> Before:
> SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
> XTS enc | 7.28 ns/B 131.0 MiB/s 35.31 c/B 4850
> XTS dec | 7.29 ns/B 130.9 MiB/s 35.34 c/B 4850
>
> After (4.8x faster):
> SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
> XTS enc | 1.49 ns/B 638.6 MiB/s 7.24 c/B 4850
> XTS dec | 1.49 ns/B 639.3 MiB/s 7.24 c/B 4850
>
> Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
> ---
> cipher/sm4.c | 35 +++++++++++++++++++++++++++++++++++
> 1 file changed, 35 insertions(+)
>
> diff --git a/cipher/sm4.c b/cipher/sm4.c
> index 4815b184..600850e2 100644
> --- a/cipher/sm4.c
> +++ b/cipher/sm4.c
> @@ -97,6 +97,9 @@ static void _gcry_sm4_cbc_dec (void *context, unsigned char *iv,
> static void _gcry_sm4_cfb_dec (void *context, unsigned char *iv,
> void *outbuf_arg, const void *inbuf_arg,
> size_t nblocks);
> +static void _gcry_sm4_xts_crypt (void *context, unsigned char *tweak,
> + void *outbuf_arg, const void *inbuf_arg,
> + size_t nblocks, int encrypt);
> static size_t _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
> const void *inbuf_arg, size_t nblocks,
> int encrypt);
> @@ -492,6 +495,7 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
> bulk_ops->cbc_dec = _gcry_sm4_cbc_dec;
> bulk_ops->cfb_dec = _gcry_sm4_cfb_dec;
> bulk_ops->ctr_enc = _gcry_sm4_ctr_enc;
> + bulk_ops->xts_crypt = _gcry_sm4_xts_crypt;
> bulk_ops->ocb_crypt = _gcry_sm4_ocb_crypt;
> bulk_ops->ocb_auth = _gcry_sm4_ocb_auth;
>
> @@ -954,6 +958,37 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
> _gcry_burn_stack(burn_stack_depth);
> }
>
> +/* Bulk encryption/decryption of complete blocks in XTS mode. */
> +static void
> +_gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
> + const void *inbuf_arg, size_t nblocks, int encrypt)
> +{
> + SM4_context *ctx = context;
> + unsigned char *outbuf = outbuf_arg;
> + const unsigned char *inbuf = inbuf_arg;
> + int burn_stack_depth = 0;
> +
> + /* Process remaining blocks. */
> + if (nblocks)
> + {
> + crypt_blk1_8_fn_t crypt_blk1_8 = sm4_get_crypt_blk1_8_fn(ctx);
> + u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
> + unsigned char tmpbuf[16 * 8];
> + unsigned int tmp_used = 16;
> + size_t nburn;
> +
> + nburn = bulk_xts_crypt_128(rk, crypt_blk1_8, outbuf, inbuf, nblocks,
> + tweak, tmpbuf, sizeof(tmpbuf) / 16,
> + &tmp_used);
> + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
> +
> + wipememory(tmpbuf, tmp_used);
> + }
> +
> + if (burn_stack_depth)
> + _gcry_burn_stack(burn_stack_depth);
> +}
> +
> /* Bulk encryption/decryption of complete blocks in OCB mode. */
> static size_t
> _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,


I can't apply this series cleanly to the master branch: patch 1/3
applies successfully, but patch 2/3 does not, so it seems that some
code modifications are missing. In addition, patch 1/3 compiles with
errors:

sm4.c: In function '_gcry_sm4_xts_crypt':
sm4.c:1081:7: error: unknown type name 'crypt_blk1_8_fn_t'
1081 | crypt_blk1_8_fn_t crypt_blk1_8 = sm4_get_crypt_blk1_8_fn(ctx);
| ^~~~~~~~~~~~~~~~~
sm4.c:1081:40: warning: implicit declaration of function 'sm4_get_crypt_blk1_8_fn' [-Wimplicit-function-declaration]
1081 | crypt_blk1_8_fn_t crypt_blk1_8 = sm4_get_crypt_blk1_8_fn(ctx);
| ^~~~~~~~~~~~~~~~~~~~~~~
sm4.c:1087:15: warning: implicit declaration of function 'bulk_xts_crypt_128' [-Wimplicit-function-declaration]
1087 | nburn = bulk_xts_crypt_128(rk, crypt_blk1_8, outbuf, inbuf, nblocks,
| ^~~~~~~~~~~~~~~~~~

Best regards,
Tianjia


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@lists.gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel
Re: [PATCH 1/3] sm4: add XTS bulk processing [ In reply to ]
Hi Jussi,

On 4/25/22 2:47 AM, Jussi Kivilinna wrote:
> * cipher/sm4.c (_gcry_sm4_xts_crypt): New.
> (sm4_setkey): Set XTS bulk function.
> --
>
> Benchmark on Ryzen 5800X:
>
> Before:
> SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
> XTS enc | 7.28 ns/B 131.0 MiB/s 35.31 c/B 4850
> XTS dec | 7.29 ns/B 130.9 MiB/s 35.34 c/B 4850
>
> After (4.8x faster):
> SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
> XTS enc | 1.49 ns/B 638.6 MiB/s 7.24 c/B 4850
> XTS dec | 1.49 ns/B 639.3 MiB/s 7.24 c/B 4850
>
> Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
> ---
> cipher/sm4.c | 35 +++++++++++++++++++++++++++++++++++
> 1 file changed, 35 insertions(+)
>
> diff --git a/cipher/sm4.c b/cipher/sm4.c
> index 4815b184..600850e2 100644
> --- a/cipher/sm4.c
> +++ b/cipher/sm4.c
> @@ -97,6 +97,9 @@ static void _gcry_sm4_cbc_dec (void *context, unsigned char *iv,
> static void _gcry_sm4_cfb_dec (void *context, unsigned char *iv,
> void *outbuf_arg, const void *inbuf_arg,
> size_t nblocks);
> +static void _gcry_sm4_xts_crypt (void *context, unsigned char *tweak,
> + void *outbuf_arg, const void *inbuf_arg,
> + size_t nblocks, int encrypt);
> static size_t _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
> const void *inbuf_arg, size_t nblocks,
> int encrypt);
> @@ -492,6 +495,7 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
> bulk_ops->cbc_dec = _gcry_sm4_cbc_dec;
> bulk_ops->cfb_dec = _gcry_sm4_cfb_dec;
> bulk_ops->ctr_enc = _gcry_sm4_ctr_enc;
> + bulk_ops->xts_crypt = _gcry_sm4_xts_crypt;
> bulk_ops->ocb_crypt = _gcry_sm4_ocb_crypt;
> bulk_ops->ocb_auth = _gcry_sm4_ocb_auth;
>
> @@ -954,6 +958,37 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
> _gcry_burn_stack(burn_stack_depth);
> }
>
> +/* Bulk encryption/decryption of complete blocks in XTS mode. */
> +static void
> +_gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
> + const void *inbuf_arg, size_t nblocks, int encrypt)
> +{
> + SM4_context *ctx = context;
> + unsigned char *outbuf = outbuf_arg;
> + const unsigned char *inbuf = inbuf_arg;
> + int burn_stack_depth = 0;
> +
> + /* Process remaining blocks. */
> + if (nblocks)
> + {
> + crypt_blk1_8_fn_t crypt_blk1_8 = sm4_get_crypt_blk1_8_fn(ctx);
> + u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
> + unsigned char tmpbuf[16 * 8];
> + unsigned int tmp_used = 16;
> + size_t nburn;
> +
> + nburn = bulk_xts_crypt_128(rk, crypt_blk1_8, outbuf, inbuf, nblocks,
> + tweak, tmpbuf, sizeof(tmpbuf) / 16,
> + &tmp_used);
> + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
> +
> + wipememory(tmpbuf, tmp_used);
> + }
> +
> + if (burn_stack_depth)
> + _gcry_burn_stack(burn_stack_depth);
> +}
> +
> /* Bulk encryption/decryption of complete blocks in OCB mode. */
> static size_t
> _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

Thanks for the reply; this is great work. I did some performance tests
and a review, but unfortunately I haven't found a machine that supports
the GFNI feature at the moment, so here are my results for patch 1/3:

Benchmark on Intel i5-6200U 2.30GHz:

Before:
SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
XTS enc | 13.41 ns/B 71.10 MiB/s 37.45 c/B 2792
XTS dec | 13.43 ns/B 71.03 MiB/s 37.49 c/B 2792

After (4.54x faster):
SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
XTS enc | 2.96 ns/B 322.7 MiB/s 8.25 c/B 2792
XTS dec | 2.96 ns/B 322.5 MiB/s 8.26 c/B 2792

Reviewed-and-tested-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>

Best regards,
Tianjia

_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@lists.gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel