[PATCH 1/8] rijndael: add ECB acceleration (for benchmarking purposes)
* cipher/cipher-internal.h (cipher_bulk_ops): Add 'ecb_crypt'.
* cipher/cipher.c (do_ecb_crypt): Use bulk function if available.
* cipher/rijndael-aesni.c (do_aesni_enc_vec8): Change asm label
'.Ldeclast' to '.Lenclast'.
(_gcry_aes_aesni_ecb_crypt): New.
* cipher/rijndael-armv8-aarch32-ce.S (_gcry_aes_ecb_enc_armv8_ce)
(_gcry_aes_ecb_dec_armv8_ce): New.
* cipher/rijndael-armv8-aarch64-ce.S (_gcry_aes_ecb_enc_armv8_ce)
(_gcry_aes_ecb_dec_armv8_ce): New.
* cipher/rijndael-armv8-ce.c (_gcry_aes_ocb_enc_armv8_ce)
(_gcry_aes_ocb_dec_armv8_ce, _gcry_aes_ocb_auth_armv8_ce): Change
return value from void to size_t.
(ocb_crypt_fn_t, xts_crypt_fn_t): Remove.
(_gcry_aes_armv8_ce_ocb_crypt, _gcry_aes_armv8_ce_xts_crypt): Remove
indirect function call; Return value from called function (allows tail
call optimization).
(_gcry_aes_armv8_ce_ocb_auth): Return value from called function (allows
tail call optimization).
(_gcry_aes_ecb_enc_armv8_ce, _gcry_aes_ecb_dec_armv8_ce)
(_gcry_aes_armv8_ce_ecb_crypt): New.
* cipher/rijndael-vaes-avx2-amd64.S
(_gcry_vaes_avx2_ecb_crypt_amd64): New.
* cipher/rijndael-vaes.c (_gcry_vaes_avx2_ecb_crypt_amd64)
(_gcry_aes_vaes_ecb_crypt): New.
* cipher/rijndael.c (_gcry_aes_aesni_ecb_crypt)
(_gcry_aes_vaes_ecb_crypt, _gcry_aes_armv8_ce_ecb_crypt): New.
(do_setkey): Setup ECB bulk function for x86 AESNI/VAES and ARM CE.
--

Benchmark on AMD Ryzen 9 7900X:

Before (OCB for reference):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.128 ns/B      7460 MiB/s     0.720 c/B    5634±1
        ECB dec |     0.134 ns/B      7103 MiB/s     0.753 c/B      5608
        OCB enc |     0.029 ns/B     32930 MiB/s     0.163 c/B      5625
        OCB dec |     0.029 ns/B     32738 MiB/s     0.164 c/B      5625

After:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.028 ns/B     33761 MiB/s     0.159 c/B      5625
        ECB dec |     0.028 ns/B     33917 MiB/s     0.158 c/B      5625
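
To put the table in context: what gets faster is the multi-block path behind
the public API. A minimal standalone example follows (not part of this patch;
error handling omitted, key and data are arbitrary demo values). A multi-block
gcry_cipher_encrypt() call on an ECB handle goes through do_ecb_crypt(), which
with this series dispatches to the new ecb_crypt bulk function when the
backend registers one in do_setkey():

  #include <stdio.h>
  #include <string.h>
  #include <gcrypt.h>

  int
  main (void)
  {
    gcry_cipher_hd_t hd;
    unsigned char key[16] = { 0 };   /* demo key, all zero */
    unsigned char buf[64 * 16];      /* 64 AES blocks, encrypted in place */

    if (!gcry_check_version (GCRYPT_VERSION))
      return 1;
    gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);
    memset (buf, 0xa5, sizeof buf);

    gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_ECB, 0);
    gcry_cipher_setkey (hd, key, sizeof key);
    /* In-place encryption; with more than one block this reaches the
       ecb_crypt bulk function if the backend installed one.  */
    gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);
    gcry_cipher_close (hd);

    printf ("first ciphertext byte: %02x\n", buf[0]);
    return 0;
  }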

GnuPG-bug-id: T6242
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/cipher-internal.h           |   2 +
 cipher/cipher.c                    |  41 ++-
 cipher/rijndael-aesni.c            | 160 ++++++++++-
 cipher/rijndael-armv8-aarch32-ce.S | 152 +++++++++-
 cipher/rijndael-armv8-aarch64-ce.S | 125 ++++++++-
 cipher/rijndael-armv8-ce.c         | 124 +++++----
 cipher/rijndael-vaes-avx2-amd64.S  | 432 ++++++++++++++++++++++++++++-
 cipher/rijndael-vaes.c             |  26 ++
 cipher/rijndael.c                  |  12 +
 9 files changed, 997 insertions(+), 77 deletions(-)

diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index 66b75955..4e022f38 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -161,6 +161,8 @@ typedef struct cipher_mode_ops
not NULL. */
typedef struct cipher_bulk_ops
{
+ void (*ecb_crypt)(void *context, void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt);
void (*cfb_enc)(void *context, unsigned char *iv, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks);
void (*cfb_dec)(void *context, unsigned char *iv, void *outbuf_arg,
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 6c335aec..026c1511 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -983,14 +983,11 @@ cipher_reset (gcry_cipher_hd_t c)


static gcry_err_code_t
-do_ecb_crypt (gcry_cipher_hd_t c,
- unsigned char *outbuf, size_t outbuflen,
- const unsigned char *inbuf, size_t inbuflen,
- gcry_cipher_encrypt_t crypt_fn)
+do_ecb_crypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen, int encrypt)
{
unsigned int blocksize = c->spec->blocksize;
size_t n, nblocks;
- unsigned int burn, nburn;

if (outbuflen < inbuflen)
return GPG_ERR_BUFFER_TOO_SHORT;
@@ -998,18 +995,32 @@ do_ecb_crypt (gcry_cipher_hd_t c,
return GPG_ERR_INV_LENGTH;

nblocks = inbuflen / blocksize;
- burn = 0;

- for (n=0; n < nblocks; n++ )
+ if (nblocks == 0)
+ return 0;
+
+ if (c->bulk.ecb_crypt)
{
- nburn = crypt_fn (&c->context.c, outbuf, inbuf);
- burn = nburn > burn ? nburn : burn;
- inbuf += blocksize;
- outbuf += blocksize;
+ c->bulk.ecb_crypt (&c->context.c, outbuf, inbuf, nblocks, encrypt);
}
+ else
+ {
+ gcry_cipher_encrypt_t crypt_fn =
+ encrypt ? c->spec->encrypt : c->spec->decrypt;
+ unsigned int burn = 0;
+ unsigned int nburn;

- if (burn > 0)
- _gcry_burn_stack (burn + 4 * sizeof(void *));
+ for (n = 0; n < nblocks; n++)
+ {
+ nburn = crypt_fn (&c->context.c, outbuf, inbuf);
+ burn = nburn > burn ? nburn : burn;
+ inbuf += blocksize;
+ outbuf += blocksize;
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+ }

return 0;
}
@@ -1019,7 +1030,7 @@ do_ecb_encrypt (gcry_cipher_hd_t c,
unsigned char *outbuf, size_t outbuflen,
const unsigned char *inbuf, size_t inbuflen)
{
- return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->encrypt);
+ return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 1);
}

static gcry_err_code_t
@@ -1027,7 +1038,7 @@ do_ecb_decrypt (gcry_cipher_hd_t c,
unsigned char *outbuf, size_t outbuflen,
const unsigned char *inbuf, size_t inbuflen)
{
- return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->decrypt);
+ return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 0);
}


diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 156af015..906737a6 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -870,7 +870,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
"aesenc %%xmm0, %%xmm10\n\t"
"aesenc %%xmm0, %%xmm11\n\t"
"movdqa 0xa0(%[key]), %%xmm0\n\t"
- "jb .Ldeclast%=\n\t"
+ "jb .Lenclast%=\n\t"
"aesenc %%xmm0, %%xmm1\n\t"
"aesenc %%xmm0, %%xmm2\n\t"
"aesenc %%xmm0, %%xmm3\n\t"
@@ -889,7 +889,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
"aesenc %%xmm0, %%xmm10\n\t"
"aesenc %%xmm0, %%xmm11\n\t"
"movdqa 0xc0(%[key]), %%xmm0\n\t"
- "je .Ldeclast%=\n\t"
+ "je .Lenclast%=\n\t"
"aesenc %%xmm0, %%xmm1\n\t"
"aesenc %%xmm0, %%xmm2\n\t"
"aesenc %%xmm0, %%xmm3\n\t"
@@ -909,7 +909,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
"aesenc %%xmm0, %%xmm11\n\t"
"movdqa 0xe0(%[key]), %%xmm0\n"

- ".Ldeclast%=:\n\t"
+ ".Lenclast%=:\n\t"
: /* no output */
: [key] "r" (ctx->keyschenc),
[rounds] "r" (ctx->rounds)
@@ -1717,6 +1717,160 @@ _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
}


+void ASM_FUNC_ATTR
+_gcry_aes_aesni_ecb_crypt (RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src, size_t nblocks,
+ int encrypt)
+{
+ aesni_prepare_2_7_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_7();
+
+ if (!encrypt && !ctx->decryption_prepared)
+ {
+ do_aesni_prepare_decryption ( ctx );
+ ctx->decryption_prepared = 1;
+ }
+
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ const void *key = encrypt ? ctx->keyschenc : ctx->keyschdec;
+ aesni_prepare_8_15_variable;
+
+ aesni_prepare_8_15();
+
+ for (; nblocks >= 8; nblocks -= 8)
+ {
+ asm volatile
+ ("movdqa (%[key]), %%xmm0\n\t"
+ "movdqu 0*16(%[src]), %%xmm1\n\t"
+ "movdqu 1*16(%[src]), %%xmm2\n\t"
+ "movdqu 2*16(%[src]), %%xmm3\n\t"
+ "movdqu 3*16(%[src]), %%xmm4\n\t"
+ "movdqu 4*16(%[src]), %%xmm8\n\t"
+ "movdqu 5*16(%[src]), %%xmm9\n\t"
+ "movdqu 6*16(%[src]), %%xmm10\n\t"
+ "movdqu 7*16(%[src]), %%xmm11\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm8\n\t"
+ "pxor %%xmm0, %%xmm9\n\t"
+ "pxor %%xmm0, %%xmm10\n\t"
+ "pxor %%xmm0, %%xmm11\n\t"
+ : /* No output */
+ : [src] "r" (src),
+ [key] "r" (key)
+ : "memory");
+
+ if (encrypt)
+ {
+ do_aesni_enc_vec8 (ctx);
+ asm volatile
+ ("aesenclast %%xmm0, %%xmm1\n\t"
+ "aesenclast %%xmm0, %%xmm2\n\t"
+ "aesenclast %%xmm0, %%xmm3\n\t"
+ "aesenclast %%xmm0, %%xmm4\n\t"
+ "aesenclast %%xmm0, %%xmm8\n\t"
+ "aesenclast %%xmm0, %%xmm9\n\t"
+ "aesenclast %%xmm0, %%xmm10\n\t"
+ "aesenclast %%xmm0, %%xmm11\n\t"
+ ::: "memory" );
+ }
+ else
+ {
+ do_aesni_dec_vec8 (ctx);
+ asm volatile
+ ("aesdeclast %%xmm0, %%xmm1\n\t"
+ "aesdeclast %%xmm0, %%xmm2\n\t"
+ "aesdeclast %%xmm0, %%xmm3\n\t"
+ "aesdeclast %%xmm0, %%xmm4\n\t"
+ "aesdeclast %%xmm0, %%xmm8\n\t"
+ "aesdeclast %%xmm0, %%xmm9\n\t"
+ "aesdeclast %%xmm0, %%xmm10\n\t"
+ "aesdeclast %%xmm0, %%xmm11\n\t"
+ ::: "memory" );
+ }
+
+ asm volatile
+ ("movdqu %%xmm1, 0*16(%[dst])\n\t"
+ "movdqu %%xmm2, 1*16(%[dst])\n\t"
+ "movdqu %%xmm3, 2*16(%[dst])\n\t"
+ "movdqu %%xmm4, 3*16(%[dst])\n\t"
+ "movdqu %%xmm8, 4*16(%[dst])\n\t"
+ "movdqu %%xmm9, 5*16(%[dst])\n\t"
+ "movdqu %%xmm10, 6*16(%[dst])\n\t"
+ "movdqu %%xmm11, 7*16(%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (dst)
+ : "memory");
+
+ dst += 8*BLOCKSIZE;
+ src += 8*BLOCKSIZE;
+ }
+
+ aesni_cleanup_8_15();
+ }
+#endif
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ asm volatile
+ ("movdqu 0*16(%[src]), %%xmm1\n\t"
+ "movdqu 1*16(%[src]), %%xmm2\n\t"
+ "movdqu 2*16(%[src]), %%xmm3\n\t"
+ "movdqu 3*16(%[src]), %%xmm4\n\t"
+ : /* No output */
+ : [src] "r" (src)
+ : "memory");
+
+ if (encrypt)
+ do_aesni_enc_vec4 (ctx);
+ else
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile
+ ("movdqu %%xmm1, 0*16(%[dst])\n\t"
+ "movdqu %%xmm2, 1*16(%[dst])\n\t"
+ "movdqu %%xmm3, 2*16(%[dst])\n\t"
+ "movdqu %%xmm4, 3*16(%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (dst)
+ : "memory");
+
+ dst += 4*BLOCKSIZE;
+ src += 4*BLOCKSIZE;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ asm volatile ("movdqu %[src], %%xmm0\n\t"
+ :
+ : [src] "m" (*src)
+ : "memory" );
+
+ if (encrypt)
+ do_aesni_enc (ctx);
+ else
+ do_aesni_dec (ctx);
+
+ asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+ : [dst] "=m" (*dst)
+ :
+ : "memory" );
+
+ dst += BLOCKSIZE;
+ src += BLOCKSIZE;
+ }
+
+ aesni_cleanup ();
+ aesni_cleanup_2_7 ();
+}
+
+
void ASM_FUNC_ATTR
_gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv,
unsigned char *outbuf, const unsigned char *inbuf,
diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S
index 1eafa93e..6208652b 100644
--- a/cipher/rijndael-armv8-aarch32-ce.S
+++ b/cipher/rijndael-armv8-aarch32-ce.S
@@ -653,6 +653,149 @@ _gcry_aes_cbc_dec_armv8_ce:
.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;


+/*
+ * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ecb_enc_armv8_ce
+.type _gcry_aes_ecb_enc_armv8_ce,%function;
+_gcry_aes_ecb_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: nblocks
+ * %st+0: nrounds => r4
+ */
+
+ push {r4-r6,lr} /* 4*4 = 16b */
+ cmp r3, #0
+ beq .Lecb_enc_skip
+ ldr r4, [sp, #(16+0)]
+ vpush {q4-q7}
+
+ cmp r4, #12
+ aes_preload_keys(r0, lr);
+
+ beq .Lecb_entry_192e
+ bhi .Lecb_entry_256e
+
+#define ECB_CRYPT(bits, e_d, mc_imc, ...) \
+ .Lecb_entry_##bits##e_d: \
+ cmp r3, #4; \
+ blo .Lecb_loop_##bits##e_d; \
+ \
+ .Lecb_loop4_##bits##e_d: \
+ vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \
+ sub r3, r3, #4; \
+ vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \
+ cmp r3, #4; \
+ \
+ do_aes_4_##bits(e_d, mc_imc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ vst1.8 {q1-q2}, [r1]!; /* store ciphertext */ \
+ vst1.8 {q3-q4}, [r1]!; /* store ciphertext */ \
+ \
+ bhs .Lecb_loop4_##bits##e_d; \
+ cmp r3, #0; \
+ beq .Lecb_done_##e_d; \
+ \
+ .Lecb_loop_##bits##e_d: \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ subs r3, r3, #1; \
+ \
+ do_aes_one##bits(e_d, mc_imc, q1, q1, ##__VA_ARGS__); \
+ \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ bne .Lecb_loop_##bits##e_d; \
+ b .Lecb_done_##e_d;
+
+ ECB_CRYPT(128, e, mc)
+ ECB_CRYPT(192, e, mc, r0, lr)
+ ECB_CRYPT(256, e, mc, r0, lr)
+
+.Lecb_done_e:
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ vpop {q4-q7}
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lecb_enc_skip:
+ pop {r4-r6,pc}
+.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ecb_dec_armv8_ce
+.type _gcry_aes_ecb_dec_armv8_ce,%function;
+_gcry_aes_ecb_dec_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: nblocks
+ * %st+0: nrounds => r4
+ */
+
+ push {r4-r6,lr} /* 4*4 = 16b */
+ cmp r3, #0
+ beq .Lecb_enc_skip
+ ldr r4, [sp, #(16+0)]
+ vpush {q4-q7}
+
+ cmp r4, #12
+
+ aes_preload_keys(r0, lr);
+
+ beq .Lecb_entry_192d
+ bhi .Lecb_entry_256d
+
+ ECB_CRYPT(128, d, imc)
+ ECB_CRYPT(192, d, imc, r0, lr)
+ ECB_CRYPT(256, d, imc, r0, lr)
+
+#undef ECB_CRYPT
+
+.Lecb_done_d:
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ vpop {q4-q7}
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lecb_dec_skip:
+ pop {r4-r6,pc}
+.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce;
+
+
/*
* void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
* unsigned char *outbuf,
@@ -1138,7 +1281,7 @@ _gcry_aes_ctr32le_enc_armv8_ce:


/*
- * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
* unsigned char *outbuf,
* const unsigned char *inbuf,
* unsigned char *offset,
@@ -1305,6 +1448,7 @@ _gcry_aes_ocb_enc_armv8_ce:
CLEAR_REG(q13)
CLEAR_REG(q14)

+ mov r0, #0
pop {r4-r12,lr}
vpop {q4-q7}
bx lr
@@ -1312,7 +1456,7 @@ _gcry_aes_ocb_enc_armv8_ce:


/*
- * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
* unsigned char *outbuf,
* const unsigned char *inbuf,
* unsigned char *offset,
@@ -1479,6 +1623,7 @@ _gcry_aes_ocb_dec_armv8_ce:
CLEAR_REG(q13)
CLEAR_REG(q14)

+ mov r0, #0
pop {r4-r12,lr}
vpop {q4-q7}
bx lr
@@ -1486,7 +1631,7 @@ _gcry_aes_ocb_dec_armv8_ce:


/*
- * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
* const unsigned char *abuf,
* unsigned char *offset,
* unsigned char *checksum,
@@ -1632,6 +1777,7 @@ _gcry_aes_ocb_auth_armv8_ce:
CLEAR_REG(q13)
CLEAR_REG(q14)

+ mov r0, #0
pop {r4-r12,lr}
vpop {q4-q7}
bx lr
diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S
index 4fef0345..97d3d7eb 100644
--- a/cipher/rijndael-armv8-aarch64-ce.S
+++ b/cipher/rijndael-armv8-aarch64-ce.S
@@ -385,6 +385,119 @@ _gcry_aes_dec_armv8_ce:
ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;)


+/*
+ * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * size_t nblocks, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ecb_enc_armv8_ce
+ELF(.type _gcry_aes_ecb_enc_armv8_ce,%function;)
+_gcry_aes_ecb_enc_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: nblocks
+ * w4: nrounds
+ */
+ CFI_STARTPROC();
+
+ cbz x3, .Lecb_enc_skip
+
+ aes_preload_keys(x0, w4);
+
+ b.eq .Lecb_entry_192e
+ b.hi .Lecb_entry_256e
+
+#define ECB_CRYPT(bits, e_d, mc_imc) \
+ .Lecb_entry_##bits##e_d: \
+ cmp x3, #4; \
+ b.lo .Lecb_loop_##bits##e_d; \
+ \
+ .Lecb_loop4_##bits##e_d: \
+ sub x3, x3, #4; \
+ ld1 {v0.16b-v3.16b}, [x2], #64; /* load ciphertext */ \
+ cmp x3, #4; \
+ do_aes_4_##bits(e_d, mc_imc, v0, v1, v2, v3); \
+ st1 {v0.16b-v3.16b}, [x1], #64; /* store plaintext */ \
+ \
+ b.hs .Lecb_loop4_##bits##e_d; \
+ CLEAR_REG(v1); \
+ CLEAR_REG(v2); \
+ CLEAR_REG(v3); \
+ cbz x3, .Lecb_done_##e_d; \
+ \
+ .Lecb_loop_##bits##e_d: \
+ ld1 {v0.16b}, [x2], #16; /* load ciphertext */ \
+ sub x3, x3, #1; \
+ do_aes_one##bits(e_d, mc_imc, v0, v0, vk0); \
+ st1 {v0.16b}, [x1], #16; /* store plaintext */ \
+ \
+ cbnz x3, .Lecb_loop_##bits##e_d; \
+ b .Lecb_done_##e_d;
+
+ ECB_CRYPT(128, e, mc)
+ ECB_CRYPT(192, e, mc)
+ ECB_CRYPT(256, e, mc)
+
+.Lecb_done_e:
+ aes_clear_keys(w4)
+
+ CLEAR_REG(v0)
+
+.Lecb_enc_skip:
+ ret_spec_stop
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce;)
+
+
+/*
+ * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * size_t nblocks, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ecb_dec_armv8_ce
+ELF(.type _gcry_aes_ecb_dec_armv8_ce,%function;)
+_gcry_aes_ecb_dec_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: nblocks
+ * w4: nrounds
+ */
+ CFI_STARTPROC();
+
+ cbz x3, .Lecb_enc_skip
+
+ aes_preload_keys(x0, w4);
+
+ b.eq .Lecb_entry_192d
+ b.hi .Lecb_entry_256d
+
+ ECB_CRYPT(128, d, imc)
+ ECB_CRYPT(192, d, imc)
+ ECB_CRYPT(256, d, imc)
+
+#undef ECB_CRYPT
+
+.Lecb_done_d:
+ aes_clear_keys(w4)
+
+ CLEAR_REG(v0)
+
+.Lecb_dec_skip:
+ ret_spec_stop
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce;)
+
+
/*
* void _gcry_aes_cbc_enc_armv8_ce (const void *keysched,
* unsigned char *outbuf,
@@ -471,7 +584,8 @@ ELF(.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;)
* void _gcry_aes_cbc_dec_armv8_ce (const void *keysched,
* unsigned char *outbuf,
* const unsigned char *inbuf,
- * unsigned char *iv, unsigned int nrounds);
+ * unsigned char *iv,
+ * size_t nblocks, unsigned int nrounds);
*/

.align 3
@@ -1136,7 +1250,7 @@ ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;)


/*
- * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
* unsigned char *outbuf,
* const unsigned char *inbuf,
* unsigned char *offset,
@@ -1379,13 +1493,14 @@ _gcry_aes_ocb_enc_armv8_ce:
add sp, sp, #128;
CFI_ADJUST_CFA_OFFSET(-128);

+ mov x0, #0
ret_spec_stop
CFI_ENDPROC();
ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;)


/*
- * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
* unsigned char *outbuf,
* const unsigned char *inbuf,
* unsigned char *offset,
@@ -1458,13 +1573,14 @@ _gcry_aes_ocb_dec_armv8_ce:
add sp, sp, #128;
CFI_ADJUST_CFA_OFFSET(-128);

+ mov x0, #0
ret_spec_stop
CFI_ENDPROC();
ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;)


/*
- * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
* const unsigned char *abuf,
* unsigned char *offset,
* unsigned char *checksum,
@@ -1605,6 +1721,7 @@ _gcry_aes_ocb_auth_armv8_ce:
CLEAR_REG(v2)
CLEAR_REG(v16)

+ mov x0, #0
ret_spec_stop
CFI_ENDPROC();
ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;)
diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c
index c9c37654..042b7d42 100644
--- a/cipher/rijndael-armv8-ce.c
+++ b/cipher/rijndael-armv8-ce.c
@@ -80,32 +80,32 @@ extern void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched,
unsigned char *iv, size_t nblocks,
unsigned int nrounds);

-extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
- unsigned char *outbuf,
- const unsigned char *inbuf,
- unsigned char *offset,
- unsigned char *checksum,
- unsigned char *L_table,
- size_t nblocks,
- unsigned int nrounds,
- unsigned int blkn);
-extern void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
- unsigned char *outbuf,
- const unsigned char *inbuf,
- unsigned char *offset,
- unsigned char *checksum,
- unsigned char *L_table,
- size_t nblocks,
- unsigned int nrounds,
- unsigned int blkn);
-extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
- const unsigned char *abuf,
- unsigned char *offset,
- unsigned char *checksum,
- unsigned char *L_table,
- size_t nblocks,
- unsigned int nrounds,
- unsigned int blkn);
+extern size_t _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ unsigned char *L_table,
+ size_t nblocks,
+ unsigned int nrounds,
+ unsigned int blkn);
+extern size_t _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ unsigned char *L_table,
+ size_t nblocks,
+ unsigned int nrounds,
+ unsigned int blkn);
+extern size_t _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ unsigned char *L_table,
+ size_t nblocks,
+ unsigned int nrounds,
+ unsigned int blkn);
extern void _gcry_aes_xts_enc_armv8_ce (const void *keysched,
unsigned char *outbuf,
const unsigned char *inbuf,
@@ -116,17 +116,14 @@ extern void _gcry_aes_xts_dec_armv8_ce (const void *keysched,
const unsigned char *inbuf,
unsigned char *tweak,
size_t nblocks, unsigned int nrounds);
-
-typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
- const unsigned char *inbuf,
- unsigned char *offset, unsigned char *checksum,
- unsigned char *L_table, size_t nblocks,
- unsigned int nrounds, unsigned int blkn);
-
-typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
- const unsigned char *inbuf,
- unsigned char *tweak, size_t nblocks,
- unsigned int nrounds);
+extern void _gcry_aes_ecb_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ size_t nblocks, unsigned int nrounds);
+extern void _gcry_aes_ecb_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ size_t nblocks, unsigned int nrounds);


void
@@ -312,8 +309,6 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
{
RIJNDAEL_context *ctx = (void *)&c->context.c;
const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
- ocb_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_ocb_enc_armv8_ce
- : _gcry_aes_ocb_dec_armv8_ce;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
unsigned int nrounds = ctx->rounds;
@@ -327,10 +322,16 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

c->u_mode.ocb.data_nblocks = blkn + nblocks;

- crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
- c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn);
-
- return 0;
+ if (encrypt)
+ return _gcry_aes_ocb_enc_armv8_ce (keysched, outbuf, inbuf,
+ c->u_iv.iv, c->u_ctr.ctr,
+ c->u_mode.ocb.L[0], nblocks, nrounds,
+ (unsigned int)blkn);
+ else
+ return _gcry_aes_ocb_dec_armv8_ce (keysched, outbuf, inbuf,
+ c->u_iv.iv, c->u_ctr.ctr,
+ c->u_mode.ocb.L[0], nblocks, nrounds,
+ (unsigned int)blkn);
}

size_t
@@ -345,11 +346,9 @@ _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,

c->u_mode.ocb.aad_nblocks = blkn + nblocks;

- _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0],
- nblocks, nrounds, (unsigned int)blkn);
-
- return 0;
+ return _gcry_aes_ocb_auth_armv8_ce (keysched, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0],
+ nblocks, nrounds, (unsigned int)blkn);
}

void
@@ -358,8 +357,6 @@ _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
size_t nblocks, int encrypt)
{
const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
- xts_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_xts_enc_armv8_ce
- : _gcry_aes_xts_dec_armv8_ce;
unsigned int nrounds = ctx->rounds;

if ( !encrypt && !ctx->decryption_prepared )
@@ -368,7 +365,32 @@ _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
ctx->decryption_prepared = 1;
}

- crypt_fn(keysched, outbuf, inbuf, tweak, nblocks, nrounds);
+ if (encrypt)
+ _gcry_aes_xts_enc_armv8_ce (keysched, outbuf, inbuf, tweak,
+ nblocks, nrounds);
+ else
+ _gcry_aes_xts_dec_armv8_ce (keysched, outbuf, inbuf, tweak,
+ nblocks, nrounds);
}

+void
+_gcry_aes_armv8_ce_ecb_crypt (void *context, void *outbuf,
+ const void *inbuf, size_t nblocks,
+ int encrypt)
+{
+ RIJNDAEL_context *ctx = context;
+ const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+ unsigned int nrounds = ctx->rounds;
+
+ if ( !encrypt && !ctx->decryption_prepared )
+ {
+ _gcry_aes_armv8_ce_prepare_decryption ( ctx );
+ ctx->decryption_prepared = 1;
+ }
+
+ if (encrypt)
+ _gcry_aes_ecb_enc_armv8_ce (keysched, outbuf, inbuf, nblocks, nrounds);
+ else
+ _gcry_aes_ecb_dec_armv8_ce (keysched, outbuf, inbuf, nblocks, nrounds);
+}
#endif /* USE_ARM_CE */
diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
index e36e82a0..655fdf55 100644
--- a/cipher/rijndael-vaes-avx2-amd64.S
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -2357,7 +2357,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
ELF(.size _gcry_vaes_avx2_ocb_crypt_amd64,.-_gcry_vaes_avx2_ocb_crypt_amd64)

/**********************************************************************
- CTR-mode encryption
+ XTS-mode encryption
**********************************************************************/
ELF(.type _gcry_vaes_avx2_xts_crypt_amd64,@function)
.globl _gcry_vaes_avx2_xts_crypt_amd64
@@ -2873,6 +2873,436 @@ _gcry_vaes_avx2_xts_crypt_amd64:
CFI_ENDPROC();
ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64)

+/**********************************************************************
+ ECB-mode encryption
+ **********************************************************************/
+ELF(.type _gcry_vaes_avx2_ecb_crypt_amd64,@function)
+.globl _gcry_vaes_avx2_ecb_crypt_amd64
+_gcry_vaes_avx2_ecb_crypt_amd64:
+ /* input:
+ * %rdi: round keys
+ * %esi: encrypt
+ * %rdx: dst
+ * %rcx: src
+ * %r8: nblocks
+ * %r9: nrounds
+ */
+ CFI_STARTPROC();
+
+ /* Process 16 blocks per loop. */
+.align 8
+.Lecb_blk16:
+ cmpq $16, %r8;
+ jb .Lecb_blk8;
+
+ leaq -16(%r8), %r8;
+
+ /* Load input and xor first key. */
+ vbroadcasti128 (0 * 16)(%rdi), %ymm8;
+ vmovdqu (0 * 16)(%rcx), %ymm0;
+ vmovdqu (2 * 16)(%rcx), %ymm1;
+ vmovdqu (4 * 16)(%rcx), %ymm2;
+ vmovdqu (6 * 16)(%rcx), %ymm3;
+ vmovdqu (8 * 16)(%rcx), %ymm4;
+ vmovdqu (10 * 16)(%rcx), %ymm5;
+ vmovdqu (12 * 16)(%rcx), %ymm6;
+ vmovdqu (14 * 16)(%rcx), %ymm7;
+ vpxor %ymm8, %ymm0, %ymm0;
+ vpxor %ymm8, %ymm1, %ymm1;
+ vpxor %ymm8, %ymm2, %ymm2;
+ vpxor %ymm8, %ymm3, %ymm3;
+ vpxor %ymm8, %ymm4, %ymm4;
+ vpxor %ymm8, %ymm5, %ymm5;
+ vpxor %ymm8, %ymm6, %ymm6;
+ vpxor %ymm8, %ymm7, %ymm7;
+ vbroadcasti128 (1 * 16)(%rdi), %ymm8;
+ leaq (16 * 16)(%rcx), %rcx;
+
+ testl %esi, %esi;
+ jz .Lecb_dec_blk16;
+ /* AES rounds */
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm8;
+ cmpl $12, %r9d;
+ jb .Lecb_enc_blk16_last;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm8;
+ jz .Lecb_enc_blk16_last;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm8;
+ .Lecb_enc_blk16_last:
+ vaesenclast %ymm8, %ymm0, %ymm0;
+ vaesenclast %ymm8, %ymm1, %ymm1;
+ vaesenclast %ymm8, %ymm2, %ymm2;
+ vaesenclast %ymm8, %ymm3, %ymm3;
+ vaesenclast %ymm8, %ymm4, %ymm4;
+ vaesenclast %ymm8, %ymm5, %ymm5;
+ vaesenclast %ymm8, %ymm6, %ymm6;
+ vaesenclast %ymm8, %ymm7, %ymm7;
+ jmp .Lecb_blk16_end;
+
+ .align 8
+ .Lecb_dec_blk16:
+ /* AES rounds */
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm8;
+ cmpl $12, %r9d;
+ jb .Lecb_dec_blk16_last;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm8;
+ jz .Lecb_dec_blk16_last;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm8;
+ .Lecb_dec_blk16_last:
+ vaesdeclast %ymm8, %ymm0, %ymm0;
+ vaesdeclast %ymm8, %ymm1, %ymm1;
+ vaesdeclast %ymm8, %ymm2, %ymm2;
+ vaesdeclast %ymm8, %ymm3, %ymm3;
+ vaesdeclast %ymm8, %ymm4, %ymm4;
+ vaesdeclast %ymm8, %ymm5, %ymm5;
+ vaesdeclast %ymm8, %ymm6, %ymm6;
+ vaesdeclast %ymm8, %ymm7, %ymm7;
+ jmp .Lecb_blk16_end;
+
+ .align 8
+ .Lecb_blk16_end:
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ vmovdqu %ymm4, (8 * 16)(%rdx);
+ vmovdqu %ymm5, (10 * 16)(%rdx);
+ vmovdqu %ymm6, (12 * 16)(%rdx);
+ vmovdqu %ymm7, (14 * 16)(%rdx);
+ leaq (16 * 16)(%rdx), %rdx;
+
+ jmp .Lecb_blk16;
+
+ /* Handle trailing eight blocks. */
+.align 8
+.Lecb_blk8:
+ cmpq $8, %r8;
+ jb .Lecb_blk4;
+
+ leaq -8(%r8), %r8;
+
+ /* Load input and xor first key. */
+ vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+ vmovdqu (0 * 16)(%rcx), %ymm0;
+ vmovdqu (2 * 16)(%rcx), %ymm1;
+ vmovdqu (4 * 16)(%rcx), %ymm2;
+ vmovdqu (6 * 16)(%rcx), %ymm3;
+ vpxor %ymm4, %ymm0, %ymm0;
+ vpxor %ymm4, %ymm1, %ymm1;
+ vpxor %ymm4, %ymm2, %ymm2;
+ vpxor %ymm4, %ymm3, %ymm3;
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ leaq (8 * 16)(%rcx), %rcx;
+
+ testl %esi, %esi;
+ jz .Lecb_dec_blk8;
+ /* AES rounds */
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ cmpl $12, %r9d;
+ jb .Lecb_enc_blk8_last;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ jz .Lecb_enc_blk8_last;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+ .Lecb_enc_blk8_last:
+ vaesenclast %ymm4, %ymm0, %ymm0;
+ vaesenclast %ymm4, %ymm1, %ymm1;
+ vaesenclast %ymm4, %ymm2, %ymm2;
+ vaesenclast %ymm4, %ymm3, %ymm3;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ leaq (8 * 16)(%rdx), %rdx;
+ jmp .Lecb_blk4;
+
+ .align 8
+ .Lecb_dec_blk8:
+ /* AES rounds */
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ cmpl $12, %r9d;
+ jb .Lecb_dec_blk8_last;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ jz .Lecb_dec_blk8_last;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+ .Lecb_dec_blk8_last:
+ vaesdeclast %ymm4, %ymm0, %ymm0;
+ vaesdeclast %ymm4, %ymm1, %ymm1;
+ vaesdeclast %ymm4, %ymm2, %ymm2;
+ vaesdeclast %ymm4, %ymm3, %ymm3;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ leaq (8 * 16)(%rdx), %rdx;
+
+ /* Handle trailing four blocks. */
+.align 8
+.Lecb_blk4:
+ cmpq $4, %r8;
+ jb .Lecb_blk1;
+
+ leaq -4(%r8), %r8;
+
+ /* Load input and xor first key. */
+ vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+ vmovdqu (0 * 16)(%rcx), %ymm0;
+ vmovdqu (2 * 16)(%rcx), %ymm1;
+ vpxor %ymm4, %ymm0, %ymm0;
+ vpxor %ymm4, %ymm1, %ymm1;
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ leaq (4 * 16)(%rcx), %rcx;
+
+ testl %esi, %esi;
+ jz .Lecb_dec_blk4;
+ /* AES rounds */
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ cmpl $12, %r9d;
+ jb .Lecb_enc_blk4_last;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ jz .Lecb_enc_blk4_last;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+ .Lecb_enc_blk4_last:
+ vaesenclast %ymm4, %ymm0, %ymm0;
+ vaesenclast %ymm4, %ymm1, %ymm1;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ leaq (4 * 16)(%rdx), %rdx;
+ jmp .Lecb_blk1;
+
+ .align 8
+ .Lecb_dec_blk4:
+ /* AES rounds */
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ cmpl $12, %r9d;
+ jb .Lecb_dec_blk4_last;
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ jz .Lecb_dec_blk4_last;
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESDEC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+ .Lecb_dec_blk4_last:
+ vaesdeclast %ymm4, %ymm0, %ymm0;
+ vaesdeclast %ymm4, %ymm1, %ymm1;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ leaq (4 * 16)(%rdx), %rdx;
+
+ /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lecb_blk1:
+ cmpq $1, %r8;
+ jb .Ldone_ecb;
+
+ leaq -1(%r8), %r8;
+
+ /* Load input. */
+ vmovdqu (%rcx), %xmm2;
+ leaq 16(%rcx), %rcx;
+
+ /* Xor first key. */
+ vpxor (0 * 16)(%rdi), %xmm2, %xmm0;
+
+ testl %esi, %esi;
+ jz .Lecb_dec_blk1;
+ /* AES rounds. */
+ vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (10 * 16)(%rdi), %xmm1;
+ cmpl $12, %r9d;
+ jb .Lecb_enc_blk1_last;
+ vaesenc %xmm1, %xmm0, %xmm0;
+ vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (12 * 16)(%rdi), %xmm1;
+ jz .Lecb_enc_blk1_last;
+ vaesenc %xmm1, %xmm0, %xmm0;
+ vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (14 * 16)(%rdi), %xmm1;
+ .Lecb_enc_blk1_last:
+ vaesenclast %xmm1, %xmm0, %xmm0;
+ jmp .Lecb_blk1_end;
+
+ .align 8
+ .Lecb_dec_blk1:
+ /* AES rounds. */
+ vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (3 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (4 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (5 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (6 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (7 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (8 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (9 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (10 * 16)(%rdi), %xmm1;
+ cmpl $12, %r9d;
+ jb .Lecb_dec_blk1_last;
+ vaesdec %xmm1, %xmm0, %xmm0;
+ vaesdec (11 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (12 * 16)(%rdi), %xmm1;
+ jz .Lecb_dec_blk1_last;
+ vaesdec %xmm1, %xmm0, %xmm0;
+ vaesdec (13 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (14 * 16)(%rdi), %xmm1;
+ .Lecb_dec_blk1_last:
+ vaesdeclast %xmm1, %xmm0, %xmm0;
+ jmp .Lecb_blk1_end;
+
+ .align 8
+ .Lecb_blk1_end:
+ vmovdqu %xmm0, (%rdx);
+ leaq 16(%rdx), %rdx;
+
+ jmp .Lecb_blk1;
+
+.align 8
+.Ldone_ecb:
+ vzeroall;
+ ret_spec_stop
+ CFI_ENDPROC();
+ELF(.size _gcry_vaes_avx2_ecb_crypt_amd64,.-_gcry_vaes_avx2_ecb_crypt_amd64)
+
/**********************************************************************
constants
**********************************************************************/
diff --git a/cipher/rijndael-vaes.c b/cipher/rijndael-vaes.c
index dbcf9afa..978c86da 100644
--- a/cipher/rijndael-vaes.c
+++ b/cipher/rijndael-vaes.c
@@ -91,6 +91,32 @@ extern void _gcry_vaes_avx2_xts_crypt_amd64 (const void *keysched,
unsigned int nrounds,
int encrypt) ASM_FUNC_ABI;

+extern void _gcry_vaes_avx2_ecb_crypt_amd64 (const void *keysched,
+ int encrypt,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks,
+ unsigned int nrounds) ASM_FUNC_ABI;
+
+
+void
+_gcry_aes_vaes_ecb_crypt (void *context, void *outbuf,
+ const void *inbuf, size_t nblocks,
+ int encrypt)
+{
+ RIJNDAEL_context *ctx = context;
+ const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+ unsigned int nrounds = ctx->rounds;
+
+ if (!encrypt && !ctx->decryption_prepared)
+ {
+ _gcry_aes_aesni_prepare_decryption (ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ _gcry_vaes_avx2_ecb_crypt_amd64 (keysched, encrypt, outbuf, inbuf,
+ nblocks, nrounds);
+}

void
_gcry_aes_vaes_cbc_dec (void *context, unsigned char *iv,
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index f3060ea5..84cb7109 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -102,6 +102,9 @@ extern size_t _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg
extern void _gcry_aes_aesni_xts_crypt (void *context, unsigned char *tweak,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks, int encrypt);
+extern void _gcry_aes_aesni_ecb_crypt (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
#endif

#ifdef USE_VAES
@@ -125,6 +128,9 @@ extern size_t _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
extern void _gcry_aes_vaes_xts_crypt (void *context, unsigned char *tweak,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks, int encrypt);
+extern void _gcry_aes_vaes_ecb_crypt (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
#endif

#ifdef USE_SSSE3
@@ -227,6 +233,9 @@ extern void _gcry_aes_armv8_ce_xts_crypt (void *context, unsigned char *tweak,
void *outbuf_arg,
const void *inbuf_arg,
size_t nblocks, int encrypt);
+extern void _gcry_aes_armv8_ce_ecb_crypt (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
#endif /*USE_ARM_ASM*/

#ifdef USE_PPC_CRYPTO
@@ -524,6 +533,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
bulk_ops->ocb_crypt = _gcry_aes_aesni_ocb_crypt;
bulk_ops->ocb_auth = _gcry_aes_aesni_ocb_auth;
bulk_ops->xts_crypt = _gcry_aes_aesni_xts_crypt;
+ bulk_ops->ecb_crypt = _gcry_aes_aesni_ecb_crypt;

#ifdef USE_VAES
if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) &&
@@ -536,6 +546,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
bulk_ops->ctr32le_enc = _gcry_aes_vaes_ctr32le_enc;
bulk_ops->ocb_crypt = _gcry_aes_vaes_ocb_crypt;
bulk_ops->xts_crypt = _gcry_aes_vaes_xts_crypt;
+ bulk_ops->ecb_crypt = _gcry_aes_vaes_ecb_crypt;
}
#endif
}
@@ -591,6 +602,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
bulk_ops->ocb_crypt = _gcry_aes_armv8_ce_ocb_crypt;
bulk_ops->ocb_auth = _gcry_aes_armv8_ce_ocb_auth;
bulk_ops->xts_crypt = _gcry_aes_armv8_ce_xts_crypt;
+ bulk_ops->ecb_crypt = _gcry_aes_armv8_ce_ecb_crypt;
}
#endif
#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
--
2.37.2

