[PATCH 1/4] rijndael-vaes-avx2-amd64: acceleration for OCB auth
* cipher/rijndael-vaes-avx2-amd64.S
(_gcry_vaes_avx2_ocb_crypt_amd64): Add authentication mode support.
* cipher/rijndael-vaes.c (_gcry_vaes_avx2_ocb_crypt_amd64): Change
to return 'size_t' value.
(_gcry_aes_vaes_ocb_auth): New.
* cipher/rijndael.c (_gcry_aes_vaes_ocb_auth): New.
(do_setkey) [USE_VAES]: Add setup for 'bulk_ops->ocb_auth'.
--

Benchmark on AMD Ryzen 9 7900X (zen4):

Before:

 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 OCB auth       |     0.071 ns/B     13470 MiB/s     0.333 c/B      4700

After (~2.0x faster):

 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 OCB auth       |     0.034 ns/B     27946 MiB/s     0.160 c/B      4700
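
For reference, the quoted speedup follows directly from the cycles-per-byte
figures above:

  0.333 c/B / 0.160 c/B ~= 2.08x   (equivalently, 27946 MiB/s / 13470 MiB/s ~= 2.07x)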

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/rijndael-vaes-avx2-amd64.S | 300 ++++++++++++++++++++++++++++--
 cipher/rijndael-vaes.c            |  48 +++--
 cipher/rijndael.c                 |   4 +
 3 files changed, 323 insertions(+), 29 deletions(-)

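For readers skimming the diff: the new auth entry point is a thin C wrapper
around the existing OCB assembly routine. The sixth argument, previously a
boolean encrypt flag, now selects the operation (0 = decrypt, 1 = encrypt,
2 = auth). The sketch below condenses the call path added in the
cipher/rijndael-vaes.c hunk further down; it assumes the usual
rijndael-internal.h / cipher-internal.h context and is only a restatement of
the patch, not extra functionality.

  /* Condensed view of the new auth wrapper; mode value 2 selects the
     authentication-only path in _gcry_vaes_avx2_ocb_crypt_amd64.  */
  size_t
  _gcry_aes_vaes_ocb_auth (gcry_cipher_hd_t c, const void *inbuf_arg,
                           size_t nblocks)
  {
    RIJNDAEL_context *ctx = (void *)&c->context.c;
    u64 blkn = c->u_mode.ocb.aad_nblocks;

    c->u_mode.ocb.aad_nblocks = blkn + nblocks;

    /* Auth has no output buffer and no data checksum: pass NULL for the
       output and accumulate into the AAD offset/sum instead.  */
    return _gcry_vaes_avx2_ocb_crypt_amd64 (ctx->keyschenc32,
                                            (unsigned int)blkn, NULL,
                                            inbuf_arg, nblocks, ctx->rounds,
                                            c->u_mode.ocb.aad_offset,
                                            c->u_mode.ocb.aad_sum,
                                            c->u_mode.ocb.L[0], 2);
  }
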
diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
index 843ad9cf..fd012982 100644
--- a/cipher/rijndael-vaes-avx2-amd64.S
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -1402,7 +1402,7 @@ _gcry_vaes_avx2_ctr32le_enc_amd64:
ELF(.size _gcry_vaes_avx2_ctr32le_enc_amd64,.-_gcry_vaes_avx2_ctr32le_enc_amd64)

/**********************************************************************
- OCB-mode encryption/decryption
+ OCB-mode encryption/decryption/authentication
**********************************************************************/
ELF(.type _gcry_vaes_avx2_ocb_crypt_amd64,@function)
.globl _gcry_vaes_avx2_ocb_crypt_amd64
@@ -1418,7 +1418,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
* 16(%rbp): offset
* 24(%rbp): checksum
* 32(%rbp): L-array
- * 40(%rbp): encrypt (%r15d)
+ * 40(%rbp): decrypt/encrypt/auth (%r15d)
*/
CFI_STARTPROC();

@@ -1427,7 +1427,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
#define OFFSET_PTR_Q 16(%rbp)
#define CHECKSUM_PTR_Q 24(%rbp)
#define L_ARRAY_PTR_L 32(%rbp)
-#define ENCRYPT_FLAG_L 40(%rbp)
+#define OPER_MODE_L 40(%rbp)

pushq %rbp;
CFI_PUSH(%rbp);
@@ -1448,7 +1448,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
movq %rbx, (STACK_REGS_POS + 4 * 8)(%rsp);
CFI_REG_ON_STACK(rbx, STACK_REGS_POS + 4 * 8);

- movl ENCRYPT_FLAG_L, %r15d; /* encrypt-flag. */
+ movl OPER_MODE_L, %r15d; /* decrypt/encrypt/auth-mode. */
movq OFFSET_PTR_Q, %r14; /* offset ptr. */
movq CHECKSUM_PTR_Q, %rbx; /* checksum ptr. */

@@ -1531,8 +1531,9 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vinserti128 $1, %xmm10, %ymm9, %ymm7;
vinserti128 $1, %xmm15, %ymm11, %ymm8;

- testl %r15d, %r15d;
- jz .Locb_unaligned_blk8_dec;
+ cmpl $1, %r15d;
+ jb .Locb_unaligned_blk8_dec;
+ ja .Locb_unaligned_blk8_auth;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
vmovdqu (4 * 16)(%rcx), %ymm2;
@@ -1598,6 +1599,59 @@ _gcry_vaes_avx2_ocb_crypt_amd64:

jmp .Locb_unaligned_blk8;

+ .align 8
+ .Locb_unaligned_blk8_auth:
+ vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+ vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+ vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+ vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
+ leaq (8 * 16)(%rcx), %rcx;
+
+ /* AES rounds */
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ cmpl $12, %r9d;
+ jb .Locb_unaligned_blk8_auth_last;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ jz .Locb_unaligned_blk8_auth_last;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+ /* Last round and output handling. */
+ .Locb_unaligned_blk8_auth_last:
+ vaesenclast %ymm4, %ymm0, %ymm0;
+ vaesenclast %ymm4, %ymm1, %ymm1;
+ vaesenclast %ymm4, %ymm2, %ymm2;
+ vaesenclast %ymm4, %ymm3, %ymm3;
+ vpxor %ymm0, %ymm14, %ymm14;
+ vpxor %ymm1, %ymm13, %ymm13;
+ vpxor %ymm2, %ymm14, %ymm14;
+ vpxor %ymm3, %ymm13, %ymm13;
+
+ jmp .Locb_unaligned_blk8;
+
.align 8
.Locb_unaligned_blk8_dec:
vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
@@ -1690,8 +1744,9 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vpxor (%r14, %rax), %xmm7, %xmm15;
vinserti128 $1, %xmm15, %ymm7, %ymm6;

- testl %r15d, %r15d;
- jz .Locb_unaligned_blk4_dec;
+ cmpl $1, %r15d;
+ jb .Locb_unaligned_blk4_dec;
+ ja .Locb_unaligned_blk4_auth;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
leaq (4 * 16)(%rcx), %rcx;
@@ -1744,6 +1799,53 @@ _gcry_vaes_avx2_ocb_crypt_amd64:

jmp .Locb_unaligned_blk1;

+ .align 8
+ .Locb_unaligned_blk4_auth:
+ vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+ vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+ leaq (4 * 16)(%rcx), %rcx;
+
+ /* AES rounds */
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ cmpl $12, %r9d;
+ jb .Locb_unaligned_blk4_auth_last;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ jz .Locb_unaligned_blk4_auth_last;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESENC2(%ymm4, %ymm0, %ymm1);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+ /* Last round and output handling. */
+ .Locb_unaligned_blk4_auth_last:
+ vaesenclast %ymm4, %ymm0, %ymm0;
+ vaesenclast %ymm4, %ymm1, %ymm1;
+ vpxor %ymm0, %ymm14, %ymm14;
+ vpxor %ymm1, %ymm13, %ymm13;
+
+ jmp .Locb_unaligned_blk1;
+
.align 8
.Locb_unaligned_blk4_dec:
vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
@@ -1808,8 +1910,9 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
shll $4, %r11d;
vpxor (%r14, %r11), %xmm15, %xmm15;

- testl %r15d, %r15d;
- jz .Locb_unaligned_blk1_dec;
+ cmpl $1, %r15d;
+ jb .Locb_unaligned_blk1_dec;
+ ja .Locb_unaligned_blk1_auth;
vmovdqu (%rcx), %xmm0;
vpxor %ymm0, %ymm14, %ymm14;
vpxor %xmm15, %xmm0, %xmm0;
@@ -1842,6 +1945,39 @@ _gcry_vaes_avx2_ocb_crypt_amd64:

jmp .Locb_unaligned_blk1;

+ .align 8
+ .Locb_unaligned_blk1_auth:
+ vpxor (%rcx), %xmm15, %xmm0;
+ leaq 16(%rcx), %rcx;
+
+ /* AES rounds. */
+ vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (10 * 16)(%rdi), %xmm1;
+ cmpl $12, %r9d;
+ jb .Locb_unaligned_blk1_auth_last;
+ vaesenc %xmm1, %xmm0, %xmm0;
+ vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (12 * 16)(%rdi), %xmm1;
+ jz .Locb_unaligned_blk1_auth_last;
+ vaesenc %xmm1, %xmm0, %xmm0;
+ vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (14 * 16)(%rdi), %xmm1;
+
+ /* Last round and output handling. */
+ .Locb_unaligned_blk1_auth_last:
+ vaesenclast %xmm1, %xmm0, %xmm0;
+ vpxor %ymm0, %ymm14, %ymm14;
+
+ jmp .Locb_unaligned_blk1;
+
.align 8
.Locb_unaligned_blk1_dec:
vpxor (%rcx), %xmm15, %xmm0;
@@ -1961,8 +2097,9 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[3] ^ L[ntz{nblk+16}] */
vinserti128 $1, %xmm14, %ymm13, %ymm14;

- testl %r15d, %r15d;
- jz .Locb_aligned_blk16_dec;
+ cmpl $1, %r15d;
+ jb .Locb_aligned_blk16_dec;
+ ja .Locb_aligned_blk16_auth;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
vmovdqu (4 * 16)(%rcx), %ymm2;
@@ -2057,6 +2194,81 @@ _gcry_vaes_avx2_ocb_crypt_amd64:

jmp .Locb_aligned_blk16;

+ .align 8
+ .Locb_aligned_blk16_auth:
+ vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
+ vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
+
+ vpxor (0 * 16)(%rcx), %ymm8, %ymm0;
+ vpxor (2 * 16)(%rcx), %ymm9, %ymm1;
+ vpxor (4 * 16)(%rcx), %ymm10, %ymm2;
+ vpxor (6 * 16)(%rcx), %ymm11, %ymm3;
+ vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
+ vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
+ vmovdqa %ymm13, (16 * 16)(%rsp);
+ vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
+ vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
+ vmovdqa %ymm13, (18 * 16)(%rsp);
+
+ leaq (16 * 16)(%rcx), %rcx;
+
+ vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+ /* AES rounds */
+ vbroadcasti128 (1 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm13;
+ cmpl $12, %r9d;
+ jb .Locb_aligned_blk16_auth_last;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm13;
+ jz .Locb_aligned_blk16_auth_last;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm13;
+
+ /* Last round and output handling. */
+ .Locb_aligned_blk16_auth_last:
+ vaesenclast %ymm13, %ymm0, %ymm0;
+ vaesenclast %ymm13, %ymm1, %ymm1;
+ vaesenclast %ymm13, %ymm2, %ymm2;
+ vaesenclast %ymm13, %ymm3, %ymm3;
+ vaesenclast %ymm13, %ymm4, %ymm4;
+ vaesenclast %ymm13, %ymm5, %ymm5;
+ vaesenclast %ymm13, %ymm6, %ymm6;
+ vaesenclast %ymm13, %ymm7, %ymm7;
+
+ vpxor %ymm1, %ymm0, %ymm0;
+ vpxor %ymm3, %ymm2, %ymm2;
+ vpxor %ymm5, %ymm4, %ymm4;
+ vpxor %ymm7, %ymm6, %ymm6;
+ vpxor %ymm2, %ymm0, %ymm0;
+ vpxor %ymm6, %ymm4, %ymm4;
+ vpxor %ymm4, %ymm0, %ymm0;
+ vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
+ vmovdqa %ymm0, (20 * 16)(%rsp);
+
+ jmp .Locb_aligned_blk16;
+
.align 8
.Locb_aligned_blk16_dec:
vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
@@ -2169,8 +2381,9 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[2] ^ L[ntz{nblk+8}] */
vinserti128 $1, %xmm14, %ymm13, %ymm14;

- testl %r15d, %r15d;
- jz .Locb_aligned_blk8_dec;
+ cmpl $1, %r15d;
+ jb .Locb_aligned_blk8_dec;
+ ja .Locb_aligned_blk8_auth;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
vmovdqu (4 * 16)(%rcx), %ymm2;
@@ -2240,6 +2453,63 @@ _gcry_vaes_avx2_ocb_crypt_amd64:

jmp .Locb_aligned_done;

+ .align 8
+ .Locb_aligned_blk8_auth:
+ vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+ vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+ vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+ vpxor (6 * 16)(%rcx), %ymm14, %ymm3;
+ leaq (8 * 16)(%rcx), %rcx;
+
+ vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+ /* AES rounds */
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ cmpl $12, %r9d;
+ jb .Locb_aligned_blk8_auth_last;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ jz .Locb_aligned_blk8_auth_last;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+ /* Last round and output handling. */
+ .Locb_aligned_blk8_auth_last:
+ vaesenclast %ymm4, %ymm0, %ymm0;
+ vaesenclast %ymm4, %ymm1, %ymm1;
+ vaesenclast %ymm4, %ymm2, %ymm2;
+ vaesenclast %ymm4, %ymm3, %ymm3;
+
+ vpxor %ymm1, %ymm0, %ymm0;
+ vpxor %ymm3, %ymm2, %ymm2;
+ vpxor %ymm2, %ymm0, %ymm0;
+ vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
+ vmovdqa %ymm0, (20 * 16)(%rsp);
+
+ jmp .Locb_aligned_done;
+
.align 8
.Locb_aligned_blk8_dec:
vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
@@ -2357,6 +2627,8 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
movq (STACK_REGS_POS + 4 * 8)(%rsp), %rbx;
CFI_RESTORE(%rbx);

+ xorl %eax, %eax;
+
leave;
CFI_LEAVE();
ret_spec_stop
diff --git a/cipher/rijndael-vaes.c b/cipher/rijndael-vaes.c
index 978c86da..ce9e18e7 100644
--- a/cipher/rijndael-vaes.c
+++ b/cipher/rijndael-vaes.c
@@ -40,7 +40,7 @@
# endif


-extern void _gcry_aes_aesni_prepare_decryption(RIJNDAEL_context *ctx);
+extern void _gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx);


extern void _gcry_vaes_avx2_cbc_dec_amd64 (const void *keysched,
@@ -72,16 +72,16 @@ extern void _gcry_vaes_avx2_ctr32le_enc_amd64 (const void *keysched,
unsigned int nrounds)
ASM_FUNC_ABI;

-extern void _gcry_vaes_avx2_ocb_crypt_amd64 (const void *keysched,
- unsigned int blkn,
- void *outbuf_arg,
- const void *inbuf_arg,
- size_t nblocks,
- unsigned int nrounds,
- unsigned char *offset,
- unsigned char *checksum,
- unsigned char *L_table,
- int encrypt) ASM_FUNC_ABI;
+extern size_t _gcry_vaes_avx2_ocb_crypt_amd64 (const void *keysched,
+ unsigned int blkn,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks,
+ unsigned int nrounds,
+ unsigned char *offset,
+ unsigned char *checksum,
+ unsigned char *L_table,
+ int encrypt) ASM_FUNC_ABI;

extern void _gcry_vaes_avx2_xts_crypt_amd64 (const void *keysched,
unsigned char *tweak,
@@ -193,11 +193,29 @@ _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

c->u_mode.ocb.data_nblocks = blkn + nblocks;

- _gcry_vaes_avx2_ocb_crypt_amd64 (keysched, (unsigned int)blkn, outbuf, inbuf,
- nblocks, nrounds, c->u_iv.iv, c->u_ctr.ctr,
- c->u_mode.ocb.L[0], encrypt);
+ return _gcry_vaes_avx2_ocb_crypt_amd64 (keysched, (unsigned int)blkn, outbuf,
+ inbuf, nblocks, nrounds, c->u_iv.iv,
+ c->u_ctr.ctr, c->u_mode.ocb.L[0],
+ encrypt);
+}
+
+size_t
+_gcry_aes_vaes_ocb_auth (gcry_cipher_hd_t c, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const void *keysched = ctx->keyschenc32;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int nrounds = ctx->rounds;
+ u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+ c->u_mode.ocb.aad_nblocks = blkn + nblocks;

- return 0;
+ return _gcry_vaes_avx2_ocb_crypt_amd64 (keysched, (unsigned int)blkn, NULL,
+ inbuf, nblocks, nrounds,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum,
+ c->u_mode.ocb.L[0], 2);
}

void
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index b49a0642..56acb199 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -125,6 +125,9 @@ extern void _gcry_aes_vaes_ctr32le_enc (void *context, unsigned char *ctr,
extern size_t _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks,
int encrypt);
+extern size_t _gcry_aes_vaes_ocb_auth (gcry_cipher_hd_t c,
+ const void *inbuf_arg,
+ size_t nblocks);
extern void _gcry_aes_vaes_xts_crypt (void *context, unsigned char *tweak,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks, int encrypt);
@@ -562,6 +565,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
bulk_ops->ctr_enc = _gcry_aes_vaes_ctr_enc;
bulk_ops->ctr32le_enc = _gcry_aes_vaes_ctr32le_enc;
bulk_ops->ocb_crypt = _gcry_aes_vaes_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_aes_vaes_ocb_auth;
bulk_ops->xts_crypt = _gcry_aes_vaes_xts_crypt;
bulk_ops->ecb_crypt = _gcry_aes_vaes_ecb_crypt;
}
--
2.39.2

