
[PATCH] Reduce size of x86-64 stitched Chacha20-Poly1305 implementations
* cipher/chacha20-amd64-avx2.S
(_gcry_chacha20_poly1305_amd64_avx2_blocks8): De-unroll round loop.
* cipher/chacha20-amd64-ssse3.S
(_gcry_chacha20_poly1305_amd64_ssse3_blocks4)
(_gcry_chacha20_poly1305_amd64_ssse3_blocks1): Ditto.
--

Object size before:
   text    data     bss     dec     hex  filename
  13428       0       0   13428    3474  cipher/.libs/chacha20-amd64-avx2.o
  23175       0       0   23175    5a87  cipher/.libs/chacha20-amd64-ssse3.o

Object size after:
   text    data     bss     dec     hex  filename
   4815       0       0    4815    12cf  cipher/.libs/chacha20-amd64-avx2.o
   9284       0       0    9284    2444  cipher/.libs/chacha20-amd64-ssse3.o

Benchmark on AMD Ryzen 3700X (AVX2 impl.):

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
     STREAM enc |     0.267 ns/B      3575 MiB/s      1.15 c/B      4318
     STREAM dec |     0.266 ns/B      3586 MiB/s      1.15 c/B      4329
   POLY1305 enc |     0.315 ns/B      3024 MiB/s      1.36 c/B    4315±1
   POLY1305 dec |     0.296 ns/B      3220 MiB/s      1.28 c/B      4310
  POLY1305 auth |     0.223 ns/B      4270 MiB/s     0.968 c/B      4335

After:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
     STREAM enc |     0.266 ns/B      3583 MiB/s      1.15 c/B      4327
     STREAM dec |     0.265 ns/B      3603 MiB/s      1.16 c/B    4371±1
   POLY1305 enc |     0.293 ns/B      3251 MiB/s      1.27 c/B      4315
   POLY1305 dec |     0.279 ns/B      3418 MiB/s      1.19 c/B    4282±3
  POLY1305 auth |     0.225 ns/B      4241 MiB/s     0.978 c/B      4351

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---

diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S
index de6263b69..053638d02 100644
--- a/cipher/chacha20-amd64-avx2.S
+++ b/cipher/chacha20-amd64-avx2.S
@@ -331,6 +331,8 @@ ELF(.size _gcry_chacha20_amd64_avx2_blocks8,
8-way stitched chacha20-poly1305
**********************************************************************/

+#define _ /*_*/
+
.align 8
.globl _gcry_chacha20_poly1305_amd64_avx2_blocks8
ELF(.type _gcry_chacha20_poly1305_amd64_avx2_blocks8,@function;)
@@ -353,7 +355,7 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:

vzeroupper;

- subq $(8 * 8) + STACK_MAX + 32, %rsp;
+ subq $(9 * 8) + STACK_MAX + 32, %rsp;
andq $~31, %rsp;

movq %rbx, (STACK_MAX + 0 * 8)(%rsp);
@@ -406,33 +408,14 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:
vpbroadcastd (15 * 4)(INPUT), X15;
vmovdqa X15, (STACK_TMP)(%rsp);

- # rounds 0,1
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
- POLY1305_BLOCK_PART1(0 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
- vmovdqa (STACK_TMP)(%rsp), X15;
- vmovdqa X8, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(1 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(2 * 16),
- POLY1305_BLOCK_PART2())
- vmovdqa (STACK_TMP)(%rsp), X8;
- vmovdqa X15, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(3 * 16))
+ /* Process eight ChaCha20 blocks and 32 Poly1305 blocks. */

- # rounds 2,3
+ movl $20, (STACK_MAX + 8 * 8 + 4)(%rsp);
+.Lround8_with_poly1305_outer:
+ movl $8, (STACK_MAX + 8 * 8)(%rsp);
+.Lround8_with_poly1305_inner:
+ /* rounds 0-7 & 10-17 */
+ POLY1305_BLOCK_PART1(0 * 16)
QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
POLY1305_BLOCK_PART2(),
POLY1305_BLOCK_PART3(),
@@ -440,231 +423,59 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:
POLY1305_BLOCK_PART5())
vmovdqa (STACK_TMP)(%rsp), X15;
vmovdqa X8, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
- POLY1305_BLOCK_PART1(4 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(5 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
- vmovdqa (STACK_TMP)(%rsp), X8;
- vmovdqa X15, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(6 * 16),
- POLY1305_BLOCK_PART2())
-
- # rounds 4,5
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(7 * 16))
- vmovdqa (STACK_TMP)(%rsp), X15;
- vmovdqa X8, (STACK_TMP)(%rsp);
+ POLY1305_BLOCK_PART1(1 * 16)
QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
POLY1305_BLOCK_PART2(),
POLY1305_BLOCK_PART3(),
POLY1305_BLOCK_PART4(),
POLY1305_BLOCK_PART5())
+ POLY1305_BLOCK_PART1(2 * 16)
QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
- POLY1305_BLOCK_PART1(8 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
- vmovdqa (STACK_TMP)(%rsp), X8;
- vmovdqa X15, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(9 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
-
- # rounds 6,7
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(10 * 16),
- POLY1305_BLOCK_PART2())
- vmovdqa (STACK_TMP)(%rsp), X15;
- vmovdqa X8, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(11 * 16))
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5())
- vmovdqa (STACK_TMP)(%rsp), X8;
- vmovdqa X15, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
- POLY1305_BLOCK_PART1(12 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
-
- # rounds 8,9
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(13 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
- vmovdqa (STACK_TMP)(%rsp), X15;
- vmovdqa X8, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(14 * 16),
- POLY1305_BLOCK_PART2())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(15 * 16))
- vmovdqa (STACK_TMP)(%rsp), X8;
- vmovdqa X15, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
POLY1305_BLOCK_PART2(),
POLY1305_BLOCK_PART3(),
POLY1305_BLOCK_PART4(),
POLY1305_BLOCK_PART5())
-
- # rounds 10,11
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
- POLY1305_BLOCK_PART1(16 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
- vmovdqa (STACK_TMP)(%rsp), X15;
- vmovdqa X8, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(17 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(18 * 16),
- POLY1305_BLOCK_PART2())
vmovdqa (STACK_TMP)(%rsp), X8;
vmovdqa X15, (STACK_TMP)(%rsp);
+ POLY1305_BLOCK_PART1(3 * 16)
+ lea (4 * 16)(POLY_RSRC), POLY_RSRC;
QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(19 * 16))
-
- # rounds 12,13
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
POLY1305_BLOCK_PART2(),
POLY1305_BLOCK_PART3(),
POLY1305_BLOCK_PART4(),
POLY1305_BLOCK_PART5())
- vmovdqa (STACK_TMP)(%rsp), X15;
- vmovdqa X8, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
- POLY1305_BLOCK_PART1(20 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(21 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
- vmovdqa (STACK_TMP)(%rsp), X8;
- vmovdqa X15, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(22 * 16),
- POLY1305_BLOCK_PART2())

- # rounds 14,15
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(23 * 16))
- vmovdqa (STACK_TMP)(%rsp), X15;
- vmovdqa X8, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
- POLY1305_BLOCK_PART1(24 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
- vmovdqa (STACK_TMP)(%rsp), X8;
- vmovdqa X15, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(25 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
+ subl $2, (STACK_MAX + 8 * 8)(%rsp);
+ jnz .Lround8_with_poly1305_inner;

- # rounds 16,17
+ /* rounds 8-9 & 18-19 */
QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(26 * 16),
- POLY1305_BLOCK_PART2())
+ _,
+ _,
+ _,
+ _)
vmovdqa (STACK_TMP)(%rsp), X15;
vmovdqa X8, (STACK_TMP)(%rsp);
QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(27 * 16))
+ _,
+ _,
+ _,
+ _)
QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5())
+ _,
+ _,
+ _,
+ _)
vmovdqa (STACK_TMP)(%rsp), X8;
vmovdqa X15, (STACK_TMP)(%rsp);
QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
- POLY1305_BLOCK_PART1(28 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
+ _,
+ _,
+ _,
+ _)

- # rounds 18,19
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(29 * 16),
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
- vmovdqa (STACK_TMP)(%rsp), X15;
- vmovdqa X8, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(30 * 16),
- POLY1305_BLOCK_PART2())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(31 * 16))
- vmovdqa (STACK_TMP)(%rsp), X8;
- vmovdqa X15, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5())
+ subl $10, (STACK_MAX + 8 * 8 + 4)(%rsp);
+ jnz .Lround8_with_poly1305_outer;

movq (STACK_MAX + 5 * 8)(%rsp), SRC;
movq (STACK_MAX + 6 * 8)(%rsp), DST;
@@ -741,7 +552,6 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:

subq $8, (STACK_MAX + 7 * 8)(%rsp); # NBLKS

- lea (32 * 16)(POLY_RSRC), POLY_RSRC;
lea (8 * 64)(DST), DST;
lea (8 * 64)(SRC), SRC;
movq SRC, (STACK_MAX + 5 * 8)(%rsp);
diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S
index 6bbf12fc1..77a27d349 100644
--- a/cipher/chacha20-amd64-ssse3.S
+++ b/cipher/chacha20-amd64-ssse3.S
@@ -511,6 +511,8 @@ ELF(.size _gcry_chacha20_amd64_ssse3_blocks1,
4-way stitched chacha20-poly1305
**********************************************************************/

+#define _ /*_*/
+
.align 8
.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks4
ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks4,@function;)
@@ -531,7 +533,7 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);

- subq $(8 * 8) + STACK_MAX + 16, %rsp;
+ subq $(9 * 8) + STACK_MAX + 16, %rsp;
andq $~15, %rsp;

movq %rbx, (STACK_MAX + 0 * 8)(%rsp);
@@ -586,51 +588,14 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:
movdqa X11, (STACK_TMP)(%rsp);
movdqa X15, (STACK_TMP1)(%rsp);

- /* rounds 0,1 */
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
- POLY1305_BLOCK_PART1(0 * 16),
- POLY1305_BLOCK_PART2())
- movdqa (STACK_TMP)(%rsp), X11;
- movdqa (STACK_TMP1)(%rsp), X15;
- movdqa X8, (STACK_TMP)(%rsp);
- movdqa X9, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(1 * 16))
- movdqa (STACK_TMP)(%rsp), X8;
- movdqa (STACK_TMP1)(%rsp), X9;
- movdqa X11, (STACK_TMP)(%rsp);
- movdqa X15, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
-
- /* rounds 2,3 */
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5())
- movdqa (STACK_TMP)(%rsp), X11;
- movdqa (STACK_TMP1)(%rsp), X15;
- movdqa X8, (STACK_TMP)(%rsp);
- movdqa X9, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
- POLY1305_BLOCK_PART1(2 * 16),
- POLY1305_BLOCK_PART2())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
- movdqa (STACK_TMP)(%rsp), X8;
- movdqa (STACK_TMP1)(%rsp), X9;
- movdqa X11, (STACK_TMP)(%rsp);
- movdqa X15, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(3 * 16))
+ /* Process four ChaCha20 blocks and sixteen Poly1305 blocks. */

- /* rounds 4,5 */
+ movl $20, (STACK_MAX + 8 * 8 + 4)(%rsp);
+.Lround4_with_poly1305_outer:
+ movl $8, (STACK_MAX + 8 * 8)(%rsp);
+.Lround4_with_poly1305_inner:
+ /* rounds 0-7 & 10-17 */
+ POLY1305_BLOCK_PART1(0 * 16)
QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
POLY1305_BLOCK_PART2(),
POLY1305_BLOCK_PART3())
@@ -641,50 +606,8 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:
QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
POLY1305_BLOCK_PART4(),
POLY1305_BLOCK_PART5())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
- POLY1305_BLOCK_PART1(4 * 16),
- POLY1305_BLOCK_PART2())
- movdqa (STACK_TMP)(%rsp), X8;
- movdqa (STACK_TMP1)(%rsp), X9;
- movdqa X11, (STACK_TMP)(%rsp);
- movdqa X15, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
-
- /* rounds 6,7 */
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(5 * 16))
- movdqa (STACK_TMP)(%rsp), X11;
- movdqa (STACK_TMP1)(%rsp), X15;
- movdqa X8, (STACK_TMP)(%rsp);
- movdqa X9, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5())
- movdqa (STACK_TMP)(%rsp), X8;
- movdqa (STACK_TMP1)(%rsp), X9;
- movdqa X11, (STACK_TMP)(%rsp);
- movdqa X15, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
- POLY1305_BLOCK_PART1(6 * 16),
- POLY1305_BLOCK_PART2())
-
- /* rounds 8,9 */
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
- movdqa (STACK_TMP)(%rsp), X11;
- movdqa (STACK_TMP1)(%rsp), X15;
- movdqa X8, (STACK_TMP)(%rsp);
- movdqa X9, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(7 * 16))
+ POLY1305_BLOCK_PART1(1 * 16)
+ lea (2 * 16)(POLY_RSRC), POLY_RSRC;
QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
POLY1305_BLOCK_PART2(),
POLY1305_BLOCK_PART3())
@@ -696,115 +619,33 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:
POLY1305_BLOCK_PART4(),
POLY1305_BLOCK_PART5())

- /* rounds 10,11 */
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
- POLY1305_BLOCK_PART1(8 * 16),
- POLY1305_BLOCK_PART2())
- movdqa (STACK_TMP)(%rsp), X11;
- movdqa (STACK_TMP1)(%rsp), X15;
- movdqa X8, (STACK_TMP)(%rsp);
- movdqa X9, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(9 * 16))
- movdqa (STACK_TMP)(%rsp), X8;
- movdqa (STACK_TMP1)(%rsp), X9;
- movdqa X11, (STACK_TMP)(%rsp);
- movdqa X15, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
-
- /* rounds 12,13 */
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5())
- movdqa (STACK_TMP)(%rsp), X11;
- movdqa (STACK_TMP1)(%rsp), X15;
- movdqa X8, (STACK_TMP)(%rsp);
- movdqa X9, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
- POLY1305_BLOCK_PART1(10 * 16),
- POLY1305_BLOCK_PART2())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
- movdqa (STACK_TMP)(%rsp), X8;
- movdqa (STACK_TMP1)(%rsp), X9;
- movdqa X11, (STACK_TMP)(%rsp);
- movdqa X15, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(11 * 16))
-
- /* rounds 14,15 */
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
- movdqa (STACK_TMP)(%rsp), X11;
- movdqa (STACK_TMP1)(%rsp), X15;
- movdqa X8, (STACK_TMP)(%rsp);
- movdqa X9, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5())
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
- POLY1305_BLOCK_PART1(12 * 16),
- POLY1305_BLOCK_PART2())
- movdqa (STACK_TMP)(%rsp), X8;
- movdqa (STACK_TMP1)(%rsp), X9;
- movdqa X11, (STACK_TMP)(%rsp);
- movdqa X15, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
+ subl $2, (STACK_MAX + 8 * 8)(%rsp);
+ jnz .Lround4_with_poly1305_inner;

- /* rounds 16,17 */
+ /* rounds 8-9 & 18-19 */
QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(13 * 16))
+ _,
+ _)
movdqa (STACK_TMP)(%rsp), X11;
movdqa (STACK_TMP1)(%rsp), X15;
movdqa X8, (STACK_TMP)(%rsp);
movdqa X9, (STACK_TMP1)(%rsp);
QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
+ _,
+ _)
QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5())
+ _,
+ _)
movdqa (STACK_TMP)(%rsp), X8;
movdqa (STACK_TMP1)(%rsp), X9;
movdqa X11, (STACK_TMP)(%rsp);
movdqa X15, (STACK_TMP1)(%rsp);
QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
- POLY1305_BLOCK_PART1(14 * 16),
- POLY1305_BLOCK_PART2())
+ _,
+ _)

- /* rounds 18,19 */
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
- POLY1305_BLOCK_PART3(),
- POLY1305_BLOCK_PART4())
- movdqa (STACK_TMP)(%rsp), X11;
- movdqa (STACK_TMP1)(%rsp), X15;
- movdqa X8, (STACK_TMP)(%rsp);
- movdqa X9, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
- POLY1305_BLOCK_PART5(),
- POLY1305_BLOCK_PART1(15 * 16))
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
- POLY1305_BLOCK_PART2(),
- POLY1305_BLOCK_PART3())
- movdqa (STACK_TMP)(%rsp), X8;
- movdqa (STACK_TMP1)(%rsp), X9;
- movdqa X11, (STACK_TMP)(%rsp);
- movdqa X15, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
- POLY1305_BLOCK_PART4(),
- POLY1305_BLOCK_PART5())
+ subl $10, (STACK_MAX + 8 * 8 + 4)(%rsp);
+ jnz .Lround4_with_poly1305_outer;

/* tmp := X15 */
movdqa (STACK_TMP)(%rsp), X11;
@@ -877,7 +718,6 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:

subq $4, (STACK_MAX + 7 * 8)(%rsp); # NBLKS

- lea (16 * 16)(POLY_RSRC), POLY_RSRC;
lea (4 * 64)(DST), DST;
lea (4 * 64)(SRC), SRC;
movq SRC, (STACK_MAX + 5 * 8)(%rsp);
@@ -954,7 +794,7 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);

- subq $(8 * 8), %rsp;
+ subq $(9 * 8), %rsp;
movq %rbx, (0 * 8)(%rsp);
movq %r12, (1 * 8)(%rsp);
movq %r13, (2 * 8)(%rsp);
@@ -999,95 +839,31 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:

/* Process two ChaCha20 blocks and eight Poly1305 blocks. */

+ movl $20, (8 * 8 + 4)(%rsp);
+.Lround2_with_poly1305_outer:
+ movl $8, (8 * 8)(%rsp);
+.Lround2_with_poly1305_inner:
POLY1305_BLOCK_PART1(0 * 16);
QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ lea (1 * 16)(POLY_RSRC), POLY_RSRC;
POLY1305_BLOCK_PART2();
QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
POLY1305_BLOCK_PART3();
QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
POLY1305_BLOCK_PART4();
QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
-
- POLY1305_BLOCK_PART5();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART1(1 * 16);
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART2();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
- POLY1305_BLOCK_PART3();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
-
- POLY1305_BLOCK_PART4();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART5();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART1(2 * 16);
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
- POLY1305_BLOCK_PART2();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
-
- POLY1305_BLOCK_PART3();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART4();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART5();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
- POLY1305_BLOCK_PART1(3 * 16);
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
-
- POLY1305_BLOCK_PART2();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART3();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART4();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
- POLY1305_BLOCK_PART5();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
-
- POLY1305_BLOCK_PART1(4 * 16);
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART2();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART3();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
- POLY1305_BLOCK_PART4();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
-
POLY1305_BLOCK_PART5();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART1(5 * 16);
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART2();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
- POLY1305_BLOCK_PART3();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);

- POLY1305_BLOCK_PART4();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART5();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART1(6 * 16);
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
- POLY1305_BLOCK_PART2();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+ subl $2, (8 * 8)(%rsp);
+ jnz .Lround2_with_poly1305_inner;

- POLY1305_BLOCK_PART3();
QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART4();
QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART5();
QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
- POLY1305_BLOCK_PART1(7 * 16);
QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);

- POLY1305_BLOCK_PART2();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART3();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART4();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
- POLY1305_BLOCK_PART5();
- QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+ subl $10, (8 * 8 + 4)(%rsp);
+ jnz .Lround2_with_poly1305_outer;

movq (5 * 8)(%rsp), SRC;
movq (6 * 8)(%rsp), DST;
@@ -1123,7 +899,6 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
clear(X15);

subq $2, (7 * 8)(%rsp); # NBLKS
- lea (2 * 64)(POLY_RSRC), POLY_RSRC;
lea (2 * 64)(SRC), SRC;
lea (2 * 64)(DST), DST;
movq SRC, (5 * 8)(%rsp);
@@ -1137,55 +912,31 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
movdqa X13, X3;

/* Process one ChaCha20 block and four Poly1305 blocks. */
+
+ movl $20, (8 * 8 + 4)(%rsp);
+.Lround1_with_poly1305_outer:
+ movl $8, (8 * 8)(%rsp);
+.Lround1_with_poly1305_inner:
POLY1305_BLOCK_PART1(0 * 16);
QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
POLY1305_BLOCK_PART2();
QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+ lea (1 * 16)(POLY_RSRC), POLY_RSRC;

POLY1305_BLOCK_PART3();
QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
POLY1305_BLOCK_PART4();
QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
-
- POLY1305_BLOCK_PART5();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART1(1 * 16);
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
-
- POLY1305_BLOCK_PART2();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART3();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
-
- POLY1305_BLOCK_PART4();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
POLY1305_BLOCK_PART5();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
-
- POLY1305_BLOCK_PART1(2 * 16);
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART2();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);

- POLY1305_BLOCK_PART3();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART4();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
-
- POLY1305_BLOCK_PART5();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART1(3 * 16);
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+ subl $4, (8 * 8)(%rsp);
+ jnz .Lround1_with_poly1305_inner;

- POLY1305_BLOCK_PART2();
QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART3();
QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);

- POLY1305_BLOCK_PART4();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
- POLY1305_BLOCK_PART5();
- QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+ subl $10, (8 * 8 + 4)(%rsp);
+ jnz .Lround1_with_poly1305_outer;

movq (5 * 8)(%rsp), SRC;
movq (6 * 8)(%rsp), DST;
@@ -1204,7 +955,6 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
xor_src_dst(DST, SRC, 12 * 4, X3, X7);

subq $1, (7 * 8)(%rsp); # NBLKS
- lea (64)(POLY_RSRC), POLY_RSRC;
lea (64)(SRC), SRC;
lea (64)(DST), DST;
movq SRC, (5 * 8)(%rsp);

