
[PATCH] rijndael-aesni: use inline checksumming for OCB decryption
* cipher/rijndael-aesni.c (aesni_ocb_checksum): Remove.
(aesni_ocb_dec): Add inline checksumming.
--

Inline checksumming is far faster on Ryzen processors on i386
builds than two-pass checksumming.
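
As an illustration of the change (a minimal C sketch, not the actual
libgcrypt code; decrypt_block below is a hypothetical stand-in for the
AES-NI path): OCB's Checksum is simply the XOR of all plaintext blocks,
and the two variants differ only in when that XOR happens:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define BLOCKSIZE 16

/* Hypothetical stand-in for the AES-NI block decryption. */
static void
decrypt_block (uint8_t *out, const uint8_t *in)
{
  memcpy (out, in, BLOCKSIZE);
}

/* Two-pass: decrypt everything first, then walk the plaintext again to
 * fold it into the checksum (the role of the removed aesni_ocb_checksum). */
static void
checksum_two_pass (uint8_t checksum[BLOCKSIZE],
                   const uint8_t *plaintext, size_t nblocks)
{
  size_t i, j;

  for (i = 0; i < nblocks; i++)
    for (j = 0; j < BLOCKSIZE; j++)
      checksum[j] ^= plaintext[i * BLOCKSIZE + j];
}

/* Inline: fold each block into the checksum as soon as it is produced,
 * inside the decryption loop, so the plaintext is read only once. */
static void
ocb_dec_inline_checksum (uint8_t *outbuf, const uint8_t *inbuf,
                         size_t nblocks, uint8_t checksum[BLOCKSIZE])
{
  size_t i, j;

  for (i = 0; i < nblocks; i++)
    {
      decrypt_block (outbuf + i * BLOCKSIZE, inbuf + i * BLOCKSIZE);
      for (j = 0; j < BLOCKSIZE; j++)
        checksum[j] ^= outbuf[i * BLOCKSIZE + j];
    }
}

The two-pass variant rereads the plaintext from memory; the inline
variant folds each block in while it is still in registers, which is
what the pxor-into-%xmm7 instructions added by the patch below do.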

Benchmark on AMD Ryzen 9 7900X (i386):

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        OCB dec |     0.180 ns/B      5292 MiB/s     0.847 c/B      4700

After (~2x faster):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        OCB dec |     0.091 ns/B     10491 MiB/s     0.427 c/B      4700

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/rijndael-aesni.c | 220 ++++++++--------------------------------
1 file changed, 43 insertions(+), 177 deletions(-)

diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 906737a6..b33ef7ed 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -2710,174 +2710,6 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
}


-static ASM_FUNC_ATTR_INLINE void
-aesni_ocb_checksum (gcry_cipher_hd_t c, const unsigned char *plaintext,
- size_t nblocks)
-{
- RIJNDAEL_context *ctx = (void *)&c->context.c;
-
- /* Calculate checksum */
- asm volatile ("movdqu %[checksum], %%xmm6\n\t"
- "pxor %%xmm1, %%xmm1\n\t"
- "pxor %%xmm2, %%xmm2\n\t"
- "pxor %%xmm3, %%xmm3\n\t"
- :
- :[checksum] "m" (*c->u_ctr.ctr)
- : "memory" );
-
- if (0) {}
-#if defined(HAVE_GCC_INLINE_ASM_AVX2)
- else if (nblocks >= 16 && ctx->use_avx2)
- {
- /* Use wider 256-bit registers for fast xoring of plaintext. */
- asm volatile ("vzeroupper\n\t"
- "vpxor %%xmm0, %%xmm0, %%xmm0\n\t"
- "vpxor %%xmm4, %%xmm4, %%xmm4\n\t"
- "vpxor %%xmm5, %%xmm5, %%xmm5\n\t"
- "vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
- :
- :
- : "memory");
-
- for (;nblocks >= 16; nblocks -= 16)
- {
- asm volatile ("vpxor %[ptr0], %%ymm6, %%ymm6\n\t"
- "vpxor %[ptr1], %%ymm1, %%ymm1\n\t"
- "vpxor %[ptr2], %%ymm2, %%ymm2\n\t"
- "vpxor %[ptr3], %%ymm3, %%ymm3\n\t"
- :
- : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
- [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
- [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
- [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2))
- : "memory" );
- asm volatile ("vpxor %[ptr4], %%ymm0, %%ymm0\n\t"
- "vpxor %[ptr5], %%ymm4, %%ymm4\n\t"
- "vpxor %[ptr6], %%ymm5, %%ymm5\n\t"
- "vpxor %[ptr7], %%ymm7, %%ymm7\n\t"
- :
- : [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
- [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
- [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
- [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
- : "memory" );
- plaintext += BLOCKSIZE * 16;
- }
-
- asm volatile ("vpxor %%ymm0, %%ymm6, %%ymm6\n\t"
- "vpxor %%ymm4, %%ymm1, %%ymm1\n\t"
- "vpxor %%ymm5, %%ymm2, %%ymm2\n\t"
- "vpxor %%ymm7, %%ymm3, %%ymm3\n\t"
- "vextracti128 $1, %%ymm6, %%xmm0\n\t"
- "vextracti128 $1, %%ymm1, %%xmm4\n\t"
- "vextracti128 $1, %%ymm2, %%xmm5\n\t"
- "vextracti128 $1, %%ymm3, %%xmm7\n\t"
- "vpxor %%xmm0, %%xmm6, %%xmm6\n\t"
- "vpxor %%xmm4, %%xmm1, %%xmm1\n\t"
- "vpxor %%xmm5, %%xmm2, %%xmm2\n\t"
- "vpxor %%xmm7, %%xmm3, %%xmm3\n\t"
- "vzeroupper\n\t"
- :
- :
- : "memory" );
- }
-#endif
-#if defined(HAVE_GCC_INLINE_ASM_AVX)
- else if (nblocks >= 16 && ctx->use_avx)
- {
- /* Same as AVX2, except using 256-bit floating point instructions. */
- asm volatile ("vzeroupper\n\t"
- "vxorpd %%xmm0, %%xmm0, %%xmm0\n\t"
- "vxorpd %%xmm4, %%xmm4, %%xmm4\n\t"
- "vxorpd %%xmm5, %%xmm5, %%xmm5\n\t"
- "vxorpd %%xmm7, %%xmm7, %%xmm7\n\t"
- :
- :
- : "memory");
-
- for (;nblocks >= 16; nblocks -= 16)
- {
- asm volatile ("vxorpd %[ptr0], %%ymm6, %%ymm6\n\t"
- "vxorpd %[ptr1], %%ymm1, %%ymm1\n\t"
- "vxorpd %[ptr2], %%ymm2, %%ymm2\n\t"
- "vxorpd %[ptr3], %%ymm3, %%ymm3\n\t"
- :
- : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
- [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
- [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
- [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2))
- : "memory" );
- asm volatile ("vxorpd %[ptr4], %%ymm0, %%ymm0\n\t"
- "vxorpd %[ptr5], %%ymm4, %%ymm4\n\t"
- "vxorpd %[ptr6], %%ymm5, %%ymm5\n\t"
- "vxorpd %[ptr7], %%ymm7, %%ymm7\n\t"
- :
- : [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
- [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
- [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
- [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
- : "memory" );
- plaintext += BLOCKSIZE * 16;
- }
-
- asm volatile ("vxorpd %%ymm0, %%ymm6, %%ymm6\n\t"
- "vxorpd %%ymm4, %%ymm1, %%ymm1\n\t"
- "vxorpd %%ymm5, %%ymm2, %%ymm2\n\t"
- "vxorpd %%ymm7, %%ymm3, %%ymm3\n\t"
- "vextractf128 $1, %%ymm6, %%xmm0\n\t"
- "vextractf128 $1, %%ymm1, %%xmm4\n\t"
- "vextractf128 $1, %%ymm2, %%xmm5\n\t"
- "vextractf128 $1, %%ymm3, %%xmm7\n\t"
- "vxorpd %%xmm0, %%xmm6, %%xmm6\n\t"
- "vxorpd %%xmm4, %%xmm1, %%xmm1\n\t"
- "vxorpd %%xmm5, %%xmm2, %%xmm2\n\t"
- "vxorpd %%xmm7, %%xmm3, %%xmm3\n\t"
- "vzeroupper\n\t"
- :
- :
- : "memory" );
- }
-#endif
-
- for (;nblocks >= 4; nblocks -= 4)
- {
- asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
- "movdqu %[ptr1], %%xmm4\n\t"
- "movdqu %[ptr2], %%xmm5\n\t"
- "movdqu %[ptr3], %%xmm7\n\t"
- "pxor %%xmm0, %%xmm6\n\t"
- "pxor %%xmm4, %%xmm1\n\t"
- "pxor %%xmm5, %%xmm2\n\t"
- "pxor %%xmm7, %%xmm3\n\t"
- :
- : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE)),
- [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE)),
- [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE)),
- [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE))
- : "memory" );
- plaintext += BLOCKSIZE * 4;
- }
-
- for (;nblocks >= 1; nblocks -= 1)
- {
- asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
- "pxor %%xmm0, %%xmm6\n\t"
- :
- : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE))
- : "memory" );
- plaintext += BLOCKSIZE;
- }
-
- asm volatile ("pxor %%xmm1, %%xmm6\n\t"
- "pxor %%xmm2, %%xmm6\n\t"
- "pxor %%xmm3, %%xmm6\n\t"
- "movdqu %%xmm6, %[checksum]\n\t"
- : [checksum] "=m" (*c->u_ctr.ctr)
- :
- : "memory" );
-}
-
-
static unsigned int ASM_FUNC_ATTR_NOINLINE
aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
@@ -3401,9 +3233,11 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,

/* Preload Offset */
asm volatile ("movdqu %[iv], %%xmm5\n\t"
- : /* No output */
- : [iv] "m" (*c->u_iv.iv)
- : "memory" );
+ "movdqu %[ctr], %%xmm7\n\t"
+ : /* No output */
+ : [iv] "m" (*c->u_iv.iv),
+ [ctr] "m" (*c->u_ctr.ctr)
+ : "memory" );

for ( ;nblocks && n % 4; nblocks-- )
{
@@ -3424,6 +3258,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,

asm volatile ("pxor %%xmm5, %%xmm0\n\t"
"movdqu %%xmm0, %[outbuf]\n\t"
+ "pxor %%xmm0, %%xmm7\n\t"
: [outbuf] "=m" (*outbuf)
:
: "memory" );
@@ -3452,6 +3287,15 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
"pxor %[first_key], %%xmm5\n\t"
"pxor %[first_key], %%xmm0\n\t"
"movdqa %%xmm0, %[lxfkey]\n\t"
+ /* Clear plaintext blocks */
+ "pxor %%xmm1, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm4\n\t"
+ "pxor %%xmm8, %%xmm8\n\t"
+ "pxor %%xmm9, %%xmm9\n\t"
+ "pxor %%xmm10, %%xmm10\n\t"
+ "pxor %%xmm11, %%xmm11\n\t"
: [lxfkey] "=m" (*lxf_key)
: [l0] "m" (*c->u_mode.ocb.L[0]),
[last_key] "m" (ctx->keyschdec[ctx->rounds][0][0]),
@@ -3463,7 +3307,9 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
n += 4;
l = aes_ocb_get_l(c, n);

- asm volatile ("movdqu %[l0l1], %%xmm10\n\t"
+ asm volatile ("pxor %%xmm10, %%xmm1\n\t"
+ "pxor %%xmm11, %%xmm2\n\t"
+ "movdqu %[l0l1], %%xmm10\n\t"
"movdqu %[l1], %%xmm11\n\t"
"movdqu %[l3], %%xmm15\n\t"
:
@@ -3477,7 +3323,10 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,

/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i) */
- asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
+ asm volatile ("pxor %%xmm1, %%xmm4\n\t"
+ "pxor %%xmm2, %%xmm8\n\t"
+ "pxor %%xmm3, %%xmm9\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
"movdqu %[inbuf1], %%xmm2\n\t"
"movdqu %[inbuf2], %%xmm3\n\t"
:
@@ -3485,8 +3334,11 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
[inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
[inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
: "memory" );
- asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
+ asm volatile ("pxor %%xmm4, %%xmm7\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm8, %%xmm7\n\t"
"movdqu %[inbuf4], %%xmm8\n\t"
+ "pxor %%xmm9, %%xmm7\n\t"
"movdqu %[inbuf5], %%xmm9\n\t"
:
: [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
@@ -3722,6 +3574,15 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
asm volatile ("pxor %[first_key], %%xmm5\n\t"
"pxor %%xmm0, %%xmm0\n\t"
"movdqu %%xmm0, %[lxfkey]\n\t"
+ /* Add plaintext blocks to checksum */
+ "pxor %%xmm1, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm4\n\t"
+ "pxor %%xmm9, %%xmm8\n\t"
+ "pxor %%xmm11, %%xmm10\n\t"
+ "pxor %%xmm2, %%xmm4\n\t"
+ "pxor %%xmm8, %%xmm10\n\t"
+ "pxor %%xmm4, %%xmm7\n\t"
+ "pxor %%xmm10, %%xmm7\n\t"
: [lxfkey] "=m" (*lxf_key)
: [first_key] "m" (ctx->keyschdec[0][0][0])
: "memory" );
@@ -3782,8 +3643,10 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,

asm volatile ("pxor %[tmpbuf0],%%xmm1\n\t"
"movdqu %%xmm1, %[outbuf0]\n\t"
+ "pxor %%xmm1, %%xmm7\n\t"
"pxor %[tmpbuf1],%%xmm2\n\t"
"movdqu %%xmm2, %[outbuf1]\n\t"
+ "pxor %%xmm2, %%xmm7\n\t"
: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
[outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
: [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
@@ -3791,8 +3654,10 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
: "memory" );
asm volatile ("pxor %[tmpbuf2],%%xmm3\n\t"
"movdqu %%xmm3, %[outbuf2]\n\t"
+ "pxor %%xmm3, %%xmm7\n\t"
"pxor %%xmm5, %%xmm4\n\t"
"movdqu %%xmm4, %[outbuf3]\n\t"
+ "pxor %%xmm4, %%xmm7\n\t"
: [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
[outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
: [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE))
@@ -3822,6 +3687,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,

asm volatile ("pxor %%xmm5, %%xmm0\n\t"
"movdqu %%xmm0, %[outbuf]\n\t"
+ "pxor %%xmm0, %%xmm7\n\t"
: [outbuf] "=m" (*outbuf)
:
: "memory" );
@@ -3832,7 +3698,9 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,

c->u_mode.ocb.data_nblocks = n;
asm volatile ("movdqu %%xmm5, %[iv]\n\t"
- : [iv] "=m" (*c->u_iv.iv)
+ "movdqu %%xmm7, %[ctr]\n\t"
+ : [iv] "=m" (*c->u_iv.iv),
+ [ctr] "=m" (*c->u_ctr.ctr)
:
: "memory" );

@@ -3846,8 +3714,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
:
: "memory" );

- aesni_ocb_checksum (c, outbuf_arg, nblocks_arg);
-
aesni_cleanup ();
aesni_cleanup_2_7 ();

--
2.39.2


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel
Re: [PATCH] rijndael-aesni: use inline checksumming for OCB decryption
On Sun, 28 May 2023 17:53, Jussi Kivilinna said:

> Inline checksumming is far faster on Ryzen processors on i386
> builds than two-pass checksumming.

That is indeed a large performance boost. Did you have a chance to
benchmark it on some common Intel CPU?


Shalom-Salam,

Werner

--
The pioneers of a warless world are the youth that
refuse military service. - A. Einstein
Re: [PATCH] rijndael-aesni: use inline checksumming for OCB decryption
On 30.5.2023 13.32, Werner Koch via Gcrypt-devel wrote:
> On Sun, 28 May 2023 17:53, Jussi Kivilinna said:
>
>> Inline checksumming is far faster on Ryzen processors on i386
>> builds than two-pass checksumming.
>
> That is indeed a large performance boost. Did you have a chance to
> benchmark it on some common Intel CPU?
>

I have now tested with Intel Tiger Lake; performance dropped by 9%, which
is an unexpectedly large change. I'll try a few different things to see if
I can avoid such a drop.

-Jussi

>
> Shalom-Salam,
>
> Werner
>


Re: [PATCH] rijndael-aesni: use inline checksumming for OCB decryption
On 31.5.2023 7.48, Jussi Kivilinna wrote:
> On 30.5.2023 13.32, Werner Koch via Gcrypt-devel wrote:
>> On Sun, 28 May 2023 17:53, Jussi Kivilinna said:
>>
>>> Inline checksumming is far faster on Ryzen processors on i386
>>> builds than two-pass checksumming.
>>
>> That is indeed a large performance boost. Did you have a chance to
>> benchmark it on some common Intel CPU?
>>
>
> I have now tested with Intel Tiger Lake; performance dropped by 9%, which
> is an unexpectedly large change. I'll try a few different things to see if
> I can avoid such a drop.

The performance problem I'm seeing is limited to Zen4, and only to 32-bit
execution mode. Running the same code in 64-bit mode does not suffer from
this problem. It seems to be somehow related to mixed XMM/YMM register
usage and the vzeroupper instruction not working as expected.

For example, if I disable the 8-block AES-OCB-enc loop and add the
following instructions at the end of the 4-block loop:
"vpcmpeqd %%ymm0, %%ymm0, %%ymm0\n\t"
"vzeroupper\n\t"

With these instructions, I see roughly two times slower performance in
32-bit mode (0.851 cycles/byte vs 0.411 c/B). In 64-bit mode, the above
instructions slow execution by only about 1% (0.418 c/B vs 0.414 c/B).
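
For reference, a minimal stand-alone sketch of that kind of experiment
(hypothetical, not the exact test used above; timings are raw TSC ticks
rather than core cycles, and something like "gcc -O2 -mavx2" is assumed):

#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>

/* SSE-only loop body: plain pxor work, no YMM state touched. */
static void
sse_only (void)
{
  asm volatile ("pxor %%xmm1, %%xmm0\n\t"
                "pxor %%xmm2, %%xmm0\n\t"
                ::: "xmm0", "xmm1", "xmm2");
}

/* Same pxor work, but additionally dirty the YMM upper halves and clear
 * them with vzeroupper, mimicking the instructions quoted above. */
static void
mixed_xmm_ymm (void)
{
  asm volatile ("pxor %%xmm1, %%xmm0\n\t"
                "pxor %%xmm2, %%xmm0\n\t"
                "vpcmpeqd %%ymm3, %%ymm3, %%ymm3\n\t"
                "vzeroupper\n\t"
                ::: "xmm0", "xmm1", "xmm2", "xmm3");
}

/* Time ITERS calls of FN; the call overhead is identical for both
 * variants, so only the difference between the two results matters. */
static uint64_t
time_loop (void (*fn)(void), long iters)
{
  uint64_t start = __rdtsc ();
  long i;

  for (i = 0; i < iters; i++)
    fn ();
  return __rdtsc () - start;
}

int
main (void)
{
  const long iters = 100000000;

  time_loop (sse_only, iters); /* warm-up */
  printf ("sse only: %.3f ticks/iter\n",
          (double)time_loop (sse_only, iters) / iters);
  printf ("mixed:    %.3f ticks/iter\n",
          (double)time_loop (mixed_xmm_ymm, iters) / iters);
  return 0;
}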

So, I won't apply this patch after all.

-Jussi

>
> -Jussi
>
>>
>> Shalom-Salam,
>>
>>     Werner