Mailing List Archive: [PATCH 1/3] ghash|polyval: add x86_64 VPCLMUL/AVX2 accelerated implementation

* cipher/cipher-gcm-intel-pclmul.c (GCM_INTEL_USE_VPCLMUL_AVX2)
(GCM_INTEL_AGGR8_TABLE_INITIALIZED)
(GCM_INTEL_AGGR16_TABLE_INITIALIZED): New.
(gfmul_pclmul): Fixes to comments.
[GCM_USE_INTEL_VPCLMUL_AVX2] (GFMUL_AGGR16_ASM_VPCMUL_AVX2)
(gfmul_vpclmul_avx2_aggr16, gfmul_vpclmul_avx2_aggr16_le)
(gfmul_pclmul_avx2, gcm_lsh_avx2, load_h1h2_to_ymm1)
(ghash_setup_aggr8_avx2, ghash_setup_aggr16_avx2): New.
(_gcry_ghash_setup_intel_pclmul): Add 'hw_features' parameter; Setup
ghash and polyval function pointers for context; Add VPCLMUL/AVX2 code
path; Defer aggr8 and aggr16 table initialization until first use in
'_gcry_ghash_intel_pclmul' or '_gcry_polyval_intel_pclmul'.
[__x86_64__] (ghash_setup_aggr8): New.
(_gcry_ghash_intel_pclmul): Add VPCLMUL/AVX2 code path; Add call for
aggr8 table initialization.
(_gcry_polyval_intel_pclmul): Add VPCLMUL/AVX2 code path; Add call for
aggr8 table initialization.
* cipher/cipher-gcm.c [GCM_USE_INTEL_PCLMUL] (_gcry_ghash_intel_pclmul)
(_gcry_polyval_intel_pclmul): Remove.
[GCM_USE_INTEL_PCLMUL] (_gcry_ghash_setup_intel_pclmul): Add
'hw_features' parameter.
(setupM) [GCM_USE_INTEL_PCLMUL]: Pass HW features to
'_gcry_ghash_setup_intel_pclmul'; Let '_gcry_ghash_setup_intel_pclmul'
setup function pointers.
* cipher/cipher-internal.h (GCM_USE_INTEL_VPCLMUL_AVX2): New.
(gcry_cipher_handle): Add member 'gcm.hw_impl_flags'.
--

Patch adds VPCLMUL/AVX2 accelerated implementation for GHASH (GCM) and
POLYVAL (GCM-SIV).

Benchmark on AMD Ryzen 5800X (zen3):

Before:
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
GCM auth | 0.088 ns/B 10825 MiB/s 0.427 c/B 4850
GCM-SIV auth | 0.083 ns/B 11472 MiB/s 0.403 c/B 4850

After: (~1.93x faster)
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
GCM auth | 0.045 ns/B 21098 MiB/s 0.219 c/B 4850
GCM-SIV auth | 0.043 ns/B 22181 MiB/s 0.209 c/B 4850

AES128-GCM / AES128-GCM-SIV encryption:
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
GCM enc | 0.079 ns/B 12073 MiB/s 0.383 c/B 4850
GCM-SIV enc | 0.076 ns/B 12500 MiB/s 0.370 c/B 4850

Benchmark on Intel Core i3-1115G4 (tigerlake):

Before:
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
GCM auth | 0.080 ns/B 11919 MiB/s 0.327 c/B 4090
GCM-SIV auth | 0.075 ns/B 12643 MiB/s 0.309 c/B 4090

After: (~1.28x faster)
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
GCM auth | 0.062 ns/B 15348 MiB/s 0.254 c/B 4090
GCM-SIV auth | 0.058 ns/B 16381 MiB/s 0.238 c/B 4090

AES128-GCM / AES128-GCM-SIV encryption:
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
GCM enc | 0.101 ns/B 9441 MiB/s 0.413 c/B 4090
GCM-SIV enc | 0.098 ns/B 9692 MiB/s 0.402 c/B 4089

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/cipher-gcm-intel-pclmul.c | 809 +++++++++++++++++++++++++++----
cipher/cipher-gcm.c | 15 +-
cipher/cipher-internal.h | 11 +
3 files changed, 724 insertions(+), 111 deletions(-)

diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c
index daf807d0..b7324e8f 100644
--- a/cipher/cipher-gcm-intel-pclmul.c
+++ b/cipher/cipher-gcm-intel-pclmul.c
@@ -1,6 +1,6 @@
/* cipher-gcm-intel-pclmul.c - Intel PCLMUL accelerated Galois Counter Mode
* implementation
- * Copyright (C) 2013-2014,2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013-2014,2019,2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -49,12 +49,18 @@
#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE

+#define GCM_INTEL_USE_VPCLMUL_AVX2 (1 << 0) /* gcm.hw_impl_flags bit: VPCLMUL/AVX2 code path was selected at key setup */
+#define GCM_INTEL_AGGR8_TABLE_INITIALIZED (1 << 1) /* gcm.hw_impl_flags bit: set by the aggr8 table setup routine */
+#define GCM_INTEL_AGGR16_TABLE_INITIALIZED (1 << 2) /* gcm.hw_impl_flags bit: set by the aggr16 table setup routine */
+
+
/*
Intel PCLMUL ghash based on white paper:
"Intel® Carry-Less Multiplication Instruction and its Usage for Computing the
GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
*/
-static ASM_FUNC_ATTR_INLINE void reduction(void)
+static ASM_FUNC_ATTR_INLINE
+void reduction(void)
{
/* input: <xmm1:xmm3> */

@@ -83,7 +89,8 @@ static ASM_FUNC_ATTR_INLINE void reduction(void)
::: "memory" );
}

-static ASM_FUNC_ATTR_INLINE void gfmul_pclmul(void)
+static ASM_FUNC_ATTR_INLINE
+void gfmul_pclmul(void)
{
/* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified.
Input must be converted to little-endian.
@@ -358,12 +365,12 @@ gfmul_pclmul_aggr4_le(const void *buf, const void *h_1, const void *h_table)
\
"pshufd $78, %%xmm8, %%xmm11\n\t" \
"pshufd $78, %%xmm5, %%xmm7\n\t" \
- "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ \
- "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ \
+ "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 2:a0+a1 */ \
+ "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ \
"movdqa %%xmm8, %%xmm6\n\t" \
- "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ \
- "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ \
- "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ \
+ "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */ \
+ "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ \
+ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ \
\
"pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */ \
"pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */ \
@@ -371,16 +378,16 @@ gfmul_pclmul_aggr4_le(const void *buf, const void *h_1, const void *h_table)
\
"pshufd $78, %%xmm0, %%xmm11\n\t" \
"pshufd $78, %%xmm2, %%xmm7\n\t" \
- "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ \
- "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ \
+ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 1:a0+a1 */ \
+ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ \
"movdqa %%xmm0, %%xmm6\n\t" \
- "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ \
- "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ \
- "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ \
+ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */ \
+ "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ \
+ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ \
\
- "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */ \
- "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */ \
- "pxor %%xmm7, %%xmm4\n\t"/* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */\
+ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4+5+6+7+8:a0*b0 */ \
+ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4+5+6+7+8:a1*b1 */ \
+ "pxor %%xmm7, %%xmm4\n\t"/* xmm4 holds 1+2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ \
\
/* aggregated reduction... */ \
"movdqa %%xmm3, %%xmm5\n\t" \
@@ -432,14 +439,409 @@ gfmul_pclmul_aggr8_le(const void *buf, const void *h_table)

reduction();
}
-#endif

-static ASM_FUNC_ATTR_INLINE void gcm_lsh(void *h, unsigned int hoffs)
+#ifdef GCM_USE_INTEL_VPCLMUL_AVX2
+
+#define GFMUL_AGGR16_ASM_VPCMUL_AVX2(be_to_le) \
+ /* hash 16 blocks at [buf]: perform clmul and merge results... */ \
+ "vmovdqu 0*16(%[buf]), %%ymm5\n\t" \
+ "vmovdqu 2*16(%[buf]), %%ymm2\n\t" \
+ be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */ \
+ be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */ \
+ "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" \
+ \
+ "vpshufd $78, %%ymm0, %%ymm5\n\t" \
+ "vpshufd $78, %%ymm1, %%ymm4\n\t" \
+ "vpxor %%ymm0, %%ymm5, %%ymm5\n\t" /* ymm5 holds 15|16:a0+a1 */ \
+ "vpxor %%ymm1, %%ymm4, %%ymm4\n\t" /* ymm4 holds 15|16:b0+b1 */ \
+ "vpclmulqdq $0, %%ymm1, %%ymm0, %%ymm3\n\t" /* ymm3 holds 15|16:a0*b0 */ \
+ "vpclmulqdq $17, %%ymm0, %%ymm1, %%ymm1\n\t" /* ymm1 holds 15|16:a1*b1 */ \
+ "vpclmulqdq $0, %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds 15|16:(a0+a1)*(b0+b1) */ \
+ \
+ "vmovdqu %[h1_h2], %%ymm0\n\t" \
+ \
+ "vpshufd $78, %%ymm13, %%ymm14\n\t" \
+ "vpshufd $78, %%ymm2, %%ymm7\n\t" \
+ "vpxor %%ymm13, %%ymm14, %%ymm14\n\t" /* ymm14 holds 13|14:a0+a1 */ \
+ "vpxor %%ymm2, %%ymm7, %%ymm7\n\t" /* ymm7 holds 13|14:b0+b1 */ \
+ "vpclmulqdq $0, %%ymm2, %%ymm13, %%ymm6\n\t" /* ymm6 holds 13|14:a0*b0 */ \
+ "vpclmulqdq $17, %%ymm13, %%ymm2, %%ymm2\n\t" /* ymm2 holds 13|14:a1*b1 */ \
+ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 13|14:(a0+a1)*(b0+b1) */\
+ \
+ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 13+15|14+16:a0*b0 */ \
+ "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 13+15|14+16:a1*b1 */ \
+ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 13+15|14+16:(a0+a1)*(b0+b1) */ \
+ \
+ "vmovdqu 4*16(%[buf]), %%ymm5\n\t" \
+ "vmovdqu 6*16(%[buf]), %%ymm2\n\t" \
+ be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */ \
+ be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */ \
+ \
+ "vpshufd $78, %%ymm12, %%ymm14\n\t" \
+ "vpshufd $78, %%ymm5, %%ymm7\n\t" \
+ "vpxor %%ymm12, %%ymm14, %%ymm14\n\t" /* ymm14 holds 11|12:a0+a1 */ \
+ "vpxor %%ymm5, %%ymm7, %%ymm7\n\t" /* ymm7 holds 11|12:b0+b1 */ \
+ "vpclmulqdq $0, %%ymm5, %%ymm12, %%ymm6\n\t" /* ymm6 holds 11|12:a0*b0 */ \
+ "vpclmulqdq $17, %%ymm12, %%ymm5, %%ymm5\n\t" /* ymm5 holds 11|12:a1*b1 */ \
+ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 11|12:(a0+a1)*(b0+b1) */\
+ \
+ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 11+13+15|12+14+16:a0*b0 */ \
+ "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 11+13+15|12+14+16:a1*b1 */ \
+ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 11+13+15|12+14+16:(a0+a1)*(b0+b1) */\
+ \
+ "vpshufd $78, %%ymm11, %%ymm14\n\t" \
+ "vpshufd $78, %%ymm2, %%ymm7\n\t" \
+ "vpxor %%ymm11, %%ymm14, %%ymm14\n\t" /* ymm14 holds 9|10:a0+a1 */ \
+ "vpxor %%ymm2, %%ymm7, %%ymm7\n\t" /* ymm7 holds 9|10:b0+b1 */ \
+ "vpclmulqdq $0, %%ymm2, %%ymm11, %%ymm6\n\t" /* ymm6 holds 9|10:a0*b0 */ \
+ "vpclmulqdq $17, %%ymm11, %%ymm2, %%ymm2\n\t" /* ymm2 holds 9|10:a1*b1 */ \
+ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 9|10:(a0+a1)*(b0+b1) */ \
+ \
+ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 9+11+…+15|10+12+…+16:a0*b0 */ \
+ "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 9+11+…+15|10+12+…+16:a1*b1 */ \
+ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 9+11+…+15|10+12+…+16:(a0+a1)*(b0+b1) */\
+ \
+ "vmovdqu 8*16(%[buf]), %%ymm5\n\t" \
+ "vmovdqu 10*16(%[buf]), %%ymm2\n\t" \
+ be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */ \
+ be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */ \
+ \
+ "vpshufd $78, %%ymm10, %%ymm14\n\t" \
+ "vpshufd $78, %%ymm5, %%ymm7\n\t" \
+ "vpxor %%ymm10, %%ymm14, %%ymm14\n\t" /* ymm14 holds 7|8:a0+a1 */ \
+ "vpxor %%ymm5, %%ymm7, %%ymm7\n\t" /* ymm7 holds 7|8:b0+b1 */ \
+ "vpclmulqdq $0, %%ymm5, %%ymm10, %%ymm6\n\t" /* ymm6 holds 7|8:a0*b0 */ \
+ "vpclmulqdq $17, %%ymm10, %%ymm5, %%ymm5\n\t" /* ymm5 holds 7|8:a1*b1 */ \
+ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 7|8:(a0+a1)*(b0+b1) */ \
+ \
+ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 7+9+…+15|8+10+…+16:a0*b0 */ \
+ "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 7+9+…+15|8+10+…+16:a1*b1 */ \
+ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 7+9+…+15|8+10+…+16:(a0+a1)*(b0+b1) */\
+ \
+ "vpshufd $78, %%ymm9, %%ymm14\n\t" \
+ "vpshufd $78, %%ymm2, %%ymm7\n\t" \
+ "vpxor %%ymm9, %%ymm14, %%ymm14\n\t" /* ymm14 holds 5|6:a0+a1 */ \
+ "vpxor %%ymm2, %%ymm7, %%ymm7\n\t" /* ymm7 holds 5|6:b0+b1 */ \
+ "vpclmulqdq $0, %%ymm2, %%ymm9, %%ymm6\n\t" /* ymm6 holds 5|6:a0*b0 */ \
+ "vpclmulqdq $17, %%ymm9, %%ymm2, %%ymm2\n\t" /* ymm2 holds 5|6:a1*b1 */ \
+ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 5|6:(a0+a1)*(b0+b1) */ \
+ \
+ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 5+7+…+15|6+8+…+16:a0*b0 */ \
+ "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 5+7+…+15|6+8+…+16:a1*b1 */ \
+ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 5+7+…+15|6+8+…+16:(a0+a1)*(b0+b1) */\
+ \
+ "vmovdqu 12*16(%[buf]), %%ymm5\n\t" \
+ "vmovdqu 14*16(%[buf]), %%ymm2\n\t" \
+ be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */ \
+ be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */ \
+ \
+ "vpshufd $78, %%ymm8, %%ymm14\n\t" \
+ "vpshufd $78, %%ymm5, %%ymm7\n\t" \
+ "vpxor %%ymm8, %%ymm14, %%ymm14\n\t" /* ymm14 holds 3|4:a0+a1 */ \
+ "vpxor %%ymm5, %%ymm7, %%ymm7\n\t" /* ymm7 holds 3|4:b0+b1 */ \
+ "vpclmulqdq $0, %%ymm5, %%ymm8, %%ymm6\n\t" /* ymm6 holds 3|4:a0*b0 */ \
+ "vpclmulqdq $17, %%ymm8, %%ymm5, %%ymm5\n\t" /* ymm5 holds 3|4:a1*b1 */ \
+ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 3|4:(a0+a1)*(b0+b1) */ \
+ \
+ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 3+5+…+15|4+6+…+16:a0*b0 */ \
+ "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 3+5+…+15|4+6+…+16:a1*b1 */ \
+ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 3+5+…+15|4+6+…+16:(a0+a1)*(b0+b1) */\
+ \
+ "vpshufd $78, %%ymm0, %%ymm14\n\t" \
+ "vpshufd $78, %%ymm2, %%ymm7\n\t" \
+ "vpxor %%ymm0, %%ymm14, %%ymm14\n\t" /* ymm14 holds 1|2:a0+a1 */ \
+ "vpxor %%ymm2, %%ymm7, %%ymm7\n\t" /* ymm7 holds 1|2:b0+b1 */ \
+ "vpclmulqdq $0, %%ymm2, %%ymm0, %%ymm6\n\t" /* ymm6 holds 1|2:a0*b0 */ \
+ "vpclmulqdq $17, %%ymm0, %%ymm2, %%ymm2\n\t" /* ymm2 holds 1|2:a1*b1 */ \
+ "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 1|2:(a0+a1)*(b0+b1) */ \
+ \
+ "vmovdqu %[h15_h16], %%ymm0\n\t" \
+ \
+ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 1+3+…+15|2+4+…+16:a0*b0 */ \
+ "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 1+3+…+15|2+4+…+16:a1*b1 */ \
+ "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 1+3+…+15|2+4+…+16:(a0+a1)*(b0+b1) */\
+ \
+ /* aggregated reduction... */ \
+ "vpxor %%ymm1, %%ymm3, %%ymm5\n\t" /* ymm5 holds a0*b0+a1*b1 */ \
+ "vpxor %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ \
+ "vpslldq $8, %%ymm4, %%ymm5\n\t" \
+ "vpsrldq $8, %%ymm4, %%ymm4\n\t" \
+ "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" \
+ "vpxor %%ymm4, %%ymm1, %%ymm1\n\t" /* <ymm1:ymm3> holds the result of the \
+ carry-less multiplication of ymm0 \
+ by ymm1 */ \
+ \
+ /* first phase of the reduction */ \
+ "vpsllq $1, %%ymm3, %%ymm6\n\t" /* packed right shifting << 63 (done as << 1 here plus << 62 below) */ \
+ "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" \
+ "vpsllq $57, %%ymm3, %%ymm5\n\t" /* packed right shifting << 57 */ \
+ "vpsllq $62, %%ymm6, %%ymm6\n\t" /* packed right shifting << 62 */ \
+ "vpxor %%ymm5, %%ymm6, %%ymm6\n\t" /* xor the shifted versions */ \
+ "vpshufd $0x6a, %%ymm6, %%ymm5\n\t" \
+ "vpshufd $0xae, %%ymm6, %%ymm6\n\t" \
+ "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" /* first phase of the reduction complete */ \
+ \
+ /* second phase of the reduction */ \
+ "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" /* xor the shifted versions */ \
+ "vpsrlq $1, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 1 */ \
+ "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" \
+ "vpsrlq $1, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 2 */ \
+ "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" \
+ "vpsrlq $5, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 7 */ \
+ "vpxor %%ymm3, %%ymm6, %%ymm6\n\t" \
+ "vpxor %%ymm6, %%ymm1, %%ymm1\n\t" /* the result is in ymm1 */ \
+ \
+ /* merge 128-bit halves */ \
+ "vextracti128 $1, %%ymm1, %%xmm2\n\t" \
+ "vpxor %%xmm2, %%xmm1, %%xmm1\n\t"
+
+static ASM_FUNC_ATTR_INLINE void
+gfmul_vpclmul_avx2_aggr16(const void *buf, const void *h_table,
+ const u64 *h1_h2_h15_h16)
+{
+ /* Hash 16 consecutive 16-byte blocks at BUF into the running hash. Input:
+ Hx: YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 (plus the pair read from h1_h2_h15_h16[0])
+ bemask: YMM15
+ Hash: XMM1 (all of YMM1 is xored with the first two blocks, so its upper half is expected to be zero)
+ Output:
+ Hash: XMM1
+ Inputs YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 and YMM15 stay unmodified; YMM0 is
+ overwritten and reloaded from h1_h2_h15_h16[4] at the end. YMM2..YMM7 and YMM14 are clobbered.
+ */
+ asm volatile (GFMUL_AGGR16_ASM_VPCMUL_AVX2(be_to_le)
+ :
+ : [buf] "r" (buf),
+ [h_table] "r" (h_table), /* operand is not referenced by the asm template */
+ [h1_h2] "m" (h1_h2_h15_h16[0]),
+ [h15_h16] "m" (h1_h2_h15_h16[4])
+ : "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+gfmul_vpclmul_avx2_aggr16_le(const void *buf, const void *h_table,
+ const u64 *h1_h2_h15_h16)
+{
+ /* Same 16-block aggregated multiply as gfmul_vpclmul_avx2_aggr16, expanded with le_to_le instead of be_to_le. Input:
+ Hx: YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 (plus the pair read from h1_h2_h15_h16[0])
+ bemask: YMM15 (NOTE(review): presumably unused here if le_to_le drops the vpshufb steps; confirm against its definition)
+ Hash: XMM1 (all of YMM1 is xored with the first two blocks, so its upper half is expected to be zero)
+ Output:
+ Hash: XMM1
+ Inputs YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 and YMM15 stay unmodified; YMM0 is
+ overwritten and reloaded from h1_h2_h15_h16[4] at the end. YMM2..YMM7 and YMM14 are clobbered.
+ */
+ asm volatile (GFMUL_AGGR16_ASM_VPCMUL_AVX2(le_to_le)
+ :
+ : [buf] "r" (buf),
+ [h_table] "r" (h_table), /* operand is not referenced by the asm template */
+ [h1_h2] "m" (h1_h2_h15_h16[0]),
+ [h15_h16] "m" (h1_h2_h15_h16[4])
+ : "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE
+void gfmul_pclmul_avx2(void)
+{
+ /* Input: YMM0 and YMM1, Output: YMM1. Input YMM0 stays unmodified.
+ Input must be converted to little-endian. Each 128-bit lane is multiplied and reduced independently; YMM2..YMM6 are clobbered.
+ */
+ asm volatile (/* gfmul, ymm0 has operator a and ymm1 has operator b. */
+ "vpshufd $78, %%ymm0, %%ymm2\n\t"
+ "vpshufd $78, %%ymm1, %%ymm4\n\t"
+ "vpxor %%ymm0, %%ymm2, %%ymm2\n\t" /* ymm2 holds a0+a1 */
+ "vpxor %%ymm1, %%ymm4, %%ymm4\n\t" /* ymm4 holds b0+b1 */
+
+ "vpclmulqdq $0, %%ymm1, %%ymm0, %%ymm3\n\t" /* ymm3 holds a0*b0 */
+ "vpclmulqdq $17, %%ymm0, %%ymm1, %%ymm1\n\t" /* ymm1 holds a1*b1 */
+ "vpclmulqdq $0, %%ymm2, %%ymm4, %%ymm4\n\t" /* ymm4 holds (a0+a1)*(b0+b1) */
+
+ "vpxor %%ymm1, %%ymm3, %%ymm5\n\t" /* ymm5 holds a0*b0+a1*b1 */
+ "vpxor %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+ "vpslldq $8, %%ymm4, %%ymm5\n\t"
+ "vpsrldq $8, %%ymm4, %%ymm4\n\t"
+ "vpxor %%ymm5, %%ymm3, %%ymm3\n\t"
+ "vpxor %%ymm4, %%ymm1, %%ymm1\n\t" /* <ymm1:ymm3> holds the result of the
+ carry-less multiplication of ymm0
+ by ymm1 */
+
+ /* first phase of the reduction */
+ "vpsllq $1, %%ymm3, %%ymm6\n\t" /* packed right shifting << 63 (done as << 1 here plus << 62 below) */
+ "vpxor %%ymm3, %%ymm6, %%ymm6\n\t"
+ "vpsllq $57, %%ymm3, %%ymm5\n\t" /* packed right shifting << 57 */
+ "vpsllq $62, %%ymm6, %%ymm6\n\t" /* packed right shifting << 62 */
+ "vpxor %%ymm5, %%ymm6, %%ymm6\n\t" /* xor the shifted versions */
+ "vpshufd $0x6a, %%ymm6, %%ymm5\n\t"
+ "vpshufd $0xae, %%ymm6, %%ymm6\n\t"
+ "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" /* first phase of the reduction complete */
+
+ /* second phase of the reduction */
+ "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" /* xor the shifted versions */
+ "vpsrlq $1, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 1 */
+ "vpxor %%ymm3, %%ymm6, %%ymm6\n\t"
+ "vpsrlq $1, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 2 */
+ "vpxor %%ymm3, %%ymm1, %%ymm1\n\t"
+ "vpsrlq $5, %%ymm3, %%ymm3\n\t" /* packed left shifting >> 7 */
+ "vpxor %%ymm3, %%ymm6, %%ymm6\n\t"
+ "vpxor %%ymm6, %%ymm1, %%ymm1\n\t" /* the result is in ymm1 */
+ ::: "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+gcm_lsh_avx2(void *h, unsigned int hoffs) /* in-place "<<< 1" of the two 128-bit values stored at (byte *)h + hoffs */
+{
+ static const u64 pconst[4] __attribute__ ((aligned (32))) =
+ {
+ U64_C(0x0000000000000001), U64_C(0xc200000000000000),
+ U64_C(0x0000000000000001), U64_C(0xc200000000000000)
+ };
+
+ asm volatile ("vmovdqu %[h], %%ymm2\n\t" /* ymm2 = the two 128-bit values */
+ "vpshufd $0xff, %%ymm2, %%ymm3\n\t" /* replicate the top dword of each lane */
+ "vpsrad $31, %%ymm3, %%ymm3\n\t" /* all-ones where the lane's top bit is set, else zero */
+ "vpslldq $8, %%ymm2, %%ymm4\n\t" /* low qword of each lane moved into the high qword */
+ "vpand %[pconst], %%ymm3, %%ymm3\n\t" /* keep pconst only in lanes whose top bit was set */
+ "vpaddq %%ymm2, %%ymm2, %%ymm2\n\t" /* every qword << 1 */
+ "vpsrlq $63, %%ymm4, %%ymm4\n\t" /* bit shifted out of the low qword */
+ "vpxor %%ymm3, %%ymm2, %%ymm2\n\t" /* fold in the masked constant */
+ "vpxor %%ymm4, %%ymm2, %%ymm2\n\t" /* carry the low qword's top bit into the high qword */
+ "vmovdqu %%ymm2, %[h]\n\t"
+ : [h] "+m" (*((byte *)h + hoffs))
+ : [pconst] "m" (*pconst)
+ : "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+load_h1h2_to_ymm1(gcry_cipher_hd_t c) /* YMM1 = ghash key (low 128 bits) | first gcm_table entry (high 128 bits) */
+{
+ unsigned int key_pos =
+ offsetof(struct gcry_cipher_handle, u_mode.gcm.u_ghash_key.key);
+ unsigned int table_pos =
+ offsetof(struct gcry_cipher_handle, u_mode.gcm.gcm_table);
+
+ if (key_pos + 16 == table_pos)
+ {
+ /* Optimization: Table follows immediately after key, so a single 32-byte load fetches both. */
+ asm volatile ("vmovdqu %[key], %%ymm1\n\t"
+ :
+ : [key] "m" (*c->u_mode.gcm.u_ghash_key.key)
+ : "memory");
+ }
+ else
+ {
+ asm volatile ("vmovdqa %[key], %%xmm1\n\t" /* low half: the key */
+ "vinserti128 $1, 0*16(%[h_table]), %%ymm1, %%ymm1\n\t" /* high half: first table entry */
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table),
+ [key] "m" (*c->u_mode.gcm.u_ghash_key.key)
+ : "memory");
+ }
+}
+
+static ASM_FUNC_ATTR void
+ghash_setup_aggr8_avx2(gcry_cipher_hd_t c) /* extend gcm_table with H⁵..H⁸ and mark the aggr8 table as initialized */
+{
+ c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED;
+
+ asm volatile (/* load H⁴ */
+ "vbroadcasti128 3*16(%[h_table]), %%ymm0\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+ /* load H <<< 1, H² <<< 1 */
+ load_h1h2_to_ymm1 (c);
+
+ gfmul_pclmul_avx2 (); /* H<<<1•H⁴ => H⁵, H²<<<1•H⁴ => H⁶ */
+
+ asm volatile ("vmovdqu %%ymm1, 3*16(%[h_table])\n\t"
+ /* load H³ <<< 1, H⁴ <<< 1 */
+ "vmovdqu 1*16(%[h_table]), %%ymm1\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul_avx2 (); /* H³<<<1•H⁴ => H⁷, H⁴<<<1•H⁴ => H⁸ */
+
+ asm volatile ("vmovdqu %%ymm1, 6*16(%[h_table])\n\t" /* store H⁸ for aggr16 setup */
+ "vmovdqu %%ymm1, 5*16(%[h_table])\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1, H⁶ <<< 1 */
+ gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1, H⁸ <<< 1 */
+}
+
+static ASM_FUNC_ATTR void
+ghash_setup_aggr16_avx2(gcry_cipher_hd_t c) /* extend gcm_table with H⁹..H¹⁶ and mark the aggr16 table as initialized */
+{
+ c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR16_TABLE_INITIALIZED;
+
+ asm volatile (/* load H⁸ */
+ "vbroadcasti128 7*16(%[h_table]), %%ymm0\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+ /* load H <<< 1, H² <<< 1 */
+ load_h1h2_to_ymm1 (c);
+
+ gfmul_pclmul_avx2 (); /* H<<<1•H⁸ => H⁹, H²<<<1•H⁸ => H¹⁰ */
+
+ asm volatile ("vmovdqu %%ymm1, 7*16(%[h_table])\n\t"
+ /* load H³ <<< 1, H⁴ <<< 1 */
+ "vmovdqu 1*16(%[h_table]), %%ymm1\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul_avx2 (); /* H³<<<1•H⁸ => H¹¹, H⁴<<<1•H⁸ => H¹² */
+
+ asm volatile ("vmovdqu %%ymm1, 9*16(%[h_table])\n\t"
+ /* load H⁵ <<< 1, H⁶ <<< 1 */
+ "vmovdqu 3*16(%[h_table]), %%ymm1\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul_avx2 (); /* H⁵<<<1•H⁸ => H¹³, H⁶<<<1•H⁸ => H¹⁴ */
+
+ asm volatile ("vmovdqu %%ymm1, 11*16(%[h_table])\n\t"
+ /* load H⁷ <<< 1, H⁸ <<< 1 */
+ "vmovdqu 5*16(%[h_table]), %%ymm1\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul_avx2 (); /* H⁷<<<1•H⁸ => H¹⁵, H⁸<<<1•H⁸ => H¹⁶ */
+
+ asm volatile ("vmovdqu %%ymm1, 13*16(%[h_table])\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 7 * 16); /* H⁹ <<< 1, H¹⁰ <<< 1 */
+ gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 9 * 16); /* H¹¹ <<< 1, H¹² <<< 1 */
+ gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 11 * 16); /* H¹³ <<< 1, H¹⁴ <<< 1 */
+ gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 13 * 16); /* H¹⁵ <<< 1, H¹⁶ <<< 1 */
+}
+
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+#endif /* __x86_64__ */
+
+static unsigned int ASM_FUNC_ATTR
+_gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
+ size_t nblocks);
+
+static unsigned int ASM_FUNC_ATTR
+_gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
+ size_t nblocks);
+
+static ASM_FUNC_ATTR_INLINE void
+gcm_lsh(void *h, unsigned int hoffs)
{
static const u64 pconst[2] __attribute__ ((aligned (16))) =
{ U64_C(0x0000000000000001), U64_C(0xc200000000000000) };

- asm volatile ("movdqu (%[h]), %%xmm2\n\t"
+ asm volatile ("movdqu %[h], %%xmm2\n\t"
"pshufd $0xff, %%xmm2, %%xmm3\n\t"
"movdqa %%xmm2, %%xmm4\n\t"
"psrad $31, %%xmm3\n\t"
@@ -449,15 +851,14 @@ static ASM_FUNC_ATTR_INLINE void gcm_lsh(void *h, unsigned int hoffs)
"psrlq $63, %%xmm4\n\t"
"pxor %%xmm3, %%xmm2\n\t"
"pxor %%xmm4, %%xmm2\n\t"
- "movdqu %%xmm2, (%[h])\n\t"
- :
- : [pconst] "m" (*pconst),
- [h] "r" ((byte *)h + hoffs)
+ "movdqu %%xmm2, %[h]\n\t"
+ : [h] "+m" (*((byte *)h + hoffs))
+ : [pconst] "m" (*pconst)
: "memory" );
}

void ASM_FUNC_ATTR
-_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
+_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, unsigned int hw_features)
{
static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
@@ -480,6 +881,12 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
: "memory" );
#endif

+ (void)hw_features;
+
+ c->u_mode.gcm.hw_impl_flags = 0;
+ c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul;
+ c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul;
+
/* Swap endianness of hsub. */
asm volatile ("movdqu (%[key]), %%xmm0\n\t"
"pshufb %[be_mask], %%xmm0\n\t"
@@ -489,7 +896,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
[be_mask] "m" (*be_mask)
: "memory");

- gcm_lsh(c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */
+ gcm_lsh (c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */

asm volatile ("movdqa %%xmm0, %%xmm1\n\t"
"movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */
@@ -500,80 +907,81 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
gfmul_pclmul (); /* H<<<1•H => H² */

asm volatile ("movdqu %%xmm1, 0*16(%[h_table])\n\t"
- "movdqa %%xmm1, %%xmm7\n\t"
:
: [h_table] "r" (c->u_mode.gcm.gcm_table)
: "memory");

- gcm_lsh(c->u_mode.gcm.gcm_table, 0 * 16); /* H² <<< 1 */
- gfmul_pclmul (); /* H<<<1•H² => H³ */
+ gcm_lsh (c->u_mode.gcm.gcm_table, 0 * 16); /* H² <<< 1 */

- asm volatile ("movdqa %%xmm7, %%xmm0\n\t"
- "movdqu %%xmm1, 1*16(%[h_table])\n\t"
- "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
- :
- : [h_table] "r" (c->u_mode.gcm.gcm_table)
- : "memory");
+ if (0)
+ { }
+#ifdef GCM_USE_INTEL_VPCLMUL_AVX2
+ else if ((hw_features & HWF_INTEL_VAES_VPCLMUL)
+ && (hw_features & HWF_INTEL_AVX2))
+ {
+ c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_USE_VPCLMUL_AVX2;

- gfmul_pclmul (); /* H²<<<1•H² => H? */
+ asm volatile (/* H² */
+ "vinserti128 $1, %%xmm1, %%ymm1, %%ymm1\n\t"
+ /* load H <<< 1, H² <<< 1 */
+ "vinserti128 $1, 0*16(%[h_table]), %%ymm0, %%ymm0\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");

- asm volatile ("movdqu %%xmm1, 2*16(%[h_table])\n\t"
- "movdqa %%xmm1, %%xmm0\n\t"
- "movdqu (%[key]), %%xmm1\n\t" /* load H <<< 1 */
- :
- : [h_table] "r" (c->u_mode.gcm.gcm_table),
- [key] "r" (c->u_mode.gcm.u_ghash_key.key)
- : "memory");
+ gfmul_pclmul_avx2 (); /* H<<<1•H² => H³, H²<<<1•H² => H? */

- gcm_lsh(c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1 */
- gcm_lsh(c->u_mode.gcm.gcm_table, 2 * 16); /* H? <<< 1 */
+ asm volatile ("vmovdqu %%ymm1, 2*16(%[h_table])\n\t" /* store H? for aggr8 setup */
+ "vmovdqu %%ymm1, 1*16(%[h_table])\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");

-#ifdef __x86_64__
- gfmul_pclmul (); /* H<<<1•H? => H? */
+ gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1, H? <<< 1 */

- asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t"
- "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
- :
- : [h_table] "r" (c->u_mode.gcm.gcm_table)
- : "memory");
-
- gfmul_pclmul (); /* H²<<<1•H? => H? */
-
- asm volatile ("movdqu %%xmm1, 4*16(%[h_table])\n\t"
- "movdqu 1*16(%[h_table]), %%xmm1\n\t" /* load H³ <<< 1 */
- :
- : [h_table] "r" (c->u_mode.gcm.gcm_table)
- : "memory");
+ asm volatile ("vzeroupper\n\t"
+ ::: "memory" );
+ }
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+ else
+ {
+ asm volatile ("movdqa %%xmm1, %%xmm7\n\t"
+ ::: "memory");

- gfmul_pclmul (); /* H³<<<1•H? => H? */
+ gfmul_pclmul (); /* H<<<1•H² => H³ */

- asm volatile ("movdqu %%xmm1, 5*16(%[h_table])\n\t"
- "movdqu 2*16(%[h_table]), %%xmm1\n\t" /* load H? <<< 1 */
- :
- : [h_table] "r" (c->u_mode.gcm.gcm_table)
- : "memory");
+ asm volatile ("movdqa %%xmm7, %%xmm0\n\t"
+ "movdqu %%xmm1, 1*16(%[h_table])\n\t"
+ "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");

- gfmul_pclmul (); /* H³<<<1•H? => H? */
+ gfmul_pclmul (); /* H²<<<1•H² => H? */

- asm volatile ("movdqu %%xmm1, 6*16(%[h_table])\n\t"
- :
- : [h_table] "r" (c->u_mode.gcm.gcm_table)
- : "memory");
+ asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t" /* store H? for aggr8 setup */
+ "movdqu %%xmm1, 2*16(%[h_table])\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");

- gcm_lsh(c->u_mode.gcm.gcm_table, 3 * 16); /* H? <<< 1 */
- gcm_lsh(c->u_mode.gcm.gcm_table, 4 * 16); /* H? <<< 1 */
- gcm_lsh(c->u_mode.gcm.gcm_table, 5 * 16); /* H? <<< 1 */
- gcm_lsh(c->u_mode.gcm.gcm_table, 6 * 16); /* H? <<< 1 */
+ gcm_lsh (c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1 */
+ gcm_lsh (c->u_mode.gcm.gcm_table, 2 * 16); /* H? <<< 1 */
+ }

-#ifdef __WIN64__
/* Clear/restore used registers. */
- asm volatile( "pxor %%xmm0, %%xmm0\n\t"
- "pxor %%xmm1, %%xmm1\n\t"
- "pxor %%xmm2, %%xmm2\n\t"
- "pxor %%xmm3, %%xmm3\n\t"
- "pxor %%xmm4, %%xmm4\n\t"
- "pxor %%xmm5, %%xmm5\n\t"
- "movdqu 0*16(%0), %%xmm6\n\t"
+ asm volatile ("pxor %%xmm0, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm6\n\t"
+ "pxor %%xmm7, %%xmm7\n\t"
+ ::: "memory" );
+#ifdef __x86_64__
+#ifdef __WIN64__
+ asm volatile ("movdqu 0*16(%0), %%xmm6\n\t"
"movdqu 1*16(%0), %%xmm7\n\t"
"movdqu 2*16(%0), %%xmm8\n\t"
"movdqu 3*16(%0), %%xmm9\n\t"
@@ -587,16 +995,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
: "r" (win64tmp)
: "memory" );
#else
- /* Clear used registers. */
- asm volatile( "pxor %%xmm0, %%xmm0\n\t"
- "pxor %%xmm1, %%xmm1\n\t"
- "pxor %%xmm2, %%xmm2\n\t"
- "pxor %%xmm3, %%xmm3\n\t"
- "pxor %%xmm4, %%xmm4\n\t"
- "pxor %%xmm5, %%xmm5\n\t"
- "pxor %%xmm6, %%xmm6\n\t"
- "pxor %%xmm7, %%xmm7\n\t"
- "pxor %%xmm8, %%xmm8\n\t"
+ asm volatile ("pxor %%xmm8, %%xmm8\n\t"
"pxor %%xmm9, %%xmm9\n\t"
"pxor %%xmm10, %%xmm10\n\t"
"pxor %%xmm11, %%xmm11\n\t"
@@ -605,14 +1004,67 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
"pxor %%xmm14, %%xmm14\n\t"
"pxor %%xmm15, %%xmm15\n\t"
::: "memory" );
-#endif
-#endif
+#endif /* __WIN64__ */
+#endif /* __x86_64__ */
}

+#ifdef __x86_64__
+static ASM_FUNC_ATTR void
+ghash_setup_aggr8(gcry_cipher_hd_t c) /* non-AVX2 variant: extend gcm_table with H⁵..H⁸ and mark the aggr8 table as initialized */
+{
+ c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED;
+
+ asm volatile ("movdqa 3*16(%[h_table]), %%xmm0\n\t" /* load H⁴ */
+ "movdqu %[key], %%xmm1\n\t" /* load H <<< 1 */
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table),
+ [key] "m" (*c->u_mode.gcm.u_ghash_key.key)
+ : "memory");
+
+ gfmul_pclmul (); /* H<<<1•H⁴ => H⁵ */
+
+ asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t"
+ "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul (); /* H²<<<1•H⁴ => H⁶ */
+
+ asm volatile ("movdqu %%xmm1, 4*16(%[h_table])\n\t"
+ "movdqu 1*16(%[h_table]), %%xmm1\n\t" /* load H³ <<< 1 */
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul (); /* H³<<<1•H⁴ => H⁷ */
+
+ asm volatile ("movdqu %%xmm1, 5*16(%[h_table])\n\t"
+ "movdqu 2*16(%[h_table]), %%xmm1\n\t" /* load H⁴ <<< 1 */
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul (); /* H⁴<<<1•H⁴ => H⁸ */
+
+ asm volatile ("movdqu %%xmm1, 6*16(%[h_table])\n\t"
+ "movdqu %%xmm1, 7*16(%[h_table])\n\t" /* store H⁸ for aggr16 setup */
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gcm_lsh (c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1 */
+ gcm_lsh (c->u_mode.gcm.gcm_table, 4 * 16); /* H⁶ <<< 1 */
+ gcm_lsh (c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1 */
+ gcm_lsh (c->u_mode.gcm.gcm_table, 6 * 16); /* H⁸ <<< 1 */
+}
+#endif /* __x86_64__ */
+
+
unsigned int ASM_FUNC_ATTR
_gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
- size_t nblocks)
+ size_t nblocks)
{
static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
@@ -650,12 +1102,93 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
[be_mask] "m" (*be_mask)
: "memory" );

+#if defined(GCM_USE_INTEL_VPCLMUL_AVX2)
+ if (nblocks >= 16
+ && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2))
+ {
+ u64 h1_h2_h15_h16[4*2];
+
+ asm volatile ("vinserti128 $1, %%xmm7, %%ymm7, %%ymm15\n\t"
+ "vmovdqa %%xmm1, %%xmm8\n\t"
+ ::: "memory" );
+
+ if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+ {
+ ghash_setup_aggr8_avx2 (c);
+ }
+ if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED))
+ {
+ ghash_setup_aggr16_avx2 (c);
+ }
+
+ /* Preload H1, H2, H3, H4, H5, H6, H7, H8, H9, H10, H11, H12. */
+ asm volatile ("vmovdqa %%xmm8, %%xmm1\n\t"
+ "vmovdqu 0*16(%[h_table]), %%xmm7\n\t"
+ "vpxor %%xmm8, %%xmm8, %%xmm8\n\t"
+ "vperm2i128 $0x23, 13*16(%[h_table]), %%ymm8, %%ymm0\n\t" /* H15|H16 */
+ "vperm2i128 $0x23, 11*16(%[h_table]), %%ymm8, %%ymm13\n\t" /* H13|H14 */
+ "vperm2i128 $0x23, 9*16(%[h_table]), %%ymm8, %%ymm12\n\t" /* H11|H12 */
+ "vperm2i128 $0x23, 7*16(%[h_table]), %%ymm8, %%ymm11\n\t" /* H9|H10 */
+ "vperm2i128 $0x23, 5*16(%[h_table]), %%ymm8, %%ymm10\n\t" /* H7|H8 */
+ "vperm2i128 $0x23, 3*16(%[h_table]), %%ymm8, %%ymm9\n\t" /* H5|H6 */
+ "vperm2i128 $0x23, 1*16(%[h_table]), %%ymm8, %%ymm8\n\t" /* H3|H4 */
+ "vinserti128 $1, %[h_1], %%ymm7, %%ymm7\n\t" /* H1|H2 */
+ "vmovdqu %%ymm0, %[h15_h16]\n\t"
+ "vmovdqu %%ymm7, %[h1_h2]\n\t"
+ : [h1_h2] "=m" (h1_h2_h15_h16[0]),
+ [h15_h16] "=m" (h1_h2_h15_h16[4])
+ : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key),
+ [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory" );
+
+ while (nblocks >= 16)
+ {
+ gfmul_vpclmul_avx2_aggr16 (buf, c->u_mode.gcm.gcm_table,
+ h1_h2_h15_h16);
+
+ buf += 16 * blocksize;
+ nblocks -= 16;
+ }
+
+ /* Clear used x86-64/XMM registers. */
+ asm volatile("vmovdqu %%ymm15, %[h15_h16]\n\t"
+ "vmovdqu %%ymm15, %[h1_h2]\n\t"
+ "vzeroupper\n\t"
+#ifndef __WIN64__
+ "pxor %%xmm8, %%xmm8\n\t"
+ "pxor %%xmm9, %%xmm9\n\t"
+ "pxor %%xmm10, %%xmm10\n\t"
+ "pxor %%xmm11, %%xmm11\n\t"
+ "pxor %%xmm12, %%xmm12\n\t"
+ "pxor %%xmm13, %%xmm13\n\t"
+ "pxor %%xmm14, %%xmm14\n\t"
+ "pxor %%xmm15, %%xmm15\n\t"
+#endif
+ "movdqa %[be_mask], %%xmm7\n\t"
+ : [h1_h2] "=m" (h1_h2_h15_h16[0]),
+ [h15_h16] "=m" (h1_h2_h15_h16[4])
+ : [be_mask] "m" (*be_mask)
+ : "memory" );
+ }
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+
#ifdef __x86_64__
if (nblocks >= 8)
{
- /* Preload H1. */
asm volatile ("movdqa %%xmm7, %%xmm15\n\t"
- "movdqa %[h_1], %%xmm0\n\t"
+ ::: "memory" );
+
+ if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+ {
+ asm volatile ("movdqa %%xmm1, %%xmm8\n\t"
+ ::: "memory" );
+ ghash_setup_aggr8 (c);
+ asm volatile ("movdqa %%xmm8, %%xmm1\n\t"
+ ::: "memory" );
+ }
+
+ /* Preload H1. */
+ asm volatile ("movdqa %[h_1], %%xmm0\n\t"
:
: [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key)
: "memory" );
@@ -667,6 +1200,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
buf += 8 * blocksize;
nblocks -= 8;
}
+
#ifndef __WIN64__
/* Clear used x86-64/XMM registers. */
asm volatile( "pxor %%xmm8, %%xmm8\n\t"
@@ -680,7 +1214,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
::: "memory" );
#endif
}
-#endif
+#endif /* __x86_64__ */

while (nblocks >= 4)
{
@@ -761,7 +1295,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,

unsigned int ASM_FUNC_ATTR
_gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
- size_t nblocks)
+ size_t nblocks)
{
static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
@@ -799,9 +1333,86 @@ _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
[be_mask] "m" (*be_mask)
: "memory" );

+#if defined(GCM_USE_INTEL_VPCLMUL_AVX2)
+ if (nblocks >= 16
+ && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2))
+ {
+ u64 h1_h2_h15_h16[4*2];
+
+ asm volatile ("vmovdqa %%xmm1, %%xmm8\n\t"
+ ::: "memory" );
+
+ if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+ {
+ ghash_setup_aggr8_avx2 (c);
+ }
+ if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED))
+ {
+ ghash_setup_aggr16_avx2 (c);
+ }
+
+ /* Preload H1, H2, H3, H4, H5, H6, H7, H8, H9, H10, H11, H12. */
+ asm volatile ("vmovdqa %%xmm8, %%xmm1\n\t"
+ "vpxor %%xmm8, %%xmm8, %%xmm8\n\t"
+ "vmovdqu 0*16(%[h_table]), %%xmm7\n\t"
+ "vperm2i128 $0x23, 13*16(%[h_table]), %%ymm8, %%ymm0\n\t" /* H15|H16 */
+ "vperm2i128 $0x23, 11*16(%[h_table]), %%ymm8, %%ymm13\n\t" /* H13|H14 */
+ "vperm2i128 $0x23, 9*16(%[h_table]), %%ymm8, %%ymm12\n\t" /* H11|H12 */
+ "vperm2i128 $0x23, 7*16(%[h_table]), %%ymm8, %%ymm11\n\t" /* H9|H10 */
+ "vperm2i128 $0x23, 5*16(%[h_table]), %%ymm8, %%ymm10\n\t" /* H7|H8 */
+ "vperm2i128 $0x23, 3*16(%[h_table]), %%ymm8, %%ymm9\n\t" /* H5|H6 */
+ "vperm2i128 $0x23, 1*16(%[h_table]), %%ymm8, %%ymm8\n\t" /* H3|H4 */
+ "vinserti128 $1, %[h_1], %%ymm7, %%ymm7\n\t" /* H1|H2 */
+ "vmovdqu %%ymm0, %[h15_h16]\n\t"
+ "vmovdqu %%ymm7, %[h1_h2]\n\t"
+ : [h1_h2] "=m" (h1_h2_h15_h16[0]),
+ [h15_h16] "=m" (h1_h2_h15_h16[4])
+ : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key),
+ [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory" );
+
+ while (nblocks >= 16)
+ {
+ gfmul_vpclmul_avx2_aggr16_le (buf, c->u_mode.gcm.gcm_table,
+ h1_h2_h15_h16);
+
+ buf += 16 * blocksize;
+ nblocks -= 16;
+ }
+
+ /* Clear used x86-64/XMM registers. */
+ asm volatile("vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
+ "vmovdqu %%ymm7, %[h15_h16]\n\t"
+ "vmovdqu %%ymm7, %[h1_h2]\n\t"
+ "vzeroupper\n\t"
+#ifndef __WIN64__
+ "pxor %%xmm8, %%xmm8\n\t"
+ "pxor %%xmm9, %%xmm9\n\t"
+ "pxor %%xmm10, %%xmm10\n\t"
+ "pxor %%xmm11, %%xmm11\n\t"
+ "pxor %%xmm12, %%xmm12\n\t"
+ "pxor %%xmm13, %%xmm13\n\t"
+ "pxor %%xmm14, %%xmm14\n\t"
+#endif
+ : [h1_h2] "=m" (h1_h2_h15_h16[0]),
+ [h15_h16] "=m" (h1_h2_h15_h16[4])
+ :
+ : "memory" );
+ }
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+
#ifdef __x86_64__
if (nblocks >= 8)
{
+ if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+ {
+ asm volatile ("movdqa %%xmm1, %%xmm8\n\t"
+ ::: "memory" );
+ ghash_setup_aggr8 (c);
+ asm volatile ("movdqa %%xmm8, %%xmm1\n\t"
+ ::: "memory" );
+ }
+
/* Preload H1. */
asm volatile ("pxor %%xmm15, %%xmm15\n\t"
"movdqa %[h_1], %%xmm0\n\t"
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index 69ff0de6..683f07b0 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -39,15 +39,8 @@

#ifdef GCM_USE_INTEL_PCLMUL
-extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c);
-
-extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result,
- const byte *buf, size_t nblocks);
-
-extern unsigned int _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c,
- byte *result,
- const byte *buf,
- size_t nblocks);
+extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c,
+ unsigned int hw_features);
#endif

#ifdef GCM_USE_ARM_PMULL
@@ -594,9 +587,7 @@ setupM (gcry_cipher_hd_t c)
#ifdef GCM_USE_INTEL_PCLMUL
else if (features & HWF_INTEL_PCLMUL)
{
- c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul;
- c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul;
- _gcry_ghash_setup_intel_pclmul (c);
+ _gcry_ghash_setup_intel_pclmul (c, features);
}
#endif
#ifdef GCM_USE_ARM_PMULL
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index c8a1097a..e31ac860 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -72,6 +72,14 @@
# endif
#endif /* GCM_USE_INTEL_PCLMUL */

+/* GCM_USE_INTEL_VPCLMUL_AVX2 indicates whether to compile GCM with Intel
+ VPCLMUL/AVX2 code. */
+#undef GCM_USE_INTEL_VPCLMUL_AVX2
+#if defined(__x86_64__) && defined(GCM_USE_INTEL_PCLMUL) && \
+ defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL)
+# define GCM_USE_INTEL_VPCLMUL_AVX2 1
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+
/* GCM_USE_ARM_PMULL indicates whether to compile GCM with ARMv8 PMULL code. */
#undef GCM_USE_ARM_PMULL
#if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(GCM_USE_TABLES)
@@ -355,6 +363,9 @@ struct gcry_cipher_handle

/* Key length used for GCM-SIV key generating key. */
unsigned int siv_keylen;
+
+ /* Flags for accelerated implementations. */
+ unsigned int hw_impl_flags;
} gcm;

/* Mode specific storage for OCB mode. */
--
2.32.0

_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@lists.gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel