[PATCH] sha3: Add x86-64 AVX512 accelerated implementation
* LICENSES: Add 'cipher/keccak-amd64-avx512.S'.
* configure.ac: Add 'keccak-amd64-avx512.lo'.
* cipher/Makefile.am: Add 'keccak-amd64-avx512.S'.
* cipher/keccak-amd64-avx512.S: New.
* cipher/keccak.c (USE_64BIT_AVX512): New.
[USE_64BIT_AVX512] (_gcry_keccak_f1600_state_permute64_avx512)
(_gcry_keccak_absorb_blocks_avx512, keccak_f1600_state_permute64_avx512)
(keccak_absorb_lanes64_avx512, keccak_avx512_64_ops): New.
(keccak_init) [USE_64BIT_AVX512]: Enable x86-64 AVX512 implementation
if supported by HW features.
--
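
The new code is selected at run time when HWF_INTEL_AVX512 is detected, so
existing users of the gcry_md interface pick it up without any change. A
minimal caller sketch (buffer names and error handling are illustrative
only, not part of the patch; library initialization via gcry_check_version
is omitted):

  #include <gcrypt.h>
  #include <string.h>

  /* Hash a buffer with SHA3-256; on CPUs that report AVX512, the new
   * Keccak code is used automatically behind this interface. */
  static void
  hash_sha3_256 (const void *buf, size_t len, unsigned char digest[32])
  {
    gcry_md_hd_t hd;

    if (gcry_md_open (&hd, GCRY_MD_SHA3_256, 0))
      return;  /* error handling omitted in this sketch */

    gcry_md_write (hd, buf, len);
    memcpy (digest, gcry_md_read (hd, GCRY_MD_SHA3_256), 32);
    gcry_md_close (hd);
  }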

Benchmark on Intel Core i3-1115G4 (tigerlake):

Before (BMI2 instructions):
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
SHA3-224 | 1.77 ns/B 540.3 MiB/s 7.22 c/B 4088
SHA3-256 | 1.86 ns/B 514.0 MiB/s 7.59 c/B 4089
SHA3-384 | 2.43 ns/B 393.1 MiB/s 9.92 c/B 4089
SHA3-512 | 3.49 ns/B 273.2 MiB/s 14.27 c/B 4088
SHAKE128 | 1.52 ns/B 629.1 MiB/s 6.20 c/B 4089
SHAKE256 | 1.86 ns/B 511.6 MiB/s 7.62 c/B 4089

After (~33% faster):
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
SHA3-224 | 1.32 ns/B 721.8 MiB/s 5.40 c/B 4089
SHA3-256 | 1.40 ns/B 681.7 MiB/s 5.72 c/B 4089
SHA3-384 | 1.83 ns/B 522.5 MiB/s 7.46 c/B 4089
SHA3-512 | 2.63 ns/B 362.1 MiB/s 10.77 c/B 4088
SHAKE128 | 1.13 ns/B 840.4 MiB/s 4.64 c/B 4089
SHAKE256 | 1.40 ns/B 682.1 MiB/s 5.72 c/B 4089
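
The speedup over the BMI2 code comes mainly from AVX512 itself: the 25
state lanes stay in %xmm7-%xmm31 across rounds, rotations are done with
vprolq, and the three-input XORs of Theta as well as the Chi step each
collapse into a single vpternlogq. The eor3/rax1/xar/bcax helper macros in
keccak-amd64-avx512.S express exactly these ARMv8-CE primitives; roughly,
as a C-intrinsics sketch (illustrative only, the patch works directly in
assembly; needs AVX512F+AVX512VL):

  #include <immintrin.h>

  /* eor3: a ^ b ^ c in one vpternlogq (truth table 0x96). */
  static inline __m128i
  keccak_eor3 (__m128i a, __m128i b, __m128i c)
  {
    return _mm_ternarylogic_epi64 (a, b, c, 0x96);
  }

  /* rax1: a ^ rol64(b, 1), with the rotate done by vprolq. */
  static inline __m128i
  keccak_rax1 (__m128i a, __m128i b)
  {
    return _mm_xor_si128 (a, _mm_rol_epi64 (b, 1));
  }

  /* xar: rol64(a ^ b, n); n must be a compile-time constant. */
  #define keccak_xar(a, b, n) _mm_rol_epi64 (_mm_xor_si128 ((a), (b)), (n))

  /* bcax: a ^ (b & ~c), one vpternlogq (truth table 0xb4); Keccak's
   * Chi step A ^ (~B & C) is keccak_bcax (A, C, B). */
  static inline __m128i
  keccak_bcax (__m128i a, __m128i b, __m128i c)
  {
    return _mm_ternarylogic_epi64 (a, b, c, 0xb4);
  }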

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
LICENSES | 1 +
cipher/Makefile.am | 3 +-
cipher/keccak-amd64-avx512.S | 583 +++++++++++++++++++++++++++++++++++
cipher/keccak.c | 73 +++++
configure.ac | 2 +-
5 files changed, 660 insertions(+), 2 deletions(-)
create mode 100644 cipher/keccak-amd64-avx512.S

diff --git a/LICENSES b/LICENSES
index 67b80e64..c2fea82d 100644
--- a/LICENSES
+++ b/LICENSES
@@ -139,6 +139,7 @@ with any binary distributions derived from the GNU C Library.

For files:
- cipher/cipher-gcm-ppc.c
+ - cipher/keccak-amd64-avx512.S

#+begin_quote
Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 97823cb4..29690358 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -134,7 +134,8 @@ EXTRA_libcipher_la_SOURCES = \
sha512-armv7-neon.S sha512-arm.S \
sha512-ppc.c sha512-ssse3-i386.c \
sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
- keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
+ keccak.c keccak_permute_32.h keccak_permute_64.h \
+ keccak-armv7-neon.S keccak-amd64-avx512.S \
stribog.c \
tiger.c \
whirlpool.c whirlpool-sse2-amd64.S \
diff --git a/cipher/keccak-amd64-avx512.S b/cipher/keccak-amd64-avx512.S
new file mode 100644
index 00000000..f44e0285
--- /dev/null
+++ b/cipher/keccak-amd64-avx512.S
@@ -0,0 +1,583 @@
+/* keccak-amd64-avx512.S - x86-64 AVX512 implementation of Keccak
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * ---
+ *
+ * Core function `KeccakF1600_ce` based on ARMv8-CE KeccakF1600 implementation
+ * by Andy Polyakov from CRYPTOGAMS distribution `arm/keccak1600-armv8.pl`.
+ * `KeccakF1600_ce` was ported to x86-64 AVX512 and converted to use GCC
+ * preprocessed assembly and fitted with a new absorb function optimized for
+ * x86-64. SHA3-256 performance on Intel tigerlake, 5.72 cpB.
+ *
+ * Original copyright license follows:
+ *
+ * Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain copyright notices,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * * Neither the name of the CRYPTOGAMS nor the names of its
+ * copyright holder and contributors may be used to endorse or
+ * promote products derived from this software without specific
+ * prior written permission.
+ *
+ * ALTERNATIVELY, provided that this notice is retained in full, this
+ * product may be distributed under the terms of the GNU General Public
+ * License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+ * those given above.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* Register macros. */
+#define A_0_0 %xmm31
+#define A_0_1 %xmm30
+#define A_0_2 %xmm29
+#define A_0_3 %xmm28
+#define A_0_4 %xmm27
+#define A_1_0 %xmm26
+#define A_1_1 %xmm25
+#define A_1_2 %xmm24
+#define A_1_3 %xmm23
+#define A_1_4 %xmm22
+#define A_2_0 %xmm21
+#define A_2_1 %xmm20
+#define A_2_2 %xmm19
+#define A_2_3 %xmm18
+#define A_2_4 %xmm17
+#define A_3_0 %xmm16
+#define A_3_1 %xmm15
+#define A_3_2 %xmm14
+#define A_3_3 %xmm13
+#define A_3_4 %xmm12
+#define A_4_0 %xmm11
+#define A_4_1 %xmm10
+#define A_4_2 %xmm9
+#define A_4_3 %xmm8
+#define A_4_4 %xmm7
+
+#define C_0 %xmm6
+#define C_1 %xmm5
+#define C_2 %xmm4
+#define C_3 %xmm3
+#define C_4 %xmm2
+#define C_5 %xmm1
+#define C_6 %xmm0
+
+#define D_0 C_4
+#define D_1 C_5
+#define D_2 C_6
+#define D_3 C_2
+#define D_4 C_3
+
+/* Helper macros for ARMv8-CE to x86-64/AVX512 conversion. */
+#define eor3_d(dst_s1, s2, s3) \
+ vpternlogq $0x96, s3, s2, dst_s1;
+
+#define eor3(dst, s1, s2, s3) \
+ vmovdqa s1, dst; \
+ eor3_d(dst, s2, s3);
+
+#define rax1_c(dst, s1, s2_rol1) \
+ vprolq $1, s2_rol1, dst; \
+ vpxor s1, dst, dst;
+
+#define rax1_t(dst_s1, s2_rol1, tmp) \
+ vprolq $1, s2_rol1, tmp; \
+ vpxor tmp, dst_s1, dst_s1;
+
+#define rax1_s(dst_s1, s2_rol1) \
+ vprolq $1, s2_rol1, s2_rol1; \
+ vpxor s2_rol1, dst_s1, dst_s1;
+
+#define xar(dst, s1, s2, rol) \
+ vpxorq s2, s1, dst; \
+ vprolq $(rol), dst, dst;
+
+#define xar_x(dst, s1, s2, rol) \
+ vpxor s2, s1, dst; \
+ vprolq $(rol), dst, dst;
+
+#define bcax_d(dst_s1, s2, s3) \
+ vpternlogq $0xb4, s3, s2, dst_s1;
+
+#define bcax(dst, s1, s2, s3) \
+ vmovdqa64 s1, dst; \
+ bcax_d(dst, s2, s3);
+
+#define bcax_x(dst, s1, s2, s3) \
+ vmovdqa s1, dst; \
+ bcax_d(dst, s2, s3);
+
+#define eor(dst, s1, s2) \
+ vpxorq s2, s1, dst;
+
+/* Misc helper macros. */
+#define clear_avx512_4regs(a, b, c, d) \
+ eor(a, a, a); vmovdqa64 a, b; vmovdqa64 a, c; vmovdqa64 a, d;
+
+#define clear_regs() \
+ vzeroall; /* xmm0-xmm15 */ \
+ clear_avx512_4regs(%xmm16, %xmm17, %xmm18, %xmm19); \
+ clear_avx512_4regs(%xmm20, %xmm21, %xmm22, %xmm23); \
+ clear_avx512_4regs(%xmm24, %xmm25, %xmm26, %xmm27); \
+ clear_avx512_4regs(%xmm28, %xmm29, %xmm30, %xmm31);
+
+ELF(.type KeccakF1600_ce,@function)
+.align 64, 0xcc
+KeccakF1600_ce:
+.Loop_ce:
+ CFI_STARTPROC()
+
+ ////////////////////////////////////////////////// Theta
+ eor3( C_0, A_4_0, A_3_0, A_2_0)
+ eor3( C_1, A_4_1, A_3_1, A_2_1)
+ eor3( C_3, A_4_3, A_3_3, A_2_3)
+ eor3( C_2, A_4_2, A_3_2, A_2_2)
+ eor3( C_4, A_4_4, A_3_4, A_2_4)
+ eor3_d( C_0, A_1_0, A_0_0)
+ eor3_d( C_1, A_1_1, A_0_1)
+ eor3_d( C_3, A_1_3, A_0_3)
+ eor3_d( C_2, A_1_2, A_0_2)
+ eor3_d( C_4, A_1_4, A_0_4)
+
+ rax1_c( C_5, C_0, C_2) // D[1]
+ rax1_t( C_2, C_4, C_6) // D[3]
+ rax1_c( C_6, C_1, C_3) // D[2]
+ rax1_s( C_3, C_0) // D[4]
+ rax1_s( C_4, C_1) // D[0]
+
+ ////////////////////////////////////////////////// Theta+Rho+Pi
+ xar( C_0, A_0_1, D_1, 1) // C[0]=A[2][0]
+
+ xar( A_0_1, A_1_1, D_1, 44)
+ xar( A_1_1, A_1_4, D_4, 20)
+ xar( A_1_4, A_4_2, D_2, 61)
+ xar( A_4_2, A_2_4, D_4, 39)
+ xar( A_2_4, A_4_0, D_0, 18)
+
+ xar( C_1, A_0_2, D_2, 62) // C[1]=A[4][0]
+
+ xar( A_0_2, A_2_2, D_2, 43)
+ xar( A_2_2, A_2_3, D_3, 25)
+ xar( A_2_3, A_3_4, D_4, 8)
+ xar_x( A_3_4, A_4_3, D_3, 56)
+ xar( A_4_3, A_3_0, D_0, 41)
+
+ xar( A_3_0, A_0_4, D_4, 27)
+
+ xar_x( D_4, A_4_4, D_4, 14) // D[4]=A[0][4]
+ xar_x( A_4_4, A_4_1, D_1, 2)
+ xar( A_1_3, A_1_3, D_3, 55) // A[1][3]=A[4][1]
+ xar( A_0_4, A_3_1, D_1, 45) // A[0][4]=A[1][3]
+ xar( A_3_1, A_1_0, D_0, 36)
+
+ xar( A_1_0, A_0_3, D_3, 28)
+
+ eor( A_0_0, A_0_0, D_0)
+
+ xar_x( D_3, A_3_3, D_3, 21) // D[3]=A[0][3]
+ xar( A_0_3, A_3_2, D_2, 15) // A[0][3]=A[3][3]
+ xar( D_1, A_2_1, D_1, 10) // D[1]=A[3][2]
+ xar( D_2, A_1_2, D_2, 6) // D[2]=A[2][1]
+ xar( D_0, A_2_0, D_0, 3) // D[0]=A[1][2]
+
+ ////////////////////////////////////////////////// Chi+Iota
+ bcax_x( A_4_0, C_1, A_4_2, A_1_3) // A[1][3]=A[4][1]
+ bcax( A_4_1, A_1_3, A_4_3, A_4_2) // A[1][3]=A[4][1]
+ bcax_d( A_4_2, A_4_4, A_4_3)
+ bcax_d( A_4_3, C_1, A_4_4)
+ bcax_d( A_4_4, A_1_3, C_1) // A[1][3]=A[4][1]
+
+ bcax_x( A_3_2, D_1, A_3_4, A_0_3) // A[0][3]=A[3][3]
+ bcax( A_3_3, A_0_3, A_3_0, A_3_4) // A[0][3]=A[3][3]
+ bcax_d( A_3_4, A_3_1, A_3_0)
+ bcax_d( A_3_0, D_1, A_3_1)
+ bcax_d( A_3_1, A_0_3, D_1) // A[0][3]=A[3][3]
+
+ bcax( A_2_0, C_0, A_2_2, D_2)
+ bcax( A_2_1, D_2, A_2_3, A_2_2)
+ bcax_d( A_2_2, A_2_4, A_2_3)
+ bcax_d( A_2_3, C_0, A_2_4)
+ bcax_d( A_2_4, D_2, C_0)
+
+ bcax( A_1_2, D_0, A_1_4, A_0_4) // A[0][4]=A[1][3]
+ bcax( A_1_3, A_0_4, A_1_0, A_1_4) // A[0][4]=A[1][3]
+ bcax_d( A_1_4, A_1_1, A_1_0)
+ bcax_d( A_1_0, D_0, A_1_1)
+ bcax_d( A_1_1, A_0_4, D_0) // A[0][4]=A[1][3]
+
+ bcax( A_0_3, D_3, A_0_0, D_4)
+ bcax( A_0_4, D_4, A_0_1, A_0_0)
+ bcax_d( A_0_0, A_0_2, A_0_1)
+ bcax_d( A_0_1, D_3, A_0_2)
+ bcax_d( A_0_2, D_4, D_3)
+ eor( A_0_0, A_0_0, (%r10))
+
+ cmpq %r10, %r11
+ je .Lend_ce
+
+ addq $8, %r10
+ jmp .Loop_ce
+
+.align 64, 0xcc
+.Lend_ce:
+ ret_spec_stop
+ CFI_ENDPROC()
+ELF(.size KeccakF1600_ce,.-KeccakF1600_ce)
+
+.globl _gcry_keccak_f1600_state_permute64_avx512
+ELF(.type _gcry_keccak_f1600_state_permute64_avx512,@function)
+.align 64, 0xcc
+_gcry_keccak_f1600_state_permute64_avx512:
+ /* input:
+ * %rdi: state
+ * %rsi: round constants
+ */
+ CFI_STARTPROC()
+
+ leaq 12*8(%rdi), %rax
+ leaq (24-1)*8(%rsi), %r11
+
+ vmovdqu64 0*8(%rdi), A_0_0
+ vmovdqu64 1*8(%rdi), A_0_1
+ vmovdqu64 2*8(%rdi), A_0_2
+ vmovdqu64 3*8(%rdi), A_0_3
+ vmovdqu64 4*8(%rdi), A_0_4
+ vmovdqu64 5*8(%rdi), A_1_0
+ vmovdqu64 6*8(%rdi), A_1_1
+ vmovdqu64 7*8(%rdi), A_1_2
+ vmovdqu64 8*8(%rdi), A_1_3
+ vmovdqu64 9*8(%rdi), A_1_4
+ vmovdqu64 10*8(%rdi), A_2_0
+ vmovdqu64 11*8(%rdi), A_2_1
+ vmovdqu64 0*8(%rax), A_2_2
+ vmovdqu64 1*8(%rax), A_2_3
+ vmovdqu64 2*8(%rax), A_2_4
+ vmovdqu64 3*8(%rax), A_3_0
+ vmovdqu 4*8(%rax), A_3_1
+ vmovdqu 5*8(%rax), A_3_2
+ vmovdqu 6*8(%rax), A_3_3
+ vmovdqu 7*8(%rax), A_3_4
+ vmovdqu 8*8(%rax), A_4_0
+ vmovdqu 9*8(%rax), A_4_1
+ vmovdqu 10*8(%rax), A_4_2
+ vmovdqu 11*8(%rax), A_4_3
+ vmovq 12*8(%rax), A_4_4
+
+ movq %rsi, %r10
+ call KeccakF1600_ce
+
+ vpunpcklqdq A_0_1, A_0_0, A_0_0
+ vpunpcklqdq A_0_3, A_0_2, A_0_2
+ vpunpcklqdq A_1_0, A_0_4, A_0_4
+ vpunpcklqdq A_1_2, A_1_1, A_1_1
+ vpunpcklqdq A_1_4, A_1_3, A_1_3
+ vpunpcklqdq A_2_1, A_2_0, A_2_0
+ vpunpcklqdq A_2_3, A_2_2, A_2_2
+ vpunpcklqdq A_3_0, A_2_4, A_2_4
+ vpunpcklqdq A_3_2, A_3_1, A_3_1
+ vpunpcklqdq A_3_4, A_3_3, A_3_3
+ vpunpcklqdq A_4_1, A_4_0, A_4_0
+ vpunpcklqdq A_4_3, A_4_2, A_4_2
+ vmovdqu64 A_0_0, 0*8(%rdi)
+ vmovdqu64 A_0_2, 2*8(%rdi)
+ vmovdqu64 A_0_4, 4*8(%rdi)
+ vmovdqu64 A_1_1, 6*8(%rdi)
+ vmovdqu64 A_1_3, 8*8(%rdi)
+ vmovdqu64 A_2_0, 10*8(%rdi)
+ vmovdqu64 A_2_2, 0*8(%rax)
+ vmovdqu64 A_2_4, 2*8(%rax)
+ vmovdqu A_3_1, 4*8(%rax)
+ vmovdqu A_3_3, 6*8(%rax)
+ vmovdqu A_4_0, 8*8(%rax)
+ vmovdqu A_4_2, 10*8(%rax)
+ vmovq A_4_4, 12*8(%rax)
+
+ xorl %eax, %eax
+
+ clear_regs()
+ ret_spec_stop
+ CFI_ENDPROC()
+ELF(.size _gcry_keccak_f1600_state_permute64_avx512,
+ .-_gcry_keccak_f1600_state_permute64_avx512)
+
+.globl _gcry_keccak_absorb_blocks_avx512
+ELF(.type _gcry_keccak_absorb_blocks_avx512,@function)
+.align 64, 0xcc
+_gcry_keccak_absorb_blocks_avx512:
+ /* input:
+ * %rdi: state
+ * %rsi: round constants
+ * %rdx: lanes
+ * %rcx: nlanes
+ * %r8 : blocklanes
+ * %r9 : lanes output pointer
+ */
+ CFI_STARTPROC()
+
+ leaq 12*8(%rdi), %rax
+ leaq (24-1)*8(%rsi), %r11
+
+ vmovdqu64 0*8(%rdi), A_0_0
+ vmovdqu64 1*8(%rdi), A_0_1
+ vmovdqu64 2*8(%rdi), A_0_2
+ vmovdqu64 3*8(%rdi), A_0_3
+ vmovdqu64 4*8(%rdi), A_0_4
+ vmovdqu64 5*8(%rdi), A_1_0
+ vmovdqu64 6*8(%rdi), A_1_1
+ vmovdqu64 7*8(%rdi), A_1_2
+ vmovdqu64 8*8(%rdi), A_1_3
+ vmovdqu64 9*8(%rdi), A_1_4
+ vmovdqu64 10*8(%rdi), A_2_0
+ vmovdqu64 11*8(%rdi), A_2_1
+ vmovdqu64 0*8(%rax), A_2_2
+ vmovdqu64 1*8(%rax), A_2_3
+ vmovdqu64 2*8(%rax), A_2_4
+ vmovdqu64 3*8(%rax), A_3_0
+ vmovdqu 4*8(%rax), A_3_1
+ vmovdqu 5*8(%rax), A_3_2
+ vmovdqu 6*8(%rax), A_3_3
+ vmovdqu 7*8(%rax), A_3_4
+ vmovdqu 8*8(%rax), A_4_0
+ vmovdqu 9*8(%rax), A_4_1
+ vmovdqu 10*8(%rax), A_4_2
+ vmovdqu 11*8(%rax), A_4_3
+ vmovq 12*8(%rax), A_4_4
+
+ cmpq $(104 >> 3), %r8
+ jb .Loop_absorb_72_ce
+ je .Loop_absorb_104_ce
+ cmpq $(144 >> 3), %r8
+ jb .Loop_absorb_136_ce
+ je .Loop_absorb_144_ce
+ jmp .Loop_absorb_168_ce
+
+.align 64, 0xcc
+.Loop_absorb_168_ce:
+ subq %r8, %rcx // len - bsz
+ jb .Labsorbed_ce
+
+ vpxorq 0*8(%rdx), A_0_0, A_0_0
+ vpxorq 1*8(%rdx), A_0_1, A_0_1
+ vpxorq 2*8(%rdx), A_0_2, A_0_2
+ vpxorq 3*8(%rdx), A_0_3, A_0_3
+ vpxorq 4*8(%rdx), A_0_4, A_0_4
+ vpxorq 5*8(%rdx), A_1_0, A_1_0
+ vpxorq 6*8(%rdx), A_1_1, A_1_1
+ vpxorq 7*8(%rdx), A_1_2, A_1_2
+ vpxorq 8*8(%rdx), A_1_3, A_1_3
+ vpxorq 9*8(%rdx), A_1_4, A_1_4
+ vpxorq 10*8(%rdx), A_2_0, A_2_0
+ vpxorq 11*8(%rdx), A_2_1, A_2_1
+ vpxorq 12*8(%rdx), A_2_2, A_2_2
+ vpxorq 13*8(%rdx), A_2_3, A_2_3
+ vpxorq 14*8(%rdx), A_2_4, A_2_4
+ vpxorq 15*8(%rdx), A_3_0, A_3_0
+ vpxor 16*8(%rdx), A_3_1, A_3_1
+ vpxor 17*8(%rdx), A_3_2, A_3_2
+ vpxor 18*8(%rdx), A_3_3, A_3_3
+ vpxor 19*8(%rdx), A_3_4, A_3_4
+ vmovq 20*8(%rdx), C_0
+ leaq 21*8(%rdx), %rdx
+ vpxorq C_0, A_4_0, A_4_0
+
+ movq %rsi, %r10
+ call KeccakF1600_ce
+
+ jmp .Loop_absorb_168_ce
+
+.align 64, 0xcc
+.Loop_absorb_144_ce:
+ subq %r8, %rcx // len - bsz
+ jb .Labsorbed_ce
+
+ vpxorq 0*8(%rdx), A_0_0, A_0_0
+ vpxorq 1*8(%rdx), A_0_1, A_0_1
+ vpxorq 2*8(%rdx), A_0_2, A_0_2
+ vpxorq 3*8(%rdx), A_0_3, A_0_3
+ vpxorq 4*8(%rdx), A_0_4, A_0_4
+ vpxorq 5*8(%rdx), A_1_0, A_1_0
+ vpxorq 6*8(%rdx), A_1_1, A_1_1
+ vpxorq 7*8(%rdx), A_1_2, A_1_2
+ vpxorq 8*8(%rdx), A_1_3, A_1_3
+ vpxorq 9*8(%rdx), A_1_4, A_1_4
+ vpxorq 10*8(%rdx), A_2_0, A_2_0
+ vpxorq 11*8(%rdx), A_2_1, A_2_1
+ vpxorq 12*8(%rdx), A_2_2, A_2_2
+ vpxorq 13*8(%rdx), A_2_3, A_2_3
+ vpxorq 14*8(%rdx), A_2_4, A_2_4
+ vpxorq 15*8(%rdx), A_3_0, A_3_0
+ vpxor 16*8(%rdx), A_3_1, A_3_1
+ vmovq 17*8(%rdx), C_0
+ leaq 18*8(%rdx), %rdx
+ vpxor C_0, A_3_2, A_3_2
+
+ movq %rsi, %r10
+ call KeccakF1600_ce
+
+ jmp .Loop_absorb_144_ce
+
+.align 64, 0xcc
+.Loop_absorb_136_ce:
+ subq %r8, %rcx // len - bsz
+ jb .Labsorbed_ce
+
+ vpxorq 0*8(%rdx), A_0_0, A_0_0
+ vpxorq 1*8(%rdx), A_0_1, A_0_1
+ vpxorq 2*8(%rdx), A_0_2, A_0_2
+ vpxorq 3*8(%rdx), A_0_3, A_0_3
+ vpxorq 4*8(%rdx), A_0_4, A_0_4
+ vpxorq 5*8(%rdx), A_1_0, A_1_0
+ vpxorq 6*8(%rdx), A_1_1, A_1_1
+ vpxorq 7*8(%rdx), A_1_2, A_1_2
+ vpxorq 8*8(%rdx), A_1_3, A_1_3
+ vpxorq 9*8(%rdx), A_1_4, A_1_4
+ vpxorq 10*8(%rdx), A_2_0, A_2_0
+ vpxorq 11*8(%rdx), A_2_1, A_2_1
+ vpxorq 12*8(%rdx), A_2_2, A_2_2
+ vpxorq 13*8(%rdx), A_2_3, A_2_3
+ vpxorq 14*8(%rdx), A_2_4, A_2_4
+ vpxorq 15*8(%rdx), A_3_0, A_3_0
+ vmovq 16*8(%rdx), C_0
+ leaq 17*8(%rdx), %rdx
+ vpxor C_0, A_3_1, A_3_1
+
+ movq %rsi, %r10
+ call KeccakF1600_ce
+
+ jmp .Loop_absorb_136_ce
+
+.align 64, 0xcc
+.Loop_absorb_104_ce:
+ subq %r8, %rcx // len - bsz
+ jb .Labsorbed_ce
+
+ vpxorq 0*8(%rdx), A_0_0, A_0_0
+ vpxorq 1*8(%rdx), A_0_1, A_0_1
+ vpxorq 2*8(%rdx), A_0_2, A_0_2
+ vpxorq 3*8(%rdx), A_0_3, A_0_3
+ vpxorq 4*8(%rdx), A_0_4, A_0_4
+ vpxorq 5*8(%rdx), A_1_0, A_1_0
+ vpxorq 6*8(%rdx), A_1_1, A_1_1
+ vpxorq 7*8(%rdx), A_1_2, A_1_2
+ vpxorq 8*8(%rdx), A_1_3, A_1_3
+ vpxorq 9*8(%rdx), A_1_4, A_1_4
+ vpxorq 10*8(%rdx), A_2_0, A_2_0
+ vpxorq 11*8(%rdx), A_2_1, A_2_1
+ vmovq 12*8(%rdx), C_0
+ leaq 13*8(%rdx), %rdx
+ vpxorq C_0, A_2_2, A_2_2
+
+ movq %rsi, %r10
+ call KeccakF1600_ce
+
+ jmp .Loop_absorb_104_ce
+
+.align 64, 0xcc
+.Loop_absorb_72_ce:
+ subq %r8, %rcx // len - bsz
+ jb .Labsorbed_ce
+
+ vpxorq 0*8(%rdx), A_0_0, A_0_0
+ vpxorq 1*8(%rdx), A_0_1, A_0_1
+ vpxorq 2*8(%rdx), A_0_2, A_0_2
+ vpxorq 3*8(%rdx), A_0_3, A_0_3
+ vpxorq 4*8(%rdx), A_0_4, A_0_4
+ vpxorq 5*8(%rdx), A_1_0, A_1_0
+ vpxorq 6*8(%rdx), A_1_1, A_1_1
+ vpxorq 7*8(%rdx), A_1_2, A_1_2
+ vmovq 8*8(%rdx), C_0
+ leaq 9*8(%rdx), %rdx
+ vpxorq C_0, A_1_3, A_1_3
+
+ movq %rsi, %r10
+ call KeccakF1600_ce
+
+ jmp .Loop_absorb_72_ce
+
+.align 64, 0xcc
+.Labsorbed_ce:
+ vpunpcklqdq A_0_1, A_0_0, A_0_0
+ vpunpcklqdq A_0_3, A_0_2, A_0_2
+ vpunpcklqdq A_1_0, A_0_4, A_0_4
+ vpunpcklqdq A_1_2, A_1_1, A_1_1
+ vpunpcklqdq A_1_4, A_1_3, A_1_3
+ vpunpcklqdq A_2_1, A_2_0, A_2_0
+ vpunpcklqdq A_2_3, A_2_2, A_2_2
+ vpunpcklqdq A_3_0, A_2_4, A_2_4
+ vpunpcklqdq A_3_2, A_3_1, A_3_1
+ vpunpcklqdq A_3_4, A_3_3, A_3_3
+ vpunpcklqdq A_4_1, A_4_0, A_4_0
+ vpunpcklqdq A_4_3, A_4_2, A_4_2
+ vmovdqu64 A_0_0, 0*8(%rdi)
+ vmovdqu64 A_0_2, 2*8(%rdi)
+ vmovdqu64 A_0_4, 4*8(%rdi)
+ vmovdqu64 A_1_1, 6*8(%rdi)
+ vmovdqu64 A_1_3, 8*8(%rdi)
+ vmovdqu64 A_2_0, 10*8(%rdi)
+ vmovdqu64 A_2_2, 0*8(%rax)
+ vmovdqu64 A_2_4, 2*8(%rax)
+ vmovdqu A_3_1, 4*8(%rax)
+ vmovdqu A_3_3, 6*8(%rax)
+ vmovdqu A_4_0, 8*8(%rax)
+ vmovdqu A_4_2, 10*8(%rax)
+ vmovq A_4_4, 12*8(%rax)
+
+ leaq (%r8, %rcx), %rax // return value
+ movq %rdx, (%r9) // return buffer pointer
+
+ clear_regs()
+ ret_spec_stop
+ CFI_ENDPROC()
+ELF(.size _gcry_keccak_absorb_blocks_avx512,
+ .-_gcry_keccak_absorb_blocks_avx512)
+
+#endif /* HAVE_GCC_INLINE_ASM_AVX512 */
+#endif /* __x86_64 */
diff --git a/cipher/keccak.c b/cipher/keccak.c
index f3502022..6c027eac 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -62,6 +62,16 @@
#endif


+/* USE_64BIT_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef USE_64BIT_AVX512
+#if defined(USE_64BIT) && defined(__x86_64__) && \
+ defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_64BIT_AVX512 1
+#endif
+
+
/* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
* code. */
#undef USE_64BIT_ARM_NEON
@@ -428,6 +438,65 @@ static const keccak_ops_t keccak_bmi2_64_ops =
#endif /* USE_64BIT_BMI2 */


+/* 64-bit Intel AVX512 implementation. */
+#ifdef USE_64BIT_AVX512
+
+extern unsigned int
+_gcry_keccak_f1600_state_permute64_avx512(u64 *state, const u64 *rconst);
+
+extern unsigned int
+_gcry_keccak_absorb_blocks_avx512(u64 *state, const u64 *rconst,
+ const byte *lanes, size_t nlanes,
+ size_t blocklanes, const byte **new_lanes);
+
+static unsigned int
+keccak_f1600_state_permute64_avx512(KECCAK_STATE *hd)
+{
+ return _gcry_keccak_f1600_state_permute64_avx512 (
+ hd->u.state64, _gcry_keccak_round_consts_64bit);
+}
+
+static unsigned int
+keccak_absorb_lanes64_avx512(KECCAK_STATE *hd, int pos, const byte *lanes,
+ unsigned int nlanes, int blocklanes)
+{
+ while (nlanes)
+ {
+ if (pos == 0 && blocklanes > 0 && nlanes >= (unsigned int)blocklanes)
+ {
+ nlanes = _gcry_keccak_absorb_blocks_avx512 (
+ hd->u.state64, _gcry_keccak_round_consts_64bit,
+ lanes, nlanes, blocklanes, &lanes);
+ }
+
+ while (nlanes)
+ {
+ hd->u.state64[pos] ^= buf_get_le64 (lanes);
+ lanes += 8;
+ nlanes--;
+
+ if (++pos == blocklanes)
+ {
+ keccak_f1600_state_permute64_avx512 (hd);
+ pos = 0;
+ break;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static const keccak_ops_t keccak_avx512_64_ops =
+{
+ .permute = keccak_f1600_state_permute64_avx512,
+ .absorb = keccak_absorb_lanes64_avx512,
+ .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_AVX512 */
+
+
/* 64-bit ARMv7/NEON implementation. */
#ifdef USE_64BIT_ARM_NEON

@@ -894,6 +963,10 @@ keccak_init (int algo, void *context, unsigned int flags)

/* Select optimized implementation based in hw features. */
if (0) {}
+#ifdef USE_64BIT_AVX512
+ else if (features & HWF_INTEL_AVX512)
+ ctx->ops = &keccak_avx512_64_ops;
+#endif
#ifdef USE_64BIT_ARM_NEON
else if (features & HWF_ARM_NEON)
ctx->ops = &keccak_armv7_neon_64_ops;
diff --git a/configure.ac b/configure.ac
index b55510d8..3abee22d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3152,7 +3152,7 @@ if test "$found" = "1" ; then
case "${host}" in
x86_64-*-*)
# Build with the assembly implementation
- :
+ GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS keccak-amd64-avx512.lo"
;;
esac

--
2.34.1

