Mailing List Archive

[PATCH 2/7] Add GFNI/AVX2 implementation of Camellia
* cipher/Makefile.am: Add "camellia-gfni-avx2-amd64.S".
* cipher/camellia-aesni-avx2-amd64.h [CAMELLIA_GFNI_BUILD]: Add GFNI
support.
* cipher/camellia-gfni-avx2-amd64.S: New.
* cipher/camellia-glue.c (USE_GFNI_AVX2): New.
(CAMELLIA_context) [USE_AESNI_AVX2]: New member "use_gfni_avx2".
[USE_GFNI_AVX2] (_gcry_camellia_gfni_avx2_ctr_enc)
(_gcry_camellia_gfni_avx2_cbc_dec, _gcry_camellia_gfni_avx2_cfb_dec)
(_gcry_camellia_gfni_avx2_ocb_enc, _gcry_camellia_gfni_avx2_ocb_dec)
(_gcry_camellia_gfni_avx2_ocb_auth): New.
(camellia_setkey) [USE_GFNI_AVX2]: Enable GFNI if supported by HW.
(_gcry_camellia_ctr_enc) [USE_GFNI_AVX2]: Add GFNI support.
(_gcry_camellia_cbc_dec) [USE_GFNI_AVX2]: Add GFNI support.
(_gcry_camellia_cfb_dec) [USE_GFNI_AVX2]: Add GFNI support.
(_gcry_camellia_ocb_crypt) [USE_GFNI_AVX2]: Add GFNI support.
(_gcry_camellia_ocb_auth) [USE_GFNI_AVX2]: Add GFNI support.
* configure.ac: Add "camellia-gfni-avx2-amd64.lo".
--

Benchmark on Intel Core i3-1115G4 (tigerlake):

Before (VAES/AVX2 implementation):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC dec | 0.579 ns/B 1646 MiB/s 2.37 c/B 4090
CFB dec | 0.579 ns/B 1648 MiB/s 2.37 c/B 4089
CTR enc | 0.586 ns/B 1628 MiB/s 2.40 c/B 4090
CTR dec | 0.587 ns/B 1626 MiB/s 2.40 c/B 4090
OCB enc | 0.607 ns/B 1570 MiB/s 2.48 c/B 4089
OCB dec | 0.611 ns/B 1561 MiB/s 2.50 c/B 4089
OCB auth | 0.602 ns/B 1585 MiB/s 2.46 c/B 4089

After (~80% faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC dec | 0.299 ns/B 3186 MiB/s 1.22 c/B 4090
CFB dec | 0.314 ns/B 3039 MiB/s 1.28 c/B 4089
CTR enc | 0.322 ns/B 2962 MiB/s 1.32 c/B 4090
CTR dec | 0.321 ns/B 2970 MiB/s 1.31 c/B 4090
OCB enc | 0.339 ns/B 2817 MiB/s 1.38 c/B 4089
OCB dec | 0.346 ns/B 2756 MiB/s 1.41 c/B 4089
OCB auth | 0.337 ns/B 2831 MiB/s 1.38 c/B 4089

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/Makefile.am | 5 +-
cipher/camellia-aesni-avx2-amd64.h | 249 ++++++++++++++++++++++++++++-
cipher/camellia-gfni-avx2-amd64.S | 34 ++++
cipher/camellia-glue.c | 170 +++++++++++++-------
configure.ac | 3 +
5 files changed, 398 insertions(+), 63 deletions(-)
create mode 100644 cipher/camellia-gfni-avx2-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 07e5ba26..7a429e8b 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -139,8 +139,9 @@ EXTRA_libcipher_la_SOURCES = \
twofish-avx2-amd64.S \
rfc2268.c \
camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
- camellia-aesni-avx2-amd64.h camellia-vaes-avx2-amd64.S \
- camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
+ camellia-aesni-avx2-amd64.h camellia-gfni-avx2-amd64.S \
+ camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \
+ camellia-arm.S camellia-aarch64.S \
blake2.c \
blake2b-amd64-avx2.S blake2s-amd64-avx.S

diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h
index e93c40b8..8cd4b1cd 100644
--- a/cipher/camellia-aesni-avx2-amd64.h
+++ b/cipher/camellia-aesni-avx2-amd64.h
@@ -1,6 +1,6 @@
-/* camellia-aesni-avx2-amd64.h - AES-NI/VAES/AVX2 implementation of Camellia
+/* camellia-aesni-avx2-amd64.h - AES-NI/VAES/GFNI/AVX2 implementation of Camellia
*
- * Copyright (C) 2013-2015,2020-2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013-2015,2020-2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -36,6 +36,8 @@
/**********************************************************************
helper macros
**********************************************************************/
+
+#ifndef CAMELLIA_GFNI_BUILD
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
vpand x, mask4bit, tmp0; \
vpandn x, mask4bit, x; \
@@ -44,6 +46,7 @@
vpshufb tmp0, lo_t, tmp0; \
vpshufb x, hi_t, x; \
vpxor tmp0, x, x;
+#endif

#define ymm0_x xmm0
#define ymm1_x xmm1
@@ -70,11 +73,61 @@
# define IF_VAES(...)
#endif

+/**********************************************************************
+ GFNI helper macros and constants
+ **********************************************************************/
+
+#ifdef CAMELLIA_GFNI_BUILD
+
+#define BV8(a0,a1,a2,a3,a4,a5,a6,a7) \
+ ( (((a0) & 1) << 0) | \
+ (((a1) & 1) << 1) | \
+ (((a2) & 1) << 2) | \
+ (((a3) & 1) << 3) | \
+ (((a4) & 1) << 4) | \
+ (((a5) & 1) << 5) | \
+ (((a6) & 1) << 6) | \
+ (((a7) & 1) << 7) )
+
+#define BM8X8(l0,l1,l2,l3,l4,l5,l6,l7) \
+ ( ((l7) << (0 * 8)) | \
+ ((l6) << (1 * 8)) | \
+ ((l5) << (2 * 8)) | \
+ ((l4) << (3 * 8)) | \
+ ((l3) << (4 * 8)) | \
+ ((l2) << (5 * 8)) | \
+ ((l1) << (6 * 8)) | \
+ ((l0) << (7 * 8)) )
+
+/* Pre-filters and post-filters constants for Camellia sboxes s1, s2, s3 and s4.
+ * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Constant from "θ₁(x)" and "θ₄(x)" functions. */
+#define pre_filter_constant_s1234 BV8(1, 0, 1, 0, 0, 0, 1, 0)
+
+/* Constant from "ψ₁(A(x))" function: */
+#define post_filter_constant_s14 BV8(0, 1, 1, 1, 0, 1, 1, 0)
+
+/* Constant from "ψ₂(A(x))" function: */
+#define post_filter_constant_s2 BV8(0, 0, 1, 1, 1, 0, 1, 1)
+
+/* Constant from "ψ₃(A(x))" function: */
+#define post_filter_constant_s3 BV8(1, 1, 1, 0, 1, 1, 0, 0)
+
+#endif /* CAMELLIA_GFNI_BUILD */
+
/**********************************************************************
32-way camellia
**********************************************************************/

-/*
+#ifdef CAMELLIA_GFNI_BUILD
+
+/* roundsm32 (GFNI version)
* IN:
* x0..x7: byte-sliced AB state
* mem_cd: register pointer storing CD state
@@ -82,7 +135,119 @@
* OUT:
* x0..x7: new byte-sliced CD state
*/
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \
+ t6, t7, mem_cd, key) \
+ /* \
+ * S-function with AES subbytes \
+ */ \
+ vpbroadcastq .Lpre_filter_bitmatrix_s123 rRIP, t5; \
+ vpbroadcastq .Lpre_filter_bitmatrix_s4 rRIP, t2; \
+ vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \
+ vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \
+ vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \
+ vpxor t7##_x, t7##_x, t7##_x; \
+ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
+ \
+ /* prefilter sboxes */ \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x7, x7; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t2, x3, x3; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t2, x6, x6; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x2, x2; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x5, x5; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x1, x1; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x4, x4; \
+ \
+ /* sbox GF8 inverse + postfilter sboxes 1 and 4 */ \
+ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x0, x0; \
+ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x7, x7; \
+ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x3, x3; \
+ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x6, x6; \
+ \
+ /* sbox GF8 inverse + postfilter sbox 3 */ \
+ vgf2p8affineinvqb $(post_filter_constant_s3), t6, x2, x2; \
+ vgf2p8affineinvqb $(post_filter_constant_s3), t6, x5, x5; \
+ \
+ /* sbox GF8 inverse + postfilter sbox 2 */ \
+ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \
+ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \
+ \
+ vpsrldq $1, t0, t1; \
+ vpsrldq $2, t0, t2; \
+ vpshufb t7, t1, t1; \
+ vpsrldq $3, t0, t3; \
+ \
+ /* P-function */ \
+ vpxor x5, x0, x0; \
+ vpxor x6, x1, x1; \
+ vpxor x7, x2, x2; \
+ vpxor x4, x3, x3; \
+ \
+ vpshufb t7, t2, t2; \
+ vpsrldq $4, t0, t4; \
+ vpshufb t7, t3, t3; \
+ vpsrldq $5, t0, t5; \
+ vpshufb t7, t4, t4; \
+ \
+ vpxor x2, x4, x4; \
+ vpxor x3, x5, x5; \
+ vpxor x0, x6, x6; \
+ vpxor x1, x7, x7; \
+ \
+ vpsrldq $6, t0, t6; \
+ vpshufb t7, t5, t5; \
+ vpshufb t7, t6, t6; \
+ \
+ vpxor x7, x0, x0; \
+ vpxor x4, x1, x1; \
+ vpxor x5, x2, x2; \
+ vpxor x6, x3, x3; \
+ \
+ vpxor x3, x4, x4; \
+ vpxor x0, x5, x5; \
+ vpxor x1, x6, x6; \
+ vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+ \
+ /* Add key material and result to CD (x becomes new CD) */ \
+ \
+ vpxor t6, x1, x1; \
+ vpxor 5 * 32(mem_cd), x1, x1; \
+ \
+ vpsrldq $7, t0, t6; \
+ vpshufb t7, t0, t0; \
+ vpshufb t7, t6, t7; \
+ \
+ vpxor t7, x0, x0; \
+ vpxor 4 * 32(mem_cd), x0, x0; \
+ \
+ vpxor t5, x2, x2; \
+ vpxor 6 * 32(mem_cd), x2, x2; \
+ \
+ vpxor t4, x3, x3; \
+ vpxor 7 * 32(mem_cd), x3, x3; \
+ \
+ vpxor t3, x4, x4; \
+ vpxor 0 * 32(mem_cd), x4, x4; \
+ \
+ vpxor t2, x5, x5; \
+ vpxor 1 * 32(mem_cd), x5, x5; \
+ \
+ vpxor t1, x6, x6; \
+ vpxor 2 * 32(mem_cd), x6, x6; \
+ \
+ vpxor t0, x7, x7; \
+ vpxor 3 * 32(mem_cd), x7, x7;

+#else /* CAMELLIA_GFNI_BUILD */
+
+/* roundsm32 (AES-NI / VAES version)
+ * IN:
+ * x0..x7: byte-sliced AB state
+ * mem_cd: register pointer storing CD state
+ * key: index for key material
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \
t6, t7, mem_cd, key) \
/* \
@@ -181,7 +346,7 @@
/* postfilter sbox 2 */ \
filter_8bit(x1, t4, t5, t7, t2); \
filter_8bit(x4, t4, t5, t7, t2); \
- vpxor t7, t7, t7; \
+ vpxor t7##_x, t7##_x, t7##_x; \
\
vpsrldq $1, t0, t1; \
vpsrldq $2, t0, t2; \
@@ -249,6 +414,8 @@
vpxor t0, x7, x7; \
vpxor 3 * 32(mem_cd), x7, x7;

+#endif /* CAMELLIA_GFNI_BUILD */
+
/*
* IN/OUT:
* x0..x7: byte-sliced AB state preloaded
@@ -623,6 +790,9 @@
#define SHUFB_BYTES(idx) \
0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

+FUNC_NAME(_constants):
+ELF(.type FUNC_NAME(_constants),@object;)
+
.Lshufb_16x16b:
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
@@ -635,6 +805,74 @@
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

+#ifdef CAMELLIA_GFNI_BUILD
+
+/* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3
+ * and s4.
+ * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Bit-matrix from "θ₁(x)" function: */
+.Lpre_filter_bitmatrix_s123:
+ .quad BM8X8(BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(0, 0, 1, 1, 0, 0, 1, 0),
+ BV8(1, 1, 0, 1, 0, 0, 0, 0),
+ BV8(1, 0, 1, 1, 0, 0, 1, 1),
+ BV8(0, 0, 0, 0, 1, 1, 0, 0),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 0, 1, 0, 1, 1, 0, 0),
+ BV8(1, 0, 0, 0, 0, 1, 1, 0))
+
+/* Bit-matrix from "θ₄(x)" function: */
+.Lpre_filter_bitmatrix_s4:
+ .quad BM8X8(BV8(1, 1, 0, 1, 1, 0, 1, 1),
+ BV8(0, 1, 1, 0, 0, 1, 0, 0),
+ BV8(1, 0, 1, 0, 0, 0, 0, 1),
+ BV8(0, 1, 1, 0, 0, 1, 1, 1),
+ BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(0, 1, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 1, 0, 1))
+
+/* Bit-matrix from "ψ₁(A(x))" function: */
+.Lpost_filter_bitmatrix_s14:
+ .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+ BV8(0, 1, 1, 0, 0, 1, 1, 0),
+ BV8(1, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 0, 1, 1),
+ BV8(1, 0, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 0, 1, 1, 1, 1, 0),
+ BV8(0, 1, 1, 1, 1, 1, 1, 1),
+ BV8(0, 0, 0, 1, 1, 1, 0, 0))
+
+/* Bit-matrix from "ψ₂(A(x))" function: */
+.Lpost_filter_bitmatrix_s2:
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1),
+ BV8(0, 1, 1, 0, 0, 1, 1, 0),
+ BV8(1, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 0, 1, 1),
+ BV8(1, 0, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 0, 1, 1, 1, 1, 0),
+ BV8(0, 1, 1, 1, 1, 1, 1, 1))
+
+/* Bit-matrix from "ψ₃(A(x))" function: */
+.Lpost_filter_bitmatrix_s3:
+ .quad BM8X8(BV8(0, 1, 1, 0, 0, 1, 1, 0),
+ BV8(1, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 0, 1, 1),
+ BV8(1, 0, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 0, 1, 1, 1, 1, 0),
+ BV8(0, 1, 1, 1, 1, 1, 1, 1),
+ BV8(0, 0, 0, 1, 1, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+#else /* CAMELLIA_GFNI_BUILD */
+
/*
* pre-SubByte transform
*
@@ -756,6 +994,9 @@
.L0f0f0f0f:
.long 0x0f0f0f0f

+#endif /* CAMELLIA_GFNI_BUILD */
+
+ELF(.size FUNC_NAME(_constants),.-FUNC_NAME(_constants);)

.align 8
ELF(.type __camellia_enc_blk32,@function;)
diff --git a/cipher/camellia-gfni-avx2-amd64.S b/cipher/camellia-gfni-avx2-amd64.S
new file mode 100644
index 00000000..20c9a432
--- /dev/null
+++ b/cipher/camellia-gfni-avx2-amd64.S
@@ -0,0 +1,34 @@
+/* camellia-gfni-avx2-amd64.S - GFNI/AVX2 implementation of Camellia cipher
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+
+#define CAMELLIA_GFNI_BUILD 1
+#define FUNC_NAME(func) _gcry_camellia_gfni_avx2_ ## func
+
+#include "camellia-aesni-avx2-amd64.h"
+
+#endif /* defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) */
+#endif /* __x86_64 */
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 72c02d77..7f009db4 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -97,6 +97,12 @@
# define USE_VAES_AVX2 1
#endif

+/* USE_GFNI_AVX2 indicates whether to compile with Intel GFNI/AVX2 code. */
+#undef USE_GFNI_AVX2
+#if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT)
+# define USE_GFNI_AVX2 1
+#endif
+
typedef struct
{
KEY_TABLE_TYPE keytable;
@@ -107,6 +113,7 @@ typedef struct
#ifdef USE_AESNI_AVX2
unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used. */
unsigned int use_vaes_avx2:1; /* VAES/AVX2 implementation shall be used. */
+ unsigned int use_gfni_avx2:1; /* GFNI/AVX2 implementation shall be used. */
#endif /*USE_AESNI_AVX2*/
} CAMELLIA_context;

@@ -248,6 +255,46 @@ extern void _gcry_camellia_vaes_avx2_ocb_auth(CAMELLIA_context *ctx,
const u64 Ls[32]) ASM_FUNC_ABI;
#endif

+#ifdef USE_GFNI_AVX2
+/* Assembler implementations of Camellia using GFNI and AVX2. Process data
+ in 32 blocks at the same time.
+ */
+extern void _gcry_camellia_gfni_avx2_ctr_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_cbc_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_cfb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_ocb_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_ocb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_ocb_auth(CAMELLIA_context *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+#endif
+
static const char *selftest(void);

static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr,
@@ -272,7 +319,8 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
CAMELLIA_context *ctx=c;
static int initialized=0;
static const char *selftest_failed=NULL;
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || defined(USE_VAES_AVX2)
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) \
+ || defined(USE_VAES_AVX2) || defined(USE_GFNI_AVX2)
unsigned int hwf = _gcry_get_hw_features ();
#endif

@@ -296,10 +344,14 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
#ifdef USE_AESNI_AVX2
ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
ctx->use_vaes_avx2 = 0;
+ ctx->use_gfni_avx2 = 0;
#endif
#ifdef USE_VAES_AVX2
ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2);
#endif
+#ifdef USE_GFNI_AVX2
+ ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
+#endif

ctx->keybitlength=keylen*8;

@@ -440,20 +492,22 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
+ typeof (&_gcry_camellia_aesni_avx2_ctr_enc) bulk_ctr_fn =
+ _gcry_camellia_aesni_avx2_ctr_enc;
+
#ifdef USE_VAES_AVX2
- int use_vaes = ctx->use_vaes_avx2;
+ if (ctx->use_vaes_avx2)
+ bulk_ctr_fn =_gcry_camellia_vaes_avx2_ctr_enc;
+#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ bulk_ctr_fn =_gcry_camellia_gfni_avx2_ctr_enc;
#endif

/* Process data in 32 block chunks. */
while (nblocks >= 32)
{
-#ifdef USE_VAES_AVX2
- if (use_vaes)
- _gcry_camellia_vaes_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
- else
-#endif
- _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
-
+ bulk_ctr_fn (ctx, outbuf, inbuf, ctr);
nblocks -= 32;
outbuf += 32 * CAMELLIA_BLOCK_SIZE;
inbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -537,20 +591,22 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
+ typeof (&_gcry_camellia_aesni_avx2_cbc_dec) bulk_cbc_fn =
+ _gcry_camellia_aesni_avx2_cbc_dec;
+
#ifdef USE_VAES_AVX2
- int use_vaes = ctx->use_vaes_avx2;
+ if (ctx->use_vaes_avx2)
+ bulk_cbc_fn =_gcry_camellia_vaes_avx2_cbc_dec;
+#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ bulk_cbc_fn =_gcry_camellia_gfni_avx2_cbc_dec;
#endif

/* Process data in 32 block chunks. */
while (nblocks >= 32)
{
-#ifdef USE_VAES_AVX2
- if (use_vaes)
- _gcry_camellia_vaes_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
- else
-#endif
- _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
-
+ bulk_cbc_fn (ctx, outbuf, inbuf, iv);
nblocks -= 32;
outbuf += 32 * CAMELLIA_BLOCK_SIZE;
inbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -631,20 +687,22 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
+ typeof (&_gcry_camellia_aesni_avx2_cfb_dec) bulk_cfb_fn =
+ _gcry_camellia_aesni_avx2_cfb_dec;
+
#ifdef USE_VAES_AVX2
- int use_vaes = ctx->use_vaes_avx2;
+ if (ctx->use_vaes_avx2)
+ bulk_cfb_fn =_gcry_camellia_vaes_avx2_cfb_dec;
+#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ bulk_cfb_fn =_gcry_camellia_gfni_avx2_cfb_dec;
#endif

/* Process data in 32 block chunks. */
while (nblocks >= 32)
{
-#ifdef USE_VAES_AVX2
- if (use_vaes)
- _gcry_camellia_vaes_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
- else
-#endif
- _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
-
+ bulk_cfb_fn (ctx, outbuf, inbuf, iv);
nblocks -= 32;
outbuf += 32 * CAMELLIA_BLOCK_SIZE;
inbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -729,10 +787,6 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
-#ifdef USE_VAES_AVX2
- int encrypt_use_vaes = encrypt && ctx->use_vaes_avx2;
- int decrypt_use_vaes = !encrypt && ctx->use_vaes_avx2;
-#endif
u64 Ls[32];
unsigned int n = 32 - (blkn % 32);
u64 *l;
@@ -740,6 +794,21 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

if (nblocks >= 32)
{
+ typeof (&_gcry_camellia_aesni_avx2_ocb_dec) bulk_ocb_fn =
+ encrypt ? _gcry_camellia_aesni_avx2_ocb_enc
+ : _gcry_camellia_aesni_avx2_ocb_dec;
+
+#ifdef USE_VAES_AVX2
+ if (ctx->use_vaes_avx2)
+ bulk_ocb_fn = encrypt ? _gcry_camellia_vaes_avx2_ocb_enc
+ : _gcry_camellia_vaes_avx2_ocb_dec;
+#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ bulk_ocb_fn = encrypt ? _gcry_camellia_gfni_avx2_ocb_enc
+ : _gcry_camellia_gfni_avx2_ocb_dec;
+#endif
+
for (i = 0; i < 32; i += 8)
{
/* Use u64 to store pointers for x32 support (assembly function
@@ -764,21 +833,7 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
blkn += 32;
*l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);

- if (0) {}
-#ifdef USE_VAES_AVX2
- else if (encrypt_use_vaes)
- _gcry_camellia_vaes_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
- else if (decrypt_use_vaes)
- _gcry_camellia_vaes_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
-#endif
- else if (encrypt)
- _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
- else
- _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
+ bulk_ocb_fn (ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls);

nblocks -= 32;
outbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -891,9 +946,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
-#ifdef USE_VAES_AVX2
- int use_vaes = ctx->use_vaes_avx2;
-#endif
u64 Ls[32];
unsigned int n = 32 - (blkn % 32);
u64 *l;
@@ -901,6 +953,18 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,

if (nblocks >= 32)
{
+ typeof (&_gcry_camellia_aesni_avx2_ocb_auth) bulk_auth_fn =
+ _gcry_camellia_aesni_avx2_ocb_auth;
+
+#ifdef USE_VAES_AVX2
+ if (ctx->use_vaes_avx2)
+ bulk_auth_fn = _gcry_camellia_vaes_avx2_ocb_auth;
+#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ bulk_auth_fn = _gcry_camellia_gfni_avx2_ocb_auth;
+#endif
+
for (i = 0; i < 32; i += 8)
{
/* Use u64 to store pointers for x32 support (assembly function
@@ -925,16 +989,8 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
blkn += 32;
*l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);

-#ifdef USE_VAES_AVX2
- if (use_vaes)
- _gcry_camellia_vaes_avx2_ocb_auth(ctx, abuf,
- c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
- else
-#endif
- _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf,
- c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
+ bulk_auth_fn (ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);

nblocks -= 32;
abuf += 32 * CAMELLIA_BLOCK_SIZE;
diff --git a/configure.ac b/configure.ac
index 15c92018..c5d61657 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2755,6 +2755,9 @@ if test "$found" = "1" ; then

# Build with the VAES/AVX2 implementation
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-vaes-avx2-amd64.lo"
+
+ # Build with the GFNI/AVX2 implementation
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx2-amd64.lo"
fi
fi
fi
--
2.34.1


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@lists.gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel