[PATCH v2] camellia-gfni: use GFNI for uint8 right shift in FLS
* cipher/camellia-aesni-avx2-amd64.h (IF_GFNI, IF_NOT_GFNI): New.
[CAMELLIA_GFNI_BUILD] (rol32_1_32): Add GFNI variant which uses
vgf2p8affineqb for uint8 right shift by 7.
(fls32): Load 'right shift by 7' bit-matrix on GFNI build.
[CAMELLIA_GFNI_BUILD] (.Lright_shift_by_7): New.
* cipher/camellia-gfni-avx512-amd64.S (clear_regs): Don't clear %k1.
(rol32_1_64): Use vgf2p8affineqb for uint8 right shift by 7.
(fls64): Adjust for rol32_1_64 changes.
(.Lbyte_ones): Remove.
(.Lright_shift_by_7): New.
(_gcry_camellia_gfni_avx512_ctr_enc): Clear %k1 after use.
--

Benchmark on Intel Core i3-1115G4:

Before:
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.194 ns/B      4920 MiB/s     0.794 c/B      4096±4
        ECB dec |     0.194 ns/B      4916 MiB/s     0.793 c/B      4089

After (~1.7% faster):
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.190 ns/B      5008 MiB/s     0.780 c/B      4096±3
        ECB dec |     0.191 ns/B      5002 MiB/s     0.781 c/B      4096±3
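
The gain comes from the carry extraction in the rotate: the old AVX-512
code needed a vpcmpltb into %k1 plus a masked vpaddb for each of the
four vectors, all serialized through %k1, while one vgf2p8affineqb per
vector now computes the per-byte '>> 7' directly. gf2p8affineqb
evaluates A*x ^ b over GF(2) for each input byte x, so a matrix whose
only non-zero row takes input bit 7 to output bit 0 is exactly that
shift. Below is a minimal standalone check of the matrix using the VEX
intrinsics (a hypothetical harness, not part of the patch; the
0x8000000000000000 constant assumes the BM8X8/BV8 row packing used in
these files; build with e.g. "gcc -O2 -mavx2 -mgfni"):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  /* .Lright_shift_by_7 as a qword: only the row that produces output
   * bit 0 is non-zero, and it selects input bit 7, so A*x == x >> 7
   * for every input byte x. */
  const __m256i A = _mm256_set1_epi64x((long long)0x8000000000000000ULL);
  uint8_t in[32], out[32];
  int i;

  for (i = 0; i < 32; i++)
    in[i] = (uint8_t)(i * 37 + 11);

  __m256i x = _mm256_loadu_si256((const __m256i *)in);
  __m256i y = _mm256_gf2p8affine_epi64_epi8(x, A, 0); /* y = A*x ^ 0 */
  _mm256_storeu_si256((__m256i *)out, y);

  for (i = 0; i < 32; i++)
    if (out[i] != (uint8_t)(in[i] >> 7))
      return printf("mismatch at byte %d\n", i), 1;
  puts("ok");
  return 0;
}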

[v2]:
Do the same optimization for the GFNI build of "cipher/camellia-aesni-avx2-amd64.h".
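
For context, the rol32_1_* macros implement the rotate-left-by-1 inside
fls/FLS on byte-sliced data: the four vectors hold the four byte lanes
of the 32-bit words, vpaddb doubles each lane, and the '>> 7' results
are OR-ed into the next lane as carries. A scalar model of that carry
chain (illustrative only; lane order assumed, with the macro wrapping
v3's carry into v0):

#include <stdint.h>
#include <assert.h>

/* b[0..3] are the byte lanes of one 32-bit word, b[0] least
 * significant; carries go b0->b1, b1->b2, b2->b3 and wrap b3->b0. */
static void rol32_1_model(uint8_t b[4])
{
  uint8_t c0 = b[0] >> 7, c1 = b[1] >> 7, c2 = b[2] >> 7, c3 = b[3] >> 7;
  b[0] = (uint8_t)(b[0] << 1) | c3;  /* vpaddb v0,v0,v0; vpor t,v0,v0 */
  b[1] = (uint8_t)(b[1] << 1) | c0;
  b[2] = (uint8_t)(b[2] << 1) | c1;
  b[3] = (uint8_t)(b[3] << 1) | c2;
}

int main(void)
{
  uint8_t b[4] = { 0x01, 0x00, 0x00, 0x80 };  /* 0x80000001 */
  rol32_1_model(b);
  /* 0x80000001 rotated left by 1 is 0x00000003. */
  assert(b[0] == 0x03 && b[1] == 0x00 && b[2] == 0x00 && b[3] == 0x00);
  return 0;
}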

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/camellia-aesni-avx2-amd64.h | 43 ++++++++++++++++++++++++++++-
cipher/camellia-gfni-avx512-amd64.S | 37 +++++++++++++------------
2 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h
index 003c4496..dff8b386 100644
--- a/cipher/camellia-aesni-avx2-amd64.h
+++ b/cipher/camellia-aesni-avx2-amd64.h
@@ -73,6 +73,14 @@
# define IF_VAES(...)
#endif

+#ifdef CAMELLIA_GFNI_BUILD
+# define IF_GFNI(...) __VA_ARGS__
+# define IF_NOT_GFNI(...)
+#else
+# define IF_GFNI(...)
+# define IF_NOT_GFNI(...) __VA_ARGS__
+#endif
+
/**********************************************************************
GFNI helper macros and constants
**********************************************************************/
@@ -459,6 +467,26 @@
* OUT:
* v0..3: (IN <<< 1)
*/
+#ifdef CAMELLIA_GFNI_BUILD
+#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, right_shift_by_7) \
+ vgf2p8affineqb $0, right_shift_by_7, v0, t0; \
+ vpaddb v0, v0, v0; \
+ \
+ vgf2p8affineqb $0, right_shift_by_7, v1, t1; \
+ vpaddb v1, v1, v1; \
+ \
+ vgf2p8affineqb $0, right_shift_by_7, v2, t2; \
+ vpaddb v2, v2, v2; \
+ \
+ vpor t0, v1, v1; \
+ \
+ vgf2p8affineqb $0, right_shift_by_7, v3, t0; \
+ vpaddb v3, v3, v3; \
+ \
+ vpor t1, v2, v2; \
+ vpor t2, v3, v3; \
+ vpor t0, v0, v0;
+#else
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
vpcmpgtb v0, zero, t0; \
vpaddb v0, v0, v0; \
@@ -481,6 +509,7 @@
vpor t1, v2, v2; \
vpor t2, v3, v3; \
vpor t0, v0, v0;
+#endif

/*
* IN:
@@ -496,7 +525,8 @@
* t0 &= ll; \
* lr ^= rol32(t0, 1); \
*/ \
- vpxor tt0, tt0, tt0; \
+ IF_NOT_GFNI(vpxor tt0, tt0, tt0); \
+ IF_GFNI(vpbroadcastq .Lright_shift_by_7 rRIP, tt0); \
vpbroadcastb 0+kll, t3; \
vpbroadcastb 1+kll, t2; \
vpbroadcastb 2+kll, t1; \
@@ -867,6 +897,17 @@ ELF(.type FUNC_NAME(_constants),@object;)
BV8(0, 0, 0, 1, 1, 1, 0, 0),
BV8(0, 0, 0, 0, 0, 0, 0, 1))

+/* Bit-matrix for right shifting uint8_t values in vector by 7. */
+.Lright_shift_by_7:
+ .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0))
+
#else /* CAMELLIA_GFNI_BUILD */

/*
diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S
index b676379f..643eed3e 100644
--- a/cipher/camellia-gfni-avx512-amd64.S
+++ b/cipher/camellia-gfni-avx512-amd64.S
@@ -105,7 +105,6 @@
clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31)

#define clear_regs() \
- kxorq %k1, %k1, %k1; \
vzeroall; \
clear_zmm16_zmm31()

@@ -307,22 +306,18 @@
* v0..3: (IN << 1)
* t0, t1, t2, zero: (IN >> 7)
*/
-#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, zero, one) \
- vpcmpltb zero, v0, %k1; \
+#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, t3, right_shift_by_7) \
+ vgf2p8affineqb $0, right_shift_by_7, v0, t0; \
vpaddb v0, v0, v0; \
- vpaddb one, zero, t0{%k1}{z}; \
\
- vpcmpltb zero, v1, %k1; \
+ vgf2p8affineqb $0, right_shift_by_7, v1, t1; \
vpaddb v1, v1, v1; \
- vpaddb one, zero, t1{%k1}{z}; \
\
- vpcmpltb zero, v2, %k1; \
+ vgf2p8affineqb $0, right_shift_by_7, v2, t2; \
vpaddb v2, v2, v2; \
- vpaddb one, zero, t2{%k1}{z}; \
\
- vpcmpltb zero, v3, %k1; \
- vpaddb v3, v3, v3; \
- vpaddb one, zero, zero{%k1}{z};
+ vgf2p8affineqb $0, right_shift_by_7, v3, t3; \
+ vpaddb v3, v3, v3;

/*
* IN:
@@ -338,8 +333,7 @@
* t0 &= ll; \
* lr ^= rol32(t0, 1); \
*/ \
- vpbroadcastq .Lbyte_ones rRIP, tmp; \
- vpxor tt3##_y, tt3##_y, tt3##_y; \
+ vpbroadcastq .Lright_shift_by_7 rRIP, tmp; \
vpbroadcastb 0+kll, t3; \
vpbroadcastb 1+kll, t2; \
vpbroadcastb 2+kll, t1; \
@@ -360,7 +354,6 @@
vmovdqu64 l6, l##_6; \
vpternlogq $0x96, tt3, t3, l7; \
vmovdqu64 l7, l##_7; \
- vpxor tt3##_y, tt3##_y, tt3##_y; \
\
/* \
* t2 = krr; \
@@ -399,7 +392,6 @@
vpternlogq $0x96, tt1, t1, r##_5; \
vpternlogq $0x96, tt0, t2, r##_6; \
vpternlogq $0x96, tt3, t3, r##_7; \
- vpxor tt3##_y, tt3##_y, tt3##_y; \
\
/* \
* t0 = klr; \
@@ -596,9 +588,6 @@ ELF(.type _gcry_camellia_gfni_avx512__constants,@object;)
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

-.Lbyte_ones:
- .quad 0x0101010101010101
-
/* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3
* and s4.
* See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
@@ -663,6 +652,17 @@ ELF(.type _gcry_camellia_gfni_avx512__constants,@object;)
BV8(0, 0, 0, 1, 1, 1, 0, 0),
BV8(0, 0, 0, 0, 0, 0, 0, 1))

+/* Bit-matrix for right shifting uint8_t values in vector by 7. */
+.Lright_shift_by_7:
+ .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 0))
+
/* CTR byte addition constants */
.align 64
.Lbige_addb_0_1:
@@ -904,6 +904,7 @@ _gcry_camellia_gfni_avx512_ctr_enc:
add_le128(%zmm2, %zmm6, %zmm24, %zmm25); /* +52... */
add_le128(%zmm1, %zmm5, %zmm24, %zmm25); /* +56... */
add_le128(%zmm0, %zmm4, %zmm24, %zmm25); /* +60... */
+ kxorq %k1, %k1, %k1;

.align 4
.Lload_ctr_done:
--
2.37.2

