Mailing List Archive

[PATCH 5/5] chacha20-ppc: use target and optimize attributes for P8 and P9
* cipher/chacha20-ppc.c (_gcry_chacha20_ppc8_blocks1): Rename to...
(chacha20_ppc_blocks1): ...this; Add 'always inline' attribute.
(_gcry_chacha20_ppc8_blocks4): Rename to...
(chacha20_ppc_blocks4): ...this; Add 'always inline' attribute.
(_gcry_chacha20_poly1305_ppc8_blocks4): Rename to...
(chacha20_poly1305_ppc_blocks4): ...this; Add 'always inline'
attribute.
(FUNC_ATTR_OPT_O2, FUNC_ATTR_TARGET_P8, FUNC_ATTR_TARGET_P9): New.
(_gcry_chacha20_ppc8_blocks1, _gcry_chacha20_ppc8_blocks4)
(_gcry_chacha20_poly1305_ppc8_blocks4): New.
(_gcry_chacha20_ppc9_blocks1, _gcry_chacha20_ppc9_blocks4)
(_gcry_chacha20_poly1305_ppc9_blocks4): New.
* cipher/chacha20.c (CHACHA20_context_t): Add 'use_p9'.
(_gcry_chacha20_ppc9_blocks1, _gcry_chacha20_ppc9_blocks4)
(_gcry_chacha20_poly1305_ppc9_blocks4): New.
(chacha20_do_setkey): Set 'use_p9' if HW has HWF_PPC_ARCH_3_00.
(chacha20_blocks, do_chacha20_encrypt_stream_tail)
(_gcry_chacha20_poly1305_encrypt)
(_gcry_chacha20_poly1305_decrypt) [USE_PPC_VEC]: Add 'use_p9' paths.
--

This change makes sure that chacha20-ppc gets compiled
with the proper optimization level and the right target setting.

Benchmark on POWER9:

CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 1.11 ns/B 856.0 MiB/s 2.56 c/B
STREAM dec | 1.11 ns/B 856.0 MiB/s 2.56 c/B
POLY1305 enc | 1.57 ns/B 606.2 MiB/s 3.62 c/B
POLY1305 dec | 1.56 ns/B 610.4 MiB/s 3.59 c/B
POLY1305 auth | 0.876 ns/B 1089 MiB/s 2.02 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/chacha20-ppc.c | 118 ++++++++++++++++++++++++++++++++++++++----
cipher/chacha20.c | 55 ++++++++++++++++----
2 files changed, 154 insertions(+), 19 deletions(-)

diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c
index 4a21b837..3fe7bc8c 100644
--- a/cipher/chacha20-ppc.c
+++ b/cipher/chacha20-ppc.c
@@ -136,9 +136,8 @@ vec_add_ctr_u64(vector4x_u32 v, vector4x_u32 a)
#define ADD_U64(v,a) \
(v = vec_add_ctr_u64(v, a))

-unsigned int ASM_FUNC_ATTR
-_gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
- size_t nblks)
+static unsigned int ASM_FUNC_ATTR_INLINE
+chacha20_ppc_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks)
{
vector4x_u32 counter_1 = { 1, 0, 0, 0 };
vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
@@ -283,9 +282,8 @@ _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
ROTATE(b1, rotate_7); ROTATE(b2, rotate_7);

-unsigned int ASM_FUNC_ATTR
-_gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
- size_t nblks)
+static unsigned int ASM_FUNC_ATTR_INLINE
+chacha20_ppc_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks)
{
vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
vector4x_u32 counter_4 = { 4, 0, 0, 0 };
@@ -470,10 +468,10 @@ _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
MUL_MOD_1305_64_PART2(h2, h1, h0, r1, r0, r1_mult5); \
} while (0)

-unsigned int ASM_FUNC_ATTR
-_gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
- size_t nblks, POLY1305_STATE *st,
- const byte *poly1305_src)
+static unsigned int ASM_FUNC_ATTR_INLINE
+chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src,
+ size_t nblks, POLY1305_STATE *st,
+ const byte *poly1305_src)
{
vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
vector4x_u32 counter_4 = { 4, 0, 0, 0 };
@@ -641,6 +639,106 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
return 0;
}

+#else
+
+static unsigned int ASM_FUNC_ATTR_INLINE
+chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src,
+ size_t nblks, POLY1305_STATE *st,
+ const byte *poly1305_src)
+{
+}
+
#endif /* SIZEOF_UNSIGNED_LONG == 8 */

+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT_O2
+#endif
+
+#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9")))
+#else
+# define FUNC_ATTR_TARGET_P8
+# define FUNC_ATTR_TARGET_P9
+#endif
+
+
+/* Functions targeting POWER8. */
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
+ size_t nblks)
+{
+ return chacha20_ppc_blocks1(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
+ size_t nblks)
+{
+ return chacha20_ppc_blocks4(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2
+_gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
+ size_t nblks, POLY1305_STATE *st,
+ const byte *poly1305_src)
+{
+ return chacha20_poly1305_ppc_blocks4(state, dst, src, nblks, st,
+ poly1305_src);
+}
+
+#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET
+/* Functions targeting POWER9. */
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, const byte *src,
+ size_t nblks)
+{
+ return chacha20_ppc_blocks1(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst, const byte *src,
+ size_t nblks)
+{
+ return chacha20_ppc_blocks4(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_poly1305_ppc9_blocks4(u32 *state, byte *dst, const byte *src,
+ size_t nblks, POLY1305_STATE *st,
+ const byte *poly1305_src)
+{
+ return chacha20_poly1305_ppc_blocks4(state, dst, src, nblks, st,
+ poly1305_src);
+}
+#else
+/* Compiler does not support the target attribute, so the POWER9 entry points
+ * simply forward to the POWER8 functions. */
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, const byte *src,
+ size_t nblks)
+{
+ return _gcry_chacha20_ppc8_blocks1(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst, const byte *src,
+ size_t nblks)
+{
+ return _gcry_chacha20_ppc8_blocks4(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_poly1305_ppc9_blocks4(u32 *state, byte *dst, const byte *src,
+ size_t nblks, POLY1305_STATE *st,
+ const byte *poly1305_src)
+{
+ return _gcry_chacha20_poly1305_ppc8_blocks4(state, dst, src, nblks, st,
+ poly1305_src);
+}
+#endif /* HAVE_GCC_ATTRIBUTE_PPC_TARGET */
+
#endif /* ENABLE_PPC_CRYPTO_SUPPORT */
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index a7e0dd63..d979d263 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -134,6 +134,7 @@ typedef struct CHACHA20_context_s
unsigned int use_avx512:1;
unsigned int use_neon:1;
unsigned int use_ppc:1;
+ unsigned int use_p9:1;
unsigned int use_p10:1;
unsigned int use_s390x:1;
} CHACHA20_context_t;
@@ -195,12 +196,24 @@ unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst,
const byte *src,
size_t nblks);

+unsigned int _gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks);
+
+unsigned int _gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks);
+
#undef USE_PPC_VEC_POLY1305
#if SIZEOF_UNSIGNED_LONG == 8
#define USE_PPC_VEC_POLY1305 1
unsigned int _gcry_chacha20_poly1305_ppc8_blocks4(
u32 *state, byte *dst, const byte *src, size_t nblks,
POLY1305_STATE *st, const byte *poly1305_src);
+
+unsigned int _gcry_chacha20_poly1305_ppc9_blocks4(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ POLY1305_STATE *st, const byte *poly1305_src);
#endif /* SIZEOF_UNSIGNED_LONG == 8 */

#endif /* USE_PPC_VEC */
@@ -369,7 +382,10 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
#ifdef USE_PPC_VEC
if (ctx->use_ppc)
{
- return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks);
+ if (ctx->use_p9)
+ return _gcry_chacha20_ppc9_blocks1(ctx->input, dst, src, nblks);
+ else
+ return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks);
}
#endif

@@ -509,6 +525,7 @@ chacha20_do_setkey (CHACHA20_context_t *ctx,
#endif
#ifdef USE_PPC_VEC
ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0;
+ ctx->use_p9 = (features & HWF_PPC_ARCH_3_00) != 0;
# ifndef WORDS_BIGENDIAN
ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0;
# ifdef ENABLE_FORCE_SOFT_HWFEATURES
@@ -626,18 +643,25 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;
+ if (0)
+ {}
#ifndef WORDS_BIGENDIAN
/*
* A workaround to skip counter overflow. This is rare.
*/
- if (ctx->use_p10 && nblocks >= 8
- && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU)
+ else if (ctx->use_p10 && nblocks >= 8
+ && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU)
{
size_t len = nblocks * CHACHA20_BLOCK_SIZE;
nburn = _gcry_chacha20_p10le_8x(ctx->input, outbuf, inbuf, len);
}
- else
#endif
+ else if (ctx->use_p9)
+ {
+ nburn = _gcry_chacha20_ppc9_blocks4(ctx->input, outbuf, inbuf,
+ nblocks);
+ }
+ else
{
nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf,
nblocks);
@@ -844,7 +868,10 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
}
else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
{
- nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4);
+ if (ctx->use_p9)
+ nburn = _gcry_chacha20_ppc9_blocks4(ctx->input, outbuf, inbuf, 4);
+ else
+ nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4);
burn = nburn > burn ? nburn : burn;

authptr = outbuf;
@@ -986,7 +1013,12 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;

- nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
+ if (ctx->use_p9)
+ nburn = _gcry_chacha20_poly1305_ppc9_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ else
+ nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
ctx->input, outbuf, inbuf, nblocks,
&c->u_mode.poly1305.ctx.state, authptr);
burn = nburn > burn ? nburn : burn;
@@ -1212,9 +1244,14 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;

- nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
- ctx->input, outbuf, inbuf, nblocks,
- &c->u_mode.poly1305.ctx.state, inbuf);
+ if (ctx->use_p9)
+ nburn = _gcry_chacha20_poly1305_ppc9_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ else
+ nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
burn = nburn > burn ? nburn : burn;

length -= nblocks * CHACHA20_BLOCK_SIZE;
--
2.37.2


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel