Mailing List Archive

[PATCH 6/7] camellia-avx2: add partial parallel block processing
* cipher/camellia-aesni-avx2-amd64.h: Remove unnecessary vzeroupper
from function entry.
(enc_blk1_32, dec_blk1_32): New.
* cipher/camellia-glue.c (avx_burn_stack_depth)
(avx2_burn_stack_depth): Move outside of bulk functions to deduplicate.
(_gcry_camellia_aesni_avx2_enc_blk1_32)
(_gcry_camellia_aesni_avx2_dec_blk1_32)
(_gcry_camellia_vaes_avx2_enc_blk1_32)
(_gcry_camellia_vaes_avx2_dec_blk1_32)
(_gcry_camellia_gfni_avx2_enc_blk1_32)
(_gcry_camellia_gfni_avx2_dec_blk1_32, camellia_encrypt_blk1_32)
(camellia_decrypt_blk1_32): New.
(_gcry_camellia_ctr_enc, _gcry_camellia_cbc_dec, _gcry_camellia_cfb_dec)
(_gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth): Use new bulk
processing helpers from 'bulkhelp.h' and 'camellia_encrypt_blk1_32'
and 'camellia_decrypt_blk1_32' for partial parallel processing.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/camellia-aesni-avx2-amd64.h | 209 +++++++++++++++++++--
cipher/camellia-glue.c | 292 ++++++++++++++++++++++-------
2 files changed, 421 insertions(+), 80 deletions(-)

diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h
index 8cd4b1cd..9cc5621e 100644
--- a/cipher/camellia-aesni-avx2-amd64.h
+++ b/cipher/camellia-aesni-avx2-amd64.h
@@ -1152,8 +1152,6 @@ FUNC_NAME(ctr_enc):
movq 8(%rcx), %r11;
bswapq %r11;

- vzeroupper;
-
cmpl $128, key_bitlength(CTX);
movl $32, %r8d;
movl $24, %eax;
@@ -1347,8 +1345,6 @@ FUNC_NAME(cbc_dec):
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);

- vzeroupper;
-
movq %rcx, %r9;

cmpl $128, key_bitlength(CTX);
@@ -1424,8 +1420,6 @@ FUNC_NAME(cfb_dec):
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);

- vzeroupper;
-
cmpl $128, key_bitlength(CTX);
movl $32, %r8d;
movl $24, %eax;
@@ -1510,8 +1504,6 @@ FUNC_NAME(ocb_enc):
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);

- vzeroupper;
-
subq $(16 * 32 + 4 * 8), %rsp;
andq $~63, %rsp;
movq %rsp, %rax;
@@ -1684,8 +1676,6 @@ FUNC_NAME(ocb_dec):
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);

- vzeroupper;
-
subq $(16 * 32 + 4 * 8), %rsp;
andq $~63, %rsp;
movq %rsp, %rax;
@@ -1880,8 +1870,6 @@ FUNC_NAME(ocb_auth):
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);

- vzeroupper;
-
subq $(16 * 32 + 4 * 8), %rsp;
andq $~63, %rsp;
movq %rsp, %rax;
@@ -2032,4 +2020,201 @@ FUNC_NAME(ocb_auth):
CFI_ENDPROC();
ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);)

+.align 8
+.globl FUNC_NAME(enc_blk1_32)
+ELF(.type FUNC_NAME(enc_blk1_32),@function;)
+
+FUNC_NAME(enc_blk1_32):
+ /* Run 1 to 32 blocks through __camellia_enc_blk32; input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (only nblocks blocks are stored)
+ * %rdx: src (only nblocks blocks are loaded)
+ * %ecx: nblocks (1 to 32)
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ movl %ecx, %r9d; /* copy of nblocks, used by the stores after the call */
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max: 24 when key_bitlength is 128, otherwise 32 */
+
+ subq $(16 * 32), %rsp; /* reserve 16 * 32 bytes of stack */
+ andq $~63, %rsp; /* round %rsp down to a 64-byte boundary */
+ movq %rsp, %rax; /* %rax: base of the temporary area */
+
+ cmpl $31, %ecx;
+ vpxor %xmm0, %xmm0, %xmm0; /* default last pair: zero (flags of cmpl stay intact) */
+ ja 1f; /* nblocks == 32: load blocks 30 and 31 */
+ jb 2f; /* nblocks < 31: keep the zero value */
+ vmovdqu 15 * 32(%rdx), %xmm0; /* nblocks == 31: load block 30 only */
+ jmp 2f;
+ 1:
+ vmovdqu 15 * 32(%rdx), %ymm0;
+ 2:
+ vmovdqu %ymm0, (%rax); /* park last pair on stack; %ymm0 is reused for the key */
+
+ vpbroadcastq (key_table)(CTX), %ymm0; /* first 64-bit key_table entry in each qword */
+ vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0;
+
+#define LOAD_INPUT(offset, ymm) \
+ cmpl $(1 + 2 * (offset)), %ecx; \
+ jb 2f; \
+ ja 1f; \
+ vmovdqu (offset) * 32(%rdx), %ymm##_x; \
+ vpxor %ymm0, %ymm, %ymm; \
+ jmp 2f; \
+ 1: \
+ vpxor (offset) * 32(%rdx), %ymm0, %ymm;
+
+ LOAD_INPUT(0, ymm15); /* XOR a block pair with %ymm0; exits to 2: when nblocks runs out */
+ LOAD_INPUT(1, ymm14);
+ LOAD_INPUT(2, ymm13);
+ LOAD_INPUT(3, ymm12);
+ LOAD_INPUT(4, ymm11);
+ LOAD_INPUT(5, ymm10);
+ LOAD_INPUT(6, ymm9);
+ LOAD_INPUT(7, ymm8);
+ LOAD_INPUT(8, ymm7);
+ LOAD_INPUT(9, ymm6);
+ LOAD_INPUT(10, ymm5);
+ LOAD_INPUT(11, ymm4);
+ LOAD_INPUT(12, ymm3);
+ LOAD_INPUT(13, ymm2);
+ LOAD_INPUT(14, ymm1);
+ vpxor (%rax), %ymm0, %ymm0; /* last pair: parked input XOR key (reached for nblocks >= 30) */
+
+2:
+ call __camellia_enc_blk32; /* NOTE(review): presumably clobbers %rcx, hence the %r9d copy -- confirm */
+
+#define STORE_OUTPUT(ymm, offset) \
+ cmpl $(1 + 2 * (offset)), %r9d; \
+ jb 2f; \
+ ja 1f; \
+ vmovdqu %ymm##_x, (offset) * 32(%rsi); \
+ jmp 2f; \
+ 1: \
+ vmovdqu %ymm, (offset) * 32(%rsi);
+
+ STORE_OUTPUT(ymm7, 0); /* store a pair, or one block via the _x alias; exits to 2: after %r9d blocks */
+ STORE_OUTPUT(ymm6, 1);
+ STORE_OUTPUT(ymm5, 2);
+ STORE_OUTPUT(ymm4, 3);
+ STORE_OUTPUT(ymm3, 4);
+ STORE_OUTPUT(ymm2, 5);
+ STORE_OUTPUT(ymm1, 6);
+ STORE_OUTPUT(ymm0, 7);
+ STORE_OUTPUT(ymm15, 8);
+ STORE_OUTPUT(ymm14, 9);
+ STORE_OUTPUT(ymm13, 10);
+ STORE_OUTPUT(ymm12, 11);
+ STORE_OUTPUT(ymm11, 12);
+ STORE_OUTPUT(ymm10, 13);
+ STORE_OUTPUT(ymm9, 14);
+ STORE_OUTPUT(ymm8, 15);
+
+2:
+ vzeroall; /* clear all vector registers before returning */
+
+ leave;
+ CFI_LEAVE();
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size FUNC_NAME(enc_blk1_32),.-FUNC_NAME(enc_blk1_32);)
+
+.align 8
+.globl FUNC_NAME(dec_blk1_32)
+ELF(.type FUNC_NAME(dec_blk1_32),@function;)
+
+FUNC_NAME(dec_blk1_32):
+ /* Run 1 to 32 blocks through __camellia_dec_blk32; input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (only nblocks blocks are stored)
+ * %rdx: src (only nblocks blocks are loaded)
+ * %ecx: nblocks (1 to 32)
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ movl %ecx, %r9d; /* copy of nblocks, used by the stores after the call */
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max: 24 when key_bitlength is 128, otherwise 32 */
+
+ subq $(16 * 32), %rsp; /* reserve 16 * 32 bytes of stack */
+ andq $~63, %rsp; /* round %rsp down to a 64-byte boundary */
+ movq %rsp, %rax; /* %rax: base of the temporary area */
+
+ cmpl $31, %ecx;
+ vpxor %xmm0, %xmm0, %xmm0; /* default last pair: zero (flags of cmpl stay intact) */
+ ja 1f; /* nblocks == 32: load blocks 30 and 31 */
+ jb 2f; /* nblocks < 31: keep the zero value */
+ vmovdqu 15 * 32(%rdx), %xmm0; /* nblocks == 31: load block 30 only */
+ jmp 2f;
+ 1:
+ vmovdqu 15 * 32(%rdx), %ymm0;
+ 2:
+ vmovdqu %ymm0, (%rax); /* park last pair on stack; %ymm0 is reused for the key */
+
+ vpbroadcastq (key_table)(CTX, %r8, 8), %ymm0; /* key_table entry at index %r8 (max) in each qword */
+ vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0;
+
+ LOAD_INPUT(0, ymm15); /* XOR a block pair with %ymm0; exits to 2: when nblocks runs out */
+ LOAD_INPUT(1, ymm14);
+ LOAD_INPUT(2, ymm13);
+ LOAD_INPUT(3, ymm12);
+ LOAD_INPUT(4, ymm11);
+ LOAD_INPUT(5, ymm10);
+ LOAD_INPUT(6, ymm9);
+ LOAD_INPUT(7, ymm8);
+ LOAD_INPUT(8, ymm7);
+ LOAD_INPUT(9, ymm6);
+ LOAD_INPUT(10, ymm5);
+ LOAD_INPUT(11, ymm4);
+ LOAD_INPUT(12, ymm3);
+ LOAD_INPUT(13, ymm2);
+ LOAD_INPUT(14, ymm1);
+ vpxor (%rax), %ymm0, %ymm0; /* last pair: parked input XOR key (reached for nblocks >= 30) */
+
+2:
+ call __camellia_dec_blk32; /* NOTE(review): presumably clobbers %rcx, hence the %r9d copy -- confirm */
+
+ STORE_OUTPUT(ymm7, 0); /* store a pair, or one block via the _x alias; exits to 2: after %r9d blocks */
+ STORE_OUTPUT(ymm6, 1);
+ STORE_OUTPUT(ymm5, 2);
+ STORE_OUTPUT(ymm4, 3);
+ STORE_OUTPUT(ymm3, 4);
+ STORE_OUTPUT(ymm2, 5);
+ STORE_OUTPUT(ymm1, 6);
+ STORE_OUTPUT(ymm0, 7);
+ STORE_OUTPUT(ymm15, 8);
+ STORE_OUTPUT(ymm14, 9);
+ STORE_OUTPUT(ymm13, 10);
+ STORE_OUTPUT(ymm12, 11);
+ STORE_OUTPUT(ymm11, 12);
+ STORE_OUTPUT(ymm10, 13);
+ STORE_OUTPUT(ymm9, 14);
+ STORE_OUTPUT(ymm8, 15);
+
+2:
+ vzeroall; /* clear all vector registers before returning */
+
+ leave;
+ CFI_LEAVE();
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size FUNC_NAME(dec_blk1_32),.-FUNC_NAME(dec_blk1_32);)
+
#endif /* GCRY_CAMELLIA_AESNI_AVX2_AMD64_H */
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 7f6e92d2..20ab7f7d 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -174,6 +174,10 @@ extern void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx,
extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
const unsigned char *key,
unsigned int keylen) ASM_FUNC_ABI;
+
+static const int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
#endif

#ifdef USE_AESNI_AVX2
@@ -214,6 +218,22 @@ extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx,
unsigned char *offset,
unsigned char *checksum,
const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_enc_blk1_32(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned int nblocks)
+ ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_dec_blk1_32(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned int nblocks)
+ ASM_FUNC_ABI;
+
+static const int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
#endif

#ifdef USE_VAES_AVX2
@@ -254,6 +274,18 @@ extern void _gcry_camellia_vaes_avx2_ocb_auth(CAMELLIA_context *ctx,
unsigned char *offset,
unsigned char *checksum,
const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_enc_blk1_32(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned int nblocks)
+ ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_dec_blk1_32(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned int nblocks)
+ ASM_FUNC_ABI;
#endif

#ifdef USE_GFNI_AVX2
@@ -294,6 +326,18 @@ extern void _gcry_camellia_gfni_avx2_ocb_auth(CAMELLIA_context *ctx,
unsigned char *offset,
unsigned char *checksum,
const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_enc_blk1_32(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned int nblocks)
+ ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_dec_blk1_32(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned int nblocks)
+ ASM_FUNC_ABI;
#endif

static const char *selftest(void);
@@ -475,6 +519,105 @@ camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)

#endif /*!USE_ARM_ASM*/

+/* Encrypt NUM_BLKS (at most 32) blocks from INBUF to OUTBUF; returns stack depth to burn. */
+static unsigned int
+camellia_encrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf,
+ unsigned int num_blks)
+{
+ const CAMELLIA_context *ctx = priv;
+ unsigned int stack_burn_size = 0; /* stays 0 when num_blks is 0 and no AVX2 path is taken */
+
+ gcry_assert (num_blks <= 32); /* the *_enc_blk1_32 routines below take 1 to 32 blocks */
+
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2 && num_blks >= 3)
+ {
+ /* 3 or more parallel block GFNI processing is faster than
+ * generic C implementation. */
+ _gcry_camellia_gfni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
+ return avx2_burn_stack_depth;
+ }
+#endif
+#ifdef USE_VAES_AVX2
+ if (ctx->use_vaes_avx2 && num_blks >= 6)
+ {
+ /* 6 or more parallel block VAES processing is faster than
+ * generic C implementation. */
+ _gcry_camellia_vaes_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
+ return avx2_burn_stack_depth;
+ }
+#endif
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2 && num_blks >= 6)
+ {
+ /* 6 or more parallel block AESNI processing is faster than
+ * generic C implementation. */
+ _gcry_camellia_aesni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
+ return avx2_burn_stack_depth;
+ }
+#endif
+
+ while (num_blks) /* fallback: below the thresholds above or no AVX2 variant in use */
+ {
+ stack_burn_size = camellia_encrypt((void *)ctx, outbuf, inbuf); /* one block per call */
+ outbuf += CAMELLIA_BLOCK_SIZE;
+ inbuf += CAMELLIA_BLOCK_SIZE;
+ num_blks--;
+ }
+
+ return stack_burn_size; /* value reported by the last camellia_encrypt call */
+}
+
+/* Decrypt NUM_BLKS (at most 32) blocks from INBUF to OUTBUF; returns stack depth to burn. */
+static unsigned int
+camellia_decrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf,
+ unsigned int num_blks)
+{
+ const CAMELLIA_context *ctx = priv;
+ unsigned int stack_burn_size = 0; /* stays 0 when num_blks is 0 and no AVX2 path is taken */
+
+ gcry_assert (num_blks <= 32); /* the *_dec_blk1_32 routines below take 1 to 32 blocks */
+
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2 && num_blks >= 3)
+ {
+ /* 3 or more parallel block GFNI processing is faster than
+ * generic C implementation. */
+ _gcry_camellia_gfni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
+ return avx2_burn_stack_depth;
+ }
+#endif
+#ifdef USE_VAES_AVX2
+ if (ctx->use_vaes_avx2 && num_blks >= 6)
+ {
+ /* 6 or more parallel block VAES processing is faster than
+ * generic C implementation. */
+ _gcry_camellia_vaes_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
+ return avx2_burn_stack_depth;
+ }
+#endif
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2 && num_blks >= 6)
+ {
+ /* 6 or more parallel block AESNI processing is faster than
+ * generic C implementation. */
+ _gcry_camellia_aesni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
+ return avx2_burn_stack_depth;
+ }
+#endif
+
+ while (num_blks) /* fallback: below the thresholds above or no AVX2 variant in use */
+ {
+ stack_burn_size = camellia_decrypt((void *)ctx, outbuf, inbuf); /* one block per call */
+ outbuf += CAMELLIA_BLOCK_SIZE;
+ inbuf += CAMELLIA_BLOCK_SIZE;
+ num_blks--;
+ }
+
+ return stack_burn_size; /* value reported by the last camellia_decrypt call */
+}
+
+
/* Bulk encryption of complete blocks in CTR mode. This function is only
intended for the bulk encryption feature of cipher.c. CTR is expected to be
of size CAMELLIA_BLOCK_SIZE. */
@@ -486,8 +629,7 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
CAMELLIA_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- unsigned char tmpbuf[CAMELLIA_BLOCK_SIZE];
- int burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
+ int burn_stack_depth = 0;

#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
@@ -517,9 +659,6 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,

if (did_use_aesni_avx2)
{
- int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
}
@@ -547,9 +686,6 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,

if (did_use_aesni_avx)
{
- int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx_burn_stack_depth)
burn_stack_depth = avx_burn_stack_depth;
}
@@ -559,20 +695,23 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
}
#endif

- for ( ;nblocks; nblocks-- )
+ /* Process remaining blocks. */
+ if (nblocks)
{
- /* Encrypt the counter. */
- Camellia_EncryptBlock(ctx->keybitlength, ctr, ctx->keytable, tmpbuf);
- /* XOR the input with the encrypted counter and store in output. */
- cipher_block_xor(outbuf, tmpbuf, inbuf, CAMELLIA_BLOCK_SIZE);
- outbuf += CAMELLIA_BLOCK_SIZE;
- inbuf += CAMELLIA_BLOCK_SIZE;
- /* Increment the counter. */
- cipher_block_add(ctr, 1, CAMELLIA_BLOCK_SIZE);
+ byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+ unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+ size_t nburn;
+
+ nburn = bulk_ctr_enc_128(ctx, camellia_encrypt_blk1_32, outbuf, inbuf,
+ nblocks, ctr, tmpbuf,
+ sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used);
}

- wipememory(tmpbuf, sizeof(tmpbuf));
- _gcry_burn_stack(burn_stack_depth);
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
}

/* Bulk decryption of complete blocks in CBC mode. This function is only
@@ -585,8 +724,7 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
CAMELLIA_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- unsigned char savebuf[CAMELLIA_BLOCK_SIZE];
- int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+ int burn_stack_depth = 0;

#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
@@ -616,9 +754,6 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,

if (did_use_aesni_avx2)
{
- int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;;
-
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
}
@@ -645,9 +780,6 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,

if (did_use_aesni_avx)
{
- int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx_burn_stack_depth)
burn_stack_depth = avx_burn_stack_depth;
}
@@ -656,20 +788,23 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
}
#endif

- for ( ;nblocks; nblocks-- )
+ /* Process remaining blocks. */
+ if (nblocks)
{
- /* INBUF is needed later and it may be identical to OUTBUF, so store
- the intermediate result to SAVEBUF. */
- Camellia_DecryptBlock(ctx->keybitlength, inbuf, ctx->keytable, savebuf);
+ byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+ unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+ size_t nburn;

- cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf,
- CAMELLIA_BLOCK_SIZE);
- inbuf += CAMELLIA_BLOCK_SIZE;
- outbuf += CAMELLIA_BLOCK_SIZE;
+ nburn = bulk_cbc_dec_128(ctx, camellia_decrypt_blk1_32, outbuf, inbuf,
+ nblocks, iv, tmpbuf,
+ sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used);
}

- wipememory(savebuf, sizeof(savebuf));
- _gcry_burn_stack(burn_stack_depth);
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
}

/* Bulk decryption of complete blocks in CFB mode. This function is only
@@ -682,7 +817,7 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
CAMELLIA_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+ int burn_stack_depth = 0;

#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
@@ -712,9 +847,6 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,

if (did_use_aesni_avx2)
{
- int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
}
@@ -741,9 +873,6 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,

if (did_use_aesni_avx)
{
- int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx_burn_stack_depth)
burn_stack_depth = avx_burn_stack_depth;
}
@@ -752,15 +881,23 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
}
#endif

- for ( ;nblocks; nblocks-- )
+ /* Process remaining blocks. */
+ if (nblocks)
{
- Camellia_EncryptBlock(ctx->keybitlength, iv, ctx->keytable, iv);
- cipher_block_xor_n_copy(outbuf, iv, inbuf, CAMELLIA_BLOCK_SIZE);
- outbuf += CAMELLIA_BLOCK_SIZE;
- inbuf += CAMELLIA_BLOCK_SIZE;
+ byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+ unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+ size_t nburn;
+
+ nburn = bulk_cfb_dec_128(ctx, camellia_encrypt_blk1_32, outbuf, inbuf,
+ nblocks, iv, tmpbuf,
+ sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used);
}

- _gcry_burn_stack(burn_stack_depth);
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
}

/* Bulk encryption/decryption of complete blocks in OCB mode. */
@@ -772,11 +909,9 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
CAMELLIA_context *ctx = (void *)&c->context.c;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- int burn_stack_depth;
+ int burn_stack_depth = 0;
u64 blkn = c->u_mode.ocb.data_nblocks;

- burn_stack_depth = encrypt ? CAMELLIA_encrypt_stack_burn_size :
- CAMELLIA_decrypt_stack_burn_size;
#else
(void)c;
(void)outbuf_arg;
@@ -826,9 +961,6 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

if (did_use_aesni_avx2)
{
- int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
}
@@ -870,9 +1002,6 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

if (did_use_aesni_avx)
{
- int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx_burn_stack_depth)
burn_stack_depth = avx_burn_stack_depth;
}
@@ -882,6 +1011,24 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
#endif

#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+ unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+ size_t nburn;
+
+ nburn = bulk_ocb_crypt_128 (c, ctx, encrypt ? camellia_encrypt_blk1_32
+ : camellia_decrypt_blk1_32,
+ outbuf, inbuf, nblocks, &blkn, encrypt,
+ tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used);
+ nblocks = 0;
+ }
+
c->u_mode.ocb.data_nblocks = blkn;

if (burn_stack_depth)
@@ -899,10 +1046,8 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
CAMELLIA_context *ctx = (void *)&c->context.c;
const unsigned char *abuf = abuf_arg;
- int burn_stack_depth;
+ int burn_stack_depth = 0;
u64 blkn = c->u_mode.ocb.aad_nblocks;
-
- burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
#else
(void)c;
(void)abuf_arg;
@@ -948,9 +1093,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,

if (did_use_aesni_avx2)
{
- int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
}
@@ -988,9 +1130,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,

if (did_use_aesni_avx)
{
- int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx_burn_stack_depth)
burn_stack_depth = avx_burn_stack_depth;
}
@@ -1000,6 +1139,23 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
#endif

#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+ unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+ size_t nburn;
+
+ nburn = bulk_ocb_auth_128 (c, ctx, camellia_encrypt_blk1_32,
+ abuf, nblocks, &blkn, tmpbuf,
+ sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used);
+ nblocks = 0;
+ }
+
c->u_mode.ocb.aad_nblocks = blkn;

if (burn_stack_depth)
--
2.34.1


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@lists.gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel