Mailing List Archive

[PATCH 6/8] mpi/ec: small optimization for ec_mulm_448
* mpi/ec.c (ec_addm_448, ec_subm_448): Change the order of sub_n/add_n
and set_cond to remove the need to clear 'n'.
(ec_mulm_448): Use memcpy where possible; use mpih_rshift where
possible; use mpih_lshift for doubling a3; remove one addition
at the end.
--

Benchmarks on AMD Ryzen 7 5800X:

Before:
Ed448  | nanosecs/iter  cycles/iter  auto Mhz
keygen |        893096      4343326      4863
sign   |        988422      4795694      4852
verify |       1899706      9215952      4851

After (~5% faster):
Ed448  | nanosecs/iter  cycles/iter  auto Mhz
keygen |        822078      3987952      4851
sign   |        947327      4595433      4851
verify |       1776259      8616675      4851

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
mpi/ec.c | 76 ++++++++++++++++----------------------------------------
1 file changed, 22 insertions(+), 54 deletions(-)

diff --git a/mpi/ec.c b/mpi/ec.c
index e1d4b32c..4fabf9b4 100644
--- a/mpi/ec.c
+++ b/mpi/ec.c
@@ -464,14 +464,13 @@ ec_addm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize)
log_bug ("addm_448: different sizes\n");

- memset (n, 0, sizeof n);
up = u->d;
vp = v->d;
wp = w->d;

cy = _gcry_mpih_add_n (wp, up, vp, wsize);
- mpih_set_cond (n, ctx->p->d, wsize, (cy != 0UL));
- _gcry_mpih_sub_n (wp, wp, n, wsize);
+ _gcry_mpih_sub_n (n, wp, ctx->p->d, wsize);
+ mpih_set_cond (wp, n, wsize, (cy != 0UL));
}

static void
@@ -485,14 +484,13 @@ ec_subm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize)
log_bug ("subm_448: different sizes\n");

- memset (n, 0, sizeof n);
up = u->d;
vp = v->d;
wp = w->d;

borrow = _gcry_mpih_sub_n (wp, up, vp, wsize);
- mpih_set_cond (n, ctx->p->d, wsize, (borrow != 0UL));
- _gcry_mpih_add_n (wp, wp, n, wsize);
+ _gcry_mpih_add_n (n, wp, ctx->p->d, wsize);
+ mpih_set_cond (wp, n, wsize, (borrow != 0UL));
}

static void
@@ -506,10 +504,6 @@ ec_mulm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
mpi_limb_t b0[LIMB_SIZE_HALF_448];
mpi_limb_t b1[LIMB_SIZE_HALF_448];
mpi_limb_t cy;
- int i;
-#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2)
- mpi_limb_t b1_rest, a3_rest;
-#endif

if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize)
log_bug ("mulm_448: different sizes\n");
@@ -520,61 +514,37 @@ ec_mulm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)

_gcry_mpih_mul_n (n, up, vp, wsize);

- for (i = 0; i < (wsize + 1)/ 2; i++)
- {
- b0[i] = n[i];
- b1[i] = n[i+wsize/2];
- a2[i] = n[i+wsize];
- a3[i] = n[i+wsize+wsize/2];
- }
+ memcpy (b0, n, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB);
+ memcpy (a2, n + wsize, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB);

#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2)
b0[LIMB_SIZE_HALF_448-1] &= ((mpi_limb_t)1UL<<32)-1;
a2[LIMB_SIZE_HALF_448-1] &= ((mpi_limb_t)1UL<<32)-1;
-
- b1_rest = 0;
- a3_rest = 0;
-
- for (i = (wsize + 1)/ 2 -1; i >= 0; i--)
- {
- mpi_limb_t b1v, a3v;
- b1v = b1[i];
- a3v = a3[i];
- b1[i] = (b1_rest<<32) | (b1v >> 32);
- a3[i] = (a3_rest<<32) | (a3v >> 32);
- b1_rest = b1v & (((mpi_limb_t)1UL <<32)-1);
- a3_rest = a3v & (((mpi_limb_t)1UL <<32)-1);
- }
+ _gcry_mpih_rshift (b1, n + wsize/2, LIMB_SIZE_HALF_448, 32);
+ _gcry_mpih_rshift (a3, n + wsize + wsize/2, LIMB_SIZE_HALF_448, 32);
+#else
+ memcpy (b1, n + wsize/2, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB);
+ memcpy (a3, n + wsize + wsize/2, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB);
#endif

cy = _gcry_mpih_add_n (b0, b0, a2, LIMB_SIZE_HALF_448);
- cy += _gcry_mpih_add_n (b0, b0, a3, LIMB_SIZE_HALF_448);
- for (i = 0; i < (wsize + 1)/ 2; i++)
- wp[i] = b0[i];
+ cy += _gcry_mpih_add_n (wp, b0, a3, LIMB_SIZE_HALF_448);
#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2)
+ cy = wp[LIMB_SIZE_HALF_448-1] >> 32;
wp[LIMB_SIZE_HALF_448-1] &= (((mpi_limb_t)1UL <<32)-1);
#endif
+ memset (b0, 0, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB);
+ b0[0] = cy;

-#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2)
- cy = b0[LIMB_SIZE_HALF_448-1] >> 32;
-#endif
-
- cy = _gcry_mpih_add_1 (b1, b1, LIMB_SIZE_HALF_448, cy);
+ cy = _gcry_mpih_add_n (b1, b1, b0, LIMB_SIZE_HALF_448);
+ cy += _gcry_mpih_lshift (a3, a3, LIMB_SIZE_HALF_448, 1);
cy += _gcry_mpih_add_n (b1, b1, a2, LIMB_SIZE_HALF_448);
cy += _gcry_mpih_add_n (b1, b1, a3, LIMB_SIZE_HALF_448);
- cy += _gcry_mpih_add_n (b1, b1, a3, LIMB_SIZE_HALF_448);
#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2)
- b1_rest = 0;
- for (i = (wsize + 1)/ 2 -1; i >= 0; i--)
- {
- mpi_limb_t b1v = b1[i];
- b1[i] = (b1_rest<<32) | (b1v >> 32);
- b1_rest = b1v & (((mpi_limb_t)1UL <<32)-1);
- }
- wp[LIMB_SIZE_HALF_448-1] |= (b1_rest << 32);
+ cy = _gcry_mpih_rshift (b1, b1, LIMB_SIZE_HALF_448, 32);
+ wp[LIMB_SIZE_HALF_448-1] |= cy;
#endif
- for (i = 0; i < wsize / 2; i++)
- wp[i+(wsize + 1) / 2] = b1[i];
+ memcpy (wp + LIMB_SIZE_HALF_448, b1, (wsize / 2) * BYTES_PER_MPI_LIMB);

#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2)
cy = b1[LIMB_SIZE_HALF_448-1];
@@ -590,10 +560,8 @@ ec_mulm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
n[0] = cy;
_gcry_mpih_add_n (wp, wp, n, wsize);

- memset (n, 0, wsize * BYTES_PER_MPI_LIMB);
- cy = _gcry_mpih_sub_n (wp, wp, ctx->p->d, wsize);
- mpih_set_cond (n, ctx->p->d, wsize, (cy != 0UL));
- _gcry_mpih_add_n (wp, wp, n, wsize);
+ cy = _gcry_mpih_sub_n (n, wp, ctx->p->d, wsize);
+ mpih_set_cond (wp, n, wsize, (cy == 0UL));
}

static void
--
2.30.2


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@gnupg.org
http://lists.gnupg.org/mailman/listinfo/gcrypt-devel