Mailing List Archive

[PATCH 7/8] mpi/ec: add fast reduction functions for NIST curves
* configure.ac (ASM_DISABLED): New.
* mpi/Makefile.am: Add 'ec-nist.c' and 'ec-inline.h'.
* mpi/ec-nist.c: New.
* mpi/ec-inline.h: New.
* mpi/ec-internal.h (_gcry_mpi_ec_nist192_mod)
(_gcry_mpi_ec_nist224_mod, _gcry_mpi_ec_nist256_mod)
(_gcry_mpi_ec_nist384_mod, _gcry_mpi_ec_nist521_mod): New.
* mpi/ec.c (ec_addm, ec_subm, ec_mulm, ec_mul2): Use
'ctx->mod'.
(field_table): Add 'mod' function; Add NIST reduction
functions.
(ec_p_init): Setup ctx->mod; Setup function pointers
from field_table only if pointer is not NULL; Resize
ctx->a and ctx->b only if set.
* mpi/mpi-internal.h (RESIZE_AND_CLEAR_IF_NEEDED): New.
* mpi/mpiutil.c (_gcry_mpi_resize): Clear all unused
limbs also in realloc case.
* src/ec-context.h (mpi_ec_ctx_s): Add 'mod' function.
--

Benchmark on AMD Ryzen 7 5800X (x86_64):

Before:
NIST-P192 | nanosecs/iter cycles/iter auto Mhz
mult | 283346 1369473 4833
keygen | 1688442 8185744 4848
sign | 549683 2662984 4845
verify | 615284 2984325 4850
=
NIST-P224 | nanosecs/iter cycles/iter auto Mhz
mult | 516443 2501173 4843
keygen | 2859746 13866802 4849
sign | 918472 4455043 4850
verify | 1057940 5131372 4850
=
NIST-P256 | nanosecs/iter cycles/iter auto Mhz
mult | 423536 2054040 4850
keygen | 2383097 11557572 4850
sign | 774346 3754243 4848
verify | 864934 4196315 4852
=
NIST-P384 | nanosecs/iter cycles/iter auto Mhz
mult | 929985 4511881 4852
keygen | 5230788 25367299 4850
sign | 1671432 8109726 4852
verify | 1902729 9228568 4850
=
NIST-P521 | nanosecs/iter cycles/iter auto Mhz
mult | 2123546 10300952 4851
keygen | 12019340 58297774 4850
sign | 3886988 18853054 4850
verify | 4507885 21864015 4850

After (P192 ~40% faster, P224 ~75%, P256 ~20%, P384 ~30%,
P521 ~70%):
NIST-P192 | nanosecs/iter cycles/iter auto Mhz
mult | 202359 981824 4852
keygen | 1200904 5826518 4852
sign | 401894 1949316 4850
verify | 423803 2056328 4852
=
NIST-P224 | nanosecs/iter cycles/iter auto Mhz
mult | 263122 1276615 4852
keygen | 1597091 7744280 4849
sign | 533359 2586848 4850
verify | 566961 2749355 4849
=
NIST-P256 | nanosecs/iter cycles/iter auto Mhz
mult | 327378 1587801 4850
keygen | 1940152 9412679 4852
sign | 635126 3080381 4850
verify | 696237 3377836 4852
=
NIST-P384 | nanosecs/iter cycles/iter auto Mhz
mult | 679447 3295097 4850
keygen | 3947485 19153305 4852
sign | 1299829 6304910 4851
verify | 1432189 6947326 4851
=
NIST-P521 | nanosecs/iter cycles/iter auto Mhz
mult | 1193042 5783179 4847
keygen | 7066699 34268023 4849
sign | 2316380 11243342 4854
verify | 2561696 12427975 4851

Benchmark on AMD Ryzen 7 5800X (i386):

Before:
NIST-P192 | nanosecs/iter cycles/iter auto Mhz
mult | 648039 3143236 4850
keygen | 3554452 17244822 4852
sign | 1163173 5641932 4850
verify | 1300076 6305673 4850
=
NIST-P224 | nanosecs/iter cycles/iter auto Mhz
mult | 798607 3874405 4851
keygen | 4657604 22589864 4850
sign | 1515803 7352049 4850
verify | 1635470 7935373 4852
=
NIST-P256 | nanosecs/iter cycles/iter auto Mhz
mult | 927033 4496283 4850
keygen | 5313601 25771983 4850
sign | 1735795 8418514 4850
verify | 1945804 9438212 4851
=
NIST-P384 | nanosecs/iter cycles/iter auto Mhz
mult | 2301781 11164473 4850
keygen | 12856001 62353242 4850
sign | 4161041 20180651 4850
verify | 4705961 22827478 4851
=
NIST-P521 | nanosecs/iter cycles/iter auto Mhz
mult | 6066635 29422721 4850
keygen | 32995868 160046407 4850
sign | 10503306 50945387 4850
verify | 12225252 59294323 4850

After (P192 ~30% faster, P224 ~30%, P256 ~15%, P384 ~30%,
P512 ~60%):
NIST-P192 | nanosecs/iter cycles/iter auto Mhz
mult | 447264 2171704 4856
keygen | 2663624 12919311 4850
sign | 888711 4310193 4850
verify | 953472 4625715 4851
=
NIST-P224 | nanosecs/iter cycles/iter auto Mhz
mult | 583938 2832214 4850
keygen | 3586202 17400678 4852
sign | 1161292 5632489 4850
verify | 1290940 6260807 4850
=
NIST-P256 | nanosecs/iter cycles/iter auto Mhz
mult | 776855 3767916 4850
keygen | 4602972 22319865 4849
sign | 1515662 7351140 4850
verify | 1670549 8102798 4850
=
NIST-P384 | nanosecs/iter cycles/iter auto Mhz
mult | 1723216 8357338 4850
keygen | 9917621 48103110 4850
sign | 3245974 15742417 4850
verify | 3611383 17517179 4851
=
NIST-P521 | nanosecs/iter cycles/iter auto Mhz
mult | 3407184 16523457 4850
keygen | 20007819 97041269 4850
sign | 6556658 31799190 4850
verify | 7185487 34853568 4851

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
configure.ac | 3 +
mpi/Makefile.am | 2 +-
mpi/ec-inline.h | 1141 ++++++++++++++++++++++++++++++++++++++++++++
mpi/ec-internal.h | 16 +
mpi/ec-nist.c | 787 ++++++++++++++++++++++++++++++
mpi/ec.c | 90 +++-
mpi/mpi-internal.h | 5 +
mpi/mpiutil.c | 2 +-
src/ec-context.h | 1 +
9 files changed, 2029 insertions(+), 18 deletions(-)
create mode 100644 mpi/ec-inline.h
create mode 100644 mpi/ec-nist.c

diff --git a/configure.ac b/configure.ac
index a874c411..9017efc7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -546,6 +546,9 @@ AC_ARG_ENABLE([asm],
[try_asm_modules=$enableval],
[try_asm_modules=yes])
AC_MSG_RESULT($try_asm_modules)
+if test "$try_asm_modules" != yes ; then
+ AC_DEFINE(ASM_DISABLED,1,[Defined if --disable-asm was used to configure])
+fi

# Implementation of the --enable-m-guard switch.
AC_MSG_CHECKING([whether memory guard is requested])
diff --git a/mpi/Makefile.am b/mpi/Makefile.am
index d06594e1..adb8e6f5 100644
--- a/mpi/Makefile.am
+++ b/mpi/Makefile.am
@@ -175,5 +175,5 @@ libmpi_la_SOURCES = longlong.h \
mpih-mul.c \
mpih-const-time.c \
mpiutil.c \
- ec.c ec-internal.h ec-ed25519.c
+ ec.c ec-internal.h ec-ed25519.c ec-nist.c ec-inline.h
EXTRA_libmpi_la_SOURCES = asm-common-aarch64.h
diff --git a/mpi/ec-inline.h b/mpi/ec-inline.h
new file mode 100644
index 00000000..e77e6c15
--- /dev/null
+++ b/mpi/ec-inline.h
@@ -0,0 +1,1141 @@
+/* ec-inline.h - EC inline addition/substraction helpers
+ * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_EC_INLINE_H
+#define GCRY_EC_INLINE_H
+
+
+#if BYTES_PER_MPI_LIMB == 8
+
+/* 64-bit limb definitions for 64-bit architectures. */
+
+#define LIMBS_PER_LIMB64 1
+#define LOAD64(x, pos) ((x)[pos])
+#define STORE64(x, pos, v) ((x)[pos] = (mpi_limb_t)(v))
+#define LIMB_TO64(v) ((mpi_limb_t)(v))
+#define LIMB_FROM64(v) ((mpi_limb_t)(v))
+#define HIBIT_LIMB64(v) ((mpi_limb_t)(v) >> (BITS_PER_MPI_LIMB - 1))
+#define HI32_LIMB64(v) (u32)((mpi_limb_t)(v) >> (BITS_PER_MPI_LIMB - 32))
+#define LO32_LIMB64(v) ((u32)(v))
+#define LIMB64_C(hi, lo) (((mpi_limb_t)(u32)(hi) << 32) | (u32)(lo))
+#define STORE64_COND(x, pos, mask1, val1, mask2, val2) \
+ ((x)[(pos)] = ((mask1) & (val1)) | ((mask2) & (val2)))
+
+typedef mpi_limb_t mpi_limb64_t;
+
+static inline u32
+LOAD32(mpi_ptr_t x, unsigned int pos)
+{
+ unsigned int shr = (pos % 2) * 32;
+ return (x[pos / 2] >> shr);
+}
+
+static inline mpi_limb64_t
+LIMB64_HILO(u32 hi, u32 lo)
+{
+ mpi_limb64_t v = hi;
+ return (v << 32) | lo;
+}
+
+
+/* x86-64 addition/subtraction helpers. */
+#if defined (__x86_64__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4
+
+#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
+ __asm__ ("addq %8, %2\n" \
+ "adcq %7, %1\n" \
+ "adcq %6, %0\n" \
+ : "=r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B2)), \
+ "1" ((mpi_limb_t)(B1)), \
+ "2" ((mpi_limb_t)(B0)), \
+ "g" ((mpi_limb_t)(C2)), \
+ "g" ((mpi_limb_t)(C1)), \
+ "g" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB3_LIMB64(A3, A2, A1, A0, B2, B1, B0, C2, C1, C0) \
+ __asm__ ("subq %8, %2\n" \
+ "sbbq %7, %1\n" \
+ "sbbq %6, %0\n" \
+ : "=r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B2)), \
+ "1" ((mpi_limb_t)(B1)), \
+ "2" ((mpi_limb_t)(B0)), \
+ "g" ((mpi_limb_t)(C2)), \
+ "g" ((mpi_limb_t)(C1)), \
+ "g" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+ __asm__ ("addq %11, %3\n" \
+ "adcq %10, %2\n" \
+ "adcq %9, %1\n" \
+ "adcq %8, %0\n" \
+ : "=r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B3)), \
+ "1" ((mpi_limb_t)(B2)), \
+ "2" ((mpi_limb_t)(B1)), \
+ "3" ((mpi_limb_t)(B0)), \
+ "g" ((mpi_limb_t)(C3)), \
+ "g" ((mpi_limb_t)(C2)), \
+ "g" ((mpi_limb_t)(C1)), \
+ "g" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+ __asm__ ("subq %11, %3\n" \
+ "sbbq %10, %2\n" \
+ "sbbq %9, %1\n" \
+ "sbbq %8, %0\n" \
+ : "=r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B3)), \
+ "1" ((mpi_limb_t)(B2)), \
+ "2" ((mpi_limb_t)(B1)), \
+ "3" ((mpi_limb_t)(B0)), \
+ "g" ((mpi_limb_t)(C3)), \
+ "g" ((mpi_limb_t)(C2)), \
+ "g" ((mpi_limb_t)(C1)), \
+ "g" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
+ C4, C3, C2, C1, C0) \
+ __asm__ ("addq %14, %4\n" \
+ "adcq %13, %3\n" \
+ "adcq %12, %2\n" \
+ "adcq %11, %1\n" \
+ "adcq %10, %0\n" \
+ : "=r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B4)), \
+ "1" ((mpi_limb_t)(B3)), \
+ "2" ((mpi_limb_t)(B2)), \
+ "3" ((mpi_limb_t)(B1)), \
+ "4" ((mpi_limb_t)(B0)), \
+ "g" ((mpi_limb_t)(C4)), \
+ "g" ((mpi_limb_t)(C3)), \
+ "g" ((mpi_limb_t)(C2)), \
+ "g" ((mpi_limb_t)(C1)), \
+ "g" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
+ C4, C3, C2, C1, C0) \
+ __asm__ ("subq %14, %4\n" \
+ "sbbq %13, %3\n" \
+ "sbbq %12, %2\n" \
+ "sbbq %11, %1\n" \
+ "sbbq %10, %0\n" \
+ : "=r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B4)), \
+ "1" ((mpi_limb_t)(B3)), \
+ "2" ((mpi_limb_t)(B2)), \
+ "3" ((mpi_limb_t)(B1)), \
+ "4" ((mpi_limb_t)(B0)), \
+ "g" ((mpi_limb_t)(C4)), \
+ "g" ((mpi_limb_t)(C3)), \
+ "g" ((mpi_limb_t)(C2)), \
+ "g" ((mpi_limb_t)(C1)), \
+ "g" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define ADD7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
+ C6, C5, C4, C3, C2, C1, C0) do { \
+ mpi_limb64_t __C[7] = { C0, C1, C2, C3, C4, C5, C6 }; \
+ __asm__ ("addq 0*8(%14), %6\n" \
+ "adcq 1*8(%14), %5\n" \
+ "adcq 2*8(%14), %4\n" \
+ "adcq 3*8(%14), %3\n" \
+ "adcq 4*8(%14), %2\n" \
+ "adcq 5*8(%14), %1\n" \
+ "adcq 6*8(%14), %0\n" \
+ : "=r" (A6), \
+ "=&r" (A5), \
+ "=&r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B6)), \
+ "1" ((mpi_limb_t)(B5)), \
+ "2" ((mpi_limb_t)(B4)), \
+ "3" ((mpi_limb_t)(B3)), \
+ "4" ((mpi_limb_t)(B2)), \
+ "5" ((mpi_limb_t)(B1)), \
+ "6" ((mpi_limb_t)(B0)), \
+ "r" (__C) \
+ : "cc", "memory"); \
+ } while (0)
+
+#define SUB7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
+ C6, C5, C4, C3, C2, C1, C0) do { \
+ mpi_limb64_t __C[7] = { C0, C1, C2, C3, C4, C5, C6 }; \
+ __asm__ ("subq 0*8(%14), %6\n" \
+ "sbbq 1*8(%14), %5\n" \
+ "sbbq 2*8(%14), %4\n" \
+ "sbbq 3*8(%14), %3\n" \
+ "sbbq 4*8(%14), %2\n" \
+ "sbbq 5*8(%14), %1\n" \
+ "sbbq 6*8(%14), %0\n" \
+ : "=r" (A6), \
+ "=&r" (A5), \
+ "=&r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B6)), \
+ "1" ((mpi_limb_t)(B5)), \
+ "2" ((mpi_limb_t)(B4)), \
+ "3" ((mpi_limb_t)(B3)), \
+ "4" ((mpi_limb_t)(B2)), \
+ "5" ((mpi_limb_t)(B1)), \
+ "6" ((mpi_limb_t)(B0)), \
+ "r" (__C) \
+ : "cc", "memory"); \
+ } while (0)
+
+#endif /* __x86_64__ */
+
+
+/* ARM AArch64 addition/subtraction helpers. */
+#if defined (__aarch64__) && defined(HAVE_CPU_ARCH_ARM) && __GNUC__ >= 4
+
+#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
+ __asm__ ("adds %2, %5, %8\n" \
+ "adcs %1, %4, %7\n" \
+ "adc %0, %3, %6\n" \
+ : "=r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
+ __asm__ ("subs %2, %5, %8\n" \
+ "sbcs %1, %4, %7\n" \
+ "sbc %0, %3, %6\n" \
+ : "=r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+ __asm__ ("adds %3, %7, %11\n" \
+ "adcs %2, %6, %10\n" \
+ "adcs %1, %5, %9\n" \
+ "adc %0, %4, %8\n" \
+ : "=r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+ __asm__ ("subs %3, %7, %11\n" \
+ "sbcs %2, %6, %10\n" \
+ "sbcs %1, %5, %9\n" \
+ "sbc %0, %4, %8\n" \
+ : "=r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
+ C4, C3, C2, C1, C0) \
+ __asm__ ("adds %4, %9, %14\n" \
+ "adcs %3, %8, %13\n" \
+ "adcs %2, %7, %12\n" \
+ "adcs %1, %6, %11\n" \
+ "adc %0, %5, %10\n" \
+ : "=r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B4)), \
+ "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C4)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
+ C4, C3, C2, C1, C0) \
+ __asm__ ("subs %4, %9, %14\n" \
+ "sbcs %3, %8, %13\n" \
+ "sbcs %2, %7, %12\n" \
+ "sbcs %1, %6, %11\n" \
+ "sbc %0, %5, %10\n" \
+ : "=r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B4)), \
+ "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C4)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define ADD7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
+ C6, C5, C4, C3, C2, C1, C0) \
+ __asm__ ("adds %6, %13, %20\n" \
+ "adcs %5, %12, %19\n" \
+ "adcs %4, %11, %18\n" \
+ "adcs %3, %10, %17\n" \
+ "adcs %2, %9, %16\n" \
+ "adcs %1, %8, %15\n" \
+ "adc %0, %7, %14\n" \
+ : "=r" (A6), \
+ "=&r" (A5), \
+ "=&r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B6)), \
+ "r" ((mpi_limb_t)(B5)), \
+ "r" ((mpi_limb_t)(B4)), \
+ "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C6)), \
+ "r" ((mpi_limb_t)(C5)), \
+ "r" ((mpi_limb_t)(C4)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
+ C6, C5, C4, C3, C2, C1, C0) \
+ __asm__ ("subs %6, %13, %20\n" \
+ "sbcs %5, %12, %19\n" \
+ "sbcs %4, %11, %18\n" \
+ "sbcs %3, %10, %17\n" \
+ "sbcs %2, %9, %16\n" \
+ "sbcs %1, %8, %15\n" \
+ "sbc %0, %7, %14\n" \
+ : "=r" (A6), \
+ "=&r" (A5), \
+ "=&r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B6)), \
+ "r" ((mpi_limb_t)(B5)), \
+ "r" ((mpi_limb_t)(B4)), \
+ "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C6)), \
+ "r" ((mpi_limb_t)(C5)), \
+ "r" ((mpi_limb_t)(C4)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#endif /* __aarch64__ */
+
+
+/* PowerPC64 addition/subtraction helpers. */
+#if defined (__powerpc__) && defined(HAVE_CPU_ARCH_PPC) && __GNUC__ >= 4
+
+#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
+ __asm__ ("addc %2, %8, %5\n" \
+ "adde %1, %7, %4\n" \
+ "adde %0, %6, %3\n" \
+ : "=r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc", "r0")
+
+#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
+ __asm__ ("subfc %2, %8, %5\n" \
+ "subfe %1, %7, %4\n" \
+ "subfe %0, %6, %3\n" \
+ : "=r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc", "r0")
+
+#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+ __asm__ ("addc %3, %11, %7\n" \
+ "adde %2, %10, %6\n" \
+ "adde %1, %9, %5\n" \
+ "adde %0, %8, %4\n" \
+ : "=r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+ __asm__ ("subfc %3, %11, %7\n" \
+ "subfe %2, %10, %6\n" \
+ "subfe %1, %9, %5\n" \
+ "subfe %0, %8, %4\n" \
+ : "=r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
+ C4, C3, C2, C1, C0) \
+ __asm__ ("addc %4, %14, %9\n" \
+ "adde %3, %13, %8\n" \
+ "adde %2, %12, %7\n" \
+ "adde %1, %11, %6\n" \
+ "adde %0, %10, %5\n" \
+ : "=r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B4)), \
+ "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C4)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
+ C4, C3, C2, C1, C0) \
+ __asm__ ("subfc %4, %14, %9\n" \
+ "subfe %3, %13, %8\n" \
+ "subfe %2, %12, %7\n" \
+ "subfe %1, %11, %6\n" \
+ "subfe %0, %10, %5\n" \
+ : "=r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B4)), \
+ "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C4)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define ADD7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
+ C6, C5, C4, C3, C2, C1, C0) \
+ __asm__ ("addc %6, %20, %13\n" \
+ "adde %5, %19, %12\n" \
+ "adde %4, %18, %11\n" \
+ "adde %3, %17, %10\n" \
+ "adde %2, %16, %9\n" \
+ "adde %1, %15, %8\n" \
+ "adde %0, %14, %7\n" \
+ : "=r" (A6), \
+ "=&r" (A5), \
+ "=&r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B6)), \
+ "r" ((mpi_limb_t)(B5)), \
+ "r" ((mpi_limb_t)(B4)), \
+ "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C6)), \
+ "r" ((mpi_limb_t)(C5)), \
+ "r" ((mpi_limb_t)(C4)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
+ C6, C5, C4, C3, C2, C1, C0) \
+ __asm__ ("subfc %6, %20, %13\n" \
+ "subfe %5, %19, %12\n" \
+ "subfe %4, %18, %11\n" \
+ "subfe %3, %17, %10\n" \
+ "subfe %2, %16, %9\n" \
+ "subfe %1, %15, %8\n" \
+ "subfe %0, %14, %7\n" \
+ : "=r" (A6), \
+ "=&r" (A5), \
+ "=&r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B6)), \
+ "r" ((mpi_limb_t)(B5)), \
+ "r" ((mpi_limb_t)(B4)), \
+ "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C6)), \
+ "r" ((mpi_limb_t)(C5)), \
+ "r" ((mpi_limb_t)(C4)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#endif /* __powerpc__ */
+
+
+/* s390x/zSeries addition/subtraction helpers. */
+#if defined (__s390x__) && defined(HAVE_CPU_ARCH_S390X) && __GNUC__ >= 4
+
+#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) \
+ __asm__ ("algr %2, %8\n" \
+ "alcgr %1, %7\n" \
+ "alcgr %0, %6\n" \
+ : "=r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B2)), \
+ "1" ((mpi_limb_t)(B1)), \
+ "2" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB3_LIMB64(A3, A2, A1, A0, B2, B1, B0, C2, C1, C0) \
+ __asm__ ("slgr %2, %8\n" \
+ "slbgr %1, %7\n" \
+ "slbgr %0, %6\n" \
+ : "=r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B2)), \
+ "1" ((mpi_limb_t)(B1)), \
+ "2" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+ __asm__ ("algr %3, %11\n" \
+ "alcgr %2, %10\n" \
+ "alcgr %1, %9\n" \
+ "alcgr %0, %8\n" \
+ : "=r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B3)), \
+ "1" ((mpi_limb_t)(B2)), \
+ "2" ((mpi_limb_t)(B1)), \
+ "3" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+ __asm__ ("slgr %3, %11\n" \
+ "slbgr %2, %10\n" \
+ "slbgr %1, %9\n" \
+ "slbgr %0, %8\n" \
+ : "=r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B3)), \
+ "1" ((mpi_limb_t)(B2)), \
+ "2" ((mpi_limb_t)(B1)), \
+ "3" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
+ C4, C3, C2, C1, C0) \
+ __asm__ ("algr %4, %14\n" \
+ "alcgr %3, %13\n" \
+ "alcgr %2, %12\n" \
+ "alcgr %1, %11\n" \
+ "alcgr %0, %10\n" \
+ : "=r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B4)), \
+ "1" ((mpi_limb_t)(B3)), \
+ "2" ((mpi_limb_t)(B2)), \
+ "3" ((mpi_limb_t)(B1)), \
+ "4" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C4)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
+ C4, C3, C2, C1, C0) \
+ __asm__ ("slgr %4, %14\n" \
+ "slbgr %3, %13\n" \
+ "slbgr %2, %12\n" \
+ "slbgr %1, %11\n" \
+ "slbgr %0, %10\n" \
+ : "=r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B4)), \
+ "1" ((mpi_limb_t)(B3)), \
+ "2" ((mpi_limb_t)(B2)), \
+ "3" ((mpi_limb_t)(B1)), \
+ "4" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C4)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define ADD7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
+ C6, C5, C4, C3, C2, C1, C0) \
+ __asm__ ("algr %6, %20\n" \
+ "alcgr %5, %19\n" \
+ "alcgr %4, %18\n" \
+ "alcgr %3, %17\n" \
+ "alcgr %2, %16\n" \
+ "alcgr %1, %15\n" \
+ "alcgr %0, %14\n" \
+ : "=r" (A6), \
+ "=&r" (A5), \
+ "=&r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B6)), \
+ "1" ((mpi_limb_t)(B5)), \
+ "2" ((mpi_limb_t)(B4)), \
+ "3" ((mpi_limb_t)(B3)), \
+ "4" ((mpi_limb_t)(B2)), \
+ "5" ((mpi_limb_t)(B1)), \
+ "6" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C6)), \
+ "r" ((mpi_limb_t)(C5)), \
+ "r" ((mpi_limb_t)(C4)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
+ C6, C5, C4, C3, C2, C1, C0) \
+ __asm__ ("slgr %6, %20\n" \
+ "slbgr %5, %19\n" \
+ "slbgr %4, %18\n" \
+ "slbgr %3, %17\n" \
+ "slbgr %2, %16\n" \
+ "slbgr %1, %15\n" \
+ "slbgr %0, %14\n" \
+ : "=r" (A6), \
+ "=&r" (A5), \
+ "=&r" (A4), \
+ "=&r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "0" ((mpi_limb_t)(B6)), \
+ "1" ((mpi_limb_t)(B5)), \
+ "2" ((mpi_limb_t)(B4)), \
+ "3" ((mpi_limb_t)(B3)), \
+ "4" ((mpi_limb_t)(B2)), \
+ "5" ((mpi_limb_t)(B1)), \
+ "6" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C6)), \
+ "r" ((mpi_limb_t)(C5)), \
+ "r" ((mpi_limb_t)(C4)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#endif /* __x86_64__ */
+
+
+/* Common 64-bit arch addition/subtraction macros. */
+
+#define ADD2_LIMB64(A1, A0, B1, B0, C1, C0) \
+ add_ssaaaa(A1, A0, B1, B0, C1, C0)
+
+#define SUB2_LIMB64(A1, A0, B1, B0, C1, C0) \
+ sub_ddmmss(A1, A0, B1, B0, C1, C0)
+
+#endif /* BYTES_PER_MPI_LIMB == 8 */
+
+
+#if BYTES_PER_MPI_LIMB == 4
+
+/* 64-bit limb definitions for 32-bit architectures. */
+
+#define LIMBS_PER_LIMB64 2
+#define LIMB_FROM64(v) ((v).lo)
+#define HIBIT_LIMB64(v) ((v).hi >> (BITS_PER_MPI_LIMB - 1))
+#define HI32_LIMB64(v) ((v).hi)
+#define LO32_LIMB64(v) ((v).lo)
+#define LOAD32(x, pos) ((x)[pos])
+#define LIMB64_C(hi, lo) { (lo), (hi) }
+
+typedef struct
+{
+ mpi_limb_t lo;
+ mpi_limb_t hi;
+} mpi_limb64_t;
+
+static inline mpi_limb64_t
+LOAD64(const mpi_ptr_t x, unsigned int pos)
+{
+ mpi_limb64_t v;
+ v.lo = x[pos * 2 + 0];
+ v.hi = x[pos * 2 + 1];
+ return v;
+}
+
+static inline void
+STORE64(mpi_ptr_t x, unsigned int pos, mpi_limb64_t v)
+{
+ x[pos * 2 + 0] = v.lo;
+ x[pos * 2 + 1] = v.hi;
+}
+
+static inline void
+STORE64_COND(mpi_ptr_t x, unsigned int pos, mpi_limb_t mask1,
+ mpi_limb64_t val1, mpi_limb_t mask2, mpi_limb64_t val2)
+{
+ x[pos * 2 + 0] = (mask1 & val1.lo) | (mask2 & val2.lo);
+ x[pos * 2 + 1] = (mask1 & val1.hi) | (mask2 & val2.hi);
+}
+
+static inline mpi_limb64_t
+LIMB_TO64(mpi_limb_t x)
+{
+ mpi_limb64_t v;
+ v.lo = x;
+ v.hi = 0;
+ return v;
+}
+
+static inline mpi_limb64_t
+LIMB64_HILO(mpi_limb_t hi, mpi_limb_t lo)
+{
+ mpi_limb64_t v;
+ v.lo = lo;
+ v.hi = hi;
+ return v;
+}
+
+
+/* i386 addition/subtraction helpers. */
+#if defined (__i386__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4
+
+#define ADD4_LIMB32(a3, a2, a1, a0, b3, b2, b1, b0, c3, c2, c1, c0) \
+ __asm__ ("addl %11, %3\n" \
+ "adcl %10, %2\n" \
+ "adcl %9, %1\n" \
+ "adcl %8, %0\n" \
+ : "=r" (a3), \
+ "=&r" (a2), \
+ "=&r" (a1), \
+ "=&r" (a0) \
+ : "0" ((mpi_limb_t)(b3)), \
+ "1" ((mpi_limb_t)(b2)), \
+ "2" ((mpi_limb_t)(b1)), \
+ "3" ((mpi_limb_t)(b0)), \
+ "g" ((mpi_limb_t)(c3)), \
+ "g" ((mpi_limb_t)(c2)), \
+ "g" ((mpi_limb_t)(c1)), \
+ "g" ((mpi_limb_t)(c0)) \
+ : "cc")
+
+#define SUB4_LIMB32(a3, a2, a1, a0, b3, b2, b1, b0, c3, c2, c1, c0) \
+ __asm__ ("subl %11, %3\n" \
+ "sbbl %10, %2\n" \
+ "sbbl %9, %1\n" \
+ "sbbl %8, %0\n" \
+ : "=r" (a3), \
+ "=&r" (a2), \
+ "=&r" (a1), \
+ "=&r" (a0) \
+ : "0" ((mpi_limb_t)(b3)), \
+ "1" ((mpi_limb_t)(b2)), \
+ "2" ((mpi_limb_t)(b1)), \
+ "3" ((mpi_limb_t)(b0)), \
+ "g" ((mpi_limb_t)(c3)), \
+ "g" ((mpi_limb_t)(c2)), \
+ "g" ((mpi_limb_t)(c1)), \
+ "g" ((mpi_limb_t)(c0)) \
+ : "cc")
+
+#endif /* __i386__ */
+
+
+/* ARM addition/subtraction helpers. */
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+#define ADD4_LIMB32(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+ __asm__ ("adds %3, %7, %11\n" \
+ "adcs %2, %6, %10\n" \
+ "adcs %1, %5, %9\n" \
+ "adc %0, %4, %8\n" \
+ : "=r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#define SUB4_LIMB32(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) \
+ __asm__ ("subs %3, %7, %11\n" \
+ "sbcs %2, %6, %10\n" \
+ "sbcs %1, %5, %9\n" \
+ "sbc %0, %4, %8\n" \
+ : "=r" (A3), \
+ "=&r" (A2), \
+ "=&r" (A1), \
+ "=&r" (A0) \
+ : "r" ((mpi_limb_t)(B3)), \
+ "r" ((mpi_limb_t)(B2)), \
+ "r" ((mpi_limb_t)(B1)), \
+ "r" ((mpi_limb_t)(B0)), \
+ "r" ((mpi_limb_t)(C3)), \
+ "r" ((mpi_limb_t)(C2)), \
+ "r" ((mpi_limb_t)(C1)), \
+ "r" ((mpi_limb_t)(C0)) \
+ : "cc")
+
+#endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */
+
+
+/* Common 32-bit arch addition/subtraction macros. */
+
+#if defined(ADD4_LIMB32)
+/* A[0..1] = B[0..1] + C[0..1] */
+#define ADD2_LIMB64(A1, A0, B1, B0, C1, C0) \
+ ADD4_LIMB32(A1.hi, A1.lo, A0.hi, A0.lo, \
+ B1.hi, B1.lo, B0.hi, B0.lo, \
+ C1.hi, C1.lo, C0.hi, C0.lo)
+#else
+/* A[0..1] = B[0..1] + C[0..1] */
+#define ADD2_LIMB64(A1, A0, B1, B0, C1, C0) do { \
+ mpi_limb_t __carry2_0, __carry2_1; \
+ add_ssaaaa(__carry2_0, A0.lo, 0, B0.lo, 0, C0.lo); \
+ add_ssaaaa(__carry2_1, A0.hi, 0, B0.hi, 0, C0.hi); \
+ add_ssaaaa(__carry2_1, A0.hi, __carry2_1, A0.hi, 0, __carry2_0); \
+ add_ssaaaa(A1.hi, A1.lo, B1.hi, B1.lo, C1.hi, C1.lo); \
+ add_ssaaaa(A1.hi, A1.lo, A1.hi, A1.lo, 0, __carry2_1); \
+ } while (0)
+#endif
+
+#if defined(SUB4_LIMB32)
+/* A[0..1] = B[0..1] - C[0..1] */
+#define SUB2_LIMB64(A1, A0, B1, B0, C1, C0) \
+ SUB4_LIMB32(A1.hi, A1.lo, A0.hi, A0.lo, \
+ B1.hi, B1.lo, B0.hi, B0.lo, \
+ C1.hi, C1.lo, C0.hi, C0.lo)
+#else
+/* A[0..1] = B[0..1] - C[0..1] */
+#define SUB2_LIMB64(A1, A0, B1, B0, C1, C0) do { \
+ mpi_limb_t __borrow2_0, __borrow2_1; \
+ sub_ddmmss(__borrow2_0, A0.lo, 0, B0.lo, 0, C0.lo); \
+ sub_ddmmss(__borrow2_1, A0.hi, 0, B0.hi, 0, C0.hi); \
+ sub_ddmmss(__borrow2_1, A0.hi, __borrow2_1, A0.hi, 0, -__borrow2_0); \
+ sub_ddmmss(A1.hi, A1.lo, B1.hi, B1.lo, C1.hi, C1.lo); \
+ sub_ddmmss(A1.hi, A1.lo, A1.hi, A1.lo, 0, -__borrow2_1); \
+ } while (0)
+#endif
+
+#endif /* BYTES_PER_MPI_LIMB == 4 */
+
+
+/* Common definitions. */
+#define BITS_PER_MPI_LIMB64 (BITS_PER_MPI_LIMB * LIMBS_PER_LIMB64)
+#define BYTES_PER_MPI_LIMB64 (BYTES_PER_MPI_LIMB * LIMBS_PER_LIMB64)
+
+
+/* Common addition/subtraction macros. */
+
+#ifndef ADD3_LIMB64
+/* A[0..2] = B[0..2] + C[0..2] */
+#define ADD3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) do { \
+ mpi_limb64_t __carry3; \
+ ADD2_LIMB64(__carry3, A0, zero, B0, zero, C0); \
+ ADD2_LIMB64(A2, A1, B2, B1, C2, C1); \
+ ADD2_LIMB64(A2, A1, A2, A1, zero, __carry3); \
+ } while (0)
+#endif
+
+#ifndef ADD4_LIMB64
+/* A[0..3] = B[0..3] + C[0..3] */
+#define ADD4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) do { \
+ mpi_limb64_t __carry4; \
+ ADD3_LIMB64(__carry4, A1, A0, zero, B1, B0, zero, C1, C0); \
+ ADD2_LIMB64(A3, A2, B3, B2, C3, C2); \
+ ADD2_LIMB64(A3, A2, A3, A2, zero, __carry4); \
+ } while (0)
+#endif
+
+#ifndef ADD5_LIMB64
+/* A[0..4] = B[0..4] + C[0..4] */
+#define ADD5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
+ C4, C3, C2, C1, C0) do { \
+ mpi_limb64_t __carry5; \
+ ADD4_LIMB64(__carry5, A2, A1, A0, zero, B2, B1, B0, zero, C2, C1, C0); \
+ ADD2_LIMB64(A4, A3, B4, B3, C4, C3); \
+ ADD2_LIMB64(A4, A3, A4, A3, zero, __carry5); \
+ } while (0)
+#endif
+
+#ifndef ADD6_LIMB64
+/* A[0..5] = B[0..5] + C[0..5] */
+#define ADD6_LIMB64(A5, A4, A3, A2, A1, A0, B5, B4, B3, B2, B1, B0, \
+ C5, C4, C3, C2, C1, C0) do { \
+ mpi_limb64_t __carry6; \
+ ADD5_LIMB64(__carry6, A3, A2, A1, A0, zero, B3, B2, B1, B0, \
+ zero, C3, C2, C1, C0); \
+ ADD2_LIMB64(A5, A4, B5, B4, C5, C4); \
+ ADD2_LIMB64(A5, A4, A5, A4, zero, __carry6); \
+ } while (0)
+#endif
+
+#ifndef ADD7_LIMB64
+/* A[0..6] = B[0..6] + C[0..6] */
+#define ADD7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
+ C6, C5, C4, C3, C2, C1, C0) do { \
+ mpi_limb64_t __carry7; \
+ ADD6_LIMB64(__carry7, A4, A3, A2, A1, A0, zero, B4, B3, B2, B1, B0, \
+ zero, C4, C3, C2, C1, C0); \
+ ADD2_LIMB64(A6, A5, B6, B5, C6, C5); \
+ ADD2_LIMB64(A6, A5, A6, A5, zero, __carry7); \
+ } while (0)
+#endif
+
+#ifndef SUB3_LIMB64
+/* A[0..2] = B[0..2] - C[0..2] */
+#define SUB3_LIMB64(A2, A1, A0, B2, B1, B0, C2, C1, C0) do { \
+ mpi_limb64_t __borrow3; \
+ SUB2_LIMB64(__borrow3, A0, zero, B0, zero, C0); \
+ SUB2_LIMB64(A2, A1, B2, B1, C2, C1); \
+ SUB2_LIMB64(A2, A1, A2, A1, zero, LIMB_TO64(-LIMB_FROM64(__borrow3))); \
+ } while (0)
+#endif
+
+#ifndef SUB4_LIMB64
+/* A[0..3] = B[0..3] - C[0..3] */
+#define SUB4_LIMB64(A3, A2, A1, A0, B3, B2, B1, B0, C3, C2, C1, C0) do { \
+ mpi_limb64_t __borrow4; \
+ SUB3_LIMB64(__borrow4, A1, A0, zero, B1, B0, zero, C1, C0); \
+ SUB2_LIMB64(A3, A2, B3, B2, C3, C2); \
+ SUB2_LIMB64(A3, A2, A3, A2, zero, LIMB_TO64(-LIMB_FROM64(__borrow4))); \
+ } while (0)
+#endif
+
+#ifndef SUB5_LIMB64
+/* A[0..4] = B[0..4] - C[0..4] */
+#define SUB5_LIMB64(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0, \
+ C4, C3, C2, C1, C0) do { \
+ mpi_limb64_t __borrow5; \
+ SUB4_LIMB64(__borrow5, A2, A1, A0, zero, B2, B1, B0, zero, C2, C1, C0); \
+ SUB2_LIMB64(A4, A3, B4, B3, C4, C3); \
+ SUB2_LIMB64(A4, A3, A4, A3, zero, LIMB_TO64(-LIMB_FROM64(__borrow5))); \
+ } while (0)
+#endif
+
+#ifndef SUB6_LIMB64
+/* A[0..5] = B[0..5] - C[0..5] */
+#define SUB6_LIMB64(A5, A4, A3, A2, A1, A0, B5, B4, B3, B2, B1, B0, \
+ C5, C4, C3, C2, C1, C0) do { \
+ mpi_limb64_t __borrow6; \
+ SUB5_LIMB64(__borrow6, A3, A2, A1, A0, zero, B3, B2, B1, B0, \
+ zero, C3, C2, C1, C0); \
+ SUB2_LIMB64(A5, A4, B5, B4, C5, C4); \
+ SUB2_LIMB64(A5, A4, A5, A4, zero, LIMB_TO64(-LIMB_FROM64(__borrow6))); \
+ } while (0)
+#endif
+
+#ifndef SUB7_LIMB64
+/* A[0..6] = B[0..6] - C[0..6] */
+#define SUB7_LIMB64(A6, A5, A4, A3, A2, A1, A0, B6, B5, B4, B3, B2, B1, B0, \
+ C6, C5, C4, C3, C2, C1, C0) do { \
+ mpi_limb64_t __borrow7; \
+ SUB6_LIMB64(__borrow7, A4, A3, A2, A1, A0, zero, B4, B3, B2, B1, B0, \
+ zero, C4, C3, C2, C1, C0); \
+ SUB2_LIMB64(A6, A5, B6, B5, C6, C5); \
+ SUB2_LIMB64(A6, A5, A6, A5, zero, LIMB_TO64(-LIMB_FROM64(__borrow7))); \
+ } while (0)
+#endif
+
+
+/* Helper functions. */
+
+static inline int
+mpi_nbits_more_than (gcry_mpi_t w, unsigned int nbits)
+{
+ unsigned int nbits_nlimbs;
+ mpi_limb_t wlimb;
+ unsigned int n;
+
+ nbits_nlimbs = (nbits + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB;
+
+ /* Note: Assumes that 'w' is normalized. */
+
+ if (w->nlimbs > nbits_nlimbs)
+ return 1;
+ if (w->nlimbs < nbits_nlimbs)
+ return 0;
+ if ((nbits % BITS_PER_MPI_LIMB) == 0)
+ return 0;
+
+ wlimb = w->d[nbits_nlimbs - 1];
+ if (wlimb == 0)
+ log_bug ("mpi_nbits_more_than: input mpi not normalized\n");
+
+ count_leading_zeros (n, wlimb);
+
+ return (BITS_PER_MPI_LIMB - n) > (nbits % BITS_PER_MPI_LIMB);
+}
+
+#endif /* GCRY_EC_INLINE_H */
diff --git a/mpi/ec-internal.h b/mpi/ec-internal.h
index 759335aa..2296d55d 100644
--- a/mpi/ec-internal.h
+++ b/mpi/ec-internal.h
@@ -20,6 +20,22 @@
#ifndef GCRY_EC_INTERNAL_H
#define GCRY_EC_INTERNAL_H

+#include <config.h>
+
void _gcry_mpi_ec_ed25519_mod (gcry_mpi_t a);

+#ifndef ASM_DISABLED
+void _gcry_mpi_ec_nist192_mod (gcry_mpi_t w, mpi_ec_t ctx);
+void _gcry_mpi_ec_nist224_mod (gcry_mpi_t w, mpi_ec_t ctx);
+void _gcry_mpi_ec_nist256_mod (gcry_mpi_t w, mpi_ec_t ctx);
+void _gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx);
+void _gcry_mpi_ec_nist521_mod (gcry_mpi_t w, mpi_ec_t ctx);
+#else
+# define _gcry_mpi_ec_nist192_mod NULL
+# define _gcry_mpi_ec_nist224_mod NULL
+# define _gcry_mpi_ec_nist256_mod NULL
+# define _gcry_mpi_ec_nist384_mod NULL
+# define _gcry_mpi_ec_nist521_mod NULL
+#endif
+
#endif /*GCRY_EC_INTERNAL_H*/
diff --git a/mpi/ec-nist.c b/mpi/ec-nist.c
new file mode 100644
index 00000000..1a8281be
--- /dev/null
+++ b/mpi/ec-nist.c
@@ -0,0 +1,787 @@
+/* ec-nist.c - NIST optimized elliptic curve functions
+ * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+
+
+#ifndef ASM_DISABLED
+
+
+#include "mpi-internal.h"
+#include "longlong.h"
+#include "g10lib.h"
+#include "context.h"
+#include "ec-context.h"
+#include "ec-inline.h"
+
+
+/* These variables are used to generate masks from conditional operation
+ * flag parameters. Use of volatile prevents compiler optimizations from
+ * converting AND-masking to conditional branches. */
+static volatile mpi_limb_t vzero = 0;
+static volatile mpi_limb_t vone = 1;
+
+
+static inline
+void prefetch(const void *tab, size_t len)
+{
+ const volatile byte *vtab = tab;
+
+ if (len > 0 * 64)
+ (void)vtab[0 * 64];
+ if (len > 1 * 64)
+ (void)vtab[1 * 64];
+ if (len > 2 * 64)
+ (void)vtab[2 * 64];
+ if (len > 3 * 64)
+ (void)vtab[3 * 64];
+ if (len > 4 * 64)
+ (void)vtab[4 * 64];
+ if (len > 5 * 64)
+ (void)vtab[5 * 64];
+ if (len > 6 * 64)
+ (void)vtab[6 * 64];
+ if (len > 7 * 64)
+ (void)vtab[7 * 64];
+ if (len > 8 * 64)
+ (void)vtab[8 * 64];
+ if (len > 9 * 64)
+ (void)vtab[9 * 64];
+ if (len > 10 * 64)
+ (void)vtab[10 * 64];
+ (void)vtab[len - 1];
+}
+
+
+/* Fast reduction routines for NIST curves. */
+
+void
+_gcry_mpi_ec_nist192_mod (gcry_mpi_t w, mpi_ec_t ctx)
+{
+ static const mpi_limb64_t p_mult[3][4] =
+ {
+ { /* P * 1 */
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xfffffffeU),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000000U)
+ },
+ { /* P * 2 */
+ LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0xffffffffU, 0xfffffffdU),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000001U)
+ },
+ { /* P * 3 */
+ LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0xffffffffU, 0xfffffffcU),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000002U)
+ }
+ };
+ const mpi_limb64_t zero = LIMB_TO64(0);
+ mpi_ptr_t wp;
+ mpi_ptr_t pp;
+ mpi_size_t wsize = 192 / BITS_PER_MPI_LIMB64;
+ mpi_limb64_t s[wsize + 1];
+ mpi_limb64_t o[wsize + 1];
+ mpi_limb_t mask1;
+ mpi_limb_t mask2;
+ int carry;
+
+ MPN_NORMALIZE (w->d, w->nlimbs);
+ if (mpi_nbits_more_than (w, 2 * 192))
+ log_bug ("W must be less than m^2\n");
+
+ RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
+ RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
+
+ pp = ctx->p->d;
+ wp = w->d;
+
+ prefetch (p_mult, sizeof(p_mult));
+
+ /* See "FIPS 186-4, D.2.1 Curve P-192". */
+
+ s[0] = LOAD64(wp, 3);
+ ADD3_LIMB64 (s[3], s[2], s[1],
+ zero, zero, LOAD64(wp, 3),
+ zero, LOAD64(wp, 4), LOAD64(wp, 4));
+
+ ADD4_LIMB64 (s[3], s[2], s[1], s[0],
+ s[3], s[2], s[1], s[0],
+ zero, LOAD64(wp, 5), LOAD64(wp, 5), LOAD64(wp, 5));
+
+ ADD4_LIMB64 (s[3], s[2], s[1], s[0],
+ s[3], s[2], s[1], s[0],
+ zero, LOAD64(wp, 2), LOAD64(wp, 1), LOAD64(wp, 0));
+
+ /* mod p:
+ * 's[3]' holds carry value (0..2). Subtract (carry + 1) * p. Result will be
+ * with in range -p...p. Handle result being negative with addition and
+ * conditional store. */
+
+ carry = LO32_LIMB64(s[3]);
+
+ SUB4_LIMB64 (s[3], s[2], s[1], s[0],
+ s[3], s[2], s[1], s[0],
+ p_mult[carry][3], p_mult[carry][2],
+ p_mult[carry][1], p_mult[carry][0]);
+
+ ADD4_LIMB64 (o[3], o[2], o[1], o[0],
+ s[3], s[2], s[1], s[0],
+ zero, LOAD64(pp, 2), LOAD64(pp, 1), LOAD64(pp, 0));
+ mask1 = vzero - (LO32_LIMB64(o[3]) >> 31);
+ mask2 = (LO32_LIMB64(o[3]) >> 31) - vone;
+
+ STORE64_COND(wp, 0, mask2, o[0], mask1, s[0]);
+ STORE64_COND(wp, 1, mask2, o[1], mask1, s[1]);
+ STORE64_COND(wp, 2, mask2, o[2], mask1, s[2]);
+
+ w->nlimbs = 192 / BITS_PER_MPI_LIMB;
+ MPN_NORMALIZE (wp, w->nlimbs);
+}
+
+void
+_gcry_mpi_ec_nist224_mod (gcry_mpi_t w, mpi_ec_t ctx)
+{
+ static const mpi_limb64_t p_mult[5][4] =
+ {
+ { /* P * -1 */
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xffffffffU, 0x00000000U)
+ },
+ { /* P * 0 */
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U)
+ },
+ { /* P * 1 */
+ LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0xffffffffU, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU)
+ },
+ { /* P * 2 */
+ LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0xfffffffeU, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000001U, 0xffffffffU)
+ },
+ { /* P * 3 */
+ LIMB64_C(0x00000000U, 0x00000003U), LIMB64_C(0xfffffffdU, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000002U, 0xffffffffU)
+ }
+ };
+ const mpi_limb64_t zero = LIMB_TO64(0);
+ mpi_ptr_t wp;
+ mpi_ptr_t pp;
+ mpi_size_t wsize = (224 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64;
+ mpi_size_t psize = ctx->p->nlimbs;
+ mpi_limb64_t s[wsize];
+ mpi_limb64_t d[wsize];
+ mpi_limb_t mask1;
+ mpi_limb_t mask2;
+ int carry;
+
+ MPN_NORMALIZE (w->d, w->nlimbs);
+ if (mpi_nbits_more_than (w, 2 * 224))
+ log_bug ("W must be less than m^2\n");
+
+ RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
+ RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
+ ctx->p->nlimbs = psize;
+
+ pp = ctx->p->d;
+ wp = w->d;
+
+ prefetch (p_mult, sizeof(p_mult));
+
+ /* See "FIPS 186-4, D.2.2 Curve P-224". */
+
+ /* "S1 + S2" with 64-bit limbs:
+ * [0:A10]:[ A9: A8]:[ A7:0]:[0:0]
+ * + [0:0]:[A13:A12]:[A11:0]:[0:0]
+ * => s[3]:s[2]:s[1]:s[0]
+ */
+ s[0] = zero;
+ ADD3_LIMB64 (s[3], s[2], s[1],
+ LIMB64_HILO(0, LOAD32(wp, 10)),
+ LOAD64(wp, 8 / 2),
+ LIMB64_HILO(LOAD32(wp, 7), 0),
+ zero,
+ LOAD64(wp, 12 / 2),
+ LIMB64_HILO(LOAD32(wp, 11), 0));
+
+ /* "T + S1 + S2" */
+ ADD4_LIMB64 (s[3], s[2], s[1], s[0],
+ s[3], s[2], s[1], s[0],
+ LIMB64_HILO(0, LOAD32(wp, 6)),
+ LOAD64(wp, 4 / 2),
+ LOAD64(wp, 2 / 2),
+ LOAD64(wp, 0 / 2));
+
+ /* "D1 + D2" with 64-bit limbs:
+ * [0:A13]:[A12:A11]:[A10: A9]:[ A8: A7]
+ * + [0:0]:[ 0: 0]:[ 0:A13]:[A12:A11]
+ * => d[3]:d[2]:d[1]:d[0]
+ */
+ ADD4_LIMB64 (d[3], d[2], d[1], d[0],
+ LIMB64_HILO(0, LOAD32(wp, 13)),
+ LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 11)),
+ LIMB64_HILO(LOAD32(wp, 10), LOAD32(wp, 9)),
+ LIMB64_HILO(LOAD32(wp, 8), LOAD32(wp, 7)),
+ zero,
+ zero,
+ LIMB64_HILO(0, LOAD32(wp, 13)),
+ LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 11)));
+
+ /* "T + S1 + S2 - D1 - D2" */
+ SUB4_LIMB64 (s[3], s[2], s[1], s[0],
+ s[3], s[2], s[1], s[0],
+ d[3], d[2], d[1], d[0]);
+
+ /* mod p:
+ * Upper 32-bits of 's[3]' holds carry value (-2..2).
+ * Subtract (carry + 1) * p. Result will be with in range -p...p.
+ * Handle result being negative with addition and conditional store. */
+
+ carry = HI32_LIMB64(s[3]);
+
+ SUB4_LIMB64 (s[3], s[2], s[1], s[0],
+ s[3], s[2], s[1], s[0],
+ p_mult[carry + 2][3], p_mult[carry + 2][2],
+ p_mult[carry + 2][1], p_mult[carry + 2][0]);
+
+ ADD4_LIMB64 (d[3], d[2], d[1], d[0],
+ s[3], s[2], s[1], s[0],
+ LOAD64(pp, 3), LOAD64(pp, 2), LOAD64(pp, 1), LOAD64(pp, 0));
+
+ mask1 = vzero - (HI32_LIMB64(d[3]) >> 31);
+ mask2 = (HI32_LIMB64(d[3]) >> 31) - vone;
+
+ STORE64_COND(wp, 0, mask2, d[0], mask1, s[0]);
+ STORE64_COND(wp, 1, mask2, d[1], mask1, s[1]);
+ STORE64_COND(wp, 2, mask2, d[2], mask1, s[2]);
+ STORE64_COND(wp, 3, mask2, d[3], mask1, s[3]);
+
+ w->nlimbs = wsize * LIMBS_PER_LIMB64;
+ MPN_NORMALIZE (wp, w->nlimbs);
+}
+
+void
+_gcry_mpi_ec_nist256_mod (gcry_mpi_t w, mpi_ec_t ctx)
+{
+ static const mpi_limb64_t p_mult[11][5] =
+ {
+ { /* P * -3 */
+ LIMB64_C(0x00000000U, 0x00000003U), LIMB64_C(0xfffffffdU, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000002U, 0xfffffffcU),
+ LIMB64_C(0xffffffffU, 0xfffffffdU)
+ },
+ { /* P * -2 */
+ LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0xfffffffeU, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000001U, 0xfffffffdU),
+ LIMB64_C(0xffffffffU, 0xfffffffeU)
+ },
+ { /* P * -1 */
+ LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0xffffffffU, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xfffffffeU),
+ LIMB64_C(0xffffffffU, 0xffffffffU)
+ },
+ { /* P * 0 */
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
+ LIMB64_C(0x00000000U, 0x00000000U)
+ },
+ { /* P * 1 */
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xffffffffU, 0x00000001U),
+ LIMB64_C(0x00000000U, 0x00000000U)
+ },
+ { /* P * 2 */
+ LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0x00000001U, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffeU, 0x00000002U),
+ LIMB64_C(0x00000000U, 0x00000001U)
+ },
+ { /* P * 3 */
+ LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0x00000002U, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffdU, 0x00000003U),
+ LIMB64_C(0x00000000U, 0x00000002U)
+ },
+ { /* P * 4 */
+ LIMB64_C(0xffffffffU, 0xfffffffcU), LIMB64_C(0x00000003U, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffcU, 0x00000004U),
+ LIMB64_C(0x00000000U, 0x00000003U)
+ },
+ { /* P * 5 */
+ LIMB64_C(0xffffffffU, 0xfffffffbU), LIMB64_C(0x00000004U, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffbU, 0x00000005U),
+ LIMB64_C(0x00000000U, 0x00000004U)
+ },
+ { /* P * 6 */
+ LIMB64_C(0xffffffffU, 0xfffffffaU), LIMB64_C(0x00000005U, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffaU, 0x00000006U),
+ LIMB64_C(0x00000000U, 0x00000005U)
+ },
+ { /* P * 7 */
+ LIMB64_C(0xffffffffU, 0xfffffff9U), LIMB64_C(0x00000006U, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffff9U, 0x00000007U),
+ LIMB64_C(0x00000000U, 0x00000006U)
+ }
+ };
+ const mpi_limb64_t zero = LIMB_TO64(0);
+ mpi_ptr_t wp;
+ mpi_ptr_t pp;
+ mpi_size_t wsize = (256 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64;
+ mpi_size_t psize = ctx->p->nlimbs;
+ mpi_limb64_t s[wsize + 1];
+ mpi_limb64_t t[wsize + 1];
+ mpi_limb64_t d[wsize + 1];
+ mpi_limb_t mask1;
+ mpi_limb_t mask2;
+ int carry;
+
+ MPN_NORMALIZE (w->d, w->nlimbs);
+ if (mpi_nbits_more_than (w, 2 * 256))
+ log_bug ("W must be less than m^2\n");
+
+ RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
+ RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
+ ctx->p->nlimbs = psize;
+
+ pp = ctx->p->d;
+ wp = w->d;
+
+ prefetch (p_mult, sizeof(p_mult));
+
+ /* See "FIPS 186-4, D.2.3 Curve P-256". */
+
+ /* "S1 + S2" with 64-bit limbs:
+ * [A15:A14]:[A13:A12]:[A11:0]:[0:0]
+ * + [0:A15]:[A14:A13]:[A12:0]:[0:0]
+ * => s[4]:s[3]:s[2]:s[1]:s[0]
+ */
+ s[0] = zero;
+ ADD4_LIMB64 (s[4], s[3], s[2], s[1],
+ zero,
+ LOAD64(wp, 14 / 2),
+ LOAD64(wp, 12 / 2),
+ LIMB64_HILO(LOAD32(wp, 11), 0),
+ zero,
+ LIMB64_HILO(0, LOAD32(wp, 15)),
+ LIMB64_HILO(LOAD32(wp, 14), LOAD32(wp, 13)),
+ LIMB64_HILO(LOAD32(wp, 12), 0));
+
+ /* "S3 + S4" with 64-bit limbs:
+ * [A15:A14]:[ 0: 0]:[ 0:A10]:[ A9:A8]
+ * + [A8:A13]:[A15:A14]:[A13:A11]:[A10:A9]
+ * => t[4]:t[3]:t[2]:t[1]:t[0]
+ */
+ ADD5_LIMB64 (t[4], t[3], t[2], t[1], t[0],
+ zero,
+ LOAD64(wp, 14 / 2),
+ zero,
+ LIMB64_HILO(0, LOAD32(wp, 10)),
+ LOAD64(wp, 8 / 2),
+ zero,
+ LIMB64_HILO(LOAD32(wp, 8), LOAD32(wp, 13)),
+ LOAD64(wp, 14 / 2),
+ LIMB64_HILO(LOAD32(wp, 13), LOAD32(wp, 11)),
+ LIMB64_HILO(LOAD32(wp, 10), LOAD32(wp, 9)));
+
+ /* "2*S1 + 2*S2" */
+ ADD5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
+ s[4], s[3], s[2], s[1], s[0],
+ s[4], s[3], s[2], s[1], s[0]);
+
+ /* "D1 + D2" with 64-bit limbs:
+ * [0:A13]:[A12:A11] + [A15:A14]:[A13:A12] => d[2]:d[1]:d[0]
+ * [A10:A8] + [A11:A9] => d[4]:d[3]
+ */
+ ADD3_LIMB64 (d[2], d[1], d[0],
+ zero,
+ LIMB64_HILO(0, LOAD32(wp, 13)),
+ LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 11)),
+ zero,
+ LOAD64(wp, 14 / 2),
+ LOAD64(wp, 12 / 2));
+ ADD2_LIMB64 (d[4], d[3],
+ zero, LIMB64_HILO(LOAD32(wp, 10), LOAD32(wp, 8)),
+ zero, LIMB64_HILO(LOAD32(wp, 11), LOAD32(wp, 9)));
+
+ /* "T + S3 + S4" */
+ ADD5_LIMB64 (t[4], t[3], t[2], t[1], t[0],
+ t[4], t[3], t[2], t[1], t[0],
+ zero,
+ LOAD64(wp, 6 / 2),
+ LOAD64(wp, 4 / 2),
+ LOAD64(wp, 2 / 2),
+ LOAD64(wp, 0 / 2));
+
+ /* "2*S1 + 2*S2 - D3" with 64-bit limbs:
+ * s[4]: s[3]: s[2]: s[1]: s[0]
+ * - [A12:0]:[A10:A9]:[A8:A15]:[A14:A13]
+ * => s[4]:s[3]:s[2]:s[1]:s[0]
+ */
+ SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
+ s[4], s[3], s[2], s[1], s[0],
+ zero,
+ LIMB64_HILO(LOAD32(wp, 12), 0),
+ LIMB64_HILO(LOAD32(wp, 10), LOAD32(wp, 9)),
+ LIMB64_HILO(LOAD32(wp, 8), LOAD32(wp, 15)),
+ LIMB64_HILO(LOAD32(wp, 14), LOAD32(wp, 13)));
+
+ /* "D1 + D2 + D4" with 64-bit limbs:
+ * d[4]: d[3]: d[2]: d[1]: d[0]
+ * - [A13:0]:[A11:A10]:[A9:0]:[A15:A14]
+ * => d[4]:d[3]:d[2]:d[1]:d[0]
+ */
+ ADD5_LIMB64 (d[4], d[3], d[2], d[1], d[0],
+ d[4], d[3], d[2], d[1], d[0],
+ zero,
+ LIMB64_HILO(LOAD32(wp, 13), 0),
+ LOAD64(wp, 10 / 2),
+ LIMB64_HILO(LOAD32(wp, 9), 0),
+ LOAD64(wp, 14 / 2));
+
+ /* "T + 2*S1 + 2*S2 + S3 + S4 - D3" */
+ ADD5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
+ s[4], s[3], s[2], s[1], s[0],
+ t[4], t[3], t[2], t[1], t[0]);
+
+ /* "T + 2*S1 + 2*S2 + S3 + S4 - D1 - D2 - D3 - D4" */
+ SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
+ s[4], s[3], s[2], s[1], s[0],
+ d[4], d[3], d[2], d[1], d[0]);
+
+ /* mod p:
+ * 's[4]' holds carry value (-4..6). Subtract (carry + 1) * p. Result
+ * will be with in range -p...p. Handle result being negative with
+ * addition and conditional store. */
+
+ carry = LO32_LIMB64(s[4]);
+
+ SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
+ s[4], s[3], s[2], s[1], s[0],
+ p_mult[carry + 4][4], p_mult[carry + 4][3],
+ p_mult[carry + 4][2], p_mult[carry + 4][1],
+ p_mult[carry + 4][0]);
+
+ ADD5_LIMB64 (d[4], d[3], d[2], d[1], d[0],
+ s[4], s[3], s[2], s[1], s[0],
+ zero,
+ LOAD64(pp, 3), LOAD64(pp, 2), LOAD64(pp, 1), LOAD64(pp, 0));
+
+ mask1 = vzero - (LO32_LIMB64(d[4]) >> 31);
+ mask2 = (LO32_LIMB64(d[4]) >> 31) - vone;
+
+ STORE64_COND(wp, 0, mask2, d[0], mask1, s[0]);
+ STORE64_COND(wp, 1, mask2, d[1], mask1, s[1]);
+ STORE64_COND(wp, 2, mask2, d[2], mask1, s[2]);
+ STORE64_COND(wp, 3, mask2, d[3], mask1, s[3]);
+
+ w->nlimbs = wsize * LIMBS_PER_LIMB64;
+ MPN_NORMALIZE (wp, w->nlimbs);
+}
+
+void
+_gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx)
+{
+ static const mpi_limb64_t p_mult[11][7] =
+ {
+ { /* P * -2 */
+ LIMB64_C(0xfffffffeU, 0x00000002U), LIMB64_C(0x00000001U, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0x00000000U, 0x00000000U),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xfffffffeU)
+ },
+ { /* P * -1 */
+ LIMB64_C(0xffffffffU, 0x00000001U), LIMB64_C(0x00000000U, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0x00000000U, 0x00000000U),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xffffffffU)
+ },
+ { /* P * 0 */
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
+ LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
+ LIMB64_C(0x00000000U, 0x00000000U)
+ },
+ { /* P * 1 */
+ LIMB64_C(0x00000000U, 0xffffffffU), LIMB64_C(0xffffffffU, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000000U)
+ },
+ { /* P * 2 */
+ LIMB64_C(0x00000001U, 0xfffffffeU), LIMB64_C(0xfffffffeU, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000001U)
+ },
+ { /* P * 3 */
+ LIMB64_C(0x00000002U, 0xfffffffdU), LIMB64_C(0xfffffffdU, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xfffffffcU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000002U)
+ },
+ { /* P * 4 */
+ LIMB64_C(0x00000003U, 0xfffffffcU), LIMB64_C(0xfffffffcU, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xfffffffbU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000003U)
+ },
+ { /* P * 5 */
+ LIMB64_C(0x00000004U, 0xfffffffbU), LIMB64_C(0xfffffffbU, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xfffffffaU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000004U)
+ },
+ { /* P * 6 */
+ LIMB64_C(0x00000005U, 0xfffffffaU), LIMB64_C(0xfffffffaU, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xfffffff9U), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000005U)
+ },
+ { /* P * 7 */
+ LIMB64_C(0x00000006U, 0xfffffff9U), LIMB64_C(0xfffffff9U, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xfffffff8U), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000006U)
+ },
+ { /* P * 8 */
+ LIMB64_C(0x00000007U, 0xfffffff8U), LIMB64_C(0xfffffff8U, 0x00000000U),
+ LIMB64_C(0xffffffffU, 0xfffffff7U), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
+ LIMB64_C(0x00000000U, 0x00000007U)
+ },
+ };
+ const mpi_limb64_t zero = LIMB_TO64(0);
+ mpi_ptr_t wp;
+ mpi_ptr_t pp;
+ mpi_size_t wsize = (384 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64;
+ mpi_size_t psize = ctx->p->nlimbs;
+#if BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB
+ mpi_limb_t wp_shr32[wsize * LIMBS_PER_LIMB64];
+#endif
+ mpi_limb64_t s[wsize + 1];
+ mpi_limb64_t t[wsize + 1];
+ mpi_limb64_t d[wsize + 1];
+ mpi_limb64_t x[wsize + 1];
+ mpi_limb_t mask1;
+ mpi_limb_t mask2;
+ int carry;
+
+ MPN_NORMALIZE (w->d, w->nlimbs);
+ if (mpi_nbits_more_than (w, 2 * 384))
+ log_bug ("W must be less than m^2\n");
+
+ RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
+ RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
+ ctx->p->nlimbs = psize;
+
+ pp = ctx->p->d;
+ wp = w->d;
+
+ prefetch (p_mult, sizeof(p_mult));
+
+ /* See "FIPS 186-4, D.2.4 Curve P-384". */
+
+#if BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB
+# define LOAD64_SHR32(idx) LOAD64(wp_shr32, ((idx) / 2 - wsize))
+ _gcry_mpih_rshift (wp_shr32, wp + 384 / BITS_PER_MPI_LIMB,
+ wsize * LIMBS_PER_LIMB64, 32);
+#else
+# define LOAD64_SHR32(idx) LIMB64_HILO(LOAD32(wp, (idx) + 1), LOAD32(wp, idx))
+#endif
+
+ /* "S1 + S1" with 64-bit limbs:
+ * [0:A23]:[A22:A21]
+ * + [0:A23]:[A22:A21]
+ * => s[3]:s[2]
+ */
+ ADD2_LIMB64 (s[3], s[2],
+ LIMB64_HILO(0, LOAD32(wp, 23)),
+ LOAD64_SHR32(21),
+ LIMB64_HILO(0, LOAD32(wp, 23)),
+ LOAD64_SHR32(21));
+
+ /* "S5 + S6" with 64-bit limbs:
+ * [A23:A22]:[A21:A20]:[ 0:0]:[0: 0]
+ * + [ 0: 0]:[A23:A22]:[A21:0]:[0:A20]
+ * => x[4]:x[3]:x[2]:x[1]:x[0]
+ */
+ x[0] = LIMB64_HILO(0, LOAD32(wp, 20));
+ x[1] = LIMB64_HILO(LOAD32(wp, 21), 0);
+ ADD3_LIMB64 (x[4], x[3], x[2],
+ zero, LOAD64(wp, 22 / 2), LOAD64(wp, 20 / 2),
+ zero, zero, LOAD64(wp, 22 / 2));
+
+ /* "D2 + D3" with 64-bit limbs:
+ * [0:A23]:[A22:A21]:[A20:0]
+ * + [0:A23]:[A23:0]:[0:0]
+ * => d[2]:d[1]:d[0]
+ */
+ d[0] = LIMB64_HILO(LOAD32(wp, 20), 0);
+ ADD2_LIMB64 (d[2], d[1],
+ LIMB64_HILO(0, LOAD32(wp, 23)),
+ LOAD64_SHR32(21),
+ LIMB64_HILO(0, LOAD32(wp, 23)),
+ LIMB64_HILO(LOAD32(wp, 23), 0));
+
+ /* "2*S1 + S5 + S6" with 64-bit limbs:
+ * s[4]:s[3]:s[2]:s[1]:s[0]
+ * + x[4]:x[3]:x[2]:x[1]:x[0]
+ * => s[4]:s[3]:s[2]:s[1]:s[0]
+ */
+ s[0] = x[0];
+ s[1] = x[1];
+ ADD3_LIMB64(s[4], s[3], s[2],
+ zero, s[3], s[2],
+ x[4], x[3], x[2]);
+
+ /* "T + S2" with 64-bit limbs:
+ * [A11:A10]:[ A9: A8]:[ A7: A6]:[ A5: A4]:[ A3: A2]:[ A1: A0]
+ * + [A23:A22]:[A21:A20]:[A19:A18]:[A17:A16]:[A15:A14]:[A13:A12]
+ * => t[6]:t[5]:t[4]:t[3]:t[2]:t[1]:t[0]
+ */
+ ADD7_LIMB64 (t[6], t[5], t[4], t[3], t[2], t[1], t[0],
+ zero,
+ LOAD64(wp, 10 / 2), LOAD64(wp, 8 / 2), LOAD64(wp, 6 / 2),
+ LOAD64(wp, 4 / 2), LOAD64(wp, 2 / 2), LOAD64(wp, 0 / 2),
+ zero,
+ LOAD64(wp, 22 / 2), LOAD64(wp, 20 / 2), LOAD64(wp, 18 / 2),
+ LOAD64(wp, 16 / 2), LOAD64(wp, 14 / 2), LOAD64(wp, 12 / 2));
+
+ /* "2*S1 + S4 + S5 + S6" with 64-bit limbs:
+ * s[6]: s[5]: s[4]: s[3]: s[2]: s[1]: s[0]
+ * + [A19:A18]:[A17:A16]:[A15:A14]:[A13:A12]:[A20:0]:[A23:0]
+ * => s[6]:s[5]:s[4]:s[3]:s[2]:s[1]:s[0]
+ */
+ ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
+ zero, zero, s[4], s[3], s[2], s[1], s[0],
+ zero,
+ LOAD64(wp, 18 / 2), LOAD64(wp, 16 / 2),
+ LOAD64(wp, 14 / 2), LOAD64(wp, 12 / 2),
+ LIMB64_HILO(LOAD32(wp, 20), 0),
+ LIMB64_HILO(LOAD32(wp, 23), 0));
+
+ /* "D1 + D2 + D3" with 64-bit limbs:
+ * d[6]: d[5]: d[4]: d[3]: d[2]: d[1]: d[0]
+ * + [A22:A21]:[A20:A19]:[A18:A17]:[A16:A15]:[A14:A13]:[A12:A23]
+ * => d[6]:d[5]:d[4]:d[3]:d[2]:d[1]:d[0]
+ */
+ ADD7_LIMB64 (d[6], d[5], d[4], d[3], d[2], d[1], d[0],
+ zero, zero, zero, zero, d[2], d[1], d[0],
+ zero,
+ LOAD64_SHR32(21),
+ LOAD64_SHR32(19),
+ LOAD64_SHR32(17),
+ LOAD64_SHR32(15),
+ LOAD64_SHR32(13),
+ LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 23)));
+
+ /* "2*S1 + S3 + S4 + S5 + S6" with 64-bit limbs:
+ * s[6]: s[5]: s[4]: s[3]: s[2]: s[1]: s[0]
+ * + [A20:A19]:[A18:A17]:[A16:A15]:[A14:A13]:[A12:A23]:[A22:A21]
+ * => s[6]:s[5]:s[4]:s[3]:s[2]:s[1]:s[0]
+ */
+ ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
+ s[6], s[5], s[4], s[3], s[2], s[1], s[0],
+ zero,
+ LOAD64_SHR32(19),
+ LOAD64_SHR32(17),
+ LOAD64_SHR32(15),
+ LOAD64_SHR32(13),
+ LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 23)),
+ LOAD64_SHR32(21));
+
+ /* "T + 2*S1 + S2 + S3 + S4 + S5 + S6" */
+ ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
+ s[6], s[5], s[4], s[3], s[2], s[1], s[0],
+ t[6], t[5], t[4], t[3], t[2], t[1], t[0]);
+
+ /* "T + 2*S1 + S2 + S3 + S4 + S5 + S6 - D1 - D2 - D3" */
+ SUB7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
+ s[6], s[5], s[4], s[3], s[2], s[1], s[0],
+ d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
+
+#undef LOAD64_SHR32
+
+ /* mod p:
+ * 's[6]' holds carry value (-3..7). Subtract (carry + 1) * p. Result
+ * will be with in range -p...p. Handle result being negative with
+ * addition and conditional store. */
+
+ carry = LO32_LIMB64(s[6]);
+
+ SUB7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
+ s[6], s[5], s[4], s[3], s[2], s[1], s[0],
+ p_mult[carry + 3][6], p_mult[carry + 3][5],
+ p_mult[carry + 3][4], p_mult[carry + 3][3],
+ p_mult[carry + 3][2], p_mult[carry + 3][1],
+ p_mult[carry + 3][0]);
+
+ ADD7_LIMB64 (d[6], d[5], d[4], d[3], d[2], d[1], d[0],
+ s[6], s[5], s[4], s[3], s[2], s[1], s[0],
+ zero,
+ LOAD64(pp, 5), LOAD64(pp, 4),
+ LOAD64(pp, 3), LOAD64(pp, 2),
+ LOAD64(pp, 1), LOAD64(pp, 0));
+
+ mask1 = vzero - (LO32_LIMB64(d[6]) >> 31);
+ mask2 = (LO32_LIMB64(d[6]) >> 31) - vone;
+
+ STORE64_COND(wp, 0, mask2, d[0], mask1, s[0]);
+ STORE64_COND(wp, 1, mask2, d[1], mask1, s[1]);
+ STORE64_COND(wp, 2, mask2, d[2], mask1, s[2]);
+ STORE64_COND(wp, 3, mask2, d[3], mask1, s[3]);
+ STORE64_COND(wp, 4, mask2, d[4], mask1, s[4]);
+ STORE64_COND(wp, 5, mask2, d[5], mask1, s[5]);
+
+ w->nlimbs = wsize * LIMBS_PER_LIMB64;
+ MPN_NORMALIZE (wp, w->nlimbs);
+}
+
+void
+_gcry_mpi_ec_nist521_mod (gcry_mpi_t w, mpi_ec_t ctx)
+{
+ mpi_size_t wsize = (521 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB;
+ mpi_limb_t s[wsize];
+ mpi_limb_t cy;
+ mpi_ptr_t wp;
+
+ MPN_NORMALIZE (w->d, w->nlimbs);
+ if (mpi_nbits_more_than (w, 2 * 521))
+ log_bug ("W must be less than m^2\n");
+
+ RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2);
+
+ wp = w->d;
+
+ /* See "FIPS 186-4, D.2.5 Curve P-521". */
+
+ _gcry_mpih_rshift (s, wp + wsize - 1, wsize, 521 % BITS_PER_MPI_LIMB);
+ s[wsize - 1] &= (1 << (521 % BITS_PER_MPI_LIMB)) - 1;
+ wp[wsize - 1] &= (1 << (521 % BITS_PER_MPI_LIMB)) - 1;
+ _gcry_mpih_add_n (wp, wp, s, wsize);
+
+ /* "mod p" */
+ cy = _gcry_mpih_sub_n (wp, wp, ctx->p->d, wsize);
+ _gcry_mpih_add_n (s, wp, ctx->p->d, wsize);
+ mpih_set_cond (wp, s, wsize, (cy != 0UL));
+
+ w->nlimbs = wsize;
+ MPN_NORMALIZE (wp, w->nlimbs);
+}
+
+#endif /* !ASM_DISABLED */
diff --git a/mpi/ec.c b/mpi/ec.c
index 4fabf9b4..ae1d036a 100644
--- a/mpi/ec.c
+++ b/mpi/ec.c
@@ -285,7 +285,7 @@ static void
ec_addm (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
{
mpi_add (w, u, v);
- ec_mod (w, ctx);
+ ctx->mod (w, ctx);
}

static void
@@ -294,14 +294,14 @@ ec_subm (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ec)
mpi_sub (w, u, v);
while (w->sign)
mpi_add (w, w, ec->p);
- /*ec_mod (w, ec);*/
+ /*ctx->mod (w, ec);*/
}

static void
ec_mulm (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
{
mpi_mul (w, u, v);
- ec_mod (w, ctx);
+ ctx->mod (w, ctx);
}

/* W = 2 * U mod P. */
@@ -309,7 +309,7 @@ static void
ec_mul2 (gcry_mpi_t w, gcry_mpi_t u, mpi_ec_t ctx)
{
mpi_lshift (w, u, 1);
- ec_mod (w, ctx);
+ ctx->mod (w, ctx);
}

static void
@@ -585,6 +585,7 @@ struct field_table {
void (* mulm) (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx);
void (* mul2) (gcry_mpi_t w, gcry_mpi_t u, mpi_ec_t ctx);
void (* pow2) (gcry_mpi_t w, const gcry_mpi_t b, mpi_ec_t ctx);
+ void (* mod) (gcry_mpi_t w, mpi_ec_t ctx);
};

static const struct field_table field_table[] = {
@@ -594,7 +595,8 @@ static const struct field_table field_table[] = {
ec_subm_25519,
ec_mulm_25519,
ec_mul2_25519,
- ec_pow2_25519
+ ec_pow2_25519,
+ NULL
},
{
"0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE"
@@ -603,7 +605,55 @@ static const struct field_table field_table[] = {
ec_subm_448,
ec_mulm_448,
ec_mul2_448,
- ec_pow2_448
+ ec_pow2_448,
+ NULL
+ },
+ {
+ "0xfffffffffffffffffffffffffffffffeffffffffffffffff",
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ _gcry_mpi_ec_nist192_mod
+ },
+ {
+ "0xffffffffffffffffffffffffffffffff000000000000000000000001",
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ _gcry_mpi_ec_nist224_mod
+ },
+ {
+ "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff",
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ _gcry_mpi_ec_nist256_mod
+ },
+ {
+ "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe"
+ "ffffffff0000000000000000ffffffff",
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ _gcry_mpi_ec_nist384_mod
+ },
+ {
+ "0x01ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+ "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ _gcry_mpi_ec_nist521_mod
},
{ NULL, NULL, NULL, NULL, NULL, NULL },
};
@@ -757,6 +807,7 @@ ec_p_init (mpi_ec_t ctx, enum gcry_mpi_ec_models model,
ctx->mulm = ec_mulm;
ctx->mul2 = ec_mul2;
ctx->pow2 = ec_pow2;
+ ctx->mod = ec_mod;

for (i=0; field_table[i].p; i++)
{
@@ -769,18 +820,25 @@ ec_p_init (mpi_ec_t ctx, enum gcry_mpi_ec_models model,

if (!mpi_cmp (p, f_p))
{
- ctx->addm = field_table[i].addm;
- ctx->subm = field_table[i].subm;
- ctx->mulm = field_table[i].mulm;
- ctx->mul2 = field_table[i].mul2;
- ctx->pow2 = field_table[i].pow2;
+ ctx->addm = field_table[i].addm ? field_table[i].addm : ctx->addm;
+ ctx->subm = field_table[i].subm ? field_table[i].subm : ctx->subm;
+ ctx->mulm = field_table[i].mulm ? field_table[i].mulm : ctx->mulm;
+ ctx->mul2 = field_table[i].mul2 ? field_table[i].mul2 : ctx->mul2;
+ ctx->pow2 = field_table[i].pow2 ? field_table[i].pow2 : ctx->pow2;
+ ctx->mod = field_table[i].mod ? field_table[i].mod : ctx->mod;
_gcry_mpi_release (f_p);

- mpi_resize (ctx->a, ctx->p->nlimbs);
- ctx->a->nlimbs = ctx->p->nlimbs;
-
- mpi_resize (ctx->b, ctx->p->nlimbs);
- ctx->b->nlimbs = ctx->p->nlimbs;
+ if (ctx->a)
+ {
+ mpi_resize (ctx->a, ctx->p->nlimbs);
+ ctx->a->nlimbs = ctx->p->nlimbs;
+ }
+
+ if (ctx->b)
+ {
+ mpi_resize (ctx->b, ctx->p->nlimbs);
+ ctx->b->nlimbs = ctx->p->nlimbs;
+ }

for (i=0; i< DIM(ctx->t.scratch) && ctx->t.scratch[i]; i++)
ctx->t.scratch[i]->nlimbs = ctx->p->nlimbs;
diff --git a/mpi/mpi-internal.h b/mpi/mpi-internal.h
index 8ccdeada..11fcbde4 100644
--- a/mpi/mpi-internal.h
+++ b/mpi/mpi-internal.h
@@ -79,6 +79,11 @@ typedef int mpi_size_t; /* (must be a signed type) */
if( (a)->alloced < (b) ) \
mpi_resize((a), (b)); \
} while(0)
+#define RESIZE_AND_CLEAR_IF_NEEDED(a,b) \
+ do { \
+ if( (a)->nlimbs < (b) ) \
+ mpi_resize((a), (b)); \
+ } while(0)

/* Copy N limbs from S to D. */
#define MPN_COPY( d, s, n) \
diff --git a/mpi/mpiutil.c b/mpi/mpiutil.c
index 5320f4d8..a5583c79 100644
--- a/mpi/mpiutil.c
+++ b/mpi/mpiutil.c
@@ -197,7 +197,7 @@ _gcry_mpi_resize (gcry_mpi_t a, unsigned nlimbs)
if (a->d)
{
a->d = xrealloc (a->d, nlimbs * sizeof (mpi_limb_t));
- for (i=a->alloced; i < nlimbs; i++)
+ for (i=a->nlimbs; i < nlimbs; i++)
a->d[i] = 0;
}
else
diff --git a/src/ec-context.h b/src/ec-context.h
index d1c64804..479862f6 100644
--- a/src/ec-context.h
+++ b/src/ec-context.h
@@ -74,6 +74,7 @@ struct mpi_ec_ctx_s
void (* mulm) (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx);
void (* pow2) (gcry_mpi_t w, const gcry_mpi_t b, mpi_ec_t ctx);
void (* mul2) (gcry_mpi_t w, gcry_mpi_t u, mpi_ec_t ctx);
+ void (* mod) (gcry_mpi_t w, mpi_ec_t ctx);
};


--
2.30.2


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@gnupg.org
http://lists.gnupg.org/mailman/listinfo/gcrypt-devel