
[PATCH] rijndael-ppc: performance improvements
* cipher/rijndael-ppc.c (ALIGNED_LOAD, ALIGNED_STORE, VEC_LOAD_BE)
(VEC_STORE_BE): Rewrite.
(VEC_BE_SWAP, VEC_LOAD_BE_NOSWAP, VEC_STORE_BE_NOSWAP): New.
(PRELOAD_ROUND_KEYS, AES_ENCRYPT, AES_DECRYPT): Adjust to new
input parameters for vector load macros.
(ROUND_KEY_VARIABLES_ALL, PRELOAD_ROUND_KEYS_ALL)
(AES_ENCRYPT_ALL): New.
(vec_bswap32_const_neg): New.
(vec_aligned_ld, vec_aligned_st, vec_load_be_const): Rename to...
(asm_aligned_ld, asm_aligned_st, asm_load_be_const): ...these.
(asm_be_swap, asm_vperm1, asm_load_be_noswap)
(asm_store_be_noswap): New.
(vec_add_uint128): Rename to...
(asm_add_uint128): ...this.
(asm_xor, asm_cipher_be, asm_cipherlast_be, asm_ncipher_be)
(asm_ncipherlast_be): New inline assembly functions with volatile
keyword to allow manual instruction ordering.
(_gcry_aes_ppc8_setkey, aes_ppc8_prepare_decryption)
(_gcry_aes_ppc8_encrypt, _gcry_aes_ppc8_decrypt)
(_gcry_aes_ppc8_cfb_enc, _gcry_aes_ppc8_cbc_enc)
(_gcry_aes_ppc8_ocb_auth): Update to use new and rewritten helper macros.
(_gcry_aes_ppc8_cfb_dec, _gcry_aes_ppc8_cbc_dec)
(_gcry_aes_ppc8_ctr_enc, _gcry_aes_ppc8_ocb_crypt)
(_gcry_aes_ppc8_xts_crypt): Update to use new and rewritten helper
macros; tune 8-block parallel paths with manual instruction ordering.
--

Benchmarks on POWER8 (ppc64le, ~3.8 GHz):

Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
CBC enc | 1.06 ns/B 902.2 MiB/s 4.02 c/B
CBC dec | 0.208 ns/B 4585 MiB/s 0.790 c/B
CFB enc | 1.06 ns/B 900.4 MiB/s 4.02 c/B
CFB dec | 0.208 ns/B 4588 MiB/s 0.790 c/B
CTR enc | 0.238 ns/B 4007 MiB/s 0.904 c/B
CTR dec | 0.238 ns/B 4009 MiB/s 0.904 c/B
XTS enc | 0.492 ns/B 1937 MiB/s 1.87 c/B
XTS dec | 0.488 ns/B 1955 MiB/s 1.85 c/B
OCB enc | 0.243 ns/B 3928 MiB/s 0.922 c/B
OCB dec | 0.247 ns/B 3858 MiB/s 0.939 c/B
OCB auth | 0.213 ns/B 4482 MiB/s 0.809 c/B

After (cbc-dec, cfb-dec, xts and ocb ~6% faster; ctr ~11% faster):
AES | nanosecs/byte mebibytes/sec cycles/byte
CBC enc | 1.06 ns/B 902.1 MiB/s 4.02 c/B
CBC dec | 0.196 ns/B 4877 MiB/s 0.743 c/B
CFB enc | 1.06 ns/B 902.2 MiB/s 4.02 c/B
CFB dec | 0.195 ns/B 4889 MiB/s 0.741 c/B
CTR enc | 0.214 ns/B 4448 MiB/s 0.815 c/B
CTR dec | 0.214 ns/B 4452 MiB/s 0.814 c/B
XTS enc | 0.461 ns/B 2067 MiB/s 1.75 c/B
XTS dec | 0.456 ns/B 2092 MiB/s 1.73 c/B
OCB enc | 0.227 ns/B 4200 MiB/s 0.863 c/B
OCB dec | 0.234 ns/B 4072 MiB/s 0.890 c/B
OCB auth | 0.207 ns/B 4604 MiB/s 0.787 c/B

Benchmarks on POWER9 (ppc64le, ~3.8 GHz):

Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
CBC enc | 1.04 ns/B 918.7 MiB/s 3.94 c/B
CBC dec | 0.240 ns/B 3982 MiB/s 0.910 c/B
CFB enc | 1.04 ns/B 917.6 MiB/s 3.95 c/B
CFB dec | 0.241 ns/B 3963 MiB/s 0.914 c/B
CTR enc | 0.249 ns/B 3835 MiB/s 0.945 c/B
CTR dec | 0.252 ns/B 3787 MiB/s 0.957 c/B
XTS enc | 0.505 ns/B 1889 MiB/s 1.92 c/B
XTS dec | 0.495 ns/B 1926 MiB/s 1.88 c/B
OCB enc | 0.303 ns/B 3152 MiB/s 1.15 c/B
OCB dec | 0.305 ns/B 3129 MiB/s 1.16 c/B
OCB auth | 0.265 ns/B 3595 MiB/s 1.01 c/B

After (cbc-dec and cfb-dec ~6% faster, ctr ~11% faster, ocb ~4% faster):
AES | nanosecs/byte mebibytes/sec cycles/byte
CBC enc | 1.04 ns/B 917.3 MiB/s 3.95 c/B
CBC dec | 0.225 ns/B 4234 MiB/s 0.856 c/B
CFB enc | 1.04 ns/B 917.8 MiB/s 3.95 c/B
CFB dec | 0.226 ns/B 4214 MiB/s 0.860 c/B
CTR enc | 0.221 ns/B 4306 MiB/s 0.842 c/B
CTR dec | 0.223 ns/B 4271 MiB/s 0.848 c/B
XTS enc | 0.503 ns/B 1897 MiB/s 1.91 c/B
XTS dec | 0.495 ns/B 1928 MiB/s 1.88 c/B
OCB enc | 0.288 ns/B 3309 MiB/s 1.10 c/B
OCB dec | 0.292 ns/B 3266 MiB/s 1.11 c/B
OCB auth | 0.267 ns/B 3570 MiB/s 1.02 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---

diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c
index 48a47eddb..a8bcae468 100644
--- a/cipher/rijndael-ppc.c
+++ b/cipher/rijndael-ppc.c
@@ -51,17 +51,27 @@ typedef union
#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE


-#define ALIGNED_LOAD(in_ptr) \
- (vec_aligned_ld (0, (const unsigned char *)(in_ptr)))
+#define ALIGNED_LOAD(in_ptr, offs) \
+ (asm_aligned_ld ((offs) * 16, (const void *)(in_ptr)))

-#define ALIGNED_STORE(out_ptr, vec) \
- (vec_aligned_st ((vec), 0, (unsigned char *)(out_ptr)))
+#define ALIGNED_STORE(out_ptr, offs, vec) \
+ (asm_aligned_st ((vec), (offs) * 16, (void *)(out_ptr)))

-#define VEC_LOAD_BE(in_ptr, bige_const) \
- (vec_load_be (0, (const unsigned char *)(in_ptr), bige_const))
+#define VEC_BE_SWAP(vec, bige_const) (asm_be_swap ((vec), (bige_const)))

-#define VEC_STORE_BE(out_ptr, vec, bige_const) \
- (vec_store_be ((vec), 0, (unsigned char *)(out_ptr), bige_const))
+#define VEC_LOAD_BE(in_ptr, offs, bige_const) \
+ (asm_be_swap (asm_load_be_noswap ((offs) * 16, (const void *)(in_ptr)), \
+ bige_const))
+
+#define VEC_LOAD_BE_NOSWAP(in_ptr, offs) \
+ (asm_load_be_noswap ((offs) * 16, (const unsigned char *)(in_ptr)))
+
+#define VEC_STORE_BE(out_ptr, offs, vec, bige_const) \
+ (asm_store_be_noswap (asm_be_swap ((vec), (bige_const)), (offs) * 16, \
+ (void *)(out_ptr)))
+
+#define VEC_STORE_BE_NOSWAP(out_ptr, offs, vec) \
+ (asm_store_be_noswap ((vec), (offs) * 16, (void *)(out_ptr)))


#define ROUND_KEY_VARIABLES \
@@ -69,166 +79,257 @@ typedef union

#define PRELOAD_ROUND_KEYS(nrounds) \
do { \
- rkey0 = ALIGNED_LOAD(&rk[0]); \
- rkeylast = ALIGNED_LOAD(&rk[nrounds]); \
+ rkey0 = ALIGNED_LOAD (rk, 0); \
+ rkeylast = ALIGNED_LOAD (rk, nrounds); \
} while (0)

-
#define AES_ENCRYPT(blk, nrounds) \
do { \
blk ^= rkey0; \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[1])); \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[2])); \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[3])); \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[4])); \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[5])); \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[6])); \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[7])); \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[8])); \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[9])); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 1)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 2)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 3)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 4)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 5)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 6)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 7)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 8)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 9)); \
if (nrounds >= 12) \
{ \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[10])); \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[11])); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 10)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 11)); \
if (rounds > 12) \
{ \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[12])); \
- blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[13])); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 12)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 13)); \
} \
} \
- blk = vec_cipherlast_be (blk, rkeylast); \
+ blk = asm_cipherlast_be (blk, rkeylast); \
} while (0)

-
#define AES_DECRYPT(blk, nrounds) \
do { \
blk ^= rkey0; \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[1])); \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[2])); \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[3])); \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[4])); \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[5])); \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[6])); \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[7])); \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[8])); \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[9])); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 1)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 2)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 3)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 4)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 5)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 6)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 7)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 8)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 9)); \
if (nrounds >= 12) \
{ \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[10])); \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[11])); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 10)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 11)); \
if (rounds > 12) \
{ \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[12])); \
- blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[13])); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 12)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 13)); \
} \
} \
- blk = vec_ncipherlast_be (blk, rkeylast); \
+ blk = asm_ncipherlast_be (blk, rkeylast); \
} while (0)


+#define ROUND_KEY_VARIABLES_ALL \
+ block rkey0, rkey1, rkey2, rkey3, rkey4, rkey5, rkey6, rkey7, rkey8, \
+ rkey9, rkey10, rkey11, rkey12, rkey13, rkeylast
+
+#define PRELOAD_ROUND_KEYS_ALL(nrounds) \
+ do { \
+ rkey0 = ALIGNED_LOAD (rk, 0); \
+ rkey1 = ALIGNED_LOAD (rk, 1); \
+ rkey2 = ALIGNED_LOAD (rk, 2); \
+ rkey3 = ALIGNED_LOAD (rk, 3); \
+ rkey4 = ALIGNED_LOAD (rk, 4); \
+ rkey5 = ALIGNED_LOAD (rk, 5); \
+ rkey6 = ALIGNED_LOAD (rk, 6); \
+ rkey7 = ALIGNED_LOAD (rk, 7); \
+ rkey8 = ALIGNED_LOAD (rk, 8); \
+ rkey9 = ALIGNED_LOAD (rk, 9); \
+ if (nrounds >= 12) \
+ { \
+ rkey10 = ALIGNED_LOAD (rk, 10); \
+ rkey11 = ALIGNED_LOAD (rk, 11); \
+ if (rounds > 12) \
+ { \
+ rkey12 = ALIGNED_LOAD (rk, 12); \
+ rkey13 = ALIGNED_LOAD (rk, 13); \
+ } \
+ } \
+ rkeylast = ALIGNED_LOAD (rk, nrounds); \
+ } while (0)
+
+#define AES_ENCRYPT_ALL(blk, nrounds) \
+ do { \
+ blk ^= rkey0; \
+ blk = asm_cipher_be (blk, rkey1); \
+ blk = asm_cipher_be (blk, rkey2); \
+ blk = asm_cipher_be (blk, rkey3); \
+ blk = asm_cipher_be (blk, rkey4); \
+ blk = asm_cipher_be (blk, rkey5); \
+ blk = asm_cipher_be (blk, rkey6); \
+ blk = asm_cipher_be (blk, rkey7); \
+ blk = asm_cipher_be (blk, rkey8); \
+ blk = asm_cipher_be (blk, rkey9); \
+ if (nrounds >= 12) \
+ { \
+ blk = asm_cipher_be (blk, rkey10); \
+ blk = asm_cipher_be (blk, rkey11); \
+ if (rounds > 12) \
+ { \
+ blk = asm_cipher_be (blk, rkey12); \
+ blk = asm_cipher_be (blk, rkey13); \
+ } \
+ } \
+ blk = asm_cipherlast_be (blk, rkeylast); \
+ } while (0)
+
+
+#ifdef WORDS_BIGENDIAN
static const block vec_bswap32_const =
{ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+#else
+static const block vec_bswap32_const_neg =
+ { ~3, ~2, ~1, ~0, ~7, ~6, ~5, ~4, ~11, ~10, ~9, ~8, ~15, ~14, ~13, ~12 };
+#endif


static ASM_FUNC_ATTR_INLINE block
-vec_aligned_ld(unsigned long offset, const unsigned char *ptr)
+asm_aligned_ld(unsigned long offset, const void *ptr)
{
-#ifndef WORDS_BIGENDIAN
block vec;
- __asm__ ("lvx %0,%1,%2\n\t"
- : "=v" (vec)
- : "r" (offset), "r" ((uintptr_t)ptr)
- : "memory", "r0");
+ __asm__ volatile ("lvx %0,%1,%2\n\t"
+ : "=v" (vec)
+ : "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
return vec;
-#else
- return vec_vsx_ld (offset, ptr);
-#endif
}

+static ASM_FUNC_ATTR_INLINE void
+asm_aligned_st(block vec, unsigned long offset, void *ptr)
+{
+ __asm__ volatile ("stvx %0,%1,%2\n\t"
+ :
+ : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
+}

static ASM_FUNC_ATTR_INLINE block
-vec_load_be_const(void)
+asm_load_be_const(void)
{
#ifndef WORDS_BIGENDIAN
- return ~ALIGNED_LOAD(&vec_bswap32_const);
+ return ALIGNED_LOAD (&vec_bswap32_const_neg, 0);
#else
static const block vec_dummy = { 0 };
return vec_dummy;
#endif
}

-
static ASM_FUNC_ATTR_INLINE block
-vec_load_be(unsigned long offset, const unsigned char *ptr,
- block be_bswap_const)
+asm_vperm1(block vec, block mask)
{
-#ifndef WORDS_BIGENDIAN
- block vec;
- /* GCC vec_vsx_ld is generating two instructions on little-endian. Use
- * lxvw4x directly instead. */
- __asm__ ("lxvw4x %x0,%1,%2\n\t"
- : "=wa" (vec)
- : "r" (offset), "r" ((uintptr_t)ptr)
- : "memory", "r0");
- __asm__ ("vperm %0,%1,%1,%2\n\t"
- : "=v" (vec)
- : "v" (vec), "v" (be_bswap_const));
- return vec;
-#else
- (void)be_bswap_const;
- return vec_vsx_ld (offset, ptr);
-#endif
+ block o;
+ __asm__ volatile ("vperm %0,%1,%1,%2\n\t"
+ : "=v" (o)
+ : "v" (vec), "v" (mask));
+ return o;
}

-
-static ASM_FUNC_ATTR_INLINE void
-vec_aligned_st(block vec, unsigned long offset, unsigned char *ptr)
+static ASM_FUNC_ATTR_INLINE block
+asm_be_swap(block vec, block be_bswap_const)
{
+ (void)be_bswap_const;
#ifndef WORDS_BIGENDIAN
- __asm__ ("stvx %0,%1,%2\n\t"
- :
- : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr)
- : "memory", "r0");
+ return asm_vperm1 (vec, be_bswap_const);
#else
- vec_vsx_st (vec, offset, ptr);
+ return vec;
#endif
}

+static ASM_FUNC_ATTR_INLINE block
+asm_load_be_noswap(unsigned long offset, const void *ptr)
+{
+ block vec;
+ __asm__ volatile ("lxvw4x %x0,%1,%2\n\t"
+ : "=wa" (vec)
+ : "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
+ /* NOTE: vec needs to be be-swapped using 'asm_be_swap' by caller */
+ return vec;
+}

static ASM_FUNC_ATTR_INLINE void
-vec_store_be(block vec, unsigned long offset, unsigned char *ptr,
- block be_bswap_const)
+asm_store_be_noswap(block vec, unsigned long offset, void *ptr)
{
-#ifndef WORDS_BIGENDIAN
- /* GCC vec_vsx_st is generating two instructions on little-endian. Use
- * stxvw4x directly instead. */
- __asm__ ("vperm %0,%1,%1,%2\n\t"
- : "=v" (vec)
- : "v" (vec), "v" (be_bswap_const));
- __asm__ ("stxvw4x %x0,%1,%2\n\t"
- :
- : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr)
- : "memory", "r0");
-#else
- (void)be_bswap_const;
- vec_vsx_st (vec, offset, ptr);
-#endif
+ /* NOTE: vec be-swapped using 'asm_be_swap' by caller */
+ __asm__ volatile ("stxvw4x %x0,%1,%2\n\t"
+ :
+ : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
}

+static ASM_FUNC_ATTR_INLINE block
+asm_add_uint128(block a, block b)
+{
+ block res;
+ __asm__ volatile ("vadduqm %0,%1,%2\n\t"
+ : "=v" (res)
+ : "v" (a), "v" (b));
+ return res;
+}

static ASM_FUNC_ATTR_INLINE block
-vec_add_uint128(block a, block b)
+asm_xor(block a, block b)
{
-#if 1
block res;
- /* Use assembly as GCC (v8.3) generates slow code for vec_vadduqm. */
- __asm__ ("vadduqm %0,%1,%2\n\t"
- : "=v" (res)
- : "v" (a), "v" (b));
+ __asm__ volatile ("vxor %0,%1,%2\n\t"
+ : "=v" (res)
+ : "v" (a), "v" (b));
return res;
-#else
- return (block)vec_vadduqm((vector __uint128_t)a, (vector __uint128_t)b);
-#endif
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_cipher_be(block b, block rk)
+{
+ block o;
+ __asm__ volatile ("vcipher %0, %1, %2\n\t"
+ : "=v" (o)
+ : "v" (b), "v" (rk));
+ return o;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_cipherlast_be(block b, block rk)
+{
+ block o;
+ __asm__ volatile ("vcipherlast %0, %1, %2\n\t"
+ : "=v" (o)
+ : "v" (b), "v" (rk));
+ return o;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_ncipher_be(block b, block rk)
+{
+ block o;
+ __asm__ volatile ("vncipher %0, %1, %2\n\t"
+ : "=v" (o)
+ : "v" (b), "v" (rk));
+ return o;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_ncipherlast_be(block b, block rk)
+{
+ block o;
+ __asm__ volatile ("vncipherlast %0, %1, %2\n\t"
+ : "=v" (o)
+ : "v" (b), "v" (rk));
+ return o;
}


@@ -250,7 +351,7 @@ _gcry_aes_sbox4_ppc8(u32 fourbytes)
void
_gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key)
{
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
union
{
PROPERLY_ALIGNED_TYPE dummy;
@@ -345,11 +446,11 @@ _gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key)
for (r = 0; r <= rounds; r++)
{
#ifndef WORDS_BIGENDIAN
- VEC_STORE_BE(&ekey[r], ALIGNED_LOAD(&ekey[r]), bige_const);
+ VEC_STORE_BE(ekey, r, ALIGNED_LOAD (ekey, r), bige_const);
#else
- block rvec = ALIGNED_LOAD(&ekey[r]);
- ALIGNED_STORE(&ekey[r],
- vec_perm(rvec, rvec, vec_bswap32_const));
+ block rvec = ALIGNED_LOAD (ekey, r);
+ ALIGNED_STORE (ekey, r,
+ vec_perm(rvec, rvec, vec_bswap32_const));
(void)bige_const;
#endif
}
@@ -378,7 +479,7 @@ aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
rr = rounds;
for (r = 0, rr = rounds; r <= rounds; r++, rr--)
{
- ALIGNED_STORE(&dkey[r], ALIGNED_LOAD(&ekey[rr]));
+ ALIGNED_STORE (dkey, r, ALIGNED_LOAD (ekey, rr));
}
}

@@ -394,18 +495,18 @@ unsigned int _gcry_aes_ppc8_encrypt (const RIJNDAEL_context *ctx,
unsigned char *out,
const unsigned char *in)
{
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
const u128_t *rk = (u128_t *)&ctx->keyschenc;
int rounds = ctx->rounds;
ROUND_KEY_VARIABLES;
block b;

- b = VEC_LOAD_BE (in, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);

PRELOAD_ROUND_KEYS (rounds);

AES_ENCRYPT (b, rounds);
- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);

return 0; /* does not use stack */
}
@@ -415,18 +516,18 @@ unsigned int _gcry_aes_ppc8_decrypt (const RIJNDAEL_context *ctx,
unsigned char *out,
const unsigned char *in)
{
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
const u128_t *rk = (u128_t *)&ctx->keyschdec;
int rounds = ctx->rounds;
ROUND_KEY_VARIABLES;
block b;

- b = VEC_LOAD_BE (in, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);

PRELOAD_ROUND_KEYS (rounds);

AES_DECRYPT (b, rounds);
- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);

return 0; /* does not use stack */
}
@@ -436,41 +537,41 @@ void _gcry_aes_ppc8_cfb_enc (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
RIJNDAEL_context *ctx = context;
const u128_t *rk = (u128_t *)&ctx->keyschenc;
const u128_t *in = (const u128_t *)inbuf_arg;
u128_t *out = (u128_t *)outbuf_arg;
int rounds = ctx->rounds;
- ROUND_KEY_VARIABLES;
+ ROUND_KEY_VARIABLES_ALL;
block rkeylast_orig;
block iv;

- iv = VEC_LOAD_BE (iv_arg, bige_const);
+ iv = VEC_LOAD_BE (iv_arg, 0, bige_const);

- PRELOAD_ROUND_KEYS (rounds);
+ PRELOAD_ROUND_KEYS_ALL (rounds);
rkeylast_orig = rkeylast;

for (; nblocks; nblocks--)
{
- rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, bige_const);
+ rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);

- AES_ENCRYPT (iv, rounds);
+ AES_ENCRYPT_ALL (iv, rounds);

- VEC_STORE_BE (out, iv, bige_const);
+ VEC_STORE_BE (out, 0, iv, bige_const);

out++;
in++;
}

- VEC_STORE_BE (iv_arg, iv, bige_const);
+ VEC_STORE_BE (iv_arg, 0, iv, bige_const);
}

void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
RIJNDAEL_context *ctx = context;
const u128_t *rk = (u128_t *)&ctx->keyschenc;
const u128_t *in = (const u128_t *)inbuf_arg;
@@ -483,7 +584,7 @@ void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv_arg,
block b0, b1, b2, b3, b4, b5, b6, b7;
block rkey;

- iv = VEC_LOAD_BE (iv_arg, bige_const);
+ iv = VEC_LOAD_BE (iv_arg, 0, bige_const);

PRELOAD_ROUND_KEYS (rounds);
rkeylast_orig = rkeylast;
@@ -491,34 +592,42 @@ void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv_arg,
for (; nblocks >= 8; nblocks -= 8)
{
in0 = iv;
- in1 = VEC_LOAD_BE (in + 0, bige_const);
- in2 = VEC_LOAD_BE (in + 1, bige_const);
- in3 = VEC_LOAD_BE (in + 2, bige_const);
- in4 = VEC_LOAD_BE (in + 3, bige_const);
- in5 = VEC_LOAD_BE (in + 4, bige_const);
- in6 = VEC_LOAD_BE (in + 5, bige_const);
- in7 = VEC_LOAD_BE (in + 6, bige_const);
- iv = VEC_LOAD_BE (in + 7, bige_const);
-
- b0 = rkey0 ^ in0;
- b1 = rkey0 ^ in1;
- b2 = rkey0 ^ in2;
- b3 = rkey0 ^ in3;
- b4 = rkey0 ^ in4;
- b5 = rkey0 ^ in5;
- b6 = rkey0 ^ in6;
- b7 = rkey0 ^ in7;
+ in1 = VEC_LOAD_BE_NOSWAP (in, 0);
+ in2 = VEC_LOAD_BE_NOSWAP (in, 1);
+ in3 = VEC_LOAD_BE_NOSWAP (in, 2);
+ in4 = VEC_LOAD_BE_NOSWAP (in, 3);
+ in1 = VEC_BE_SWAP (in1, bige_const);
+ in2 = VEC_BE_SWAP (in2, bige_const);
+ in5 = VEC_LOAD_BE_NOSWAP (in, 4);
+ in6 = VEC_LOAD_BE_NOSWAP (in, 5);
+ in3 = VEC_BE_SWAP (in3, bige_const);
+ in4 = VEC_BE_SWAP (in4, bige_const);
+ in7 = VEC_LOAD_BE_NOSWAP (in, 6);
+ iv = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+ in5 = VEC_BE_SWAP (in5, bige_const);
+ in6 = VEC_BE_SWAP (in6, bige_const);
+ b0 = asm_xor (rkey0, in0);
+ b1 = asm_xor (rkey0, in1);
+ in7 = VEC_BE_SWAP (in7, bige_const);
+ iv = VEC_BE_SWAP (iv, bige_const);
+ b2 = asm_xor (rkey0, in2);
+ b3 = asm_xor (rkey0, in3);
+ b4 = asm_xor (rkey0, in4);
+ b5 = asm_xor (rkey0, in5);
+ b6 = asm_xor (rkey0, in6);
+ b7 = asm_xor (rkey0, in7);

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD(&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey); \
- b4 = vec_cipher_be (b4, rkey); \
- b5 = vec_cipher_be (b5, rkey); \
- b6 = vec_cipher_be (b6, rkey); \
- b7 = vec_cipher_be (b7, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -542,48 +651,60 @@ void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv_arg,

#undef DO_ROUND

- rkey = rkeylast;
- b0 = vec_cipherlast_be (b0, rkey ^ in1);
- b1 = vec_cipherlast_be (b1, rkey ^ in2);
- b2 = vec_cipherlast_be (b2, rkey ^ in3);
- b3 = vec_cipherlast_be (b3, rkey ^ in4);
- b4 = vec_cipherlast_be (b4, rkey ^ in5);
- b5 = vec_cipherlast_be (b5, rkey ^ in6);
- b6 = vec_cipherlast_be (b6, rkey ^ in7);
- b7 = vec_cipherlast_be (b7, rkey ^ iv);
-
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
- VEC_STORE_BE (out + 4, b4, bige_const);
- VEC_STORE_BE (out + 5, b5, bige_const);
- VEC_STORE_BE (out + 6, b6, bige_const);
- VEC_STORE_BE (out + 7, b7, bige_const);
-
- in += 8;
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+ in3 = asm_xor (rkeylast, in3);
+ in4 = asm_xor (rkeylast, in4);
+ b0 = asm_cipherlast_be (b0, in1);
+ b1 = asm_cipherlast_be (b1, in2);
+ in5 = asm_xor (rkeylast, in5);
+ in6 = asm_xor (rkeylast, in6);
+ b2 = asm_cipherlast_be (b2, in3);
+ b3 = asm_cipherlast_be (b3, in4);
+ in7 = asm_xor (rkeylast, in7);
+ in0 = asm_xor (rkeylast, iv);
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b4 = asm_cipherlast_be (b4, in5);
+ b5 = asm_cipherlast_be (b5, in6);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b6 = asm_cipherlast_be (b6, in7);
+ b7 = asm_cipherlast_be (b7, in0);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
out += 8;
}

if (nblocks >= 4)
{
in0 = iv;
- in1 = VEC_LOAD_BE (in + 0, bige_const);
- in2 = VEC_LOAD_BE (in + 1, bige_const);
- in3 = VEC_LOAD_BE (in + 2, bige_const);
- iv = VEC_LOAD_BE (in + 3, bige_const);
+ in1 = VEC_LOAD_BE (in, 0, bige_const);
+ in2 = VEC_LOAD_BE (in, 1, bige_const);
+ in3 = VEC_LOAD_BE (in, 2, bige_const);
+ iv = VEC_LOAD_BE (in, 3, bige_const);

- b0 = rkey0 ^ in0;
- b1 = rkey0 ^ in1;
- b2 = rkey0 ^ in2;
- b3 = rkey0 ^ in3;
+ b0 = asm_xor (rkey0, in0);
+ b1 = asm_xor (rkey0, in1);
+ b2 = asm_xor (rkey0, in2);
+ b3 = asm_xor (rkey0, in3);

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD(&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -607,16 +728,18 @@ void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv_arg,

#undef DO_ROUND

- rkey = rkeylast;
- b0 = vec_cipherlast_be (b0, rkey ^ in1);
- b1 = vec_cipherlast_be (b1, rkey ^ in2);
- b2 = vec_cipherlast_be (b2, rkey ^ in3);
- b3 = vec_cipherlast_be (b3, rkey ^ iv);
-
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+ in3 = asm_xor (rkeylast, in3);
+ in0 = asm_xor (rkeylast, iv);
+ b0 = asm_cipherlast_be (b0, in1);
+ b1 = asm_cipherlast_be (b1, in2);
+ b2 = asm_cipherlast_be (b2, in3);
+ b3 = asm_cipherlast_be (b3, in0);
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);

in += 4;
out += 4;
@@ -625,20 +748,20 @@ void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv_arg,

for (; nblocks; nblocks--)
{
- bin = VEC_LOAD_BE (in, bige_const);
+ bin = VEC_LOAD_BE (in, 0, bige_const);
rkeylast = rkeylast_orig ^ bin;
b = iv;
iv = bin;

AES_ENCRYPT (b, rounds);

- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);

out++;
in++;
}

- VEC_STORE_BE (iv_arg, iv, bige_const);
+ VEC_STORE_BE (iv_arg, 0, iv, bige_const);
}


@@ -646,41 +769,41 @@ void _gcry_aes_ppc8_cbc_enc (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks, int cbc_mac)
{
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
RIJNDAEL_context *ctx = context;
const u128_t *rk = (u128_t *)&ctx->keyschenc;
const u128_t *in = (const u128_t *)inbuf_arg;
u128_t *out = (u128_t *)outbuf_arg;
int rounds = ctx->rounds;
- ROUND_KEY_VARIABLES;
+ ROUND_KEY_VARIABLES_ALL;
block lastiv, b;
+ unsigned int outadd = !cbc_mac;

- lastiv = VEC_LOAD_BE (iv_arg, bige_const);
+ lastiv = VEC_LOAD_BE (iv_arg, 0, bige_const);

- PRELOAD_ROUND_KEYS (rounds);
+ PRELOAD_ROUND_KEYS_ALL (rounds);

for (; nblocks; nblocks--)
{
- b = lastiv ^ VEC_LOAD_BE (in, bige_const);
+ b = lastiv ^ VEC_LOAD_BE (in, 0, bige_const);

- AES_ENCRYPT (b, rounds);
+ AES_ENCRYPT_ALL (b, rounds);

lastiv = b;
- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);

in++;
- if (!cbc_mac)
- out++;
+ out += outadd;
}

- VEC_STORE_BE (iv_arg, lastiv, bige_const);
+ VEC_STORE_BE (iv_arg, 0, lastiv, bige_const);
}

void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
{
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
RIJNDAEL_context *ctx = context;
const u128_t *rk = (u128_t *)&ctx->keyschdec;
const u128_t *in = (const u128_t *)inbuf_arg;
@@ -699,41 +822,49 @@ void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv_arg,
ctx->decryption_prepared = 1;
}

- iv = VEC_LOAD_BE (iv_arg, bige_const);
+ iv = VEC_LOAD_BE (iv_arg, 0, bige_const);

PRELOAD_ROUND_KEYS (rounds);
rkeylast_orig = rkeylast;

for (; nblocks >= 8; nblocks -= 8)
{
- in0 = VEC_LOAD_BE (in + 0, bige_const);
- in1 = VEC_LOAD_BE (in + 1, bige_const);
- in2 = VEC_LOAD_BE (in + 2, bige_const);
- in3 = VEC_LOAD_BE (in + 3, bige_const);
- in4 = VEC_LOAD_BE (in + 4, bige_const);
- in5 = VEC_LOAD_BE (in + 5, bige_const);
- in6 = VEC_LOAD_BE (in + 6, bige_const);
- in7 = VEC_LOAD_BE (in + 7, bige_const);
-
- b0 = rkey0 ^ in0;
- b1 = rkey0 ^ in1;
- b2 = rkey0 ^ in2;
- b3 = rkey0 ^ in3;
- b4 = rkey0 ^ in4;
- b5 = rkey0 ^ in5;
- b6 = rkey0 ^ in6;
- b7 = rkey0 ^ in7;
+ in0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ in1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ in2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ in3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ in0 = VEC_BE_SWAP (in0, bige_const);
+ in1 = VEC_BE_SWAP (in1, bige_const);
+ in4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ in5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ in2 = VEC_BE_SWAP (in2, bige_const);
+ in3 = VEC_BE_SWAP (in3, bige_const);
+ in6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ in7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+ b0 = asm_xor (rkey0, in0);
+ b1 = asm_xor (rkey0, in1);
+ in4 = VEC_BE_SWAP (in4, bige_const);
+ in5 = VEC_BE_SWAP (in5, bige_const);
+ b2 = asm_xor (rkey0, in2);
+ b3 = asm_xor (rkey0, in3);
+ in6 = VEC_BE_SWAP (in6, bige_const);
+ in7 = VEC_BE_SWAP (in7, bige_const);
+ b4 = asm_xor (rkey0, in4);
+ b5 = asm_xor (rkey0, in5);
+ b6 = asm_xor (rkey0, in6);
+ b7 = asm_xor (rkey0, in7);

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD(&rk[r]); \
- b0 = vec_ncipher_be (b0, rkey); \
- b1 = vec_ncipher_be (b1, rkey); \
- b2 = vec_ncipher_be (b2, rkey); \
- b3 = vec_ncipher_be (b3, rkey); \
- b4 = vec_ncipher_be (b4, rkey); \
- b5 = vec_ncipher_be (b5, rkey); \
- b6 = vec_ncipher_be (b6, rkey); \
- b7 = vec_ncipher_be (b7, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey); \
+ b4 = asm_ncipher_be (b4, rkey); \
+ b5 = asm_ncipher_be (b5, rkey); \
+ b6 = asm_ncipher_be (b6, rkey); \
+ b7 = asm_ncipher_be (b7, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -757,48 +888,60 @@ void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv_arg,

#undef DO_ROUND

- rkey = rkeylast;
- b0 = vec_ncipherlast_be (b0, rkey ^ iv);
- b1 = vec_ncipherlast_be (b1, rkey ^ in0);
- b2 = vec_ncipherlast_be (b2, rkey ^ in1);
- b3 = vec_ncipherlast_be (b3, rkey ^ in2);
- b4 = vec_ncipherlast_be (b4, rkey ^ in3);
- b5 = vec_ncipherlast_be (b5, rkey ^ in4);
- b6 = vec_ncipherlast_be (b6, rkey ^ in5);
- b7 = vec_ncipherlast_be (b7, rkey ^ in6);
+ iv = asm_xor (rkeylast, iv);
+ in0 = asm_xor (rkeylast, in0);
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+ b0 = asm_ncipherlast_be (b0, iv);
iv = in7;
-
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
- VEC_STORE_BE (out + 4, b4, bige_const);
- VEC_STORE_BE (out + 5, b5, bige_const);
- VEC_STORE_BE (out + 6, b6, bige_const);
- VEC_STORE_BE (out + 7, b7, bige_const);
-
- in += 8;
+ b1 = asm_ncipherlast_be (b1, in0);
+ in3 = asm_xor (rkeylast, in3);
+ in4 = asm_xor (rkeylast, in4);
+ b2 = asm_ncipherlast_be (b2, in1);
+ b3 = asm_ncipherlast_be (b3, in2);
+ in5 = asm_xor (rkeylast, in5);
+ in6 = asm_xor (rkeylast, in6);
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b4 = asm_ncipherlast_be (b4, in3);
+ b5 = asm_ncipherlast_be (b5, in4);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b6 = asm_ncipherlast_be (b6, in5);
+ b7 = asm_ncipherlast_be (b7, in6);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
out += 8;
}

if (nblocks >= 4)
{
- in0 = VEC_LOAD_BE (in + 0, bige_const);
- in1 = VEC_LOAD_BE (in + 1, bige_const);
- in2 = VEC_LOAD_BE (in + 2, bige_const);
- in3 = VEC_LOAD_BE (in + 3, bige_const);
+ in0 = VEC_LOAD_BE (in, 0, bige_const);
+ in1 = VEC_LOAD_BE (in, 1, bige_const);
+ in2 = VEC_LOAD_BE (in, 2, bige_const);
+ in3 = VEC_LOAD_BE (in, 3, bige_const);

- b0 = rkey0 ^ in0;
- b1 = rkey0 ^ in1;
- b2 = rkey0 ^ in2;
- b3 = rkey0 ^ in3;
+ b0 = asm_xor (rkey0, in0);
+ b1 = asm_xor (rkey0, in1);
+ b2 = asm_xor (rkey0, in2);
+ b3 = asm_xor (rkey0, in3);

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD(&rk[r]); \
- b0 = vec_ncipher_be (b0, rkey); \
- b1 = vec_ncipher_be (b1, rkey); \
- b2 = vec_ncipher_be (b2, rkey); \
- b3 = vec_ncipher_be (b3, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -822,17 +965,21 @@ void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv_arg,

#undef DO_ROUND

- rkey = rkeylast;
- b0 = vec_ncipherlast_be (b0, rkey ^ iv);
- b1 = vec_ncipherlast_be (b1, rkey ^ in0);
- b2 = vec_ncipherlast_be (b2, rkey ^ in1);
- b3 = vec_ncipherlast_be (b3, rkey ^ in2);
+ iv = asm_xor (rkeylast, iv);
+ in0 = asm_xor (rkeylast, in0);
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+
+ b0 = asm_ncipherlast_be (b0, iv);
iv = in3;
+ b1 = asm_ncipherlast_be (b1, in0);
+ b2 = asm_ncipherlast_be (b2, in1);
+ b3 = asm_ncipherlast_be (b3, in2);

- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);

in += 4;
out += 4;
@@ -843,17 +990,17 @@ void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv_arg,
{
rkeylast = rkeylast_orig ^ iv;

- iv = VEC_LOAD_BE (in, bige_const);
+ iv = VEC_LOAD_BE (in, 0, bige_const);
b = iv;
AES_DECRYPT (b, rounds);

- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);

in++;
out++;
}

- VEC_STORE_BE (iv_arg, iv, bige_const);
+ VEC_STORE_BE (iv_arg, 0, iv, bige_const);
}


@@ -863,7 +1010,7 @@ void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg,
{
static const unsigned char vec_one_const[16] =
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 };
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
RIJNDAEL_context *ctx = context;
const u128_t *rk = (u128_t *)&ctx->keyschenc;
const u128_t *in = (const u128_t *)inbuf_arg;
@@ -873,56 +1020,80 @@ void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg,
block rkeylast_orig;
block ctr, b, one;

- ctr = VEC_LOAD_BE (ctr_arg, bige_const);
- one = VEC_LOAD_BE (&vec_one_const, bige_const);
+ ctr = VEC_LOAD_BE (ctr_arg, 0, bige_const);
+ one = VEC_LOAD_BE (&vec_one_const, 0, bige_const);

PRELOAD_ROUND_KEYS (rounds);
rkeylast_orig = rkeylast;

if (nblocks >= 4)
{
+ block in0, in1, in2, in3, in4, in5, in6, in7;
block b0, b1, b2, b3, b4, b5, b6, b7;
block two, three, four;
- block ctr4;
block rkey;

- two = vec_add_uint128 (one, one);
- three = vec_add_uint128 (two, one);
- four = vec_add_uint128 (two, two);
+ two = asm_add_uint128 (one, one);
+ three = asm_add_uint128 (two, one);
+ four = asm_add_uint128 (two, two);

for (; nblocks >= 8; nblocks -= 8)
{
- ctr4 = vec_add_uint128 (ctr, four);
- b0 = rkey0 ^ ctr;
- b1 = rkey0 ^ vec_add_uint128 (ctr, one);
- b2 = rkey0 ^ vec_add_uint128 (ctr, two);
- b3 = rkey0 ^ vec_add_uint128 (ctr, three);
- b4 = rkey0 ^ ctr4;
- b5 = rkey0 ^ vec_add_uint128 (ctr4, one);
- b6 = rkey0 ^ vec_add_uint128 (ctr4, two);
- b7 = rkey0 ^ vec_add_uint128 (ctr4, three);
- ctr = vec_add_uint128 (ctr4, four);
+ b1 = asm_add_uint128 (ctr, one);
+ b2 = asm_add_uint128 (ctr, two);
+ b3 = asm_add_uint128 (ctr, three);
+ b4 = asm_add_uint128 (ctr, four);
+ b5 = asm_add_uint128 (b1, four);
+ b6 = asm_add_uint128 (b2, four);
+ b7 = asm_add_uint128 (b3, four);
+ b0 = asm_xor (rkey0, ctr);
+ rkey = ALIGNED_LOAD (rk, 1);
+ ctr = asm_add_uint128 (b4, four);
+ b1 = asm_xor (rkey0, b1);
+ b2 = asm_xor (rkey0, b2);
+ b3 = asm_xor (rkey0, b3);
+ b0 = asm_cipher_be (b0, rkey);
+ b1 = asm_cipher_be (b1, rkey);
+ b2 = asm_cipher_be (b2, rkey);
+ b3 = asm_cipher_be (b3, rkey);
+ b4 = asm_xor (rkey0, b4);
+ b5 = asm_xor (rkey0, b5);
+ b6 = asm_xor (rkey0, b6);
+ b7 = asm_xor (rkey0, b7);
+ b4 = asm_cipher_be (b4, rkey);
+ b5 = asm_cipher_be (b5, rkey);
+ b6 = asm_cipher_be (b6, rkey);
+ b7 = asm_cipher_be (b7, rkey);

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD(&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey); \
- b4 = vec_cipher_be (b4, rkey); \
- b5 = vec_cipher_be (b5, rkey); \
- b6 = vec_cipher_be (b6, rkey); \
- b7 = vec_cipher_be (b7, rkey);
-
- DO_ROUND(1);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);
+
+ in0 = VEC_LOAD_BE_NOSWAP (in, 0);
DO_ROUND(2);
+ in1 = VEC_LOAD_BE_NOSWAP (in, 1);
DO_ROUND(3);
+ in2 = VEC_LOAD_BE_NOSWAP (in, 2);
DO_ROUND(4);
+ in3 = VEC_LOAD_BE_NOSWAP (in, 3);
DO_ROUND(5);
+ in4 = VEC_LOAD_BE_NOSWAP (in, 4);
DO_ROUND(6);
+ in5 = VEC_LOAD_BE_NOSWAP (in, 5);
DO_ROUND(7);
+ in6 = VEC_LOAD_BE_NOSWAP (in, 6);
DO_ROUND(8);
+ in7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
DO_ROUND(9);
+
if (rounds >= 12)
{
DO_ROUND(10);
@@ -936,43 +1107,68 @@ void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg,

#undef DO_ROUND

- rkey = rkeylast;
- b0 = vec_cipherlast_be (b0, rkey ^ VEC_LOAD_BE (in + 0, bige_const));
- b1 = vec_cipherlast_be (b1, rkey ^ VEC_LOAD_BE (in + 1, bige_const));
- b2 = vec_cipherlast_be (b2, rkey ^ VEC_LOAD_BE (in + 2, bige_const));
- b3 = vec_cipherlast_be (b3, rkey ^ VEC_LOAD_BE (in + 3, bige_const));
- b4 = vec_cipherlast_be (b4, rkey ^ VEC_LOAD_BE (in + 4, bige_const));
- b5 = vec_cipherlast_be (b5, rkey ^ VEC_LOAD_BE (in + 5, bige_const));
- b6 = vec_cipherlast_be (b6, rkey ^ VEC_LOAD_BE (in + 6, bige_const));
- b7 = vec_cipherlast_be (b7, rkey ^ VEC_LOAD_BE (in + 7, bige_const));
-
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
- VEC_STORE_BE (out + 4, b4, bige_const);
- VEC_STORE_BE (out + 5, b5, bige_const);
- VEC_STORE_BE (out + 6, b6, bige_const);
- VEC_STORE_BE (out + 7, b7, bige_const);
-
- in += 8;
+ in0 = VEC_BE_SWAP (in0, bige_const);
+ in1 = VEC_BE_SWAP (in1, bige_const);
+ in2 = VEC_BE_SWAP (in2, bige_const);
+ in3 = VEC_BE_SWAP (in3, bige_const);
+ in4 = VEC_BE_SWAP (in4, bige_const);
+ in5 = VEC_BE_SWAP (in5, bige_const);
+ in6 = VEC_BE_SWAP (in6, bige_const);
+ in7 = VEC_BE_SWAP (in7, bige_const);
+
+ in0 = asm_xor (rkeylast, in0);
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+ in3 = asm_xor (rkeylast, in3);
+ b0 = asm_cipherlast_be (b0, in0);
+ b1 = asm_cipherlast_be (b1, in1);
+ in4 = asm_xor (rkeylast, in4);
+ in5 = asm_xor (rkeylast, in5);
+ b2 = asm_cipherlast_be (b2, in2);
+ b3 = asm_cipherlast_be (b3, in3);
+ in6 = asm_xor (rkeylast, in6);
+ in7 = asm_xor (rkeylast, in7);
+ b4 = asm_cipherlast_be (b4, in4);
+ b5 = asm_cipherlast_be (b5, in5);
+ b6 = asm_cipherlast_be (b6, in6);
+ b7 = asm_cipherlast_be (b7, in7);
+
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
out += 8;
}

if (nblocks >= 4)
{
- b0 = rkey0 ^ ctr;
- b1 = rkey0 ^ vec_add_uint128 (ctr, one);
- b2 = rkey0 ^ vec_add_uint128 (ctr, two);
- b3 = rkey0 ^ vec_add_uint128 (ctr, three);
- ctr = vec_add_uint128 (ctr, four);
+ b1 = asm_add_uint128 (ctr, one);
+ b2 = asm_add_uint128 (ctr, two);
+ b3 = asm_add_uint128 (ctr, three);
+ b0 = asm_xor (rkey0, ctr);
+ ctr = asm_add_uint128 (ctr, four);
+ b1 = asm_xor (rkey0, b1);
+ b2 = asm_xor (rkey0, b2);
+ b3 = asm_xor (rkey0, b3);

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD(&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -982,6 +1178,12 @@ void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg,
DO_ROUND(6);
DO_ROUND(7);
DO_ROUND(8);
+
+ in0 = VEC_LOAD_BE (in, 0, bige_const);
+ in1 = VEC_LOAD_BE (in, 1, bige_const);
+ in2 = VEC_LOAD_BE (in, 2, bige_const);
+ in3 = VEC_LOAD_BE (in, 3, bige_const);
+
DO_ROUND(9);
if (rounds >= 12)
{
@@ -996,16 +1198,21 @@ void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg,

#undef DO_ROUND

- rkey = rkeylast;
- b0 = vec_cipherlast_be (b0, rkey ^ VEC_LOAD_BE (in + 0, bige_const));
- b1 = vec_cipherlast_be (b1, rkey ^ VEC_LOAD_BE (in + 1, bige_const));
- b2 = vec_cipherlast_be (b2, rkey ^ VEC_LOAD_BE (in + 2, bige_const));
- b3 = vec_cipherlast_be (b3, rkey ^ VEC_LOAD_BE (in + 3, bige_const));
-
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
+ in0 = asm_xor (rkeylast, in0);
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+ in3 = asm_xor (rkeylast, in3);
+
+ b0 = asm_cipherlast_be (b0, in0);
+ b1 = asm_cipherlast_be (b1, in1);
+ b2 = asm_cipherlast_be (b2, in2);
+ b3 = asm_cipherlast_be (b3, in3);
+
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
+
in += 4;
out += 4;
nblocks -= 4;
@@ -1015,18 +1222,18 @@ void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg,
for (; nblocks; nblocks--)
{
b = ctr;
- ctr = vec_add_uint128 (ctr, one);
- rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, bige_const);
+ ctr = asm_add_uint128 (ctr, one);
+ rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);

AES_ENCRYPT (b, rounds);

- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);

out++;
in++;
}

- VEC_STORE_BE (ctr_arg, ctr, bige_const);
+ VEC_STORE_BE (ctr_arg, 0, ctr, bige_const);
}


@@ -1034,7 +1241,7 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks,
int encrypt)
{
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
RIJNDAEL_context *ctx = (void *)&c->context.c;
const u128_t *in = (const u128_t *)inbuf_arg;
u128_t *out = (u128_t *)outbuf_arg;
@@ -1043,16 +1250,16 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
block l0, l1, l2, l;
block b0, b1, b2, b3, b4, b5, b6, b7, b;
block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
- block rkey;
+ block rkey, rkeylf;
block ctr, iv;
ROUND_KEY_VARIABLES;

- iv = VEC_LOAD_BE (c->u_iv.iv, bige_const);
- ctr = VEC_LOAD_BE (c->u_ctr.ctr, bige_const);
+ iv = VEC_LOAD_BE (c->u_iv.iv, 0, bige_const);
+ ctr = VEC_LOAD_BE (c->u_ctr.ctr, 0, bige_const);

- l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], bige_const);
- l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], bige_const);
- l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], bige_const);
+ l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], 0, bige_const);
+ l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], 0, bige_const);
+ l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], 0, bige_const);

if (encrypt)
{
@@ -1062,8 +1269,8 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
{
- l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
- b = VEC_LOAD_BE (in, bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);

/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
iv ^= l;
@@ -1074,7 +1281,7 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
AES_ENCRYPT (b, rounds);
b ^= iv;

- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);

in += 1;
out += 1;
@@ -1082,16 +1289,25 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

for (; nblocks >= 8; nblocks -= 8)
{
- b0 = VEC_LOAD_BE (in + 0, bige_const);
- b1 = VEC_LOAD_BE (in + 1, bige_const);
- b2 = VEC_LOAD_BE (in + 2, bige_const);
- b3 = VEC_LOAD_BE (in + 3, bige_const);
- b4 = VEC_LOAD_BE (in + 4, bige_const);
- b5 = VEC_LOAD_BE (in + 5, bige_const);
- b6 = VEC_LOAD_BE (in + 6, bige_const);
- b7 = VEC_LOAD_BE (in + 7, bige_const);
-
- l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), bige_const);
+ b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+ l = VEC_LOAD_BE_NOSWAP (ocb_get_l (c, data_nblocks += 8), 0);
+ b0 = VEC_BE_SWAP(b0, bige_const);
+ b1 = VEC_BE_SWAP(b1, bige_const);
+ b2 = VEC_BE_SWAP(b2, bige_const);
+ b3 = VEC_BE_SWAP(b3, bige_const);
+ b4 = VEC_BE_SWAP(b4, bige_const);
+ b5 = VEC_BE_SWAP(b5, bige_const);
+ b6 = VEC_BE_SWAP(b6, bige_const);
+ b7 = VEC_BE_SWAP(b7, bige_const);
+ l = VEC_BE_SWAP(l, bige_const);

ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;

@@ -1117,15 +1333,15 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
iv = iv7 ^ rkey0;

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey); \
- b4 = vec_cipher_be (b4, rkey); \
- b5 = vec_cipher_be (b5, rkey); \
- b6 = vec_cipher_be (b6, rkey); \
- b7 = vec_cipher_be (b7, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -1134,7 +1350,20 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
DO_ROUND(5);
DO_ROUND(6);
DO_ROUND(7);
+
+ rkeylf = asm_xor (rkeylast, rkey0);
+
DO_ROUND(8);
+
+ iv0 = asm_xor (rkeylf, iv0);
+ iv1 = asm_xor (rkeylf, iv1);
+ iv2 = asm_xor (rkeylf, iv2);
+ iv3 = asm_xor (rkeylf, iv3);
+ iv4 = asm_xor (rkeylf, iv4);
+ iv5 = asm_xor (rkeylf, iv5);
+ iv6 = asm_xor (rkeylf, iv6);
+ iv7 = asm_xor (rkeylf, iv7);
+
DO_ROUND(9);
if (rounds >= 12)
{
@@ -1149,37 +1378,42 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

#undef DO_ROUND

- rkey = rkeylast ^ rkey0;
- b0 = vec_cipherlast_be (b0, rkey ^ iv0);
- b1 = vec_cipherlast_be (b1, rkey ^ iv1);
- b2 = vec_cipherlast_be (b2, rkey ^ iv2);
- b3 = vec_cipherlast_be (b3, rkey ^ iv3);
- b4 = vec_cipherlast_be (b4, rkey ^ iv4);
- b5 = vec_cipherlast_be (b5, rkey ^ iv5);
- b6 = vec_cipherlast_be (b6, rkey ^ iv6);
- b7 = vec_cipherlast_be (b7, rkey ^ iv7);
-
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
- VEC_STORE_BE (out + 4, b4, bige_const);
- VEC_STORE_BE (out + 5, b5, bige_const);
- VEC_STORE_BE (out + 6, b6, bige_const);
- VEC_STORE_BE (out + 7, b7, bige_const);
-
- in += 8;
+ b0 = asm_cipherlast_be (b0, iv0);
+ b1 = asm_cipherlast_be (b1, iv1);
+ b2 = asm_cipherlast_be (b2, iv2);
+ b3 = asm_cipherlast_be (b3, iv3);
+ b4 = asm_cipherlast_be (b4, iv4);
+ b5 = asm_cipherlast_be (b5, iv5);
+ b6 = asm_cipherlast_be (b6, iv6);
+ b7 = asm_cipherlast_be (b7, iv7);
+
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
out += 8;
}

if (nblocks >= 4 && (data_nblocks % 4) == 0)
{
- b0 = VEC_LOAD_BE (in + 0, bige_const);
- b1 = VEC_LOAD_BE (in + 1, bige_const);
- b2 = VEC_LOAD_BE (in + 2, bige_const);
- b3 = VEC_LOAD_BE (in + 3, bige_const);
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);

- l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);

ctr ^= b0 ^ b1 ^ b2 ^ b3;

@@ -1197,11 +1431,11 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
iv = iv3 ^ rkey0;

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -1226,15 +1460,15 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
#undef DO_ROUND

rkey = rkeylast ^ rkey0;
- b0 = vec_cipherlast_be (b0, rkey ^ iv0);
- b1 = vec_cipherlast_be (b1, rkey ^ iv1);
- b2 = vec_cipherlast_be (b2, rkey ^ iv2);
- b3 = vec_cipherlast_be (b3, rkey ^ iv3);
+ b0 = asm_cipherlast_be (b0, rkey ^ iv0);
+ b1 = asm_cipherlast_be (b1, rkey ^ iv1);
+ b2 = asm_cipherlast_be (b2, rkey ^ iv2);
+ b3 = asm_cipherlast_be (b3, rkey ^ iv3);

- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);

in += 4;
out += 4;
@@ -1243,8 +1477,8 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

for (; nblocks; nblocks--)
{
- l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
- b = VEC_LOAD_BE (in, bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);

/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
iv ^= l;
@@ -1255,7 +1489,7 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
AES_ENCRYPT (b, rounds);
b ^= iv;

- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);

in += 1;
out += 1;
@@ -1275,8 +1509,8 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
{
- l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
- b = VEC_LOAD_BE (in, bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);

/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
iv ^= l;
@@ -1287,7 +1521,7 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr ^= b;

- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);

in += 1;
out += 1;
@@ -1295,16 +1529,25 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

for (; nblocks >= 8; nblocks -= 8)
{
- b0 = VEC_LOAD_BE (in + 0, bige_const);
- b1 = VEC_LOAD_BE (in + 1, bige_const);
- b2 = VEC_LOAD_BE (in + 2, bige_const);
- b3 = VEC_LOAD_BE (in + 3, bige_const);
- b4 = VEC_LOAD_BE (in + 4, bige_const);
- b5 = VEC_LOAD_BE (in + 5, bige_const);
- b6 = VEC_LOAD_BE (in + 6, bige_const);
- b7 = VEC_LOAD_BE (in + 7, bige_const);
-
- l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), bige_const);
+ b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+ l = VEC_LOAD_BE_NOSWAP (ocb_get_l (c, data_nblocks += 8), 0);
+ b0 = VEC_BE_SWAP(b0, bige_const);
+ b1 = VEC_BE_SWAP(b1, bige_const);
+ b2 = VEC_BE_SWAP(b2, bige_const);
+ b3 = VEC_BE_SWAP(b3, bige_const);
+ b4 = VEC_BE_SWAP(b4, bige_const);
+ b5 = VEC_BE_SWAP(b5, bige_const);
+ b6 = VEC_BE_SWAP(b6, bige_const);
+ b7 = VEC_BE_SWAP(b7, bige_const);
+ l = VEC_BE_SWAP(l, bige_const);

iv ^= rkey0;

@@ -1328,15 +1571,15 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
iv = iv7 ^ rkey0;

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_ncipher_be (b0, rkey); \
- b1 = vec_ncipher_be (b1, rkey); \
- b2 = vec_ncipher_be (b2, rkey); \
- b3 = vec_ncipher_be (b3, rkey); \
- b4 = vec_ncipher_be (b4, rkey); \
- b5 = vec_ncipher_be (b5, rkey); \
- b6 = vec_ncipher_be (b6, rkey); \
- b7 = vec_ncipher_be (b7, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey); \
+ b4 = asm_ncipher_be (b4, rkey); \
+ b5 = asm_ncipher_be (b5, rkey); \
+ b6 = asm_ncipher_be (b6, rkey); \
+ b7 = asm_ncipher_be (b7, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -1345,7 +1588,20 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
DO_ROUND(5);
DO_ROUND(6);
DO_ROUND(7);
+
+ rkeylf = asm_xor (rkeylast, rkey0);
+
DO_ROUND(8);
+
+ iv0 = asm_xor (rkeylf, iv0);
+ iv1 = asm_xor (rkeylf, iv1);
+ iv2 = asm_xor (rkeylf, iv2);
+ iv3 = asm_xor (rkeylf, iv3);
+ iv4 = asm_xor (rkeylf, iv4);
+ iv5 = asm_xor (rkeylf, iv5);
+ iv6 = asm_xor (rkeylf, iv6);
+ iv7 = asm_xor (rkeylf, iv7);
+
DO_ROUND(9);
if (rounds >= 12)
{
@@ -1360,39 +1616,44 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

#undef DO_ROUND

- rkey = rkeylast ^ rkey0;
- b0 = vec_ncipherlast_be (b0, rkey ^ iv0);
- b1 = vec_ncipherlast_be (b1, rkey ^ iv1);
- b2 = vec_ncipherlast_be (b2, rkey ^ iv2);
- b3 = vec_ncipherlast_be (b3, rkey ^ iv3);
- b4 = vec_ncipherlast_be (b4, rkey ^ iv4);
- b5 = vec_ncipherlast_be (b5, rkey ^ iv5);
- b6 = vec_ncipherlast_be (b6, rkey ^ iv6);
- b7 = vec_ncipherlast_be (b7, rkey ^ iv7);
-
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
- VEC_STORE_BE (out + 4, b4, bige_const);
- VEC_STORE_BE (out + 5, b5, bige_const);
- VEC_STORE_BE (out + 6, b6, bige_const);
- VEC_STORE_BE (out + 7, b7, bige_const);
+ b0 = asm_ncipherlast_be (b0, iv0);
+ b1 = asm_ncipherlast_be (b1, iv1);
+ b2 = asm_ncipherlast_be (b2, iv2);
+ b3 = asm_ncipherlast_be (b3, iv3);
+ b4 = asm_ncipherlast_be (b4, iv4);
+ b5 = asm_ncipherlast_be (b5, iv5);
+ b6 = asm_ncipherlast_be (b6, iv6);
+ b7 = asm_ncipherlast_be (b7, iv7);

ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;

- in += 8;
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
out += 8;
}

if (nblocks >= 4 && (data_nblocks % 4) == 0)
{
- b0 = VEC_LOAD_BE (in + 0, bige_const);
- b1 = VEC_LOAD_BE (in + 1, bige_const);
- b2 = VEC_LOAD_BE (in + 2, bige_const);
- b3 = VEC_LOAD_BE (in + 3, bige_const);
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);

- l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);

iv ^= rkey0;

@@ -1408,11 +1669,11 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
iv = iv3 ^ rkey0;

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_ncipher_be (b0, rkey); \
- b1 = vec_ncipher_be (b1, rkey); \
- b2 = vec_ncipher_be (b2, rkey); \
- b3 = vec_ncipher_be (b3, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -1437,15 +1698,15 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
#undef DO_ROUND

rkey = rkeylast ^ rkey0;
- b0 = vec_ncipherlast_be (b0, rkey ^ iv0);
- b1 = vec_ncipherlast_be (b1, rkey ^ iv1);
- b2 = vec_ncipherlast_be (b2, rkey ^ iv2);
- b3 = vec_ncipherlast_be (b3, rkey ^ iv3);
+ b0 = asm_ncipherlast_be (b0, rkey ^ iv0);
+ b1 = asm_ncipherlast_be (b1, rkey ^ iv1);
+ b2 = asm_ncipherlast_be (b2, rkey ^ iv2);
+ b3 = asm_ncipherlast_be (b3, rkey ^ iv3);

- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);

ctr ^= b0 ^ b1 ^ b2 ^ b3;

@@ -1456,8 +1717,8 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,

for (; nblocks; nblocks--)
{
- l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
- b = VEC_LOAD_BE (in, bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);

/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
iv ^= l;
@@ -1468,15 +1729,15 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
/* Checksum_i = Checksum_{i-1} xor P_i */
ctr ^= b;

- VEC_STORE_BE (out, b, bige_const);
+ VEC_STORE_BE (out, 0, b, bige_const);

in += 1;
out += 1;
}
}

- VEC_STORE_BE (c->u_iv.iv, iv, bige_const);
- VEC_STORE_BE (c->u_ctr.ctr, ctr, bige_const);
+ VEC_STORE_BE (c->u_iv.iv, 0, iv, bige_const);
+ VEC_STORE_BE (c->u_ctr.ctr, 0, ctr, bige_const);
c->u_mode.ocb.data_nblocks = data_nblocks;

return 0;
@@ -1485,7 +1746,7 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
size_t nblocks)
{
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
RIJNDAEL_context *ctx = (void *)&c->context.c;
const u128_t *rk = (u128_t *)&ctx->keyschenc;
const u128_t *abuf = (const u128_t *)abuf_arg;
@@ -1498,19 +1759,19 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
block ctr, iv;
ROUND_KEY_VARIABLES;

- iv = VEC_LOAD_BE (c->u_mode.ocb.aad_offset, bige_const);
- ctr = VEC_LOAD_BE (c->u_mode.ocb.aad_sum, bige_const);
+ iv = VEC_LOAD_BE (c->u_mode.ocb.aad_offset, 0, bige_const);
+ ctr = VEC_LOAD_BE (c->u_mode.ocb.aad_sum, 0, bige_const);

- l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], bige_const);
- l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], bige_const);
- l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], bige_const);
+ l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], 0, bige_const);
+ l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], 0, bige_const);
+ l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], 0, bige_const);

PRELOAD_ROUND_KEYS (rounds);

for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
{
- l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
- b = VEC_LOAD_BE (abuf, bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (abuf, 0, bige_const);

/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
iv ^= l;
@@ -1524,16 +1785,16 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,

for (; nblocks >= 8; nblocks -= 8)
{
- b0 = VEC_LOAD_BE (abuf + 0, bige_const);
- b1 = VEC_LOAD_BE (abuf + 1, bige_const);
- b2 = VEC_LOAD_BE (abuf + 2, bige_const);
- b3 = VEC_LOAD_BE (abuf + 3, bige_const);
- b4 = VEC_LOAD_BE (abuf + 4, bige_const);
- b5 = VEC_LOAD_BE (abuf + 5, bige_const);
- b6 = VEC_LOAD_BE (abuf + 6, bige_const);
- b7 = VEC_LOAD_BE (abuf + 7, bige_const);
+ b0 = VEC_LOAD_BE (abuf, 0, bige_const);
+ b1 = VEC_LOAD_BE (abuf, 1, bige_const);
+ b2 = VEC_LOAD_BE (abuf, 2, bige_const);
+ b3 = VEC_LOAD_BE (abuf, 3, bige_const);
+ b4 = VEC_LOAD_BE (abuf, 4, bige_const);
+ b5 = VEC_LOAD_BE (abuf, 5, bige_const);
+ b6 = VEC_LOAD_BE (abuf, 6, bige_const);
+ b7 = VEC_LOAD_BE (abuf, 7, bige_const);

- l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), 0, bige_const);

frkey = rkey0;
iv ^= frkey;
@@ -1558,15 +1819,15 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
iv = iv7 ^ frkey;

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey); \
- b4 = vec_cipher_be (b4, rkey); \
- b5 = vec_cipher_be (b5, rkey); \
- b6 = vec_cipher_be (b6, rkey); \
- b7 = vec_cipher_be (b7, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -1591,14 +1852,14 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
#undef DO_ROUND

rkey = rkeylast;
- b0 = vec_cipherlast_be (b0, rkey);
- b1 = vec_cipherlast_be (b1, rkey);
- b2 = vec_cipherlast_be (b2, rkey);
- b3 = vec_cipherlast_be (b3, rkey);
- b4 = vec_cipherlast_be (b4, rkey);
- b5 = vec_cipherlast_be (b5, rkey);
- b6 = vec_cipherlast_be (b6, rkey);
- b7 = vec_cipherlast_be (b7, rkey);
+ b0 = asm_cipherlast_be (b0, rkey);
+ b1 = asm_cipherlast_be (b1, rkey);
+ b2 = asm_cipherlast_be (b2, rkey);
+ b3 = asm_cipherlast_be (b3, rkey);
+ b4 = asm_cipherlast_be (b4, rkey);
+ b5 = asm_cipherlast_be (b5, rkey);
+ b6 = asm_cipherlast_be (b6, rkey);
+ b7 = asm_cipherlast_be (b7, rkey);

ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;

@@ -1607,12 +1868,12 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,

if (nblocks >= 4 && (data_nblocks % 4) == 0)
{
- b0 = VEC_LOAD_BE (abuf + 0, bige_const);
- b1 = VEC_LOAD_BE (abuf + 1, bige_const);
- b2 = VEC_LOAD_BE (abuf + 2, bige_const);
- b3 = VEC_LOAD_BE (abuf + 3, bige_const);
+ b0 = VEC_LOAD_BE (abuf, 0, bige_const);
+ b1 = VEC_LOAD_BE (abuf, 1, bige_const);
+ b2 = VEC_LOAD_BE (abuf, 2, bige_const);
+ b3 = VEC_LOAD_BE (abuf, 3, bige_const);

- l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);

frkey = rkey0;
iv ^= frkey;
@@ -1629,11 +1890,11 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
iv = iv3 ^ frkey;

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -1658,10 +1919,10 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
#undef DO_ROUND

rkey = rkeylast;
- b0 = vec_cipherlast_be (b0, rkey);
- b1 = vec_cipherlast_be (b1, rkey);
- b2 = vec_cipherlast_be (b2, rkey);
- b3 = vec_cipherlast_be (b3, rkey);
+ b0 = asm_cipherlast_be (b0, rkey);
+ b1 = asm_cipherlast_be (b1, rkey);
+ b2 = asm_cipherlast_be (b2, rkey);
+ b3 = asm_cipherlast_be (b3, rkey);

ctr ^= b0 ^ b1 ^ b2 ^ b3;

@@ -1671,8 +1932,8 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,

for (; nblocks; nblocks--)
{
- l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
- b = VEC_LOAD_BE (abuf, bige_const);
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (abuf, 0, bige_const);

/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
iv ^= l;
@@ -1684,8 +1945,8 @@ size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
abuf += 1;
}

- VEC_STORE_BE (c->u_mode.ocb.aad_offset, iv, bige_const);
- VEC_STORE_BE (c->u_mode.ocb.aad_sum, ctr, bige_const);
+ VEC_STORE_BE (c->u_mode.ocb.aad_offset, 0, iv, bige_const);
+ VEC_STORE_BE (c->u_mode.ocb.aad_sum, 0, ctr, bige_const);
c->u_mode.ocb.aad_nblocks = data_nblocks;

return 0;
@@ -1696,44 +1957,59 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks, int encrypt)
{
+#ifdef WORDS_BIGENDIAN
static const block vec_bswap64_const =
- { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
static const block vec_bswap128_const =
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+#else
+ static const block vec_bswap64_const =
+ { ~8, ~9, ~10, ~11, ~12, ~13, ~14, ~15, ~0, ~1, ~2, ~3, ~4, ~5, ~6, ~7 };
+ static const block vec_bswap128_const =
+ { ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8, ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0 };
+ static const block vec_tweakin_swap_const =
+ { ~12, ~13, ~14, ~15, ~8, ~9, ~10, ~11, ~4, ~5, ~6, ~7, ~0, ~1, ~2, ~3 };
+#endif
static const unsigned char vec_tweak_const[16] =
{ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0x87 };
static const vector unsigned long long vec_shift63_const =
{ 63, 63 };
static const vector unsigned long long vec_shift1_const =
{ 1, 1 };
- const block bige_const = vec_load_be_const();
+ const block bige_const = asm_load_be_const();
RIJNDAEL_context *ctx = context;
const u128_t *in = (const u128_t *)inbuf_arg;
u128_t *out = (u128_t *)outbuf_arg;
int rounds = ctx->rounds;
- block tweak_tmp, tweak_next, tweak;
- block b0, b1, b2, b3, b4, b5, b6, b7, b, rkey;
+ block tweak;
+ block b0, b1, b2, b3, b4, b5, b6, b7, b, rkey, rkeylf;
block tweak0, tweak1, tweak2, tweak3, tweak4, tweak5, tweak6, tweak7;
block tweak_const, bswap64_const, bswap128_const;
vector unsigned long long shift63_const, shift1_const;
ROUND_KEY_VARIABLES;

- tweak_const = VEC_LOAD_BE (&vec_tweak_const, bige_const);
- bswap64_const = ALIGNED_LOAD (&vec_bswap64_const);
- bswap128_const = ALIGNED_LOAD (&vec_bswap128_const);
- shift63_const = (vector unsigned long long)ALIGNED_LOAD (&vec_shift63_const);
- shift1_const = (vector unsigned long long)ALIGNED_LOAD (&vec_shift1_const);
+ tweak_const = VEC_LOAD_BE (&vec_tweak_const, 0, bige_const);
+ bswap64_const = ALIGNED_LOAD (&vec_bswap64_const, 0);
+ bswap128_const = ALIGNED_LOAD (&vec_bswap128_const, 0);
+ shift63_const = (vector unsigned long long)ALIGNED_LOAD (&vec_shift63_const, 0);
+ shift1_const = (vector unsigned long long)ALIGNED_LOAD (&vec_shift1_const, 0);

- tweak_next = VEC_LOAD_BE (tweak_arg, bige_const);
+#ifdef WORDS_BIGENDIAN
+ tweak = VEC_LOAD_BE (tweak_arg, 0, bige_const);
+ tweak = asm_vperm1 (tweak, bswap128_const);
+#else
+ tweak = VEC_LOAD_BE (tweak_arg, 0, vec_tweakin_swap_const);
+#endif

-#define GEN_TWEAK(tweak, tmp) /* Generate next tweak. */ \
- tmp = vec_vperm(tweak, tweak, bswap64_const); \
- tweak = vec_vperm(tweak, tweak, bswap128_const); \
- tmp = (block)(vec_sra((vector unsigned long long)tmp, shift63_const)) & \
- tweak_const; \
- tweak = (block)vec_sl((vector unsigned long long)tweak, shift1_const); \
- tweak = tweak ^ tmp; \
- tweak = vec_vperm(tweak, tweak, bswap128_const);
+#define GEN_TWEAK(tout, tin) /* Generate next tweak. */ \
+ do { \
+ block tmp1, tmp2; \
+ tmp1 = asm_vperm1((tin), bswap64_const); \
+ tmp2 = (block)vec_sl((vector unsigned long long)(tin), shift1_const); \
+ tmp1 = (block)(vec_sra((vector unsigned long long)tmp1, shift63_const)) & \
+ tweak_const; \
+ tout = asm_xor(tmp1, tmp2); \
+ } while (0)

if (encrypt)
{
@@ -1743,42 +2019,70 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,

for (; nblocks >= 8; nblocks -= 8)
{
- tweak0 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak1 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak2 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak3 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak4 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak5 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak6 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak7 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
-
- b0 = VEC_LOAD_BE (in + 0, bige_const) ^ tweak0 ^ rkey0;
- b1 = VEC_LOAD_BE (in + 1, bige_const) ^ tweak1 ^ rkey0;
- b2 = VEC_LOAD_BE (in + 2, bige_const) ^ tweak2 ^ rkey0;
- b3 = VEC_LOAD_BE (in + 3, bige_const) ^ tweak3 ^ rkey0;
- b4 = VEC_LOAD_BE (in + 4, bige_const) ^ tweak4 ^ rkey0;
- b5 = VEC_LOAD_BE (in + 5, bige_const) ^ tweak5 ^ rkey0;
- b6 = VEC_LOAD_BE (in + 6, bige_const) ^ tweak6 ^ rkey0;
- b7 = VEC_LOAD_BE (in + 7, bige_const) ^ tweak7 ^ rkey0;
+ b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ tweak0 = tweak;
+ GEN_TWEAK (tweak1, tweak0);
+ tweak0 = asm_vperm1 (tweak0, bswap128_const);
+ b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ GEN_TWEAK (tweak2, tweak1);
+ tweak1 = asm_vperm1 (tweak1, bswap128_const);
+ b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+
+ b0 = VEC_BE_SWAP(b0, bige_const);
+ b1 = VEC_BE_SWAP(b1, bige_const);
+ GEN_TWEAK (tweak3, tweak2);
+ tweak2 = asm_vperm1 (tweak2, bswap128_const);
+ GEN_TWEAK (tweak4, tweak3);
+ tweak3 = asm_vperm1 (tweak3, bswap128_const);
+ b2 = VEC_BE_SWAP(b2, bige_const);
+ b3 = VEC_BE_SWAP(b3, bige_const);
+ GEN_TWEAK (tweak5, tweak4);
+ tweak4 = asm_vperm1 (tweak4, bswap128_const);
+ GEN_TWEAK (tweak6, tweak5);
+ tweak5 = asm_vperm1 (tweak5, bswap128_const);
+ b4 = VEC_BE_SWAP(b4, bige_const);
+ b5 = VEC_BE_SWAP(b5, bige_const);
+ GEN_TWEAK (tweak7, tweak6);
+ tweak6 = asm_vperm1 (tweak6, bswap128_const);
+ GEN_TWEAK (tweak, tweak7);
+ tweak7 = asm_vperm1 (tweak7, bswap128_const);
+ b6 = VEC_BE_SWAP(b6, bige_const);
+ b7 = VEC_BE_SWAP(b7, bige_const);
+
+ tweak0 = asm_xor (tweak0, rkey0);
+ tweak1 = asm_xor (tweak1, rkey0);
+ tweak2 = asm_xor (tweak2, rkey0);
+ tweak3 = asm_xor (tweak3, rkey0);
+ tweak4 = asm_xor (tweak4, rkey0);
+ tweak5 = asm_xor (tweak5, rkey0);
+ tweak6 = asm_xor (tweak6, rkey0);
+ tweak7 = asm_xor (tweak7, rkey0);
+
+ b0 = asm_xor (b0, tweak0);
+ b1 = asm_xor (b1, tweak1);
+ b2 = asm_xor (b2, tweak2);
+ b3 = asm_xor (b3, tweak3);
+ b4 = asm_xor (b4, tweak4);
+ b5 = asm_xor (b5, tweak5);
+ b6 = asm_xor (b6, tweak6);
+ b7 = asm_xor (b7, tweak7);

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey); \
- b4 = vec_cipher_be (b4, rkey); \
- b5 = vec_cipher_be (b5, rkey); \
- b6 = vec_cipher_be (b6, rkey); \
- b7 = vec_cipher_be (b7, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -1787,7 +2091,20 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
DO_ROUND(5);
DO_ROUND(6);
DO_ROUND(7);
+
+ rkeylf = asm_xor (rkeylast, rkey0);
+
DO_ROUND(8);
+
+ tweak0 = asm_xor (tweak0, rkeylf);
+ tweak1 = asm_xor (tweak1, rkeylf);
+ tweak2 = asm_xor (tweak2, rkeylf);
+ tweak3 = asm_xor (tweak3, rkeylf);
+ tweak4 = asm_xor (tweak4, rkeylf);
+ tweak5 = asm_xor (tweak5, rkeylf);
+ tweak6 = asm_xor (tweak6, rkeylf);
+ tweak7 = asm_xor (tweak7, rkeylf);
+
DO_ROUND(9);
if (rounds >= 12)
{
@@ -1802,51 +2119,62 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,

#undef DO_ROUND

- rkey = rkeylast;
- b0 = vec_cipherlast_be (b0, rkey ^ tweak0);
- b1 = vec_cipherlast_be (b1, rkey ^ tweak1);
- b2 = vec_cipherlast_be (b2, rkey ^ tweak2);
- b3 = vec_cipherlast_be (b3, rkey ^ tweak3);
- b4 = vec_cipherlast_be (b4, rkey ^ tweak4);
- b5 = vec_cipherlast_be (b5, rkey ^ tweak5);
- b6 = vec_cipherlast_be (b6, rkey ^ tweak6);
- b7 = vec_cipherlast_be (b7, rkey ^ tweak7);
-
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
- VEC_STORE_BE (out + 4, b4, bige_const);
- VEC_STORE_BE (out + 5, b5, bige_const);
- VEC_STORE_BE (out + 6, b6, bige_const);
- VEC_STORE_BE (out + 7, b7, bige_const);
-
- in += 8;
+ b0 = asm_cipherlast_be (b0, tweak0);
+ b1 = asm_cipherlast_be (b1, tweak1);
+ b2 = asm_cipherlast_be (b2, tweak2);
+ b3 = asm_cipherlast_be (b3, tweak3);
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b4 = asm_cipherlast_be (b4, tweak4);
+ b5 = asm_cipherlast_be (b5, tweak5);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b6 = asm_cipherlast_be (b6, tweak6);
+ b7 = asm_cipherlast_be (b7, tweak7);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
out += 8;
}

if (nblocks >= 4)
{
- tweak0 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak1 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak2 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak3 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
-
- b0 = VEC_LOAD_BE (in + 0, bige_const) ^ tweak0 ^ rkey0;
- b1 = VEC_LOAD_BE (in + 1, bige_const) ^ tweak1 ^ rkey0;
- b2 = VEC_LOAD_BE (in + 2, bige_const) ^ tweak2 ^ rkey0;
- b3 = VEC_LOAD_BE (in + 3, bige_const) ^ tweak3 ^ rkey0;
+ tweak0 = tweak;
+ GEN_TWEAK (tweak1, tweak0);
+ GEN_TWEAK (tweak2, tweak1);
+ GEN_TWEAK (tweak3, tweak2);
+ GEN_TWEAK (tweak, tweak3);
+
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+ tweak0 = asm_vperm1 (tweak0, bswap128_const);
+ tweak1 = asm_vperm1 (tweak1, bswap128_const);
+ tweak2 = asm_vperm1 (tweak2, bswap128_const);
+ tweak3 = asm_vperm1 (tweak3, bswap128_const);
+
+ b0 ^= tweak0 ^ rkey0;
+ b1 ^= tweak1 ^ rkey0;
+ b2 ^= tweak2 ^ rkey0;
+ b3 ^= tweak3 ^ rkey0;

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_cipher_be (b0, rkey); \
- b1 = vec_cipher_be (b1, rkey); \
- b2 = vec_cipher_be (b2, rkey); \
- b3 = vec_cipher_be (b3, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -1871,15 +2199,15 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
#undef DO_ROUND

rkey = rkeylast;
- b0 = vec_cipherlast_be (b0, rkey ^ tweak0);
- b1 = vec_cipherlast_be (b1, rkey ^ tweak1);
- b2 = vec_cipherlast_be (b2, rkey ^ tweak2);
- b3 = vec_cipherlast_be (b3, rkey ^ tweak3);
+ b0 = asm_cipherlast_be (b0, rkey ^ tweak0);
+ b1 = asm_cipherlast_be (b1, rkey ^ tweak1);
+ b2 = asm_cipherlast_be (b2, rkey ^ tweak2);
+ b3 = asm_cipherlast_be (b3, rkey ^ tweak3);

- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);

in += 4;
out += 4;
@@ -1888,18 +2216,18 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,

for (; nblocks; nblocks--)
{
- tweak = tweak_next;
+ tweak0 = asm_vperm1 (tweak, bswap128_const);

/* Xor-Encrypt/Decrypt-Xor block. */
- b = VEC_LOAD_BE (in, bige_const) ^ tweak;
+ b = VEC_LOAD_BE (in, 0, bige_const) ^ tweak0;

/* Generate next tweak. */
- GEN_TWEAK (tweak_next, tweak_tmp);
+ GEN_TWEAK (tweak, tweak);

AES_ENCRYPT (b, rounds);

- b ^= tweak;
- VEC_STORE_BE (out, b, bige_const);
+ b ^= tweak0;
+ VEC_STORE_BE (out, 0, b, bige_const);

in++;
out++;
@@ -1919,42 +2247,70 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,

for (; nblocks >= 8; nblocks -= 8)
{
- tweak0 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak1 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak2 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak3 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak4 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak5 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak6 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak7 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
-
- b0 = VEC_LOAD_BE (in + 0, bige_const) ^ tweak0 ^ rkey0;
- b1 = VEC_LOAD_BE (in + 1, bige_const) ^ tweak1 ^ rkey0;
- b2 = VEC_LOAD_BE (in + 2, bige_const) ^ tweak2 ^ rkey0;
- b3 = VEC_LOAD_BE (in + 3, bige_const) ^ tweak3 ^ rkey0;
- b4 = VEC_LOAD_BE (in + 4, bige_const) ^ tweak4 ^ rkey0;
- b5 = VEC_LOAD_BE (in + 5, bige_const) ^ tweak5 ^ rkey0;
- b6 = VEC_LOAD_BE (in + 6, bige_const) ^ tweak6 ^ rkey0;
- b7 = VEC_LOAD_BE (in + 7, bige_const) ^ tweak7 ^ rkey0;
+ b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ tweak0 = tweak;
+ GEN_TWEAK (tweak1, tweak0);
+ tweak0 = asm_vperm1 (tweak0, bswap128_const);
+ b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ GEN_TWEAK (tweak2, tweak1);
+ tweak1 = asm_vperm1 (tweak1, bswap128_const);
+ b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+
+ b0 = VEC_BE_SWAP(b0, bige_const);
+ b1 = VEC_BE_SWAP(b1, bige_const);
+ GEN_TWEAK (tweak3, tweak2);
+ tweak2 = asm_vperm1 (tweak2, bswap128_const);
+ GEN_TWEAK (tweak4, tweak3);
+ tweak3 = asm_vperm1 (tweak3, bswap128_const);
+ b2 = VEC_BE_SWAP(b2, bige_const);
+ b3 = VEC_BE_SWAP(b3, bige_const);
+ GEN_TWEAK (tweak5, tweak4);
+ tweak4 = asm_vperm1 (tweak4, bswap128_const);
+ GEN_TWEAK (tweak6, tweak5);
+ tweak5 = asm_vperm1 (tweak5, bswap128_const);
+ b4 = VEC_BE_SWAP(b4, bige_const);
+ b5 = VEC_BE_SWAP(b5, bige_const);
+ GEN_TWEAK (tweak7, tweak6);
+ tweak6 = asm_vperm1 (tweak6, bswap128_const);
+ GEN_TWEAK (tweak, tweak7);
+ tweak7 = asm_vperm1 (tweak7, bswap128_const);
+ b6 = VEC_BE_SWAP(b6, bige_const);
+ b7 = VEC_BE_SWAP(b7, bige_const);
+
+ tweak0 = asm_xor (tweak0, rkey0);
+ tweak1 = asm_xor (tweak1, rkey0);
+ tweak2 = asm_xor (tweak2, rkey0);
+ tweak3 = asm_xor (tweak3, rkey0);
+ tweak4 = asm_xor (tweak4, rkey0);
+ tweak5 = asm_xor (tweak5, rkey0);
+ tweak6 = asm_xor (tweak6, rkey0);
+ tweak7 = asm_xor (tweak7, rkey0);
+
+ b0 = asm_xor (b0, tweak0);
+ b1 = asm_xor (b1, tweak1);
+ b2 = asm_xor (b2, tweak2);
+ b3 = asm_xor (b3, tweak3);
+ b4 = asm_xor (b4, tweak4);
+ b5 = asm_xor (b5, tweak5);
+ b6 = asm_xor (b6, tweak6);
+ b7 = asm_xor (b7, tweak7);

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_ncipher_be (b0, rkey); \
- b1 = vec_ncipher_be (b1, rkey); \
- b2 = vec_ncipher_be (b2, rkey); \
- b3 = vec_ncipher_be (b3, rkey); \
- b4 = vec_ncipher_be (b4, rkey); \
- b5 = vec_ncipher_be (b5, rkey); \
- b6 = vec_ncipher_be (b6, rkey); \
- b7 = vec_ncipher_be (b7, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey); \
+ b4 = asm_ncipher_be (b4, rkey); \
+ b5 = asm_ncipher_be (b5, rkey); \
+ b6 = asm_ncipher_be (b6, rkey); \
+ b7 = asm_ncipher_be (b7, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -1963,7 +2319,20 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
DO_ROUND(5);
DO_ROUND(6);
DO_ROUND(7);
+
+ rkeylf = asm_xor (rkeylast, rkey0);
+
DO_ROUND(8);
+
+ tweak0 = asm_xor (tweak0, rkeylf);
+ tweak1 = asm_xor (tweak1, rkeylf);
+ tweak2 = asm_xor (tweak2, rkeylf);
+ tweak3 = asm_xor (tweak3, rkeylf);
+ tweak4 = asm_xor (tweak4, rkeylf);
+ tweak5 = asm_xor (tweak5, rkeylf);
+ tweak6 = asm_xor (tweak6, rkeylf);
+ tweak7 = asm_xor (tweak7, rkeylf);
+
DO_ROUND(9);
if (rounds >= 12)
{
@@ -1978,51 +2347,62 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,

#undef DO_ROUND

- rkey = rkeylast;
- b0 = vec_ncipherlast_be (b0, rkey ^ tweak0);
- b1 = vec_ncipherlast_be (b1, rkey ^ tweak1);
- b2 = vec_ncipherlast_be (b2, rkey ^ tweak2);
- b3 = vec_ncipherlast_be (b3, rkey ^ tweak3);
- b4 = vec_ncipherlast_be (b4, rkey ^ tweak4);
- b5 = vec_ncipherlast_be (b5, rkey ^ tweak5);
- b6 = vec_ncipherlast_be (b6, rkey ^ tweak6);
- b7 = vec_ncipherlast_be (b7, rkey ^ tweak7);
-
- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
- VEC_STORE_BE (out + 4, b4, bige_const);
- VEC_STORE_BE (out + 5, b5, bige_const);
- VEC_STORE_BE (out + 6, b6, bige_const);
- VEC_STORE_BE (out + 7, b7, bige_const);
-
- in += 8;
+ b0 = asm_ncipherlast_be (b0, tweak0);
+ b1 = asm_ncipherlast_be (b1, tweak1);
+ b2 = asm_ncipherlast_be (b2, tweak2);
+ b3 = asm_ncipherlast_be (b3, tweak3);
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b4 = asm_ncipherlast_be (b4, tweak4);
+ b5 = asm_ncipherlast_be (b5, tweak5);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b6 = asm_ncipherlast_be (b6, tweak6);
+ b7 = asm_ncipherlast_be (b7, tweak7);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
out += 8;
}

if (nblocks >= 4)
{
- tweak0 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak1 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak2 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
- tweak3 = tweak_next;
- GEN_TWEAK (tweak_next, tweak_tmp);
-
- b0 = VEC_LOAD_BE (in + 0, bige_const) ^ tweak0 ^ rkey0;
- b1 = VEC_LOAD_BE (in + 1, bige_const) ^ tweak1 ^ rkey0;
- b2 = VEC_LOAD_BE (in + 2, bige_const) ^ tweak2 ^ rkey0;
- b3 = VEC_LOAD_BE (in + 3, bige_const) ^ tweak3 ^ rkey0;
+ tweak0 = tweak;
+ GEN_TWEAK (tweak1, tweak0);
+ GEN_TWEAK (tweak2, tweak1);
+ GEN_TWEAK (tweak3, tweak2);
+ GEN_TWEAK (tweak, tweak3);
+
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+ tweak0 = asm_vperm1 (tweak0, bswap128_const);
+ tweak1 = asm_vperm1 (tweak1, bswap128_const);
+ tweak2 = asm_vperm1 (tweak2, bswap128_const);
+ tweak3 = asm_vperm1 (tweak3, bswap128_const);
+
+ b0 ^= tweak0 ^ rkey0;
+ b1 ^= tweak1 ^ rkey0;
+ b2 ^= tweak2 ^ rkey0;
+ b3 ^= tweak3 ^ rkey0;

#define DO_ROUND(r) \
- rkey = ALIGNED_LOAD (&rk[r]); \
- b0 = vec_ncipher_be (b0, rkey); \
- b1 = vec_ncipher_be (b1, rkey); \
- b2 = vec_ncipher_be (b2, rkey); \
- b3 = vec_ncipher_be (b3, rkey);
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey);

DO_ROUND(1);
DO_ROUND(2);
@@ -2047,15 +2427,15 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
#undef DO_ROUND

rkey = rkeylast;
- b0 = vec_ncipherlast_be (b0, rkey ^ tweak0);
- b1 = vec_ncipherlast_be (b1, rkey ^ tweak1);
- b2 = vec_ncipherlast_be (b2, rkey ^ tweak2);
- b3 = vec_ncipherlast_be (b3, rkey ^ tweak3);
+ b0 = asm_ncipherlast_be (b0, rkey ^ tweak0);
+ b1 = asm_ncipherlast_be (b1, rkey ^ tweak1);
+ b2 = asm_ncipherlast_be (b2, rkey ^ tweak2);
+ b3 = asm_ncipherlast_be (b3, rkey ^ tweak3);

- VEC_STORE_BE (out + 0, b0, bige_const);
- VEC_STORE_BE (out + 1, b1, bige_const);
- VEC_STORE_BE (out + 2, b2, bige_const);
- VEC_STORE_BE (out + 3, b3, bige_const);
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);

in += 4;
out += 4;
@@ -2064,25 +2444,30 @@ void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,

for (; nblocks; nblocks--)
{
- tweak = tweak_next;
+ tweak0 = asm_vperm1 (tweak, bswap128_const);

/* Xor-Encrypt/Decrypt-Xor block. */
- b = VEC_LOAD_BE (in, bige_const) ^ tweak;
+ b = VEC_LOAD_BE (in, 0, bige_const) ^ tweak0;

/* Generate next tweak. */
- GEN_TWEAK (tweak_next, tweak_tmp);
+ GEN_TWEAK (tweak, tweak);

AES_DECRYPT (b, rounds);

- b ^= tweak;
- VEC_STORE_BE (out, b, bige_const);
+ b ^= tweak0;
+ VEC_STORE_BE (out, 0, b, bige_const);

in++;
out++;
}
}

- VEC_STORE_BE (tweak_arg, tweak_next, bige_const);
+#ifdef WORDS_BIGENDIAN
+ tweak = asm_vperm1 (tweak, bswap128_const);
+ VEC_STORE_BE (tweak_arg, 0, tweak, bige_const);
+#else
+ VEC_STORE_BE (tweak_arg, 0, tweak, vec_tweakin_swap_const);
+#endif

#undef GEN_TWEAK
}
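
For reference, the reworked GEN_TWEAK above is the standard XTS tweak update, multiplication by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1; the vector version keeps the tweak in its raw layout between iterations and handles the carries with vec_sra by 63 plus the {...,1,...,0x87} mask instead of conditional code. A minimal plain-C sketch of the same update, assuming a little-endian host; the function name is illustrative and not taken from the patch:

#include <stdint.h>
#include <string.h>

/* Multiply the 128-bit XTS tweak by x modulo x^128 + x^7 + x^2 + x + 1.
 * The 16-byte tweak is read as a little-endian 128-bit integer. */
static void xts_mul_x_ref (unsigned char next[16], const unsigned char cur[16])
{
  uint64_t lo, hi, carry;

  memcpy (&lo, cur + 0, 8);   /* little-endian host assumed for brevity */
  memcpy (&hi, cur + 8, 8);

  carry = hi >> 63;                  /* bit 127 of the current tweak */
  hi = (hi << 1) | (lo >> 63);
  lo = (lo << 1) ^ (carry * 0x87);   /* 0x87 <-> x^7 + x^2 + x + 1 */

  memcpy (next + 0, &lo, 8);
  memcpy (next + 8, &hi, 8);
}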


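Likewise, the rkeylf = rkeylast ^ rkey0 value computed between DO_ROUND(7) and DO_ROUND(8) in the 8-block paths relies on the tweak/offset registers having already been combined with rkey0 for the initial whitening: folding rkeylf into them cancels the rkey0 terms and leaves exactly the rkeylast ^ tweak_i operand that vcipherlast/vncipherlast needs, and placing those xors between rounds hides them under the vcipher latency. A toy check of the algebra, with 64-bit scalars standing in for the 128-bit vectors and names that are illustrative only:

#include <assert.h>
#include <stdint.h>

static void fold_identity_demo (uint64_t tweak, uint64_t rkey0, uint64_t rkeylast)
{
  uint64_t masked = tweak ^ rkey0;      /* tweak/offset as whitened before DO_ROUND(1) */
  uint64_t rkeylf = rkeylast ^ rkey0;   /* computed after DO_ROUND(7) */
  uint64_t folded = masked ^ rkeylf;    /* folded in around DO_ROUND(8) */

  assert (folded == (tweak ^ rkeylast)); /* operand wanted by the final cipher round */
}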