Mailing List Archive

[PATCH 3/7] amd64-asm: move constant data to read-only section for cipher algos
* cipher/arcfour-amd64.S: Move constant data to read-only
section; Align text section to 64 bytes and functions to 16 bytes.
* cipher/blowfish-amd64.S: Likewise.
* cipher/camellia-aesni-avx-amd64.S: Likewise.
* cipher/camellia-aesni-avx2-amd64.h: Likewise.
* cipher/camellia-gfni-avx512-amd64.S: Likewise.
* cipher/cast5-amd64.S: Likewise.
* cipher/chacha20-amd64-avx2.S: Likewise.
* cipher/chacha20-amd64-avx512.S: Likewise.
* cipher/chacha20-amd64-ssse3.S: Likewise.
* cipher/des-amd64.S: Likewise.
* cipher/rijndael-amd64.S: Likewise.
* cipher/rijndael-ssse3-amd64-asm.S: Likewise.
* cipher/rijndael-vaes-avx2-amd64.S: Likewise.
* cipher/salsa20-amd64.S: Likewise.
* cipher/serpent-avx2-amd64.S: Likewise.
* cipher/serpent-sse2-amd64.S: Likewise.
* cipher/sm4-aesni-avx-amd64.S: Likewise.
* cipher/sm4-aesni-avx2-amd64.S: Likewise.
* cipher/sm4-gfni-avx2-amd64.S: Likewise.
* cipher/sm4-gfni-avx512-amd64.S: Likewise.
* cipher/twofish-amd64.S: Likewise.
* cipher/twofish-avx2-amd64.S: Likewise.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/arcfour-amd64.S | 2 +-
cipher/blowfish-amd64.S | 19 ++++++++--------
cipher/camellia-aesni-avx-amd64.S | 34 +++++++++++++++++++----------
cipher/camellia-aesni-avx2-amd64.h | 26 ++++++++++++----------
cipher/camellia-gfni-avx512-amd64.S | 23 ++++++++++---------
cipher/cast5-amd64.S | 15 +++++++------
cipher/chacha20-amd64-avx2.S | 14 +++++++-----
cipher/chacha20-amd64-avx512.S | 8 ++++---
cipher/chacha20-amd64-ssse3.S | 18 +++++++++------
cipher/des-amd64.S | 17 ++++++++++-----
cipher/rijndael-amd64.S | 5 +++--
cipher/rijndael-ssse3-amd64-asm.S | 8 ++++++-
cipher/rijndael-vaes-avx2-amd64.S | 3 +++
cipher/salsa20-amd64.S | 7 +++---
cipher/serpent-avx2-amd64.S | 25 ++++++++++++---------
cipher/serpent-sse2-amd64.S | 19 ++++++++--------
cipher/sm4-aesni-avx-amd64.S | 28 ++++++++++++++----------
cipher/sm4-aesni-avx2-amd64.S | 24 ++++++++++++--------
cipher/sm4-gfni-avx2-amd64.S | 32 ++++++++++++++++-----------
cipher/sm4-gfni-avx512-amd64.S | 5 ++++-
cipher/twofish-amd64.S | 23 +++++++++----------
cipher/twofish-avx2-amd64.S | 24 +++++++++++---------
22 files changed, 229 insertions(+), 150 deletions(-)

diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S
index 2abd90a7..d4cd6083 100644
--- a/cipher/arcfour-amd64.S
+++ b/cipher/arcfour-amd64.S
@@ -21,7 +21,7 @@
#include "asm-common-amd64.h"

.text
-.align 16
+.align 64
.globl _gcry_arcfour_amd64
ELF(.type _gcry_arcfour_amd64,@function)
_gcry_arcfour_amd64:
diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S
index 2b4ffa1a..9db3dc1b 100644
--- a/cipher/blowfish-amd64.S
+++ b/cipher/blowfish-amd64.S
@@ -27,6 +27,7 @@
#include "asm-common-amd64.h"

.text
+.align 64

/* structure of BLOWFISH_context: */
#define s0 0
@@ -123,7 +124,7 @@
bswapq RX0; \
movq RX0, (RIO);

-.align 8
+.align 16
ELF(.type __blowfish_enc_blk1,@function;)

__blowfish_enc_blk1:
@@ -155,7 +156,7 @@ __blowfish_enc_blk1:
CFI_ENDPROC();
ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;)

-.align 8
+.align 16
.globl _gcry_blowfish_amd64_do_encrypt
ELF(.type _gcry_blowfish_amd64_do_encrypt,@function;)

@@ -186,7 +187,7 @@ _gcry_blowfish_amd64_do_encrypt:
CFI_ENDPROC();
ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;)

-.align 8
+.align 16
.globl _gcry_blowfish_amd64_encrypt_block
ELF(.type _gcry_blowfish_amd64_encrypt_block,@function;)

@@ -214,7 +215,7 @@ _gcry_blowfish_amd64_encrypt_block:
CFI_ENDPROC();
ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;)

-.align 8
+.align 16
.globl _gcry_blowfish_amd64_decrypt_block
ELF(.type _gcry_blowfish_amd64_decrypt_block,@function;)

@@ -342,7 +343,7 @@ ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_bloc
bswapq RX2; \
bswapq RX3;

-.align 8
+.align 16
ELF(.type __blowfish_enc_blk4,@function;)

__blowfish_enc_blk4:
@@ -371,7 +372,7 @@ __blowfish_enc_blk4:
CFI_ENDPROC();
ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;)

-.align 8
+.align 16
ELF(.type __blowfish_dec_blk4,@function;)

__blowfish_dec_blk4:
@@ -402,7 +403,7 @@ __blowfish_dec_blk4:
CFI_ENDPROC();
ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;)

-.align 8
+.align 16
.globl _gcry_blowfish_amd64_ctr_enc
ELF(.type _gcry_blowfish_amd64_ctr_enc,@function;)
_gcry_blowfish_amd64_ctr_enc:
@@ -472,7 +473,7 @@ _gcry_blowfish_amd64_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;)

-.align 8
+.align 16
.globl _gcry_blowfish_amd64_cbc_dec
ELF(.type _gcry_blowfish_amd64_cbc_dec,@function;)
_gcry_blowfish_amd64_cbc_dec:
@@ -533,7 +534,7 @@ _gcry_blowfish_amd64_cbc_dec:
CFI_ENDPROC();
ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_blowfish_amd64_cfb_dec
ELF(.type _gcry_blowfish_amd64_cfb_dec,@function;)
_gcry_blowfish_amd64_cfb_dec:
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 5c304e57..9240d70b 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -619,7 +619,10 @@
vmovdqu y6, 14 * 16(rio); \
vmovdqu y7, 15 * 16(rio);

-.text
+SECTION_RODATA
+
+ELF(.type _camellia_aesni_avx_data,@object;)
+_camellia_aesni_avx_data:
.align 16

#define SHUFB_BYTES(idx) \
@@ -763,9 +766,11 @@
.L0f0f0f0f:
.long 0x0f0f0f0f

+.text
+.align 64

-.align 8
+.align 16
ELF(.type __camellia_enc_blk16,@function;)

__camellia_enc_blk16:
/* input:
@@ -826,7 +831,7 @@ __camellia_enc_blk16:
CFI_ENDPROC();
ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;)

-.align 8
+.align 16
ELF(.type __camellia_dec_blk16,@function;)

__camellia_dec_blk16:
@@ -897,7 +902,7 @@ ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;)
vpslldq $8, tmp, tmp; \
vpsubq tmp, x, x;

-.align 8
+.align 16
.globl _gcry_camellia_aesni_avx_ctr_enc
ELF(.type _gcry_camellia_aesni_avx_ctr_enc,@function;)

@@ -1025,7 +1030,7 @@ _gcry_camellia_aesni_avx_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;)

-.align 8
+.align 16
.globl _gcry_camellia_aesni_avx_cbc_dec
ELF(.type _gcry_camellia_aesni_avx_cbc_dec,@function;)

@@ -1098,7 +1103,7 @@ _gcry_camellia_aesni_avx_cbc_dec:
CFI_ENDPROC();
ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_camellia_aesni_avx_cfb_dec
ELF(.type _gcry_camellia_aesni_avx_cfb_dec,@function;)

@@ -1180,7 +1185,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
CFI_ENDPROC();
ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;)

-.align 8
+.align 16
.globl _gcry_camellia_aesni_avx_ocb_enc
ELF(.type _gcry_camellia_aesni_avx_ocb_enc,@function;)

@@ -1332,7 +1337,7 @@ _gcry_camellia_aesni_avx_ocb_enc:
CFI_ENDPROC();
ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;)

-.align 8
+.align 16
.globl _gcry_camellia_aesni_avx_ocb_dec
ELF(.type _gcry_camellia_aesni_avx_ocb_dec,@function;)

@@ -1503,7 +1508,7 @@ _gcry_camellia_aesni_avx_ocb_dec:
CFI_ENDPROC();
ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;)

-.align 8
+.align 16
.globl _gcry_camellia_aesni_avx_ocb_auth
ELF(.type _gcry_camellia_aesni_avx_ocb_auth,@function;)

@@ -1720,6 +1725,10 @@ ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;
vpsllq $(64-(nror)), out, out; \
vpaddd t0, out, out;

+SECTION_RODATA
+
+ELF(.type _camellia_aesni_avx_keysetup_data,@object;)
+_camellia_aesni_avx_keysetup_data:

.align 16
.Linv_shift_row_and_unpcklbw:
@@ -1752,8 +1761,9 @@ ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;
.Lsigma6:
.long 0xB3E6C1FD, 0xB05688C2;

+.text

-.align 8
+.align 16
ELF(.type __camellia_avx_setup128,@function;)
__camellia_avx_setup128:
/* input:
@@ -2100,7 +2110,7 @@ __camellia_avx_setup128:
CFI_ENDPROC();
ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;)

-.align 8
+.align 16
ELF(.type __camellia_avx_setup256,@function;)

__camellia_avx_setup256:
@@ -2580,7 +2590,7 @@ __camellia_avx_setup256:
CFI_ENDPROC();
ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;)

-.align 8
+.align 16
.globl _gcry_camellia_aesni_avx_keygen
ELF(.type _gcry_camellia_aesni_avx_keygen,@function;)

diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h
index 411e790f..46c2be81 100644
--- a/cipher/camellia-aesni-avx2-amd64.h
+++ b/cipher/camellia-aesni-avx2-amd64.h
@@ -784,7 +784,8 @@
vmovdqu y6, 14 * 32(rio); \
vmovdqu y7, 15 * 32(rio);

-.text
+SECTION_RODATA
+
.align 32

#define SHUFB_BYTES(idx) \
@@ -997,7 +998,10 @@ ELF(.type FUNC_NAME(_constants),@object;)

ELF(.size FUNC_NAME(_constants),.-FUNC_NAME(_constants);)

-.align 8
+.text
+.align 64
+
+.align 16
ELF(.type FUNC_NAME(enc_blk32),@function;)

FUNC_NAME(enc_blk32):
@@ -1059,7 +1063,7 @@ FUNC_NAME(enc_blk32):
CFI_ENDPROC();
ELF(.size FUNC_NAME(enc_blk32),.-FUNC_NAME(enc_blk32);)

-.align 8
+.align 16
ELF(.type FUNC_NAME(dec_blk32),@function;)

FUNC_NAME(dec_blk32):
@@ -1130,7 +1134,7 @@ ELF(.size FUNC_NAME(dec_blk32),.-FUNC_NAME(dec_blk32);)
vpslldq $8, tmp, tmp; \
vpsubq tmp, x, x;

-.align 8
+.align 16
.globl FUNC_NAME(ctr_enc)
ELF(.type FUNC_NAME(ctr_enc),@function;)

@@ -1325,7 +1329,7 @@ FUNC_NAME(ctr_enc):
CFI_ENDPROC();
ELF(.size FUNC_NAME(ctr_enc),.-FUNC_NAME(ctr_enc);)

-.align 8
+.align 16
.globl FUNC_NAME(cbc_dec)
ELF(.type FUNC_NAME(cbc_dec),@function;)

@@ -1400,7 +1404,7 @@ FUNC_NAME(cbc_dec):
CFI_ENDPROC();
ELF(.size FUNC_NAME(cbc_dec),.-FUNC_NAME(cbc_dec);)

-.align 8
+.align 16
.globl FUNC_NAME(cfb_dec)
ELF(.type FUNC_NAME(cfb_dec),@function;)

@@ -1482,7 +1486,7 @@ FUNC_NAME(cfb_dec):
CFI_ENDPROC();
ELF(.size FUNC_NAME(cfb_dec),.-FUNC_NAME(cfb_dec);)

-.align 8
+.align 16
.globl FUNC_NAME(ocb_enc)
ELF(.type FUNC_NAME(ocb_enc),@function;)

@@ -1654,7 +1658,7 @@ FUNC_NAME(ocb_enc):
CFI_ENDPROC();
ELF(.size FUNC_NAME(ocb_enc),.-FUNC_NAME(ocb_enc);)

-.align 8
+.align 16
.globl FUNC_NAME(ocb_dec)
ELF(.type FUNC_NAME(ocb_dec),@function;)

@@ -1849,7 +1853,7 @@ FUNC_NAME(ocb_dec):
CFI_ENDPROC();
ELF(.size FUNC_NAME(ocb_dec),.-FUNC_NAME(ocb_dec);)

-.align 8
+.align 16
.globl FUNC_NAME(ocb_auth)
ELF(.type FUNC_NAME(ocb_auth),@function;)

@@ -2018,7 +2022,7 @@ FUNC_NAME(ocb_auth):
CFI_ENDPROC();
ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);)

-.align 8
+.align 16
.globl FUNC_NAME(enc_blk1_32)
ELF(.type FUNC_NAME(enc_blk1_32),@function;)

@@ -2126,7 +2130,7 @@ FUNC_NAME(enc_blk1_32):
CFI_ENDPROC();
ELF(.size FUNC_NAME(enc_blk1_32),.-FUNC_NAME(enc_blk1_32);)

-.align 8
+.align 16
.globl FUNC_NAME(dec_blk1_32)
ELF(.type FUNC_NAME(dec_blk1_32),@function;)

diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S
index 14725b4a..7a98a3ce 100644
--- a/cipher/camellia-gfni-avx512-amd64.S
+++ b/cipher/camellia-gfni-avx512-amd64.S
@@ -584,7 +584,7 @@
vmovdqu64 y6, 14 * 64(rio); \
vmovdqu64 y7, 15 * 64(rio);

-.text
+SECTION_RODATA

#define SHUFB_BYTES(idx) \
0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
@@ -691,7 +691,10 @@ ELF(.type _gcry_camellia_gfni_avx512__constants,@object;)

ELF(.size _gcry_camellia_gfni_avx512__constants,.-_gcry_camellia_gfni_avx512__constants;)

-.align 8
+.text
+.align 64
+
+.align 16
ELF(.type __camellia_gfni_avx512_enc_blk64,@function;)

__camellia_gfni_avx512_enc_blk64:
@@ -751,7 +754,7 @@ __camellia_gfni_avx512_enc_blk64:
CFI_ENDPROC();
ELF(.size __camellia_gfni_avx512_enc_blk64,.-__camellia_gfni_avx512_enc_blk64;)

-.align 8
+.align 16
ELF(.type __camellia_gfni_avx512_dec_blk64,@function;)

__camellia_gfni_avx512_dec_blk64:
@@ -820,7 +823,7 @@ ELF(.size __camellia_gfni_avx512_dec_blk64,.-__camellia_gfni_avx512_dec_blk64;)
kaddb %k1, %k1, %k1; \
vpaddq hi_counter1, out, out{%k1};

-.align 8
+.align 16
.globl _gcry_camellia_gfni_avx512_ctr_enc
ELF(.type _gcry_camellia_gfni_avx512_ctr_enc,@function;)

@@ -973,7 +976,7 @@ _gcry_camellia_gfni_avx512_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_camellia_gfni_avx512_ctr_enc,.-_gcry_camellia_gfni_avx512_ctr_enc;)

-.align 8
+.align 16
.globl _gcry_camellia_gfni_avx512_cbc_dec
ELF(.type _gcry_camellia_gfni_avx512_cbc_dec,@function;)

@@ -1035,7 +1038,7 @@ _gcry_camellia_gfni_avx512_cbc_dec:
CFI_ENDPROC();
ELF(.size _gcry_camellia_gfni_avx512_cbc_dec,.-_gcry_camellia_gfni_avx512_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_camellia_gfni_avx512_cfb_dec
ELF(.type _gcry_camellia_gfni_avx512_cfb_dec,@function;)

@@ -1108,7 +1111,7 @@ _gcry_camellia_gfni_avx512_cfb_dec:
CFI_ENDPROC();
ELF(.size _gcry_camellia_gfni_avx512_cfb_dec,.-_gcry_camellia_gfni_avx512_cfb_dec;)

-.align 8
+.align 16
.globl _gcry_camellia_gfni_avx512_ocb_enc
ELF(.type _gcry_camellia_gfni_avx512_ocb_enc,@function;)

@@ -1271,7 +1274,7 @@ _gcry_camellia_gfni_avx512_ocb_enc:
CFI_ENDPROC();
ELF(.size _gcry_camellia_gfni_avx512_ocb_enc,.-_gcry_camellia_gfni_avx512_ocb_enc;)

-.align 8
+.align 16
.globl _gcry_camellia_gfni_avx512_ocb_dec
ELF(.type _gcry_camellia_gfni_avx512_ocb_dec,@function;)

@@ -1440,7 +1443,7 @@ _gcry_camellia_gfni_avx512_ocb_dec:
CFI_ENDPROC();
ELF(.size _gcry_camellia_gfni_avx512_ocb_dec,.-_gcry_camellia_gfni_avx512_ocb_dec;)

-.align 8
+.align 16
.globl _gcry_camellia_gfni_avx512_enc_blk64
ELF(.type _gcry_camellia_gfni_avx512_enc_blk64,@function;)

@@ -1504,7 +1507,7 @@ _gcry_camellia_gfni_avx512_enc_blk64:
CFI_ENDPROC();
ELF(.size _gcry_camellia_gfni_avx512_enc_blk64,.-_gcry_camellia_gfni_avx512_enc_blk64;)

-.align 8
+.align 16
.globl _gcry_camellia_gfni_avx512_dec_blk64
ELF(.type _gcry_camellia_gfni_avx512_dec_blk64,@function;)

diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S
index a804654c..39171587 100644
--- a/cipher/cast5-amd64.S
+++ b/cipher/cast5-amd64.S
@@ -26,6 +26,7 @@
#include "asm-common-amd64.h"

.text
+.align 64

.extern _gcry_cast5_s1to4;

@@ -173,7 +174,7 @@
rorq $32, RLR0; \
movq RLR0, (RIO);

-.align 8
+.align 16
.globl _gcry_cast5_amd64_encrypt_block
ELF(.type _gcry_cast5_amd64_encrypt_block,@function;)

@@ -223,7 +224,7 @@ _gcry_cast5_amd64_encrypt_block:
CFI_ENDPROC();
ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;)

-.align 8
+.align 16
.globl _gcry_cast5_amd64_decrypt_block
ELF(.type _gcry_cast5_amd64_decrypt_block,@function;)

@@ -373,7 +374,7 @@ ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;)
rorq $32, c; \
rorq $32, d;

-.align 8
+.align 16
ELF(.type __cast5_enc_blk4,@function;)

__cast5_enc_blk4:
@@ -403,7 +404,7 @@ __cast5_enc_blk4:
CFI_ENDPROC();
ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;)

-.align 8
+.align 16
ELF(.type __cast5_dec_blk4,@function;)

__cast5_dec_blk4:
@@ -435,7 +436,7 @@ __cast5_dec_blk4:
ret_spec_stop;
ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;)

-.align 8
+.align 16
.globl _gcry_cast5_amd64_ctr_enc
ELF(.type _gcry_cast5_amd64_ctr_enc,@function;)
_gcry_cast5_amd64_ctr_enc:
@@ -512,7 +513,7 @@ _gcry_cast5_amd64_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;)

-.align 8
+.align 16
.globl _gcry_cast5_amd64_cbc_dec
ELF(.type _gcry_cast5_amd64_cbc_dec,@function;)
_gcry_cast5_amd64_cbc_dec:
@@ -586,7 +587,7 @@ _gcry_cast5_amd64_cbc_dec:
CFI_ENDPROC();
ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_cast5_amd64_cfb_dec
ELF(.type _gcry_cast5_amd64_cfb_dec,@function;)
_gcry_cast5_amd64_cfb_dec:
diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S
index 9f2a036a..99ff7469 100644
--- a/cipher/chacha20-amd64-avx2.S
+++ b/cipher/chacha20-amd64-avx2.S
@@ -33,8 +33,6 @@
(defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

-.text
-
#include "asm-common-amd64.h"
#include "asm-poly1305-amd64.h"

@@ -157,8 +155,11 @@
PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
ROTATE2(b1, b2, 7, tmp1);

+SECTION_RODATA
+
+ELF(.type _chacha20_avx2_data,@object;)
.align 32
-chacha20_data:
+_chacha20_avx2_data:
.Lshuf_rol16:
.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
.Lshuf_rol8:
@@ -168,7 +169,10 @@ chacha20_data:
.Lunsigned_cmp:
.long 0x80000000

-.align 8
+.text
+.align 64
+
+.align 16
.globl _gcry_chacha20_amd64_avx2_blocks8
ELF(.type _gcry_chacha20_amd64_avx2_blocks8,@function;)

@@ -333,7 +337,7 @@ ELF(.size _gcry_chacha20_amd64_avx2_blocks8,

#define _ /*_*/

-.align 8
+.align 16
.globl _gcry_chacha20_poly1305_amd64_avx2_blocks8
ELF(.type _gcry_chacha20_poly1305_amd64_avx2_blocks8,@function;)

diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S
index 4b183528..e39a505a 100644
--- a/cipher/chacha20-amd64-avx512.S
+++ b/cipher/chacha20-amd64-avx512.S
@@ -33,8 +33,6 @@
(defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

-.text
-
#include "asm-common-amd64.h"

/* register macros */
@@ -269,6 +267,8 @@
ROTATE(x1, 7); ROTATE(y1, 7); \
WORD_SHUF(x1, shuf_x1); WORD_SHUF(y1, shuf_x1);

+SECTION_RODATA
+
.align 64
ELF(.type _gcry_chacha20_amd64_avx512_data,@object;)
_gcry_chacha20_amd64_avx512_data:
@@ -286,7 +286,9 @@ _gcry_chacha20_amd64_avx512_data:
.byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
ELF(.size _gcry_chacha20_amd64_avx512_data,.-_gcry_chacha20_amd64_avx512_data)

-.align 16
+.text
+
+.align 64
.globl _gcry_chacha20_amd64_avx512_blocks
ELF(.type _gcry_chacha20_amd64_avx512_blocks,@function;)
_gcry_chacha20_amd64_avx512_blocks:
diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S
index 6c737978..50c4755e 100644
--- a/cipher/chacha20-amd64-ssse3.S
+++ b/cipher/chacha20-amd64-ssse3.S
@@ -33,8 +33,6 @@
(defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

-.text
-
#include "asm-common-amd64.h"
#include "asm-poly1305-amd64.h"

@@ -151,7 +149,10 @@
PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
ROTATE2(b1, b2, 7, tmp1, tmp2);

-chacha20_data:
+SECTION_RODATA
+
+ELF(.type _chacha20_ssse3_data,@object;)
+_chacha20_ssse3_data:
.align 16
.Lshuf_rol16:
.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
@@ -164,7 +165,10 @@ chacha20_data:
.Lunsigned_cmp:
.long 0x80000000,0x80000000,0x80000000,0x80000000

-.align 8
+.text
+.align 64
+
+.align 16
.globl _gcry_chacha20_amd64_ssse3_blocks4
ELF(.type _gcry_chacha20_amd64_ssse3_blocks4,@function;)

@@ -366,7 +370,7 @@ ELF(.size _gcry_chacha20_amd64_ssse3_blocks4,
ROTATE(x1, 7, tmp1); \
WORD_SHUF(x1, shuf_x1);

-.align 8
+.align 16
.globl _gcry_chacha20_amd64_ssse3_blocks1
ELF(.type _gcry_chacha20_amd64_ssse3_blocks1,@function;)

@@ -513,7 +517,7 @@ ELF(.size _gcry_chacha20_amd64_ssse3_blocks1,

#define _ /*_*/

-.align 8
+.align 16
.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks4
ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks4,@function;)

@@ -781,7 +785,7 @@ ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4,
2-way && 1-way stitched chacha20-poly1305
**********************************************************************/

-.align 8
+.align 16
.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks1
ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks1,@function;)

diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S
index c1bf9f29..44a8a90c 100644
--- a/cipher/des-amd64.S
+++ b/cipher/des-amd64.S
@@ -26,6 +26,7 @@
#include "asm-common-amd64.h"

.text
+.align 64

#define s1 0
#define s2 ((s1) + (64*8))
@@ -180,7 +181,7 @@
movl left##d, (io); \
movl right##d, 4(io);

-.align 8
+.align 16
.globl _gcry_3des_amd64_crypt_block
ELF(.type _gcry_3des_amd64_crypt_block,@function;)

@@ -473,7 +474,7 @@ ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;)
movl left##d, (io); \
movl right##d, 4(io);

-.align 8
+.align 16
ELF(.type _gcry_3des_amd64_crypt_blk3,@function;)
_gcry_3des_amd64_crypt_blk3:
/* input:
@@ -548,7 +549,7 @@ _gcry_3des_amd64_crypt_blk3:
CFI_ENDPROC();
ELF(.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;)

-.align 8
+.align 16
.globl _gcry_3des_amd64_cbc_dec
ELF(.type _gcry_3des_amd64_cbc_dec,@function;)
_gcry_3des_amd64_cbc_dec:
@@ -603,6 +604,6 @@ _gcry_3des_amd64_cbc_dec:
popq %rdx; /*src*/
CFI_POP_TMP_REG();
popq %rsi; /*dst*/
CFI_POP_TMP_REG();

bswapl RR0d;
@@ -646,7 +648,7 @@ _gcry_3des_amd64_cbc_dec:
CFI_ENDPROC();
ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_3des_amd64_ctr_enc
ELF(.type _gcry_3des_amd64_ctr_enc,@function;)
_gcry_3des_amd64_ctr_enc:
@@ -744,7 +746,7 @@ _gcry_3des_amd64_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_3des_amd64_cfb_dec
ELF(.type _gcry_3des_amd64_cfb_dec,@function;)
_gcry_3des_amd64_cfb_dec:
@@ -841,7 +843,12 @@ _gcry_3des_amd64_cfb_dec:
CFI_ENDPROC();
ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;)

+
+SECTION_RODATA
+ELF(.type _des_amd64_data,@object;)
+
.align 16
+_des_amd64_data:
.L_s1:
.quad 0x0010100001010400, 0x0000000000000000
.quad 0x0000100000010000, 0x0010100001010404
diff --git a/cipher/rijndael-amd64.S b/cipher/rijndael-amd64.S
index 6e3cc819..3d5a0bd2 100644
--- a/cipher/rijndael-amd64.S
+++ b/cipher/rijndael-amd64.S
@@ -26,6 +26,7 @@
#include "asm-common-amd64.h"

.text
+.align 64

/* table macros */
#define E0 (0)
@@ -200,7 +201,7 @@
#define lastencround(round) \
do_lastencround((round) + 1);

-.align 8
+.align 16
.globl _gcry_aes_amd64_encrypt_block
ELF(.type _gcry_aes_amd64_encrypt_block,@function;)

@@ -377,7 +378,7 @@ ELF(.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block;)
#define lastdecround(round) \
do_lastdecround(round);

-.align 8
+.align 16
.globl _gcry_aes_amd64_decrypt_block
ELF(.type _gcry_aes_amd64_decrypt_block,@function;)

diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S
index b98dca26..52cd0db2 100644
--- a/cipher/rijndael-ssse3-amd64-asm.S
+++ b/cipher/rijndael-ssse3-amd64-asm.S
@@ -43,10 +43,12 @@
#include "asm-common-amd64.h"

.text
+.align 64

##
## _gcry_aes_ssse3_enc_preload
##
+.align 16
ELF(.type _gcry_aes_ssse3_enc_preload,@function)
.globl _gcry_aes_ssse3_enc_preload
_gcry_aes_ssse3_enc_preload:
@@ -68,6 +70,7 @@ ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload)
##
## _gcry_aes_ssse3_dec_preload
##
+.align 16
ELF(.type _gcry_aes_ssse3_dec_preload,@function)
.globl _gcry_aes_ssse3_dec_preload
_gcry_aes_ssse3_dec_preload:
@@ -689,8 +692,11 @@ ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core)
## ##
########################################################

+SECTION_RODATA
+
.align 16
-ELF(.type _aes_consts,@object)
+ELF(.type _aes_ssse3_consts,@object)
+_aes_ssse3_consts:
.Laes_consts:
_aes_consts:
# s0F
diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
index 13fe7ab0..a801ad90 100644
--- a/cipher/rijndael-vaes-avx2-amd64.S
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -27,6 +27,7 @@
#include "asm-common-amd64.h"

.text
+.align 64

/**********************************************************************
helper macros
@@ -3313,6 +3314,8 @@ ELF(.size _gcry_vaes_avx2_ecb_crypt_amd64,.-_gcry_vaes_avx2_ecb_crypt_amd64)
/**********************************************************************
constants
**********************************************************************/
+SECTION_RODATA
+
ELF(.type _gcry_vaes_consts,@object)
_gcry_vaes_consts:
.align 32
diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S
index 64626063..b681a060 100644
--- a/cipher/salsa20-amd64.S
+++ b/cipher/salsa20-amd64.S
@@ -31,8 +31,9 @@
#include "asm-common-amd64.h"

.text
+.align 64

-.align 8
+.align 16
.globl _gcry_salsa20_amd64_keysetup
ELF(.type _gcry_salsa20_amd64_keysetup,@function;)
_gcry_salsa20_amd64_keysetup:
@@ -86,7 +87,7 @@ _gcry_salsa20_amd64_keysetup:
ret_spec_stop
CFI_ENDPROC();

-.align 8
+.align 16
.globl _gcry_salsa20_amd64_ivsetup
ELF(.type _gcry_salsa20_amd64_ivsetup,@function;)
_gcry_salsa20_amd64_ivsetup:
@@ -102,7 +103,7 @@ _gcry_salsa20_amd64_ivsetup:
ret_spec_stop
CFI_ENDPROC();

-.align 8
+.align 16
.globl _gcry_salsa20_amd64_encrypt_blocks
ELF(.type _gcry_salsa20_amd64_encrypt_blocks,@function;)
_gcry_salsa20_amd64_encrypt_blocks:
diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S
index 54ff61e4..4da0a228 100644
--- a/cipher/serpent-avx2-amd64.S
+++ b/cipher/serpent-avx2-amd64.S
@@ -400,8 +400,9 @@
BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);

.text
+.align 64

-.align 8
+.align 16
ELF(.type __serpent_enc_blk16,@function;)
__serpent_enc_blk16:
/* input:
@@ -491,7 +492,7 @@ __serpent_enc_blk16:
CFI_ENDPROC();
ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;)

-.align 8
+.align 16
ELF(.type __serpent_dec_blk16,@function;)
__serpent_dec_blk16:
/* input:
@@ -583,7 +584,7 @@ __serpent_dec_blk16:
CFI_ENDPROC();
ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;)

-.align 8
+.align 16
.globl _gcry_serpent_avx2_blk16
ELF(.type _gcry_serpent_avx2_blk16,@function;)
_gcry_serpent_avx2_blk16:
@@ -639,7 +640,7 @@ ELF(.size _gcry_serpent_avx2_blk16,.-_gcry_serpent_avx2_blk16;)
vpslldq $8, tmp, tmp; \
vpsubq tmp, x, x;

-.align 8
+.align 16
.globl _gcry_serpent_avx2_ctr_enc
ELF(.type _gcry_serpent_avx2_ctr_enc,@function;)
_gcry_serpent_avx2_ctr_enc:
@@ -751,7 +752,7 @@ _gcry_serpent_avx2_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;)

-.align 8
+.align 16
.globl _gcry_serpent_avx2_cbc_dec
ELF(.type _gcry_serpent_avx2_cbc_dec,@function;)
_gcry_serpent_avx2_cbc_dec:
@@ -804,7 +805,7 @@ _gcry_serpent_avx2_cbc_dec:
CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_serpent_avx2_cfb_dec
ELF(.type _gcry_serpent_avx2_cfb_dec,@function;)
_gcry_serpent_avx2_cfb_dec:
@@ -859,7 +860,7 @@ _gcry_serpent_avx2_cfb_dec:
CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;)

-.align 8
+.align 16
.globl _gcry_serpent_avx2_ocb_enc
ELF(.type _gcry_serpent_avx2_ocb_enc,@function;)

@@ -973,7 +974,7 @@ _gcry_serpent_avx2_ocb_enc:
CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;)

-.align 8
+.align 16
.globl _gcry_serpent_avx2_ocb_dec
ELF(.type _gcry_serpent_avx2_ocb_dec,@function;)

@@ -1097,7 +1098,7 @@ _gcry_serpent_avx2_ocb_dec:
CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;)

-.align 8
+.align 16
.globl _gcry_serpent_avx2_ocb_auth
ELF(.type _gcry_serpent_avx2_ocb_auth,@function;)

@@ -1200,9 +1201,13 @@ _gcry_serpent_avx2_ocb_auth:
CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;)

-.align 16
+
+SECTION_RODATA
+ELF(.type _serpent_avx2_consts,@object)
+_serpent_avx2_consts:

/* For CTR-mode IV byteswap */
+.align 16
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index 01723a2a..e7a250d9 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -422,8 +422,9 @@
BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);

.text
+.align 64

-.align 8
+.align 16
ELF(.type __serpent_enc_blk8,@function;)
__serpent_enc_blk8:
/* input:
@@ -513,7 +514,7 @@ __serpent_enc_blk8:
CFI_ENDPROC();
ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;)

-.align 8
+.align 16
ELF(.type __serpent_dec_blk8,@function;)
__serpent_dec_blk8:
/* input:
@@ -605,7 +606,7 @@ __serpent_dec_blk8:
CFI_ENDPROC();
ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;)

-.align 8
+.align 16
.globl _gcry_serpent_sse2_blk8
ELF(.type _gcry_serpent_sse2_blk8,@function;)
_gcry_serpent_sse2_blk8:
@@ -670,7 +671,7 @@ _gcry_serpent_sse2_blk8:
CFI_ENDPROC();
ELF(.size _gcry_serpent_sse2_blk8,.-_gcry_serpent_sse2_blk8;)

-.align 8
+.align 16
.globl _gcry_serpent_sse2_ctr_enc
ELF(.type _gcry_serpent_sse2_ctr_enc,@function;)
_gcry_serpent_sse2_ctr_enc:
@@ -802,7 +803,7 @@ _gcry_serpent_sse2_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;)

-.align 8
+.align 16
.globl _gcry_serpent_sse2_cbc_dec
ELF(.type _gcry_serpent_sse2_cbc_dec,@function;)
_gcry_serpent_sse2_cbc_dec:
@@ -865,7 +866,7 @@ _gcry_serpent_sse2_cbc_dec:
CFI_ENDPROC();
ELF(.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_serpent_sse2_cfb_dec
ELF(.type _gcry_serpent_sse2_cfb_dec,@function;)
_gcry_serpent_sse2_cfb_dec:
@@ -931,7 +932,7 @@ _gcry_serpent_sse2_cfb_dec:
CFI_ENDPROC();
ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;)

-.align 8
+.align 16
.globl _gcry_serpent_sse2_ocb_enc
ELF(.type _gcry_serpent_sse2_ocb_enc,@function;)

@@ -1045,7 +1046,7 @@ _gcry_serpent_sse2_ocb_enc:
CFI_ENDPROC();
ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;)

-.align 8
+.align 16
.globl _gcry_serpent_sse2_ocb_dec
ELF(.type _gcry_serpent_sse2_ocb_dec,@function;)

@@ -1169,7 +1170,7 @@ _gcry_serpent_sse2_ocb_dec:
CFI_ENDPROC();
ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;)

-.align 8
+.align 16
.globl _gcry_serpent_sse2_ocb_auth
ELF(.type _gcry_serpent_sse2_ocb_auth,@function;)

diff --git a/cipher/sm4-aesni-avx-amd64.S b/cipher/sm4-aesni-avx-amd64.S
index 7a99e070..bb0d20c6 100644
--- a/cipher/sm4-aesni-avx-amd64.S
+++ b/cipher/sm4-aesni-avx-amd64.S
@@ -97,9 +97,12 @@
4-way && 8-way SM4 with AES-NI and AVX
**********************************************************************/

-.text
+SECTION_RODATA
.align 16

+ELF(.type _sm4_aesni_avx_consts,@object)
+_sm4_aesni_avx_consts:
+
/*
* Following four affine transform look-up tables are from work by
* Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
@@ -152,7 +155,10 @@
.L0f0f0f0f:
.long 0x0f0f0f0f

-.align 8
+.text
+.align 64
+
+.align 16
.globl _gcry_sm4_aesni_avx_expand_key
ELF(.type _gcry_sm4_aesni_avx_expand_key,@function;)
_gcry_sm4_aesni_avx_expand_key:
@@ -244,7 +250,7 @@ _gcry_sm4_aesni_avx_expand_key:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx_expand_key,.-_gcry_sm4_aesni_avx_expand_key;)

-.align 8
+.align 16
ELF(.type sm4_aesni_avx_crypt_blk1_4,@function;)
sm4_aesni_avx_crypt_blk1_4:
/* input:
@@ -349,7 +355,7 @@ sm4_aesni_avx_crypt_blk1_4:
CFI_ENDPROC();
ELF(.size sm4_aesni_avx_crypt_blk1_4,.-sm4_aesni_avx_crypt_blk1_4;)

-.align 8
+.align 16
ELF(.type __sm4_crypt_blk8,@function;)
__sm4_crypt_blk8:
/* input:
@@ -458,7 +464,7 @@ __sm4_crypt_blk8:
CFI_ENDPROC();
ELF(.size __sm4_crypt_blk8,.-__sm4_crypt_blk8;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx_crypt_blk1_8
ELF(.type _gcry_sm4_aesni_avx_crypt_blk1_8,@function;)
_gcry_sm4_aesni_avx_crypt_blk1_8:
@@ -512,7 +518,7 @@ _gcry_sm4_aesni_avx_crypt_blk1_8:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx_crypt_blk1_8,.-_gcry_sm4_aesni_avx_crypt_blk1_8;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx_ctr_enc
ELF(.type _gcry_sm4_aesni_avx_ctr_enc,@function;)
_gcry_sm4_aesni_avx_ctr_enc:
@@ -586,7 +592,7 @@ _gcry_sm4_aesni_avx_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx_ctr_enc,.-_gcry_sm4_aesni_avx_ctr_enc;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx_cbc_dec
ELF(.type _gcry_sm4_aesni_avx_cbc_dec,@function;)
_gcry_sm4_aesni_avx_cbc_dec:
@@ -635,7 +641,7 @@ _gcry_sm4_aesni_avx_cbc_dec:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx_cbc_dec,.-_gcry_sm4_aesni_avx_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx_cfb_dec
ELF(.type _gcry_sm4_aesni_avx_cfb_dec,@function;)
_gcry_sm4_aesni_avx_cfb_dec:
@@ -687,7 +693,7 @@ _gcry_sm4_aesni_avx_cfb_dec:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx_cfb_dec,.-_gcry_sm4_aesni_avx_cfb_dec;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx_ocb_enc
ELF(.type _gcry_sm4_aesni_avx_ocb_enc,@function;)

@@ -786,7 +792,7 @@ _gcry_sm4_aesni_avx_ocb_enc:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx_ocb_enc,.-_gcry_sm4_aesni_avx_ocb_enc;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx_ocb_dec
ELF(.type _gcry_sm4_aesni_avx_ocb_dec,@function;)

@@ -895,7 +901,7 @@ _gcry_sm4_aesni_avx_ocb_dec:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx_ocb_dec,.-_gcry_sm4_aesni_avx_ocb_dec;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx_ocb_auth
ELF(.type _gcry_sm4_aesni_avx_ocb_auth,@function;)

diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S
index e09fed8f..db94be90 100644
--- a/cipher/sm4-aesni-avx2-amd64.S
+++ b/cipher/sm4-aesni-avx2-amd64.S
@@ -118,9 +118,12 @@
16-way SM4 with AES-NI and AVX
**********************************************************************/

-.text
+SECTION_RODATA
.align 16

+ELF(.type _sm4_aesni_avx2_consts,@object)
+_sm4_aesni_avx2_consts:
+
/*
* Following four affine transform look-up tables are from work by
* Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
@@ -173,7 +176,10 @@
.L0f0f0f0f:
.long 0x0f0f0f0f

-.align 8
+.text
+.align 64
+
+.align 16
ELF(.type __sm4_crypt_blk16,@function;)
__sm4_crypt_blk16:
/* input:
@@ -288,7 +294,7 @@ __sm4_crypt_blk16:
CFI_ENDPROC();
ELF(.size __sm4_crypt_blk16,.-__sm4_crypt_blk16;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx2_crypt_blk1_16
ELF(.type _gcry_sm4_aesni_avx2_crypt_blk1_16,@function;)
_gcry_sm4_aesni_avx2_crypt_blk1_16:
@@ -354,7 +360,7 @@ ELF(.size _gcry_sm4_aesni_avx2_crypt_blk1_16,.-_gcry_sm4_aesni_avx2_crypt_blk1_1
vpslldq $8, tmp, tmp; \
vpsubq tmp, x, x;

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx2_ctr_enc
ELF(.type _gcry_sm4_aesni_avx2_ctr_enc,@function;)
_gcry_sm4_aesni_avx2_ctr_enc:
@@ -464,7 +470,7 @@ _gcry_sm4_aesni_avx2_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx2_ctr_enc,.-_gcry_sm4_aesni_avx2_ctr_enc;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx2_cbc_dec
ELF(.type _gcry_sm4_aesni_avx2_cbc_dec,@function;)
_gcry_sm4_aesni_avx2_cbc_dec:
@@ -515,7 +521,7 @@ _gcry_sm4_aesni_avx2_cbc_dec:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx2_cbc_dec,.-_gcry_sm4_aesni_avx2_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx2_cfb_dec
ELF(.type _gcry_sm4_aesni_avx2_cfb_dec,@function;)
_gcry_sm4_aesni_avx2_cfb_dec:
@@ -568,7 +574,7 @@ _gcry_sm4_aesni_avx2_cfb_dec:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx2_cfb_dec,.-_gcry_sm4_aesni_avx2_cfb_dec;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx2_ocb_enc
ELF(.type _gcry_sm4_aesni_avx2_ocb_enc,@function;)

@@ -680,7 +686,7 @@ _gcry_sm4_aesni_avx2_ocb_enc:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx2_ocb_enc,.-_gcry_sm4_aesni_avx2_ocb_enc;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx2_ocb_dec
ELF(.type _gcry_sm4_aesni_avx2_ocb_dec,@function;)

@@ -802,7 +808,7 @@ _gcry_sm4_aesni_avx2_ocb_dec:
CFI_ENDPROC();
ELF(.size _gcry_sm4_aesni_avx2_ocb_dec,.-_gcry_sm4_aesni_avx2_ocb_dec;)

-.align 8
+.align 16
.globl _gcry_sm4_aesni_avx2_ocb_auth
ELF(.type _gcry_sm4_aesni_avx2_ocb_auth,@function;)

diff --git a/cipher/sm4-gfni-avx2-amd64.S b/cipher/sm4-gfni-avx2-amd64.S
index 4ec0ea39..7c87400e 100644
--- a/cipher/sm4-gfni-avx2-amd64.S
+++ b/cipher/sm4-gfni-avx2-amd64.S
@@ -87,9 +87,12 @@
#define RB2x %xmm14
#define RB3x %xmm15

-.text
+SECTION_RODATA
.align 32

+ELF(.type _sm4_gfni_avx2_consts,@object)
+_sm4_gfni_avx2_consts:
+
/* Affine transform, SM4 field to AES field */
.Lpre_affine_s:
.byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
@@ -133,7 +136,10 @@
.Lbswap32_mask:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

-.align 8
+.text
+.align 64
+
+.align 16
.globl _gcry_sm4_gfni_avx2_expand_key
ELF(.type _gcry_sm4_gfni_avx2_expand_key,@function;)
_gcry_sm4_gfni_avx2_expand_key:
@@ -216,7 +222,7 @@ _gcry_sm4_gfni_avx2_expand_key:
CFI_ENDPROC();
ELF(.size _gcry_sm4_gfni_avx2_expand_key,.-_gcry_sm4_gfni_avx2_expand_key;)

-.align 8
+.align 16
ELF(.type sm4_gfni_avx2_crypt_blk1_4,@function;)
sm4_gfni_avx2_crypt_blk1_4:
/* input:
@@ -314,7 +320,7 @@ sm4_gfni_avx2_crypt_blk1_4:
CFI_ENDPROC();
ELF(.size sm4_gfni_avx2_crypt_blk1_4,.-sm4_gfni_avx2_crypt_blk1_4;)

-.align 8
+.align 16
ELF(.type __sm4_gfni_crypt_blk8,@function;)
__sm4_gfni_crypt_blk8:
/* input:
@@ -415,7 +421,7 @@ __sm4_gfni_crypt_blk8:
CFI_ENDPROC();
ELF(.size __sm4_gfni_crypt_blk8,.-__sm4_gfni_crypt_blk8;)

-.align 8
+.align 16
ELF(.type _gcry_sm4_gfni_avx2_crypt_blk1_8,@function;)
_gcry_sm4_gfni_avx2_crypt_blk1_8:
/* input:
@@ -472,7 +478,7 @@ ELF(.size _gcry_sm4_gfni_avx2_crypt_blk1_8,.-_gcry_sm4_gfni_avx2_crypt_blk1_8;)
16-way SM4 with GFNI and AVX2
**********************************************************************/

-.align 8
+.align 16
ELF(.type __sm4_gfni_crypt_blk16,@function;)
__sm4_gfni_crypt_blk16:
/* input:
@@ -573,7 +579,7 @@ __sm4_gfni_crypt_blk16:
CFI_ENDPROC();
ELF(.size __sm4_gfni_crypt_blk16,.-__sm4_gfni_crypt_blk16;)

-.align 8
+.align 16
.globl _gcry_sm4_gfni_avx2_crypt_blk1_16
ELF(.type _gcry_sm4_gfni_avx2_crypt_blk1_16,@function;)
_gcry_sm4_gfni_avx2_crypt_blk1_16:
@@ -641,7 +647,7 @@ ELF(.size _gcry_sm4_gfni_avx2_crypt_blk1_16,.-_gcry_sm4_gfni_avx2_crypt_blk1_16;
vpslldq $8, tmp, tmp; \
vpsubq tmp, x, x;

-.align 8
+.align 16
.globl _gcry_sm4_gfni_avx2_ctr_enc
ELF(.type _gcry_sm4_gfni_avx2_ctr_enc,@function;)
_gcry_sm4_gfni_avx2_ctr_enc:
@@ -751,7 +757,7 @@ _gcry_sm4_gfni_avx2_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_sm4_gfni_avx2_ctr_enc,.-_gcry_sm4_gfni_avx2_ctr_enc;)

-.align 8
+.align 16
.globl _gcry_sm4_gfni_avx2_cbc_dec
ELF(.type _gcry_sm4_gfni_avx2_cbc_dec,@function;)
_gcry_sm4_gfni_avx2_cbc_dec:
@@ -802,7 +808,7 @@ _gcry_sm4_gfni_avx2_cbc_dec:
CFI_ENDPROC();
ELF(.size _gcry_sm4_gfni_avx2_cbc_dec,.-_gcry_sm4_gfni_avx2_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_sm4_gfni_avx2_cfb_dec
ELF(.type _gcry_sm4_gfni_avx2_cfb_dec,@function;)
_gcry_sm4_gfni_avx2_cfb_dec:
@@ -855,7 +861,7 @@ _gcry_sm4_gfni_avx2_cfb_dec:
CFI_ENDPROC();
ELF(.size _gcry_sm4_gfni_avx2_cfb_dec,.-_gcry_sm4_gfni_avx2_cfb_dec;)

-.align 8
+.align 16
.globl _gcry_sm4_gfni_avx2_ocb_enc
ELF(.type _gcry_sm4_gfni_avx2_ocb_enc,@function;)

@@ -967,7 +973,7 @@ _gcry_sm4_gfni_avx2_ocb_enc:
CFI_ENDPROC();
ELF(.size _gcry_sm4_gfni_avx2_ocb_enc,.-_gcry_sm4_gfni_avx2_ocb_enc;)

-.align 8
+.align 16
.globl _gcry_sm4_gfni_avx2_ocb_dec
ELF(.type _gcry_sm4_gfni_avx2_ocb_dec,@function;)

@@ -1089,7 +1095,7 @@ _gcry_sm4_gfni_avx2_ocb_dec:
CFI_ENDPROC();
ELF(.size _gcry_sm4_gfni_avx2_ocb_dec,.-_gcry_sm4_gfni_avx2_ocb_dec;)

-.align 8
+.align 16
.globl _gcry_sm4_gfni_avx2_ocb_auth
ELF(.type _gcry_sm4_gfni_avx2_ocb_auth,@function;)

diff --git a/cipher/sm4-gfni-avx512-amd64.S b/cipher/sm4-gfni-avx512-amd64.S
index 0f9899d4..00a1c921 100644
--- a/cipher/sm4-gfni-avx512-amd64.S
+++ b/cipher/sm4-gfni-avx512-amd64.S
@@ -103,7 +103,7 @@
#define RB2z %zmm14
#define RB3z %zmm15

-.text
+SECTION_RODATA
.align 32

/* Affine transform, SM4 field to AES field */
@@ -146,6 +146,9 @@
.quad 2, 0
.quad 3, 0

+.text
+.align 64
+
.align 16
.globl _gcry_sm4_gfni_avx512_expand_key
ELF(.type _gcry_sm4_gfni_avx512_expand_key,@function;)
diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S
index 8998d296..b19a5b1b 100644
--- a/cipher/twofish-amd64.S
+++ b/cipher/twofish-amd64.S
@@ -26,6 +26,7 @@
#include "asm-common-amd64.h"

.text
+.align 64

/* structure of TWOFISH_context: */
#define s0 0
@@ -161,7 +162,7 @@
xorl (w + 4 * (m))(CTX), x; \
movl x, (4 * (n))(out);

-.align 8
+.align 16
.globl _gcry_twofish_amd64_encrypt_block
ELF(.type _gcry_twofish_amd64_encrypt_block,@function;)

@@ -215,7 +216,7 @@ _gcry_twofish_amd64_encrypt_block:
CFI_ENDPROC();
ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;)

-.align 8
+.align 16
.globl _gcry_twofish_amd64_decrypt_block
ELF(.type _gcry_twofish_amd64_decrypt_block,@function;)

@@ -486,7 +487,7 @@ ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;
rorq $32, RAB2; \
outunpack3(RAB, 2);

-.align 8
+.align 16
ELF(.type __twofish_enc_blk3,@function;)

__twofish_enc_blk3:
@@ -515,7 +516,7 @@ __twofish_enc_blk3:
CFI_ENDPROC();
ELF(.size __twofish_enc_blk3,.-__twofish_enc_blk3;)

-.align 8
+.align 16
ELF(.type __twofish_dec_blk3,@function;)

__twofish_dec_blk3:
@@ -544,7 +545,7 @@ __twofish_dec_blk3:
CFI_ENDPROC();
ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;)

-.align 8
+.align 16
.globl _gcry_twofish_amd64_blk3
ELF(.type _gcry_twofish_amd64_blk3,@function;)
_gcry_twofish_amd64_blk3:
@@ -618,7 +619,7 @@ _gcry_twofish_amd64_blk3:
CFI_ENDPROC();
ELF(.size _gcry_twofish_amd64_blk3,.-_gcry_twofish_amd64_blk3;)

-.align 8
+.align 16
.globl _gcry_twofish_amd64_ctr_enc
ELF(.type _gcry_twofish_amd64_ctr_enc,@function;)
_gcry_twofish_amd64_ctr_enc:
@@ -719,7 +720,7 @@ _gcry_twofish_amd64_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;)

-.align 8
+.align 16
.globl _gcry_twofish_amd64_cbc_dec
ELF(.type _gcry_twofish_amd64_cbc_dec,@function;)
_gcry_twofish_amd64_cbc_dec:
@@ -804,7 +805,7 @@ _gcry_twofish_amd64_cbc_dec:
CFI_ENDPROC();
ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_twofish_amd64_cfb_dec
ELF(.type _gcry_twofish_amd64_cfb_dec,@function;)
_gcry_twofish_amd64_cfb_dec:
@@ -889,7 +890,7 @@ _gcry_twofish_amd64_cfb_dec:
CFI_ENDPROC();
ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;)

-.align 8
+.align 16
.globl _gcry_twofish_amd64_ocb_enc
ELF(.type _gcry_twofish_amd64_ocb_enc,@function;)
_gcry_twofish_amd64_ocb_enc:
@@ -1015,7 +1016,7 @@ _gcry_twofish_amd64_ocb_enc:
CFI_ENDPROC();
ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;)

-.align 8
+.align 16
.globl _gcry_twofish_amd64_ocb_dec
ELF(.type _gcry_twofish_amd64_ocb_dec,@function;)
_gcry_twofish_amd64_ocb_dec:
@@ -1149,7 +1150,7 @@ _gcry_twofish_amd64_ocb_dec:
CFI_ENDPROC();
ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;)

-.align 8
+.align 16
.globl _gcry_twofish_amd64_ocb_auth
ELF(.type _gcry_twofish_amd64_ocb_auth,@function;)
_gcry_twofish_amd64_ocb_auth:
diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S
index 0cb9a64c..19fe0d9c 100644
--- a/cipher/twofish-avx2-amd64.S
+++ b/cipher/twofish-avx2-amd64.S
@@ -27,6 +27,7 @@
#include "asm-common-amd64.h"

.text
+.align 64

/* structure of TWOFISH_context: */
#define s0 0
@@ -402,7 +403,7 @@
outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);

-.align 8
+.align 16
ELF(.type __twofish_enc_blk16,@function;)
__twofish_enc_blk16:
/* input:
@@ -435,7 +436,7 @@ __twofish_enc_blk16:
CFI_ENDPROC();
ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;)

-.align 8
+.align 16
ELF(.type __twofish_dec_blk16,@function;)
__twofish_dec_blk16:
/* input:
@@ -468,7 +469,7 @@ __twofish_dec_blk16:
CFI_ENDPROC();
ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;)

-.align 8
+.align 16
.globl _gcry_twofish_avx2_blk16
ELF(.type _gcry_twofish_avx2_blk16,@function;)
_gcry_twofish_avx2_blk16:
@@ -520,7 +521,7 @@ ELF(.size _gcry_twofish_avx2_blk16,.-_gcry_twofish_avx2_blk16;)
vpslldq $8, tmp, tmp; \
vpsubq tmp, x, x;

-.align 8
+.align 16
.globl _gcry_twofish_avx2_ctr_enc
ELF(.type _gcry_twofish_avx2_ctr_enc,@function;)
_gcry_twofish_avx2_ctr_enc:
@@ -632,7 +633,7 @@ _gcry_twofish_avx2_ctr_enc:
CFI_ENDPROC();
ELF(.size _gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;)

-.align 8
+.align 16
.globl _gcry_twofish_avx2_cbc_dec
ELF(.type _gcry_twofish_avx2_cbc_dec,@function;)
_gcry_twofish_avx2_cbc_dec:
@@ -685,7 +686,7 @@ _gcry_twofish_avx2_cbc_dec:
CFI_ENDPROC();
ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;)

-.align 8
+.align 16
.globl _gcry_twofish_avx2_cfb_dec
ELF(.type _gcry_twofish_avx2_cfb_dec,@function;)
_gcry_twofish_avx2_cfb_dec:
@@ -740,7 +741,7 @@ _gcry_twofish_avx2_cfb_dec:
CFI_ENDPROC();
ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;)

-.align 8
+.align 16
.globl _gcry_twofish_avx2_ocb_enc
ELF(.type _gcry_twofish_avx2_ocb_enc,@function;)

@@ -854,7 +855,7 @@ _gcry_twofish_avx2_ocb_enc:
CFI_ENDPROC();
ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;)

-.align 8
+.align 16
.globl _gcry_twofish_avx2_ocb_dec
ELF(.type _gcry_twofish_avx2_ocb_dec,@function;)

@@ -979,7 +980,7 @@ _gcry_twofish_avx2_ocb_dec:
CFI_ENDPROC();
ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;)

-.align 8
+.align 16
.globl _gcry_twofish_avx2_ocb_auth
ELF(.type _gcry_twofish_avx2_ocb_auth,@function;)

@@ -1082,10 +1083,13 @@ _gcry_twofish_avx2_ocb_auth:
CFI_ENDPROC();
ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;)

+SECTION_RODATA
+
.align 16

/* For CTR-mode IV byteswap */
- _gcry_twofish_bswap128_mask:
+ELF(.type _gcry_twofish_bswap128_mask,@object)
+_gcry_twofish_bswap128_mask:
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
ELF(.size _gcry_twofish_bswap128_mask,.-_gcry_twofish_bswap128_mask;)
--
2.37.2


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@gnupg.org
https://lists.gnupg.org/mailman/listinfo/gcrypt-devel