-rw-r--r-- 11973 lib1305-20250415/crypto_onetimeauth/poly1305/amd64-maa64-g1/poly1305_maa64_g1.S raw
/* assembly to compute poly1305 */
#include "crypto_asm_hidden.h"
// linker define poly1305_maa64_g1
// linker use mask2
// linker use mask2c
// linker use p0
// linker use p1
// linker use p2
#define mask2 CRYPTO_SHARED_NAMESPACE(mask2)
#define mask2c CRYPTO_SHARED_NAMESPACE(mask2c)
#define p0 CRYPTO_SHARED_NAMESPACE(p0)
#define p1 CRYPTO_SHARED_NAMESPACE(p1)
#define p2 CRYPTO_SHARED_NAMESPACE(p2)
.p2align 5
ASM_HIDDEN _CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1)
ASM_HIDDEN CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1)
.global _CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1)
.global CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1)
/*
 * poly1305_maa64_g1: Poly1305 accumulation, final reduction and tag output
 * using 64-bit mul/add/adc arithmetic (GNU as, AT&T syntax).
 *
 * Register arguments as used by the code below:
 *   rdi  destination; 16 bytes are written at 0(rdi) and 8(rdi)
 *   rsi  message pointer, consumed in 16-byte blocks
 *   rdx  32-byte key: bytes 0..15 are the multiplier (r15 : r14),
 *        bytes 16..31 are added to the reduced accumulator at the end
 *   rcx  number of blocks, counting a trailing partial block;
 *        0 is accepted and yields the second key half as the tag
 *   r8   bit length of the last block, 0 meaning a full 16-byte block
 *   r9   compared against 8 on the single-block path only
 *        NOTE(review): presumably the byte length of the last block;
 *        confirm against the C caller
 *
 * NOTE(review): the multiplier is used exactly as loaded; presumably the
 * caller has already clamped it - confirm.
 * NOTE(review): mask2 is presumably 3 and mask2c its complement, and
 * p0..p2 presumably hold the limbs of 2^130 - 5; they are defined
 * outside this file.
 *
 * Stack frame (rsp is realigned to 32 bytes, 128 bytes reserved):
 *     0  caller rsp              56  rdi (destination)
 *     8  r12    16  r13          64  r8
 *    24  r14    32  r15          72  r9
 *    40  rbx    48  rbp          80  key bytes 16..23
 *                                88  key bytes 24..31
 *                                96  8-byte scratch for a partial chunk
 *
 * Callee-saved registers are restored before returning; rax, rcx, rdx,
 * rsi, rdi, r8-r11 and the flags are clobbered.
 * NOTE(review): the frame copy of the second key half and the scratch
 * word are not cleared before returning.
 */
_CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1):
CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1):
    /* build an aligned frame and save everything that must survive */
    movq %rsp,%r11
    andq $-32,%rsp
    subq $128,%rsp
    movq %r11,0(%rsp)
    movq %r12,8(%rsp)
    movq %r13,16(%rsp)
    movq %r14,24(%rsp)
    movq %r15,32(%rsp)
    movq %rbx,40(%rsp)
    movq %rbp,48(%rsp)
    movq %rdi,56(%rsp)
    movq %r8,64(%rsp)
    movq %r9,72(%rsp)
    /* store high 16 bytes of key */
    movq 16(%rdx),%r14
    movq 24(%rdx),%r15
    movq %r14,80(%rsp)
    movq %r15,88(%rsp)
    /* key = (r15 : r14) */
    movq 0(%rdx),%r14
    movq 8(%rdx),%r15
    /* initialize a quad-word on the stack with 0 */
    movq $0,96(%rsp)
    /* an empty message (zero blocks) leaves the accumulator at 0, so the
     * tag is just the high 16 bytes of the key: zero (r10 : r9 : r8) and
     * go straight to the final steps without touching the message
     * buffer; every non-empty path overwrites r8, r9 and r10 before
     * reading them
     */
    xorq %r8,%r8
    xorq %r9,%r9
    xorq %r10,%r10
    testq %rcx,%rcx
    je .L13
    /* if the message has a single block */
    cmpq $1,%rcx
    je .L9
    /* message block = (rbp : rbx) */
    movq 0(%rsi),%rbx
    movq 8(%rsi),%rbp
    /* else loop around and multiply the 129-bit (3-limb)
     * message block with the 128-bit (2-limb) key;
     * set the 129th bit in %rdi before proceeding
     */
    movq $1,%rdi
.L1:
    /* integer multiplication of (rdi : rbp : rbx) by (r15 : r14)
     *
     * three 128-bit accumulators are used:
     *   (r9  : r8 )  weight 2^0   (holds weight-2^62 terms until the
     *                              shift by 62 below)
     *   (r11 : r10)  weight 2^64
     *   (r13 : r12)  weight 2^128
     * a word w of weight 2^192 is folded on the fly using
     * 2^192 = 2^62 * 2^130 = 5 * 2^62 = 2^62 + 2^64 (mod 2^130 - 5),
     * i.e. w is added once to r8 (pre-shift) and once to r10
     */
    /* rdi * r15 has weight 2^192: low word -> r8 and r10; the high
     * word has weight 2^256 = 2^128 + 2^126, hence r12 and, split as
     * (rdx >> 2 : rdx << 62), the pair (r11 : r10)
     */
    movq %rdi,%rax
    mulq %r15
    movq %rax,%r8
    xorq %r9,%r9
    movq %rax,%r10
    xorq %r11,%r11
    movq %rdx,%r12
    xorq %r13,%r13
    xorq %rax,%rax
    shld $62,%rdx,%rax
    shlq $62,%rdx
    addq %rdx,%r10
    adcq %rax,%r11
    /* rbp * r15 has weight 2^128; its high word is folded */
    movq %rbp,%rax
    mulq %r15
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    /* rdi * r14 has weight 2^128; its high word is folded */
    movq %rdi,%rax
    mulq %r14
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    /* all folded words are in: bring (r9 : r8) to weight 2^0 */
    shld $62,%r8,%r9
    shlq $62,%r8
    /* rbx * r14 has weight 2^0 */
    movq %rbx,%rax
    mulq %r14
    addq %rax,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    /* rbx * r15 has weight 2^64 */
    movq %rbx,%rax
    mulq %r15
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13
    /* rbp * r14 has weight 2^64 */
    movq %rbp,%rax
    mulq %r14
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13
    /* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8) */
    /* push each accumulator's high word into the next one up */
    addq %r9,%r10
    adcq $0,%r11
    addq %r11,%r12
    adcq $0,%r13
    /* reduction on (r13 : r12 : r10 : r8) */
    /* with q = (r13 : r12) >> 2, add 4*q and then q to the two low
     * limbs (5*q in total); the two low bits of r12 plus the carries
     * become the new top limb rdi
     */
    movq %r12,%rdi
    andq mask2(%rip),%rdi
    andq mask2c(%rip),%r12
    addq %r12,%r8
    adcq %r13,%r10
    adcq $0,%rdi
    shrd $2,%r13,%r12
    shrq $2,%r13
    addq %r12,%r8
    adcq %r13,%r10
    adcq $0,%rdi
    /* accumulator = (rdi : rbp : rbx) */
    movq %r8,%rbx
    movq %r10,%rbp
    /* advance to the next block; rcx = blocks still to be absorbed */
    addq $16,%rsi
    subq $1,%rcx
    cmpq $2,%rcx
    jg .L2
    je .L3
    jl .L4
.L2:
    /* add the next message block */
    /* adcq $1 adds the carry and the block's 2^128 padding bit */
    addq 0(%rsi),%rbx
    adcq 8(%rsi),%rbp
    adcq $1,%rdi
    jmp .L1
.L3:
    /* add the second last block and proceed */
    addq 0(%rsi),%rbx
    adcq 8(%rsi),%rbp
    adcq $1,%rdi
    /* integer multiplication (same schedule as the loop body at .L1) */
    movq %rdi,%rax
    mulq %r15
    movq %rax,%r8
    xorq %r9,%r9
    movq %rax,%r10
    xorq %r11,%r11
    movq %rdx,%r12
    xorq %r13,%r13
    xorq %rax,%rax
    shld $62,%rdx,%rax
    shlq $62,%rdx
    addq %rdx,%r10
    adcq %rax,%r11
    movq %rbp,%rax
    mulq %r15
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    movq %rdi,%rax
    mulq %r14
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    shld $62,%r8,%r9
    shlq $62,%r8
    movq %rbx,%rax
    mulq %r14
    addq %rax,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    movq %rbx,%rax
    mulq %r15
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13
    movq %rbp,%rax
    mulq %r14
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13
    /* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8) */
    addq %r9,%r10
    adcq $0,%r11
    addq %r11,%r12
    adcq $0,%r13
    /* reduction on (r13 : r12 : r10 : r8) */
    movq %r12,%rdi
    andq mask2(%rip),%rdi
    andq mask2c(%rip),%r12
    addq %r12,%r8
    adcq %r13,%r10
    adcq $0,%rdi
    shrd $2,%r13,%r12
    shrq $2,%r13
    addq %r12,%r8
    adcq %r13,%r10
    adcq $0,%rdi
    movq %r8,%rbx
    movq %r10,%rbp
    addq $16,%rsi
    subq $1,%rcx
.L4:
    /* process the last block */
    /* park the top limb in r12: rdi is needed as the copy destination */
    movq %rdi,%r12
    /* if the last block is full */
    cmpq $0,64(%rsp)
    je .L7
    /* if the last block has 8 bytes */
    cmpq $64,64(%rsp)
    je .L6
    /* if the last block has 1 to 7 bytes */
    jl .L5
    /* else if the last block has 9 to 15 bytes */
    /* first chunk of message block = (r8) */
    movq 0(%rsi),%r8
    addq $8,%rsi
    /* rax = 128 - bits; rcx = (bits - 64) / 8 = bytes past the first 8 */
    movq $128,%rax
    subq 64(%rsp),%rax
    movq $64,%rcx
    subq %rax,%rcx
    shrq $3,%rcx
    /* copy only the valid tail bytes into the zeroed scratch word */
    leaq 96(%rsp),%rdi
    rep movsb (%rsi),(%rdi)
    /* second chunk of message block = (r9) */
    /* r11 = 2^(bits - 64) - 1 masks the valid bits; r11 + 1 is the
     * padding bit placed right above them
     */
    movq 96(%rsp),%r9
    movq $-1,%r11
    movq %rax,%rcx
    shrq %cl,%r11
    andq %r11,%r9
    addq $1,%r11
    orq %r11,%r9
    movq $0,%r10
    jmp .L8
.L5:
    /* 1 to 7 bytes: copy bits / 8 bytes into the zeroed scratch word */
    movq 64(%rsp),%rcx
    shrq $3,%rcx
    leaq 96(%rsp),%rdi
    rep movsb (%rsi),(%rdi)
    /* first chunk of message block = (r8) */
    /* mask to the valid bits, then set the padding bit above them */
    movq 96(%rsp),%r8
    movq $-1,%r11
    movb $64,%cl
    subb 64(%rsp),%cl
    shrq %cl,%r11
    andq %r11,%r8
    addq $1,%r11
    orq %r11,%r8
    /* second chunk of message block = (r9) */
    movq $0,%r9
    movq $0,%r10
    jmp .L8
.L6:
    /* exactly 8 bytes: the padding bit is bit 0 of the second chunk */
    movq 0(%rsi),%r8
    movq $1,%r9
    movq $0,%r10
    jmp .L8
.L7:
    /* full block: the padding bit is the 2^128 limb */
    movq 0(%rsi),%r8
    movq 8(%rsi),%r9
    movq $1,%r10
.L8:
    /* restore the top limb and add the padded block (r10 : r9 : r8) */
    movq %r12,%rdi
    addq %r8,%rbx
    adcq %r9,%rbp
    adcq %r10,%rdi
    /* integer multiplication (same schedule as the loop body at .L1) */
    movq %rdi,%rax
    mulq %r15
    movq %rax,%r8
    xorq %r9,%r9
    movq %rax,%r10
    xorq %r11,%r11
    movq %rdx,%r12
    xorq %r13,%r13
    xorq %rax,%rax
    shld $62,%rdx,%rax
    shlq $62,%rdx
    addq %rdx,%r10
    adcq %rax,%r11
    movq %rbp,%rax
    mulq %r15
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    movq %rdi,%rax
    mulq %r14
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    shld $62,%r8,%r9
    shlq $62,%r8
    movq %rbx,%rax
    mulq %r14
    addq %rax,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    movq %rbx,%rax
    mulq %r15
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13
    movq %rbp,%rax
    mulq %r14
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13
    /* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8) */
    /* from here on the middle limb lives in r9 and r10 becomes the
     * top limb, which is the layout .L13 expects
     */
    addq %r10,%r9
    adcq $0,%r11
    addq %r11,%r12
    adcq $0,%r13
    /* reduction on (r13 : r12 : r9 : r8) */
    movq %r12,%r10
    andq mask2(%rip),%r10
    andq mask2c(%rip),%r12
    addq %r12,%r8
    adcq %r13,%r9
    adcq $0,%r10
    shrd $2,%r13,%r12
    shrq $2,%r13
    addq %r12,%r8
    adcq %r13,%r9
    adcq $0,%r10
    jmp .L13
.L9:
    /* if the single message block is full */
    cmpq $0,64(%rsp)
    je .L12
    /* if the single message block has 1 to 7 bytes */
    /* NOTE(review): this test reads the saved r9 (72(%rsp)) against 8,
     * whereas the multi-block path at .L4 reads the saved r8 against
     * 64 - confirm that the caller passes both values consistently
     */
    cmpq $8,72(%rsp)
    jl .L10
    /* if the single message block has 8 bytes */
    je .L11
    /* else if the single message block has 9 to 15 bytes */
    /* first chunk of message block = (rbx) */
    movq 0(%rsi),%rbx
    addq $8,%rsi
    /* rax = 128 - bits; rcx = (bits - 64) / 8 = bytes past the first 8 */
    movq $128,%rax
    subq 64(%rsp),%rax
    movq $64,%rcx
    subq %rax,%rcx
    shrq $3,%rcx
    leaq 96(%rsp),%rdi
    rep movsb (%rsi),(%rdi)
    /* second chunk of message block = (rbp) */
    /* mask to the valid bits, then set the padding bit above them */
    movq 96(%rsp),%rbp
    movq $-1,%r11
    movq %rax,%rcx
    shrq %cl,%r11
    andq %r11,%rbp
    addq $1,%r11
    orq %r11,%rbp
    /* integer multiplication */
    /* the block has no 2^128 limb, so the rdi products of the general
     * schedule drop out and the accumulators simply start at 0
     */
    xorq %r8,%r8
    xorq %r9,%r9
    xorq %r10,%r10
    xorq %r11,%r11
    xorq %r12,%r12
    xorq %r13,%r13
    movq %rbp,%rax
    mulq %r15
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    shld $62,%r8,%r9
    shlq $62,%r8
    movq %rbx,%rax
    mulq %r14
    addq %rax,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    movq %rbx,%rax
    mulq %r15
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13
    movq %rbp,%rax
    mulq %r14
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13
    /* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8) */
    addq %r10,%r9
    adcq $0,%r11
    addq %r11,%r12
    adcq $0,%r13
    /* reduction on the integer product (r13 : r12 : r9 : r8) */
    movq %r12,%r10
    andq mask2(%rip),%r10
    andq mask2c(%rip),%r12
    addq %r12,%r8
    adcq %r13,%r9
    adcq $0,%r10
    shrd $2,%r13,%r12
    shrq $2,%r13
    addq %r12,%r8
    adcq %r13,%r9
    adcq $0,%r10
    jmp .L13
.L10:
    /* read the remainder bytes onto stack */
    movq 64(%rsp),%rcx
    shrq $3,%rcx
    leaq 96(%rsp),%rdi
    rep movsb (%rsi),(%rdi)
    /* message block = (rbx) */
    /* mask to the valid bits, then set the padding bit above them */
    movq 96(%rsp),%rbx
    movq $-1,%r11
    movb $64,%cl
    subb 64(%rsp),%cl
    shrq %cl,%r11
    andq %r11,%rbx
    addq $1,%r11
    orq %r11,%rbx
    /* integer multiplication */
    /* one limb times two limbs: the three-limb product
     * (r10 : r9 : r8) goes to the final reduction as it is
     */
    movq %r14,%rax
    mulq %rbx
    movq %rax,%r8
    movq %rdx,%r9
    xorq %r10,%r10
    movq %r15,%rax
    mulq %rbx
    addq %rax,%r9
    adcq %rdx,%r10
    jmp .L13
.L11:
    /* message block = (rbx) */
    movq 0(%rsi),%rbx
    /* integer multiplication */
    /* the padded block is rbx + 2^64, so the product is
     * rbx * key + key * 2^64
     */
    movq %r14,%rax
    mulq %rbx
    movq %rax,%r8
    movq %rdx,%r9
    xorq %r10,%r10
    movq %r15,%rax
    mulq %rbx
    addq %rax,%r9
    adcq %rdx,%r10
    xorq %r11,%r11
    addq %r14,%r9
    adcq %r15,%r10
    adcq %r11,%r11
    /* reduction on the integer product (r11 : r10 : r9 : r8) */
    /* with q = (r11 : r10) >> 2, add 4*q and then q to the two low
     * limbs; the two low bits of r10 plus the carries stay in r10
     */
    movq %r10,%r13
    andq mask2(%rip),%r10
    andq mask2c(%rip),%r13
    addq %r13,%r8
    adcq %r11,%r9
    adcq $0,%r10
    shrd $2,%r11,%r13
    shrq $2,%r11
    addq %r13,%r8
    adcq %r11,%r9
    adcq $0,%r10
    jmp .L13
.L12:
    /* message block = (rbp : rbx) */
    movq 0(%rsi),%rbx
    movq 8(%rsi),%rbp
    /* integer multiplication */
    /* general schedule specialised to a top limb of 1: the product
     * 1 * r15 is r15 itself (preloaded into r8 and r10) and 1 * r14
     * is r14 itself (added to r12 below)
     */
    movq %r15,%r8
    xorq %r9,%r9
    movq %r15,%r10
    xorq %r11,%r11
    xorq %r12,%r12
    xorq %r13,%r13
    movq %rbp,%rax
    mulq %r15
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    addq %r14,%r12
    adcq $0,%r13
    shld $62,%r8,%r9
    shlq $62,%r8
    movq %rbx,%rax
    mulq %r14
    addq %rax,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11
    movq %rbx,%rax
    mulq %r15
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13
    movq %rbp,%rax
    mulq %r14
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13
    /* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8) */
    addq %r10,%r9
    adcq $0,%r11
    addq %r11,%r12
    adcq $0,%r13
    /* reduction on the integer product (r13 : r12 : r9 : r8) */
    movq %r12,%r10
    andq mask2(%rip),%r10
    andq mask2c(%rip),%r12
    addq %r12,%r8
    adcq %r13,%r9
    adcq $0,%r10
    shrd $2,%r13,%r12
    shrq $2,%r13
    addq %r12,%r8
    adcq %r13,%r9
    adcq $0,%r10
.L13:
    /* final reduction on (r10 : r9 : r8) */
    /* fold everything above bit 1 of the top limb back in as a
     * multiple of 5
     */
    movq %r10,%r11
    shrq $2,%r11
    andq mask2(%rip),%r10
    imul $5,%r11,%r11
    addq %r11,%r8
    adcq $0,%r9
    adcq $0,%r10
    /* freeze the reduced field element (r10 : r9 : r8) */
    /* subtract (p2 : p1 : p0) and keep the difference unless it went
     * negative; shlq $62 moves bit 2 of the top limb into CF
     * NOTE(review): this assumes bit 2 is set exactly when the
     * subtraction borrowed, which holds while the top limb stays
     * small after the fold above - confirm the bound
     */
    movq %r8,%r11
    movq %r9,%r12
    movq %r10,%r13
    subq p0(%rip),%r8
    sbbq p1(%rip),%r9
    sbbq p2(%rip),%r10
    movq %r10,%rcx
    shlq $62,%rcx
    cmovc %r11,%r8
    cmovc %r12,%r9
    cmovc %r13,%r10
    /* add last 16 bytes of the key */
    addq 80(%rsp),%r8
    adcq 88(%rsp),%r9
    adcq $0,%r10
    /* store first 128 bits of the result */
    movq 56(%rsp),%rdi
    movq %r8,0(%rdi)
    movq %r9,8(%rdi)
    /* restore callee-saved registers and the caller's stack pointer */
    movq 0(%rsp),%r11
    movq 8(%rsp),%r12
    movq 16(%rsp),%r13
    movq 24(%rsp),%r14
    movq 32(%rsp),%r15
    movq 40(%rsp),%rbx
    movq 48(%rsp),%rbp
    movq %r11,%rsp
    ret