/* -rw-r--r-- 11321 lib1305-20250407/crypto_onetimeauth/poly1305/amd64-maa44-g4/poly1305_maa44_g4.S raw */
/* assembly to compute poly1305 using precomputed key powers and
applying lazy reduction over a group of 4 field elements */
#include "crypto_asm_hidden.h"
// linker define poly1305_maa44_g4
// linker use mask44
// linker use mask42
// linker use pmask1
// linker use pmask2
// linker use pmask3
// linker use pmask4
// linker use pmask5
// linker use upmask1
// linker use upmask2
// linker use upmask3
// linker use upmask4
// linker use upmask5
// linker use twoexp41
// linker use p0
// linker use p1
// linker use p2
#define mask44 CRYPTO_SHARED_NAMESPACE(mask44)
#define mask42 CRYPTO_SHARED_NAMESPACE(mask42)
#define pmask1 CRYPTO_SHARED_NAMESPACE(pmask1)
#define pmask2 CRYPTO_SHARED_NAMESPACE(pmask2)
#define pmask3 CRYPTO_SHARED_NAMESPACE(pmask3)
#define pmask4 CRYPTO_SHARED_NAMESPACE(pmask4)
#define pmask5 CRYPTO_SHARED_NAMESPACE(pmask5)
#define upmask1 CRYPTO_SHARED_NAMESPACE(upmask1)
#define upmask2 CRYPTO_SHARED_NAMESPACE(upmask2)
#define upmask3 CRYPTO_SHARED_NAMESPACE(upmask3)
#define upmask4 CRYPTO_SHARED_NAMESPACE(upmask4)
#define upmask5 CRYPTO_SHARED_NAMESPACE(upmask5)
#define twoexp41 CRYPTO_SHARED_NAMESPACE(twoexp41)
#define p0 CRYPTO_SHARED_NAMESPACE(p0)
#define p1 CRYPTO_SHARED_NAMESPACE(p1)
#define p2 CRYPTO_SHARED_NAMESPACE(p2)
#include "poly1305_asm.h"
/*
 * poly1305_maa44_g4 -- Poly1305 authenticator core using radix-2^44
 * limbs with precomputed key powers, applying one lazy reduction per
 * group of 4 message blocks.
 *
 * Syntax: AT&T (GAS), target: x86-64, System V AMD64 ABI.
 *
 * Register arguments (inferred from their uses below -- TODO confirm
 * against the C caller):
 *   %rdi = output buffer; the 16-byte tag is written here at the end
 *   %rsi = message pointer (advanced in 16-byte block steps)
 *   %rdx = pointer to the precomputed key-power table; the last 16
 *          key bytes ("s") live at offsets 96/104
 *   %rcx = number of 16-byte message blocks (the final one may be partial)
 *   %r8  = appears to be the bit length of the final partial block
 *          (0 when the final block is full) -- see the comparisons
 *          against 0/64/128 at .LB0
 *   %r9  = appears to be the byte length of the final partial block
 *          (compared against 8 in the single-block path at .L5)
 *
 * The fe1305_* macros come from poly1305_asm.h (not visible here).
 * Judging by the register uses around them, the accumulator is kept
 * in the pairs (%r9:%r8), (%r11:%r10), (%r13:%r12) as three 128-bit
 * partial sums over radix-2^44 limbs -- TODO confirm against the
 * macro definitions.
 *
 * Stack frame (32-byte aligned, 160 bytes):
 *   0       caller's %rsp
 *   8..48   saved %r12-%r15, %rbx, %rbp (callee-saved)
 *   56      output pointer (%rdi argument)
 *   64,72   final-block length arguments (%r8, %r9)
 *   80      remaining block count
 *   88,96   last 16 bytes of the key ("s")
 *   104     zero-initialized staging quadword for a partial final block
 *   112     key-power table pointer / multiply scratch
 */
.p2align 5
ASM_HIDDEN _CRYPTO_SHARED_NAMESPACE(poly1305_maa44_g4)
ASM_HIDDEN CRYPTO_SHARED_NAMESPACE(poly1305_maa44_g4)
.global _CRYPTO_SHARED_NAMESPACE(poly1305_maa44_g4)
.global CRYPTO_SHARED_NAMESPACE(poly1305_maa44_g4)
_CRYPTO_SHARED_NAMESPACE(poly1305_maa44_g4):
CRYPTO_SHARED_NAMESPACE(poly1305_maa44_g4):
/* align the stack to 32 bytes and reserve a 160-byte frame; the
   caller's %rsp is kept in %r11 and spilled to 0(%rsp) */
movq %rsp,%r11
andq $-32,%rsp
subq $160,%rsp
/* save callee-saved registers and the arguments needed later */
movq %r11,0(%rsp)
movq %r12,8(%rsp)
movq %r13,16(%rsp)
movq %r14,24(%rsp)
movq %r15,32(%rsp)
movq %rbx,40(%rsp)
movq %rbp,48(%rsp)
movq %rdi,56(%rsp)
movq %r8,64(%rsp)
movq %r9,72(%rsp)
/* store last 16 bytes of the key ("s", added to the tag at the end) */
movq 96(%rdx),%r14
movq 104(%rdx),%r15
movq %r14,88(%rsp)
movq %r15,96(%rsp)
/* initialize a quad-word on the stack with 0 -- staging area for a
   partial final block, so bytes not copied there read as zero */
movq $0,104(%rsp)
/* if the message has a single block, take the dedicated path at .L5 */
cmpq $1,%rcx
je .L5
movq %rcx,80(%rsp)
movq %rdx,%rdi
movq %rdx,112(%rsp)
/* clear the six registers used as accumulator/product pairs by the
   fe1305_* macros (defined in poly1305_asm.h) */
movq $0,%r8
movq $0,%r9
movq $0,%r10
movq $0,%r11
movq $0,%r12
movq $0,%r13
/* dispatch on how many blocks precede the final one */
cmpq $2,%rcx
je .LB2
cmpq $3,%rcx
je .LB3
.LB4:
/* main loop: multiply-accumulate a group of 4 blocks against the
   precomputed key powers with a single lazy reduction per group;
   macro arguments are byte offsets (message offset, key-power offset)
   -- TODO confirm against poly1305_asm.h */
fe1305_mul_taun(0,48)
fe1305_add_product()
fe1305_mul_taun(16,24)
fe1305_add_product()
fe1305_mul_taun(32,0)
fe1305_add_product()
fe1305_reduce_3l_128bits()
addq $48,%rsi
/* consume the 4 blocks just processed from the remaining count */
movq 80(%rsp),%rcx
subq $4,%rcx
movq %rcx,80(%rsp)
/* if there are no blocks left before processing the last block */
cmpq $0,%rcx
je .LB0
/* if there is one more block before processing the last block */
fe1305_add_msg_block(0)
addq $16,%rsi
cmpq $1,%rcx
je .LB1
.LT2:
cmpq $2,%rcx
jg .LT3
/* if there are two more block before processing the last block */
fe1305_mul_taunr(24)
jmp .LB2
.LT3:
cmpq $3,%rcx
jg .LT4
/* if there are three more block before processing the last block */
fe1305_mul_taunr(48)
jmp .LB3
.LT4:
/* if there are at least four more block before processing the last block */
fe1305_reduce_3l_64bits()
fe1305_mul_taunr(72)
jmp .LB4
.LB1:
/* tail: exactly one block remains before the final block */
fe1305_mul_taunr(0)
fe1305_reduce_3l_128bits()
jmp .LB0
.LB2:
/* tail: exactly two blocks remain before the final block */
fe1305_mul_taun(0,0)
fe1305_add_product()
fe1305_reduce_3l_128bits()
addq $16,%rsi
jmp .LB0
.LB3:
/* tail: exactly three blocks remain before the final block */
fe1305_mul_taun(0,24)
fe1305_add_product()
fe1305_mul_taun(16,0)
fe1305_add_product()
fe1305_reduce_3l_128bits()
addq $32,%rsi
.LB0:
/* handle the final block; 64(%rsp) appears to hold its bit length
   (0 = full 16-byte block) -- TODO confirm against the caller */
/* if the last block is full */
cmpq $0,64(%rsp)
je .L3
/* if the last block has 8 bytes */
cmpq $64,64(%rsp)
je .L2
/* if the last block has 1 to 7 bytes */
jl .L1
/* else if the last block has 9 to 15 bytes */
/* first chunk of message block = (rax) */
movq 0(%rsi),%rax
addq $8,%rsi
/* rbp = 128 - bitlen; rcx = (bitlen - 64)/8 = bytes beyond the first 8 */
movq $128,%rbp
subq 64(%rsp),%rbp
movq $64,%rcx
subq %rbp,%rcx
shrq $3,%rcx
/* copy the trailing bytes into the zeroed staging quadword */
leaq 104(%rsp),%rdi
rep movsb (%rsi),(%rdi)
/* second chunk of message block = (rdi) */
movq 104(%rsp),%rdi
/* append the Poly1305 pad bit: r11 = (~0 >> rbp) + 1 = 1 << (bitlen-64) */
movq $-1,%r11
movq %rbp,%rcx
shrq %cl,%r11
addq $1,%r11
orq %r11,%rdi
movq $0,%rbp
jmp .L4
.L1:
/* copy the 1..7 remaining bytes into the zeroed staging quadword */
movq 64(%rsp),%rcx
shrq $3,%rcx
leaq 104(%rsp),%rdi
rep movsb (%rsi),(%rdi)
/* first chunk of message block = (rax) */
movq 104(%rsp),%rax
/* append the pad bit: r11 = 1 << bitlen */
movq $-1,%r11
movb $64,%cl
subb 64(%rsp),%cl
shrq %cl,%r11
addq $1,%r11
orq %r11,%rax
/* second chunk of message block = (rdi) */
movq $0,%rdi
movq $0,%rbp
jmp .L4
.L2:
/* last block is exactly 8 bytes: pad bit is bit 64, i.e. rdi = 1 */
movq 0(%rsi),%rax
movq $1,%rdi
movq $0,%rbp
jmp .L4
.L3:
/* last block is full: pad bit is bit 128, i.e. rbp = 1 */
movq 0(%rsi),%rax
movq 8(%rsi),%rdi
movq $1,%rbp
.L4:
/* split the padded block (rbp:rdi:rax) into radix-2^44 limbs:
   rax = bits 0-43, rdi = bits 44-87, rbp = bits 88+ incl. the pad;
   the pmask* constants select the matching bit fields (defined
   elsewhere in the library) */
movq %rax,%r13
andq pmask1(%rip),%rax
movq %rdi,%r11
andq pmask2(%rip),%r13
shrq $44,%r13
andq pmask3(%rip),%rdi
shlq $20,%rdi
orq %r13,%rdi
andq pmask4(%rip),%r11
shrq $24,%r11
andq pmask5(%rip),%rbp
shlq $40,%rbp
orq %r11,%rbp
/* h += padded final block */
addq %rax,%r8
addq %rdi,%r10
addq %rbp,%r12
/* multiply by the key power at offset 0 and reduce, then finish */
movq 112(%rsp),%rdi
fe1305_mul_taunr(0)
fe1305_reduce_3l_128bits()
jmp .L10
.L5:
/* single-block message: load the radix-2^44 key limbs (rbx:r15:r14)
   from the start of the table and multiply inline */
movq 0(%rdx),%r14
movq 8(%rdx),%r15
movq 16(%rdx),%rbx
/* if the single message block is full */
cmpq $0,64(%rsp)
je .L8
/* if the single message block has 1 to 7 bytes */
cmpq $8,72(%rsp)
jl .L6
/* if the single message block has 8 bytes */
je .L7
/* else if the single message block has 9 to 15 bytes */
/* first chunk of message block = (rax) */
movq 0(%rsi),%rax
addq $8,%rsi
/* r8 = 128 - bitlen; rcx = bytes beyond the first 8 */
movq $128,%r8
subq 64(%rsp),%r8
movq $64,%rcx
subq %r8,%rcx
shrq $3,%rcx
leaq 104(%rsp),%rdi
rep movsb (%rsi),(%rdi)
/* second chunk of message block = (rbp) */
movq 104(%rsp),%rbp
/* append the pad bit: r11 = 1 << (bitlen - 64) */
movq $-1,%r11
movq %r8,%rcx
shrq %cl,%r11
addq $1,%r11
orq %r11,%rbp
/* split (rbp:rax) into radix-2^44 limbs (rax, rdi, rbp) */
movq %rax,%r9
andq pmask1(%rip),%rax
movq %rbp,%rdi
andq pmask2(%rip),%r9
shrq $44,%r9
andq pmask3(%rip),%rdi
shlq $20,%rdi
orq %r9,%rdi
andq pmask4(%rip),%rbp
shrq $24,%rbp
/* schoolbook multiply h = m * r over radix-2^44 limbs; cross terms
   that wrap past 2^132 are pre-scaled by 20, since
   2^132 = 4 * 2^130 == 4*5 (mod 2^130 - 5) */
movq %rax,104(%rsp)
mulq %r14
movq %rax,%r8
movq %rdx,%r9
movq %rdi,%rax
imul $20,%rax,%rax
mulq %rbx
addq %rax,%r8
adcq %rdx,%r9
movq %rbp,%rax
imul $20,%rax,%rax
movq %rax,112(%rsp)
mulq %r15
addq %rax,%r8
adcq %rdx,%r9
/* r9 = bits of the 128-bit partial sum above bit 44 (shld by 64-44) */
shld $20,%r8,%r9
movq 104(%rsp),%rax
mulq %r15
movq %rax,%r10
movq %rdx,%r11
movq %rdi,%rax
mulq %r14
addq %rax,%r10
adcq %rdx,%r11
movq 112(%rsp),%rax
mulq %rbx
addq %rax,%r10
adcq %rdx,%r11
shld $20,%r10,%r11
movq 104(%rsp),%rax
mulq %rbx
movq %rax,%r12
movq %rdx,%r13
movq %rdi,%rax
mulq %r15
addq %rax,%r12
adcq %rdx,%r13
movq %rbp,%rax
mulq %r14
addq %rax,%r12
adcq %rdx,%r13
/* top limb is 42 bits wide, hence shld by 64-42 */
shld $22,%r12,%r13
jmp .L9
.L6:
/* read the remainder bytes onto stack */
movq 64(%rsp),%rcx
shrq $3,%rcx
leaq 104(%rsp),%rdi
rep movsb (%rsi),(%rdi)
/* message block = (rax) */
movq 104(%rsp),%rax
/* append the pad bit: r11 = 1 << bitlen */
movq $-1,%r11
movb $64,%cl
subb 64(%rsp),%cl
shrq %cl,%r11
addq $1,%r11
orq %r11,%rax
/* short block fits in two limbs: rax = bits 0-43, rdi = bits 44+ */
movq %rax,%rdi
andq pmask1(%rip),%rax
andq pmask2(%rip),%rdi
shrq $44,%rdi
/* h = m * r (two-limb message; same scheme as the 9-15 byte path,
   with the third-limb products absent) */
movq %rax,104(%rsp)
mulq %r14
movq %rax,%r8
movq %rdx,%r9
movq %rdi,%rax
imul $20,%rax,%rax
mulq %rbx
addq %rax,%r8
adcq %rdx,%r9
shld $20,%r8,%r9
movq 104(%rsp),%rax
mulq %r15
movq %rax,%r10
movq %rdx,%r11
movq %rdi,%rax
mulq %r14
addq %rax,%r10
adcq %rdx,%r11
shld $20,%r10,%r11
movq 104(%rsp),%rax
mulq %rbx
movq %rax,%r12
movq %rdx,%r13
movq %rdi,%rax
mulq %r15
addq %rax,%r12
adcq %rdx,%r13
shld $22,%r12,%r13
jmp .L9
.L7:
/* exactly 8 bytes: pad bit at bit 64, i.e. high word = 1 */
/* message block = (rax) */
movq 0(%rsi),%rax
movq $1,%rdi
/* split (rdi:rax) into radix-2^44 limbs (rax, rdi, rbp) */
movq %rax,%r9
andq pmask1(%rip),%rax
movq %rdi,%rbp
andq pmask2(%rip),%r9
shrq $44,%r9
andq pmask3(%rip),%rdi
shlq $20,%rdi
orq %r9,%rdi
andq pmask4(%rip),%rbp
shrq $24,%rbp
/* h = m * r (third limb is zero here, so its products are omitted) */
movq %rax,104(%rsp)
mulq %r14
movq %rax,%r8
movq %rdx,%r9
movq %rdi,%rax
imul $20,%rax,%rax
mulq %rbx
addq %rax,%r8
adcq %rdx,%r9
shld $20,%r8,%r9
movq 104(%rsp),%rax
mulq %r15
movq %rax,%r10
movq %rdx,%r11
movq %rdi,%rax
mulq %r14
addq %rax,%r10
adcq %rdx,%r11
shld $20,%r10,%r11
movq 104(%rsp),%rax
mulq %rbx
movq %rax,%r12
movq %rdx,%r13
movq %rdi,%rax
mulq %r15
addq %rax,%r12
adcq %rdx,%r13
shld $22,%r12,%r13
jmp .L9
.L8:
/* full single block: pad bit at bit 128, i.e. rbp = 1 */
/* message block = (rbp : rdi : rax) */
movq 0(%rsi),%rax
movq 8(%rsi),%rdi
movq $1,%rbp
/* split into radix-2^44 limbs (rax, rdi, rbp) */
movq %rax,%r9
andq pmask1(%rip),%rax
movq %rdi,%r11
andq pmask2(%rip),%r9
shrq $44,%r9
andq pmask3(%rip),%rdi
shlq $20,%rdi
orq %r9,%rdi
andq pmask4(%rip),%r11
shrq $24,%r11
andq pmask5(%rip),%rbp
shlq $40,%rbp
orq %r11,%rbp
/* h = m * r, wrap-around cross terms pre-scaled by 20 */
movq %rax,104(%rsp)
mulq %r14
movq %rax,%r8
movq %rdx,%r9
movq %rdi,%rax
imul $20,%rax,%rax
mulq %rbx
addq %rax,%r8
adcq %rdx,%r9
movq %rbp,%rax
imul $20,%rax,%rax
movq %rax,112(%rsp)
mulq %r15
addq %rax,%r8
adcq %rdx,%r9
shld $20,%r8,%r9
movq 104(%rsp),%rax
mulq %r15
movq %rax,%r10
movq %rdx,%r11
movq %rdi,%rax
mulq %r14
addq %rax,%r10
adcq %rdx,%r11
movq 112(%rsp),%rax
mulq %rbx
addq %rax,%r10
adcq %rdx,%r11
shld $20,%r10,%r11
movq 104(%rsp),%rax
mulq %rbx
movq %rax,%r12
movq %rdx,%r13
movq %rdi,%rax
mulq %r15
addq %rax,%r12
adcq %rdx,%r13
movq %rbp,%rax
mulq %r14
addq %rax,%r12
adcq %rdx,%r13
shld $22,%r12,%r13
.L9:
/* reduction on (r12 : r10 : r8): mask each limb to its width
   (44/44/42 bits), add the carried-out high halves one limb up, and
   fold the excess above 2^130 back in multiplied by 5
   (2^130 == 5 mod 2^130 - 5) */
movq mask44(%rip),%rbp
andq %rbp,%r8
andq %rbp,%r10
addq %r9,%r10
andq mask42(%rip),%r12
addq %r11,%r12
imul $5,%r13,%r13
addq %r13,%r8
.L10:
/* final carry propagation across the 44/44/42-bit limbs.
   NOTE(review): %rbp must hold mask44 on entry here; the .L9 path
   sets it explicitly, and the multi-block path (jmp .L10 above)
   presumably relies on fe1305_reduce_3l_128bits() leaving it there
   -- confirm against poly1305_asm.h */
movq %r8,%rdx
shrq $44,%rdx
addq %r10,%rdx
andq %rbp,%r8
movq %rdx,%r10
shrq $44,%rdx
addq %r12,%rdx
andq %rbp,%r10
movq %rdx,%r12
shrq $42,%rdx
imul $5,%rdx,%rdx
addq %rdx,%r8
andq mask42(%rip),%r12
/* get back the element in base 2^{64}: repack the three radix-2^44
   limbs into (r12 : r10 : r8) as 64-bit words */
andq upmask1(%rip),%r8
movq %r10,%r9
andq upmask2(%rip),%r9
shlq $44,%r9
orq %r9,%r8
andq upmask3(%rip),%r10
shrq $20,%r10
movq %r12,%r11
andq upmask4(%rip),%r11
shlq $24,%r11
orq %r11,%r10
andq upmask5(%rip),%r12
shrq $20,%r12
/* freeze the reduced field element (r12 : r10 : r8): compute h - p
   and keep the original h when the subtraction borrowed (h < p) */
movq %r8,%r9
movq %r10,%r11
movq %r12,%r13
subq p0(%rip),%r8
sbbq p1(%rip),%r10
sbbq p2(%rip),%r12
/* shlq $62 moves bit 2 of the top limb into CF -- presumably the
   borrow indicator matching the 2-bit top limb of p = 2^130-5;
   TODO confirm against the p2 constant */
movq %r12,%rcx
shlq $62,%rcx
cmovc %r9,%r8
cmovc %r11,%r10
cmovc %r13,%r12
/* tag = (h + s) mod 2^128: add last 16 bytes of the key; the final
   carry into r12 is computed but not stored */
addq 88(%rsp),%r8
adcq 96(%rsp),%r10
adcq $0,%r12
/* store the 16-byte (128-bit) tag at the output pointer */
movq 56(%rsp),%rdi
movq %r8,0(%rdi)
movq %r10,8(%rdi)
/* restore callee-saved registers and the caller's stack pointer */
movq 0(%rsp),%r11
movq 8(%rsp),%r12
movq 16(%rsp),%r13
movq 24(%rsp),%r14
movq 32(%rsp),%r15
movq 40(%rsp),%rbx
movq 48(%rsp),%rbp
movq %r11,%rsp
ret