/* lib1305-20250415/crypto_onetimeauth/poly1305/amd64-maa64-g16/poly1305_maa64_g16.S */
/* assembly to compute poly1305 using precomputed key powers and
applying lazy reduction over a group of 16 field elements */
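/* outline, with p = 2^130 - 5, r = first 16 bytes of the key, s = last 16 bytes:

       h = 0
       while at least 16 blocks remain before the final block:
           h = ((h + m_1)*r^16 + m_2*r^15 + ... + m_16*r) mod p
       absorb the 0..15 remaining blocks the same way with powers r^k, ..., r
       h = (h + padded final block)*r mod p
       tag = (h + s) mod 2^128

   the sixteen products of a group are accumulated as unreduced 4-limb values
   and reduced only once per group (lazy reduction).  as used below, the table
   at %rdx holds r^k at offset 24*(k-1) (entry layout defined by the macros in
   poly1305_asm.h) and s at offset 384 */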
#include "crypto_asm_hidden.h"
// linker define poly1305_maa64_g16
// linker use mask2
// linker use mask2c
// linker use p0
// linker use p1
// linker use p2
#define mask2 CRYPTO_SHARED_NAMESPACE(mask2)
#define mask2c CRYPTO_SHARED_NAMESPACE(mask2c)
#define p0 CRYPTO_SHARED_NAMESPACE(p0)
#define p1 CRYPTO_SHARED_NAMESPACE(p1)
#define p2 CRYPTO_SHARED_NAMESPACE(p2)
#include "poly1305_asm.h"
.p2align 5
ASM_HIDDEN _CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g16)
ASM_HIDDEN CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g16)
.global _CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g16)
.global CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g16)
_CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g16):
CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g16):
movq %rsp,%r11
andq $-32,%rsp
subq $160,%rsp
movq %r11,0(%rsp)
movq %r12,8(%rsp)
movq %r13,16(%rsp)
movq %r14,24(%rsp)
movq %r15,32(%rsp)
movq %rbx,40(%rsp)
movq %rbp,48(%rsp)
movq %rdi,56(%rsp)
movq %r8,64(%rsp)
movq %r9,72(%rsp)
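/* arguments, as used below:
   %rdi = output (16-byte tag)
   %rsi = message
   %rdx = precomputed key powers (r^1..r^16), with s at offset 384
   %rcx = number of 16-byte blocks, including the final (possibly partial) one
   %r8  = bit length of the final block (0 if it is full)
   %r9  = byte length of the final block
   stack frame: 0 caller %rsp, 8..48 callee-save registers, 56 output pointer,
   64 %r8, 72 %r9, 80 remaining block count, 88/96 s, 104 zeroed staging
   quadword for the final partial block, 112 key-powers pointer */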
/* store last 16 bytes of the key */
movq 384(%rdx),%r14
movq 392(%rdx),%r15
movq %r14,88(%rsp)
movq %r15,96(%rsp)
/* first 16 bytes of the key: r = (r15 : r14) */
movq 0(%rdx),%r14
movq 8(%rdx),%r15
/* zero a staging quadword on the stack, used when reading the final partial block */
movq $0,104(%rsp)
/* if the message has a single block */
cmpq $1,%rcx
je .L5
movq %rcx,80(%rsp)
movq %rdx,%rdi
movq %rdx,112(%rsp)
movq $0,%r8
movq $0,%r9
movq $0,%r10
movq $0,%r11
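/* 2 to 15 blocks in total: jump to the matching tail .LBk, which absorbs the
   first k-1 blocks with powers r^(k-1), ..., r and continues at .LB0 with the
   final block; 16 or more blocks fall through into the main loop at .LB16 */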
cmpq $2,%rcx
je .LB2
cmpq $3,%rcx
je .LB3
cmpq $4,%rcx
je .LB4
cmpq $5,%rcx
je .LB5
cmpq $6,%rcx
je .LB6
cmpq $7,%rcx
je .LB7
cmpq $8,%rcx
je .LB8
cmpq $9,%rcx
je .LB9
cmpq $10,%rcx
je .LB10
cmpq $11,%rcx
je .LB11
cmpq $12,%rcx
je .LB12
cmpq $13,%rcx
je .LB13
cmpq $14,%rcx
je .LB14
cmpq $15,%rcx
je .LB15
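/* main-loop body: h*r^16 (added by .LT16; zero on the first pass) and the next
   15 blocks times r^15, ..., r (table offsets 336, ..., 0) are accumulated as
   unreduced 4-limb products and reduced once per group by fe1305_reduce_4l */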
.LB16:
fe1305_mul_taun(0,336)
fe1305_add_product()
fe1305_mul_taun(16,312)
fe1305_add_product()
fe1305_mul_taun(32,288)
fe1305_add_product()
fe1305_mul_taun(48,264)
fe1305_add_product()
fe1305_mul_taun(64,240)
fe1305_add_product()
fe1305_mul_taun(80,216)
fe1305_add_product()
fe1305_mul_taun(96,192)
fe1305_add_product()
fe1305_mul_taun(112,168)
fe1305_add_product()
fe1305_mul_taun(128,144)
fe1305_add_product()
fe1305_mul_taun(144,120)
fe1305_add_product()
fe1305_mul_taun(160,96)
fe1305_add_product()
fe1305_mul_taun(176,72)
fe1305_add_product()
fe1305_mul_taun(192,48)
fe1305_add_product()
fe1305_mul_taun(208,24)
fe1305_add_product()
fe1305_mul_tau(224,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $240,%rsi
movq 80(%rsp),%rcx
subq $16,%rcx
movq %rcx,80(%rsp)
/* if there are no blocks left before processing the last block */
cmpq $0,%rcx
je .LB0
/* if there is one more block before processing the last block */
fe1305_add_msg_block(0)
addq $16,%rsi
cmpq $1,%rcx
je .LB1
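/* a block has just been added to h; %rcx counts the blocks still to be
   absorbed before the final block, including that one.  multiply h by
   r^min(%rcx,16) and dispatch to the matching tail */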
.LT2:
cmpq $2,%rcx
jg .LT3
/* if there are two more blocks before processing the last block */
fe1305_mul_taunr(24)
jmp .LB2
.LT3:
cmpq $3,%rcx
jg .LT4
/* if there are three more blocks before processing the last block */
fe1305_mul_taunr(48)
jmp .LB3
.LT4:
cmpq $4,%rcx
jg .LT5
/* if there are four more blocks before processing the last block */
fe1305_mul_taunr(72)
jmp .LB4
.LT5:
cmpq $5,%rcx
jg .LT6
/* if there are five more blocks before processing the last block */
fe1305_mul_taunr(96)
jmp .LB5
.LT6:
cmpq $6,%rcx
jg .LT7
/* if there are six more blocks before processing the last block */
fe1305_mul_taunr(120)
jmp .LB6
.LT7:
cmpq $7,%rcx
jg .LT8
/* if there are seven more blocks before processing the last block */
fe1305_mul_taunr(144)
jmp .LB7
.LT8:
cmpq $8,%rcx
jg .LT9
/* if there are eight more blocks before processing the last block */
fe1305_mul_taunr(168)
jmp .LB8
.LT9:
cmpq $9,%rcx
jg .LT10
/* if there are nine more blocks before processing the last block */
fe1305_mul_taunr(192)
jmp .LB9
.LT10:
cmpq $10,%rcx
jg .LT11
/* if there are ten more blocks before processing the last block */
fe1305_mul_taunr(216)
jmp .LB10
.LT11:
cmpq $11,%rcx
jg .LT12
/* if there are eleven more blocks before processing the last block */
fe1305_mul_taunr(240)
jmp .LB11
.LT12:
cmpq $12,%rcx
jg .LT13
/* if there are twelve more blocks before processing the last block */
fe1305_mul_taunr(264)
jmp .LB12
.LT13:
cmpq $13,%rcx
jg .LT14
/* if there are thirteen more blocks before processing the last block */
fe1305_mul_taunr(288)
jmp .LB13
.LT14:
cmpq $14,%rcx
jg .LT15
/* if there are fourteen more blocks before processing the last block */
fe1305_mul_taunr(312)
jmp .LB14
.LT15:
cmpq $15,%rcx
jg .LT16
/* if there are fifteen more blocks before processing the last block */
fe1305_mul_taunr(336)
jmp .LB15
.LT16:
/* if there are at least sixteen more blocks before processing the last block */
fe1305_mul_taunr(360)
jmp .LB16
.LB1:
fe1305_mul_taur(0)
fe1305_reduce_4l()
jmp .LB0
.LB2:
fe1305_mul_tau(0,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $16,%rsi
jmp .LB0
.LB3:
fe1305_mul_taun(0,24)
fe1305_add_product()
fe1305_mul_tau(16,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $32,%rsi
jmp .LB0
.LB4:
fe1305_mul_taun(0,48)
fe1305_add_product()
fe1305_mul_taun(16,24)
fe1305_add_product()
fe1305_mul_tau(32,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $48,%rsi
jmp .LB0
.LB5:
fe1305_mul_taun(0,72)
fe1305_add_product()
fe1305_mul_taun(16,48)
fe1305_add_product()
fe1305_mul_taun(32,24)
fe1305_add_product()
fe1305_mul_tau(48,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $64,%rsi
jmp .LB0
.LB6:
fe1305_mul_taun(0,96)
fe1305_add_product()
fe1305_mul_taun(16,72)
fe1305_add_product()
fe1305_mul_taun(32,48)
fe1305_add_product()
fe1305_mul_taun(48,24)
fe1305_add_product()
fe1305_mul_tau(64,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $80,%rsi
jmp .LB0
.LB7:
fe1305_mul_taun(0,120)
fe1305_add_product()
fe1305_mul_taun(16,96)
fe1305_add_product()
fe1305_mul_taun(32,72)
fe1305_add_product()
fe1305_mul_taun(48,48)
fe1305_add_product()
fe1305_mul_taun(64,24)
fe1305_add_product()
fe1305_mul_tau(80,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $96,%rsi
jmp .LB0
.LB8:
fe1305_mul_taun(0,144)
fe1305_add_product()
fe1305_mul_taun(16,120)
fe1305_add_product()
fe1305_mul_taun(32,96)
fe1305_add_product()
fe1305_mul_taun(48,72)
fe1305_add_product()
fe1305_mul_taun(64,48)
fe1305_add_product()
fe1305_mul_taun(80,24)
fe1305_add_product()
fe1305_mul_tau(96,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $112,%rsi
jmp .LB0
.LB9:
fe1305_mul_taun(0,168)
fe1305_add_product()
fe1305_mul_taun(16,144)
fe1305_add_product()
fe1305_mul_taun(32,120)
fe1305_add_product()
fe1305_mul_taun(48,96)
fe1305_add_product()
fe1305_mul_taun(64,72)
fe1305_add_product()
fe1305_mul_taun(80,48)
fe1305_add_product()
fe1305_mul_taun(96,24)
fe1305_add_product()
fe1305_mul_tau(112,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $128,%rsi
jmp .LB0
.LB10:
fe1305_mul_taun(0,192)
fe1305_add_product()
fe1305_mul_taun(16,168)
fe1305_add_product()
fe1305_mul_taun(32,144)
fe1305_add_product()
fe1305_mul_taun(48,120)
fe1305_add_product()
fe1305_mul_taun(64,96)
fe1305_add_product()
fe1305_mul_taun(80,72)
fe1305_add_product()
fe1305_mul_taun(96,48)
fe1305_add_product()
fe1305_mul_taun(112,24)
fe1305_add_product()
fe1305_mul_tau(128,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $144,%rsi
jmp .LB0
.LB11:
fe1305_mul_taun(0,216)
fe1305_add_product()
fe1305_mul_taun(16,192)
fe1305_add_product()
fe1305_mul_taun(32,168)
fe1305_add_product()
fe1305_mul_taun(48,144)
fe1305_add_product()
fe1305_mul_taun(64,120)
fe1305_add_product()
fe1305_mul_taun(80,96)
fe1305_add_product()
fe1305_mul_taun(96,72)
fe1305_add_product()
fe1305_mul_taun(112,48)
fe1305_add_product()
fe1305_mul_taun(128,24)
fe1305_add_product()
fe1305_mul_tau(144,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $160,%rsi
jmp .LB0
.LB12:
fe1305_mul_taun(0,240)
fe1305_add_product()
fe1305_mul_taun(16,216)
fe1305_add_product()
fe1305_mul_taun(32,192)
fe1305_add_product()
fe1305_mul_taun(48,168)
fe1305_add_product()
fe1305_mul_taun(64,144)
fe1305_add_product()
fe1305_mul_taun(80,120)
fe1305_add_product()
fe1305_mul_taun(96,96)
fe1305_add_product()
fe1305_mul_taun(112,72)
fe1305_add_product()
fe1305_mul_taun(128,48)
fe1305_add_product()
fe1305_mul_taun(144,24)
fe1305_add_product()
fe1305_mul_tau(160,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $176,%rsi
jmp .LB0
.LB13:
fe1305_mul_taun(0,264)
fe1305_add_product()
fe1305_mul_taun(16,240)
fe1305_add_product()
fe1305_mul_taun(32,216)
fe1305_add_product()
fe1305_mul_taun(48,192)
fe1305_add_product()
fe1305_mul_taun(64,168)
fe1305_add_product()
fe1305_mul_taun(80,144)
fe1305_add_product()
fe1305_mul_taun(96,120)
fe1305_add_product()
fe1305_mul_taun(112,96)
fe1305_add_product()
fe1305_mul_taun(128,72)
fe1305_add_product()
fe1305_mul_taun(144,48)
fe1305_add_product()
fe1305_mul_taun(160,24)
fe1305_add_product()
fe1305_mul_tau(176,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $192,%rsi
jmp .LB0
.LB14:
fe1305_mul_taun(0,288)
fe1305_add_product()
fe1305_mul_taun(16,264)
fe1305_add_product()
fe1305_mul_taun(32,240)
fe1305_add_product()
fe1305_mul_taun(48,216)
fe1305_add_product()
fe1305_mul_taun(64,192)
fe1305_add_product()
fe1305_mul_taun(80,168)
fe1305_add_product()
fe1305_mul_taun(96,144)
fe1305_add_product()
fe1305_mul_taun(112,120)
fe1305_add_product()
fe1305_mul_taun(128,96)
fe1305_add_product()
fe1305_mul_taun(144,72)
fe1305_add_product()
fe1305_mul_taun(160,48)
fe1305_add_product()
fe1305_mul_taun(176,24)
fe1305_add_product()
fe1305_mul_tau(192,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $208,%rsi
jmp .LB0
.LB15:
fe1305_mul_taun(0,312)
fe1305_add_product()
fe1305_mul_taun(16,288)
fe1305_add_product()
fe1305_mul_taun(32,264)
fe1305_add_product()
fe1305_mul_taun(48,240)
fe1305_add_product()
fe1305_mul_taun(64,216)
fe1305_add_product()
fe1305_mul_taun(80,192)
fe1305_add_product()
fe1305_mul_taun(96,168)
fe1305_add_product()
fe1305_mul_taun(112,144)
fe1305_add_product()
fe1305_mul_taun(128,120)
fe1305_add_product()
fe1305_mul_taun(144,96)
fe1305_add_product()
fe1305_mul_taun(160,72)
fe1305_add_product()
fe1305_mul_taun(176,48)
fe1305_add_product()
fe1305_mul_taun(192,24)
fe1305_add_product()
fe1305_mul_tau(208,0)
fe1305_add_product()
fe1305_reduce_4l()
addq $224,%rsi
.LB0:
/* if the last block is full */
cmpq $0,64(%rsp)
je .L3
/* if the last block has 8 bytes */
cmpq $64,64(%rsp)
je .L2
/* if the last block has 1 to 7 bytes */
jl .L1
/* else if the last block has 9 to 15 bytes */
/* first chunk of message block = (r12) */
movq 0(%rsi),%r12
addq $8,%rsi
movq $128,%rbx
subq 64(%rsp),%rbx
movq $64,%rcx
subq %rbx,%rcx
shrq $3,%rcx
leaq 104(%rsp),%rdi
rep movsb (%rsi),(%rdi)
/* second chunk of message block = (r13) */
movq 104(%rsp),%r13
movq $-1,%r11
movq %rbx,%rcx
shrq %cl,%r11
addq $1,%r11
orq %r11,%r13
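/* r11 = 2^(8*(len-8)) is the poly1305 padding bit, one bit above the last
   message byte of the second chunk; the bytes above it are already zero
   because the staging quadword at 104(%rsp) was cleared on entry */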
movq $0,%r14
jmp .L4
.L1:
movq 64(%rsp),%rcx
shrq $3,%rcx
leaq 104(%rsp),%rdi
rep movsb (%rsi),(%rdi)
/* first chunk of message block = (r12) */
movq 104(%rsp),%r12
movq $-1,%r11
movb $64,%cl
subb 64(%rsp),%cl
shrq %cl,%r11
addq $1,%r11
orq %r11,%r12
/* second chunk of message block = (r13) */
movq $0,%r13
movq $0,%r14
jmp .L4
.L2:
movq 0(%rsi),%r12
movq $1,%r13
movq $0,%r14
jmp .L4
.L3:
movq 0(%rsi),%r12
movq 8(%rsi),%r13
movq $1,%r14
.L4:
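/* h += padded final block (r14 : r13 : r12), then h *= r and reduce */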
addq %r12,%r8
adcq %r13,%r9
adcq %r14,%r10
movq 112(%rsp),%rdi
fe1305_mul_taur(0)
fe1305_reduce_4l()
jmp .L9
.L5:
/* if the single message block is full */
cmpq $0,64(%rsp)
je .L8
/* if the single message block has 1 to 7 bytes */
cmpq $8,72(%rsp)
jl .L6
/* if the single message block has 8 bytes */
je .L7
/* else if the single message block has 9 to 15 bytes */
/* first chunk of message block = (rbx) */
movq 0(%rsi),%rbx
addq $8,%rsi
movq $128,%rax
subq 64(%rsp),%rax
movq $64,%rcx
subq %rax,%rcx
shrq $3,%rcx
leaq 104(%rsp),%rdi
rep movsb (%rsi),(%rdi)
/* second chunk of message block = (rbp) */
movq 104(%rsp),%rbp
movq $-1,%r11
movq %rax,%rcx
shrq %cl,%r11
andq %r11,%rbp
addq $1,%r11
orq %r11,%rbp
/* integer multiplication */
xorq %r8,%r8
xorq %r9,%r9
xorq %r10,%r10
xorq %r11,%r11
xorq %r12,%r12
xorq %r13,%r13
movq %rbp,%rax
mulq %r15
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
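/* the high half of rbp*r15 (in %rdx) has weight 2^192 ≡ 5*2^62 (mod 2^130 - 5);
   adding it to r8 (shifted left by 62 below) and to r10 (weight 2^64)
   contributes (2^62 + 2^64) = 5*2^62 times that half */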
shld $62,%r8,%r9
shlq $62,%r8
movq %rbx,%rax
mulq %r14
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
movq %rbx,%rax
mulq %r15
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
movq %rbp,%rax
mulq %r14
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
/* combine the overlapping accumulator pairs (r9 : r8), (r11 : r10), (r13 : r12)
   into the 4-limb product (r13 : r12 : r9 : r8) */
addq %r10,%r9
adcq $0,%r11
addq %r11,%r12
adcq $0,%r13
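/* fold the part above 2^130 back in: with p = 2^130 - 5, t*2^130 ≡ 5*t = 4*t + t.
   mask2 and mask2c are defined elsewhere in the library; the code below relies
   on mask2 = 3 (keeping the low two bits of r12, i.e. bits 128-129 of the
   product) and mask2c = ~3, so that (r13 : r12) & ~3 = 4*t and
   (r13 : r12) >> 2 = t */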
/* reduction on the integer product (r13 : r12 : r9 : r8) */
movq %r12,%r10
andq mask2(%rip),%r10
andq mask2c(%rip),%r12
addq %r12,%r8
adcq %r13,%r9
adcq $0,%r10
shrd $2,%r13,%r12
shrq $2,%r13
addq %r12,%r8
adcq %r13,%r9
adcq $0,%r10
jmp .L9
.L6:
/* copy the remaining message bytes onto the stack */
movq 64(%rsp),%rcx
shrq $3,%rcx
leaq 104(%rsp),%rdi
rep movsb (%rsi),(%rdi)
/* message block = (rbx) */
movq 104(%rsp),%rbx
movq $-1,%r11
movb $64,%cl
subb 64(%rsp),%cl
shrq %cl,%r11
andq %r11,%rbx
addq $1,%r11
orq %r11,%rbx
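/* r11 is first the mask 2^(8*len) - 1 that clears the bytes above the message,
   then (after the increment) the padding bit 2^(8*len) or'd in above the last
   message byte */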
/* integer multiplication */
movq %r14,%rax
mulq %rbx
movq %rax,%r8
movq %rdx,%r9
xorq %r10,%r10
movq %r15,%rax
mulq %rbx
addq %rax,%r9
adcq %rdx,%r10
jmp .L9
.L7:
/* message block = (rbx) */
movq 0(%rsi),%rbx
/* integer multiplication */
movq %r14,%rax
mulq %rbx
movq %rax,%r8
movq %rdx,%r9
xorq %r10,%r10
movq %r15,%rax
mulq %rbx
addq %rax,%r9
adcq %rdx,%r10
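/* padding bit for the full 8-byte block: add 2^64*r = r14*2^64 + r15*2^128,
   keeping the carry in r11 */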
xorq %r11,%r11
addq %r14,%r9
adcq %r15,%r10
adcq %r11,%r11
/* reduction on the integer product (r11 : r10 : r9 : r8) */
movq %r10,%r13
andq mask2(%rip),%r10
andq mask2c(%rip),%r13
addq %r13,%r8
adcq %r11,%r9
adcq $0,%r10
shrd $2,%r11,%r13
shrq $2,%r11
addq %r13,%r8
adcq %r11,%r9
adcq $0,%r10
jmp .L9
.L8:
/* message block = (rbp : rbx) */
movq 0(%rsi),%rbx
movq 8(%rsi),%rbp
/* integer multiplication */
movq %r15,%r8
xorq %r9,%r9
movq %r15,%r10
xorq %r11,%r11
xorq %r12,%r12
xorq %r13,%r13
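/* padding bit for the full 16-byte block: 2^128*r = r14*2^128 + r15*2^192, and
   2^192 ≡ 5*2^62 (mod 2^130 - 5); r15 is therefore pre-loaded at the 2^62
   position (r8, shifted left by 62 below) and at the 2^64 position (r10),
   while r14 is added at the 2^128 position (r12) further down */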
movq %rbp,%rax
mulq %r15
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
addq %r14,%r12
adcq $0,%r13
shld $62,%r8,%r9
shlq $62,%r8
movq %rbx,%rax
mulq %r14
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
movq %rbx,%rax
mulq %r15
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
movq %rbp,%rax
mulq %r14
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
/* combine the overlapping accumulator pairs (r9 : r8), (r11 : r10), (r13 : r12)
   into the 4-limb product (r13 : r12 : r9 : r8) */
addq %r10,%r9
adcq $0,%r11
addq %r11,%r12
adcq $0,%r13
/* reduction on the integer product (r13 : r12 : r9 : r8) */
movq %r12,%r10
andq mask2(%rip),%r10
andq mask2c(%rip),%r12
addq %r12,%r8
adcq %r13,%r9
adcq $0,%r10
shrd $2,%r13,%r12
shrq $2,%r13
addq %r12,%r8
adcq %r13,%r9
adcq $0,%r10
.L9:
/* final reduction on (r10 : r9 : r8) */
movq %r10,%r11
shrq $2,%r11
andq mask2(%rip),%r10
imul $5,%r11,%r11
addq %r11,%r8
adcq $0,%r9
adcq $0,%r10
/* freeze the reduced field element (r10 : r9 : r8) */
movq %r8,%r11
movq %r9,%r12
movq %r10,%r13
subq p0(%rip),%r8
sbbq p1(%rip),%r9
sbbq p2(%rip),%r10
movq %r10,%rcx
shlq $62,%rcx
cmovc %r11,%r8
cmovc %r12,%r9
cmovc %r13,%r10
/* add last 16 bytes of the key */
addq 88(%rsp),%r8
adcq 96(%rsp),%r9
adcq $0,%r10
/* store the 16-byte tag (the result mod 2^128) */
movq 56(%rsp),%rdi
movq %r8,0(%rdi)
movq %r9,8(%rdi)
movq 0(%rsp),%r11
movq 8(%rsp),%r12
movq 16(%rsp),%r13
movq 24(%rsp),%r14
movq 32(%rsp),%r15
movq 40(%rsp),%rbx
movq 48(%rsp),%rbp
movq %r11,%rsp
ret