-rw-r--r-- 11973 lib1305-20250415/crypto_onetimeauth/poly1305/amd64-maa64-g1/poly1305_maa64_g1.S raw
/* assembly to compute poly1305 */

/*
 * poly1305_maa64_g1 -- x86-64, GNU as (AT&T syntax), run through the C
 * preprocessor.
 *
 * Register arguments, as they are used by the code below:
 *   rdi  address where the 16-byte result is written
 *   rsi  address of the message
 *   rdx  address of the 32-byte key: bytes 0..15 are the multiplier
 *        r = (r15 : r14), bytes 16..31 are added to the result at the end
 *   rcx  number of 16-byte blocks, counting a trailing partial block
 *   r8   number of bits in the final block, 0 when that block is full
 *   r9   number of bytes in the final block (only consulted on the
 *        single-block path, where it is compared against 8)
 *
 * NOTE(review): rcx == 0 is not special-cased; it takes the multi-block
 * path and reads from the message. Presumably callers never pass it --
 * confirm.
 * NOTE(review): the multiplier is used exactly as loaded; no masking of
 * the key is done here. Presumably the caller has already prepared it --
 * confirm.
 *
 * Stack frame (rsp aligned down to 32 bytes, then 128 bytes reserved):
 *     0(%rsp)  caller rsp
 *     8(%rsp)  r12      16(%rsp)  r13      24(%rsp)  r14
 *    32(%rsp)  r15      40(%rsp)  rbx      48(%rsp)  rbp
 *    56(%rsp)  rdi (result address)
 *    64(%rsp)  r8  (bits in final block)
 *    72(%rsp)  r9  (bytes in final block)
 *    80(%rsp), 88(%rsp)  key bytes 16..23 and 24..31
 *    96(%rsp)  zeroed quad-word that receives partial-block bytes
 * NOTE(review): 80..103(%rsp) still hold key and message bytes when the
 * routine returns; nothing below clears them.
 *
 * Arithmetic: the running value is h = (rdi : rbp : rbx) with limb
 * weights 2^128, 2^64, 2^0. Each step computes h * r, folding high
 * partial products down with 2^130 == 5, i.e. working modulo 2^130 - 5.
 *
 * mask2, mask2c, p0, p1, p2 are defined elsewhere. The way they are used
 * suggests mask2 = 3, mask2c = ~3 and (p2 : p1 : p0) = 2^130 - 5 --
 * TODO confirm against their definitions.
 */

#include "crypto_asm_hidden.h"
// linker define poly1305_maa64_g1
// linker use mask2
// linker use mask2c
// linker use p0
// linker use p1
// linker use p2

#define mask2 CRYPTO_SHARED_NAMESPACE(mask2)
#define mask2c CRYPTO_SHARED_NAMESPACE(mask2c)
#define p0 CRYPTO_SHARED_NAMESPACE(p0)
#define p1 CRYPTO_SHARED_NAMESPACE(p1)
#define p2 CRYPTO_SHARED_NAMESPACE(p2)

/* the entry point is exported under two names, with and without a
 * leading underscore */
.p2align 5
ASM_HIDDEN _CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1)
ASM_HIDDEN CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1)
.global _CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1)
.global CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1)
_CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1):
CRYPTO_SHARED_NAMESPACE(poly1305_maa64_g1):

    /* build the frame: remember the incoming rsp, align down to 32
     * bytes and reserve 128 bytes */
    movq %rsp,%r11
    andq $-32,%rsp
    subq $128,%rsp

    /* save the incoming rsp and the registers restored before ret */
    movq %r11,0(%rsp)
    movq %r12,8(%rsp)
    movq %r13,16(%rsp)
    movq %r14,24(%rsp)
    movq %r15,32(%rsp)
    movq %rbx,40(%rsp)
    movq %rbp,48(%rsp)

    /* spill the arguments that are needed again later: rdi, rcx and
     * the registers r8..r13 are all reused as scratch below */
    movq %rdi,56(%rsp)
    movq %r8,64(%rsp)
    movq %r9,72(%rsp)

    /* store high 16 bytes of key */
    movq 16(%rdx),%r14
    movq 24(%rdx),%r15
    movq %r14,80(%rsp)
    movq %r15,88(%rsp)

    /* key = (r15 : r14); rdx is free from here on and gets clobbered
     * by every mulq */
    movq 0(%rdx),%r14
    movq 8(%rdx),%r15

    /* initialize a quad-word on the stack with 0; partial blocks are
     * copied into it so the bytes past the message read as zero */
    movq $0,96(%rsp)

    /* if the message has a single block */
    cmpq $1,%rcx
    je .L9

    /* message block = (rbp : rbx) */
    movq 0(%rsi),%rbx
    movq 8(%rsi),%rbp

    /* else loop around and multiply the 129-bit (3-limb)
     * message block with the 128-bit (2-limb) key;
     * set the 129th bit (weight 2^128) in %rdi before proceeding */
    movq $1,%rdi

.L1:
    /* integer multiplication of h = (rdi : rbp : rbx) by (r15 : r14).
     * Accumulators: (r9 : r8) collects terms of weight 2^62 until it
     * is shifted left by 62 and then sits at weight 2^0; (r11 : r10)
     * has weight 2^64; (r13 : r12) has weight 2^128.
     * Folding used: 2^192 == 2^64 + 2^62 and 2^256 == 2^128 + 2^126
     * (both follow from 2^130 == 5). */

    /* rdi * r15 (weight 2^192): low half goes to r8 and r10, high half
     * (weight 2^256) goes to r12 and, shifted left by 62, to (r11 : r10) */
    movq %rdi,%rax
    mulq %r15
    movq %rax,%r8
    xorq %r9,%r9
    movq %rax,%r10
    xorq %r11,%r11
    movq %rdx,%r12
    xorq %r13,%r13
    xorq %rax,%rax
    shld $62,%rdx,%rax
    shlq $62,%rdx
    addq %rdx,%r10
    adcq %rax,%r11

    /* rbp * r15 (weight 2^128): low half to r12, high half
     * (weight 2^192) to r8 and r10 */
    movq %rbp,%rax
    mulq %r15
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    /* rdi * r14 (weight 2^128): same placement as the product above */
    movq %rdi,%rax
    mulq %r14
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    /* move the collected 2^62-weight terms into place:
     * (r9 : r8) <<= 62 */
    shld $62,%r8,%r9
    shlq $62,%r8

    /* rbx * r14 (weight 2^0): low half to r8, high half to r10 */
    movq %rbx,%rax
    mulq %r14
    addq %rax,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    /* rbx * r15 (weight 2^64): low half to r10, high half to r12 */
    movq %rbx,%rax
    mulq %r15
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13

    /* rbp * r14 (weight 2^64): low half to r10, high half to r12 */
    movq %rbp,%rax
    mulq %r14
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13

    /* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8):
     * propagate the upper word of each pair into the next pair */
    addq %r9,%r10
    adcq $0,%r11
    addq %r11,%r12
    adcq $0,%r13

    /* reduction on (r13 : r12 : r10 : r8):
     * rdi keeps the bits of r12 selected by mask2; the remaining bits
     * of (r13 : r12) are added once as they are and once shifted right
     * by 2. With mask2 = 3 that adds 5 * floor((r13 : r12) / 4), i.e.
     * everything at or above 2^130 folded down. */
    movq %r12,%rdi
    andq mask2(%rip),%rdi
    andq mask2c(%rip),%r12
    addq %r12,%r8
    adcq %r13,%r10
    adcq $0,%rdi
    shrd $2,%r13,%r12
    shrq $2,%r13
    addq %r12,%r8
    adcq %r13,%r10
    adcq $0,%rdi

    /* h = (rdi : rbp : rbx) again */
    movq %r8,%rbx
    movq %r10,%rbp

    /* advance to the next block and dispatch on the number of blocks
     * still to be absorbed. The three jumps test the flags of the one
     * cmpq; the conditions are signed. */
    addq $16,%rsi
    subq $1,%rcx
    cmpq $2,%rcx
    jg .L2
    je .L3
    jl .L4

.L2:
    /* add the next message block; the constant 1 in the last adcq is
     * the 2^128 bit appended to a full block */
    addq 0(%rsi),%rbx
    adcq 8(%rsi),%rbp
    adcq $1,%rdi
    jmp .L1

.L3:
    /* add the second last block and proceed */
    addq 0(%rsi),%rbx
    adcq 8(%rsi),%rbp
    adcq $1,%rdi

    /* integer multiplication; same instruction sequence as at .L1 */
    movq %rdi,%rax
    mulq %r15
    movq %rax,%r8
    xorq %r9,%r9
    movq %rax,%r10
    xorq %r11,%r11
    movq %rdx,%r12
    xorq %r13,%r13
    xorq %rax,%rax
    shld $62,%rdx,%rax
    shlq $62,%rdx
    addq %rdx,%r10
    adcq %rax,%r11

    movq %rbp,%rax
    mulq %r15
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    movq %rdi,%rax
    mulq %r14
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    shld $62,%r8,%r9
    shlq $62,%r8

    movq %rbx,%rax
    mulq %r14
    addq %rax,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    movq %rbx,%rax
    mulq %r15
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13

    movq %rbp,%rax
    mulq %r14
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13

    /* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8) */
    addq %r9,%r10
    adcq $0,%r11
    addq %r11,%r12
    adcq $0,%r13

    /* reduction on (r13 : r12 : r10 : r8) */
    movq %r12,%rdi
    andq mask2(%rip),%rdi
    andq mask2c(%rip),%r12
    addq %r12,%r8
    adcq %r13,%r10
    adcq $0,%rdi
    shrd $2,%r13,%r12
    shrq $2,%r13
    addq %r12,%r8
    adcq %r13,%r10
    adcq $0,%rdi

    movq %r8,%rbx
    movq %r10,%rbp

    addq $16,%rsi
    subq $1,%rcx

.L4:
    /* process the last block; the top limb of h is parked in r12
     * because rdi is needed as the destination of rep movsb */
    movq %rdi,%r12

    /* if the last block is full */
    cmpq $0,64(%rsp)
    je .L7

    /* if the last block has 8 bytes (64 bits) */
    cmpq $64,64(%rsp)
    je .L6

    /* if the last block has 1 to 7 bytes (flags still from the
     * comparison against 64) */
    jl .L5

    /* else if the last block has 9 to 15 bytes */

    /* first chunk of message block = (r8) */
    movq 0(%rsi),%r8
    addq $8,%rsi

    /* rax = 128 - bits; rcx = (bits - 64) / 8 = number of bytes past
     * the first quad-word, copied into the zeroed stack slot */
    movq $128,%rax
    subq 64(%rsp),%rax
    movq $64,%rcx
    subq %rax,%rcx
    shrq $3,%rcx
    leaq 96(%rsp),%rdi
    rep movsb (%rsi),(%rdi)

    /* second chunk of message block = (r9):
     * r11 = 2^(bits - 64) - 1 keeps the message bits, then
     * r11 + 1 = 2^(bits - 64) sets the bit just above them */
    movq 96(%rsp),%r9
    movq $-1,%r11
    movq %rax,%rcx
    shrq %cl,%r11
    andq %r11,%r9
    addq $1,%r11
    orq %r11,%r9

    movq $0,%r10
    jmp .L8

.L5:
    /* last block has 1 to 7 bytes: copy bits / 8 bytes into the
     * zeroed stack slot */
    movq 64(%rsp),%rcx
    shrq $3,%rcx
    leaq 96(%rsp),%rdi
    rep movsb (%rsi),(%rdi)

    /* first chunk of message block = (r8):
     * r11 = 2^bits - 1 keeps the message bits, then r11 + 1 = 2^bits
     * sets the bit just above them */
    movq 96(%rsp),%r8
    movq $-1,%r11
    movb $64,%cl
    subb 64(%rsp),%cl
    shrq %cl,%r11
    andq %r11,%r8
    addq $1,%r11
    orq %r11,%r8

    /* second chunk of message block = (r9) */
    movq $0,%r9
    movq $0,%r10
    jmp .L8

.L6:
    /* last block has exactly 8 bytes: the appended bit is bit 0 of
     * the second chunk */
    movq 0(%rsi),%r8
    movq $1,%r9
    movq $0,%r10
    jmp .L8

.L7:
    /* last block is full: the appended bit is the third limb */
    movq 0(%rsi),%r8
    movq 8(%rsi),%r9
    movq $1,%r10

.L8:
    /* h = h + (r10 : r9 : r8), with the top limb of h taken back
     * from r12 */
    movq %r12,%rdi
    addq %r8,%rbx
    adcq %r9,%rbp
    adcq %r10,%rdi

    /* integer multiplication; same instruction sequence as at .L1 */
    movq %rdi,%rax
    mulq %r15
    movq %rax,%r8
    xorq %r9,%r9
    movq %rax,%r10
    xorq %r11,%r11
    movq %rdx,%r12
    xorq %r13,%r13
    xorq %rax,%rax
    shld $62,%rdx,%rax
    shlq $62,%rdx
    addq %rdx,%r10
    adcq %rax,%r11

    movq %rbp,%rax
    mulq %r15
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    movq %rdi,%rax
    mulq %r14
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    shld $62,%r8,%r9
    shlq $62,%r8

    movq %rbx,%rax
    mulq %r14
    addq %rax,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    movq %rbx,%rax
    mulq %r15
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13

    movq %rbp,%rax
    mulq %r14
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13

    /* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8); unlike the
     * loop body, the middle limb is accumulated in r9 here */
    addq %r10,%r9
    adcq $0,%r11
    addq %r11,%r12
    adcq $0,%r13

    /* reduction on (r13 : r12 : r9 : r8); result in (r10 : r9 : r8) */
    movq %r12,%r10
    andq mask2(%rip),%r10
    andq mask2c(%rip),%r12
    addq %r12,%r8
    adcq %r13,%r9
    adcq $0,%r10
    shrd $2,%r13,%r12
    shrq $2,%r13
    addq %r12,%r8
    adcq %r13,%r9
    adcq $0,%r10

    jmp .L13

.L9:
    /* single-block message */

    /* if the single message block is full */
    cmpq $0,64(%rsp)
    je .L12

    /* if the single message block has 1 to 7 bytes */
    cmpq $8,72(%rsp)
    jl .L10

    /* if the single message block has 8 bytes (flags still from the
     * comparison against 8) */
    je .L11

    /* else if the single message block has 9 to 15 bytes */

    /* first chunk of message block = (rbx) */
    movq 0(%rsi),%rbx
    addq $8,%rsi

    /* rax = 128 - bits; rcx = (bits - 64) / 8 bytes copied into the
     * zeroed stack slot */
    movq $128,%rax
    subq 64(%rsp),%rax
    movq $64,%rcx
    subq %rax,%rcx
    shrq $3,%rcx
    leaq 96(%rsp),%rdi
    rep movsb (%rsi),(%rdi)

    /* second chunk of message block = (rbp): keep the message bits
     * and set the bit just above them */
    movq 96(%rsp),%rbp
    movq $-1,%r11
    movq %rax,%rcx
    shrq %cl,%r11
    andq %r11,%rbp
    addq $1,%r11
    orq %r11,%rbp

    /* integer multiplication of (rbp : rbx) by (r15 : r14); there is
     * no third limb, so the accumulators simply start at zero */
    xorq %r8,%r8
    xorq %r9,%r9
    xorq %r10,%r10
    xorq %r11,%r11
    xorq %r12,%r12
    xorq %r13,%r13

    /* rbp * r15 (weight 2^128) */
    movq %rbp,%rax
    mulq %r15
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    shld $62,%r8,%r9
    shlq $62,%r8

    /* rbx * r14 (weight 2^0) */
    movq %rbx,%rax
    mulq %r14
    addq %rax,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    /* rbx * r15 (weight 2^64) */
    movq %rbx,%rax
    mulq %r15
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13

    /* rbp * r14 (weight 2^64) */
    movq %rbp,%rax
    mulq %r14
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13

    /* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8) */
    addq %r10,%r9
    adcq $0,%r11
    addq %r11,%r12
    adcq $0,%r13

    /* reduction on the integer product (r13 : r12 : r9 : r8) */
    movq %r12,%r10
    andq mask2(%rip),%r10
    andq mask2c(%rip),%r12
    addq %r12,%r8
    adcq %r13,%r9
    adcq $0,%r10
    shrd $2,%r13,%r12
    shrq $2,%r13
    addq %r12,%r8
    adcq %r13,%r9
    adcq $0,%r10

    jmp .L13

.L10:
    /* read the remainder bytes onto stack */
    movq 64(%rsp),%rcx
    shrq $3,%rcx
    leaq 96(%rsp),%rdi
    rep movsb (%rsi),(%rdi)

    /* message block = (rbx): keep the message bits and set the bit
     * just above them */
    movq 96(%rsp),%rbx
    movq $-1,%r11
    movb $64,%cl
    subb 64(%rsp),%cl
    shrq %cl,%r11
    andq %r11,%rbx
    addq $1,%r11
    orq %r11,%rbx

    /* integer multiplication: (r10 : r9 : r8) = rbx * (r15 : r14);
     * this path goes straight to the final reduction */
    movq %r14,%rax
    mulq %rbx
    movq %rax,%r8
    movq %rdx,%r9

    xorq %r10,%r10
    movq %r15,%rax
    mulq %rbx
    addq %rax,%r9
    adcq %rdx,%r10

    jmp .L13

.L11:
    /* message block = (rbx) */
    movq 0(%rsi),%rbx

    /* integer multiplication: (r10 : r9 : r8) = rbx * (r15 : r14) */
    movq %r14,%rax
    mulq %rbx
    movq %rax,%r8
    movq %rdx,%r9

    xorq %r10,%r10
    movq %r15,%rax
    mulq %rbx
    addq %rax,%r9
    adcq %rdx,%r10

    /* the bit appended after 8 message bytes has weight 2^64, so its
     * product with the key is the key itself one limb up; r11 catches
     * the carry out */
    xorq %r11,%r11
    addq %r14,%r9
    adcq %r15,%r10
    adcq %r11,%r11

    /* reduction on the integer product (r11 : r10 : r9 : r8) */
    movq %r10,%r13
    andq mask2(%rip),%r10
    andq mask2c(%rip),%r13
    addq %r13,%r8
    adcq %r11,%r9
    adcq $0,%r10
    shrd $2,%r11,%r13
    shrq $2,%r11
    addq %r13,%r8
    adcq %r11,%r9
    adcq $0,%r10

    jmp .L13

.L12:
    /* message block = (rbp : rbx) */
    movq 0(%rsi),%rbx
    movq 8(%rsi),%rbp

    /* integer multiplication of (1 : rbp : rbx) by (r15 : r14); the
     * products with the third limb, which is 1, are the key limbs
     * themselves, so r15 is placed in r8 and r10 and r14 is added to
     * r12 without a mulq */
    movq %r15,%r8
    xorq %r9,%r9
    movq %r15,%r10
    xorq %r11,%r11
    xorq %r12,%r12
    xorq %r13,%r13

    /* rbp * r15 (weight 2^128) */
    movq %rbp,%rax
    mulq %r15
    addq %rax,%r12
    adcq $0,%r13
    addq %rdx,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    /* 1 * r14 (weight 2^128) */
    addq %r14,%r12
    adcq $0,%r13

    shld $62,%r8,%r9
    shlq $62,%r8

    /* rbx * r14 (weight 2^0) */
    movq %rbx,%rax
    mulq %r14
    addq %rax,%r8
    adcq $0,%r9
    addq %rdx,%r10
    adcq $0,%r11

    /* rbx * r15 (weight 2^64) */
    movq %rbx,%rax
    mulq %r15
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13

    /* rbp * r14 (weight 2^64) */
    movq %rbp,%rax
    mulq %r14
    addq %rax,%r10
    adcq $0,%r11
    addq %rdx,%r12
    adcq $0,%r13

    /* reduction on (r13 : r12) : (r11 : r10) : (r9 : r8) */
    addq %r10,%r9
    adcq $0,%r11
    addq %r11,%r12
    adcq $0,%r13

    /* reduction on the integer product (r13 : r12 : r9 : r8) */
    movq %r12,%r10
    andq mask2(%rip),%r10
    andq mask2c(%rip),%r12
    addq %r12,%r8
    adcq %r13,%r9
    adcq $0,%r10
    shrd $2,%r13,%r12
    shrq $2,%r13
    addq %r12,%r8
    adcq %r13,%r9
    adcq $0,%r10

.L13:
    /* final reduction on (r10 : r9 : r8): the part of r10 above its
     * two lowest bits is multiplied by 5 and added back at weight 2^0 */
    movq %r10,%r11
    shrq $2,%r11
    andq mask2(%rip),%r10
    imul $5,%r11,%r11
    addq %r11,%r8
    adcq $0,%r9
    adcq $0,%r10

    /* freeze the reduced field element (r10 : r9 : r8):
     * keep a copy, subtract (p2 : p1 : p0), then shift bit 2 of the
     * resulting top limb into CF; if it is set the copy is restored.
     * Presumably that bit is set exactly when the subtraction
     * borrowed -- confirm against the value of p2. Selection is done
     * with cmov, not with a branch. */
    movq %r8,%r11
    movq %r9,%r12
    movq %r10,%r13

    subq p0(%rip),%r8
    sbbq p1(%rip),%r9
    sbbq p2(%rip),%r10

    movq %r10,%rcx
    shlq $62,%rcx

    cmovc %r11,%r8
    cmovc %r12,%r9
    cmovc %r13,%r10

    /* add last 16 bytes of the key; the carry collected in r10 is
     * not stored */
    addq 80(%rsp),%r8
    adcq 88(%rsp),%r9
    adcq $0,%r10

    /* store the low 128 bits (16 bytes) of the result */
    movq 56(%rsp),%rdi
    movq %r8,0(%rdi)
    movq %r9,8(%rdi)

    /* restore the saved registers and the incoming rsp */
    movq 0(%rsp),%r11
    movq 8(%rsp),%r12
    movq 16(%rsp),%r13
    movq 24(%rsp),%r14
    movq 32(%rsp),%r15
    movq 40(%rsp),%rbx
    movq 48(%rsp),%rbp
    movq %r11,%rsp

    ret