/* -rw-r--r-- 11321 lib1305-20250407/crypto_onetimeauth/poly1305/amd64-maa44-g4/poly1305_maa44_g4.S raw */
/* assembly to compute poly1305 using precomputed key powers and applying lazy reduction over a group of 4 field elements */

#include "crypto_asm_hidden.h"

/* Annotations consumed by the project's linker tooling: this unit defines
   poly1305_maa44_g4 and uses the shared constant tables below (limb masks,
   pack/unpack masks, and the limbs p0/p1/p2 of the prime p = 2^130 - 5). */
// linker define poly1305_maa44_g4
// linker use mask44
// linker use mask42
// linker use pmask1
// linker use pmask2
// linker use pmask3
// linker use pmask4
// linker use pmask5
// linker use upmask1
// linker use upmask2
// linker use upmask3
// linker use upmask4
// linker use upmask5
// linker use twoexp41
// linker use p0
// linker use p1
// linker use p2

/* Route every shared constant through the namespacing macro so the symbols
   cannot clash with other primitives linked into the same library.
   NOTE(review): twoexp41 is declared but not referenced in this file's
   visible code — presumably used by the fe1305_* macros in poly1305_asm.h. */
#define mask44 CRYPTO_SHARED_NAMESPACE(mask44)
#define mask42 CRYPTO_SHARED_NAMESPACE(mask42)
#define pmask1 CRYPTO_SHARED_NAMESPACE(pmask1)
#define pmask2 CRYPTO_SHARED_NAMESPACE(pmask2)
#define pmask3 CRYPTO_SHARED_NAMESPACE(pmask3)
#define pmask4 CRYPTO_SHARED_NAMESPACE(pmask4)
#define pmask5 CRYPTO_SHARED_NAMESPACE(pmask5)
#define upmask1 CRYPTO_SHARED_NAMESPACE(upmask1)
#define upmask2 CRYPTO_SHARED_NAMESPACE(upmask2)
#define upmask3 CRYPTO_SHARED_NAMESPACE(upmask3)
#define upmask4 CRYPTO_SHARED_NAMESPACE(upmask4)
#define upmask5 CRYPTO_SHARED_NAMESPACE(upmask5)
#define twoexp41 CRYPTO_SHARED_NAMESPACE(twoexp41)
#define p0 CRYPTO_SHARED_NAMESPACE(p0)
#define p1 CRYPTO_SHARED_NAMESPACE(p1)
#define p2 CRYPTO_SHARED_NAMESPACE(p2)

#include "poly1305_asm.h"

.p2align 5

ASM_HIDDEN _CRYPTO_SHARED_NAMESPACE(poly1305_maa44_g4)
ASM_HIDDEN CRYPTO_SHARED_NAMESPACE(poly1305_maa44_g4)
.global _CRYPTO_SHARED_NAMESPACE(poly1305_maa44_g4)
.global CRYPTO_SHARED_NAMESPACE(poly1305_maa44_g4)

/*----------------------------------------------------------------------
 * poly1305_maa44_g4 — one-shot Poly1305 tag computation.
 *
 * Field elements use radix-2^44 limbs (44/44/42 bits; see mask44/mask42
 * and the shld $20 / shld $22 realignments below).  "g4" = the Horner
 * loop processes 4 message blocks per iteration against precomputed key
 * powers, delaying full reduction across the group (lazy reduction).
 *
 * Observed register contract (SysV AMD64 — confirm against the caller):
 *   rdi = output buffer; receives the 16-byte tag
 *   rsi = message pointer
 *   rdx = key material: precomputed multiplier powers starting at offset
 *         0, and the final 16 key bytes at offsets 96/104
 *   rcx = number of 16-byte blocks, counting a trailing partial block
 *   r8  = presumably the bit count of the partial last block (0 when the
 *         last block is full) — it is compared against 64 and shifted
 *         right by 3 to obtain a byte count; TODO confirm
 *   r9  = presumably the byte count of the partial last block (compared
 *         against 8 on the single-block path); TODO confirm
 *
 * The fe1305_* macros come from poly1305_asm.h and operate on the
 * accumulator limbs in (r12 : r10 : r8) with the matching carry words in
 * (r13 : r11 : r9); they index the key-power table through rdi (rdi is
 * reloaded from 112(%rsp) after it is borrowed for rep movsb below).
 *---------------------------------------------------------------------*/
_CRYPTO_SHARED_NAMESPACE(poly1305_maa44_g4):
CRYPTO_SHARED_NAMESPACE(poly1305_maa44_g4):

/* Prologue: build a 32-byte-aligned scratch frame and save state.
 * Frame layout:
 *   0       caller's rsp (restored in the epilogue)
 *   8..48   callee-saved r12-r15, rbx, rbp
 *   56      output pointer (rdi)
 *   64,72   r8/r9: partial-last-block size info (bits / bytes, see above)
 *   80      remaining-block counter
 *   88,96   last 16 bytes of the key, added to the tag at the very end
 *   104     zero-initialized quadword used to assemble partial blocks
 *   112     saved pointer to the key-power table                        */
movq %rsp,%r11
andq $-32,%rsp
subq $160,%rsp
movq %r11,0(%rsp)
movq %r12,8(%rsp)
movq %r13,16(%rsp)
movq %r14,24(%rsp)
movq %r15,32(%rsp)
movq %rbx,40(%rsp)
movq %rbp,48(%rsp)
movq %rdi,56(%rsp)
movq %r8,64(%rsp)
movq %r9,72(%rsp)

/* store last 16 bytes of the key */
movq 96(%rdx),%r14
movq 104(%rdx),%r15
movq %r14,88(%rsp)
movq %r15,96(%rsp)

/* initialize a quad-word on the stack with 0 (staging area for the
   byte-wise copy of a partial block; guarantees zero padding) */
movq $0,104(%rsp)

/* if the message has a single block, take the fully inlined path .L5 */
cmpq $1,%rcx
je .L5

/* Multi-block path: stash the block count and the power-table pointer,
   and clear the accumulator (r12:r10:r8) and carry words (r13:r11:r9). */
movq %rcx,80(%rsp)
movq %rdx,%rdi
movq %rdx,112(%rsp)
movq $0,%r8
movq $0,%r9
movq $0,%r10
movq $0,%r11
movq $0,%r12
movq $0,%r13

/* Dispatch on how many blocks remain before the (separate) last block. */
cmpq $2,%rcx
je .LB2
cmpq $3,%rcx
je .LB3

/* Main loop: multiply-accumulate 3 buffered blocks against descending
   key powers (table offsets 48/24/0), then one lazy 128-bit reduction
   for the whole group of 4 (the 4th block is folded in below). */
.LB4:
fe1305_mul_taun(0,48)
fe1305_add_product()
fe1305_mul_taun(16,24)
fe1305_add_product()
fe1305_mul_taun(32,0)
fe1305_add_product()
fe1305_reduce_3l_128bits()
addq $48,%rsi
movq 80(%rsp),%rcx
subq $4,%rcx
movq %rcx,80(%rsp)

/* if there are no blocks left before processing the last block */
cmpq $0,%rcx
je .LB0

/* if there is one more block before processing the last block */
fe1305_add_msg_block(0)
addq $16,%rsi
cmpq $1,%rcx
je .LB1

.LT2:
cmpq $2,%rcx
jg .LT3
/* if there are two more block before processing the last block */
fe1305_mul_taunr(24)
jmp .LB2

.LT3:
cmpq $3,%rcx
jg .LT4
/* if there are three more block before processing the last block */
fe1305_mul_taunr(48)
jmp .LB3

.LT4:
/* if there are at least four more block before processing the last block:
   cheap 64-bit carry fix-up is enough between groups, then multiply by
   tau^4 (table offset 72) and loop */
fe1305_reduce_3l_64bits()
fe1305_mul_taunr(72)
jmp .LB4

/* Tail variants: finish the remaining 1/2/3 buffered blocks with the
   matching key powers, ending with a full 128-bit reduction. */
.LB1:
fe1305_mul_taunr(0)
fe1305_reduce_3l_128bits()
jmp .LB0

.LB2:
fe1305_mul_taun(0,0)
fe1305_add_product()
fe1305_reduce_3l_128bits()
addq $16,%rsi
jmp .LB0

.LB3:
fe1305_mul_taun(0,24)
fe1305_add_product()
fe1305_mul_taun(16,0)
fe1305_add_product()
fe1305_reduce_3l_128bits()
addq $32,%rsi

.LB0:
/* if the last block is full (no leftover bits recorded) */
cmpq $0,64(%rsp)
je .L3

/* if the last block has 8 bytes (exactly 64 leftover bits) */
cmpq $64,64(%rsp)
je .L2

/* if the last block has 1 to 7 bytes */
jl .L1

/* else if the last block has 9 to 15 bytes */
/* first chunk of message block = (rax): a full low quadword */
movq 0(%rsi),%rax
addq $8,%rsi
/* rbp = 128 - bits = bits missing from a full block;
   rcx = (bits - 64) / 8 = bytes present in the second chunk */
movq $128,%rbp
subq 64(%rsp),%rbp
movq $64,%rcx
subq %rbp,%rcx
shrq $3,%rcx
leaq 104(%rsp),%rdi
rep movsb (%rsi),(%rdi)

/* second chunk of message block = (rdi); append the Poly1305 pad bit:
   r11 = (2^64-1) >> rbp, +1 gives the single 1-bit just past the data */
movq 104(%rsp),%rdi
movq $-1,%r11
movq %rbp,%rcx
shrq %cl,%r11
addq $1,%r11
orq %r11,%rdi
movq $0,%rbp
jmp .L4

.L1:
/* last block has 1..7 bytes: copy them into the zeroed staging qword */
movq 64(%rsp),%rcx
shrq $3,%rcx
leaq 104(%rsp),%rdi
rep movsb (%rsi),(%rdi)

/* first chunk of message block = (rax), with the pad bit appended one
   position past the copied bytes (same (−1 >> n) + 1 trick as above) */
movq 104(%rsp),%rax
movq $-1,%r11
movb $64,%cl
subb 64(%rsp),%cl
shrq %cl,%r11
addq $1,%r11
orq %r11,%rax

/* second chunk of message block = (rdi): empty */
movq $0,%rdi
movq $0,%rbp
jmp .L4

.L2:
/* last block is exactly 8 bytes: pad bit becomes bit 0 of the 2nd word */
movq 0(%rsi),%rax
movq $1,%rdi
movq $0,%rbp
jmp .L4

.L3:
/* last block is full (16 bytes): pad bit is the 2^128 coefficient */
movq 0(%rsi),%rax
movq 8(%rsi),%rdi
movq $1,%rbp

.L4:
/* Repack the padded block (rbp:rdi:rax), base 2^64, into 44/44/42-bit
   limbs via the pmask* constants, and add it into the accumulator. */
movq %rax,%r13
andq pmask1(%rip),%rax
movq %rdi,%r11
andq pmask2(%rip),%r13
shrq $44,%r13
andq pmask3(%rip),%rdi
shlq $20,%rdi
orq %r13,%rdi
andq pmask4(%rip),%r11
shrq $24,%r11
andq pmask5(%rip),%rbp
shlq $40,%rbp
orq %r11,%rbp
addq %rax,%r8
addq %rdi,%r10
addq %rbp,%r12
/* restore the power-table pointer (rdi was clobbered by rep movsb),
   multiply by tau once more and fully reduce */
movq 112(%rsp),%rdi
fe1305_mul_taunr(0)
fe1305_reduce_3l_128bits()
jmp .L10

.L5:
/* Single-block message: load the three 44/44/42-bit limbs of tau and
   do one schoolbook multiply inline, no macros needed. */
movq 0(%rdx),%r14
movq 8(%rdx),%r15
movq 16(%rdx),%rbx

/* if the single message block is full */
cmpq $0,64(%rsp)
je .L8

/* if the single message block has 1 to 7 bytes */
cmpq $8,72(%rsp)
jl .L6

/* if the single message block has 8 bytes */
je .L7

/* else if the single message block has 9 to 15 bytes */
/* first chunk of message block = (rax) */
movq 0(%rsi),%rax
addq $8,%rsi
/* r8 = 128 - bits; rcx = bytes in the second chunk (cf. the .LB0 path) */
movq $128,%r8
subq 64(%rsp),%r8
movq $64,%rcx
subq %r8,%rcx
shrq $3,%rcx
leaq 104(%rsp),%rdi
rep movsb (%rsi),(%rdi)

/* second chunk of message block = (rbp), with the pad bit appended */
movq 104(%rsp),%rbp
movq $-1,%r11
movq %r8,%rcx
shrq %cl,%r11
addq $1,%r11
orq %r11,%rbp

/* repack (rbp:rax) into 44-bit limbs (rax, rdi, rbp) */
movq %rax,%r9
andq pmask1(%rip),%rax
movq %rbp,%rdi
andq pmask2(%rip),%r9
shrq $44,%r9
andq pmask3(%rip),%rdi
shlq $20,%rdi
orq %r9,%rdi
andq pmask4(%rip),%rbp
shrq $24,%rbp

/* h = block * tau, limb by limb.  Cross terms that land at 2^132 and
   above fold back with weight 20 = 4*5, since 2^130 == 5 (mod p) and the
   top limb sits at 2^88 with 42 bits.  shld $20 / $22 realign the 128-bit
   partial products to the 44/44/42-bit limb boundaries (64-44 / 64-42). */
movq %rax,104(%rsp)
mulq %r14
movq %rax,%r8
movq %rdx,%r9
movq %rdi,%rax
imul $20,%rax,%rax
mulq %rbx
addq %rax,%r8
adcq %rdx,%r9
movq %rbp,%rax
imul $20,%rax,%rax
movq %rax,112(%rsp)
mulq %r15
addq %rax,%r8
adcq %rdx,%r9
shld $20,%r8,%r9
movq 104(%rsp),%rax
mulq %r15
movq %rax,%r10
movq %rdx,%r11
movq %rdi,%rax
mulq %r14
addq %rax,%r10
adcq %rdx,%r11
movq 112(%rsp),%rax
mulq %rbx
addq %rax,%r10
adcq %rdx,%r11
shld $20,%r10,%r11
movq 104(%rsp),%rax
mulq %rbx
movq %rax,%r12
movq %rdx,%r13
movq %rdi,%rax
mulq %r15
addq %rax,%r12
adcq %rdx,%r13
movq %rbp,%rax
mulq %r14
addq %rax,%r12
adcq %rdx,%r13
shld $22,%r12,%r13
jmp .L9

.L6:
/* read the remainder bytes onto stack */
movq 64(%rsp),%rcx
shrq $3,%rcx
leaq 104(%rsp),%rdi
rep movsb (%rsi),(%rdi)

/* message block = (rax), padded with the 1-bit past the data */
movq 104(%rsp),%rax
movq $-1,%r11
movb $64,%cl
subb 64(%rsp),%cl
shrq %cl,%r11
addq $1,%r11
orq %r11,%rax

/* the block fits in two limbs (rax, rdi); top limb is zero */
movq %rax,%rdi
andq pmask1(%rip),%rax
andq pmask2(%rip),%rdi
shrq $44,%rdi

/* h = block * tau (two-limb block; same structure as the path above) */
movq %rax,104(%rsp)
mulq %r14
movq %rax,%r8
movq %rdx,%r9
movq %rdi,%rax
imul $20,%rax,%rax
mulq %rbx
addq %rax,%r8
adcq %rdx,%r9
shld $20,%r8,%r9
movq 104(%rsp),%rax
mulq %r15
movq %rax,%r10
movq %rdx,%r11
movq %rdi,%rax
mulq %r14
addq %rax,%r10
adcq %rdx,%r11
shld $20,%r10,%r11
movq 104(%rsp),%rax
mulq %rbx
movq %rax,%r12
movq %rdx,%r13
movq %rdi,%rax
mulq %r15
addq %rax,%r12
adcq %rdx,%r13
shld $22,%r12,%r13
jmp .L9

.L7:
/* message block = (rax); pad bit is bit 64, i.e. rdi = 1 */
movq 0(%rsi),%rax
movq $1,%rdi

/* repack (rdi:rax) into limbs (rax, rdi, rbp) */
movq %rax,%r9
andq pmask1(%rip),%rax
movq %rdi,%rbp
andq pmask2(%rip),%r9
shrq $44,%r9
andq pmask3(%rip),%rdi
shlq $20,%rdi
orq %r9,%rdi
andq pmask4(%rip),%rbp
shrq $24,%rbp

/* h = block * tau */
movq %rax,104(%rsp)
mulq %r14
movq %rax,%r8
movq %rdx,%r9
movq %rdi,%rax
imul $20,%rax,%rax
mulq %rbx
addq %rax,%r8
adcq %rdx,%r9
shld $20,%r8,%r9
movq 104(%rsp),%rax
mulq %r15
movq %rax,%r10
movq %rdx,%r11
movq %rdi,%rax
mulq %r14
addq %rax,%r10
adcq %rdx,%r11
shld $20,%r10,%r11
movq 104(%rsp),%rax
mulq %rbx
movq %rax,%r12
movq %rdx,%r13
movq %rdi,%rax
mulq %r15
addq %rax,%r12
adcq %rdx,%r13
shld $22,%r12,%r13
jmp .L9

.L8:
/* message block = (rbp : rdi : rax); full block, pad bit at 2^128 */
movq 0(%rsi),%rax
movq 8(%rsi),%rdi
movq $1,%rbp

/* repack (rbp:rdi:rax) into 44/44/42-bit limbs */
movq %rax,%r9
andq pmask1(%rip),%rax
movq %rdi,%r11
andq pmask2(%rip),%r9
shrq $44,%r9
andq pmask3(%rip),%rdi
shlq $20,%rdi
orq %r9,%rdi
andq pmask4(%rip),%r11
shrq $24,%r11
andq pmask5(%rip),%rbp
shlq $40,%rbp
orq %r11,%rbp

/* h = block * tau (full three-limb block) */
movq %rax,104(%rsp)
mulq %r14
movq %rax,%r8
movq %rdx,%r9
movq %rdi,%rax
imul $20,%rax,%rax
mulq %rbx
addq %rax,%r8
adcq %rdx,%r9
movq %rbp,%rax
imul $20,%rax,%rax
movq %rax,112(%rsp)
mulq %r15
addq %rax,%r8
adcq %rdx,%r9
shld $20,%r8,%r9
movq 104(%rsp),%rax
mulq %r15
movq %rax,%r10
movq %rdx,%r11
movq %rdi,%rax
mulq %r14
addq %rax,%r10
adcq %rdx,%r11
movq 112(%rsp),%rax
mulq %rbx
addq %rax,%r10
adcq %rdx,%r11
shld $20,%r10,%r11
movq 104(%rsp),%rax
mulq %rbx
movq %rax,%r12
movq %rdx,%r13
movq %rdi,%rax
mulq %r15
addq %rax,%r12
adcq %rdx,%r13
movq %rbp,%rax
mulq %r14
addq %rax,%r12
adcq %rdx,%r13
shld $22,%r12,%r13

.L9:
/* reduction on (r12 : r10 : r8): mask each limb to 44/44/42 bits, add the
   carry words, and fold the topmost carry back with weight 5 (2^130 == 5) */
movq mask44(%rip),%rbp
andq %rbp,%r8
andq %rbp,%r10
addq %r9,%r10
andq mask42(%rip),%r12
addq %r11,%r12
imul $5,%r13,%r13
addq %r13,%r8

.L10:
/* Final carry propagation: ripple 44/44/42-bit carries upward once, fold
   the top overflow back times 5.  (rbp still holds mask44 here.) */
movq %r8,%rdx
shrq $44,%rdx
addq %r10,%rdx
andq %rbp,%r8
movq %rdx,%r10
shrq $44,%rdx
addq %r12,%rdx
andq %rbp,%r10
movq %rdx,%r12
shrq $42,%rdx
imul $5,%rdx,%rdx
addq %rdx,%r8
andq mask42(%rip),%r12

/* get back the element in base 2^{64} (upmask* select/realign limb bits) */
andq upmask1(%rip),%r8
movq %r10,%r9
andq upmask2(%rip),%r9
shlq $44,%r9
orq %r9,%r8
andq upmask3(%rip),%r10
shrq $20,%r10
movq %r12,%r11
andq upmask4(%rip),%r11
shlq $24,%r11
orq %r11,%r10
andq upmask5(%rip),%r12
shrq $20,%r12

/* freeze the reduced field element (r12 : r10 : r8): compute h - p, then
   branchlessly keep the unsubtracted value if it went negative.  The
   shlq $62 drops bit 2 of the top limb — the sign of h - p — into CF,
   steering the cmovc's in constant time. */
movq %r8,%r9
movq %r10,%r11
movq %r12,%r13
subq p0(%rip),%r8
sbbq p1(%rip),%r10
sbbq p2(%rip),%r12
movq %r12,%rcx
shlq $62,%rcx
cmovc %r9,%r8
cmovc %r11,%r10
cmovc %r13,%r12

/* add last 16 bytes of the key (the Poly1305 "s" value, mod 2^128) */
addq 88(%rsp),%r8
adcq 96(%rsp),%r10
adcq $0,%r12

/* store the 128-bit tag */
movq 56(%rsp),%rdi
movq %r8,0(%rdi)
movq %r10,8(%rdi)

/* Epilogue: restore callee-saved registers and the caller's rsp. */
movq 0(%rsp),%r11
movq 8(%rsp),%r12
movq 16(%rsp),%r13
movq 24(%rsp),%r14
movq 32(%rsp),%r15
movq 40(%rsp),%rbx
movq 48(%rsp),%rbp
movq %r11,%rsp
ret