#ifndef __POLY1305ASM__
#define __POLY1305ASM__
/* NOTE(review): leading-double-underscore identifiers are reserved in C;
   consider renaming this include guard. */

/*
 * Field arithmetic used for computing poly1305.
 *
 * GNU as / AT&T syntax, x86-64 with BMI2: every multiplication below is
 * mulx, whose implicit source operand is %rdx and which does not modify
 * the flags, so the add/adc carry chains stay intact across the
 * interleaved multiplies.
 *
 * Conventions visible in this file:
 *   %rdi         base of a table of precomputed multipliers; the
 *                t+0 / t+8 / t+16 offsets address one 2- or 3-limb
 *                (64-bit limbs, little-endian limb order) entry.
 *                NOTE(review): presumably powers of the key ("tau") --
 *                confirm against the callers.
 *   %rsi         base of the message; m+0 / m+8 address one 16-byte block.
 *   %r8-%r10     running accumulator h (with %r11, %r12 holding the upper
 *                limbs / carry right after a multiplication).
 *   120(%rsp)    one quadword of caller-provided scratch
 *                (used by fe1305_mul_taun only).
 *   mask2/mask2c RIP-relative constants defined elsewhere; from their use
 *                here they should be 3 and ~3 (split at bit 130, i.e. bit 2
 *                of the third limb) -- confirm where they are defined.
 */

/*
 * fe1305_mul_taun(m,t):
 *   (r13:r14:r15:rax:rcx) = (m[0]:m[1] + 2^128) * (t[0]:t[1]:t[2])
 * Multiply one 16-byte message block (plus its implicit 2^128 padding
 * bit) by a 3-limb multiplier.  The padding bit contributes the final
 * t[]-adds, which enter two limbs up, i.e. t * 2^128.  Result limbs
 * (low to high) are exactly fe1305_add_product()'s inputs.
 * Clobbers: rax, rbx, rcx, rdx, rbp, r13-r15 and 120(%rsp).
 */
#define fe1305_mul_taun(m,t) \
\
movq m+0(%rsi),%rdx; /* rdx = m[0] */ \
\
mulx t+0(%rdi),%r13,%r14; \
mulx t+8(%rdi),%rbx,%r15; \
addq %rbx,%r14; \
\
mulx t+16(%rdi),%rbx,%rax; \
adcq %rbx,%r15; \
adcq $0,%rax; \
\
movq %rax,120(%rsp); /* spill partial limb 3 */ \
movq m+8(%rsi),%rdx; /* rdx = m[1] */ \
\
mulx t+0(%rdi),%rbx,%rbp; \
mulx t+8(%rdi),%rax,%rcx; \
addq %rax,%rbp; \
\
mulx t+16(%rdi),%rax,%rdx; \
adcq %rcx,%rax; \
adcq $0,%rdx; \
\
addq %rbx,%r14; /* fold the m[1] row in, one limb up */ \
adcq %rbp,%r15; \
adcq 120(%rsp),%rax; \
adcq $0,%rdx; \
\
movq %rdx,%rcx; \
\
addq t+0(%rdi),%r15; /* + t * 2^128: the padding bit times t */ \
adcq t+8(%rdi),%rax; \
adcq t+16(%rdi),%rcx; \

/*
 * fe1305_mul_taunr(t):
 *   (r8:r9:r10:r11:r12) = (r8:r9:r10) * (t[0]:t[1]:t[2])
 * In-place multiply of the (already reduced) 3-limb accumulator by a
 * 3-limb multiplier.  The high half of r10*t[2] is discarded and the
 * last adc chain has no final carry pick-up, which is only sound while
 * r10 is small -- NOTE(review): holds right after the fe1305_reduce_*
 * macros; confirm callers always reduce first.
 * Clobbers: rax, rbx, rcx, rdx, rbp, r13-r15.
 */
#define fe1305_mul_taunr(t) \
\
movq %r8,%rdx; /* rdx = h[0] */ \
\
mulx t+0(%rdi),%r8,%r14; \
mulx t+8(%rdi),%rbx,%r15; \
addq %rbx,%r14; \
\
mulx t+16(%rdi),%rbx,%rax; \
adcq %rbx,%r15; \
adcq $0,%rax; \
\
movq %r9,%rdx; /* rdx = h[1] */ \
\
mulx t+0(%rdi),%r9,%rbp; \
mulx t+8(%rdi),%rbx,%r13; \
addq %rbx,%rbp; \
\
mulx t+16(%rdi),%rbx,%rcx; \
adcq %r13,%rbx; \
adcq $0,%rcx; \
\
addq %r14,%r9; /* fold the h[1] row in, one limb up */ \
adcq %rbp,%r15; \
adcq %rbx,%rax; \
adcq $0,%rcx; \
\
movq %r10,%rdx; /* rdx = h[2] (small after reduction) */ \
\
mulx t+0(%rdi),%r10,%r11; \
mulx t+8(%rdi),%r14,%r13; \
addq %r14,%r11; \
\
mulx t+16(%rdi),%r12,%rdx; /* high half -> rdx, discarded */ \
adcq %r13,%r12; \
\
addq %r15,%r10; /* fold the h[2] row in, two limbs up */ \
adcq %rax,%r11; \
adcq %rcx,%r12; \

/*
 * fe1305_mul_tau(m,t):
 *   (r13:r14:r15:rax:rcx) = (m[0]:m[1] + 2^128) * (t[0]:t[1])
 * 2-limb-multiplier variant of fe1305_mul_taun.  rcx is zeroed up front
 * so the adc-with-rcx ops merely propagate carries, and the closing
 * "adcq %rcx,%rcx" captures the last carry as the top result limb.
 * Clobbers: rax, rbx, rcx, rdx, rbp, r13-r15.
 */
#define fe1305_mul_tau(m,t) \
\
movq $0,%rcx; /* rcx = 0: carry-propagation zero */ \
movq m+0(%rsi),%rdx; /* rdx = m[0] */ \
\
mulx t+0(%rdi),%r13,%r14; \
mulx t+8(%rdi),%rbx,%r15; \
addq %rbx,%r14; \
adcq %rcx,%r15; \
\
movq m+8(%rsi),%rdx; /* rdx = m[1] */ \
\
mulx t+0(%rdi),%rbx,%rbp; \
mulx t+8(%rdi),%rdx,%rax; \
addq %rdx,%rbp; \
adcq %rcx,%rax; \
\
addq %rbx,%r14; /* fold the m[1] row in, one limb up */ \
adcq %rbp,%r15; \
adcq %rcx,%rax; \
\
addq t+0(%rdi),%r15; /* + t * 2^128: the padding bit times t */ \
adcq t+8(%rdi),%rax; \
adcq %rcx,%rcx; /* rcx was 0: becomes the final carry */ \

/*
 * fe1305_mul_taur(t):
 *   (r8:r9:r10:r11:r12) = (r8:r9:r10) * (t[0]:t[1])
 * In-place 2-limb-multiplier variant of fe1305_mul_taunr.  r12 is zeroed
 * up front (carry-propagation zero) and ends up holding the top limb.
 * Clobbers: rbx, rcx, rdx, rbp, r11, r14, r15.
 */
#define fe1305_mul_taur(t) \
\
movq $0,%r12; /* r12 = 0: carry-propagation zero */ \
movq %r8,%rdx; /* rdx = h[0] */ \
\
mulx t+0(%rdi),%r8,%r14; \
mulx t+8(%rdi),%rbx,%r15; \
addq %rbx,%r14; \
adcq %r12,%r15; \
\
movq %r9,%rdx; /* rdx = h[1] */ \
\
mulx t+0(%rdi),%r9,%rbp; \
mulx t+8(%rdi),%rdx,%r11; \
addq %rdx,%rbp; \
adcq %r12,%r11; \
\
addq %r14,%r9; /* fold the h[1] row in, one limb up */ \
adcq %rbp,%r15; \
adcq %r12,%r11; \
\
movq %r10,%rdx; /* rdx = h[2] (small after reduction) */ \
\
mulx t+0(%rdi),%r10,%rcx; \
mulx t+8(%rdi),%rdx,%rbx; \
addq %rdx,%rcx; \
adcq %r12,%rbx; \
\
addq %r15,%r10; /* fold the h[2] row in, two limbs up */ \
adcq %rcx,%r11; \
adcq %rbx,%r12; \

/*
 * fe1305_add_product(): h (r8..r12) += product limbs (r13,r14,r15,rax,rcx)
 * as produced by fe1305_mul_taun / fe1305_mul_tau.
 */
#define fe1305_add_product() \
\
addq %r13,%r8; \
adcq %r14,%r9; \
adcq %r15,%r10; \
adcq %rax,%r11; \
adcq %rcx,%r12; \

/*
 * fe1305_add_msg_block(m): h (r8:r9:r10) += 16-byte message block + 2^128;
 * the "adcq $1" on the third limb is the standard poly1305 padding bit.
 */
#define fe1305_add_msg_block(m) \
\
addq m+0(%rsi),%r8; \
adcq m+8(%rsi),%r9; \
adcq $1,%r10; /* implicit 2^128 padding bit */ \

/*
 * fe1305_reduce_5l(): reduce the 5-limb value r8..r12 modulo 2^130 - 5;
 * result in r8:r9:r10.  Write h = low(130 bits) + 2^130 * c; since
 * 2^130 = 5 (mod 2^130 - 5), add 5c = 4c + c: the first add chain adds 4c
 * (c is held shifted left by 2 in r13:r11:r12), the shrd/shrq steps then
 * form c itself, and the second chain adds it.
 * Clobbers: r11, r12, r13.
 */
#define fe1305_reduce_5l() \
\
movq %r10,%r13; \
andq mask2(%rip),%r10; /* keep bits 128..129 of h */ \
andq mask2c(%rip),%r13; /* r13:r11:r12 = 4c */ \
\
addq %r13,%r8; /* h += 4c */ \
adcq %r11,%r9; \
adcq %r12,%r10; \
\
shrd $2,%r11,%r13; /* r13:r11:r12 = c */ \
shrd $2,%r12,%r11; \
shrq $2,%r12; \
\
addq %r13,%r8; /* h += c */ \
adcq %r11,%r9; \
adcq %r12,%r10; \

/*
 * fe1305_reduce_4l(): as fe1305_reduce_5l but for a 4-limb input; r12
 * takes part only in the carry chains (nothing is shifted out of it), so
 * it is expected to be 0 here -- NOTE(review): confirm at the call sites.
 * Clobbers: r11, r13 (r12 itself is not written).
 */
#define fe1305_reduce_4l() \
\
movq %r10,%r13; \
\
andq mask2(%rip),%r10; /* keep bits 128..129 of h */ \
andq mask2c(%rip),%r13; /* r13:r11 = 4c */ \
\
addq %r13,%r8; /* h += 4c */ \
adcq %r11,%r9; \
adcq %r12,%r10; \
\
shrd $2,%r11,%r13; /* r13:r11 = c */ \
shrq $2,%r11; \
\
addq %r13,%r8; /* h += c */ \
adcq %r11,%r9; \
adcq %r12,%r10; \

/*
 * fe1305_reduce_3l(): final fold of a 3-limb h: c = h div 2^130 (the bits
 * of r10 at and above bit 2) is multiplied by 5 (2^130 = 5 mod 2^130 - 5)
 * and added back into the bottom; r10 keeps only its low two bits plus a
 * possible carry.  Result in r8:r9:r10.
 * Clobbers: r11.
 */
#define fe1305_reduce_3l() \
\
movq %r10,%r11; \
andq mask2(%rip),%r10; /* keep bits 128..129 of h */ \
shrq $2,%r11; /* r11 = c */ \
\
imul $5,%r11,%r11; /* 2^130 = 5 (mod 2^130 - 5) */ \
addq %r11,%r8; \
adcq $0,%r9; \
adcq $0,%r10; \

#endif