/* assembly to compute poly1305 using precomputed key powers and
   applying lazy reduction over a group of 32 field elements */

#include "crypto_asm_hidden.h"
// linker define poly1305_maax_g24
// linker use mask2
// linker use mask2c
// linker use zero
// linker use p0
// linker use p1
// linker use p2

#define mask2  CRYPTO_SHARED_NAMESPACE(mask2)
#define mask2c CRYPTO_SHARED_NAMESPACE(mask2c)
#define zero   CRYPTO_SHARED_NAMESPACE(zero)
#define p0     CRYPTO_SHARED_NAMESPACE(p0)
#define p1     CRYPTO_SHARED_NAMESPACE(p1)
#define p2     CRYPTO_SHARED_NAMESPACE(p2)

#include "poly1305_asm.h"

	.p2align 5

ASM_HIDDEN _CRYPTO_SHARED_NAMESPACE(poly1305_maax_g24)
ASM_HIDDEN CRYPTO_SHARED_NAMESPACE(poly1305_maax_g24)
.global _CRYPTO_SHARED_NAMESPACE(poly1305_maax_g24)
.global CRYPTO_SHARED_NAMESPACE(poly1305_maax_g24)
_CRYPTO_SHARED_NAMESPACE(poly1305_maax_g24):
CRYPTO_SHARED_NAMESPACE(poly1305_maax_g24):

	movq 	%rsp,%r11
	andq    $-32,%rsp
	subq 	$128,%rsp

	movq 	%r11,0(%rsp)
	movq 	%r12,8(%rsp)
	movq 	%r13,16(%rsp)
	movq 	%r14,24(%rsp)
	movq 	%r15,32(%rsp)
	movq 	%rbx,40(%rsp)
	movq 	%rbp,48(%rsp)
	movq 	%rdi,56(%rsp)
	movq 	%r8,64(%rsp)
	movq 	%r9,72(%rsp)

	/* store last 16 bytes of the key */
	movq    576(%rdx),%r14
	movq    584(%rdx),%r15		
	movq    %r14,88(%rsp)
	movq    %r15,96(%rsp)
	
	/* key = (r15 : r14) */
	movq    0(%rdx),%r14
	movq    8(%rdx),%r15
	
	/* initialize a quad-word on the stack with 0 */
	movq	$0,104(%rsp)	

	/* if the message has a single block */
	cmpq    $1,%rcx
	je      .L5	
	
	movq    %rcx,80(%rsp)
	movq    %rdx,%rdi
	movq    %rdx,112(%rsp)
	
	movq    $0,%r8
	movq    $0,%r9
	movq    $0,%r10
	movq    $0,%r11
	movq    $0,%r12	
	
	cmpq    $2,%rcx
	je      .LB2
	
	cmpq    $3,%rcx
	je      .LB3
	
	cmpq    $4,%rcx
	je      .LB4
	
	cmpq    $5,%rcx
	je      .LB5
	
	cmpq    $6,%rcx
	je      .LB6
	
	cmpq    $7,%rcx
	je      .LB7
	
	cmpq    $8,%rcx
	je      .LB8
	
	cmpq    $9,%rcx
	je      .LB9
	
	cmpq    $10,%rcx
	je      .LB10
	
	cmpq    $11,%rcx
	je      .LB11
	
	cmpq    $12,%rcx
	je      .LB12
	
	cmpq    $13,%rcx
	je      .LB13
	
	cmpq    $14,%rcx
	je      .LB14
	
	cmpq    $15,%rcx
	je      .LB15
	
	cmpq    $16,%rcx
	je      .LB16
	
	cmpq    $17,%rcx
	je      .LB17
	
	cmpq    $18,%rcx
	je      .LB18
	
	cmpq    $19,%rcx
	je      .LB19
	
	cmpq    $20,%rcx
	je      .LB20
	
	cmpq    $21,%rcx
	je      .LB21
	
	cmpq    $22,%rcx
	je      .LB22
	
	cmpq    $23,%rcx
	je      .LB23
		
.LB24:
	fe1305_mul_taun(0,528)
	fe1305_add_product()
	
	fe1305_mul_taun(16,504)
	fe1305_add_product()
	
	fe1305_mul_taun(32,480)
	fe1305_add_product()
	
	fe1305_mul_taun(48,456)
	fe1305_add_product()
	
	fe1305_mul_taun(64,432)
	fe1305_add_product()
	
	fe1305_mul_taun(80,408)
	fe1305_add_product()	
	
	fe1305_mul_taun(96,384)
	fe1305_add_product()
	
	fe1305_mul_taun(112,360)
	fe1305_add_product()
	
	fe1305_mul_taun(128,336)
	fe1305_add_product()
	
	fe1305_mul_taun(144,312)
	fe1305_add_product()	
	
	fe1305_mul_taun(160,288)
	fe1305_add_product()
	
	fe1305_mul_taun(176,264)
	fe1305_add_product()
	
	fe1305_mul_taun(192,240)
	fe1305_add_product()
	
	fe1305_mul_taun(208,216)
	fe1305_add_product()
	
	fe1305_mul_taun(224,192)
	fe1305_add_product()
	
	fe1305_mul_taun(240,168)
	fe1305_add_product()
	
	fe1305_mul_taun(256,144)
	fe1305_add_product()
	
	fe1305_mul_taun(272,120)
	fe1305_add_product()
	
	fe1305_mul_taun(288,96)
	fe1305_add_product()
	
	fe1305_mul_taun(304,72)
	fe1305_add_product()
	
	fe1305_mul_taun(320,48)
	fe1305_add_product()
	
	fe1305_mul_taun(336,24)
	fe1305_add_product()
	
	fe1305_mul_tau(352,0)
	fe1305_add_product()					
        
	fe1305_reduce_5l()
	fe1305_reduce_3l()
	
	addq	$368,%rsi
	
	movq    80(%rsp),%rcx
	subq    $24,%rcx
	movq    %rcx,80(%rsp)	
	
	/* if there are no blocks left before processing the last block */
	cmpq    $0,%rcx	
	je      .LB0

	/* if there is one more block before processing the last block */
	fe1305_add_msg_block(0)
	addq	$16,%rsi
	
	cmpq    $1,%rcx
	je     .LB1
	
.LT2:
	cmpq    $2,%rcx	
	jg      .LT3	

	/* if there are two more block before processing the last block */
	fe1305_mul_taunr(24)
	jmp     .LB2

.LT3:
	cmpq    $3,%rcx
	jg      .LT4

	/* if there are three more block before processing the last block */
	fe1305_mul_taunr(48)
	jmp     .LB3
	
.LT4:
	cmpq    $4,%rcx
	jg      .LT5

	/* if there are four more block before processing the last block */
	fe1305_mul_taunr(72)
	jmp     .LB4
	
.LT5:
	cmpq    $5,%rcx
	jg      .LT6

	/* if there are five more block before processing the last block */
	fe1305_mul_taunr(96)
	jmp     .LB5
	
.LT6:
	cmpq    $6,%rcx
	jg      .LT7

	/* if there are six more block before processing the last block */
	fe1305_mul_taunr(120)
	jmp     .LB6
	
.LT7:
	cmpq    $7,%rcx
	jg      .LT8

	/* if there are seven more block before processing the last block */
	fe1305_mul_taunr(144)
	jmp     .LB7
	
.LT8:
	cmpq    $8,%rcx
	jg      .LT9

	/* if there are eight more block before processing the last block */
	fe1305_mul_taunr(168)
	jmp     .LB8
	
.LT9:
	cmpq    $9,%rcx
	jg      .LT10

	/* if there are nine more block before processing the last block */
	fe1305_mul_taunr(192)
	jmp     .LB9
	
.LT10:
	cmpq    $10,%rcx
	jg      .LT11

	/* if there are ten more block before processing the last block */
	fe1305_mul_taunr(216)
	jmp     .LB10
	
.LT11:
	cmpq    $11,%rcx
	jg      .LT12

	/* if there are eleven more block before processing the last block */
	fe1305_mul_taunr(240)
	jmp     .LB11
	
.LT12:
	cmpq    $12,%rcx
	jg      .LT13

	/* if there are twelve more block before processing the last block */
	fe1305_mul_taunr(264)
	jmp     .LB12
	
.LT13:
	cmpq    $13,%rcx
	jg      .LT14

	/* if there are thirteen more block before processing the last block */
	fe1305_mul_taunr(288)
	jmp     .LB13
	
.LT14:
	cmpq    $14,%rcx
	jg      .LT15

	/* if there are fourteen more block before processing the last block */
	fe1305_mul_taunr(312)
	jmp     .LB14
	
.LT15:
	cmpq    $15,%rcx
	jg      .LT16

	/* if there are fifteen more block before processing the last block */
	fe1305_mul_taunr(336)
	jmp     .LB15
	
.LT16:
	cmpq    $16,%rcx
	jg      .LT17

	/* if there are sixteen more block before processing the last block */
	fe1305_mul_taunr(360)
	jmp     .LB16
	
.LT17:
	cmpq    $17,%rcx
	jg      .LT18

	/* if there are seventeen more block before processing the last block */
	fe1305_mul_taunr(384)
	jmp     .LB17
	
.LT18:
	cmpq    $18,%rcx
	jg      .LT19

	/* if there are eighteen more block before processing the last block */
	fe1305_mul_taunr(408)
	jmp     .LB18
	
.LT19:
	cmpq    $19,%rcx
	jg      .LT20

	/* if there are nineteen more block before processing the last block */
	fe1305_mul_taunr(432)
	jmp     .LB19
	
.LT20:
	cmpq    $20,%rcx
	jg      .LT21

	/* if there are twenty more block before processing the last block */
	fe1305_mul_taunr(456)
	jmp     .LB20
	
.LT21:
	cmpq    $21,%rcx
	jg      .LT22

	/* if there are twenty one more block before processing the last block */
	fe1305_mul_taunr(480)
	jmp     .LB21
	
.LT22:
	cmpq    $22,%rcx
	jg      .LT23

	/* if there are twenty two more block before processing the last block */
	fe1305_mul_taunr(504)
	jmp     .LB22
	
.LT23:
	cmpq    $23,%rcx
	jg      .LT24

	/* if there are twenty three more block before processing the last block */
	fe1305_mul_taunr(528)
	jmp     .LB23				
		
.LT24:
	/* if there are at least twenty four more block before processing the last block */
	fe1305_mul_taunr(552)
	jmp     .LB24
			
.LB1:
	fe1305_mul_taur(0)

	fe1305_reduce_5l()
	
	jmp     .LB0	
			
.LB2:
	fe1305_mul_tau(0,0)
	fe1305_add_product()
	        
	fe1305_reduce_5l()
	
	addq	$16,%rsi
	jmp     .LB0
	
.LB3:
	fe1305_mul_taun(0,24)
	fe1305_add_product()
		
	fe1305_mul_tau(16,0)
	fe1305_add_product()
		
	fe1305_reduce_5l()
	
	addq	$32,%rsi
	jmp     .LB0	
	
.LB4:
	fe1305_mul_taun(0,48)
	fe1305_add_product()
	
	fe1305_mul_taun(16,24)
	fe1305_add_product()
	
	fe1305_mul_tau(32,0)
	fe1305_add_product()
	
	fe1305_reduce_5l()
		
	addq	$48,%rsi
	jmp     .LB0
	
.LB5:
	fe1305_mul_taun(0,72)
	fe1305_add_product()

	fe1305_mul_taun(16,48)
	fe1305_add_product()
	
	fe1305_mul_taun(32,24)
	fe1305_add_product()
	
	fe1305_mul_tau(48,0)
	fe1305_add_product()
	
	fe1305_reduce_5l()
	
	addq	$64,%rsi
	jmp     .LB0
	
.LB6:
	fe1305_mul_taun(0,96)
	fe1305_add_product()
	
	fe1305_mul_taun(16,72)
	fe1305_add_product()
	
	fe1305_mul_taun(32,48)
	fe1305_add_product()
	
	fe1305_mul_taun(48,24)
	fe1305_add_product()
	
	fe1305_mul_tau(64,0)
	fe1305_add_product()
	
	fe1305_reduce_5l()
	
	addq	$80,%rsi
	jmp     .LB0
	
.LB7:
	fe1305_mul_taun(0,120)
	fe1305_add_product()

	fe1305_mul_taun(16,96)
	fe1305_add_product()
	
	fe1305_mul_taun(32,72)
	fe1305_add_product()
	
	fe1305_mul_taun(48,48)
	fe1305_add_product()
	
	fe1305_mul_taun(64,24)
	fe1305_add_product()
	
	fe1305_mul_tau(80,0)
	fe1305_add_product()
	
	fe1305_reduce_5l()
	
	addq	$96,%rsi
	jmp     .LB0
	
.LB8:
	fe1305_mul_taun(0,144)
	fe1305_add_product()

	fe1305_mul_taun(16,120)
	fe1305_add_product()
	
	fe1305_mul_taun(32,96)
	fe1305_add_product()
	
	fe1305_mul_taun(48,72)
	fe1305_add_product()
	
	fe1305_mul_taun(64,48)
	fe1305_add_product()
	
	fe1305_mul_taun(80,24)
	fe1305_add_product()
	
	fe1305_mul_tau(96,0)
	fe1305_add_product()	
	
	fe1305_reduce_5l()
	
	addq	$112,%rsi
	jmp     .LB0
	
.LB9:
	fe1305_mul_taun(0,168)
	fe1305_add_product()

	fe1305_mul_taun(16,144)
	fe1305_add_product()
	
	fe1305_mul_taun(32,120)
	fe1305_add_product()
	
	fe1305_mul_taun(48,96)
	fe1305_add_product()
	
	fe1305_mul_taun(64,72)
	fe1305_add_product()
	
	fe1305_mul_taun(80,48)
	fe1305_add_product()
	
	fe1305_mul_taun(96,24)
	fe1305_add_product()
	
	fe1305_mul_tau(112,0)
	fe1305_add_product()		
	
	fe1305_reduce_5l()
	
	addq	$128,%rsi
	jmp     .LB0
	
.LB10:
	fe1305_mul_taun(0,192)
	fe1305_add_product()

	fe1305_mul_taun(16,168)
	fe1305_add_product()
	
	fe1305_mul_taun(32,144)
	fe1305_add_product()
	
	fe1305_mul_taun(48,120)
	fe1305_add_product()
	
	fe1305_mul_taun(64,96)
	fe1305_add_product()
	
	fe1305_mul_taun(80,72)
	fe1305_add_product()
	
	fe1305_mul_taun(96,48)
	fe1305_add_product()
	
	fe1305_mul_taun(112,24)
	fe1305_add_product()
	
	fe1305_mul_tau(128,0)
	fe1305_add_product()	
	
	fe1305_reduce_5l()
	
	addq	$144,%rsi
	jmp     .LB0
	
.LB11:
	fe1305_mul_taun(0,216)
	fe1305_add_product()

	fe1305_mul_taun(16,192)
	fe1305_add_product()
	
	fe1305_mul_taun(32,168)
	fe1305_add_product()
	
	fe1305_mul_taun(48,144)
	fe1305_add_product()
	
	fe1305_mul_taun(64,120)
	fe1305_add_product()
	
	fe1305_mul_taun(80,96)
	fe1305_add_product()
	
	fe1305_mul_taun(96,72)
	fe1305_add_product()
	
	fe1305_mul_taun(112,48)
	fe1305_add_product()
	
	fe1305_mul_taun(128,24)
	fe1305_add_product()
	
	fe1305_mul_tau(144,0)
	fe1305_add_product()	
	
	fe1305_reduce_5l()
	
	addq	$160,%rsi
	jmp     .LB0
		
.LB12:
	fe1305_mul_taun(0,240)
	fe1305_add_product()

	fe1305_mul_taun(16,216)
	fe1305_add_product()
	
	fe1305_mul_taun(32,192)
	fe1305_add_product()
	
	fe1305_mul_taun(48,168)
	fe1305_add_product()
	
	fe1305_mul_taun(64,144)
	fe1305_add_product()
	
	fe1305_mul_taun(80,120)
	fe1305_add_product()
	
	fe1305_mul_taun(96,96)
	fe1305_add_product()
	
	fe1305_mul_taun(112,72)
	fe1305_add_product()
	
	fe1305_mul_taun(128,48)
	fe1305_add_product()
	
	fe1305_mul_taun(144,24)
	fe1305_add_product()
	
	fe1305_mul_tau(160,0)
	fe1305_add_product()		
	
	fe1305_reduce_5l()
	
	addq	$176,%rsi
	jmp     .LB0
	
.LB13:
	fe1305_mul_taun(0,264)
	fe1305_add_product()

	fe1305_mul_taun(16,240)
	fe1305_add_product()
	
	fe1305_mul_taun(32,216)
	fe1305_add_product()
	
	fe1305_mul_taun(48,192)
	fe1305_add_product()
	
	fe1305_mul_taun(64,168)
	fe1305_add_product()
	
	fe1305_mul_taun(80,144)
	fe1305_add_product()
	
	fe1305_mul_taun(96,120)
	fe1305_add_product()
	
	fe1305_mul_taun(112,96)
	fe1305_add_product()
	
	fe1305_mul_taun(128,72)
	fe1305_add_product()
	
	fe1305_mul_taun(144,48)
	fe1305_add_product()
	
	fe1305_mul_taun(160,24)
	fe1305_add_product()
	
	fe1305_mul_tau(176,0)
	fe1305_add_product()			
	
	fe1305_reduce_5l()
	
	addq	$192,%rsi
	jmp     .LB0	
	
.LB14:
	fe1305_mul_taun(0,288)
	fe1305_add_product()

	fe1305_mul_taun(16,264)
	fe1305_add_product()
	
	fe1305_mul_taun(32,240)
	fe1305_add_product()
	
	fe1305_mul_taun(48,216)
	fe1305_add_product()
	
	fe1305_mul_taun(64,192)
	fe1305_add_product()
	
	fe1305_mul_taun(80,168)
	fe1305_add_product()
	
	fe1305_mul_taun(96,144)
	fe1305_add_product()
	
	fe1305_mul_taun(112,120)
	fe1305_add_product()
	
	fe1305_mul_taun(128,96)
	fe1305_add_product()
	
	fe1305_mul_taun(144,72)
	fe1305_add_product()
	
	fe1305_mul_taun(160,48)
	fe1305_add_product()
	
	fe1305_mul_taun(176,24)
	fe1305_add_product()
	
	fe1305_mul_tau(192,0)
	fe1305_add_product()				
	
	fe1305_reduce_5l()
	
	addq	$208,%rsi
	jmp     .LB0
	
.LB15:
	fe1305_mul_taun(0,312)
	fe1305_add_product()

	fe1305_mul_taun(16,288)
	fe1305_add_product()
	
	fe1305_mul_taun(32,264)
	fe1305_add_product()
	
	fe1305_mul_taun(48,240)
	fe1305_add_product()
	
	fe1305_mul_taun(64,216)
	fe1305_add_product()
	
	fe1305_mul_taun(80,192)
	fe1305_add_product()
	
	fe1305_mul_taun(96,168)
	fe1305_add_product()
	
	fe1305_mul_taun(112,144)
	fe1305_add_product()
	
	fe1305_mul_taun(128,120)
	fe1305_add_product()
	
	fe1305_mul_taun(144,96)
	fe1305_add_product()
	
	fe1305_mul_taun(160,72)
	fe1305_add_product()
	
	fe1305_mul_taun(176,48)
	fe1305_add_product()
	
	fe1305_mul_taun(192,24)
	fe1305_add_product()
	
	fe1305_mul_tau(208,0)
	fe1305_add_product()					
	
	fe1305_reduce_5l()
	
	addq	$224,%rsi	
	jmp	.LB0
	
.LB16:
	fe1305_mul_taun(0,336)
	fe1305_add_product()

	fe1305_mul_taun(16,312)
	fe1305_add_product()

	fe1305_mul_taun(32,288)
	fe1305_add_product()
	
	fe1305_mul_taun(48,264)
	fe1305_add_product()
	
	fe1305_mul_taun(64,240)
	fe1305_add_product()
	
	fe1305_mul_taun(80,216)
	fe1305_add_product()
	
	fe1305_mul_taun(96,192)
	fe1305_add_product()
	
	fe1305_mul_taun(112,168)
	fe1305_add_product()
	
	fe1305_mul_taun(128,144)
	fe1305_add_product()
	
	fe1305_mul_taun(144,120)
	fe1305_add_product()
	
	fe1305_mul_taun(160,96)
	fe1305_add_product()
	
	fe1305_mul_taun(176,72)
	fe1305_add_product()
	
	fe1305_mul_taun(192,48)
	fe1305_add_product()
	
	fe1305_mul_taun(208,24)
	fe1305_add_product()
	
	fe1305_mul_tau(224,0)
	fe1305_add_product()					
	
	fe1305_reduce_5l()
	
	addq	$240,%rsi
	jmp	.LB0
	
.LB17:
	fe1305_mul_taun(0,360)
	fe1305_add_product()

	fe1305_mul_taun(16,336)
	fe1305_add_product()

	fe1305_mul_taun(32,312)
	fe1305_add_product()

	fe1305_mul_taun(48,288)
	fe1305_add_product()
	
	fe1305_mul_taun(64,264)
	fe1305_add_product()
	
	fe1305_mul_taun(80,240)
	fe1305_add_product()
	
	fe1305_mul_taun(96,216)
	fe1305_add_product()
	
	fe1305_mul_taun(112,192)
	fe1305_add_product()
	
	fe1305_mul_taun(128,168)
	fe1305_add_product()
	
	fe1305_mul_taun(144,144)
	fe1305_add_product()
	
	fe1305_mul_taun(160,120)
	fe1305_add_product()
	
	fe1305_mul_taun(176,96)
	fe1305_add_product()
	
	fe1305_mul_taun(192,72)
	fe1305_add_product()
	
	fe1305_mul_taun(208,48)
	fe1305_add_product()
	
	fe1305_mul_taun(224,24)
	fe1305_add_product()
	
	fe1305_mul_tau(240,0)
	fe1305_add_product()					
	
	fe1305_reduce_5l()
	
	addq	$256,%rsi
	jmp	.LB0
	
.LB18:
	fe1305_mul_taun(0,384)
	fe1305_add_product()
	
	fe1305_mul_taun(16,360)
	fe1305_add_product()

	fe1305_mul_taun(32,336)
	fe1305_add_product()

	fe1305_mul_taun(48,312)
	fe1305_add_product()

	fe1305_mul_taun(64,288)
	fe1305_add_product()
	
	fe1305_mul_taun(80,264)
	fe1305_add_product()
	
	fe1305_mul_taun(96,240)
	fe1305_add_product()
	
	fe1305_mul_taun(112,216)
	fe1305_add_product()
	
	fe1305_mul_taun(128,192)
	fe1305_add_product()
	
	fe1305_mul_taun(144,168)
	fe1305_add_product()
	
	fe1305_mul_taun(160,144)
	fe1305_add_product()
	
	fe1305_mul_taun(176,120)
	fe1305_add_product()
	
	fe1305_mul_taun(192,96)
	fe1305_add_product()
	
	fe1305_mul_taun(208,72)
	fe1305_add_product()
	
	fe1305_mul_taun(224,48)
	fe1305_add_product()
	
	fe1305_mul_taun(240,24)
	fe1305_add_product()
	
	fe1305_mul_tau(256,0)
	fe1305_add_product()					
	
	fe1305_reduce_5l()
	
	addq	$272,%rsi
	jmp	.LB0
	
.LB19:
	fe1305_mul_taun(0,408)
	fe1305_add_product()

	fe1305_mul_taun(16,384)
	fe1305_add_product()
	
	fe1305_mul_taun(32,360)
	fe1305_add_product()

	fe1305_mul_taun(48,336)
	fe1305_add_product()

	fe1305_mul_taun(64,312)
	fe1305_add_product()

	fe1305_mul_taun(80,288)
	fe1305_add_product()
	
	fe1305_mul_taun(96,264)
	fe1305_add_product()
	
	fe1305_mul_taun(112,240)
	fe1305_add_product()
	
	fe1305_mul_taun(128,216)
	fe1305_add_product()
	
	fe1305_mul_taun(144,192)
	fe1305_add_product()
	
	fe1305_mul_taun(160,168)
	fe1305_add_product()
	
	fe1305_mul_taun(176,144)
	fe1305_add_product()
	
	fe1305_mul_taun(192,120)
	fe1305_add_product()
	
	fe1305_mul_taun(208,96)
	fe1305_add_product()
	
	fe1305_mul_taun(224,72)
	fe1305_add_product()
	
	fe1305_mul_taun(240,48)
	fe1305_add_product()
	
	fe1305_mul_taun(256,24)
	fe1305_add_product()
	
	fe1305_mul_tau(272,0)
	fe1305_add_product()					
	
	fe1305_reduce_5l()
	
	addq	$288,%rsi
	jmp	.LB0
	
.LB20:
	fe1305_mul_taun(0,432)
	fe1305_add_product()

	fe1305_mul_taun(16,408)
	fe1305_add_product()

	fe1305_mul_taun(32,384)
	fe1305_add_product()
	
	fe1305_mul_taun(48,360)
	fe1305_add_product()

	fe1305_mul_taun(64,336)
	fe1305_add_product()

	fe1305_mul_taun(80,312)
	fe1305_add_product()

	fe1305_mul_taun(96,288)
	fe1305_add_product()
	
	fe1305_mul_taun(112,264)
	fe1305_add_product()
	
	fe1305_mul_taun(128,240)
	fe1305_add_product()
	
	fe1305_mul_taun(144,216)
	fe1305_add_product()
	
	fe1305_mul_taun(160,192)
	fe1305_add_product()
	
	fe1305_mul_taun(176,168)
	fe1305_add_product()
	
	fe1305_mul_taun(192,144)
	fe1305_add_product()
	
	fe1305_mul_taun(208,120)
	fe1305_add_product()
	
	fe1305_mul_taun(224,96)
	fe1305_add_product()
	
	fe1305_mul_taun(240,72)
	fe1305_add_product()
	
	fe1305_mul_taun(256,48)
	fe1305_add_product()
	
	fe1305_mul_taun(272,24)
	fe1305_add_product()
	
	fe1305_mul_tau(288,0)
	fe1305_add_product()					
	
	fe1305_reduce_5l()
	
	addq	$304,%rsi
	jmp	.LB0
	
.LB21:
	fe1305_mul_taun(0,456)
	fe1305_add_product()

	fe1305_mul_taun(16,432)
	fe1305_add_product()

	fe1305_mul_taun(32,408)
	fe1305_add_product()

	fe1305_mul_taun(48,384)
	fe1305_add_product()
	
	fe1305_mul_taun(64,360)
	fe1305_add_product()

	fe1305_mul_taun(80,336)
	fe1305_add_product()

	fe1305_mul_taun(96,312)
	fe1305_add_product()

	fe1305_mul_taun(112,288)
	fe1305_add_product()
	
	fe1305_mul_taun(128,264)
	fe1305_add_product()
	
	fe1305_mul_taun(144,240)
	fe1305_add_product()
	
	fe1305_mul_taun(160,216)
	fe1305_add_product()
	
	fe1305_mul_taun(176,192)
	fe1305_add_product()
	
	fe1305_mul_taun(192,168)
	fe1305_add_product()
	
	fe1305_mul_taun(208,144)
	fe1305_add_product()
	
	fe1305_mul_taun(224,120)
	fe1305_add_product()
	
	fe1305_mul_taun(240,96)
	fe1305_add_product()
	
	fe1305_mul_taun(256,72)
	fe1305_add_product()
	
	fe1305_mul_taun(272,48)
	fe1305_add_product()
	
	fe1305_mul_taun(288,24)
	fe1305_add_product()
	
	fe1305_mul_tau(304,0)
	fe1305_add_product()					
	
	fe1305_reduce_5l()
	
	addq	$320,%rsi
	jmp	.LB0
	
.LB22:
	fe1305_mul_taun(0,480)
	fe1305_add_product()

	fe1305_mul_taun(16,456)
	fe1305_add_product()

	fe1305_mul_taun(32,432)
	fe1305_add_product()

	fe1305_mul_taun(48,408)
	fe1305_add_product()

	fe1305_mul_taun(64,384)
	fe1305_add_product()
	
	fe1305_mul_taun(80,360)
	fe1305_add_product()

	fe1305_mul_taun(96,336)
	fe1305_add_product()

	fe1305_mul_taun(112,312)
	fe1305_add_product()

	fe1305_mul_taun(128,288)
	fe1305_add_product()
	
	fe1305_mul_taun(144,264)
	fe1305_add_product()
	
	fe1305_mul_taun(160,240)
	fe1305_add_product()
	
	fe1305_mul_taun(176,216)
	fe1305_add_product()
	
	fe1305_mul_taun(192,192)
	fe1305_add_product()
	
	fe1305_mul_taun(208,168)
	fe1305_add_product()
	
	fe1305_mul_taun(224,144)
	fe1305_add_product()
	
	fe1305_mul_taun(240,120)
	fe1305_add_product()
	
	fe1305_mul_taun(256,96)
	fe1305_add_product()
	
	fe1305_mul_taun(272,72)
	fe1305_add_product()
	
	fe1305_mul_taun(288,48)
	fe1305_add_product()
	
	fe1305_mul_taun(304,24)
	fe1305_add_product()
	
	fe1305_mul_tau(320,0)
	fe1305_add_product()					
	
	fe1305_reduce_5l()
	
	addq	$336,%rsi
	jmp	.LB0
	
.LB23:
	fe1305_mul_taun(0,504)
	fe1305_add_product()

	fe1305_mul_taun(16,480)
	fe1305_add_product()

	fe1305_mul_taun(32,456)
	fe1305_add_product()

	fe1305_mul_taun(48,432)
	fe1305_add_product()

	fe1305_mul_taun(64,408)
	fe1305_add_product()

	fe1305_mul_taun(80,384)
	fe1305_add_product()
	
	fe1305_mul_taun(96,360)
	fe1305_add_product()

	fe1305_mul_taun(112,336)
	fe1305_add_product()

	fe1305_mul_taun(128,312)
	fe1305_add_product()

	fe1305_mul_taun(144,288)
	fe1305_add_product()
	
	fe1305_mul_taun(160,264)
	fe1305_add_product()
	
	fe1305_mul_taun(176,240)
	fe1305_add_product()
	
	fe1305_mul_taun(192,216)
	fe1305_add_product()
	
	fe1305_mul_taun(208,192)
	fe1305_add_product()
	
	fe1305_mul_taun(224,168)
	fe1305_add_product()
	
	fe1305_mul_taun(240,144)
	fe1305_add_product()
	
	fe1305_mul_taun(256,120)
	fe1305_add_product()
	
	fe1305_mul_taun(272,96)
	fe1305_add_product()
	
	fe1305_mul_taun(288,72)
	fe1305_add_product()
	
	fe1305_mul_taun(304,48)
	fe1305_add_product()
	
	fe1305_mul_taun(320,24)
	fe1305_add_product()
	
	fe1305_mul_tau(336,0)
	fe1305_add_product()					
	
	fe1305_reduce_5l()
	
	addq	$352,%rsi
			
.LB0:
	/* if the last block is full */
	cmpq    $0,64(%rsp)
	je      .L3
	
	/* if the last block has 8 bytes */	
	cmpq    $64,64(%rsp)	
	je	.L2	

	/* if the last block has 1 to 7 bytes */	
	jl	.L1
	
	/* else if the last block has 9 to 15 bytes */
	
	/* first chunk of message block = (r12) */	
	movq    0(%rsi),%r12
	addq	$8,%rsi	
	
	movq	$128,%rbx
	subq	64(%rsp),%rbx
	movq	$64,%rcx
	subq	%rbx,%rcx	
	shrq	$3,%rcx	
	leaq	104(%rsp),%rdi	
rep	movsb	(%rsi),(%rdi)	

	/* second chunk of message block = (r13) */
	movq    104(%rsp),%r13	
	movq	$-1,%r11
	movq	%rbx,%rcx	
	shrq	%cl,%r11
	addq	$1,%r11	
	orq	%r11,%r13
	
	movq	$0,%r14
	
	jmp	.L4

.L1:
	movq	64(%rsp),%rcx
	shrq	$3,%rcx	
	leaq	104(%rsp),%rdi	
rep	movsb	(%rsi),(%rdi)	

	/* first chunk of message block = (r12) */
	movq    104(%rsp),%r12	
	movq	$-1,%r11	
	movb	$64,%cl	
	subb	64(%rsp),%cl
	shrq	%cl,%r11	
	addq	$1,%r11	
	orq	%r11,%r12
	
	/* second chunk of message block = (r13) */
	movq	$0,%r13
		
	movq	$0,%r14

	jmp	.L4
	
.L2:
	movq	0(%rsi),%r12
	movq	$1,%r13
	movq	$0,%r14
	jmp	.L4

.L3:
	movq	0(%rsi),%r12
	movq	8(%rsi),%r13
	movq	$1,%r14
	
.L4:
	addq    %r12,%r8
	adcq    %r13,%r9
	adcq    %r14,%r10

	movq    112(%rsp),%rdi
	fe1305_mul_taur(0)

	fe1305_reduce_5l()
	
	jmp	.L9
	
.L5:   
	/* if the single message block is full */
	cmpq    $0,64(%rsp)
	je      .L8

	/* if the single message block has 1 to 7 bytes */
	cmpq    $8,72(%rsp)
	jl      .L6
	
	/* if the single message block has 8 bytes */
	je     .L7
	
	/* else if the single message block has 9 to 15 bytes */

	/* first chunk of message block = (r13) */	
	movq    0(%rsi),%r13
	
	addq	$8,%rsi	
	
	movq	$128,%rbx
	subq	64(%rsp),%rbx
	movq	$64,%rcx
	subq	%rbx,%rcx	
	shrq	$3,%rcx	
	leaq	104(%rsp),%rdi	
rep	movsb	(%rsi),(%rdi)	

	/* second chunk of message block = (rax) */
	movq    104(%rsp),%rax	
	movq	$-1,%r11
	movq	%rbx,%rcx	
	shrq	%cl,%r11	
	addq	$1,%r11	
	orq	%r11,%rax
	
	/* integer multiplication */		
	xorq    %r11,%r11
	movq    %r13,%rdx    

	mulx    %r14,%r8,%r9
	mulx    %r15,%rbx,%r10
	adcx    %rbx,%r9
	adcx    %r11,%r10

	xorq    %r12,%r12
	movq    %rax,%rdx
	   
	mulx    %r14,%rbx,%rbp
	adcx    %rbx,%r9
	adox    %rbp,%r10
	    
	mulx    %r15,%rbx,%rbp
	adcx    %rbx,%r10
	adox    %rbp,%r11
	adcx    %r12,%r11
	
	/* reduction on the integer product (r11 : r10 : r9 : r8) */
	movq    %r10,%rbx

	andq    mask2(%rip),%r10
	andq    mask2c(%rip),%rbx

	addq    %rbx,%r8
	adcq    %r11,%r9
	adcq    %r12,%r10

	shrd    $2,%r11,%rbx
	shrq    $2,%r11

	addq    %rbx,%r8
	adcq    %r11,%r9
	adcq    %r12,%r10

	jmp     .L9

.L6:   
	movq	64(%rsp),%rcx
	shrq	$3,%rcx	
	leaq	104(%rsp),%rdi	
rep	movsb	(%rsi),(%rdi)	

	/* message block = (r13) */
	movq    104(%rsp),%r13	
	movq	$-1,%r11
	movb	$64,%cl
	subb	64(%rsp),%cl
	shrq	%cl,%r11
	addq	$1,%r11
	orq	%r11,%r13

	/* integer multiplication */
	xorq    %r11,%r11
	movq    %r13,%rdx    

	mulx    %r14,%r8,%r9
	mulx    %r15,%rbx,%r10
	adcx    %rbx,%r9
	adcx    %r11,%r10
	
	jmp     .L9
	
.L7:   
	/* integer multiplication */
	xorq    %r11,%r11
	movq    0(%rsi),%rdx    

	mulx    %r14,%r8,%r9
	mulx    %r15,%rbx,%r10
	adcx    %rbx,%r9
	adcx    %r11,%r10
	
	xorq    %rax,%rax
	    
	adcx    %r14,%r9
	adox    %rax,%r10	
	adcx    %r15,%r10
	adox    %rax,%r11
	adcx    %rax,%r11	

	/* reduction on the integer product (r11 : r10 : r9 : r8) */
	movq    %r10,%r13

	andq    mask2(%rip),%r10
	andq    mask2c(%rip),%r13

	addq    %r13,%r8
	adcq    %r11,%r9
	adcq    $0,%r10

	shrd    $2,%r11,%r13
	shrq    $2,%r11

	addq    %r13,%r8
	adcq    %r11,%r9
	adcq    $0,%r10
	
	jmp     .L9		

.L8:
	/* integer multiplication */
	xorq    %r11,%r11
	movq    0(%rsi),%rdx    

	mulx    %r14,%r8,%r9
	mulx    %r15,%rbx,%r10
	adcx    %rbx,%r9
	adcx    %r11,%r10

	xorq    %r12,%r12
	movq    8(%rsi),%rdx
	   
	mulx    %r14,%rbx,%rbp
	adcx    %rbx,%r9
	adox    %rbp,%r10
	    
	mulx    %r15,%rbx,%rbp
	adcx    %rbx,%r10
	adox    %rbp,%r11
	adcx    %r12,%r11

	xorq    %rax,%rax
	    
	adcx    %r14,%r10
	adox    %rax,%r11	
	adcx    %r15,%r11
	adox    %rax,%r12
	adcx    %rax,%r12

	/* reduction on the integer product (r12 : r11 : r10 : r9 : r8) */
	movq    %r10,%rbx

	andq    mask2(%rip),%r10
	andq    mask2c(%rip),%rbx

	addq    %rbx,%r8
	adcq    %r11,%r9
	adcq    %r12,%r10

	shrd    $2,%r11,%rbx
	shrd    $2,%r12,%r11
	shrq    $2,%r12

	addq    %rbx,%r8
	adcq    %r11,%r9
	adcq    %r12,%r10

.L9:	
	/* final reduction on (r10 : r9 : r8) */
	movq    %r10,%r11
	shrq    $2,%r11
	andq	mask2(%rip),%r10

	imul    $5,%r11,%r11
	addq    %r11,%r8
	adcq    $0,%r9
	adcq    $0,%r10
	
	/* freeze the reduced field element (r10 : r9 : r8) */
	movq    %r8,%r11
	movq    %r9,%r12
	movq    %r10,%r13

	subq    p0(%rip),%r8
	sbbq    p1(%rip),%r9
	sbbq    p2(%rip),%r10

	movq    %r10,%rcx
	shlq    $62,%rcx

	cmovc   %r11,%r8
	cmovc   %r12,%r9
	cmovc   %r13,%r10

	/* add last 16 bytes of the key */
	addq	88(%rsp),%r8
	adcq	96(%rsp),%r9
	adcq	$0,%r10	

	/* store first 128 bytes of the result */
	movq 	56(%rsp),%rdi
	movq    %r8,0(%rdi)
	movq    %r9,8(%rdi)

	movq 	0(%rsp),%r11
	movq 	8(%rsp),%r12
	movq 	16(%rsp),%r13
	movq 	24(%rsp),%r14
	movq 	32(%rsp),%r15
	movq 	40(%rsp),%rbx
	movq 	48(%rsp),%rbp

	movq 	%r11,%rsp

	ret