// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT. //go:build gc && !purego #include "textflag.h" // func polyHashADInternal<>() TEXT polyHashADInternal<>(SB), NOSPLIT, $0 // Hack: Must declare #define macros inside of a function due to Avo constraints // ROL rotates the uint32s in register R left by N bits, using temporary T. #define ROL(N, R, T) \ MOVO R, T; \ PSLLL $(N), T; \ PSRLL $(32-(N)), R; \ PXOR T, R // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. #ifdef GOAMD64_v2 #define ROL8(R, T) PSHUFB ·rol8<>(SB), R #else #define ROL8(R, T) ROL(8, R, T) #endif // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. #ifdef GOAMD64_v2 #define ROL16(R, T) PSHUFB ·rol16<>(SB), R #else #define ROL16(R, T) ROL(16, R, T) #endif XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 CMPQ R9, $0x0d JNE hashADLoop MOVQ (CX), R10 MOVQ 5(CX), R11 SHRQ $0x18, R11 MOVQ $0x00000001, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 RET hashADLoop: // Hash in 16 byte chunks CMPQ R9, $0x10 JB hashADTail ADDQ (CX), R10 ADCQ 8(CX), R11 ADCQ $0x01, R12 LEAQ 16(CX), CX SUBQ $0x10, R9 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 JMP hashADLoop hashADTail: CMPQ R9, $0x00 JE hashADDone // Hash last < 16 byte tail XORQ R13, R13 XORQ R14, R14 XORQ R15, R15 ADDQ R9, CX hashADTailLoop: SHLQ $0x08, R13, R14 SHLQ $0x08, R13 MOVB -1(CX), R15 XORQ R15, R13 DECQ CX DECQ R9 JNE hashADTailLoop ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 hashADDone: RET // func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool // Requires: AVX, AVX2, BMI2, CMOV, SSE2 TEXT ·chacha20Poly1305Open(SB), $288-97 // For aligned stack access MOVQ SP, BP ADDQ $0x20, BP ANDQ $-32, BP MOVQ dst_base+0(FP), DI MOVQ key_base+24(FP), R8 MOVQ src_base+48(FP), SI MOVQ src_len+56(FP), BX MOVQ ad_base+72(FP), CX // Check for AVX2 support CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Open_AVX2 // Special optimization, for very short buffers CMPQ BX, $0x80 JBE openSSE128 // For 
long buffers, prepare the poly key first MOVOU ·chacha20Constants<>+0(SB), X0 MOVOU 16(R8), X3 MOVOU 32(R8), X6 MOVOU 48(R8), X9 MOVO X9, X13 // Store state on stack for future use MOVO X3, 32(BP) MOVO X6, 48(BP) MOVO X9, 128(BP) MOVQ $0x0000000a, R9 openSSEPreparePolyKey: PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x0c PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x04 DECQ R9 JNE openSSEPreparePolyKey // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded PADDL ·chacha20Constants<>+0(SB), X0 PADDL 32(BP), X3 // Clamp and store the key PAND ·polyClampMask<>+0(SB), X0 MOVO X0, (BP) MOVO X3, 16(BP) // Hash AAD MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSEMainLoop: CMPQ BX, $0x00000100 JB openSSEMainLoopDone // Load state, increment counter blocks MOVO ·chacha20Constants<>+0(SB), X0 MOVO 32(BP), X3 MOVO 48(BP), X6 MOVO 128(BP), X9 PADDL ·sseIncMask<>+0(SB), X9 MOVO X0, X1 MOVO X3, X4 MOVO X6, X7 MOVO X9, X10 PADDL ·sseIncMask<>+0(SB), X10 MOVO X1, X2 MOVO X4, X5 MOVO X7, X8 MOVO X10, X11 PADDL ·sseIncMask<>+0(SB), X11 MOVO X2, X12 MOVO X5, X13 MOVO X8, X14 MOVO X11, X15 PADDL ·sseIncMask<>+0(SB), X15 // Store counters MOVO X9, 80(BP) MOVO X10, 96(BP) MOVO X11, 112(BP) MOVO X15, 128(BP) // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash // 2 blocks, and for the remaining 4 only 1 block - for a total of 16 MOVQ $0x00000004, CX MOVQ SI, R9 openSSEInternalLoop: MOVO X14, 64(BP) PADDD X3, X0 PXOR X0, X9 ROL16(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x0c, X14 PSRLL $0x14, X3 PXOR X14, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x07, X14 PSRLL $0x19, X3 PXOR X14, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x0c, X14 PSRLL $0x14, X4 PXOR X14, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x07, X14 PSRLL $0x19, X4 PXOR X14, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x0c, X14 PSRLL $0x14, X5 PXOR X14, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x07, X14 PSRLL $0x19, X5 PXOR X14, X5 MOVO 64(BP), X14 MOVO X7, 64(BP) PADDD X13, X12 PXOR X12, X15 ROL16(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x0c, X7 PSRLL $0x14, X13 PXOR X7, X13 PADDD X13, X12 PXOR X12, X15 ROL8(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x07, X7 PSRLL $0x19, X13 PXOR X7, X13 MOVO 64(BP), X7 ADDQ (R9), R10 ADCQ 8(R9), R11 ADCQ $0x01, R12 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE 
$0xed BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x0c MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX LEAQ 16(R9), R9 MOVO X14, 64(BP) PADDD X3, X0 PXOR X0, X9 ROL16(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x0c, X14 PSRLL $0x14, X3 PXOR X14, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x07, X14 PSRLL $0x19, X3 PXOR X14, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x0c, X14 PSRLL $0x14, X4 PXOR X14, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x07, X14 PSRLL $0x19, X4 PXOR X14, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x0c, X14 PSRLL $0x14, X5 PXOR X14, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x07, X14 PSRLL $0x19, X5 PXOR X14, X5 MOVO 64(BP), X14 MOVO X7, 64(BP) IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 PADDD X13, X12 PXOR X12, X15 ROL16(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x0c, X7 PSRLL $0x14, X13 PXOR X7, X13 PADDD X13, X12 PXOR X12, X15 ROL8(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x07, X7 PSRLL $0x19, X13 PXOR X7, X13 MOVO 64(BP), X7 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x04 DECQ CX JGE openSSEInternalLoop ADDQ (R9), R10 ADCQ 8(R9), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, 
R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(R9), R9 CMPQ CX, $-6 JG openSSEInternalLoop // Add in the state PADDD ·chacha20Constants<>+0(SB), X0 PADDD ·chacha20Constants<>+0(SB), X1 PADDD ·chacha20Constants<>+0(SB), X2 PADDD ·chacha20Constants<>+0(SB), X12 PADDD 32(BP), X3 PADDD 32(BP), X4 PADDD 32(BP), X5 PADDD 32(BP), X13 PADDD 48(BP), X6 PADDD 48(BP), X7 PADDD 48(BP), X8 PADDD 48(BP), X14 PADDD 80(BP), X9 PADDD 96(BP), X10 PADDD 112(BP), X11 PADDD 128(BP), X15 // Load - xor - store MOVO X15, 64(BP) MOVOU (SI), X15 PXOR X15, X0 MOVOU X0, (DI) MOVOU 16(SI), X15 PXOR X15, X3 MOVOU X3, 16(DI) MOVOU 32(SI), X15 PXOR X15, X6 MOVOU X6, 32(DI) MOVOU 48(SI), X15 PXOR X15, X9 MOVOU X9, 48(DI) MOVOU 64(SI), X9 PXOR X9, X1 MOVOU X1, 64(DI) MOVOU 80(SI), X9 PXOR X9, X4 MOVOU X4, 80(DI) MOVOU 96(SI), X9 PXOR X9, X7 MOVOU X7, 96(DI) MOVOU 112(SI), X9 PXOR X9, X10 MOVOU X10, 112(DI) MOVOU 128(SI), X9 PXOR X9, X2 MOVOU X2, 128(DI) MOVOU 144(SI), X9 PXOR X9, X5 MOVOU X5, 144(DI) MOVOU 160(SI), X9 PXOR X9, X8 MOVOU X8, 160(DI) MOVOU 176(SI), X9 PXOR X9, X11 MOVOU X11, 176(DI) MOVOU 192(SI), X9 PXOR X9, X12 MOVOU X12, 192(DI) MOVOU 208(SI), X9 PXOR X9, X13 MOVOU X13, 208(DI) MOVOU 224(SI), X9 PXOR X9, X14 MOVOU X14, 224(DI) MOVOU 240(SI), X9 PXOR 64(BP), X9 MOVOU X9, 240(DI) LEAQ 256(SI), SI LEAQ 256(DI), DI SUBQ $0x00000100, BX JMP openSSEMainLoop openSSEMainLoopDone: // Handle the various tail sizes efficiently TESTQ BX, BX JE openSSEFinalize CMPQ BX, $0x40 JBE openSSETail64 CMPQ BX, $0x80 JBE openSSETail128 CMPQ BX, $0xc0 JBE openSSETail192 JMP openSSETail256 openSSEFinalize: // Hash in the PT, AAD lengths ADDQ ad_len+80(FP), R10 ADCQ src_len+56(FP), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 // Final reduce MOVQ R10, R13 MOVQ R11, R14 MOVQ R12, R15 SUBQ $-5, R10 SBBQ $-1, R11 SBBQ $0x03, R12 CMOVQCS R13, R10 CMOVQCS R14, R11 CMOVQCS R15, R12 // Add in the "s" part of the key ADDQ 16(BP), R10 ADCQ 24(BP), R11 // Finally, constant time compare to the tag at the end of the message XORQ AX, AX MOVQ $0x00000001, DX XORQ (SI), R10 XORQ 8(SI), R11 ORQ R11, R10 CMOVQEQ DX, AX // Return true iff tags are equal MOVB AX, ret+96(FP) RET openSSE128: MOVOU ·chacha20Constants<>+0(SB), X0 MOVOU 16(R8), X3 MOVOU 32(R8), X6 MOVOU 48(R8), X9 MOVO X0, X1 MOVO X3, X4 MOVO X6, X7 MOVO X9, X10 PADDL ·sseIncMask<>+0(SB), X10 MOVO X1, X2 MOVO X4, X5 MOVO X7, X8 MOVO X10, X11 PADDL ·sseIncMask<>+0(SB), X11 MOVO X3, X13 MOVO X6, X14 MOVO X10, X15 MOVQ $0x0000000a, R9 openSSE128InnerCipherLoop: PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x0c, X12 PSRLL $0x14, X4 PXOR X12, X4 PADDD X4, X1 
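// The PADDD/PXOR/rotate runs around this point are ChaCha20 quarter rounds:
// rotations by 16 and 8 go through the ROL16/ROL8 macros (PSHUFB when
// GOAMD64_v2 is set, shift+shift+XOR otherwise), while 12 and 7 are always
// shift+shift+XOR. A rough scalar Go sketch of one quarter round, for
// reference only (assumes "math/bits" is imported):
//
//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
//		a += b
//		d = bits.RotateLeft32(d^a, 16)
//		c += d
//		b = bits.RotateLeft32(b^c, 12)
//		a += b
//		d = bits.RotateLeft32(d^a, 8)
//		c += d
//		b = bits.RotateLeft32(b^c, 7)
//		return a, b, c, d
//	}
//
// Each XMM register holds one 4-lane row of a block's 4x4 state, so a single
// PADDD/PXOR pair advances all four column quarter rounds of that block at
// once; this loop interleaves three blocks in separate register sets.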
PXOR X1, X10 ROL8(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x07, X12 PSRLL $0x19, X4 PXOR X12, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x0c, X12 PSRLL $0x14, X5 PXOR X12, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x07, X12 PSRLL $0x19, X5 PXOR X12, X5 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x0c, X12 PSRLL $0x14, X4 PXOR X12, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x07, X12 PSRLL $0x19, X4 PXOR X12, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x0c, X12 PSRLL $0x14, X5 PXOR X12, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x07, X12 PSRLL $0x19, X5 PXOR X12, X5 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 DECQ R9 JNE openSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded PADDL ·chacha20Constants<>+0(SB), X0 PADDL ·chacha20Constants<>+0(SB), X1 PADDL ·chacha20Constants<>+0(SB), X2 PADDL X13, X3 PADDL X13, X4 PADDL X13, X5 PADDL X14, X7 PADDL X14, X8 PADDL X15, X10 PADDL ·sseIncMask<>+0(SB), X15 PADDL X15, X11 // Clamp and store the key PAND ·polyClampMask<>+0(SB), X0 MOVOU X0, (BP) MOVOU X3, 16(BP) // Hash MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSE128Open: CMPQ BX, $0x10 JB openSSETail16 SUBQ $0x10, BX // Load for hashing ADDQ (SI), R10 ADCQ 8(SI), R11 ADCQ $0x01, R12 // Load for decryption MOVOU (SI), X12 PXOR X12, X1 MOVOU X1, (DI) LEAQ 16(SI), SI LEAQ 16(DI), DI MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ 
R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 // Shift the stream "left" MOVO X4, X1 MOVO X7, X4 MOVO X10, X7 MOVO X2, X10 MOVO X5, X2 MOVO X8, X5 MOVO X11, X8 JMP openSSE128Open openSSETail16: TESTQ BX, BX JE openSSEFinalize // We can safely load the CT from the end, because it is padded with the MAC MOVQ BX, R9 SHLQ $0x04, R9 LEAQ ·andMask<>+0(SB), R13 MOVOU (SI), X12 ADDQ BX, SI PAND -16(R13)(R9*1), X12 MOVO X12, 64(BP) MOVQ X12, R13 MOVQ 72(BP), R14 PXOR X1, X12 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes openSSETail16Store: MOVQ X12, R8 MOVB R8, (DI) PSRLDQ $0x01, X12 INCQ DI DECQ BX JNE openSSETail16Store ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 JMP openSSEFinalize openSSETail64: MOVO ·chacha20Constants<>+0(SB), X0 MOVO 32(BP), X3 MOVO 48(BP), X6 MOVO 128(BP), X9 PADDL ·sseIncMask<>+0(SB), X9 MOVO X9, 80(BP) XORQ R9, R9 MOVQ BX, CX CMPQ CX, $0x10 JB openSSETail64LoopB openSSETail64LoopA: ADDQ (SI)(R9*1), R10 ADCQ 8(SI)(R9*1), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 SUBQ $0x10, CX openSSETail64LoopB: ADDQ $0x10, R9 PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x0c PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x04 CMPQ CX, $0x10 JAE openSSETail64LoopA CMPQ R9, $0xa0 JNE openSSETail64LoopB PADDL ·chacha20Constants<>+0(SB), X0 PADDL 32(BP), X3 PADDL 48(BP), X6 PADDL 80(BP), X9 openSSETail64DecLoop: CMPQ BX, $0x10 JB openSSETail64DecLoopDone SUBQ $0x10, BX MOVOU (SI), X12 PXOR X12, X0 MOVOU X0, (DI) LEAQ 16(SI), SI LEAQ 16(DI), DI MOVO X3, X0 MOVO X6, X3 MOVO X9, X6 JMP openSSETail64DecLoop openSSETail64DecLoopDone: MOVO X0, X1 JMP openSSETail16 openSSETail128: MOVO ·chacha20Constants<>+0(SB), X1 MOVO 32(BP), X4 MOVO 
48(BP), X7 MOVO 128(BP), X10 PADDL ·sseIncMask<>+0(SB), X10 MOVO X10, 80(BP) MOVO X1, X0 MOVO X4, X3 MOVO X7, X6 MOVO X10, X9 PADDL ·sseIncMask<>+0(SB), X9 MOVO X9, 96(BP) XORQ R9, R9 MOVQ BX, CX ANDQ $-16, CX openSSETail128LoopA: ADDQ (SI)(R9*1), R10 ADCQ 8(SI)(R9*1), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 openSSETail128LoopB: ADDQ $0x10, R9 PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x0c, X12 PSRLL $0x14, X4 PXOR X12, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x07, X12 PSRLL $0x19, X4 PXOR X12, X4 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x0c PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x0c, X12 PSRLL $0x14, X4 PXOR X12, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x07, X12 PSRLL $0x19, X4 PXOR X12, X4 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x04 CMPQ R9, CX JB openSSETail128LoopA CMPQ R9, $0xa0 JNE openSSETail128LoopB PADDL ·chacha20Constants<>+0(SB), X0 PADDL ·chacha20Constants<>+0(SB), X1 PADDL 32(BP), X3 PADDL 32(BP), X4 PADDL 48(BP), X6 PADDL 48(BP), X7 PADDL 96(BP), X9 PADDL 80(BP), X10 MOVOU (SI), X12 MOVOU 16(SI), X13 MOVOU 32(SI), X14 MOVOU 48(SI), X15 PXOR X12, X1 PXOR X13, X4 PXOR X14, X7 PXOR X15, X10 MOVOU X1, (DI) MOVOU X4, 16(DI) MOVOU X7, 32(DI) MOVOU X10, 48(DI) SUBQ $0x40, BX LEAQ 64(SI), SI LEAQ 64(DI), DI JMP openSSETail64DecLoop openSSETail192: MOVO ·chacha20Constants<>+0(SB), X2 MOVO 32(BP), X5 MOVO 48(BP), X8 MOVO 128(BP), X11 PADDL ·sseIncMask<>+0(SB), X11 MOVO X11, 80(BP) MOVO X2, X1 MOVO X5, X4 MOVO X8, X7 MOVO X11, X10 PADDL ·sseIncMask<>+0(SB), X10 MOVO X10, 96(BP) MOVO X1, X0 MOVO X4, X3 MOVO X7, X6 MOVO X10, X9 PADDL ·sseIncMask<>+0(SB), X9 MOVO X9, 112(BP) MOVQ BX, CX MOVQ 
$0x000000a0, R9 CMPQ CX, $0xa0 CMOVQGT R9, CX ANDQ $-16, CX XORQ R9, R9 openSSLTail192LoopA: ADDQ (SI)(R9*1), R10 ADCQ 8(SI)(R9*1), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 openSSLTail192LoopB: ADDQ $0x10, R9 PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x0c, X12 PSRLL $0x14, X4 PXOR X12, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x07, X12 PSRLL $0x19, X4 PXOR X12, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x0c, X12 PSRLL $0x14, X5 PXOR X12, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x07, X12 PSRLL $0x19, X5 PXOR X12, X5 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x0c, X12 PSRLL $0x14, X4 PXOR X12, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x07, X12 PSRLL $0x19, X4 PXOR X12, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x0c, X12 PSRLL $0x14, X5 PXOR X12, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x07, X12 PSRLL $0x19, X5 PXOR X12, X5 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 CMPQ R9, CX JB openSSLTail192LoopA CMPQ R9, $0xa0 JNE 
openSSLTail192LoopB CMPQ BX, $0xb0 JB openSSLTail192Store ADDQ 160(SI), R10 ADCQ 168(SI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 CMPQ BX, $0xc0 JB openSSLTail192Store ADDQ 176(SI), R10 ADCQ 184(SI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 openSSLTail192Store: PADDL ·chacha20Constants<>+0(SB), X0 PADDL ·chacha20Constants<>+0(SB), X1 PADDL ·chacha20Constants<>+0(SB), X2 PADDL 32(BP), X3 PADDL 32(BP), X4 PADDL 32(BP), X5 PADDL 48(BP), X6 PADDL 48(BP), X7 PADDL 48(BP), X8 PADDL 112(BP), X9 PADDL 96(BP), X10 PADDL 80(BP), X11 MOVOU (SI), X12 MOVOU 16(SI), X13 MOVOU 32(SI), X14 MOVOU 48(SI), X15 PXOR X12, X2 PXOR X13, X5 PXOR X14, X8 PXOR X15, X11 MOVOU X2, (DI) MOVOU X5, 16(DI) MOVOU X8, 32(DI) MOVOU X11, 48(DI) MOVOU 64(SI), X12 MOVOU 80(SI), X13 MOVOU 96(SI), X14 MOVOU 112(SI), X15 PXOR X12, X1 PXOR X13, X4 PXOR X14, X7 PXOR X15, X10 MOVOU X1, 64(DI) MOVOU X4, 80(DI) MOVOU X7, 96(DI) MOVOU X10, 112(DI) SUBQ $0x80, BX LEAQ 128(SI), SI LEAQ 128(DI), DI JMP openSSETail64DecLoop openSSETail256: MOVO ·chacha20Constants<>+0(SB), X0 MOVO 32(BP), X3 MOVO 48(BP), X6 MOVO 128(BP), X9 PADDL ·sseIncMask<>+0(SB), X9 MOVO X0, X1 MOVO X3, X4 MOVO X6, X7 MOVO X9, X10 PADDL ·sseIncMask<>+0(SB), X10 MOVO X1, X2 MOVO X4, X5 MOVO X7, X8 MOVO X10, X11 PADDL ·sseIncMask<>+0(SB), X11 MOVO X2, X12 MOVO X5, X13 MOVO X8, X14 MOVO X11, X15 PADDL ·sseIncMask<>+0(SB), X15 // Store counters MOVO X9, 80(BP) MOVO X10, 96(BP) MOVO X11, 112(BP) MOVO X15, 128(BP) XORQ R9, R9 openSSETail256Loop: ADDQ (SI)(R9*1), R10 ADCQ 8(SI)(R9*1), R11 ADCQ $0x01, R12 MOVO X14, 64(BP) PADDD X3, X0 PXOR X0, X9 ROL16(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x0c, X14 PSRLL $0x14, X3 PXOR X14, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x07, X14 PSRLL $0x19, X3 PXOR X14, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x0c, X14 PSRLL $0x14, X4 PXOR X14, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x07, X14 PSRLL $0x19, X4 PXOR X14, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x0c, X14 PSRLL $0x14, X5 PXOR X14, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x07, X14 PSRLL $0x19, X5 PXOR X14, X5 MOVO 64(BP), X14 MOVO X7, 64(BP) PADDD X13, X12 PXOR X12, X15 ROL16(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x0c, X7 PSRLL $0x14, X13 PXOR X7, X13 PADDD X13, X12 PXOR X12, X15 ROL8(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 
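// In this four-block loop X14 and then X7 are spilled to 64(BP) in turn so
// they can serve as rotate temporaries for the other blocks, and restored
// afterwards. The raw BYTE $0x66 ... $0x0f $0x3a $0x0f sequences nearby are
// PALIGNR instructions: they rotate the 32-bit lanes of the b, c and d rows by
// one, two and three positions so the next quarter rounds act on the state's
// diagonals, and the mirrored immediates afterwards undo the rotation. A rough
// Go model of that lane rotation (reference only; n is the PALIGNR immediate
// divided by 4):
//
//	func rotateRowLanes(row [4]uint32, n int) [4]uint32 {
//		var out [4]uint32
//		for i := range out {
//			out[i] = row[(i+n)%4] // lane i takes lane i+n, wrapping around
//		}
//		return out
//	}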
PSLLL $0x07, X7 PSRLL $0x19, X13 PXOR X7, X13 MOVO 64(BP), X7 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x0c MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX MOVO X14, 64(BP) PADDD X3, X0 PXOR X0, X9 ROL16(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x0c, X14 PSRLL $0x14, X3 PXOR X14, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x07, X14 PSRLL $0x19, X3 PXOR X14, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x0c, X14 PSRLL $0x14, X4 PXOR X14, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x07, X14 PSRLL $0x19, X4 PXOR X14, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x0c, X14 PSRLL $0x14, X5 PXOR X14, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x07, X14 PSRLL $0x19, X5 PXOR X14, X5 MOVO 64(BP), X14 MOVO X7, 64(BP) PADDD X13, X12 PXOR X12, X15 ROL16(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x0c, X7 PSRLL $0x14, X13 PXOR X7, X13 PADDD X13, X12 PXOR X12, X15 ROL8(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x07, X7 PSRLL $0x19, X13 PXOR X7, X13 MOVO 64(BP), X7 IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x04 ADDQ $0x10, R9 CMPQ R9, $0xa0 JB openSSETail256Loop MOVQ BX, CX ANDQ $-16, CX openSSETail256HashLoop: ADDQ (SI)(R9*1), R10 ADCQ 8(SI)(R9*1), R11 ADCQ $0x01, 
R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 ADDQ $0x10, R9 CMPQ R9, CX JB openSSETail256HashLoop // Add in the state PADDD ·chacha20Constants<>+0(SB), X0 PADDD ·chacha20Constants<>+0(SB), X1 PADDD ·chacha20Constants<>+0(SB), X2 PADDD ·chacha20Constants<>+0(SB), X12 PADDD 32(BP), X3 PADDD 32(BP), X4 PADDD 32(BP), X5 PADDD 32(BP), X13 PADDD 48(BP), X6 PADDD 48(BP), X7 PADDD 48(BP), X8 PADDD 48(BP), X14 PADDD 80(BP), X9 PADDD 96(BP), X10 PADDD 112(BP), X11 PADDD 128(BP), X15 MOVO X15, 64(BP) // Load - xor - store MOVOU (SI), X15 PXOR X15, X0 MOVOU 16(SI), X15 PXOR X15, X3 MOVOU 32(SI), X15 PXOR X15, X6 MOVOU 48(SI), X15 PXOR X15, X9 MOVOU X0, (DI) MOVOU X3, 16(DI) MOVOU X6, 32(DI) MOVOU X9, 48(DI) MOVOU 64(SI), X0 MOVOU 80(SI), X3 MOVOU 96(SI), X6 MOVOU 112(SI), X9 PXOR X0, X1 PXOR X3, X4 PXOR X6, X7 PXOR X9, X10 MOVOU X1, 64(DI) MOVOU X4, 80(DI) MOVOU X7, 96(DI) MOVOU X10, 112(DI) MOVOU 128(SI), X0 MOVOU 144(SI), X3 MOVOU 160(SI), X6 MOVOU 176(SI), X9 PXOR X0, X2 PXOR X3, X5 PXOR X6, X8 PXOR X9, X11 MOVOU X2, 128(DI) MOVOU X5, 144(DI) MOVOU X8, 160(DI) MOVOU X11, 176(DI) LEAQ 192(SI), SI LEAQ 192(DI), DI SUBQ $0xc0, BX MOVO X12, X0 MOVO X13, X3 MOVO X14, X6 MOVO 64(BP), X9 JMP openSSETail64DecLoop chacha20Poly1305Open_AVX2: VZEROUPPER VMOVDQU ·chacha20Constants<>+0(SB), Y0 BYTE $0xc4 BYTE $0x42 BYTE $0x7d BYTE $0x5a BYTE $0x70 BYTE $0x10 BYTE $0xc4 BYTE $0x42 BYTE $0x7d BYTE $0x5a BYTE $0x60 BYTE $0x20 BYTE $0xc4 BYTE $0xc2 BYTE $0x7d BYTE $0x5a BYTE $0x60 BYTE $0x30 VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimization, for very short buffers CMPQ BX, $0xc0 JBE openAVX2192 CMPQ BX, $0x00000140 JBE openAVX2320 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream VMOVDQA Y14, 32(BP) VMOVDQA Y12, 64(BP) VMOVDQA Y4, 192(BP) MOVQ $0x0000000a, R9 openAVX2PreparePolyKey: VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x0c, Y4, Y4, Y4 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x04, Y4, Y4, Y4 DECQ R9 JNE openAVX2PreparePolyKey VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 VPADDD 32(BP), Y14, Y14 VPADDD 64(BP), Y12, Y12 VPADDD 192(BP), Y4, Y4 VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key VPAND ·polyClampMask<>+0(SB), Y3, Y3 VMOVDQA Y3, (BP) // Stream for the first 64 bytes VPERM2I128 $0x13, Y0, Y14, Y0 VPERM2I128 $0x13, Y12, Y4, Y14 // Hash AD + first 64 
bytes MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) XORQ CX, CX openAVX2InitialHash64: ADDQ (SI)(CX*1), R10 ADCQ 8(SI)(CX*1), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 ADDQ $0x10, CX CMPQ CX, $0x40 JNE openAVX2InitialHash64 // Decrypt the first 64 bytes VPXOR (SI), Y0, Y0 VPXOR 32(SI), Y14, Y14 VMOVDQU Y0, (DI) VMOVDQU Y14, 32(DI) LEAQ 64(SI), SI LEAQ 64(DI), DI SUBQ $0x40, BX openAVX2MainLoop: CMPQ BX, $0x00000200 JB openAVX2MainLoopDone // Load state, increment counter blocks, store the incremented counters VMOVDQU ·chacha20Constants<>+0(SB), Y0 VMOVDQA Y0, Y5 VMOVDQA Y0, Y6 VMOVDQA Y0, Y7 VMOVDQA 32(BP), Y14 VMOVDQA Y14, Y9 VMOVDQA Y14, Y10 VMOVDQA Y14, Y11 VMOVDQA 64(BP), Y12 VMOVDQA Y12, Y13 VMOVDQA Y12, Y8 VMOVDQA Y12, Y15 VMOVDQA 192(BP), Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 VMOVDQA Y4, 96(BP) VMOVDQA Y1, 128(BP) VMOVDQA Y2, 160(BP) VMOVDQA Y3, 192(BP) XORQ CX, CX openAVX2InternalLoop: ADDQ (SI)(CX*1), R10 ADCQ 8(SI)(CX*1), R11 ADCQ $0x01, R12 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y3, Y3 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 VMOVDQA Y15, 224(BP) VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x0c, Y11, Y15 VPSRLD $0x14, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y3, Y3 ADDQ 16(SI)(CX*1), R10 ADCQ 24(SI)(CX*1), R11 ADCQ $0x01, R12 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 VMOVDQA Y15, 224(BP) VPSLLD $0x07, Y14, Y15 VPSRLD $0x19, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x07, Y10, Y15 VPSRLD $0x19, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x07, Y11, Y15 VPSRLD $0x19, Y11, Y11 
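// Interleaved with the vector rounds around this point, each 16-byte
// ciphertext block is absorbed into the Poly1305 accumulator (R10, R11, R12)
// and the accumulator is multiplied by the clamped key half r held at
// 0(BP)/8(BP); MULX leaves the flags untouched, so a carry can stay live
// across the multiplies. A rough Go model of the absorb-and-multiply step
// (assumes "math/bits"; the partial reduction of t0..t3 back into three limbs
// follows separately):
//
//	func polyAbsorbMul(h0, h1, h2, m0, m1, r0, r1 uint64) (t0, t1, t2, t3 uint64) {
//		var c uint64
//		h0, c = bits.Add64(h0, m0, 0)
//		h1, c = bits.Add64(h1, m1, c)
//		h2 += c + 1 // 2^128 pad bit for a full 16-byte block
//		t1, t0 = bits.Mul64(h0, r0)
//		t2 = h2 * r0
//		t3 = h2 * r1
//		hi, lo := bits.Mul64(h1, r0)
//		t1, c = bits.Add64(t1, lo, 0)
//		t2, _ = bits.Add64(t2, hi, c) // cannot carry: r is clamped, h2 is small
//		hi, lo = bits.Mul64(h0, r1)
//		t1, c = bits.Add64(t1, lo, 0)
//		hi2, lo2 := bits.Mul64(h1, r1)
//		t2, c = bits.Add64(t2, lo2, c)
//		t3 += hi2 + c
//		t2, c = bits.Add64(t2, hi, 0)
//		t3 += c
//		return
//	}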
VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x04, Y10, Y10, Y10 VPALIGNR $0x04, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x0c, Y2, Y2, Y2 VPALIGNR $0x0c, Y3, Y3, Y3 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y3, Y3 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 ADDQ 32(SI)(CX*1), R10 ADCQ 40(SI)(CX*1), R11 ADCQ $0x01, R12 LEAQ 48(CX), CX VMOVDQA Y15, 224(BP) VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x0c, Y11, Y15 VPSRLD $0x14, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 VMOVDQA Y15, 224(BP) VPSLLD $0x07, Y14, Y15 VPSRLD $0x19, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x07, Y10, Y15 VPSRLD $0x19, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x07, Y11, Y15 VPSRLD $0x19, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x0c, Y10, Y10, Y10 VPALIGNR $0x0c, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x04, Y2, Y2, Y2 VPALIGNR $0x04, Y3, Y3, Y3 CMPQ CX, $0x000001e0 JNE openAVX2InternalLoop VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 VPADDD 32(BP), Y14, Y14 VPADDD 32(BP), Y9, Y9 VPADDD 32(BP), Y10, Y10 VPADDD 32(BP), Y11, Y11 VPADDD 64(BP), Y12, Y12 VPADDD 64(BP), Y13, Y13 VPADDD 64(BP), Y8, Y8 VPADDD 64(BP), Y15, Y15 VPADDD 96(BP), Y4, Y4 VPADDD 128(BP), Y1, Y1 VPADDD 160(BP), Y2, Y2 VPADDD 192(BP), Y3, 
Y3 VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here ADDQ 480(SI), R10 ADCQ 488(SI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPERM2I128 $0x02, Y0, Y14, Y15 VPERM2I128 $0x13, Y0, Y14, Y14 VPERM2I128 $0x02, Y12, Y4, Y0 VPERM2I128 $0x13, Y12, Y4, Y12 VPXOR (SI), Y15, Y15 VPXOR 32(SI), Y0, Y0 VPXOR 64(SI), Y14, Y14 VPXOR 96(SI), Y12, Y12 VMOVDQU Y15, (DI) VMOVDQU Y0, 32(DI) VMOVDQU Y14, 64(DI) VMOVDQU Y12, 96(DI) VPERM2I128 $0x02, Y5, Y9, Y0 VPERM2I128 $0x02, Y13, Y1, Y14 VPERM2I128 $0x13, Y5, Y9, Y12 VPERM2I128 $0x13, Y13, Y1, Y4 VPXOR 128(SI), Y0, Y0 VPXOR 160(SI), Y14, Y14 VPXOR 192(SI), Y12, Y12 VPXOR 224(SI), Y4, Y4 VMOVDQU Y0, 128(DI) VMOVDQU Y14, 160(DI) VMOVDQU Y12, 192(DI) VMOVDQU Y4, 224(DI) // and here ADDQ 496(SI), R10 ADCQ 504(SI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPERM2I128 $0x02, Y6, Y10, Y0 VPERM2I128 $0x02, Y8, Y2, Y14 VPERM2I128 $0x13, Y6, Y10, Y12 VPERM2I128 $0x13, Y8, Y2, Y4 VPXOR 256(SI), Y0, Y0 VPXOR 288(SI), Y14, Y14 VPXOR 320(SI), Y12, Y12 VPXOR 352(SI), Y4, Y4 VMOVDQU Y0, 256(DI) VMOVDQU Y14, 288(DI) VMOVDQU Y12, 320(DI) VMOVDQU Y4, 352(DI) VPERM2I128 $0x02, Y7, Y11, Y0 VPERM2I128 $0x02, 224(BP), Y3, Y14 VPERM2I128 $0x13, Y7, Y11, Y12 VPERM2I128 $0x13, 224(BP), Y3, Y4 VPXOR 384(SI), Y0, Y0 VPXOR 416(SI), Y14, Y14 VPXOR 448(SI), Y12, Y12 VPXOR 480(SI), Y4, Y4 VMOVDQU Y0, 384(DI) VMOVDQU Y14, 416(DI) VMOVDQU Y12, 448(DI) VMOVDQU Y4, 480(DI) LEAQ 512(SI), SI LEAQ 512(DI), DI SUBQ $0x00000200, BX JMP openAVX2MainLoop openAVX2MainLoopDone: // Handle the various tail sizes efficiently TESTQ BX, BX JE openSSEFinalize CMPQ BX, $0x80 JBE openAVX2Tail128 CMPQ BX, $0x00000100 JBE openAVX2Tail256 CMPQ BX, $0x00000180 JBE openAVX2Tail384 JMP openAVX2Tail512 openAVX2192: VMOVDQA Y0, Y5 VMOVDQA Y14, Y9 VMOVDQA Y12, Y13 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VMOVDQA Y0, Y6 VMOVDQA Y14, Y10 VMOVDQA Y12, Y8 VMOVDQA Y4, Y2 VMOVDQA Y1, Y15 MOVQ $0x0000000a, R9 openAVX2192InnerCipherLoop: VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPALIGNR $0x04, Y14, Y14, Y14 
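// Once this short-input loop finishes, the first 32 bytes of the first block's
// keystream become the one-time Poly1305 key: the low half is clamped with
// ·polyClampMask to form r, and the high half is kept as s for the final tag
// addition. A rough Go sketch of that split (assumes "encoding/binary"):
//
//	func splitPolyKey(key [32]byte) (r0, r1, s0, s1 uint64) {
//		// same masks as ·polyClampMask: clear the top 4 bits of every
//		// 32-bit word of r and the low 2 bits of its upper three words
//		r0 = binary.LittleEndian.Uint64(key[0:8]) & 0x0ffffffc0fffffff
//		r1 = binary.LittleEndian.Uint64(key[8:16]) & 0x0ffffffc0ffffffc
//		s0 = binary.LittleEndian.Uint64(key[16:24])
//		s1 = binary.LittleEndian.Uint64(key[24:32])
//		return
//	}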
VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 DECQ R9 JNE openAVX2192InnerCipherLoop VPADDD Y6, Y0, Y0 VPADDD Y6, Y5, Y5 VPADDD Y10, Y14, Y14 VPADDD Y10, Y9, Y9 VPADDD Y8, Y12, Y12 VPADDD Y8, Y13, Y13 VPADDD Y2, Y4, Y4 VPADDD Y15, Y1, Y1 VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key VPAND ·polyClampMask<>+0(SB), Y3, Y3 VMOVDQA Y3, (BP) // Stream for up to 192 bytes VPERM2I128 $0x13, Y0, Y14, Y0 VPERM2I128 $0x13, Y12, Y4, Y14 VPERM2I128 $0x02, Y5, Y9, Y12 VPERM2I128 $0x02, Y13, Y1, Y4 VPERM2I128 $0x13, Y5, Y9, Y5 VPERM2I128 $0x13, Y13, Y1, Y9 openAVX2ShortOpen: // Hash MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openAVX2ShortOpenLoop: CMPQ BX, $0x20 JB openAVX2ShortTail32 SUBQ $0x20, BX // Load for hashing ADDQ (SI), R10 ADCQ 8(SI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 ADDQ 16(SI), R10 ADCQ 24(SI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 // Load for decryption VPXOR (SI), Y0, Y0 VMOVDQU Y0, (DI) LEAQ 32(SI), SI LEAQ 32(DI), DI // Shift stream left VMOVDQA Y14, Y0 VMOVDQA Y12, Y14 VMOVDQA Y4, Y12 VMOVDQA Y5, Y4 VMOVDQA Y9, Y5 VMOVDQA Y13, Y9 VMOVDQA Y1, Y13 VMOVDQA Y6, Y1 VMOVDQA Y10, Y6 JMP openAVX2ShortOpenLoop openAVX2ShortTail32: CMPQ BX, $0x10 VMOVDQA X0, X1 JB openAVX2ShortDone SUBQ $0x10, BX // Load for hashing ADDQ (SI), R10 ADCQ 8(SI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, 
R10 ADCQ R8, R11 ADCQ $0x00, R12 // Load for decryption VPXOR (SI), X0, X12 VMOVDQU X12, (DI) LEAQ 16(SI), SI LEAQ 16(DI), DI VPERM2I128 $0x11, Y0, Y0, Y0 VMOVDQA X0, X1 openAVX2ShortDone: VZEROUPPER JMP openSSETail16 openAVX2320: VMOVDQA Y0, Y5 VMOVDQA Y14, Y9 VMOVDQA Y12, Y13 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VMOVDQA Y0, Y6 VMOVDQA Y14, Y10 VMOVDQA Y12, Y8 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 VMOVDQA Y14, Y7 VMOVDQA Y12, Y11 VMOVDQA Y4, Y15 MOVQ $0x0000000a, R9 openAVX2320InnerCipherLoop: VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x0c, Y10, Y3 VPSRLD $0x14, Y10, Y10 VPXOR Y3, Y10, Y10 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 VPXOR Y3, Y10, Y10 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x04, Y10, Y10, Y10 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x0c, Y2, Y2, Y2 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x0c, Y10, Y3 VPSRLD $0x14, Y10, Y10 VPXOR Y3, Y10, Y10 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 VPXOR Y3, Y10, Y10 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x0c, Y10, Y10, Y10 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x04, Y2, Y2, Y2 DECQ R9 JNE openAVX2320InnerCipherLoop VMOVDQA ·chacha20Constants<>+0(SB), Y3 VPADDD Y3, Y0, Y0 VPADDD Y3, Y5, Y5 VPADDD Y3, Y6, Y6 VPADDD Y7, Y14, Y14 VPADDD Y7, Y9, Y9 VPADDD Y7, Y10, Y10 VPADDD Y11, Y12, Y12 VPADDD Y11, Y13, Y13 VPADDD Y11, Y8, Y8 VMOVDQA ·avx2IncMask<>+0(SB), Y3 VPADDD Y15, Y4, Y4 VPADDD Y3, Y15, Y15 VPADDD Y15, Y1, Y1 VPADDD Y3, Y15, Y15 VPADDD Y15, Y2, Y2 // Clamp and store poly key VPERM2I128 $0x02, Y0, Y14, Y3 VPAND ·polyClampMask<>+0(SB), Y3, Y3 VMOVDQA Y3, (BP) // Stream for up to 320 bytes VPERM2I128 $0x13, Y0, 
Y14, Y0 VPERM2I128 $0x13, Y12, Y4, Y14 VPERM2I128 $0x02, Y5, Y9, Y12 VPERM2I128 $0x02, Y13, Y1, Y4 VPERM2I128 $0x13, Y5, Y9, Y5 VPERM2I128 $0x13, Y13, Y1, Y9 VPERM2I128 $0x02, Y6, Y10, Y13 VPERM2I128 $0x02, Y8, Y2, Y1 VPERM2I128 $0x13, Y6, Y10, Y6 VPERM2I128 $0x13, Y8, Y2, Y10 JMP openAVX2ShortOpen openAVX2Tail128: // Need to decrypt up to 128 bytes - prepare two blocks VMOVDQA ·chacha20Constants<>+0(SB), Y5 VMOVDQA 32(BP), Y9 VMOVDQA 64(BP), Y13 VMOVDQA 192(BP), Y1 VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 VMOVDQA Y1, Y4 XORQ R9, R9 MOVQ BX, CX ANDQ $-16, CX TESTQ CX, CX JE openAVX2Tail128LoopB openAVX2Tail128LoopA: ADDQ (SI)(R9*1), R10 ADCQ 8(SI)(R9*1), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 openAVX2Tail128LoopB: ADDQ $0x10, R9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x0c, Y1, Y1, Y1 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x04, Y1, Y1, Y1 CMPQ R9, CX JB openAVX2Tail128LoopA CMPQ R9, $0xa0 JNE openAVX2Tail128LoopB VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 VPADDD 32(BP), Y9, Y9 VPADDD 64(BP), Y13, Y13 VPADDD Y4, Y1, Y1 VPERM2I128 $0x02, Y5, Y9, Y0 VPERM2I128 $0x02, Y13, Y1, Y14 VPERM2I128 $0x13, Y5, Y9, Y12 VPERM2I128 $0x13, Y13, Y1, Y4 openAVX2TailLoop: CMPQ BX, $0x20 JB openAVX2Tail SUBQ $0x20, BX // Load for decryption VPXOR (SI), Y0, Y0 VMOVDQU Y0, (DI) LEAQ 32(SI), SI LEAQ 32(DI), DI VMOVDQA Y14, Y0 VMOVDQA Y12, Y14 VMOVDQA Y4, Y12 JMP openAVX2TailLoop openAVX2Tail: CMPQ BX, $0x10 VMOVDQA X0, X1 JB openAVX2TailDone SUBQ $0x10, BX // Load for decryption VPXOR (SI), X0, X12 VMOVDQU X12, (DI) LEAQ 16(SI), SI LEAQ 16(DI), DI VPERM2I128 $0x11, Y0, Y0, Y0 VMOVDQA X0, X1 openAVX2TailDone: VZEROUPPER JMP openSSETail16 openAVX2Tail256: VMOVDQA ·chacha20Constants<>+0(SB), Y0 VMOVDQA Y0, Y5 VMOVDQA 32(BP), Y14 VMOVDQA Y14, Y9 VMOVDQA 64(BP), Y12 VMOVDQA Y12, Y13 VMOVDQA 192(BP), Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VMOVDQA Y4, Y7 VMOVDQA Y1, Y11 // Compute the number of iterations that will hash data MOVQ BX, 224(BP) MOVQ BX, CX SUBQ $0x80, CX SHRQ $0x04, CX MOVQ $0x0000000a, R9 CMPQ CX, $0x0a CMOVQGT R9, CX MOVQ SI, BX XORQ R9, R9 openAVX2Tail256LoopA: ADDQ (BX), R10 ADCQ 8(BX), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 
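// The MOVQ/ANDQ/SHRQ block that follows is the lazy Poly1305 reduction: the
// four-limb product in R13, R14, R15, R8 is folded back to three limbs using
// 2^130 ≡ 5 (mod 2^130 - 5), i.e. everything at or above bit 130 is added back
// in once and once multiplied by four. A rough Go sketch (assumes "math/bits"):
//
//	func polyReduce(t0, t1, t2, t3 uint64) (h0, h1, h2 uint64) {
//		h0, h1, h2 = t0, t1, t2&3 // low 130 bits
//		var c uint64
//		h0, c = bits.Add64(h0, t2&^3, 0) // + 4*(t >> 130)
//		h1, c = bits.Add64(h1, t3, c)
//		h2 += c
//		h0, c = bits.Add64(h0, t2>>2|t3<<62, 0) // + (t >> 130)
//		h1, c = bits.Add64(h1, t3>>2, c)
//		h2 += c
//		return
//	}
//
// h stays only slightly above 2^130; the conversion to a canonical residue is
// done once, in openSSEFinalize, before s is added.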
MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(BX), BX openAVX2Tail256LoopB: VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 INCQ R9 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 CMPQ R9, CX JB openAVX2Tail256LoopA CMPQ R9, $0x0a JNE openAVX2Tail256LoopB MOVQ BX, R9 SUBQ SI, BX MOVQ BX, CX MOVQ 224(BP), BX openAVX2Tail256Hash: ADDQ $0x10, CX CMPQ CX, BX JGT openAVX2Tail256HashEnd ADDQ (R9), R10 ADCQ 8(R9), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(R9), R9 JMP openAVX2Tail256Hash openAVX2Tail256HashEnd: VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 VPADDD 32(BP), Y14, Y14 VPADDD 32(BP), Y9, Y9 VPADDD 64(BP), Y12, Y12 VPADDD 64(BP), Y13, Y13 VPADDD Y7, Y4, Y4 VPADDD Y11, Y1, Y1 VPERM2I128 $0x02, Y0, Y14, Y6 VPERM2I128 $0x02, Y12, Y4, Y10 VPERM2I128 $0x13, Y0, Y14, Y8 VPERM2I128 $0x13, Y12, Y4, Y2 VPERM2I128 $0x02, Y5, Y9, Y0 VPERM2I128 $0x02, Y13, Y1, Y14 VPERM2I128 $0x13, Y5, Y9, Y12 VPERM2I128 $0x13, Y13, Y1, Y4 VPXOR (SI), Y6, Y6 VPXOR 32(SI), Y10, Y10 VPXOR 64(SI), Y8, Y8 VPXOR 96(SI), Y2, Y2 VMOVDQU Y6, (DI) VMOVDQU Y10, 32(DI) VMOVDQU Y8, 64(DI) VMOVDQU Y2, 96(DI) LEAQ 128(SI), SI LEAQ 128(DI), DI SUBQ $0x80, BX JMP openAVX2TailLoop openAVX2Tail384: // Need to decrypt up to 384 bytes - prepare six blocks VMOVDQA ·chacha20Constants<>+0(SB), Y0 VMOVDQA Y0, Y5 VMOVDQA Y0, Y6 VMOVDQA 
32(BP), Y14 VMOVDQA Y14, Y9 VMOVDQA Y14, Y10 VMOVDQA 64(BP), Y12 VMOVDQA Y12, Y13 VMOVDQA Y12, Y8 VMOVDQA 192(BP), Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 VMOVDQA Y4, 96(BP) VMOVDQA Y1, 128(BP) VMOVDQA Y2, 160(BP) // Compute the number of iterations that will hash two blocks of data MOVQ BX, 224(BP) MOVQ BX, CX SUBQ $0x00000100, CX SHRQ $0x04, CX ADDQ $0x06, CX MOVQ $0x0000000a, R9 CMPQ CX, $0x0a CMOVQGT R9, CX MOVQ SI, BX XORQ R9, R9 openAVX2Tail384LoopB: ADDQ (BX), R10 ADCQ 8(BX), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(BX), BX openAVX2Tail384LoopA: VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x0c, Y10, Y3 VPSRLD $0x14, Y10, Y10 VPXOR Y3, Y10, Y10 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 VPXOR Y3, Y10, Y10 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x04, Y10, Y10, Y10 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x0c, Y2, Y2, Y2 ADDQ (BX), R10 ADCQ 8(BX), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(BX), BX INCQ R9 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y10, Y6, Y6 
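// As in the other AVX2 loops, the rotations by 16 and 8 are single VPSHUFB byte shuffles
// through the rol16<>/rol8<> tables, while the rotations by 12 and 7 are built from a
// VPSLLD/VPSRLD pair combined with VPXOR. The iteration count CX above is chosen so that
// Poly1305 hashing of the remaining ciphertext keeps pace with the ten ChaCha20 double
// rounds; whatever is left to hash afterwards is consumed by the openAVX2Tail384Hash loop.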
VPXOR Y6, Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x0c, Y10, Y3 VPSRLD $0x14, Y10, Y10 VPXOR Y3, Y10, Y10 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 VPXOR Y3, Y10, Y10 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x0c, Y10, Y10, Y10 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x04, Y2, Y2, Y2 CMPQ R9, CX JB openAVX2Tail384LoopB CMPQ R9, $0x0a JNE openAVX2Tail384LoopA MOVQ BX, R9 SUBQ SI, BX MOVQ BX, CX MOVQ 224(BP), BX openAVX2Tail384Hash: ADDQ $0x10, CX CMPQ CX, BX JGT openAVX2Tail384HashEnd ADDQ (R9), R10 ADCQ 8(R9), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(R9), R9 JMP openAVX2Tail384Hash openAVX2Tail384HashEnd: VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 VPADDD 32(BP), Y14, Y14 VPADDD 32(BP), Y9, Y9 VPADDD 32(BP), Y10, Y10 VPADDD 64(BP), Y12, Y12 VPADDD 64(BP), Y13, Y13 VPADDD 64(BP), Y8, Y8 VPADDD 96(BP), Y4, Y4 VPADDD 128(BP), Y1, Y1 VPADDD 160(BP), Y2, Y2 VPERM2I128 $0x02, Y0, Y14, Y3 VPERM2I128 $0x02, Y12, Y4, Y7 VPERM2I128 $0x13, Y0, Y14, Y11 VPERM2I128 $0x13, Y12, Y4, Y15 VPXOR (SI), Y3, Y3 VPXOR 32(SI), Y7, Y7 VPXOR 64(SI), Y11, Y11 VPXOR 96(SI), Y15, Y15 VMOVDQU Y3, (DI) VMOVDQU Y7, 32(DI) VMOVDQU Y11, 64(DI) VMOVDQU Y15, 96(DI) VPERM2I128 $0x02, Y5, Y9, Y3 VPERM2I128 $0x02, Y13, Y1, Y7 VPERM2I128 $0x13, Y5, Y9, Y11 VPERM2I128 $0x13, Y13, Y1, Y15 VPXOR 128(SI), Y3, Y3 VPXOR 160(SI), Y7, Y7 VPXOR 192(SI), Y11, Y11 VPXOR 224(SI), Y15, Y15 VMOVDQU Y3, 128(DI) VMOVDQU Y7, 160(DI) VMOVDQU Y11, 192(DI) VMOVDQU Y15, 224(DI) VPERM2I128 $0x02, Y6, Y10, Y0 VPERM2I128 $0x02, Y8, Y2, Y14 VPERM2I128 $0x13, Y6, Y10, Y12 VPERM2I128 $0x13, Y8, Y2, Y4 LEAQ 256(SI), SI LEAQ 256(DI), DI SUBQ $0x00000100, BX JMP openAVX2TailLoop openAVX2Tail512: VMOVDQU ·chacha20Constants<>+0(SB), Y0 VMOVDQA Y0, Y5 VMOVDQA Y0, Y6 VMOVDQA Y0, Y7 VMOVDQA 32(BP), Y14 VMOVDQA Y14, Y9 VMOVDQA Y14, Y10 VMOVDQA Y14, Y11 VMOVDQA 64(BP), Y12 VMOVDQA Y12, Y13 VMOVDQA Y12, Y8 VMOVDQA Y12, Y15 VMOVDQA 192(BP), Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 VMOVDQA Y4, 96(BP) VMOVDQA Y1, 128(BP) VMOVDQA Y2, 160(BP) VMOVDQA Y3, 192(BP) XORQ CX, CX MOVQ SI, R9 openAVX2Tail512LoopB: ADDQ (R9), R10 ADCQ 8(R9), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(R9), R9 openAVX2Tail512LoopA: VPADDD Y14, Y0, Y0 
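// openAVX2Tail512 keeps four two-block ChaCha20 states in registers: Y0/Y5/Y6/Y7 hold the
// constant row, Y14/Y9/Y10/Y11 the first key row (32(BP)), Y12/Y13/Y8/Y15 the second key
// row (64(BP)), and Y4/Y1/Y2/Y3 the counter/nonce rows saved at 96..192(BP). Y15 is parked
// at 224(BP) whenever a scratch register is needed for the shift-based rotations, and two
// to three Poly1305 blocks of ciphertext are hashed per double round, with the remainder
// handled by openAVX2Tail512HashLoop.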
VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 VMOVDQA Y15, 224(BP) VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x0c, Y11, Y15 VPSRLD $0x14, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 ADDQ (R9), R10 ADCQ 8(R9), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 VMOVDQA Y15, 224(BP) VPSLLD $0x07, Y14, Y15 VPSRLD $0x19, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x07, Y10, Y15 VPSRLD $0x19, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x07, Y11, Y15 VPSRLD $0x19, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x04, Y10, Y10, Y10 VPALIGNR $0x04, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x0c, Y2, Y2, Y2 VPALIGNR $0x0c, Y3, Y3, Y3 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 ADDQ 16(R9), R10 ADCQ 24(R9), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 32(R9), R9 VMOVDQA Y15, 224(BP) VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x0c, Y11, Y15 VPSRLD $0x14, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 VPADDD Y14, Y0, Y0 VPADDD Y9, 
Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 VMOVDQA Y15, 224(BP) VPSLLD $0x07, Y14, Y15 VPSRLD $0x19, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x07, Y10, Y15 VPSRLD $0x19, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x07, Y11, Y15 VPSRLD $0x19, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x0c, Y10, Y10, Y10 VPALIGNR $0x0c, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x04, Y2, Y2, Y2 VPALIGNR $0x04, Y3, Y3, Y3 INCQ CX CMPQ CX, $0x04 JLT openAVX2Tail512LoopB CMPQ CX, $0x0a JNE openAVX2Tail512LoopA MOVQ BX, CX SUBQ $0x00000180, CX ANDQ $-16, CX openAVX2Tail512HashLoop: TESTQ CX, CX JE openAVX2Tail512HashEnd ADDQ (R9), R10 ADCQ 8(R9), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(R9), R9 SUBQ $0x10, CX JMP openAVX2Tail512HashLoop openAVX2Tail512HashEnd: VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 VPADDD 32(BP), Y14, Y14 VPADDD 32(BP), Y9, Y9 VPADDD 32(BP), Y10, Y10 VPADDD 32(BP), Y11, Y11 VPADDD 64(BP), Y12, Y12 VPADDD 64(BP), Y13, Y13 VPADDD 64(BP), Y8, Y8 VPADDD 64(BP), Y15, Y15 VPADDD 96(BP), Y4, Y4 VPADDD 128(BP), Y1, Y1 VPADDD 160(BP), Y2, Y2 VPADDD 192(BP), Y3, Y3 VMOVDQA Y15, 224(BP) VPERM2I128 $0x02, Y0, Y14, Y15 VPERM2I128 $0x13, Y0, Y14, Y14 VPERM2I128 $0x02, Y12, Y4, Y0 VPERM2I128 $0x13, Y12, Y4, Y12 VPXOR (SI), Y15, Y15 VPXOR 32(SI), Y0, Y0 VPXOR 64(SI), Y14, Y14 VPXOR 96(SI), Y12, Y12 VMOVDQU Y15, (DI) VMOVDQU Y0, 32(DI) VMOVDQU Y14, 64(DI) VMOVDQU Y12, 96(DI) VPERM2I128 $0x02, Y5, Y9, Y0 VPERM2I128 $0x02, Y13, Y1, Y14 VPERM2I128 $0x13, Y5, Y9, Y12 VPERM2I128 $0x13, Y13, Y1, Y4 VPXOR 128(SI), Y0, Y0 VPXOR 160(SI), Y14, Y14 VPXOR 192(SI), Y12, Y12 VPXOR 224(SI), Y4, Y4 VMOVDQU Y0, 128(DI) VMOVDQU Y14, 160(DI) VMOVDQU Y12, 192(DI) VMOVDQU Y4, 224(DI) VPERM2I128 $0x02, Y6, Y10, Y0 VPERM2I128 $0x02, Y8, Y2, Y14 VPERM2I128 $0x13, Y6, Y10, Y12 VPERM2I128 $0x13, Y8, Y2, Y4 VPXOR 256(SI), Y0, Y0 VPXOR 288(SI), Y14, Y14 VPXOR 320(SI), Y12, Y12 VPXOR 352(SI), Y4, Y4 VMOVDQU Y0, 256(DI) VMOVDQU Y14, 288(DI) VMOVDQU Y12, 320(DI) VMOVDQU Y4, 352(DI) VPERM2I128 $0x02, Y7, Y11, Y0 VPERM2I128 $0x02, 224(BP), Y3, Y14 VPERM2I128 $0x13, Y7, Y11, Y12 VPERM2I128 $0x13, 224(BP), Y3, Y4 LEAQ 384(SI), SI LEAQ 384(DI), DI SUBQ $0x00000180, BX JMP openAVX2TailLoop DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 DATA ·chacha20Constants<>+12(SB)/4, 
$0x6b206574 DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001 DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000 GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16 DATA ·andMask<>+0(SB)/8, $0x00000000000000ff DATA ·andMask<>+8(SB)/8, $0x0000000000000000 DATA ·andMask<>+16(SB)/8, $0x000000000000ffff DATA ·andMask<>+24(SB)/8, $0x0000000000000000 DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff DATA ·andMask<>+40(SB)/8, $0x0000000000000000 DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff DATA ·andMask<>+56(SB)/8, $0x0000000000000000 DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff DATA ·andMask<>+72(SB)/8, $0x0000000000000000 DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff DATA ·andMask<>+88(SB)/8, $0x0000000000000000 DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff DATA ·andMask<>+104(SB)/8, $0x0000000000000000 DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff DATA ·andMask<>+120(SB)/8, $0x0000000000000000 DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff DATA ·andMask<>+136(SB)/8, $0x00000000000000ff DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff DATA ·andMask<>+152(SB)/8, $0x000000000000ffff DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 DATA ·rol16<>+0(SB)/8, $0x0504070601000302 DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a DATA ·rol16<>+16(SB)/8, $0x0504070601000302 DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 DATA ·rol8<>+0(SB)/8, $0x0605040702010003 DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b DATA ·rol8<>+16(SB)/8, $0x0605040702010003 DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 // func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) // Requires: AVX, AVX2, BMI2, CMOV, SSE2 TEXT ·chacha20Poly1305Seal(SB), $288-96 MOVQ SP, BP ADDQ $0x20, BP ANDQ $-32, BP MOVQ dst_base+0(FP), DI MOVQ key_base+24(FP), R8 MOVQ src_base+48(FP), SI MOVQ src_len+56(FP), BX MOVQ ad_base+72(FP), CX CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Seal_AVX2 // Special optimization, for very short buffers CMPQ BX, $0x80 JBE sealSSE128 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration MOVOU 
·chacha20Constants<>+0(SB), X0 MOVOU 16(R8), X3 MOVOU 32(R8), X6 MOVOU 48(R8), X9 // Store state on stack for future use MOVO X3, 32(BP) MOVO X6, 48(BP) // Load state, increment counter blocks MOVO X0, X1 MOVO X3, X4 MOVO X6, X7 MOVO X9, X10 PADDL ·sseIncMask<>+0(SB), X10 MOVO X1, X2 MOVO X4, X5 MOVO X7, X8 MOVO X10, X11 PADDL ·sseIncMask<>+0(SB), X11 MOVO X2, X12 MOVO X5, X13 MOVO X8, X14 MOVO X11, X15 PADDL ·sseIncMask<>+0(SB), X15 // Store counters MOVO X9, 80(BP) MOVO X10, 96(BP) MOVO X11, 112(BP) MOVO X15, 128(BP) MOVQ $0x0000000a, R9 sealSSEIntroLoop: MOVO X14, 64(BP) PADDD X3, X0 PXOR X0, X9 ROL16(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x0c, X14 PSRLL $0x14, X3 PXOR X14, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x07, X14 PSRLL $0x19, X3 PXOR X14, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x0c, X14 PSRLL $0x14, X4 PXOR X14, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x07, X14 PSRLL $0x19, X4 PXOR X14, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x0c, X14 PSRLL $0x14, X5 PXOR X14, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x07, X14 PSRLL $0x19, X5 PXOR X14, X5 MOVO 64(BP), X14 MOVO X7, 64(BP) PADDD X13, X12 PXOR X12, X15 ROL16(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x0c, X7 PSRLL $0x14, X13 PXOR X7, X13 PADDD X13, X12 PXOR X12, X15 ROL8(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x07, X7 PSRLL $0x19, X13 PXOR X7, X13 MOVO 64(BP), X7 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x0c MOVO X14, 64(BP) PADDD X3, X0 PXOR X0, X9 ROL16(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x0c, X14 PSRLL $0x14, X3 PXOR X14, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x07, X14 PSRLL $0x19, X3 PXOR X14, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x0c, X14 PSRLL $0x14, X4 PXOR X14, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x07, X14 PSRLL $0x19, X4 PXOR X14, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x0c, X14 PSRLL $0x14, X5 PXOR X14, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x07, X14 PSRLL $0x19, X5 PXOR X14, X5 MOVO 64(BP), X14 MOVO X7, 64(BP) PADDD X13, X12 PXOR X12, X15 ROL16(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x0c, X7 PSRLL $0x14, X13 PXOR X7, X13 PADDD X13, X12 PXOR X12, X15 ROL8(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x07, X7 
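// In the SSE seal path the four parallel blocks live in X0/X3/X6/X9, X1/X4/X7/X10,
// X2/X5/X8/X11 and X12/X13/X14/X15; X14 and then X7 are spilled to 64(BP) in turn so the
// ROL macros always have a free temporary register while the fourth block is processed.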
PSRLL $0x19, X13 PXOR X7, X13 MOVO 64(BP), X7 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x04 DECQ R9 JNE sealSSEIntroLoop // Add in the state PADDD ·chacha20Constants<>+0(SB), X0 PADDD ·chacha20Constants<>+0(SB), X1 PADDD ·chacha20Constants<>+0(SB), X2 PADDD ·chacha20Constants<>+0(SB), X12 PADDD 32(BP), X3 PADDD 32(BP), X4 PADDD 32(BP), X5 PADDD 32(BP), X13 PADDD 48(BP), X7 PADDD 48(BP), X8 PADDD 48(BP), X14 PADDD 96(BP), X10 PADDD 112(BP), X11 PADDD 128(BP), X15 // Clamp and store the key PAND ·polyClampMask<>+0(SB), X0 MOVO X0, (BP) MOVO X3, 16(BP) // Hash AAD MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) MOVOU (SI), X0 MOVOU 16(SI), X3 MOVOU 32(SI), X6 MOVOU 48(SI), X9 PXOR X0, X1 PXOR X3, X4 PXOR X6, X7 PXOR X9, X10 MOVOU X1, (DI) MOVOU X4, 16(DI) MOVOU X7, 32(DI) MOVOU X10, 48(DI) MOVOU 64(SI), X0 MOVOU 80(SI), X3 MOVOU 96(SI), X6 MOVOU 112(SI), X9 PXOR X0, X2 PXOR X3, X5 PXOR X6, X8 PXOR X9, X11 MOVOU X2, 64(DI) MOVOU X5, 80(DI) MOVOU X8, 96(DI) MOVOU X11, 112(DI) MOVQ $0x00000080, CX SUBQ $0x80, BX LEAQ 128(SI), SI MOVO X12, X1 MOVO X13, X4 MOVO X14, X7 MOVO X15, X10 CMPQ BX, $0x40 JBE sealSSE128SealHash MOVOU (SI), X0 MOVOU 16(SI), X3 MOVOU 32(SI), X6 MOVOU 48(SI), X9 PXOR X0, X12 PXOR X3, X13 PXOR X6, X14 PXOR X9, X15 MOVOU X12, 128(DI) MOVOU X13, 144(DI) MOVOU X14, 160(DI) MOVOU X15, 176(DI) ADDQ $0x40, CX SUBQ $0x40, BX LEAQ 64(SI), SI MOVQ $0x00000002, CX MOVQ $0x00000008, R9 CMPQ BX, $0x40 JBE sealSSETail64 CMPQ BX, $0x80 JBE sealSSETail128 CMPQ BX, $0xc0 JBE sealSSETail192 sealSSEMainLoop: // Load state, increment counter blocks MOVO ·chacha20Constants<>+0(SB), X0 MOVO 32(BP), X3 MOVO 48(BP), X6 MOVO 128(BP), X9 PADDL ·sseIncMask<>+0(SB), X9 MOVO X0, X1 MOVO X3, X4 MOVO X6, X7 MOVO X9, X10 PADDL ·sseIncMask<>+0(SB), X10 MOVO X1, X2 MOVO X4, X5 MOVO X7, X8 MOVO X10, X11 PADDL ·sseIncMask<>+0(SB), X11 MOVO X2, X12 MOVO X5, X13 MOVO X8, X14 MOVO X11, X15 PADDL ·sseIncMask<>+0(SB), X15 // Store counters MOVO X9, 80(BP) MOVO X10, 96(BP) MOVO X11, 112(BP) MOVO X15, 128(BP) sealSSEInnerLoop: MOVO X14, 64(BP) PADDD X3, X0 PXOR X0, X9 ROL16(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x0c, X14 PSRLL $0x14, X3 PXOR X14, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x07, X14 PSRLL $0x19, X3 PXOR X14, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x0c, X14 PSRLL $0x14, X4 PXOR X14, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x07, X14 PSRLL $0x19, X4 PXOR X14, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x0c, X14 PSRLL $0x14, X5 PXOR X14, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X14) PADDD 
X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x07, X14 PSRLL $0x19, X5 PXOR X14, X5 MOVO 64(BP), X14 MOVO X7, 64(BP) PADDD X13, X12 PXOR X12, X15 ROL16(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x0c, X7 PSRLL $0x14, X13 PXOR X7, X13 PADDD X13, X12 PXOR X12, X15 ROL8(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x07, X7 PSRLL $0x19, X13 PXOR X7, X13 MOVO 64(BP), X7 ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x0c MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX LEAQ 16(DI), DI MOVO X14, 64(BP) PADDD X3, X0 PXOR X0, X9 ROL16(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x0c, X14 PSRLL $0x14, X3 PXOR X14, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X14) PADDD X9, X6 PXOR X6, X3 MOVO X3, X14 PSLLL $0x07, X14 PSRLL $0x19, X3 PXOR X14, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x0c, X14 PSRLL $0x14, X4 PXOR X14, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X14) PADDD X10, X7 PXOR X7, X4 MOVO X4, X14 PSLLL $0x07, X14 PSRLL $0x19, X4 PXOR X14, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x0c, X14 PSRLL $0x14, X5 PXOR X14, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X14) PADDD X11, X8 PXOR X8, X5 MOVO X5, X14 PSLLL $0x07, X14 PSRLL $0x19, X5 PXOR X14, X5 MOVO 64(BP), X14 MOVO X7, 64(BP) IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 PADDD X13, X12 PXOR X12, X15 ROL16(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x0c, X7 PSRLL $0x14, X13 PXOR X7, X13 PADDD X13, X12 PXOR X12, X15 ROL8(X15, X7) PADDD X15, X14 PXOR X14, X13 MOVO X13, X7 PSLLL $0x07, X7 PSRLL $0x19, X13 PXOR X7, X13 MOVO 64(BP), X7 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE 
$0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x04 DECQ R9 JGE sealSSEInnerLoop ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(DI), DI DECQ CX JG sealSSEInnerLoop // Add in the state PADDD ·chacha20Constants<>+0(SB), X0 PADDD ·chacha20Constants<>+0(SB), X1 PADDD ·chacha20Constants<>+0(SB), X2 PADDD ·chacha20Constants<>+0(SB), X12 PADDD 32(BP), X3 PADDD 32(BP), X4 PADDD 32(BP), X5 PADDD 32(BP), X13 PADDD 48(BP), X6 PADDD 48(BP), X7 PADDD 48(BP), X8 PADDD 48(BP), X14 PADDD 80(BP), X9 PADDD 96(BP), X10 PADDD 112(BP), X11 PADDD 128(BP), X15 MOVO X15, 64(BP) // Load - xor - store MOVOU (SI), X15 PXOR X15, X0 MOVOU 16(SI), X15 PXOR X15, X3 MOVOU 32(SI), X15 PXOR X15, X6 MOVOU 48(SI), X15 PXOR X15, X9 MOVOU X0, (DI) MOVOU X3, 16(DI) MOVOU X6, 32(DI) MOVOU X9, 48(DI) MOVO 64(BP), X15 MOVOU 64(SI), X0 MOVOU 80(SI), X3 MOVOU 96(SI), X6 MOVOU 112(SI), X9 PXOR X0, X1 PXOR X3, X4 PXOR X6, X7 PXOR X9, X10 MOVOU X1, 64(DI) MOVOU X4, 80(DI) MOVOU X7, 96(DI) MOVOU X10, 112(DI) MOVOU 128(SI), X0 MOVOU 144(SI), X3 MOVOU 160(SI), X6 MOVOU 176(SI), X9 PXOR X0, X2 PXOR X3, X5 PXOR X6, X8 PXOR X9, X11 MOVOU X2, 128(DI) MOVOU X5, 144(DI) MOVOU X8, 160(DI) MOVOU X11, 176(DI) ADDQ $0xc0, SI MOVQ $0x000000c0, CX SUBQ $0xc0, BX MOVO X12, X1 MOVO X13, X4 MOVO X14, X7 MOVO X15, X10 CMPQ BX, $0x40 JBE sealSSE128SealHash MOVOU (SI), X0 MOVOU 16(SI), X3 MOVOU 32(SI), X6 MOVOU 48(SI), X9 PXOR X0, X12 PXOR X3, X13 PXOR X6, X14 PXOR X9, X15 MOVOU X12, 192(DI) MOVOU X13, 208(DI) MOVOU X14, 224(DI) MOVOU X15, 240(DI) LEAQ 64(SI), SI SUBQ $0x40, BX MOVQ $0x00000006, CX MOVQ $0x00000004, R9 CMPQ BX, $0xc0 JG sealSSEMainLoop MOVQ BX, CX TESTQ BX, BX JE sealSSE128SealHash MOVQ $0x00000006, CX CMPQ BX, $0x40 JBE sealSSETail64 CMPQ BX, $0x80 JBE sealSSETail128 JMP sealSSETail192 sealSSETail64: MOVO ·chacha20Constants<>+0(SB), X1 MOVO 32(BP), X4 MOVO 48(BP), X7 MOVO 128(BP), X10 PADDL ·sseIncMask<>+0(SB), X10 MOVO X10, 80(BP) sealSSETail64LoopA: ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(DI), DI sealSSETail64LoopB: PADDD X4, X1 PXOR X1, X10 ROL16(X10, X13) PADDD X10, X7 PXOR X7, X4 MOVO X4, X13 PSLLL $0x0c, X13 PSRLL $0x14, X4 PXOR X13, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X13) PADDD X10, X7 PXOR X7, X4 MOVO X4, X13 PSLLL $0x07, X13 PSRLL $0x19, X4 PXOR X13, X4 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 
BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x0c PADDD X4, X1 PXOR X1, X10 ROL16(X10, X13) PADDD X10, X7 PXOR X7, X4 MOVO X4, X13 PSLLL $0x0c, X13 PSRLL $0x14, X4 PXOR X13, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X13) PADDD X10, X7 PXOR X7, X4 MOVO X4, X13 PSLLL $0x07, X13 PSRLL $0x19, X4 PXOR X13, X4 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x04 ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(DI), DI DECQ CX JG sealSSETail64LoopA DECQ R9 JGE sealSSETail64LoopB PADDL ·chacha20Constants<>+0(SB), X1 PADDL 32(BP), X4 PADDL 48(BP), X7 PADDL 80(BP), X10 JMP sealSSE128Seal sealSSETail128: MOVO ·chacha20Constants<>+0(SB), X0 MOVO 32(BP), X3 MOVO 48(BP), X6 MOVO 128(BP), X9 PADDL ·sseIncMask<>+0(SB), X9 MOVO X9, 80(BP) MOVO X0, X1 MOVO X3, X4 MOVO X6, X7 MOVO X9, X10 PADDL ·sseIncMask<>+0(SB), X10 MOVO X10, 96(BP) sealSSETail128LoopA: ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(DI), DI sealSSETail128LoopB: PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x0c, X12 PSRLL $0x14, X4 PXOR X12, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x07, X12 PSRLL $0x19, X4 PXOR X12, X4 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x0c ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ 
R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(DI), DI PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x0c, X12 PSRLL $0x14, X4 PXOR X12, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x07, X12 PSRLL $0x19, X4 PXOR X12, X4 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x04 DECQ CX JG sealSSETail128LoopA DECQ R9 JGE sealSSETail128LoopB PADDL ·chacha20Constants<>+0(SB), X0 PADDL ·chacha20Constants<>+0(SB), X1 PADDL 32(BP), X3 PADDL 32(BP), X4 PADDL 48(BP), X6 PADDL 48(BP), X7 PADDL 80(BP), X9 PADDL 96(BP), X10 MOVOU (SI), X12 MOVOU 16(SI), X13 MOVOU 32(SI), X14 MOVOU 48(SI), X15 PXOR X12, X0 PXOR X13, X3 PXOR X14, X6 PXOR X15, X9 MOVOU X0, (DI) MOVOU X3, 16(DI) MOVOU X6, 32(DI) MOVOU X9, 48(DI) MOVQ $0x00000040, CX LEAQ 64(SI), SI SUBQ $0x40, BX JMP sealSSE128SealHash sealSSETail192: MOVO ·chacha20Constants<>+0(SB), X0 MOVO 32(BP), X3 MOVO 48(BP), X6 MOVO 128(BP), X9 PADDL ·sseIncMask<>+0(SB), X9 MOVO X9, 80(BP) MOVO X0, X1 MOVO X3, X4 MOVO X6, X7 MOVO X9, X10 PADDL ·sseIncMask<>+0(SB), X10 MOVO X10, 96(BP) MOVO X1, X2 MOVO X4, X5 MOVO X7, X8 MOVO X10, X11 PADDL ·sseIncMask<>+0(SB), X11 MOVO X11, 112(BP) sealSSETail192LoopA: ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(DI), DI sealSSETail192LoopB: PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x0c, X12 PSRLL $0x14, X4 PXOR X12, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x07, X12 PSRLL $0x19, X4 PXOR X12, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x0c, X12 PSRLL $0x14, X5 PXOR X12, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x07, X12 PSRLL $0x19, X5 PXOR X12, X5 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a 
BYTE $0x0f BYTE $0xe4 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(DI), DI PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x0c, X12 PSRLL $0x14, X4 PXOR X12, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x07, X12 PSRLL $0x19, X4 PXOR X12, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x0c, X12 PSRLL $0x14, X5 PXOR X12, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x07, X12 PSRLL $0x19, X5 PXOR X12, X5 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 DECQ CX JG sealSSETail192LoopA DECQ R9 JGE sealSSETail192LoopB PADDL ·chacha20Constants<>+0(SB), X0 PADDL ·chacha20Constants<>+0(SB), X1 PADDL ·chacha20Constants<>+0(SB), X2 PADDL 32(BP), X3 PADDL 32(BP), X4 PADDL 32(BP), X5 PADDL 48(BP), X6 PADDL 48(BP), X7 PADDL 48(BP), X8 PADDL 80(BP), X9 PADDL 96(BP), X10 PADDL 112(BP), X11 MOVOU (SI), X12 MOVOU 16(SI), X13 MOVOU 32(SI), X14 MOVOU 48(SI), X15 PXOR X12, X0 PXOR X13, X3 PXOR X14, X6 PXOR X15, X9 MOVOU X0, (DI) MOVOU X3, 16(DI) MOVOU X6, 32(DI) MOVOU X9, 48(DI) MOVOU 64(SI), X12 MOVOU 80(SI), X13 MOVOU 96(SI), X14 MOVOU 112(SI), X15 PXOR X12, X1 PXOR X13, X4 PXOR X14, X7 PXOR X15, X10 MOVOU X1, 64(DI) MOVOU X4, 80(DI) MOVOU X7, 96(DI) MOVOU X10, 112(DI) MOVO X2, X1 MOVO X5, X4 MOVO X8, X7 MOVO X11, X10 MOVQ $0x00000080, CX LEAQ 128(SI), SI SUBQ $0x80, BX JMP sealSSE128SealHash sealSSE128: MOVOU ·chacha20Constants<>+0(SB), X0 MOVOU 16(R8), X3 MOVOU 32(R8), X6 MOVOU 48(R8), X9 MOVO X0, X1 MOVO X3, X4 MOVO X6, X7 MOVO X9, X10 PADDL ·sseIncMask<>+0(SB), X10 MOVO X1, X2 MOVO X4, X5 MOVO X7, X8 MOVO X10, X11 PADDL ·sseIncMask<>+0(SB), X11 MOVO X3, X13 MOVO X6, X14 MOVO X10, X15 MOVQ $0x0000000a, R9 sealSSE128InnerCipherLoop: 
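// sealSSE128InnerCipherLoop runs ten iterations of a column round followed by a diagonal
// round over three parallel blocks; the diagonal re-arrangements of the b/c/d rows are
// PALIGNR $4/$8/$12 instructions emitted as raw byte encodings (the BYTE sequences below).
// A compact Go sketch of the scalar quarter-round that each of these vector sequences
// applies lane-wise (illustrative helper, not part of this package; assumes "math/bits"):
//
//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
//		a += b
//		d = bits.RotateLeft32(d^a, 16)
//		c += d
//		b = bits.RotateLeft32(b^c, 12)
//		a += b
//		d = bits.RotateLeft32(d^a, 8)
//		c += d
//		b = bits.RotateLeft32(b^c, 7)
//		return a, b, c, d
//	}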
PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x0c, X12 PSRLL $0x14, X4 PXOR X12, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x07, X12 PSRLL $0x19, X4 PXOR X12, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x0c, X12 PSRLL $0x14, X5 PXOR X12, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x07, X12 PSRLL $0x19, X5 PXOR X12, X5 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x04 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x0c BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c PADDD X3, X0 PXOR X0, X9 ROL16(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x0c, X12 PSRLL $0x14, X3 PXOR X12, X3 PADDD X3, X0 PXOR X0, X9 ROL8(X9, X12) PADDD X9, X6 PXOR X6, X3 MOVO X3, X12 PSLLL $0x07, X12 PSRLL $0x19, X3 PXOR X12, X3 PADDD X4, X1 PXOR X1, X10 ROL16(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x0c, X12 PSRLL $0x14, X4 PXOR X12, X4 PADDD X4, X1 PXOR X1, X10 ROL8(X10, X12) PADDD X10, X7 PXOR X7, X4 MOVO X4, X12 PSLLL $0x07, X12 PSRLL $0x19, X4 PXOR X12, X4 PADDD X5, X2 PXOR X2, X11 ROL16(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x0c, X12 PSRLL $0x14, X5 PXOR X12, X5 PADDD X5, X2 PXOR X2, X11 ROL8(X11, X12) PADDD X11, X8 PXOR X8, X5 MOVO X5, X12 PSLLL $0x07, X12 PSRLL $0x19, X5 PXOR X12, X5 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xe4 BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xed BYTE $0x0c BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xf6 BYTE $0x08 BYTE $0x66 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xff BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc0 BYTE $0x08 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xc9 BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xd2 BYTE $0x04 BYTE $0x66 BYTE $0x45 BYTE $0x0f BYTE $0x3a BYTE $0x0f BYTE $0xdb BYTE $0x04 DECQ R9 JNE sealSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded PADDL ·chacha20Constants<>+0(SB), X0 PADDL ·chacha20Constants<>+0(SB), X1 PADDL ·chacha20Constants<>+0(SB), X2 PADDL X13, X3 PADDL X13, X4 PADDL X13, X5 PADDL X14, X7 PADDL X14, X8 PADDL X15, X10 PADDL ·sseIncMask<>+0(SB), X15 PADDL X15, X11 PAND ·polyClampMask<>+0(SB), X0 MOVOU X0, (BP) MOVOU X3, 16(BP) // Hash MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) XORQ CX, CX sealSSE128SealHash: CMPQ CX, $0x10 JB sealSSE128Seal ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX 
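// This generic-path Poly1305 multiply goes through MULQ, so every partial product passes
// through AX/DX; the AVX2 branches use BMI2 MULXQ instead, which leaves the flags untouched
// and writes to named registers, letting the multiply interleave with the ADCQ carry chains
// and the vector rounds.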
MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 SUBQ $0x10, CX ADDQ $0x10, DI JMP sealSSE128SealHash sealSSE128Seal: CMPQ BX, $0x10 JB sealSSETail SUBQ $0x10, BX // Load for decryption MOVOU (SI), X12 PXOR X12, X1 MOVOU X1, (DI) LEAQ 16(SI), SI LEAQ 16(DI), DI // Extract for hashing MOVQ X1, R13 PSRLDQ $0x08, X1 MOVQ X1, R14 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 // Shift the stream "left" MOVO X4, X1 MOVO X7, X4 MOVO X10, X7 MOVO X2, X10 MOVO X5, X2 MOVO X8, X5 MOVO X11, X8 JMP sealSSE128Seal sealSSETail: TESTQ BX, BX JE sealSSEFinalize // We can only load the PT one byte at a time to avoid read after end of buffer MOVQ BX, R9 SHLQ $0x04, R9 LEAQ ·andMask<>+0(SB), R13 MOVQ BX, CX LEAQ -1(SI)(BX*1), SI XORQ R15, R15 XORQ R8, R8 XORQ AX, AX sealSSETailLoadLoop: SHLQ $0x08, R15, R8 SHLQ $0x08, R15 MOVB (SI), AX XORQ AX, R15 LEAQ -1(SI), SI DECQ CX JNE sealSSETailLoadLoop MOVQ R15, 64(BP) MOVQ R8, 72(BP) PXOR 64(BP), X1 MOVOU X1, (DI) MOVOU -16(R13)(R9*1), X12 PAND X12, X1 MOVQ X1, R13 PSRLDQ $0x08, X1 MOVQ X1, R14 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 ADDQ BX, DI sealSSEFinalize: // Hash in the buffer lengths ADDQ ad_len+80(FP), R10 ADCQ src_len+56(FP), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 // Final reduce MOVQ R10, R13 MOVQ R11, R14 MOVQ R12, R15 SUBQ $-5, R10 SBBQ $-1, R11 SBBQ $0x03, R12 CMOVQCS R13, R10 CMOVQCS R14, R11 CMOVQCS R15, R12 // Add in the "s" part of the key ADDQ 16(BP), R10 ADCQ 24(BP), R11 // Finally store the tag at the end of the message MOVQ R10, (DI) MOVQ R11, 8(DI) RET chacha20Poly1305Seal_AVX2: VZEROUPPER VMOVDQU ·chacha20Constants<>+0(SB), Y0 BYTE $0xc4 BYTE $0x42 BYTE $0x7d BYTE $0x5a BYTE $0x70 BYTE $0x10 BYTE $0xc4 BYTE $0x42 BYTE $0x7d BYTE $0x5a BYTE $0x60 BYTE $0x20 BYTE $0xc4 
BYTE $0xc2 BYTE $0x7d BYTE $0x5a BYTE $0x60 BYTE $0x30 VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimizations, for very short buffers CMPQ BX, $0x000000c0 JBE seal192AVX2 CMPQ BX, $0x00000140 JBE seal320AVX2 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream VMOVDQA Y0, Y5 VMOVDQA Y0, Y6 VMOVDQA Y0, Y7 VMOVDQA Y14, Y9 VMOVDQA Y14, Y10 VMOVDQA Y14, Y11 VMOVDQA Y14, 32(BP) VMOVDQA Y12, Y13 VMOVDQA Y12, Y8 VMOVDQA Y12, Y15 VMOVDQA Y12, 64(BP) VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VMOVDQA Y4, 96(BP) VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 VMOVDQA Y1, 128(BP) VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 VMOVDQA Y2, 160(BP) VMOVDQA Y3, 192(BP) MOVQ $0x0000000a, R9 sealAVX2IntroLoop: VMOVDQA Y15, 224(BP) VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y15 VPSRLD $0x19, Y14, Y14 VPXOR Y15, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 VPXOR Y15, Y9, Y9 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x07, Y10, Y15 VPSRLD $0x19, Y10, Y10 VPXOR Y15, Y10, Y10 VMOVDQA 224(BP), Y15 VMOVDQA Y13, 224(BP) VPADDD Y11, Y7, Y7 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y3, Y15, Y15 VPXOR Y15, Y11, Y11 VPSLLD $0x0c, Y11, Y13 VPSRLD $0x14, Y11, Y11 VPXOR Y13, Y11, Y11 VPADDD Y11, Y7, Y7 VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y3, Y15, Y15 VPXOR Y15, Y11, Y11 VPSLLD $0x07, Y11, Y13 VPSRLD $0x19, Y11, Y11 VPXOR Y13, Y11, Y11 VMOVDQA 224(BP), Y13 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x04, Y10, Y10, Y10 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x0c, Y2, Y2, Y2 VPALIGNR $0x04, Y11, Y11, Y11 VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x0c, Y3, Y3, Y3 VMOVDQA Y15, 224(BP) VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y15 VPSRLD $0x19, Y14, Y14 VPXOR Y15, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 VPXOR Y15, Y9, Y9 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x07, Y10, Y15 VPSRLD $0x19, Y10, Y10 VPXOR Y15, Y10, Y10 VMOVDQA 
224(BP), Y15 VMOVDQA Y13, 224(BP) VPADDD Y11, Y7, Y7 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y3, Y15, Y15 VPXOR Y15, Y11, Y11 VPSLLD $0x0c, Y11, Y13 VPSRLD $0x14, Y11, Y11 VPXOR Y13, Y11, Y11 VPADDD Y11, Y7, Y7 VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y3, Y15, Y15 VPXOR Y15, Y11, Y11 VPSLLD $0x07, Y11, Y13 VPSRLD $0x19, Y11, Y11 VPXOR Y13, Y11, Y11 VMOVDQA 224(BP), Y13 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x0c, Y10, Y10, Y10 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x04, Y2, Y2, Y2 VPALIGNR $0x0c, Y11, Y11, Y11 VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x04, Y3, Y3, Y3 DECQ R9 JNE sealAVX2IntroLoop VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 VPADDD 32(BP), Y14, Y14 VPADDD 32(BP), Y9, Y9 VPADDD 32(BP), Y10, Y10 VPADDD 32(BP), Y11, Y11 VPADDD 64(BP), Y12, Y12 VPADDD 64(BP), Y13, Y13 VPADDD 64(BP), Y8, Y8 VPADDD 64(BP), Y15, Y15 VPADDD 96(BP), Y4, Y4 VPADDD 128(BP), Y1, Y1 VPADDD 160(BP), Y2, Y2 VPADDD 192(BP), Y3, Y3 VPERM2I128 $0x13, Y12, Y4, Y12 VPERM2I128 $0x02, Y0, Y14, Y4 VPERM2I128 $0x13, Y0, Y14, Y0 // Clamp and store poly key VPAND ·polyClampMask<>+0(SB), Y4, Y4 VMOVDQA Y4, (BP) // Hash AD MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) // Can store at least 320 bytes VPXOR (SI), Y0, Y0 VPXOR 32(SI), Y12, Y12 VMOVDQU Y0, (DI) VMOVDQU Y12, 32(DI) VPERM2I128 $0x02, Y5, Y9, Y0 VPERM2I128 $0x02, Y13, Y1, Y14 VPERM2I128 $0x13, Y5, Y9, Y12 VPERM2I128 $0x13, Y13, Y1, Y4 VPXOR 64(SI), Y0, Y0 VPXOR 96(SI), Y14, Y14 VPXOR 128(SI), Y12, Y12 VPXOR 160(SI), Y4, Y4 VMOVDQU Y0, 64(DI) VMOVDQU Y14, 96(DI) VMOVDQU Y12, 128(DI) VMOVDQU Y4, 160(DI) VPERM2I128 $0x02, Y6, Y10, Y0 VPERM2I128 $0x02, Y8, Y2, Y14 VPERM2I128 $0x13, Y6, Y10, Y12 VPERM2I128 $0x13, Y8, Y2, Y4 VPXOR 192(SI), Y0, Y0 VPXOR 224(SI), Y14, Y14 VPXOR 256(SI), Y12, Y12 VPXOR 288(SI), Y4, Y4 VMOVDQU Y0, 192(DI) VMOVDQU Y14, 224(DI) VMOVDQU Y12, 256(DI) VMOVDQU Y4, 288(DI) MOVQ $0x00000140, CX SUBQ $0x00000140, BX LEAQ 320(SI), SI VPERM2I128 $0x02, Y7, Y11, Y0 VPERM2I128 $0x02, Y15, Y3, Y14 VPERM2I128 $0x13, Y7, Y11, Y12 VPERM2I128 $0x13, Y15, Y3, Y4 CMPQ BX, $0x80 JBE sealAVX2SealHash VPXOR (SI), Y0, Y0 VPXOR 32(SI), Y14, Y14 VPXOR 64(SI), Y12, Y12 VPXOR 96(SI), Y4, Y4 VMOVDQU Y0, 320(DI) VMOVDQU Y14, 352(DI) VMOVDQU Y12, 384(DI) VMOVDQU Y4, 416(DI) SUBQ $0x80, BX LEAQ 128(SI), SI MOVQ $0x00000008, CX MOVQ $0x00000002, R9 CMPQ BX, $0x80 JBE sealAVX2Tail128 CMPQ BX, $0x00000100 JBE sealAVX2Tail256 CMPQ BX, $0x00000180 JBE sealAVX2Tail384 CMPQ BX, $0x00000200 JBE sealAVX2Tail512 // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop VMOVDQA ·chacha20Constants<>+0(SB), Y0 VMOVDQA Y0, Y5 VMOVDQA Y0, Y6 VMOVDQA Y0, Y7 VMOVDQA 32(BP), Y14 VMOVDQA Y14, Y9 VMOVDQA Y14, Y10 VMOVDQA Y14, Y11 VMOVDQA 64(BP), Y12 VMOVDQA Y12, Y13 VMOVDQA Y12, Y8 VMOVDQA Y12, Y15 VMOVDQA 192(BP), Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 VMOVDQA Y4, 96(BP) VMOVDQA Y1, 128(BP) VMOVDQA Y2, 160(BP) VMOVDQA Y3, 192(BP) VMOVDQA Y15, 224(BP) VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPADDD 
Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y15 VPSRLD $0x19, Y14, Y14 VPXOR Y15, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 VPXOR Y15, Y9, Y9 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x07, Y10, Y15 VPSRLD $0x19, Y10, Y10 VPXOR Y15, Y10, Y10 VMOVDQA 224(BP), Y15 VMOVDQA Y13, 224(BP) VPADDD Y11, Y7, Y7 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y3, Y15, Y15 VPXOR Y15, Y11, Y11 VPSLLD $0x0c, Y11, Y13 VPSRLD $0x14, Y11, Y11 VPXOR Y13, Y11, Y11 VPADDD Y11, Y7, Y7 VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y3, Y15, Y15 VPXOR Y15, Y11, Y11 VPSLLD $0x07, Y11, Y13 VPSRLD $0x19, Y11, Y11 VPXOR Y13, Y11, Y11 VMOVDQA 224(BP), Y13 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x04, Y10, Y10, Y10 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x0c, Y2, Y2, Y2 VPALIGNR $0x04, Y11, Y11, Y11 VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x0c, Y3, Y3, Y3 VMOVDQA Y15, 224(BP) VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y15 VPSRLD $0x19, Y14, Y14 VPXOR Y15, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 VPXOR Y15, Y9, Y9 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x07, Y10, Y15 VPSRLD $0x19, Y10, Y10 VPXOR Y15, Y10, Y10 VMOVDQA 224(BP), Y15 VMOVDQA Y13, 224(BP) VPADDD Y11, Y7, Y7 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y3, Y15, Y15 VPXOR Y15, Y11, Y11 VPSLLD $0x0c, Y11, Y13 VPSRLD $0x14, Y11, Y11 VPXOR Y13, Y11, Y11 VPADDD Y11, Y7, Y7 VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y3, Y15, Y15 VPXOR Y15, Y11, Y11 VPSLLD $0x07, Y11, Y13 VPSRLD $0x19, Y11, Y11 VPXOR Y13, Y11, Y11 VMOVDQA 224(BP), Y13 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x0c, Y10, Y10, Y10 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x04, Y2, Y2, Y2 VPALIGNR $0x0c, Y11, Y11, Y11 VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x04, Y3, Y3, Y3 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB 
·rol16<>+0(SB), Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 VMOVDQA Y15, 224(BP) VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x0c, Y11, Y15 VPSRLD $0x14, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 SUBQ $0x10, DI MOVQ $0x00000009, CX JMP sealAVX2InternalLoopStart sealAVX2MainLoop: VMOVDQU ·chacha20Constants<>+0(SB), Y0 VMOVDQA Y0, Y5 VMOVDQA Y0, Y6 VMOVDQA Y0, Y7 VMOVDQA 32(BP), Y14 VMOVDQA Y14, Y9 VMOVDQA Y14, Y10 VMOVDQA Y14, Y11 VMOVDQA 64(BP), Y12 VMOVDQA Y12, Y13 VMOVDQA Y12, Y8 VMOVDQA Y12, Y15 VMOVDQA 192(BP), Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 VMOVDQA Y4, 96(BP) VMOVDQA Y1, 128(BP) VMOVDQA Y2, 160(BP) VMOVDQA Y3, 192(BP) MOVQ $0x0000000a, CX sealAVX2InternalLoop: ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y3, Y3 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 VMOVDQA Y15, 224(BP) VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x0c, Y11, Y15 VPSRLD $0x14, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 sealAVX2InternalLoopStart: VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y3, Y3 ADDQ 16(DI), R10 ADCQ 24(DI), R11 ADCQ $0x01, R12 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 VMOVDQA Y15, 224(BP) VPSLLD $0x07, Y14, Y15 VPSRLD $0x19, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x07, Y10, Y15 VPSRLD $0x19, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x07, Y11, Y15 VPSRLD $0x19, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x04, Y10, Y10, Y10 VPALIGNR $0x04, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR 
$0x08, Y8, Y8, Y8 VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x0c, Y2, Y2, Y2 VPALIGNR $0x0c, Y3, Y3, Y3 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y3, Y3 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 ADDQ 32(DI), R10 ADCQ 40(DI), R11 ADCQ $0x01, R12 LEAQ 48(DI), DI VMOVDQA Y15, 224(BP) VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x0c, Y11, Y15 VPSRLD $0x14, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 VMOVDQA Y15, 224(BP) VPSLLD $0x07, Y14, Y15 VPSRLD $0x19, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x07, Y10, Y15 VPSRLD $0x19, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x07, Y11, Y15 VPSRLD $0x19, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x0c, Y10, Y10, Y10 VPALIGNR $0x0c, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x04, Y2, Y2, Y2 VPALIGNR $0x04, Y3, Y3, Y3 DECQ CX JNE sealAVX2InternalLoop VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 VPADDD 32(BP), Y14, Y14 VPADDD 32(BP), Y9, Y9 VPADDD 32(BP), Y10, Y10 VPADDD 32(BP), Y11, Y11 VPADDD 64(BP), Y12, Y12 VPADDD 64(BP), Y13, Y13 VPADDD 64(BP), Y8, Y8 VPADDD 64(BP), Y15, Y15 VPADDD 96(BP), Y4, Y4 VPADDD 128(BP), Y1, Y1 VPADDD 160(BP), Y2, Y2 VPADDD 192(BP), Y3, Y3 VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, 
R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 32(DI), DI VPERM2I128 $0x02, Y0, Y14, Y15 VPERM2I128 $0x13, Y0, Y14, Y14 VPERM2I128 $0x02, Y12, Y4, Y0 VPERM2I128 $0x13, Y12, Y4, Y12 VPXOR (SI), Y15, Y15 VPXOR 32(SI), Y0, Y0 VPXOR 64(SI), Y14, Y14 VPXOR 96(SI), Y12, Y12 VMOVDQU Y15, (DI) VMOVDQU Y0, 32(DI) VMOVDQU Y14, 64(DI) VMOVDQU Y12, 96(DI) VPERM2I128 $0x02, Y5, Y9, Y0 VPERM2I128 $0x02, Y13, Y1, Y14 VPERM2I128 $0x13, Y5, Y9, Y12 VPERM2I128 $0x13, Y13, Y1, Y4 VPXOR 128(SI), Y0, Y0 VPXOR 160(SI), Y14, Y14 VPXOR 192(SI), Y12, Y12 VPXOR 224(SI), Y4, Y4 VMOVDQU Y0, 128(DI) VMOVDQU Y14, 160(DI) VMOVDQU Y12, 192(DI) VMOVDQU Y4, 224(DI) // and here ADDQ -16(DI), R10 ADCQ -8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPERM2I128 $0x02, Y6, Y10, Y0 VPERM2I128 $0x02, Y8, Y2, Y14 VPERM2I128 $0x13, Y6, Y10, Y12 VPERM2I128 $0x13, Y8, Y2, Y4 VPXOR 256(SI), Y0, Y0 VPXOR 288(SI), Y14, Y14 VPXOR 320(SI), Y12, Y12 VPXOR 352(SI), Y4, Y4 VMOVDQU Y0, 256(DI) VMOVDQU Y14, 288(DI) VMOVDQU Y12, 320(DI) VMOVDQU Y4, 352(DI) VPERM2I128 $0x02, Y7, Y11, Y0 VPERM2I128 $0x02, 224(BP), Y3, Y14 VPERM2I128 $0x13, Y7, Y11, Y12 VPERM2I128 $0x13, 224(BP), Y3, Y4 VPXOR 384(SI), Y0, Y0 VPXOR 416(SI), Y14, Y14 VPXOR 448(SI), Y12, Y12 VPXOR 480(SI), Y4, Y4 VMOVDQU Y0, 384(DI) VMOVDQU Y14, 416(DI) VMOVDQU Y12, 448(DI) VMOVDQU Y4, 480(DI) LEAQ 512(SI), SI SUBQ $0x00000200, BX CMPQ BX, $0x00000200 JG sealAVX2MainLoop // Tail can only hash 480 bytes ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 ADDQ 16(DI), R10 ADCQ 24(DI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 32(DI), DI MOVQ $0x0000000a, CX MOVQ $0x00000000, R9 CMPQ BX, $0x80 JBE sealAVX2Tail128 CMPQ BX, $0x00000100 JBE sealAVX2Tail256 CMPQ BX, $0x00000180 JBE sealAVX2Tail384 JMP sealAVX2Tail512 seal192AVX2: VMOVDQA Y0, Y5 VMOVDQA Y14, Y9 VMOVDQA Y12, Y13 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VMOVDQA Y0, Y6 VMOVDQA Y14, Y10 VMOVDQA Y12, Y8 VMOVDQA Y4, Y2 VMOVDQA Y1, Y15 MOVQ $0x0000000a, R9 sealAVX2192InnerCipherLoop: VPADDD Y14, Y0, Y0 
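	// The VPADDD/VPXOR/VPSHUFB/VPSLLD/VPSRLD groups in this loop implement the
	// ChaCha20 quarter round on eight uint32 lanes per register: rotates by 16
	// and 8 use the rol16/rol8 byte-shuffle masks, while rotates by 12 and 7 are
	// built from a shift-left, a shift-right and an XOR. For orientation, a
	// scalar Go version of one quarter round (illustrative sketch only, not code
	// emitted by the generator):
	//
	//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	//		a += b; d ^= a; d = d<<16 | d>>16
	//		c += d; b ^= c; b = b<<12 | b>>20
	//		a += b; d ^= a; d = d<<8 | d>>24
	//		c += d; b ^= c; b = b<<7 | b>>25
	//		return a, b, c, d
	//	}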
VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 DECQ R9 JNE sealAVX2192InnerCipherLoop VPADDD Y6, Y0, Y0 VPADDD Y6, Y5, Y5 VPADDD Y10, Y14, Y14 VPADDD Y10, Y9, Y9 VPADDD Y8, Y12, Y12 VPADDD Y8, Y13, Y13 VPADDD Y2, Y4, Y4 VPADDD Y15, Y1, Y1 VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key VPAND ·polyClampMask<>+0(SB), Y3, Y3 VMOVDQA Y3, (BP) // Stream for up to 192 bytes VPERM2I128 $0x13, Y0, Y14, Y0 VPERM2I128 $0x13, Y12, Y4, Y14 VPERM2I128 $0x02, Y5, Y9, Y12 VPERM2I128 $0x02, Y13, Y1, Y4 VPERM2I128 $0x13, Y5, Y9, Y5 VPERM2I128 $0x13, Y13, Y1, Y9 sealAVX2ShortSeal: // Hash aad MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) XORQ CX, CX sealAVX2SealHash: // itr1 holds the number of bytes encrypted but not yet hashed CMPQ CX, $0x10 JB sealAVX2ShortSealLoop ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 SUBQ $0x10, CX ADDQ $0x10, DI JMP sealAVX2SealHash sealAVX2ShortSealLoop: CMPQ BX, $0x20 JB sealAVX2ShortTail32 SUBQ $0x20, BX // Load for encryption VPXOR (SI), Y0, Y0 VMOVDQU Y0, (DI) LEAQ 32(SI), SI // Now can hash ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 
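	// The MOVQ/ANDQ/SHRQ/ADDQ/ADCQ run around this point is the Poly1305 partial
	// reduction: the product h*r sits in R13:R14:R15:R8, the low 130 bits stay in
	// the accumulator, and everything above 2^130 is folded back in multiplied by
	// 5 (since 2^130 ≡ 5 mod 2^130-5), computed as 4*c + c via the ANDQ $-4 and
	// the two SHRQ $2 instructions. A Go sketch of the same fold, assuming
	// "math/bits" (illustrative only, not the generated code):
	//
	//	func fold(t0, t1, t2, t3 uint64) (h0, h1, h2 uint64) {
	//		h0, h1, h2 = t0, t1, t2&3 // low 130 bits of the product
	//		cc0, cc1 := t2&^3, t3     // the rest, already scaled by 4
	//		var c uint64
	//		h0, c = bits.Add64(h0, cc0, 0)
	//		h1, c = bits.Add64(h1, cc1, c)
	//		h2 += c
	//		h0, c = bits.Add64(h0, cc0>>2|cc1<<62, 0) // add the rest once more: 4c + c = 5c
	//		h1, c = bits.Add64(h1, cc1>>2, c)
	//		h2 += c
	//		return
	//	}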
SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 ADDQ 16(DI), R10 ADCQ 24(DI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 32(DI), DI // Shift stream left VMOVDQA Y14, Y0 VMOVDQA Y12, Y14 VMOVDQA Y4, Y12 VMOVDQA Y5, Y4 VMOVDQA Y9, Y5 VMOVDQA Y13, Y9 VMOVDQA Y1, Y13 VMOVDQA Y6, Y1 VMOVDQA Y10, Y6 JMP sealAVX2ShortSealLoop sealAVX2ShortTail32: CMPQ BX, $0x10 VMOVDQA X0, X1 JB sealAVX2ShortDone SUBQ $0x10, BX // Load for encryption VPXOR (SI), X0, X12 VMOVDQU X12, (DI) LEAQ 16(SI), SI // Hash ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(DI), DI VPERM2I128 $0x11, Y0, Y0, Y0 VMOVDQA X0, X1 sealAVX2ShortDone: VZEROUPPER JMP sealSSETail seal320AVX2: VMOVDQA Y0, Y5 VMOVDQA Y14, Y9 VMOVDQA Y12, Y13 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VMOVDQA Y0, Y6 VMOVDQA Y14, Y10 VMOVDQA Y12, Y8 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 VMOVDQA Y14, Y7 VMOVDQA Y12, Y11 VMOVDQA Y4, Y15 MOVQ $0x0000000a, R9 sealAVX2320InnerCipherLoop: VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x0c, Y10, Y3 VPSRLD $0x14, Y10, Y10 VPXOR Y3, Y10, Y10 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 VPXOR Y3, Y10, Y10 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x04, Y10, Y10, Y10 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x0c, Y2, Y2, Y2 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB 
·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x0c, Y10, Y3 VPSRLD $0x14, Y10, Y10 VPXOR Y3, Y10, Y10 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 VPXOR Y3, Y10, Y10 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x0c, Y10, Y10, Y10 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x04, Y2, Y2, Y2 DECQ R9 JNE sealAVX2320InnerCipherLoop VMOVDQA ·chacha20Constants<>+0(SB), Y3 VPADDD Y3, Y0, Y0 VPADDD Y3, Y5, Y5 VPADDD Y3, Y6, Y6 VPADDD Y7, Y14, Y14 VPADDD Y7, Y9, Y9 VPADDD Y7, Y10, Y10 VPADDD Y11, Y12, Y12 VPADDD Y11, Y13, Y13 VPADDD Y11, Y8, Y8 VMOVDQA ·avx2IncMask<>+0(SB), Y3 VPADDD Y15, Y4, Y4 VPADDD Y3, Y15, Y15 VPADDD Y15, Y1, Y1 VPADDD Y3, Y15, Y15 VPADDD Y15, Y2, Y2 // Clamp and store poly key VPERM2I128 $0x02, Y0, Y14, Y3 VPAND ·polyClampMask<>+0(SB), Y3, Y3 VMOVDQA Y3, (BP) // Stream for up to 320 bytes VPERM2I128 $0x13, Y0, Y14, Y0 VPERM2I128 $0x13, Y12, Y4, Y14 VPERM2I128 $0x02, Y5, Y9, Y12 VPERM2I128 $0x02, Y13, Y1, Y4 VPERM2I128 $0x13, Y5, Y9, Y5 VPERM2I128 $0x13, Y13, Y1, Y9 VPERM2I128 $0x02, Y6, Y10, Y13 VPERM2I128 $0x02, Y8, Y2, Y1 VPERM2I128 $0x13, Y6, Y10, Y6 VPERM2I128 $0x13, Y8, Y2, Y10 JMP sealAVX2ShortSeal sealAVX2Tail128: VMOVDQA ·chacha20Constants<>+0(SB), Y0 VMOVDQA 32(BP), Y14 VMOVDQA 64(BP), Y12 VMOVDQA 192(BP), Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 VMOVDQA Y4, Y1 sealAVX2Tail128LoopA: ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(DI), DI sealAVX2Tail128LoopB: VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x0c, Y4, Y4, Y4 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB 
·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 ADDQ 16(DI), R10 ADCQ 24(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 32(DI), DI VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x04, Y4, Y4, Y4 DECQ CX JG sealAVX2Tail128LoopA DECQ R9 JGE sealAVX2Tail128LoopB VPADDD ·chacha20Constants<>+0(SB), Y0, Y5 VPADDD 32(BP), Y14, Y9 VPADDD 64(BP), Y12, Y13 VPADDD Y1, Y4, Y1 VPERM2I128 $0x02, Y5, Y9, Y0 VPERM2I128 $0x02, Y13, Y1, Y14 VPERM2I128 $0x13, Y5, Y9, Y12 VPERM2I128 $0x13, Y13, Y1, Y4 JMP sealAVX2ShortSealLoop sealAVX2Tail256: VMOVDQA ·chacha20Constants<>+0(SB), Y0 VMOVDQA ·chacha20Constants<>+0(SB), Y5 VMOVDQA 32(BP), Y14 VMOVDQA 32(BP), Y9 VMOVDQA 64(BP), Y12 VMOVDQA 64(BP), Y13 VMOVDQA 192(BP), Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VMOVDQA Y4, Y7 VMOVDQA Y1, Y11 sealAVX2Tail256LoopA: ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(DI), DI sealAVX2Tail256LoopB: VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 
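	// The VPALIGNR $0x04/$0x08/$0x0c triples rotate the second, third and fourth
	// rows of each ChaCha state left by one, two and three uint32 lanes, so the
	// next quarter rounds act on the diagonals; the mirrored $0x0c/$0x08/$0x04
	// triple at the end of the loop body undoes the rotation. Per 4x4 state this
	// amounts to (illustrative Go sketch only):
	//
	//	var state [16]uint32
	//	rot := func(row []uint32, n int) { // rotate four lanes left by n
	//		tmp := append(append([]uint32{}, row[n:]...), row[:n]...)
	//		copy(row, tmp)
	//	}
	//	rot(state[4:8], 1)   // "b" row
	//	rot(state[8:12], 2)  // "c" row
	//	rot(state[12:16], 3) // "d" row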
VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 ADDQ 16(DI), R10 ADCQ 24(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 32(DI), DI VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 DECQ CX JG sealAVX2Tail256LoopA DECQ R9 JGE sealAVX2Tail256LoopB VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 VPADDD 32(BP), Y14, Y14 VPADDD 32(BP), Y9, Y9 VPADDD 64(BP), Y12, Y12 VPADDD 64(BP), Y13, Y13 VPADDD Y7, Y4, Y4 VPADDD Y11, Y1, Y1 VPERM2I128 $0x02, Y0, Y14, Y3 VPERM2I128 $0x02, Y12, Y4, Y7 VPERM2I128 $0x13, Y0, Y14, Y11 VPERM2I128 $0x13, Y12, Y4, Y15 VPXOR (SI), Y3, Y3 VPXOR 32(SI), Y7, Y7 VPXOR 64(SI), Y11, Y11 VPXOR 96(SI), Y15, Y15 VMOVDQU Y3, (DI) VMOVDQU Y7, 32(DI) VMOVDQU Y11, 64(DI) VMOVDQU Y15, 96(DI) MOVQ $0x00000080, CX LEAQ 128(SI), SI SUBQ $0x80, BX VPERM2I128 $0x02, Y5, Y9, Y0 VPERM2I128 $0x02, Y13, Y1, Y14 VPERM2I128 $0x13, Y5, Y9, Y12 VPERM2I128 $0x13, Y13, Y1, Y4 JMP sealAVX2SealHash sealAVX2Tail384: VMOVDQA ·chacha20Constants<>+0(SB), Y0 VMOVDQA Y0, Y5 VMOVDQA Y0, Y6 VMOVDQA 32(BP), Y14 VMOVDQA Y14, Y9 VMOVDQA Y14, Y10 VMOVDQA 64(BP), Y12 VMOVDQA Y12, Y13 VMOVDQA Y12, Y8 VMOVDQA 192(BP), Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 VMOVDQA Y4, Y7 VMOVDQA Y1, Y11 VMOVDQA Y2, Y15 sealAVX2Tail384LoopA: ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(DI), DI sealAVX2Tail384LoopB: VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, 
Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x0c, Y10, Y3 VPSRLD $0x14, Y10, Y10 VPXOR Y3, Y10, Y10 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 VPXOR Y3, Y10, Y10 ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x04, Y10, Y10, Y10 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x0c, Y2, Y2, Y2 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 VPXOR Y0, Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPADDD Y4, Y12, Y12 VPXOR Y12, Y14, Y14 VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x0c, Y9, Y3 VPSRLD $0x14, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 VPXOR Y5, Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPADDD Y1, Y13, Y13 VPXOR Y13, Y9, Y9 VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 VPXOR Y3, Y9, Y9 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x0c, Y10, Y3 VPSRLD $0x14, Y10, Y10 VPXOR Y3, Y10, Y10 VPADDD Y10, Y6, Y6 VPXOR Y6, Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPADDD Y2, Y8, Y8 VPXOR Y8, Y10, Y10 VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 VPXOR Y3, Y10, Y10 ADDQ 16(DI), R10 ADCQ 24(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 32(DI), DI VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x0c, Y10, Y10, Y10 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x04, Y2, Y2, Y2 DECQ CX JG sealAVX2Tail384LoopA DECQ R9 JGE sealAVX2Tail384LoopB VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 VPADDD 32(BP), Y14, Y14 VPADDD 32(BP), 
Y9, Y9 VPADDD 32(BP), Y10, Y10 VPADDD 64(BP), Y12, Y12 VPADDD 64(BP), Y13, Y13 VPADDD 64(BP), Y8, Y8 VPADDD Y7, Y4, Y4 VPADDD Y11, Y1, Y1 VPADDD Y15, Y2, Y2 VPERM2I128 $0x02, Y0, Y14, Y3 VPERM2I128 $0x02, Y12, Y4, Y7 VPERM2I128 $0x13, Y0, Y14, Y11 VPERM2I128 $0x13, Y12, Y4, Y15 VPXOR (SI), Y3, Y3 VPXOR 32(SI), Y7, Y7 VPXOR 64(SI), Y11, Y11 VPXOR 96(SI), Y15, Y15 VMOVDQU Y3, (DI) VMOVDQU Y7, 32(DI) VMOVDQU Y11, 64(DI) VMOVDQU Y15, 96(DI) VPERM2I128 $0x02, Y5, Y9, Y3 VPERM2I128 $0x02, Y13, Y1, Y7 VPERM2I128 $0x13, Y5, Y9, Y11 VPERM2I128 $0x13, Y13, Y1, Y15 VPXOR 128(SI), Y3, Y3 VPXOR 160(SI), Y7, Y7 VPXOR 192(SI), Y11, Y11 VPXOR 224(SI), Y15, Y15 VMOVDQU Y3, 128(DI) VMOVDQU Y7, 160(DI) VMOVDQU Y11, 192(DI) VMOVDQU Y15, 224(DI) MOVQ $0x00000100, CX LEAQ 256(SI), SI SUBQ $0x00000100, BX VPERM2I128 $0x02, Y6, Y10, Y0 VPERM2I128 $0x02, Y8, Y2, Y14 VPERM2I128 $0x13, Y6, Y10, Y12 VPERM2I128 $0x13, Y8, Y2, Y4 JMP sealAVX2SealHash sealAVX2Tail512: VMOVDQA ·chacha20Constants<>+0(SB), Y0 VMOVDQA Y0, Y5 VMOVDQA Y0, Y6 VMOVDQA Y0, Y7 VMOVDQA 32(BP), Y14 VMOVDQA Y14, Y9 VMOVDQA Y14, Y10 VMOVDQA Y14, Y11 VMOVDQA 64(BP), Y12 VMOVDQA Y12, Y13 VMOVDQA Y12, Y8 VMOVDQA Y12, Y15 VMOVDQA 192(BP), Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 VMOVDQA Y4, 96(BP) VMOVDQA Y1, 128(BP) VMOVDQA Y2, 160(BP) VMOVDQA Y3, 192(BP) sealAVX2Tail512LoopA: ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), AX MOVQ AX, R15 MULQ R10 MOVQ AX, R13 MOVQ DX, R14 MOVQ (BP), AX MULQ R11 IMULQ R12, R15 ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), AX MOVQ AX, R8 MULQ R10 ADDQ AX, R14 ADCQ $0x00, DX MOVQ DX, R10 MOVQ 8(BP), AX MULQ R11 ADDQ AX, R15 ADCQ $0x00, DX IMULQ R12, R8 ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 16(DI), DI sealAVX2Tail512LoopB: VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 VMOVDQA Y15, 224(BP) VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x0c, Y11, Y15 VPSRLD $0x14, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 ADDQ (DI), R10 ADCQ 8(DI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y4, Y12, 
Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 VMOVDQA Y15, 224(BP) VPSLLD $0x07, Y14, Y15 VPSRLD $0x19, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x07, Y10, Y15 VPSRLD $0x19, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x07, Y11, Y15 VPSRLD $0x19, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x04, Y10, Y10, Y10 VPALIGNR $0x04, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x0c, Y2, Y2, Y2 VPALIGNR $0x0c, Y3, Y3, Y3 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 VPSHUFB ·rol16<>+0(SB), Y2, Y2 VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 ADDQ 16(DI), R10 ADCQ 24(DI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 MULXQ R10, R13, R14 IMULQ R12, R15 MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 MOVQ 8(BP), DX MULXQ R10, R10, AX ADDQ R10, R14 MULXQ R11, R11, R8 ADCQ R11, R15 ADCQ $0x00, R8 IMULQ R12, DX ADDQ AX, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 ANDQ $0x03, R12 MOVQ R15, R13 ANDQ $-4, R13 MOVQ R8, R14 SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 ADCQ R14, R11 ADCQ $0x00, R12 ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 LEAQ 32(DI), DI VMOVDQA Y15, 224(BP) VPSLLD $0x0c, Y14, Y15 VPSRLD $0x14, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x0c, Y10, Y15 VPSRLD $0x14, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x0c, Y11, Y15 VPSRLD $0x14, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 VPADDD Y10, Y6, Y6 VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y2, Y2 VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 VPXOR Y15, Y11, Y11 VMOVDQA Y15, 224(BP) VPSLLD $0x07, Y14, Y15 VPSRLD $0x19, Y14, Y14 VPXOR Y15, Y14, Y14 VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 VPXOR Y15, Y9, Y9 VPSLLD $0x07, Y10, Y15 VPSRLD $0x19, Y10, Y10 VPXOR Y15, Y10, Y10 VPSLLD $0x07, Y11, Y15 VPSRLD $0x19, Y11, Y11 VPXOR Y15, Y11, Y11 VMOVDQA 224(BP), Y15 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x0c, Y10, Y10, Y10 VPALIGNR $0x0c, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x04, Y2, Y2, Y2 VPALIGNR $0x04, Y3, Y3, Y3 DECQ CX JG sealAVX2Tail512LoopA DECQ R9 JGE sealAVX2Tail512LoopB VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 VPADDD 32(BP), Y14, Y14 VPADDD 32(BP), Y9, Y9 VPADDD 32(BP), Y10, Y10 VPADDD 32(BP), Y11, Y11 VPADDD 64(BP), Y12, Y12 VPADDD 64(BP), Y13, Y13 VPADDD 64(BP), Y8, Y8 VPADDD 64(BP), Y15, Y15 VPADDD 96(BP), Y4, Y4 VPADDD 128(BP), 
Y1, Y1 VPADDD 160(BP), Y2, Y2 VPADDD 192(BP), Y3, Y3 VMOVDQA Y15, 224(BP) VPERM2I128 $0x02, Y0, Y14, Y15 VPXOR (SI), Y15, Y15 VMOVDQU Y15, (DI) VPERM2I128 $0x02, Y12, Y4, Y15 VPXOR 32(SI), Y15, Y15 VMOVDQU Y15, 32(DI) VPERM2I128 $0x13, Y0, Y14, Y15 VPXOR 64(SI), Y15, Y15 VMOVDQU Y15, 64(DI) VPERM2I128 $0x13, Y12, Y4, Y15 VPXOR 96(SI), Y15, Y15 VMOVDQU Y15, 96(DI) VPERM2I128 $0x02, Y5, Y9, Y0 VPERM2I128 $0x02, Y13, Y1, Y14 VPERM2I128 $0x13, Y5, Y9, Y12 VPERM2I128 $0x13, Y13, Y1, Y4 VPXOR 128(SI), Y0, Y0 VPXOR 160(SI), Y14, Y14 VPXOR 192(SI), Y12, Y12 VPXOR 224(SI), Y4, Y4 VMOVDQU Y0, 128(DI) VMOVDQU Y14, 160(DI) VMOVDQU Y12, 192(DI) VMOVDQU Y4, 224(DI) VPERM2I128 $0x02, Y6, Y10, Y0 VPERM2I128 $0x02, Y8, Y2, Y14 VPERM2I128 $0x13, Y6, Y10, Y12 VPERM2I128 $0x13, Y8, Y2, Y4 VPXOR 256(SI), Y0, Y0 VPXOR 288(SI), Y14, Y14 VPXOR 320(SI), Y12, Y12 VPXOR 352(SI), Y4, Y4 VMOVDQU Y0, 256(DI) VMOVDQU Y14, 288(DI) VMOVDQU Y12, 320(DI) VMOVDQU Y4, 352(DI) MOVQ $0x00000180, CX LEAQ 384(SI), SI SUBQ $0x00000180, BX VPERM2I128 $0x02, Y7, Y11, Y0 VPERM2I128 $0x02, 224(BP), Y3, Y14 VPERM2I128 $0x13, Y7, Y11, Y12 VPERM2I128 $0x13, 224(BP), Y3, Y4 JMP sealAVX2SealHash
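	// In the AVX2 tails above, each VPERM2I128 $0x02/$0x13 pair regroups the
	// 128-bit halves of two YMM registers so that every 64-byte ChaCha20 block
	// becomes contiguous keystream before it is XORed with the plaintext and
	// stored; the tails that jump to sealAVX2SealHash leave CX holding the number
	// of ciphertext bytes written but not yet absorbed by Poly1305. The XOR and
	// store pairs amount to (illustrative Go sketch, hypothetical identifiers):
	//
	//	for i := range keystream { // dst, src, keystream are byte slices
	//		dst[i] = src[i] ^ keystream[i]
	//	}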