// Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg common. DO NOT EDIT. //go:build amd64 #include "textflag.h" // func addAVX2(p *[256]int16, a *[256]int16, b *[256]int16) // Requires: AVX, AVX2 TEXT ·addAVX2(SB), NOSPLIT, $0-24 MOVQ p+0(FP), AX MOVQ a+8(FP), CX MOVQ b+16(FP), DX VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y2 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y6 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y10 VMOVDQU 192(CX), Y12 VMOVDQU 224(CX), Y14 VMOVDQU (DX), Y1 VMOVDQU 32(DX), Y3 VMOVDQU 64(DX), Y5 VMOVDQU 96(DX), Y7 VMOVDQU 128(DX), Y9 VMOVDQU 160(DX), Y11 VMOVDQU 192(DX), Y13 VMOVDQU 224(DX), Y15 VPADDW Y0, Y1, Y1 VPADDW Y2, Y3, Y3 VPADDW Y4, Y5, Y5 VPADDW Y6, Y7, Y7 VPADDW Y8, Y9, Y9 VPADDW Y10, Y11, Y11 VPADDW Y12, Y13, Y13 VPADDW Y14, Y15, Y15 VMOVDQU Y1, (AX) VMOVDQU Y3, 32(AX) VMOVDQU Y5, 64(AX) VMOVDQU Y7, 96(AX) VMOVDQU Y9, 128(AX) VMOVDQU Y11, 160(AX) VMOVDQU Y13, 192(AX) VMOVDQU Y15, 224(AX) VMOVDQU 256(CX), Y0 VMOVDQU 288(CX), Y2 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y10 VMOVDQU 448(CX), Y12 VMOVDQU 480(CX), Y14 VMOVDQU 256(DX), Y1 VMOVDQU 288(DX), Y3 VMOVDQU 320(DX), Y5 VMOVDQU 352(DX), Y7 VMOVDQU 384(DX), Y9 VMOVDQU 416(DX), Y11 VMOVDQU 448(DX), Y13 VMOVDQU 480(DX), Y15 VPADDW Y0, Y1, Y1 VPADDW Y2, Y3, Y3 VPADDW Y4, Y5, Y5 VPADDW Y6, Y7, Y7 VPADDW Y8, Y9, Y9 VPADDW Y10, Y11, Y11 VPADDW Y12, Y13, Y13 VPADDW Y14, Y15, Y15 VMOVDQU Y1, 256(AX) VMOVDQU Y3, 288(AX) VMOVDQU Y5, 320(AX) VMOVDQU Y7, 352(AX) VMOVDQU Y9, 384(AX) VMOVDQU Y11, 416(AX) VMOVDQU Y13, 448(AX) VMOVDQU Y15, 480(AX) RET // func subAVX2(p *[256]int16, a *[256]int16, b *[256]int16) // Requires: AVX, AVX2 TEXT ·subAVX2(SB), NOSPLIT, $0-24 MOVQ p+0(FP), AX MOVQ a+8(FP), CX MOVQ b+16(FP), DX VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y2 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y6 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y10 VMOVDQU 192(CX), Y12 VMOVDQU 224(CX), Y14 VMOVDQU (DX), Y1 VMOVDQU 32(DX), Y3 VMOVDQU 64(DX), Y5 VMOVDQU 96(DX), Y7 VMOVDQU 
128(DX), Y9 VMOVDQU 160(DX), Y11 VMOVDQU 192(DX), Y13 VMOVDQU 224(DX), Y15 VPSUBW Y1, Y0, Y1 VPSUBW Y3, Y2, Y3 VPSUBW Y5, Y4, Y5 VPSUBW Y7, Y6, Y7 VPSUBW Y9, Y8, Y9 VPSUBW Y11, Y10, Y11 VPSUBW Y13, Y12, Y13 VPSUBW Y15, Y14, Y15 VMOVDQU Y1, (AX) VMOVDQU Y3, 32(AX) VMOVDQU Y5, 64(AX) VMOVDQU Y7, 96(AX) VMOVDQU Y9, 128(AX) VMOVDQU Y11, 160(AX) VMOVDQU Y13, 192(AX) VMOVDQU Y15, 224(AX) VMOVDQU 256(CX), Y0 VMOVDQU 288(CX), Y2 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y10 VMOVDQU 448(CX), Y12 VMOVDQU 480(CX), Y14 VMOVDQU 256(DX), Y1 VMOVDQU 288(DX), Y3 VMOVDQU 320(DX), Y5 VMOVDQU 352(DX), Y7 VMOVDQU 384(DX), Y9 VMOVDQU 416(DX), Y11 VMOVDQU 448(DX), Y13 VMOVDQU 480(DX), Y15 VPSUBW Y1, Y0, Y1 VPSUBW Y3, Y2, Y3 VPSUBW Y5, Y4, Y5 VPSUBW Y7, Y6, Y7 VPSUBW Y9, Y8, Y9 VPSUBW Y11, Y10, Y11 VPSUBW Y13, Y12, Y13 VPSUBW Y15, Y14, Y15 VMOVDQU Y1, 256(AX) VMOVDQU Y3, 288(AX) VMOVDQU Y5, 320(AX) VMOVDQU Y7, 352(AX) VMOVDQU Y9, 384(AX) VMOVDQU Y11, 416(AX) VMOVDQU Y13, 448(AX) VMOVDQU Y15, 480(AX) RET // func nttAVX2(p *[256]int16) // Requires: AVX, AVX2 TEXT ·nttAVX2(SB), NOSPLIT, $0-8 MOVQ p+0(FP), AX LEAQ ·ZetasAVX2+0(SB), CX MOVL $0x00000d01, DX VMOVD DX, X0 VPBROADCASTW X0, Y15 VPBROADCASTW (CX), Y0 VPBROADCASTW 2(CX), Y1 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VMOVDQU 64(AX), Y9 VMOVDQU 96(AX), Y10 VMOVDQU 256(AX), Y11 VMOVDQU 288(AX), Y12 VMOVDQU 320(AX), Y13 VMOVDQU 352(AX), Y14 VPMULLW Y11, Y0, Y2 VPMULLW Y12, Y0, Y3 VPMULLW Y13, Y0, Y4 VPMULLW Y14, Y0, Y5 VPMULHW Y11, Y1, Y11 VPMULHW Y12, Y1, Y12 VPMULHW Y13, Y1, Y13 VPMULHW Y14, Y1, Y14 VPMULHW Y2, Y15, Y2 VPMULHW Y3, Y15, Y3 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPSUBW Y2, Y11, Y2 VPSUBW Y3, Y12, Y3 VPSUBW Y4, Y13, Y4 VPSUBW Y5, Y14, Y5 VPSUBW Y2, Y7, Y11 VPSUBW Y3, Y8, Y12 VPSUBW Y4, Y9, Y13 VPSUBW Y5, Y10, Y14 VPADDW Y2, Y7, Y7 VPADDW Y3, Y8, Y8 VPADDW Y4, Y9, Y9 VPADDW Y5, Y10, Y10 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) VMOVDQU Y9, 64(AX) VMOVDQU Y10, 96(AX) VMOVDQU Y11, 256(AX) VMOVDQU Y12, 
288(AX) VMOVDQU Y13, 320(AX) VMOVDQU Y14, 352(AX) VMOVDQU 128(AX), Y7 VMOVDQU 160(AX), Y8 VMOVDQU 192(AX), Y9 VMOVDQU 224(AX), Y10 VMOVDQU 384(AX), Y11 VMOVDQU 416(AX), Y12 VMOVDQU 448(AX), Y13 VMOVDQU 480(AX), Y14 VPMULLW Y11, Y0, Y2 VPMULLW Y12, Y0, Y3 VPMULLW Y13, Y0, Y4 VPMULLW Y14, Y0, Y5 VPMULHW Y11, Y1, Y11 VPMULHW Y12, Y1, Y12 VPMULHW Y13, Y1, Y13 VPMULHW Y14, Y1, Y14 VPMULHW Y2, Y15, Y2 VPMULHW Y3, Y15, Y3 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPSUBW Y2, Y11, Y2 VPSUBW Y3, Y12, Y3 VPSUBW Y4, Y13, Y4 VPSUBW Y5, Y14, Y5 VPSUBW Y2, Y7, Y11 VPSUBW Y3, Y8, Y12 VPSUBW Y4, Y9, Y13 VPSUBW Y5, Y10, Y14 VPADDW Y2, Y7, Y7 VPADDW Y3, Y8, Y8 VPADDW Y4, Y9, Y9 VPADDW Y5, Y10, Y10 VMOVDQU Y7, 128(AX) VMOVDQU Y8, 160(AX) VMOVDQU Y9, 192(AX) VMOVDQU Y10, 224(AX) VMOVDQU Y11, 384(AX) VMOVDQU Y12, 416(AX) VMOVDQU Y13, 448(AX) VMOVDQU Y14, 480(AX) VPBROADCASTW 4(CX), Y0 VPBROADCASTW 6(CX), Y1 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VMOVDQU 64(AX), Y9 VMOVDQU 96(AX), Y10 VMOVDQU 128(AX), Y11 VMOVDQU 160(AX), Y12 VMOVDQU 192(AX), Y13 VMOVDQU 224(AX), Y14 VPMULLW Y11, Y0, Y2 VPMULLW Y12, Y0, Y3 VPMULLW Y13, Y0, Y4 VPMULLW Y14, Y0, Y5 VPMULHW Y11, Y1, Y11 VPMULHW Y12, Y1, Y12 VPMULHW Y13, Y1, Y13 VPMULHW Y14, Y1, Y14 VPMULHW Y2, Y15, Y2 VPMULHW Y3, Y15, Y3 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPSUBW Y2, Y11, Y2 VPSUBW Y3, Y12, Y3 VPSUBW Y4, Y13, Y4 VPSUBW Y5, Y14, Y5 VPSUBW Y2, Y7, Y11 VPSUBW Y3, Y8, Y12 VPSUBW Y4, Y9, Y13 VPSUBW Y5, Y10, Y14 VPADDW Y2, Y7, Y7 VPADDW Y3, Y8, Y8 VPADDW Y4, Y9, Y9 VPADDW Y5, Y10, Y10 VPBROADCASTW 12(CX), Y0 VPBROADCASTW 14(CX), Y1 VPBROADCASTW 16(CX), Y2 VPBROADCASTW 18(CX), Y3 VPMULLW Y9, Y0, Y4 VPMULLW Y10, Y0, Y5 VPMULLW Y13, Y2, Y6 VPMULLW Y14, Y2, Y0 VPMULHW Y9, Y1, Y9 VPMULHW Y10, Y1, Y10 VPMULHW Y13, Y3, Y13 VPMULHW Y14, Y3, Y14 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPMULHW Y6, Y15, Y6 VPMULHW Y0, Y15, Y0 VPSUBW Y4, Y9, Y4 VPSUBW Y5, Y10, Y5 VPSUBW Y6, Y13, Y6 VPSUBW Y0, Y14, Y0 VPSUBW Y4, Y7, Y9 VPSUBW Y5, Y8, Y10 VPSUBW Y6, Y11, Y13 
VPSUBW Y0, Y12, Y14 VPADDW Y4, Y7, Y7 VPADDW Y5, Y8, Y8 VPADDW Y6, Y11, Y11 VPADDW Y0, Y12, Y12 VMOVDQU 32(CX), Y0 VMOVDQU 64(CX), Y1 VMOVDQU 96(CX), Y2 VMOVDQU 128(CX), Y3 VPERM2I128 $0x20, Y9, Y7, Y4 VPERM2I128 $0x31, Y9, Y7, Y9 VMOVDQA Y4, Y7 VPERM2I128 $0x20, Y10, Y8, Y4 VPERM2I128 $0x31, Y10, Y8, Y10 VMOVDQA Y4, Y8 VPERM2I128 $0x20, Y13, Y11, Y4 VPERM2I128 $0x31, Y13, Y11, Y13 VMOVDQA Y4, Y11 VPERM2I128 $0x20, Y14, Y12, Y4 VPERM2I128 $0x31, Y14, Y12, Y14 VMOVDQA Y4, Y12 VPMULLW Y8, Y0, Y4 VPMULLW Y10, Y0, Y5 VPMULLW Y12, Y2, Y6 VPMULLW Y14, Y2, Y0 VPMULHW Y8, Y1, Y8 VPMULHW Y10, Y1, Y10 VPMULHW Y12, Y3, Y12 VPMULHW Y14, Y3, Y14 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPMULHW Y6, Y15, Y6 VPMULHW Y0, Y15, Y0 VPSUBW Y4, Y8, Y4 VPSUBW Y5, Y10, Y5 VPSUBW Y6, Y12, Y6 VPSUBW Y0, Y14, Y0 VPSUBW Y4, Y7, Y8 VPSUBW Y5, Y9, Y10 VPSUBW Y6, Y11, Y12 VPSUBW Y0, Y13, Y14 VPADDW Y4, Y7, Y7 VPADDW Y5, Y9, Y9 VPADDW Y6, Y11, Y11 VPADDW Y0, Y13, Y13 VMOVDQU 288(CX), Y0 VMOVDQU 320(CX), Y1 VMOVDQU 352(CX), Y2 VMOVDQU 384(CX), Y3 VPUNPCKLQDQ Y8, Y7, Y4 VPUNPCKHQDQ Y8, Y7, Y8 VMOVDQA Y4, Y7 VPUNPCKLQDQ Y10, Y9, Y4 VPUNPCKHQDQ Y10, Y9, Y10 VMOVDQA Y4, Y9 VPUNPCKLQDQ Y12, Y11, Y4 VPUNPCKHQDQ Y12, Y11, Y12 VMOVDQA Y4, Y11 VPUNPCKLQDQ Y14, Y13, Y4 VPUNPCKHQDQ Y14, Y13, Y14 VMOVDQA Y4, Y13 VPMULLW Y9, Y0, Y4 VPMULLW Y10, Y0, Y5 VPMULLW Y13, Y2, Y6 VPMULLW Y14, Y2, Y0 VPMULHW Y9, Y1, Y9 VPMULHW Y10, Y1, Y10 VPMULHW Y13, Y3, Y13 VPMULHW Y14, Y3, Y14 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPMULHW Y6, Y15, Y6 VPMULHW Y0, Y15, Y0 VPSUBW Y4, Y9, Y4 VPSUBW Y5, Y10, Y5 VPSUBW Y6, Y13, Y6 VPSUBW Y0, Y14, Y0 VPSUBW Y4, Y7, Y9 VPSUBW Y5, Y8, Y10 VPSUBW Y6, Y11, Y13 VPSUBW Y0, Y12, Y14 VPADDW Y4, Y7, Y7 VPADDW Y5, Y8, Y8 VPADDW Y6, Y11, Y11 VPADDW Y0, Y12, Y12 VMOVDQU 544(CX), Y0 VMOVDQU 576(CX), Y1 VMOVDQU 608(CX), Y2 VMOVDQU 640(CX), Y3 VMOVSLDUP Y9, Y4 VPBLENDD $0xaa, Y4, Y7, Y4 VPSRLQ $0x20, Y7, Y7 VPBLENDD $0xaa, Y9, Y7, Y9 VMOVDQA Y4, Y7 VMOVSLDUP Y10, Y4 VPBLENDD $0xaa, Y4, Y8, Y4 VPSRLQ 
$0x20, Y8, Y8 VPBLENDD $0xaa, Y10, Y8, Y10 VMOVDQA Y4, Y8 VMOVSLDUP Y13, Y4 VPBLENDD $0xaa, Y4, Y11, Y4 VPSRLQ $0x20, Y11, Y11 VPBLENDD $0xaa, Y13, Y11, Y13 VMOVDQA Y4, Y11 VMOVSLDUP Y14, Y4 VPBLENDD $0xaa, Y4, Y12, Y4 VPSRLQ $0x20, Y12, Y12 VPBLENDD $0xaa, Y14, Y12, Y14 VMOVDQA Y4, Y12 VPMULLW Y8, Y0, Y4 VPMULLW Y10, Y0, Y5 VPMULLW Y12, Y2, Y6 VPMULLW Y14, Y2, Y0 VPMULHW Y8, Y1, Y8 VPMULHW Y10, Y1, Y10 VPMULHW Y12, Y3, Y12 VPMULHW Y14, Y3, Y14 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPMULHW Y6, Y15, Y6 VPMULHW Y0, Y15, Y0 VPSUBW Y4, Y8, Y4 VPSUBW Y5, Y10, Y5 VPSUBW Y6, Y12, Y6 VPSUBW Y0, Y14, Y0 VPSUBW Y4, Y7, Y8 VPSUBW Y5, Y9, Y10 VPSUBW Y6, Y11, Y12 VPSUBW Y0, Y13, Y14 VPADDW Y4, Y7, Y7 VPADDW Y5, Y9, Y9 VPADDW Y6, Y11, Y11 VPADDW Y0, Y13, Y13 VMOVDQU 800(CX), Y0 VMOVDQU 832(CX), Y1 VMOVDQU 864(CX), Y2 VMOVDQU 896(CX), Y3 VPSLLD $0x10, Y8, Y4 VPBLENDW $0xaa, Y4, Y7, Y4 VPSRLD $0x10, Y7, Y7 VPBLENDW $0xaa, Y8, Y7, Y8 VMOVDQA Y4, Y7 VPSLLD $0x10, Y10, Y4 VPBLENDW $0xaa, Y4, Y9, Y4 VPSRLD $0x10, Y9, Y9 VPBLENDW $0xaa, Y10, Y9, Y10 VMOVDQA Y4, Y9 VPSLLD $0x10, Y12, Y4 VPBLENDW $0xaa, Y4, Y11, Y4 VPSRLD $0x10, Y11, Y11 VPBLENDW $0xaa, Y12, Y11, Y12 VMOVDQA Y4, Y11 VPSLLD $0x10, Y14, Y4 VPBLENDW $0xaa, Y4, Y13, Y4 VPSRLD $0x10, Y13, Y13 VPBLENDW $0xaa, Y14, Y13, Y14 VMOVDQA Y4, Y13 VPMULLW Y9, Y0, Y4 VPMULLW Y10, Y0, Y5 VPMULLW Y13, Y2, Y6 VPMULLW Y14, Y2, Y0 VPMULHW Y9, Y1, Y9 VPMULHW Y10, Y1, Y10 VPMULHW Y13, Y3, Y13 VPMULHW Y14, Y3, Y14 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPMULHW Y6, Y15, Y6 VPMULHW Y0, Y15, Y0 VPSUBW Y4, Y9, Y4 VPSUBW Y5, Y10, Y5 VPSUBW Y6, Y13, Y6 VPSUBW Y0, Y14, Y0 VPSUBW Y4, Y7, Y9 VPSUBW Y5, Y8, Y10 VPSUBW Y6, Y11, Y13 VPSUBW Y0, Y12, Y14 VPADDW Y4, Y7, Y7 VPADDW Y5, Y8, Y8 VPADDW Y6, Y11, Y11 VPADDW Y0, Y12, Y12 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) VMOVDQU Y9, 64(AX) VMOVDQU Y10, 96(AX) VMOVDQU Y11, 128(AX) VMOVDQU Y12, 160(AX) VMOVDQU Y13, 192(AX) VMOVDQU Y14, 224(AX) VPBROADCASTW 8(CX), Y0 VPBROADCASTW 10(CX), Y1 VMOVDQU 256(AX), Y7 
VMOVDQU 288(AX), Y8 VMOVDQU 320(AX), Y9 VMOVDQU 352(AX), Y10 VMOVDQU 384(AX), Y11 VMOVDQU 416(AX), Y12 VMOVDQU 448(AX), Y13 VMOVDQU 480(AX), Y14 VPMULLW Y11, Y0, Y2 VPMULLW Y12, Y0, Y3 VPMULLW Y13, Y0, Y4 VPMULLW Y14, Y0, Y5 VPMULHW Y11, Y1, Y11 VPMULHW Y12, Y1, Y12 VPMULHW Y13, Y1, Y13 VPMULHW Y14, Y1, Y14 VPMULHW Y2, Y15, Y2 VPMULHW Y3, Y15, Y3 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPSUBW Y2, Y11, Y2 VPSUBW Y3, Y12, Y3 VPSUBW Y4, Y13, Y4 VPSUBW Y5, Y14, Y5 VPSUBW Y2, Y7, Y11 VPSUBW Y3, Y8, Y12 VPSUBW Y4, Y9, Y13 VPSUBW Y5, Y10, Y14 VPADDW Y2, Y7, Y7 VPADDW Y3, Y8, Y8 VPADDW Y4, Y9, Y9 VPADDW Y5, Y10, Y10 VPBROADCASTW 20(CX), Y0 VPBROADCASTW 22(CX), Y1 VPBROADCASTW 24(CX), Y2 VPBROADCASTW 26(CX), Y3 VPMULLW Y9, Y0, Y4 VPMULLW Y10, Y0, Y5 VPMULLW Y13, Y2, Y6 VPMULLW Y14, Y2, Y0 VPMULHW Y9, Y1, Y9 VPMULHW Y10, Y1, Y10 VPMULHW Y13, Y3, Y13 VPMULHW Y14, Y3, Y14 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPMULHW Y6, Y15, Y6 VPMULHW Y0, Y15, Y0 VPSUBW Y4, Y9, Y4 VPSUBW Y5, Y10, Y5 VPSUBW Y6, Y13, Y6 VPSUBW Y0, Y14, Y0 VPSUBW Y4, Y7, Y9 VPSUBW Y5, Y8, Y10 VPSUBW Y6, Y11, Y13 VPSUBW Y0, Y12, Y14 VPADDW Y4, Y7, Y7 VPADDW Y5, Y8, Y8 VPADDW Y6, Y11, Y11 VPADDW Y0, Y12, Y12 VMOVDQU 160(CX), Y0 VMOVDQU 192(CX), Y1 VMOVDQU 224(CX), Y2 VMOVDQU 256(CX), Y3 VPERM2I128 $0x20, Y9, Y7, Y4 VPERM2I128 $0x31, Y9, Y7, Y9 VMOVDQA Y4, Y7 VPERM2I128 $0x20, Y10, Y8, Y4 VPERM2I128 $0x31, Y10, Y8, Y10 VMOVDQA Y4, Y8 VPERM2I128 $0x20, Y13, Y11, Y4 VPERM2I128 $0x31, Y13, Y11, Y13 VMOVDQA Y4, Y11 VPERM2I128 $0x20, Y14, Y12, Y4 VPERM2I128 $0x31, Y14, Y12, Y14 VMOVDQA Y4, Y12 VPMULLW Y8, Y0, Y4 VPMULLW Y10, Y0, Y5 VPMULLW Y12, Y2, Y6 VPMULLW Y14, Y2, Y0 VPMULHW Y8, Y1, Y8 VPMULHW Y10, Y1, Y10 VPMULHW Y12, Y3, Y12 VPMULHW Y14, Y3, Y14 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPMULHW Y6, Y15, Y6 VPMULHW Y0, Y15, Y0 VPSUBW Y4, Y8, Y4 VPSUBW Y5, Y10, Y5 VPSUBW Y6, Y12, Y6 VPSUBW Y0, Y14, Y0 VPSUBW Y4, Y7, Y8 VPSUBW Y5, Y9, Y10 VPSUBW Y6, Y11, Y12 VPSUBW Y0, Y13, Y14 VPADDW Y4, Y7, Y7 VPADDW Y5, Y9, Y9 
VPADDW Y6, Y11, Y11 VPADDW Y0, Y13, Y13 VMOVDQU 416(CX), Y0 VMOVDQU 448(CX), Y1 VMOVDQU 480(CX), Y2 VMOVDQU 512(CX), Y3 VPUNPCKLQDQ Y8, Y7, Y4 VPUNPCKHQDQ Y8, Y7, Y8 VMOVDQA Y4, Y7 VPUNPCKLQDQ Y10, Y9, Y4 VPUNPCKHQDQ Y10, Y9, Y10 VMOVDQA Y4, Y9 VPUNPCKLQDQ Y12, Y11, Y4 VPUNPCKHQDQ Y12, Y11, Y12 VMOVDQA Y4, Y11 VPUNPCKLQDQ Y14, Y13, Y4 VPUNPCKHQDQ Y14, Y13, Y14 VMOVDQA Y4, Y13 VPMULLW Y9, Y0, Y4 VPMULLW Y10, Y0, Y5 VPMULLW Y13, Y2, Y6 VPMULLW Y14, Y2, Y0 VPMULHW Y9, Y1, Y9 VPMULHW Y10, Y1, Y10 VPMULHW Y13, Y3, Y13 VPMULHW Y14, Y3, Y14 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPMULHW Y6, Y15, Y6 VPMULHW Y0, Y15, Y0 VPSUBW Y4, Y9, Y4 VPSUBW Y5, Y10, Y5 VPSUBW Y6, Y13, Y6 VPSUBW Y0, Y14, Y0 VPSUBW Y4, Y7, Y9 VPSUBW Y5, Y8, Y10 VPSUBW Y6, Y11, Y13 VPSUBW Y0, Y12, Y14 VPADDW Y4, Y7, Y7 VPADDW Y5, Y8, Y8 VPADDW Y6, Y11, Y11 VPADDW Y0, Y12, Y12 VMOVDQU 672(CX), Y0 VMOVDQU 704(CX), Y1 VMOVDQU 736(CX), Y2 VMOVDQU 768(CX), Y3 VMOVSLDUP Y9, Y4 VPBLENDD $0xaa, Y4, Y7, Y4 VPSRLQ $0x20, Y7, Y7 VPBLENDD $0xaa, Y9, Y7, Y9 VMOVDQA Y4, Y7 VMOVSLDUP Y10, Y4 VPBLENDD $0xaa, Y4, Y8, Y4 VPSRLQ $0x20, Y8, Y8 VPBLENDD $0xaa, Y10, Y8, Y10 VMOVDQA Y4, Y8 VMOVSLDUP Y13, Y4 VPBLENDD $0xaa, Y4, Y11, Y4 VPSRLQ $0x20, Y11, Y11 VPBLENDD $0xaa, Y13, Y11, Y13 VMOVDQA Y4, Y11 VMOVSLDUP Y14, Y4 VPBLENDD $0xaa, Y4, Y12, Y4 VPSRLQ $0x20, Y12, Y12 VPBLENDD $0xaa, Y14, Y12, Y14 VMOVDQA Y4, Y12 VPMULLW Y8, Y0, Y4 VPMULLW Y10, Y0, Y5 VPMULLW Y12, Y2, Y6 VPMULLW Y14, Y2, Y0 VPMULHW Y8, Y1, Y8 VPMULHW Y10, Y1, Y10 VPMULHW Y12, Y3, Y12 VPMULHW Y14, Y3, Y14 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPMULHW Y6, Y15, Y6 VPMULHW Y0, Y15, Y0 VPSUBW Y4, Y8, Y4 VPSUBW Y5, Y10, Y5 VPSUBW Y6, Y12, Y6 VPSUBW Y0, Y14, Y0 VPSUBW Y4, Y7, Y8 VPSUBW Y5, Y9, Y10 VPSUBW Y6, Y11, Y12 VPSUBW Y0, Y13, Y14 VPADDW Y4, Y7, Y7 VPADDW Y5, Y9, Y9 VPADDW Y6, Y11, Y11 VPADDW Y0, Y13, Y13 VMOVDQU 928(CX), Y0 VMOVDQU 960(CX), Y1 VMOVDQU 992(CX), Y2 VMOVDQU 1024(CX), Y3 VPSLLD $0x10, Y8, Y4 VPBLENDW $0xaa, Y4, Y7, Y4 VPSRLD $0x10, Y7, Y7 
VPBLENDW $0xaa, Y8, Y7, Y8 VMOVDQA Y4, Y7 VPSLLD $0x10, Y10, Y4 VPBLENDW $0xaa, Y4, Y9, Y4 VPSRLD $0x10, Y9, Y9 VPBLENDW $0xaa, Y10, Y9, Y10 VMOVDQA Y4, Y9 VPSLLD $0x10, Y12, Y4 VPBLENDW $0xaa, Y4, Y11, Y4 VPSRLD $0x10, Y11, Y11 VPBLENDW $0xaa, Y12, Y11, Y12 VMOVDQA Y4, Y11 VPSLLD $0x10, Y14, Y4 VPBLENDW $0xaa, Y4, Y13, Y4 VPSRLD $0x10, Y13, Y13 VPBLENDW $0xaa, Y14, Y13, Y14 VMOVDQA Y4, Y13 VPMULLW Y9, Y0, Y4 VPMULLW Y10, Y0, Y5 VPMULLW Y13, Y2, Y6 VPMULLW Y14, Y2, Y0 VPMULHW Y9, Y1, Y9 VPMULHW Y10, Y1, Y10 VPMULHW Y13, Y3, Y13 VPMULHW Y14, Y3, Y14 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPMULHW Y6, Y15, Y6 VPMULHW Y0, Y15, Y0 VPSUBW Y4, Y9, Y4 VPSUBW Y5, Y10, Y5 VPSUBW Y6, Y13, Y6 VPSUBW Y0, Y14, Y0 VPSUBW Y4, Y7, Y9 VPSUBW Y5, Y8, Y10 VPSUBW Y6, Y11, Y13 VPSUBW Y0, Y12, Y14 VPADDW Y4, Y7, Y7 VPADDW Y5, Y8, Y8 VPADDW Y6, Y11, Y11 VPADDW Y0, Y12, Y12 VMOVDQU Y7, 256(AX) VMOVDQU Y8, 288(AX) VMOVDQU Y9, 320(AX) VMOVDQU Y10, 352(AX) VMOVDQU Y11, 384(AX) VMOVDQU Y12, 416(AX) VMOVDQU Y13, 448(AX) VMOVDQU Y14, 480(AX) RET // func invNttAVX2(p *[256]int16) // Requires: AVX, AVX2 TEXT ·invNttAVX2(SB), NOSPLIT, $0-8 MOVQ p+0(FP), AX LEAQ ·ZetasAVX2+0(SB), CX MOVL $0x00000d01, DX VMOVD DX, X0 VPBROADCASTW X0, Y15 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VMOVDQU 64(AX), Y9 VMOVDQU 96(AX), Y10 VMOVDQU 128(AX), Y11 VMOVDQU 160(AX), Y12 VMOVDQU 192(AX), Y13 VMOVDQU 224(AX), Y14 VMOVDQU 1056(CX), Y0 VMOVDQU 1088(CX), Y1 VMOVDQU 1120(CX), Y2 VMOVDQU 1152(CX), Y3 VPSUBW Y7, Y9, Y4 VPSUBW Y8, Y10, Y5 VPSUBW Y11, Y13, Y6 VPADDW Y7, Y9, Y7 VPADDW Y8, Y10, Y8 VPADDW Y11, Y13, Y11 VPMULLW Y4, Y0, Y9 VPMULLW Y5, Y0, Y10 VPSUBW Y12, Y14, Y0 VPMULLW Y6, Y2, Y13 VPADDW Y12, Y14, Y12 VPMULLW Y0, Y2, Y14 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y6, Y3, Y6 VPMULHW Y0, Y3, Y0 VPMULHW Y9, Y15, Y9 VPMULHW Y10, Y15, Y10 VPMULHW Y13, Y15, Y13 VPMULHW Y14, Y15, Y14 VPSUBW Y9, Y4, Y9 VPSUBW Y10, Y5, Y10 VPSUBW Y13, Y6, Y13 VPSUBW Y14, Y0, Y14 VMOVDQU 1312(CX), Y0 VMOVDQU 1344(CX), Y1 VMOVDQU 
1376(CX), Y2 VMOVDQU 1408(CX), Y3 VPSLLD $0x10, Y8, Y4 VPBLENDW $0xaa, Y4, Y7, Y4 VPSRLD $0x10, Y7, Y7 VPBLENDW $0xaa, Y8, Y7, Y8 VMOVDQA Y4, Y7 VPSLLD $0x10, Y10, Y4 VPBLENDW $0xaa, Y4, Y9, Y4 VPSRLD $0x10, Y9, Y9 VPBLENDW $0xaa, Y10, Y9, Y10 VMOVDQA Y4, Y9 VPSLLD $0x10, Y12, Y4 VPBLENDW $0xaa, Y4, Y11, Y4 VPSRLD $0x10, Y11, Y11 VPBLENDW $0xaa, Y12, Y11, Y12 VMOVDQA Y4, Y11 VPSLLD $0x10, Y14, Y4 VPBLENDW $0xaa, Y4, Y13, Y4 VPSRLD $0x10, Y13, Y13 VPBLENDW $0xaa, Y14, Y13, Y14 VMOVDQA Y4, Y13 VPSUBW Y7, Y8, Y4 VPSUBW Y9, Y10, Y5 VPSUBW Y11, Y12, Y6 VPADDW Y7, Y8, Y7 VPADDW Y9, Y10, Y9 VPADDW Y11, Y12, Y11 VPMULLW Y4, Y0, Y8 VPMULLW Y5, Y0, Y10 VPSUBW Y13, Y14, Y0 VPMULLW Y6, Y2, Y12 VPADDW Y13, Y14, Y13 VPMULLW Y0, Y2, Y14 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y6, Y3, Y6 VPMULHW Y0, Y3, Y0 VPMULHW Y8, Y15, Y8 VPMULHW Y10, Y15, Y10 VPMULHW Y12, Y15, Y12 VPMULHW Y14, Y15, Y14 VPSUBW Y8, Y4, Y8 VPSUBW Y10, Y5, Y10 VPSUBW Y12, Y6, Y12 VPSUBW Y14, Y0, Y14 VMOVDQU 1568(CX), Y0 VMOVDQU 1600(CX), Y1 VMOVDQU 1632(CX), Y2 VMOVDQU 1664(CX), Y3 VMOVSLDUP Y9, Y4 VPBLENDD $0xaa, Y4, Y7, Y4 VPSRLQ $0x20, Y7, Y7 VPBLENDD $0xaa, Y9, Y7, Y9 VMOVDQA Y4, Y7 VMOVSLDUP Y10, Y4 VPBLENDD $0xaa, Y4, Y8, Y4 VPSRLQ $0x20, Y8, Y8 VPBLENDD $0xaa, Y10, Y8, Y10 VMOVDQA Y4, Y8 VMOVSLDUP Y13, Y4 VPBLENDD $0xaa, Y4, Y11, Y4 VPSRLQ $0x20, Y11, Y11 VPBLENDD $0xaa, Y13, Y11, Y13 VMOVDQA Y4, Y11 VMOVSLDUP Y14, Y4 VPBLENDD $0xaa, Y4, Y12, Y4 VPSRLQ $0x20, Y12, Y12 VPBLENDD $0xaa, Y14, Y12, Y14 VMOVDQA Y4, Y12 VPSUBW Y7, Y9, Y4 VPSUBW Y8, Y10, Y5 VPSUBW Y11, Y13, Y6 VPADDW Y7, Y9, Y7 VPADDW Y8, Y10, Y8 VPADDW Y11, Y13, Y11 VPMULLW Y4, Y0, Y9 VPMULLW Y5, Y0, Y10 VPSUBW Y12, Y14, Y0 VPMULLW Y6, Y2, Y13 VPADDW Y12, Y14, Y12 VPMULLW Y0, Y2, Y14 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y6, Y3, Y6 VPMULHW Y0, Y3, Y0 VPMULHW Y9, Y15, Y9 VPMULHW Y10, Y15, Y10 VPMULHW Y13, Y15, Y13 VPMULHW Y14, Y15, Y14 VPSUBW Y9, Y4, Y9 VPSUBW Y10, Y5, Y10 VPSUBW Y13, Y6, Y13 VPSUBW Y14, Y0, Y14 MOVL $0x00004ebf, DX 
VMOVD DX, X0 VPBROADCASTW X0, Y4 VPMULHW Y4, Y7, Y5 VPSRAW $0x0a, Y5, Y5 VPMULLW Y15, Y5, Y5 VPSUBW Y5, Y7, Y7 VPMULHW Y4, Y11, Y5 VPSRAW $0x0a, Y5, Y5 VPMULLW Y15, Y5, Y5 VPSUBW Y5, Y11, Y11 VMOVDQU 1824(CX), Y0 VMOVDQU 1856(CX), Y1 VMOVDQU 1888(CX), Y2 VMOVDQU 1920(CX), Y3 VPUNPCKLQDQ Y8, Y7, Y4 VPUNPCKHQDQ Y8, Y7, Y8 VMOVDQA Y4, Y7 VPUNPCKLQDQ Y10, Y9, Y4 VPUNPCKHQDQ Y10, Y9, Y10 VMOVDQA Y4, Y9 VPUNPCKLQDQ Y12, Y11, Y4 VPUNPCKHQDQ Y12, Y11, Y12 VMOVDQA Y4, Y11 VPUNPCKLQDQ Y14, Y13, Y4 VPUNPCKHQDQ Y14, Y13, Y14 VMOVDQA Y4, Y13 VPSUBW Y7, Y8, Y4 VPSUBW Y9, Y10, Y5 VPSUBW Y11, Y12, Y6 VPADDW Y7, Y8, Y7 VPADDW Y9, Y10, Y9 VPADDW Y11, Y12, Y11 VPMULLW Y4, Y0, Y8 VPMULLW Y5, Y0, Y10 VPSUBW Y13, Y14, Y0 VPMULLW Y6, Y2, Y12 VPADDW Y13, Y14, Y13 VPMULLW Y0, Y2, Y14 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y6, Y3, Y6 VPMULHW Y0, Y3, Y0 VPMULHW Y8, Y15, Y8 VPMULHW Y10, Y15, Y10 VPMULHW Y12, Y15, Y12 VPMULHW Y14, Y15, Y14 VPSUBW Y8, Y4, Y8 VPSUBW Y10, Y5, Y10 VPSUBW Y12, Y6, Y12 VPSUBW Y14, Y0, Y14 VPBROADCASTW 2080(CX), Y0 VPBROADCASTW 2082(CX), Y1 VPBROADCASTW 2084(CX), Y2 VPBROADCASTW 2086(CX), Y3 VPERM2I128 $0x20, Y9, Y7, Y4 VPERM2I128 $0x31, Y9, Y7, Y9 VMOVDQA Y4, Y7 VPERM2I128 $0x20, Y10, Y8, Y4 VPERM2I128 $0x31, Y10, Y8, Y10 VMOVDQA Y4, Y8 VPERM2I128 $0x20, Y13, Y11, Y4 VPERM2I128 $0x31, Y13, Y11, Y13 VMOVDQA Y4, Y11 VPERM2I128 $0x20, Y14, Y12, Y4 VPERM2I128 $0x31, Y14, Y12, Y14 VMOVDQA Y4, Y12 VPSUBW Y7, Y9, Y4 VPSUBW Y8, Y10, Y5 VPSUBW Y11, Y13, Y6 VPADDW Y7, Y9, Y7 VPADDW Y8, Y10, Y8 VPADDW Y11, Y13, Y11 VPMULLW Y4, Y0, Y9 VPMULLW Y5, Y0, Y10 VPSUBW Y12, Y14, Y0 VPMULLW Y6, Y2, Y13 VPADDW Y12, Y14, Y12 VPMULLW Y0, Y2, Y14 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y6, Y3, Y6 VPMULHW Y0, Y3, Y0 VPMULHW Y9, Y15, Y9 VPMULHW Y10, Y15, Y10 VPMULHW Y13, Y15, Y13 VPMULHW Y14, Y15, Y14 VPSUBW Y9, Y4, Y9 VPSUBW Y10, Y5, Y10 VPSUBW Y13, Y6, Y13 VPSUBW Y14, Y0, Y14 MOVL $0x00004ebf, DX VMOVD DX, X0 VPBROADCASTW X0, Y4 VPMULHW Y4, Y7, Y5 VPSRAW $0x0a, Y5, Y5 VPMULLW 
Y15, Y5, Y5 VPSUBW Y5, Y7, Y7 VPMULHW Y4, Y11, Y5 VPSRAW $0x0a, Y5, Y5 VPMULLW Y15, Y5, Y5 VPSUBW Y5, Y11, Y11 VPBROADCASTW 2096(CX), Y0 VPBROADCASTW 2098(CX), Y1 VPSUBW Y7, Y11, Y4 VPSUBW Y8, Y12, Y5 VPSUBW Y9, Y13, Y6 VPADDW Y7, Y11, Y7 VPADDW Y8, Y12, Y8 VPADDW Y9, Y13, Y9 VPMULLW Y4, Y0, Y11 VPMULLW Y5, Y0, Y12 VPSUBW Y10, Y14, Y2 VPMULLW Y6, Y0, Y13 VPADDW Y10, Y14, Y10 VPMULLW Y2, Y0, Y14 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y6, Y1, Y6 VPMULHW Y2, Y1, Y2 VPMULHW Y11, Y15, Y11 VPMULHW Y12, Y15, Y12 VPMULHW Y13, Y15, Y13 VPMULHW Y14, Y15, Y14 VPSUBW Y11, Y4, Y11 VPSUBW Y12, Y5, Y12 VPSUBW Y13, Y6, Y13 VPSUBW Y14, Y2, Y14 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) VMOVDQU Y9, 64(AX) VMOVDQU Y10, 96(AX) VMOVDQU Y11, 128(AX) VMOVDQU Y12, 160(AX) VMOVDQU Y13, 192(AX) VMOVDQU Y14, 224(AX) VMOVDQU 256(AX), Y7 VMOVDQU 288(AX), Y8 VMOVDQU 320(AX), Y9 VMOVDQU 352(AX), Y10 VMOVDQU 384(AX), Y11 VMOVDQU 416(AX), Y12 VMOVDQU 448(AX), Y13 VMOVDQU 480(AX), Y14 VMOVDQU 1184(CX), Y0 VMOVDQU 1216(CX), Y1 VMOVDQU 1248(CX), Y2 VMOVDQU 1280(CX), Y3 VPSUBW Y7, Y9, Y4 VPSUBW Y8, Y10, Y5 VPSUBW Y11, Y13, Y6 VPADDW Y7, Y9, Y7 VPADDW Y8, Y10, Y8 VPADDW Y11, Y13, Y11 VPMULLW Y4, Y0, Y9 VPMULLW Y5, Y0, Y10 VPSUBW Y12, Y14, Y0 VPMULLW Y6, Y2, Y13 VPADDW Y12, Y14, Y12 VPMULLW Y0, Y2, Y14 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y6, Y3, Y6 VPMULHW Y0, Y3, Y0 VPMULHW Y9, Y15, Y9 VPMULHW Y10, Y15, Y10 VPMULHW Y13, Y15, Y13 VPMULHW Y14, Y15, Y14 VPSUBW Y9, Y4, Y9 VPSUBW Y10, Y5, Y10 VPSUBW Y13, Y6, Y13 VPSUBW Y14, Y0, Y14 VMOVDQU 1440(CX), Y0 VMOVDQU 1472(CX), Y1 VMOVDQU 1504(CX), Y2 VMOVDQU 1536(CX), Y3 VPSLLD $0x10, Y8, Y4 VPBLENDW $0xaa, Y4, Y7, Y4 VPSRLD $0x10, Y7, Y7 VPBLENDW $0xaa, Y8, Y7, Y8 VMOVDQA Y4, Y7 VPSLLD $0x10, Y10, Y4 VPBLENDW $0xaa, Y4, Y9, Y4 VPSRLD $0x10, Y9, Y9 VPBLENDW $0xaa, Y10, Y9, Y10 VMOVDQA Y4, Y9 VPSLLD $0x10, Y12, Y4 VPBLENDW $0xaa, Y4, Y11, Y4 VPSRLD $0x10, Y11, Y11 VPBLENDW $0xaa, Y12, Y11, Y12 VMOVDQA Y4, Y11 VPSLLD $0x10, Y14, Y4 VPBLENDW $0xaa, Y4, Y13, 
Y4 VPSRLD $0x10, Y13, Y13 VPBLENDW $0xaa, Y14, Y13, Y14 VMOVDQA Y4, Y13 VPSUBW Y7, Y8, Y4 VPSUBW Y9, Y10, Y5 VPSUBW Y11, Y12, Y6 VPADDW Y7, Y8, Y7 VPADDW Y9, Y10, Y9 VPADDW Y11, Y12, Y11 VPMULLW Y4, Y0, Y8 VPMULLW Y5, Y0, Y10 VPSUBW Y13, Y14, Y0 VPMULLW Y6, Y2, Y12 VPADDW Y13, Y14, Y13 VPMULLW Y0, Y2, Y14 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y6, Y3, Y6 VPMULHW Y0, Y3, Y0 VPMULHW Y8, Y15, Y8 VPMULHW Y10, Y15, Y10 VPMULHW Y12, Y15, Y12 VPMULHW Y14, Y15, Y14 VPSUBW Y8, Y4, Y8 VPSUBW Y10, Y5, Y10 VPSUBW Y12, Y6, Y12 VPSUBW Y14, Y0, Y14 VMOVDQU 1696(CX), Y0 VMOVDQU 1728(CX), Y1 VMOVDQU 1760(CX), Y2 VMOVDQU 1792(CX), Y3 VMOVSLDUP Y9, Y4 VPBLENDD $0xaa, Y4, Y7, Y4 VPSRLQ $0x20, Y7, Y7 VPBLENDD $0xaa, Y9, Y7, Y9 VMOVDQA Y4, Y7 VMOVSLDUP Y10, Y4 VPBLENDD $0xaa, Y4, Y8, Y4 VPSRLQ $0x20, Y8, Y8 VPBLENDD $0xaa, Y10, Y8, Y10 VMOVDQA Y4, Y8 VMOVSLDUP Y13, Y4 VPBLENDD $0xaa, Y4, Y11, Y4 VPSRLQ $0x20, Y11, Y11 VPBLENDD $0xaa, Y13, Y11, Y13 VMOVDQA Y4, Y11 VMOVSLDUP Y14, Y4 VPBLENDD $0xaa, Y4, Y12, Y4 VPSRLQ $0x20, Y12, Y12 VPBLENDD $0xaa, Y14, Y12, Y14 VMOVDQA Y4, Y12 VPSUBW Y7, Y9, Y4 VPSUBW Y8, Y10, Y5 VPSUBW Y11, Y13, Y6 VPADDW Y7, Y9, Y7 VPADDW Y8, Y10, Y8 VPADDW Y11, Y13, Y11 VPMULLW Y4, Y0, Y9 VPMULLW Y5, Y0, Y10 VPSUBW Y12, Y14, Y0 VPMULLW Y6, Y2, Y13 VPADDW Y12, Y14, Y12 VPMULLW Y0, Y2, Y14 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y6, Y3, Y6 VPMULHW Y0, Y3, Y0 VPMULHW Y9, Y15, Y9 VPMULHW Y10, Y15, Y10 VPMULHW Y13, Y15, Y13 VPMULHW Y14, Y15, Y14 VPSUBW Y9, Y4, Y9 VPSUBW Y10, Y5, Y10 VPSUBW Y13, Y6, Y13 VPSUBW Y14, Y0, Y14 MOVL $0x00004ebf, DX VMOVD DX, X0 VPBROADCASTW X0, Y4 VPMULHW Y4, Y7, Y5 VPSRAW $0x0a, Y5, Y5 VPMULLW Y15, Y5, Y5 VPSUBW Y5, Y7, Y7 VPMULHW Y4, Y11, Y5 VPSRAW $0x0a, Y5, Y5 VPMULLW Y15, Y5, Y5 VPSUBW Y5, Y11, Y11 VMOVDQU 1952(CX), Y0 VMOVDQU 1984(CX), Y1 VMOVDQU 2016(CX), Y2 VMOVDQU 2048(CX), Y3 VPUNPCKLQDQ Y8, Y7, Y4 VPUNPCKHQDQ Y8, Y7, Y8 VMOVDQA Y4, Y7 VPUNPCKLQDQ Y10, Y9, Y4 VPUNPCKHQDQ Y10, Y9, Y10 VMOVDQA Y4, Y9 VPUNPCKLQDQ Y12, Y11, Y4 
VPUNPCKHQDQ Y12, Y11, Y12 VMOVDQA Y4, Y11 VPUNPCKLQDQ Y14, Y13, Y4 VPUNPCKHQDQ Y14, Y13, Y14 VMOVDQA Y4, Y13 VPSUBW Y7, Y8, Y4 VPSUBW Y9, Y10, Y5 VPSUBW Y11, Y12, Y6 VPADDW Y7, Y8, Y7 VPADDW Y9, Y10, Y9 VPADDW Y11, Y12, Y11 VPMULLW Y4, Y0, Y8 VPMULLW Y5, Y0, Y10 VPSUBW Y13, Y14, Y0 VPMULLW Y6, Y2, Y12 VPADDW Y13, Y14, Y13 VPMULLW Y0, Y2, Y14 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y6, Y3, Y6 VPMULHW Y0, Y3, Y0 VPMULHW Y8, Y15, Y8 VPMULHW Y10, Y15, Y10 VPMULHW Y12, Y15, Y12 VPMULHW Y14, Y15, Y14 VPSUBW Y8, Y4, Y8 VPSUBW Y10, Y5, Y10 VPSUBW Y12, Y6, Y12 VPSUBW Y14, Y0, Y14 VPBROADCASTW 2088(CX), Y0 VPBROADCASTW 2090(CX), Y1 VPBROADCASTW 2092(CX), Y2 VPBROADCASTW 2094(CX), Y3 VPERM2I128 $0x20, Y9, Y7, Y4 VPERM2I128 $0x31, Y9, Y7, Y9 VMOVDQA Y4, Y7 VPERM2I128 $0x20, Y10, Y8, Y4 VPERM2I128 $0x31, Y10, Y8, Y10 VMOVDQA Y4, Y8 VPERM2I128 $0x20, Y13, Y11, Y4 VPERM2I128 $0x31, Y13, Y11, Y13 VMOVDQA Y4, Y11 VPERM2I128 $0x20, Y14, Y12, Y4 VPERM2I128 $0x31, Y14, Y12, Y14 VMOVDQA Y4, Y12 VPSUBW Y7, Y9, Y4 VPSUBW Y8, Y10, Y5 VPSUBW Y11, Y13, Y6 VPADDW Y7, Y9, Y7 VPADDW Y8, Y10, Y8 VPADDW Y11, Y13, Y11 VPMULLW Y4, Y0, Y9 VPMULLW Y5, Y0, Y10 VPSUBW Y12, Y14, Y0 VPMULLW Y6, Y2, Y13 VPADDW Y12, Y14, Y12 VPMULLW Y0, Y2, Y14 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y6, Y3, Y6 VPMULHW Y0, Y3, Y0 VPMULHW Y9, Y15, Y9 VPMULHW Y10, Y15, Y10 VPMULHW Y13, Y15, Y13 VPMULHW Y14, Y15, Y14 VPSUBW Y9, Y4, Y9 VPSUBW Y10, Y5, Y10 VPSUBW Y13, Y6, Y13 VPSUBW Y14, Y0, Y14 MOVL $0x00004ebf, DX VMOVD DX, X0 VPBROADCASTW X0, Y4 VPMULHW Y4, Y7, Y5 VPSRAW $0x0a, Y5, Y5 VPMULLW Y15, Y5, Y5 VPSUBW Y5, Y7, Y7 VPMULHW Y4, Y11, Y5 VPSRAW $0x0a, Y5, Y5 VPMULLW Y15, Y5, Y5 VPSUBW Y5, Y11, Y11 VPBROADCASTW 2100(CX), Y0 VPBROADCASTW 2102(CX), Y1 VPSUBW Y7, Y11, Y4 VPSUBW Y8, Y12, Y5 VPSUBW Y9, Y13, Y6 VPADDW Y7, Y11, Y7 VPADDW Y8, Y12, Y8 VPADDW Y9, Y13, Y9 VPMULLW Y4, Y0, Y11 VPMULLW Y5, Y0, Y12 VPSUBW Y10, Y14, Y2 VPMULLW Y6, Y0, Y13 VPADDW Y10, Y14, Y10 VPMULLW Y2, Y0, Y14 VPMULHW Y4, Y1, Y4 VPMULHW 
Y5, Y1, Y5 VPMULHW Y6, Y1, Y6 VPMULHW Y2, Y1, Y2 VPMULHW Y11, Y15, Y11 VPMULHW Y12, Y15, Y12 VPMULHW Y13, Y15, Y13 VPMULHW Y14, Y15, Y14 VPSUBW Y11, Y4, Y11 VPSUBW Y12, Y5, Y12 VPSUBW Y13, Y6, Y13 VPSUBW Y14, Y2, Y14 VMOVDQU Y7, 256(AX) VMOVDQU Y8, 288(AX) VMOVDQU Y9, 320(AX) VMOVDQU Y10, 352(AX) VMOVDQU Y11, 384(AX) VMOVDQU Y12, 416(AX) VMOVDQU Y13, 448(AX) VMOVDQU Y14, 480(AX) VPBROADCASTW 2104(CX), Y0 VPBROADCASTW 2106(CX), Y1 VMOVDQU (AX), Y7 VMOVDQU 32(AX), Y8 VMOVDQU 64(AX), Y9 VMOVDQU 96(AX), Y10 VMOVDQU 256(AX), Y11 VMOVDQU 288(AX), Y12 VMOVDQU 320(AX), Y13 VMOVDQU 352(AX), Y14 VPSUBW Y7, Y11, Y2 VPSUBW Y8, Y12, Y3 VPSUBW Y9, Y13, Y4 VPADDW Y7, Y11, Y7 VPADDW Y8, Y12, Y8 VPADDW Y9, Y13, Y9 VPMULLW Y2, Y0, Y11 VPMULLW Y3, Y0, Y12 VPSUBW Y10, Y14, Y5 VPMULLW Y4, Y0, Y13 VPADDW Y10, Y14, Y10 VPMULLW Y5, Y0, Y14 VPMULHW Y2, Y1, Y2 VPMULHW Y3, Y1, Y3 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y11, Y15, Y11 VPMULHW Y12, Y15, Y12 VPMULHW Y13, Y15, Y13 VPMULHW Y14, Y15, Y14 VPSUBW Y11, Y2, Y11 VPSUBW Y12, Y3, Y12 VPSUBW Y13, Y4, Y13 VPSUBW Y14, Y5, Y14 MOVL $0xffffd8a1, DX VMOVD DX, X0 VPBROADCASTW X0, Y0 MOVL $0x000005a1, DX VMOVD DX, X1 VPBROADCASTW X1, Y1 VPMULLW Y7, Y0, Y2 VPMULLW Y8, Y0, Y3 VPMULLW Y9, Y0, Y4 VPMULLW Y10, Y0, Y5 VPMULHW Y7, Y1, Y7 VPMULHW Y8, Y1, Y8 VPMULHW Y9, Y1, Y9 VPMULHW Y10, Y1, Y10 VPMULHW Y2, Y15, Y2 VPMULHW Y3, Y15, Y3 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPSUBW Y2, Y7, Y7 VPSUBW Y3, Y8, Y8 VPSUBW Y4, Y9, Y9 VPSUBW Y5, Y10, Y10 VPMULLW Y11, Y0, Y2 VPMULLW Y12, Y0, Y3 VPMULLW Y13, Y0, Y4 VPMULLW Y14, Y0, Y5 VPMULHW Y11, Y1, Y11 VPMULHW Y12, Y1, Y12 VPMULHW Y13, Y1, Y13 VPMULHW Y14, Y1, Y14 VPMULHW Y2, Y15, Y2 VPMULHW Y3, Y15, Y3 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPSUBW Y2, Y11, Y11 VPSUBW Y3, Y12, Y12 VPSUBW Y4, Y13, Y13 VPSUBW Y5, Y14, Y14 VMOVDQU Y7, (AX) VMOVDQU Y8, 32(AX) VMOVDQU Y9, 64(AX) VMOVDQU Y10, 96(AX) VMOVDQU Y11, 256(AX) VMOVDQU Y12, 288(AX) VMOVDQU Y13, 320(AX) VMOVDQU Y14, 352(AX) VPBROADCASTW 2104(CX), Y0 
VPBROADCASTW 2106(CX), Y1 VMOVDQU 128(AX), Y7 VMOVDQU 160(AX), Y8 VMOVDQU 192(AX), Y9 VMOVDQU 224(AX), Y10 VMOVDQU 384(AX), Y11 VMOVDQU 416(AX), Y12 VMOVDQU 448(AX), Y13 VMOVDQU 480(AX), Y14 VPSUBW Y7, Y11, Y2 VPSUBW Y8, Y12, Y3 VPSUBW Y9, Y13, Y4 VPADDW Y7, Y11, Y7 VPADDW Y8, Y12, Y8 VPADDW Y9, Y13, Y9 VPMULLW Y2, Y0, Y11 VPMULLW Y3, Y0, Y12 VPSUBW Y10, Y14, Y5 VPMULLW Y4, Y0, Y13 VPADDW Y10, Y14, Y10 VPMULLW Y5, Y0, Y14 VPMULHW Y2, Y1, Y2 VPMULHW Y3, Y1, Y3 VPMULHW Y4, Y1, Y4 VPMULHW Y5, Y1, Y5 VPMULHW Y11, Y15, Y11 VPMULHW Y12, Y15, Y12 VPMULHW Y13, Y15, Y13 VPMULHW Y14, Y15, Y14 VPSUBW Y11, Y2, Y11 VPSUBW Y12, Y3, Y12 VPSUBW Y13, Y4, Y13 VPSUBW Y14, Y5, Y14 MOVL $0xffffd8a1, CX VMOVD CX, X0 VPBROADCASTW X0, Y0 MOVL $0x000005a1, CX VMOVD CX, X1 VPBROADCASTW X1, Y1 VPMULLW Y7, Y0, Y2 VPMULLW Y8, Y0, Y3 VPMULLW Y9, Y0, Y4 VPMULLW Y10, Y0, Y5 VPMULHW Y7, Y1, Y7 VPMULHW Y8, Y1, Y8 VPMULHW Y9, Y1, Y9 VPMULHW Y10, Y1, Y10 VPMULHW Y2, Y15, Y2 VPMULHW Y3, Y15, Y3 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPSUBW Y2, Y7, Y7 VPSUBW Y3, Y8, Y8 VPSUBW Y4, Y9, Y9 VPSUBW Y5, Y10, Y10 VPMULLW Y11, Y0, Y2 VPMULLW Y12, Y0, Y3 VPMULLW Y13, Y0, Y4 VPMULLW Y14, Y0, Y5 VPMULHW Y11, Y1, Y11 VPMULHW Y12, Y1, Y12 VPMULHW Y13, Y1, Y13 VPMULHW Y14, Y1, Y14 VPMULHW Y2, Y15, Y2 VPMULHW Y3, Y15, Y3 VPMULHW Y4, Y15, Y4 VPMULHW Y5, Y15, Y5 VPSUBW Y2, Y11, Y11 VPSUBW Y3, Y12, Y12 VPSUBW Y4, Y13, Y13 VPSUBW Y5, Y14, Y14 VMOVDQU Y7, 128(AX) VMOVDQU Y8, 160(AX) VMOVDQU Y9, 192(AX) VMOVDQU Y10, 224(AX) VMOVDQU Y11, 384(AX) VMOVDQU Y12, 416(AX) VMOVDQU Y13, 448(AX) VMOVDQU Y14, 480(AX) RET // func mulHatAVX2(p *[256]int16, a *[256]int16, b *[256]int16) // Requires: AVX, AVX2 TEXT ·mulHatAVX2(SB), NOSPLIT, $8-24 MOVQ p+0(FP), AX MOVQ a+8(FP), CX MOVQ b+16(FP), DX LEAQ ·ZetasAVX2+0(SB), BX MOVL $0xfffff301, SI VMOVD SI, X0 VPBROADCASTW X0, Y14 MOVL $0x00000d01, SI VMOVD SI, X0 VPBROADCASTW X0, Y15 VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 VMOVDQU (DX), Y4 VMOVDQU 32(DX), 
Y5 VMOVDQU 64(DX), Y6 VMOVDQU 96(DX), Y7 VPMULLW Y1, Y5, Y8 VPMULLW Y0, Y4, Y9 VPMULLW Y0, Y5, Y10 VPMULLW Y1, Y4, Y11 VPMULLW Y8, Y14, Y8 VPMULLW Y9, Y14, Y9 VPMULLW Y10, Y14, Y10 VPMULLW Y11, Y14, Y11 VPMULHW Y1, Y5, Y12 VPMULHW Y0, Y4, Y13 VPMULHW Y0, Y5, Y0 VPMULHW Y1, Y4, Y1 VMOVDQA Y12, Y4 VMOVDQA Y13, Y5 VPMULHW Y8, Y15, Y8 VPMULHW Y9, Y15, Y9 VPMULHW Y10, Y15, Y10 VPMULHW Y11, Y15, Y11 VPSUBW Y8, Y4, Y4 VPSUBW Y9, Y5, Y5 VPSUBW Y10, Y0, Y0 VPSUBW Y11, Y1, Y1 VMOVDQU 800(BX), Y12 VMOVDQU 832(BX), Y13 VPMULLW Y4, Y12, Y8 VPMULHW Y4, Y13, Y4 VPMULHW Y8, Y15, Y8 VPSUBW Y8, Y4, Y4 VPADDW Y4, Y5, Y4 VPADDW Y0, Y1, Y5 VPMULLW Y3, Y7, Y8 VPMULLW Y2, Y6, Y9 VPMULLW Y2, Y7, Y10 VPMULLW Y3, Y6, Y11 VPMULLW Y8, Y14, Y8 VPMULLW Y9, Y14, Y9 VPMULLW Y10, Y14, Y10 VPMULLW Y11, Y14, Y11 VPMULHW Y3, Y7, Y12 VPMULHW Y2, Y6, Y13 VPMULHW Y2, Y7, Y2 VPMULHW Y3, Y6, Y3 VMOVDQA Y12, Y6 VMOVDQA Y13, Y7 VPMULHW Y8, Y15, Y8 VPMULHW Y9, Y15, Y9 VPMULHW Y10, Y15, Y10 VPMULHW Y11, Y15, Y11 VPSUBW Y8, Y6, Y6 VPSUBW Y9, Y7, Y7 VPSUBW Y10, Y2, Y2 VPSUBW Y11, Y3, Y3 VMOVDQU 800(BX), Y12 VMOVDQU 832(BX), Y13 VPMULLW Y6, Y12, Y8 VPMULHW Y6, Y13, Y6 VPMULHW Y8, Y15, Y8 VPSUBW Y8, Y6, Y6 VPSUBW Y6, Y7, Y6 VPADDW Y2, Y3, Y7 VMOVDQU Y4, (AX) VMOVDQU Y5, 32(AX) VMOVDQU Y6, 64(AX) VMOVDQU Y7, 96(AX) VMOVDQU 128(CX), Y0 VMOVDQU 160(CX), Y1 VMOVDQU 192(CX), Y2 VMOVDQU 224(CX), Y3 VMOVDQU 128(DX), Y4 VMOVDQU 160(DX), Y5 VMOVDQU 192(DX), Y6 VMOVDQU 224(DX), Y7 VPMULLW Y1, Y5, Y8 VPMULLW Y0, Y4, Y9 VPMULLW Y0, Y5, Y10 VPMULLW Y1, Y4, Y11 VPMULLW Y8, Y14, Y8 VPMULLW Y9, Y14, Y9 VPMULLW Y10, Y14, Y10 VPMULLW Y11, Y14, Y11 VPMULHW Y1, Y5, Y12 VPMULHW Y0, Y4, Y13 VPMULHW Y0, Y5, Y0 VPMULHW Y1, Y4, Y1 VMOVDQA Y12, Y4 VMOVDQA Y13, Y5 VPMULHW Y8, Y15, Y8 VPMULHW Y9, Y15, Y9 VPMULHW Y10, Y15, Y10 VPMULHW Y11, Y15, Y11 VPSUBW Y8, Y4, Y4 VPSUBW Y9, Y5, Y5 VPSUBW Y10, Y0, Y0 VPSUBW Y11, Y1, Y1 VMOVDQU 864(BX), Y12 VMOVDQU 896(BX), Y13 VPMULLW Y4, Y12, Y8 VPMULHW Y4, Y13, Y4 VPMULHW Y8, Y15, Y8 VPSUBW Y8, Y4, Y4 
VPADDW Y4, Y5, Y4 VPADDW Y0, Y1, Y5 VPMULLW Y3, Y7, Y8 VPMULLW Y2, Y6, Y9 VPMULLW Y2, Y7, Y10 VPMULLW Y3, Y6, Y11 VPMULLW Y8, Y14, Y8 VPMULLW Y9, Y14, Y9 VPMULLW Y10, Y14, Y10 VPMULLW Y11, Y14, Y11 VPMULHW Y3, Y7, Y12 VPMULHW Y2, Y6, Y13 VPMULHW Y2, Y7, Y2 VPMULHW Y3, Y6, Y3 VMOVDQA Y12, Y6 VMOVDQA Y13, Y7 VPMULHW Y8, Y15, Y8 VPMULHW Y9, Y15, Y9 VPMULHW Y10, Y15, Y10 VPMULHW Y11, Y15, Y11 VPSUBW Y8, Y6, Y6 VPSUBW Y9, Y7, Y7 VPSUBW Y10, Y2, Y2 VPSUBW Y11, Y3, Y3 VMOVDQU 864(BX), Y12 VMOVDQU 896(BX), Y13 VPMULLW Y6, Y12, Y8 VPMULHW Y6, Y13, Y6 VPMULHW Y8, Y15, Y8 VPSUBW Y8, Y6, Y6 VPSUBW Y6, Y7, Y6 VPADDW Y2, Y3, Y7 VMOVDQU Y4, 128(AX) VMOVDQU Y5, 160(AX) VMOVDQU Y6, 192(AX) VMOVDQU Y7, 224(AX) VMOVDQU 256(CX), Y0 VMOVDQU 288(CX), Y1 VMOVDQU 320(CX), Y2 VMOVDQU 352(CX), Y3 VMOVDQU 256(DX), Y4 VMOVDQU 288(DX), Y5 VMOVDQU 320(DX), Y6 VMOVDQU 352(DX), Y7 VPMULLW Y1, Y5, Y8 VPMULLW Y0, Y4, Y9 VPMULLW Y0, Y5, Y10 VPMULLW Y1, Y4, Y11 VPMULLW Y8, Y14, Y8 VPMULLW Y9, Y14, Y9 VPMULLW Y10, Y14, Y10 VPMULLW Y11, Y14, Y11 VPMULHW Y1, Y5, Y12 VPMULHW Y0, Y4, Y13 VPMULHW Y0, Y5, Y0 VPMULHW Y1, Y4, Y1 VMOVDQA Y12, Y4 VMOVDQA Y13, Y5 VPMULHW Y8, Y15, Y8 VPMULHW Y9, Y15, Y9 VPMULHW Y10, Y15, Y10 VPMULHW Y11, Y15, Y11 VPSUBW Y8, Y4, Y4 VPSUBW Y9, Y5, Y5 VPSUBW Y10, Y0, Y0 VPSUBW Y11, Y1, Y1 VMOVDQU 928(BX), Y12 VMOVDQU 960(BX), Y13 VPMULLW Y4, Y12, Y8 VPMULHW Y4, Y13, Y4 VPMULHW Y8, Y15, Y8 VPSUBW Y8, Y4, Y4 VPADDW Y4, Y5, Y4 VPADDW Y0, Y1, Y5 VPMULLW Y3, Y7, Y8 VPMULLW Y2, Y6, Y9 VPMULLW Y2, Y7, Y10 VPMULLW Y3, Y6, Y11 VPMULLW Y8, Y14, Y8 VPMULLW Y9, Y14, Y9 VPMULLW Y10, Y14, Y10 VPMULLW Y11, Y14, Y11 VPMULHW Y3, Y7, Y12 VPMULHW Y2, Y6, Y13 VPMULHW Y2, Y7, Y2 VPMULHW Y3, Y6, Y3 VMOVDQA Y12, Y6 VMOVDQA Y13, Y7 VPMULHW Y8, Y15, Y8 VPMULHW Y9, Y15, Y9 VPMULHW Y10, Y15, Y10 VPMULHW Y11, Y15, Y11 VPSUBW Y8, Y6, Y6 VPSUBW Y9, Y7, Y7 VPSUBW Y10, Y2, Y2 VPSUBW Y11, Y3, Y3 VMOVDQU 928(BX), Y12 VMOVDQU 960(BX), Y13 VPMULLW Y6, Y12, Y8 VPMULHW Y6, Y13, Y6 VPMULHW Y8, Y15, Y8 VPSUBW Y8, Y6, Y6 
// Remaining instructions of the function that begins before this point (unchanged).
	VPSUBW  Y6, Y7, Y6
	VPADDW  Y2, Y3, Y7
	VMOVDQU Y4, 256(AX)
	VMOVDQU Y5, 288(AX)
	VMOVDQU Y6, 320(AX)
	VMOVDQU Y7, 352(AX)
	VMOVDQU 384(CX), Y0
	VMOVDQU 416(CX), Y1
	VMOVDQU 448(CX), Y2
	VMOVDQU 480(CX), Y3
	VMOVDQU 384(DX), Y4
	VMOVDQU 416(DX), Y5
	VMOVDQU 448(DX), Y6
	VMOVDQU 480(DX), Y7
	VPMULLW Y1, Y5, Y8
	VPMULLW Y0, Y4, Y9
	VPMULLW Y0, Y5, Y10
	VPMULLW Y1, Y4, Y11
	VPMULLW Y8, Y14, Y8
	VPMULLW Y9, Y14, Y9
	VPMULLW Y10, Y14, Y10
	VPMULLW Y11, Y14, Y11
	VPMULHW Y1, Y5, Y12
	VPMULHW Y0, Y4, Y13
	VPMULHW Y0, Y5, Y0
	VPMULHW Y1, Y4, Y1
	VMOVDQA Y12, Y4
	VMOVDQA Y13, Y5
	VPMULHW Y8, Y15, Y8
	VPMULHW Y9, Y15, Y9
	VPMULHW Y10, Y15, Y10
	VPMULHW Y11, Y15, Y11
	VPSUBW  Y8, Y4, Y4
	VPSUBW  Y9, Y5, Y5
	VPSUBW  Y10, Y0, Y0
	VPSUBW  Y11, Y1, Y1
	VMOVDQU 992(BX), Y12
	VMOVDQU 1024(BX), Y13
	VPMULLW Y4, Y12, Y8
	VPMULHW Y4, Y13, Y4
	VPMULHW Y8, Y15, Y8
	VPSUBW  Y8, Y4, Y4
	VPADDW  Y4, Y5, Y4
	VPADDW  Y0, Y1, Y5
	VPMULLW Y3, Y7, Y8
	VPMULLW Y2, Y6, Y9
	VPMULLW Y2, Y7, Y10
	VPMULLW Y3, Y6, Y11
	VPMULLW Y8, Y14, Y8
	VPMULLW Y9, Y14, Y9
	VPMULLW Y10, Y14, Y10
	VPMULLW Y11, Y14, Y11
	VPMULHW Y3, Y7, Y12
	VPMULHW Y2, Y6, Y13
	VPMULHW Y2, Y7, Y2
	VPMULHW Y3, Y6, Y3
	VMOVDQA Y12, Y6
	VMOVDQA Y13, Y7
	VPMULHW Y8, Y15, Y8
	VPMULHW Y9, Y15, Y9
	VPMULHW Y10, Y15, Y10
	VPMULHW Y11, Y15, Y11
	VPSUBW  Y8, Y6, Y6
	VPSUBW  Y9, Y7, Y7
	VPSUBW  Y10, Y2, Y2
	VPSUBW  Y11, Y3, Y3
	VMOVDQU 992(BX), Y12
	VMOVDQU 1024(BX), Y13
	VPMULLW Y6, Y12, Y8
	VPMULHW Y6, Y13, Y6
	VPMULHW Y8, Y15, Y8
	VPSUBW  Y8, Y6, Y6
	VPSUBW  Y6, Y7, Y6
	VPADDW  Y2, Y3, Y7
	VMOVDQU Y4, 384(AX)
	VMOVDQU Y5, 416(AX)
	VMOVDQU Y6, 448(AX)
	VMOVDQU Y7, 480(AX)
	RET

// NOTE(review): the file header marks this file as generated ("go run src.go").
// The four functions below were hand-restructured with macros and gained a
// VZEROUPPER before RET; mirror both changes in the generator, otherwise they
// are lost on the next regeneration.

// ---------------------------------------------------------------------------
// Helpers for detangleAVX2 / tangleAVX2.
//
// All helpers address memory through AX, work on Y0..Y7 and clobber Y8.
//
// POLY_LOAD8(off) / POLY_STORE8(off) move the 256 bytes at off(AX) to or from
// Y0..Y7 (Yi <-> 32 bytes at off+32*i).
//
// POLY_XPOSEn(ra, rb) exchanges n-bit elements between the register pair:
//   n = 16:  ra' = [ra0, rb0, ra2, rb2, ...]   rb' = [ra1, rb1, ra3, rb3, ...]
//   n = 32:  same pattern on 32-bit elements
//   n = 64:  same pattern on 64-bit elements (within each 128-bit lane)
//   n = 128: ra' = [ra.low128, rb.low128]      rb' = [ra.high128, rb.high128]
// Applying the same POLY_XPOSEn twice to the same pair restores both registers.
// ---------------------------------------------------------------------------

#define POLY_LOAD8(off) \
	VMOVDQU (off+0)(AX), Y0; \
	VMOVDQU (off+32)(AX), Y1; \
	VMOVDQU (off+64)(AX), Y2; \
	VMOVDQU (off+96)(AX), Y3; \
	VMOVDQU (off+128)(AX), Y4; \
	VMOVDQU (off+160)(AX), Y5; \
	VMOVDQU (off+192)(AX), Y6; \
	VMOVDQU (off+224)(AX), Y7

#define POLY_STORE8(off) \
	VMOVDQU Y0, (off+0)(AX); \
	VMOVDQU Y1, (off+32)(AX); \
	VMOVDQU Y2, (off+64)(AX); \
	VMOVDQU Y3, (off+96)(AX); \
	VMOVDQU Y4, (off+128)(AX); \
	VMOVDQU Y5, (off+160)(AX); \
	VMOVDQU Y6, (off+192)(AX); \
	VMOVDQU Y7, (off+224)(AX)

#define POLY_XPOSE16(ra, rb) \
	VPSLLD   $0x10, rb, Y8; \
	VPBLENDW $0xaa, Y8, ra, Y8; \
	VPSRLD   $0x10, ra, ra; \
	VPBLENDW $0xaa, rb, ra, rb; \
	VMOVDQA  Y8, ra

#define POLY_XPOSE32(ra, rb) \
	VMOVSLDUP rb, Y8; \
	VPBLENDD  $0xaa, Y8, ra, Y8; \
	VPSRLQ    $0x20, ra, ra; \
	VPBLENDD  $0xaa, rb, ra, rb; \
	VMOVDQA   Y8, ra

#define POLY_XPOSE64(ra, rb) \
	VPUNPCKLQDQ rb, ra, Y8; \
	VPUNPCKHQDQ rb, ra, rb; \
	VMOVDQA     Y8, ra

#define POLY_XPOSE128(ra, rb) \
	VPERM2I128 $0x20, rb, ra, Y8; \
	VPERM2I128 $0x31, rb, ra, rb; \
	VMOVDQA    Y8, ra

// One 128-coefficient half of detangleAVX2: exchange at 16, 32, 64 and then
// 128 bits.
#define POLY_DETANGLE_HALF(off) \
	POLY_LOAD8(off); \
	POLY_XPOSE16(Y0, Y1); POLY_XPOSE16(Y2, Y3); POLY_XPOSE16(Y4, Y5); POLY_XPOSE16(Y6, Y7); \
	POLY_XPOSE32(Y0, Y2); POLY_XPOSE32(Y1, Y3); POLY_XPOSE32(Y4, Y6); POLY_XPOSE32(Y5, Y7); \
	POLY_XPOSE64(Y0, Y1); POLY_XPOSE64(Y2, Y3); POLY_XPOSE64(Y4, Y5); POLY_XPOSE64(Y6, Y7); \
	POLY_XPOSE128(Y0, Y2); POLY_XPOSE128(Y1, Y3); POLY_XPOSE128(Y4, Y6); POLY_XPOSE128(Y5, Y7); \
	POLY_STORE8(off)

// One 128-coefficient half of tangleAVX2: the same exchanges as
// POLY_DETANGLE_HALF in the opposite order (128, 64, 32, 16 bits), which
// undoes it.
#define POLY_TANGLE_HALF(off) \
	POLY_LOAD8(off); \
	POLY_XPOSE128(Y0, Y2); POLY_XPOSE128(Y1, Y3); POLY_XPOSE128(Y4, Y6); POLY_XPOSE128(Y5, Y7); \
	POLY_XPOSE64(Y0, Y1); POLY_XPOSE64(Y2, Y3); POLY_XPOSE64(Y4, Y5); POLY_XPOSE64(Y6, Y7); \
	POLY_XPOSE32(Y0, Y2); POLY_XPOSE32(Y1, Y3); POLY_XPOSE32(Y4, Y6); POLY_XPOSE32(Y5, Y7); \
	POLY_XPOSE16(Y0, Y1); POLY_XPOSE16(Y2, Y3); POLY_XPOSE16(Y4, Y5); POLY_XPOSE16(Y6, Y7); \
	POLY_STORE8(off)

// func detangleAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// Permutes the 256 coefficients of p in place. The two 128-coefficient halves
// (byte offsets 0 and 256) are processed independently with the same
// permutation. tangleAVX2 applies the inverse permutation.
// Clobbers AX and Y0..Y8; the upper YMM halves are cleared before returning.
TEXT ·detangleAVX2(SB), NOSPLIT, $0-8
	MOVQ p+0(FP), AX
	POLY_DETANGLE_HALF(0)
	POLY_DETANGLE_HALF(256)

	// Leave no dirty upper YMM state behind for SSE code in the caller.
	VZEROUPPER
	RET

// func tangleAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// Permutes the 256 coefficients of p in place; inverse of detangleAVX2.
// The two 128-coefficient halves (byte offsets 0 and 256) are processed
// independently.
// Clobbers AX and Y0..Y8; the upper YMM halves are cleared before returning.
TEXT ·tangleAVX2(SB), NOSPLIT, $0-8
	MOVQ p+0(FP), AX
	POLY_TANGLE_HALF(0)
	POLY_TANGLE_HALF(256)

	// Leave no dirty upper YMM state behind for SSE code in the caller.
	VZEROUPPER
	RET

// ---------------------------------------------------------------------------
// Helpers for barrettReduceAVX2 / normalizeAVX2.
//
// Register roles: AX = p, Y9 = 0x0d01 (3329) in every 16-bit lane,
// Y8 = 0x4ebf (20159 = ceil(2^26 / 3329)) in every 16-bit lane,
// Y0..Y3 = 64 coefficients being processed, Y4..Y7 = scratch.
// ---------------------------------------------------------------------------

// Broadcast the two constants; clobbers CX and X0.
#define POLY_BROADCAST_CONSTS() \
	MOVL         $0x00000d01, CX; \
	VMOVD        CX, X0; \
	VPBROADCASTW X0, Y9; \
	MOVL         $0x00004ebf, CX; \
	VMOVD        CX, X0; \
	VPBROADCASTW X0, Y8

#define POLY_LOAD4(off) \
	VMOVDQU (off+0)(AX), Y0; \
	VMOVDQU (off+32)(AX), Y1; \
	VMOVDQU (off+64)(AX), Y2; \
	VMOVDQU (off+96)(AX), Y3

#define POLY_STORE4(off) \
	VMOVDQU Y0, (off+0)(AX); \
	VMOVDQU Y1, (off+32)(AX); \
	VMOVDQU Y2, (off+64)(AX); \
	VMOVDQU Y3, (off+96)(AX)

// For every signed 16-bit lane x of Y0..Y3:
//   x = x - 3329 * ((x * 20159) >> 26)
// (VPMULHW yields the product's high 16 bits, VPSRAW shifts 10 further).
#define POLY_BARRETT4() \
	VPMULHW Y8, Y0, Y4; \
	VPMULHW Y8, Y1, Y5; \
	VPMULHW Y8, Y2, Y6; \
	VPMULHW Y8, Y3, Y7; \
	VPSRAW  $0x0a, Y4, Y4; \
	VPSRAW  $0x0a, Y5, Y5; \
	VPSRAW  $0x0a, Y6, Y6; \
	VPSRAW  $0x0a, Y7, Y7; \
	VPMULLW Y9, Y4, Y4; \
	VPMULLW Y9, Y5, Y5; \
	VPMULLW Y9, Y6, Y6; \
	VPMULLW Y9, Y7, Y7; \
	VPSUBW  Y4, Y0, Y0; \
	VPSUBW  Y5, Y1, Y1; \
	VPSUBW  Y6, Y2, Y2; \
	VPSUBW  Y7, Y3, Y3

// For every signed 16-bit lane x of Y0..Y3, branch-free:
//   x = x - 3329; if x < 0 { x = x + 3329 }
// The sign mask (x >> 15) selects whether 3329 is added back.
#define POLY_CSUBQ4() \
	VPSUBW Y9, Y0, Y0; \
	VPSUBW Y9, Y1, Y1; \
	VPSUBW Y9, Y2, Y2; \
	VPSUBW Y9, Y3, Y3; \
	VPSRAW $0x0f, Y0, Y4; \
	VPSRAW $0x0f, Y1, Y5; \
	VPSRAW $0x0f, Y2, Y6; \
	VPSRAW $0x0f, Y3, Y7; \
	VPAND  Y4, Y9, Y4; \
	VPAND  Y5, Y9, Y5; \
	VPAND  Y6, Y9, Y6; \
	VPAND  Y7, Y9, Y7; \
	VPADDW Y0, Y4, Y0; \
	VPADDW Y1, Y5, Y1; \
	VPADDW Y2, Y6, Y2; \
	VPADDW Y3, Y7, Y3

#define POLY_BARRETT_CHUNK(off) \
	POLY_LOAD4(off); \
	POLY_BARRETT4(); \
	POLY_STORE4(off)

#define POLY_NORMALIZE_CHUNK(off) \
	POLY_LOAD4(off); \
	POLY_BARRETT4(); \
	POLY_CSUBQ4(); \
	POLY_STORE4(off)

// func barrettReduceAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// Replaces every coefficient x of p, in place, with
//   x - 3329 * ((x * 20159) >> 26)
// using signed 16-bit lane arithmetic, 64 coefficients (128 bytes) per step.
// Clobbers AX, CX and Y0..Y9; the upper YMM halves are cleared before
// returning.
TEXT ·barrettReduceAVX2(SB), NOSPLIT, $0-8
	MOVQ p+0(FP), AX
	POLY_BROADCAST_CONSTS()
	POLY_BARRETT_CHUNK(0)
	POLY_BARRETT_CHUNK(128)
	POLY_BARRETT_CHUNK(256)
	POLY_BARRETT_CHUNK(384)

	// Leave no dirty upper YMM state behind for SSE code in the caller.
	VZEROUPPER
	RET

// func normalizeAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// Applies the same per-coefficient step as barrettReduceAVX2, then
// conditionally subtracts 3329: the result r becomes r - 3329 unless that
// would be negative, in which case r is kept. Operates in place, 64
// coefficients (128 bytes) per step.
// Clobbers AX, CX and Y0..Y9; the upper YMM halves are cleared before
// returning.
TEXT ·normalizeAVX2(SB), NOSPLIT, $0-8
	MOVQ p+0(FP), AX
	POLY_BROADCAST_CONSTS()
	POLY_NORMALIZE_CHUNK(0)
	POLY_NORMALIZE_CHUNK(128)
	POLY_NORMALIZE_CHUNK(256)
	POLY_NORMALIZE_CHUNK(384)

	// Leave no dirty upper YMM state behind for SSE code in the caller.
	VZEROUPPER
	RET