...

Text file src/github.com/cloudflare/circl/pke/kyber/internal/common/amd64.s

Documentation: github.com/cloudflare/circl/pke/kyber/internal/common

     1// Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg common. DO NOT EDIT.
     2
     3//go:build amd64
     4
     5#include "textflag.h"
     6
     // func addAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
     // Requires: AVX, AVX2
     //
     // addAVX2 stores a[i] + b[i] into p[i] for all 256 int16 coefficients
     // (512 bytes). VPADDW is a wrapping (non-saturating) 16-bit add, and no
     // modular reduction is performed here.
     //
     // The arrays are processed as two 256-byte halves. Within each half, all
     // loads from a and b are issued before the first store to p, so p may be
     // the same array as a or b.
     //
     // Register use: AX = p, CX = a, DX = b. Even-numbered Y registers hold
     // vectors of a; odd-numbered ones hold vectors of b and then the sums.
     //
     // Frame: NOSPLIT, $0-24 (no locals, three pointer arguments).
     //
     // NOTE: the file banner says this file is generated by src.go. The
     // VZEROUPPER before RET must be mirrored in the generator, otherwise it is
     // lost on regeneration.
TEXT ·addAVX2(SB), NOSPLIT, $0-24
	MOVQ       p+0(FP), AX
	MOVQ       a+8(FP), CX
	MOVQ       b+16(FP), DX

	// First half: bytes 0..255 (coefficients 0..127).
	VMOVDQU    (CX), Y0
	VMOVDQU    32(CX), Y2
	VMOVDQU    64(CX), Y4
	VMOVDQU    96(CX), Y6
	VMOVDQU    128(CX), Y8
	VMOVDQU    160(CX), Y10
	VMOVDQU    192(CX), Y12
	VMOVDQU    224(CX), Y14
	VMOVDQU    (DX), Y1
	VMOVDQU    32(DX), Y3
	VMOVDQU    64(DX), Y5
	VMOVDQU    96(DX), Y7
	VMOVDQU    128(DX), Y9
	VMOVDQU    160(DX), Y11
	VMOVDQU    192(DX), Y13
	VMOVDQU    224(DX), Y15
	VPADDW     Y0, Y1, Y1
	VPADDW     Y2, Y3, Y3
	VPADDW     Y4, Y5, Y5
	VPADDW     Y6, Y7, Y7
	VPADDW     Y8, Y9, Y9
	VPADDW     Y10, Y11, Y11
	VPADDW     Y12, Y13, Y13
	VPADDW     Y14, Y15, Y15
	VMOVDQU    Y1, (AX)
	VMOVDQU    Y3, 32(AX)
	VMOVDQU    Y5, 64(AX)
	VMOVDQU    Y7, 96(AX)
	VMOVDQU    Y9, 128(AX)
	VMOVDQU    Y11, 160(AX)
	VMOVDQU    Y13, 192(AX)
	VMOVDQU    Y15, 224(AX)

	// Second half: bytes 256..511 (coefficients 128..255).
	VMOVDQU    256(CX), Y0
	VMOVDQU    288(CX), Y2
	VMOVDQU    320(CX), Y4
	VMOVDQU    352(CX), Y6
	VMOVDQU    384(CX), Y8
	VMOVDQU    416(CX), Y10
	VMOVDQU    448(CX), Y12
	VMOVDQU    480(CX), Y14
	VMOVDQU    256(DX), Y1
	VMOVDQU    288(DX), Y3
	VMOVDQU    320(DX), Y5
	VMOVDQU    352(DX), Y7
	VMOVDQU    384(DX), Y9
	VMOVDQU    416(DX), Y11
	VMOVDQU    448(DX), Y13
	VMOVDQU    480(DX), Y15
	VPADDW     Y0, Y1, Y1
	VPADDW     Y2, Y3, Y3
	VPADDW     Y4, Y5, Y5
	VPADDW     Y6, Y7, Y7
	VPADDW     Y8, Y9, Y9
	VPADDW     Y10, Y11, Y11
	VPADDW     Y12, Y13, Y13
	VPADDW     Y14, Y15, Y15
	VMOVDQU    Y1, 256(AX)
	VMOVDQU    Y3, 288(AX)
	VMOVDQU    Y5, 320(AX)
	VMOVDQU    Y7, 352(AX)
	VMOVDQU    Y9, 384(AX)
	VMOVDQU    Y11, 416(AX)
	VMOVDQU    Y13, 448(AX)
	VMOVDQU    Y15, 480(AX)

	// Clear the upper 128 bits of every YMM register so that non-VEX SSE
	// instructions executed after return do not incur AVX/SSE transition
	// penalties. Results are already in memory; nothing live is lost.
	VZEROUPPER
	RET
    78
    // func subAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
    // Requires: AVX, AVX2
    //
    // subAVX2 stores a[i] - b[i] into p[i] for all 256 int16 coefficients
    // (512 bytes). VPSUBW is a wrapping (non-saturating) 16-bit subtract, and
    // no modular reduction is performed here.
    //
    // Operand order: in Go assembler syntax "VPSUBW Y1, Y0, Y1" computes
    // Y1 = Y0 - Y1, i.e. (vector of a) - (vector of b).
    //
    // The arrays are processed as two 256-byte halves. Within each half, all
    // loads from a and b are issued before the first store to p, so p may be
    // the same array as a or b.
    //
    // Register use: AX = p, CX = a, DX = b. Even-numbered Y registers hold
    // vectors of a; odd-numbered ones hold vectors of b and then the
    // differences.
    //
    // Frame: NOSPLIT, $0-24 (no locals, three pointer arguments).
    //
    // NOTE: the file banner says this file is generated by src.go. The
    // VZEROUPPER before RET must be mirrored in the generator, otherwise it is
    // lost on regeneration.
TEXT ·subAVX2(SB), NOSPLIT, $0-24
	MOVQ       p+0(FP), AX
	MOVQ       a+8(FP), CX
	MOVQ       b+16(FP), DX

	// First half: bytes 0..255 (coefficients 0..127).
	VMOVDQU    (CX), Y0
	VMOVDQU    32(CX), Y2
	VMOVDQU    64(CX), Y4
	VMOVDQU    96(CX), Y6
	VMOVDQU    128(CX), Y8
	VMOVDQU    160(CX), Y10
	VMOVDQU    192(CX), Y12
	VMOVDQU    224(CX), Y14
	VMOVDQU    (DX), Y1
	VMOVDQU    32(DX), Y3
	VMOVDQU    64(DX), Y5
	VMOVDQU    96(DX), Y7
	VMOVDQU    128(DX), Y9
	VMOVDQU    160(DX), Y11
	VMOVDQU    192(DX), Y13
	VMOVDQU    224(DX), Y15
	VPSUBW     Y1, Y0, Y1
	VPSUBW     Y3, Y2, Y3
	VPSUBW     Y5, Y4, Y5
	VPSUBW     Y7, Y6, Y7
	VPSUBW     Y9, Y8, Y9
	VPSUBW     Y11, Y10, Y11
	VPSUBW     Y13, Y12, Y13
	VPSUBW     Y15, Y14, Y15
	VMOVDQU    Y1, (AX)
	VMOVDQU    Y3, 32(AX)
	VMOVDQU    Y5, 64(AX)
	VMOVDQU    Y7, 96(AX)
	VMOVDQU    Y9, 128(AX)
	VMOVDQU    Y11, 160(AX)
	VMOVDQU    Y13, 192(AX)
	VMOVDQU    Y15, 224(AX)

	// Second half: bytes 256..511 (coefficients 128..255).
	VMOVDQU    256(CX), Y0
	VMOVDQU    288(CX), Y2
	VMOVDQU    320(CX), Y4
	VMOVDQU    352(CX), Y6
	VMOVDQU    384(CX), Y8
	VMOVDQU    416(CX), Y10
	VMOVDQU    448(CX), Y12
	VMOVDQU    480(CX), Y14
	VMOVDQU    256(DX), Y1
	VMOVDQU    288(DX), Y3
	VMOVDQU    320(DX), Y5
	VMOVDQU    352(DX), Y7
	VMOVDQU    384(DX), Y9
	VMOVDQU    416(DX), Y11
	VMOVDQU    448(DX), Y13
	VMOVDQU    480(DX), Y15
	VPSUBW     Y1, Y0, Y1
	VPSUBW     Y3, Y2, Y3
	VPSUBW     Y5, Y4, Y5
	VPSUBW     Y7, Y6, Y7
	VPSUBW     Y9, Y8, Y9
	VPSUBW     Y11, Y10, Y11
	VPSUBW     Y13, Y12, Y13
	VPSUBW     Y15, Y14, Y15
	VMOVDQU    Y1, 256(AX)
	VMOVDQU    Y3, 288(AX)
	VMOVDQU    Y5, 320(AX)
	VMOVDQU    Y7, 352(AX)
	VMOVDQU    Y9, 384(AX)
	VMOVDQU    Y11, 416(AX)
	VMOVDQU    Y13, 448(AX)
	VMOVDQU    Y15, 480(AX)

	// Clear the upper 128 bits of every YMM register so that non-VEX SSE
	// instructions executed after return do not incur AVX/SSE transition
	// penalties. Results are already in memory; nothing live is lost.
	VZEROUPPER
	RET
   150
   // func nttAVX2(p *[256]int16)
   // Requires: AVX, AVX2
   //
   // nttAVX2 transforms the 256 int16 coefficients at p in place using seven
   // layers of butterflies. Twiddle factors are read from the ZetasAVX2 table
   // (defined outside this file) at fixed byte offsets.
   //
   // Register use:
   //   AX       = p
   //   CX       = &ZetasAVX2
   //   Y15      = 0x0d01 (3329) broadcast to all sixteen 16-bit lanes
   //   Y0..Y3   = twiddle values for the current layer
   //   Y4..Y6   = temporaries (Y0 is also reused as a temporary once its
   //              twiddle has been consumed)
   //   Y7..Y14  = eight vectors of 16 coefficients being transformed
   //
   // Every butterfly has the same shape. For a pair (u, v) and a twiddle pair
   // (z', z) taken from consecutive table slots:
   //
   //   lo = low16(v * z')         VPMULLW
   //   hi = high16(v * z)         VPMULHW
   //   t  = hi - high16(lo * q)   VPMULHW with Y15, then VPSUBW
   //   v  = u - t                 VPSUBW
   //   u  = u + t                 VPADDW
   //
   // This has the form of a signed Montgomery multiplication of v by z,
   // assuming z' = z * q^-1 mod 2^16 -- TODO confirm against the code that
   // builds ZetasAVX2. No separate reduction step is applied in this routine.
   //
   // Layer 1 runs over the whole array (in two passes of eight vectors).
   // Layers 2-7 then run entirely in registers, first on bytes 0..255 and then
   // on bytes 256..511. From layer 4 on, register pairs are regrouped at
   // 128/64/32/16-bit granularity before the butterflies; the vectors are
   // stored as left by those shuffles, with no inverse permutation applied
   // here.
   //
   // Frame: NOSPLIT, $0-8 (no locals, one pointer argument).
   //
   // NOTE: the file banner says this file is generated by src.go. The
   // VZEROUPPER before RET must be mirrored in the generator, otherwise it is
   // lost on regeneration.
TEXT ·nttAVX2(SB), NOSPLIT, $0-8
	MOVQ         p+0(FP), AX
	LEAQ         ·ZetasAVX2+0(SB), CX

	// Y15 = 3329 in every 16-bit lane.
	MOVL         $0x00000d01, DX
	VMOVD        DX, X0
	VPBROADCASTW X0, Y15

	// Layer 1 twiddle pair (table bytes 0 and 2), shared by all butterflies.
	VPBROADCASTW (CX), Y0
	VPBROADCASTW 2(CX), Y1

	// Layer 1, pass 1: coefficient k pairs with k+128 (byte offset +256);
	// vectors at bytes 0..127 against vectors at bytes 256..383.
	VMOVDQU      (AX), Y7
	VMOVDQU      32(AX), Y8
	VMOVDQU      64(AX), Y9
	VMOVDQU      96(AX), Y10
	VMOVDQU      256(AX), Y11
	VMOVDQU      288(AX), Y12
	VMOVDQU      320(AX), Y13
	VMOVDQU      352(AX), Y14
	VPMULLW      Y11, Y0, Y2
	VPMULLW      Y12, Y0, Y3
	VPMULLW      Y13, Y0, Y4
	VPMULLW      Y14, Y0, Y5
	VPMULHW      Y11, Y1, Y11
	VPMULHW      Y12, Y1, Y12
	VPMULHW      Y13, Y1, Y13
	VPMULHW      Y14, Y1, Y14
	VPMULHW      Y2, Y15, Y2
	VPMULHW      Y3, Y15, Y3
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPSUBW       Y2, Y11, Y2
	VPSUBW       Y3, Y12, Y3
	VPSUBW       Y4, Y13, Y4
	VPSUBW       Y5, Y14, Y5
	VPSUBW       Y2, Y7, Y11
	VPSUBW       Y3, Y8, Y12
	VPSUBW       Y4, Y9, Y13
	VPSUBW       Y5, Y10, Y14
	VPADDW       Y2, Y7, Y7
	VPADDW       Y3, Y8, Y8
	VPADDW       Y4, Y9, Y9
	VPADDW       Y5, Y10, Y10
	VMOVDQU      Y7, (AX)
	VMOVDQU      Y8, 32(AX)
	VMOVDQU      Y9, 64(AX)
	VMOVDQU      Y10, 96(AX)
	VMOVDQU      Y11, 256(AX)
	VMOVDQU      Y12, 288(AX)
	VMOVDQU      Y13, 320(AX)
	VMOVDQU      Y14, 352(AX)

	// Layer 1, pass 2: vectors at bytes 128..255 against bytes 384..511,
	// same twiddle pair.
	VMOVDQU      128(AX), Y7
	VMOVDQU      160(AX), Y8
	VMOVDQU      192(AX), Y9
	VMOVDQU      224(AX), Y10
	VMOVDQU      384(AX), Y11
	VMOVDQU      416(AX), Y12
	VMOVDQU      448(AX), Y13
	VMOVDQU      480(AX), Y14
	VPMULLW      Y11, Y0, Y2
	VPMULLW      Y12, Y0, Y3
	VPMULLW      Y13, Y0, Y4
	VPMULLW      Y14, Y0, Y5
	VPMULHW      Y11, Y1, Y11
	VPMULHW      Y12, Y1, Y12
	VPMULHW      Y13, Y1, Y13
	VPMULHW      Y14, Y1, Y14
	VPMULHW      Y2, Y15, Y2
	VPMULHW      Y3, Y15, Y3
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPSUBW       Y2, Y11, Y2
	VPSUBW       Y3, Y12, Y3
	VPSUBW       Y4, Y13, Y4
	VPSUBW       Y5, Y14, Y5
	VPSUBW       Y2, Y7, Y11
	VPSUBW       Y3, Y8, Y12
	VPSUBW       Y4, Y9, Y13
	VPSUBW       Y5, Y10, Y14
	VPADDW       Y2, Y7, Y7
	VPADDW       Y3, Y8, Y8
	VPADDW       Y4, Y9, Y9
	VPADDW       Y5, Y10, Y10
	VMOVDQU      Y7, 128(AX)
	VMOVDQU      Y8, 160(AX)
	VMOVDQU      Y9, 192(AX)
	VMOVDQU      Y10, 224(AX)
	VMOVDQU      Y11, 384(AX)
	VMOVDQU      Y12, 416(AX)
	VMOVDQU      Y13, 448(AX)
	VMOVDQU      Y14, 480(AX)

	// ---- First half (bytes 0..255): layers 2-7 in registers ----

	// Layer 2: one twiddle pair (table bytes 4 and 6); coefficient k pairs
	// with k+64, i.e. (Y7,Y11) (Y8,Y12) (Y9,Y13) (Y10,Y14).
	VPBROADCASTW 4(CX), Y0
	VPBROADCASTW 6(CX), Y1
	VMOVDQU      (AX), Y7
	VMOVDQU      32(AX), Y8
	VMOVDQU      64(AX), Y9
	VMOVDQU      96(AX), Y10
	VMOVDQU      128(AX), Y11
	VMOVDQU      160(AX), Y12
	VMOVDQU      192(AX), Y13
	VMOVDQU      224(AX), Y14
	VPMULLW      Y11, Y0, Y2
	VPMULLW      Y12, Y0, Y3
	VPMULLW      Y13, Y0, Y4
	VPMULLW      Y14, Y0, Y5
	VPMULHW      Y11, Y1, Y11
	VPMULHW      Y12, Y1, Y12
	VPMULHW      Y13, Y1, Y13
	VPMULHW      Y14, Y1, Y14
	VPMULHW      Y2, Y15, Y2
	VPMULHW      Y3, Y15, Y3
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPSUBW       Y2, Y11, Y2
	VPSUBW       Y3, Y12, Y3
	VPSUBW       Y4, Y13, Y4
	VPSUBW       Y5, Y14, Y5
	VPSUBW       Y2, Y7, Y11
	VPSUBW       Y3, Y8, Y12
	VPSUBW       Y4, Y9, Y13
	VPSUBW       Y5, Y10, Y14
	VPADDW       Y2, Y7, Y7
	VPADDW       Y3, Y8, Y8
	VPADDW       Y4, Y9, Y9
	VPADDW       Y5, Y10, Y10

	// Layer 3: two twiddle pairs (table bytes 12/14 and 16/18); coefficient
	// k pairs with k+32, i.e. (Y7,Y9) (Y8,Y10) use the first pair and
	// (Y11,Y13) (Y12,Y14) use the second. Y0 becomes a temporary after its
	// last use as a twiddle.
	VPBROADCASTW 12(CX), Y0
	VPBROADCASTW 14(CX), Y1
	VPBROADCASTW 16(CX), Y2
	VPBROADCASTW 18(CX), Y3
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y13, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y13, Y3, Y13
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y9, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y13, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y9
	VPSUBW       Y5, Y8, Y10
	VPSUBW       Y6, Y11, Y13
	VPSUBW       Y0, Y12, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y8, Y8
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y12, Y12

	// Layer 4: per-lane twiddle vectors (table bytes 32..159).
	VMOVDQU      32(CX), Y0
	VMOVDQU      64(CX), Y1
	VMOVDQU      96(CX), Y2
	VMOVDQU      128(CX), Y3

	// Regroup at 128-bit granularity: for each pair (Y7,Y9) (Y8,Y10)
	// (Y11,Y13) (Y12,Y14) the first register receives both low 128-bit
	// halves and the second receives both high halves.
	VPERM2I128   $0x20, Y9, Y7, Y4
	VPERM2I128   $0x31, Y9, Y7, Y9
	VMOVDQA      Y4, Y7
	VPERM2I128   $0x20, Y10, Y8, Y4
	VPERM2I128   $0x31, Y10, Y8, Y10
	VMOVDQA      Y4, Y8
	VPERM2I128   $0x20, Y13, Y11, Y4
	VPERM2I128   $0x31, Y13, Y11, Y13
	VMOVDQA      Y4, Y11
	VPERM2I128   $0x20, Y14, Y12, Y4
	VPERM2I128   $0x31, Y14, Y12, Y14
	VMOVDQA      Y4, Y12

	// Butterflies on (Y7,Y8) (Y9,Y10) with Y0/Y1 and on (Y11,Y12) (Y13,Y14)
	// with Y2/Y3.
	VPMULLW      Y8, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y12, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y8, Y1, Y8
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y12, Y3, Y12
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y8, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y12, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y8
	VPSUBW       Y5, Y9, Y10
	VPSUBW       Y6, Y11, Y12
	VPSUBW       Y0, Y13, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y9, Y9
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y13, Y13

	// Layer 5: per-lane twiddle vectors (table bytes 288..415).
	VMOVDQU      288(CX), Y0
	VMOVDQU      320(CX), Y1
	VMOVDQU      352(CX), Y2
	VMOVDQU      384(CX), Y3

	// Regroup at 64-bit granularity: for each pair (Y7,Y8) (Y9,Y10)
	// (Y11,Y12) (Y13,Y14) the first register receives the low quadwords of
	// each 128-bit lane and the second receives the high quadwords.
	VPUNPCKLQDQ  Y8, Y7, Y4
	VPUNPCKHQDQ  Y8, Y7, Y8
	VMOVDQA      Y4, Y7
	VPUNPCKLQDQ  Y10, Y9, Y4
	VPUNPCKHQDQ  Y10, Y9, Y10
	VMOVDQA      Y4, Y9
	VPUNPCKLQDQ  Y12, Y11, Y4
	VPUNPCKHQDQ  Y12, Y11, Y12
	VMOVDQA      Y4, Y11
	VPUNPCKLQDQ  Y14, Y13, Y4
	VPUNPCKHQDQ  Y14, Y13, Y14
	VMOVDQA      Y4, Y13

	// Butterflies on (Y7,Y9) (Y8,Y10) with Y0/Y1 and on (Y11,Y13) (Y12,Y14)
	// with Y2/Y3.
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y13, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y13, Y3, Y13
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y9, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y13, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y9
	VPSUBW       Y5, Y8, Y10
	VPSUBW       Y6, Y11, Y13
	VPSUBW       Y0, Y12, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y8, Y8
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y12, Y12

	// Layer 6: per-lane twiddle vectors (table bytes 544..671).
	VMOVDQU      544(CX), Y0
	VMOVDQU      576(CX), Y1
	VMOVDQU      608(CX), Y2
	VMOVDQU      640(CX), Y3

	// Regroup at 32-bit granularity: for each pair (Y7,Y9) (Y8,Y10)
	// (Y11,Y13) (Y12,Y14) the first register receives the even dwords of
	// both inputs and the second receives the odd dwords.
	VMOVSLDUP    Y9, Y4
	VPBLENDD     $0xaa, Y4, Y7, Y4
	VPSRLQ       $0x20, Y7, Y7
	VPBLENDD     $0xaa, Y9, Y7, Y9
	VMOVDQA      Y4, Y7
	VMOVSLDUP    Y10, Y4
	VPBLENDD     $0xaa, Y4, Y8, Y4
	VPSRLQ       $0x20, Y8, Y8
	VPBLENDD     $0xaa, Y10, Y8, Y10
	VMOVDQA      Y4, Y8
	VMOVSLDUP    Y13, Y4
	VPBLENDD     $0xaa, Y4, Y11, Y4
	VPSRLQ       $0x20, Y11, Y11
	VPBLENDD     $0xaa, Y13, Y11, Y13
	VMOVDQA      Y4, Y11
	VMOVSLDUP    Y14, Y4
	VPBLENDD     $0xaa, Y4, Y12, Y4
	VPSRLQ       $0x20, Y12, Y12
	VPBLENDD     $0xaa, Y14, Y12, Y14
	VMOVDQA      Y4, Y12

	// Butterflies on (Y7,Y8) (Y9,Y10) with Y0/Y1 and on (Y11,Y12) (Y13,Y14)
	// with Y2/Y3.
	VPMULLW      Y8, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y12, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y8, Y1, Y8
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y12, Y3, Y12
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y8, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y12, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y8
	VPSUBW       Y5, Y9, Y10
	VPSUBW       Y6, Y11, Y12
	VPSUBW       Y0, Y13, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y9, Y9
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y13, Y13

	// Layer 7: per-lane twiddle vectors (table bytes 800..927).
	VMOVDQU      800(CX), Y0
	VMOVDQU      832(CX), Y1
	VMOVDQU      864(CX), Y2
	VMOVDQU      896(CX), Y3

	// Regroup at 16-bit granularity: for each pair (Y7,Y8) (Y9,Y10)
	// (Y11,Y12) (Y13,Y14) the first register receives the even words of
	// both inputs and the second receives the odd words.
	VPSLLD       $0x10, Y8, Y4
	VPBLENDW     $0xaa, Y4, Y7, Y4
	VPSRLD       $0x10, Y7, Y7
	VPBLENDW     $0xaa, Y8, Y7, Y8
	VMOVDQA      Y4, Y7
	VPSLLD       $0x10, Y10, Y4
	VPBLENDW     $0xaa, Y4, Y9, Y4
	VPSRLD       $0x10, Y9, Y9
	VPBLENDW     $0xaa, Y10, Y9, Y10
	VMOVDQA      Y4, Y9
	VPSLLD       $0x10, Y12, Y4
	VPBLENDW     $0xaa, Y4, Y11, Y4
	VPSRLD       $0x10, Y11, Y11
	VPBLENDW     $0xaa, Y12, Y11, Y12
	VMOVDQA      Y4, Y11
	VPSLLD       $0x10, Y14, Y4
	VPBLENDW     $0xaa, Y4, Y13, Y4
	VPSRLD       $0x10, Y13, Y13
	VPBLENDW     $0xaa, Y14, Y13, Y14
	VMOVDQA      Y4, Y13

	// Butterflies on (Y7,Y9) (Y8,Y10) with Y0/Y1 and on (Y11,Y13) (Y12,Y14)
	// with Y2/Y3.
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y13, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y13, Y3, Y13
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y9, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y13, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y9
	VPSUBW       Y5, Y8, Y10
	VPSUBW       Y6, Y11, Y13
	VPSUBW       Y0, Y12, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y8, Y8
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y12, Y12

	// Write the first half back in register order Y7..Y14.
	VMOVDQU      Y7, (AX)
	VMOVDQU      Y8, 32(AX)
	VMOVDQU      Y9, 64(AX)
	VMOVDQU      Y10, 96(AX)
	VMOVDQU      Y11, 128(AX)
	VMOVDQU      Y12, 160(AX)
	VMOVDQU      Y13, 192(AX)
	VMOVDQU      Y14, 224(AX)

	// ---- Second half (bytes 256..511): layers 2-7 in registers ----

	// Layer 2: twiddle pair at table bytes 8 and 10; pairs (Y7,Y11)
	// (Y8,Y12) (Y9,Y13) (Y10,Y14).
	VPBROADCASTW 8(CX), Y0
	VPBROADCASTW 10(CX), Y1
	VMOVDQU      256(AX), Y7
	VMOVDQU      288(AX), Y8
	VMOVDQU      320(AX), Y9
	VMOVDQU      352(AX), Y10
	VMOVDQU      384(AX), Y11
	VMOVDQU      416(AX), Y12
	VMOVDQU      448(AX), Y13
	VMOVDQU      480(AX), Y14
	VPMULLW      Y11, Y0, Y2
	VPMULLW      Y12, Y0, Y3
	VPMULLW      Y13, Y0, Y4
	VPMULLW      Y14, Y0, Y5
	VPMULHW      Y11, Y1, Y11
	VPMULHW      Y12, Y1, Y12
	VPMULHW      Y13, Y1, Y13
	VPMULHW      Y14, Y1, Y14
	VPMULHW      Y2, Y15, Y2
	VPMULHW      Y3, Y15, Y3
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPSUBW       Y2, Y11, Y2
	VPSUBW       Y3, Y12, Y3
	VPSUBW       Y4, Y13, Y4
	VPSUBW       Y5, Y14, Y5
	VPSUBW       Y2, Y7, Y11
	VPSUBW       Y3, Y8, Y12
	VPSUBW       Y4, Y9, Y13
	VPSUBW       Y5, Y10, Y14
	VPADDW       Y2, Y7, Y7
	VPADDW       Y3, Y8, Y8
	VPADDW       Y4, Y9, Y9
	VPADDW       Y5, Y10, Y10

	// Layer 3: twiddle pairs at table bytes 20/22 and 24/26; pairs (Y7,Y9)
	// (Y8,Y10) use the first and (Y11,Y13) (Y12,Y14) the second.
	VPBROADCASTW 20(CX), Y0
	VPBROADCASTW 22(CX), Y1
	VPBROADCASTW 24(CX), Y2
	VPBROADCASTW 26(CX), Y3
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y13, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y13, Y3, Y13
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y9, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y13, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y9
	VPSUBW       Y5, Y8, Y10
	VPSUBW       Y6, Y11, Y13
	VPSUBW       Y0, Y12, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y8, Y8
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y12, Y12

	// Layer 4: twiddle vectors at table bytes 160..287, 128-bit regroup,
	// then butterflies on (Y7,Y8) (Y9,Y10) (Y11,Y12) (Y13,Y14).
	VMOVDQU      160(CX), Y0
	VMOVDQU      192(CX), Y1
	VMOVDQU      224(CX), Y2
	VMOVDQU      256(CX), Y3
	VPERM2I128   $0x20, Y9, Y7, Y4
	VPERM2I128   $0x31, Y9, Y7, Y9
	VMOVDQA      Y4, Y7
	VPERM2I128   $0x20, Y10, Y8, Y4
	VPERM2I128   $0x31, Y10, Y8, Y10
	VMOVDQA      Y4, Y8
	VPERM2I128   $0x20, Y13, Y11, Y4
	VPERM2I128   $0x31, Y13, Y11, Y13
	VMOVDQA      Y4, Y11
	VPERM2I128   $0x20, Y14, Y12, Y4
	VPERM2I128   $0x31, Y14, Y12, Y14
	VMOVDQA      Y4, Y12
	VPMULLW      Y8, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y12, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y8, Y1, Y8
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y12, Y3, Y12
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y8, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y12, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y8
	VPSUBW       Y5, Y9, Y10
	VPSUBW       Y6, Y11, Y12
	VPSUBW       Y0, Y13, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y9, Y9
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y13, Y13

	// Layer 5: twiddle vectors at table bytes 416..543, 64-bit regroup,
	// then butterflies on (Y7,Y9) (Y8,Y10) (Y11,Y13) (Y12,Y14).
	VMOVDQU      416(CX), Y0
	VMOVDQU      448(CX), Y1
	VMOVDQU      480(CX), Y2
	VMOVDQU      512(CX), Y3
	VPUNPCKLQDQ  Y8, Y7, Y4
	VPUNPCKHQDQ  Y8, Y7, Y8
	VMOVDQA      Y4, Y7
	VPUNPCKLQDQ  Y10, Y9, Y4
	VPUNPCKHQDQ  Y10, Y9, Y10
	VMOVDQA      Y4, Y9
	VPUNPCKLQDQ  Y12, Y11, Y4
	VPUNPCKHQDQ  Y12, Y11, Y12
	VMOVDQA      Y4, Y11
	VPUNPCKLQDQ  Y14, Y13, Y4
	VPUNPCKHQDQ  Y14, Y13, Y14
	VMOVDQA      Y4, Y13
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y13, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y13, Y3, Y13
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y9, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y13, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y9
	VPSUBW       Y5, Y8, Y10
	VPSUBW       Y6, Y11, Y13
	VPSUBW       Y0, Y12, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y8, Y8
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y12, Y12

	// Layer 6: twiddle vectors at table bytes 672..799, 32-bit regroup,
	// then butterflies on (Y7,Y8) (Y9,Y10) (Y11,Y12) (Y13,Y14).
	VMOVDQU      672(CX), Y0
	VMOVDQU      704(CX), Y1
	VMOVDQU      736(CX), Y2
	VMOVDQU      768(CX), Y3
	VMOVSLDUP    Y9, Y4
	VPBLENDD     $0xaa, Y4, Y7, Y4
	VPSRLQ       $0x20, Y7, Y7
	VPBLENDD     $0xaa, Y9, Y7, Y9
	VMOVDQA      Y4, Y7
	VMOVSLDUP    Y10, Y4
	VPBLENDD     $0xaa, Y4, Y8, Y4
	VPSRLQ       $0x20, Y8, Y8
	VPBLENDD     $0xaa, Y10, Y8, Y10
	VMOVDQA      Y4, Y8
	VMOVSLDUP    Y13, Y4
	VPBLENDD     $0xaa, Y4, Y11, Y4
	VPSRLQ       $0x20, Y11, Y11
	VPBLENDD     $0xaa, Y13, Y11, Y13
	VMOVDQA      Y4, Y11
	VMOVSLDUP    Y14, Y4
	VPBLENDD     $0xaa, Y4, Y12, Y4
	VPSRLQ       $0x20, Y12, Y12
	VPBLENDD     $0xaa, Y14, Y12, Y14
	VMOVDQA      Y4, Y12
	VPMULLW      Y8, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y12, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y8, Y1, Y8
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y12, Y3, Y12
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y8, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y12, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y8
	VPSUBW       Y5, Y9, Y10
	VPSUBW       Y6, Y11, Y12
	VPSUBW       Y0, Y13, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y9, Y9
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y13, Y13

	// Layer 7: twiddle vectors at table bytes 928..1055, 16-bit regroup,
	// then butterflies on (Y7,Y9) (Y8,Y10) (Y11,Y13) (Y12,Y14).
	VMOVDQU      928(CX), Y0
	VMOVDQU      960(CX), Y1
	VMOVDQU      992(CX), Y2
	VMOVDQU      1024(CX), Y3
	VPSLLD       $0x10, Y8, Y4
	VPBLENDW     $0xaa, Y4, Y7, Y4
	VPSRLD       $0x10, Y7, Y7
	VPBLENDW     $0xaa, Y8, Y7, Y8
	VMOVDQA      Y4, Y7
	VPSLLD       $0x10, Y10, Y4
	VPBLENDW     $0xaa, Y4, Y9, Y4
	VPSRLD       $0x10, Y9, Y9
	VPBLENDW     $0xaa, Y10, Y9, Y10
	VMOVDQA      Y4, Y9
	VPSLLD       $0x10, Y12, Y4
	VPBLENDW     $0xaa, Y4, Y11, Y4
	VPSRLD       $0x10, Y11, Y11
	VPBLENDW     $0xaa, Y12, Y11, Y12
	VMOVDQA      Y4, Y11
	VPSLLD       $0x10, Y14, Y4
	VPBLENDW     $0xaa, Y4, Y13, Y4
	VPSRLD       $0x10, Y13, Y13
	VPBLENDW     $0xaa, Y14, Y13, Y14
	VMOVDQA      Y4, Y13
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y13, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y13, Y3, Y13
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y9, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y13, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y9
	VPSUBW       Y5, Y8, Y10
	VPSUBW       Y6, Y11, Y13
	VPSUBW       Y0, Y12, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y8, Y8
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y12, Y12

	// Write the second half back in register order Y7..Y14.
	VMOVDQU      Y7, 256(AX)
	VMOVDQU      Y8, 288(AX)
	VMOVDQU      Y9, 320(AX)
	VMOVDQU      Y10, 352(AX)
	VMOVDQU      Y11, 384(AX)
	VMOVDQU      Y12, 416(AX)
	VMOVDQU      Y13, 448(AX)
	VMOVDQU      Y14, 480(AX)

	// Clear the upper 128 bits of every YMM register so that non-VEX SSE
	// instructions executed after return do not incur AVX/SSE transition
	// penalties. Results are already in memory; nothing live is lost.
	VZEROUPPER
	RET
   734
   735// func invNttAVX2(p *[256]int16)
   736// Requires: AVX, AVX2
   737TEXT ·invNttAVX2(SB), NOSPLIT, $0-8
   738	MOVQ         p+0(FP), AX
   739	LEAQ         ·ZetasAVX2+0(SB), CX
   740	MOVL         $0x00000d01, DX
   741	VMOVD        DX, X0
   742	VPBROADCASTW X0, Y15
   743	VMOVDQU      (AX), Y7
   744	VMOVDQU      32(AX), Y8
   745	VMOVDQU      64(AX), Y9
   746	VMOVDQU      96(AX), Y10
   747	VMOVDQU      128(AX), Y11
   748	VMOVDQU      160(AX), Y12
   749	VMOVDQU      192(AX), Y13
   750	VMOVDQU      224(AX), Y14
   751	VMOVDQU      1056(CX), Y0
   752	VMOVDQU      1088(CX), Y1
   753	VMOVDQU      1120(CX), Y2
   754	VMOVDQU      1152(CX), Y3
   755	VPSUBW       Y7, Y9, Y4
   756	VPSUBW       Y8, Y10, Y5
   757	VPSUBW       Y11, Y13, Y6
   758	VPADDW       Y7, Y9, Y7
   759	VPADDW       Y8, Y10, Y8
   760	VPADDW       Y11, Y13, Y11
   761	VPMULLW      Y4, Y0, Y9
   762	VPMULLW      Y5, Y0, Y10
   763	VPSUBW       Y12, Y14, Y0
   764	VPMULLW      Y6, Y2, Y13
   765	VPADDW       Y12, Y14, Y12
   766	VPMULLW      Y0, Y2, Y14
   767	VPMULHW      Y4, Y1, Y4
   768	VPMULHW      Y5, Y1, Y5
   769	VPMULHW      Y6, Y3, Y6
   770	VPMULHW      Y0, Y3, Y0
   771	VPMULHW      Y9, Y15, Y9
   772	VPMULHW      Y10, Y15, Y10
   773	VPMULHW      Y13, Y15, Y13
   774	VPMULHW      Y14, Y15, Y14
   775	VPSUBW       Y9, Y4, Y9
   776	VPSUBW       Y10, Y5, Y10
   777	VPSUBW       Y13, Y6, Y13
   778	VPSUBW       Y14, Y0, Y14
   779	VMOVDQU      1312(CX), Y0
   780	VMOVDQU      1344(CX), Y1
   781	VMOVDQU      1376(CX), Y2
   782	VMOVDQU      1408(CX), Y3
   783	VPSLLD       $0x10, Y8, Y4
   784	VPBLENDW     $0xaa, Y4, Y7, Y4
   785	VPSRLD       $0x10, Y7, Y7
   786	VPBLENDW     $0xaa, Y8, Y7, Y8
   787	VMOVDQA      Y4, Y7
   788	VPSLLD       $0x10, Y10, Y4
   789	VPBLENDW     $0xaa, Y4, Y9, Y4
   790	VPSRLD       $0x10, Y9, Y9
   791	VPBLENDW     $0xaa, Y10, Y9, Y10
   792	VMOVDQA      Y4, Y9
   793	VPSLLD       $0x10, Y12, Y4
   794	VPBLENDW     $0xaa, Y4, Y11, Y4
   795	VPSRLD       $0x10, Y11, Y11
   796	VPBLENDW     $0xaa, Y12, Y11, Y12
   797	VMOVDQA      Y4, Y11
   798	VPSLLD       $0x10, Y14, Y4
   799	VPBLENDW     $0xaa, Y4, Y13, Y4
   800	VPSRLD       $0x10, Y13, Y13
   801	VPBLENDW     $0xaa, Y14, Y13, Y14
   802	VMOVDQA      Y4, Y13
   803	VPSUBW       Y7, Y8, Y4
   804	VPSUBW       Y9, Y10, Y5
   805	VPSUBW       Y11, Y12, Y6
   806	VPADDW       Y7, Y8, Y7
   807	VPADDW       Y9, Y10, Y9
   808	VPADDW       Y11, Y12, Y11
   809	VPMULLW      Y4, Y0, Y8
   810	VPMULLW      Y5, Y0, Y10
   811	VPSUBW       Y13, Y14, Y0
   812	VPMULLW      Y6, Y2, Y12
   813	VPADDW       Y13, Y14, Y13
   814	VPMULLW      Y0, Y2, Y14
   815	VPMULHW      Y4, Y1, Y4
   816	VPMULHW      Y5, Y1, Y5
   817	VPMULHW      Y6, Y3, Y6
   818	VPMULHW      Y0, Y3, Y0
   819	VPMULHW      Y8, Y15, Y8
   820	VPMULHW      Y10, Y15, Y10
   821	VPMULHW      Y12, Y15, Y12
   822	VPMULHW      Y14, Y15, Y14
   823	VPSUBW       Y8, Y4, Y8
   824	VPSUBW       Y10, Y5, Y10
   825	VPSUBW       Y12, Y6, Y12
   826	VPSUBW       Y14, Y0, Y14
   827	VMOVDQU      1568(CX), Y0
   828	VMOVDQU      1600(CX), Y1
   829	VMOVDQU      1632(CX), Y2
   830	VMOVDQU      1664(CX), Y3
   831	VMOVSLDUP    Y9, Y4
   832	VPBLENDD     $0xaa, Y4, Y7, Y4
   833	VPSRLQ       $0x20, Y7, Y7
   834	VPBLENDD     $0xaa, Y9, Y7, Y9
   835	VMOVDQA      Y4, Y7
   836	VMOVSLDUP    Y10, Y4
   837	VPBLENDD     $0xaa, Y4, Y8, Y4
   838	VPSRLQ       $0x20, Y8, Y8
   839	VPBLENDD     $0xaa, Y10, Y8, Y10
   840	VMOVDQA      Y4, Y8
   841	VMOVSLDUP    Y13, Y4
   842	VPBLENDD     $0xaa, Y4, Y11, Y4
   843	VPSRLQ       $0x20, Y11, Y11
   844	VPBLENDD     $0xaa, Y13, Y11, Y13
   845	VMOVDQA      Y4, Y11
   846	VMOVSLDUP    Y14, Y4
   847	VPBLENDD     $0xaa, Y4, Y12, Y4
   848	VPSRLQ       $0x20, Y12, Y12
   849	VPBLENDD     $0xaa, Y14, Y12, Y14
   850	VMOVDQA      Y4, Y12
   851	VPSUBW       Y7, Y9, Y4
   852	VPSUBW       Y8, Y10, Y5
   853	VPSUBW       Y11, Y13, Y6
   854	VPADDW       Y7, Y9, Y7
   855	VPADDW       Y8, Y10, Y8
   856	VPADDW       Y11, Y13, Y11
   857	VPMULLW      Y4, Y0, Y9
   858	VPMULLW      Y5, Y0, Y10
   859	VPSUBW       Y12, Y14, Y0
   860	VPMULLW      Y6, Y2, Y13
   861	VPADDW       Y12, Y14, Y12
   862	VPMULLW      Y0, Y2, Y14
   863	VPMULHW      Y4, Y1, Y4
   864	VPMULHW      Y5, Y1, Y5
   865	VPMULHW      Y6, Y3, Y6
   866	VPMULHW      Y0, Y3, Y0
   867	VPMULHW      Y9, Y15, Y9
   868	VPMULHW      Y10, Y15, Y10
   869	VPMULHW      Y13, Y15, Y13
   870	VPMULHW      Y14, Y15, Y14
   871	VPSUBW       Y9, Y4, Y9
   872	VPSUBW       Y10, Y5, Y10
   873	VPSUBW       Y13, Y6, Y13
   874	VPSUBW       Y14, Y0, Y14
   875	MOVL         $0x00004ebf, DX
   876	VMOVD        DX, X0
   877	VPBROADCASTW X0, Y4
   878	VPMULHW      Y4, Y7, Y5
   879	VPSRAW       $0x0a, Y5, Y5
   880	VPMULLW      Y15, Y5, Y5
   881	VPSUBW       Y5, Y7, Y7
   882	VPMULHW      Y4, Y11, Y5
   883	VPSRAW       $0x0a, Y5, Y5
   884	VPMULLW      Y15, Y5, Y5
   885	VPSUBW       Y5, Y11, Y11
   886	VMOVDQU      1824(CX), Y0
   887	VMOVDQU      1856(CX), Y1
   888	VMOVDQU      1888(CX), Y2
   889	VMOVDQU      1920(CX), Y3
   890	VPUNPCKLQDQ  Y8, Y7, Y4
   891	VPUNPCKHQDQ  Y8, Y7, Y8
   892	VMOVDQA      Y4, Y7
   893	VPUNPCKLQDQ  Y10, Y9, Y4
   894	VPUNPCKHQDQ  Y10, Y9, Y10
   895	VMOVDQA      Y4, Y9
   896	VPUNPCKLQDQ  Y12, Y11, Y4
   897	VPUNPCKHQDQ  Y12, Y11, Y12
   898	VMOVDQA      Y4, Y11
   899	VPUNPCKLQDQ  Y14, Y13, Y4
   900	VPUNPCKHQDQ  Y14, Y13, Y14
   901	VMOVDQA      Y4, Y13
   902	VPSUBW       Y7, Y8, Y4
   903	VPSUBW       Y9, Y10, Y5
   904	VPSUBW       Y11, Y12, Y6
   905	VPADDW       Y7, Y8, Y7
   906	VPADDW       Y9, Y10, Y9
   907	VPADDW       Y11, Y12, Y11
   908	VPMULLW      Y4, Y0, Y8
   909	VPMULLW      Y5, Y0, Y10
   910	VPSUBW       Y13, Y14, Y0
   911	VPMULLW      Y6, Y2, Y12
   912	VPADDW       Y13, Y14, Y13
   913	VPMULLW      Y0, Y2, Y14
   914	VPMULHW      Y4, Y1, Y4
   915	VPMULHW      Y5, Y1, Y5
   916	VPMULHW      Y6, Y3, Y6
   917	VPMULHW      Y0, Y3, Y0
   918	VPMULHW      Y8, Y15, Y8
   919	VPMULHW      Y10, Y15, Y10
   920	VPMULHW      Y12, Y15, Y12
   921	VPMULHW      Y14, Y15, Y14
   922	VPSUBW       Y8, Y4, Y8
   923	VPSUBW       Y10, Y5, Y10
   924	VPSUBW       Y12, Y6, Y12
   925	VPSUBW       Y14, Y0, Y14
   926	VPBROADCASTW 2080(CX), Y0
   927	VPBROADCASTW 2082(CX), Y1
   928	VPBROADCASTW 2084(CX), Y2
   929	VPBROADCASTW 2086(CX), Y3
   930	VPERM2I128   $0x20, Y9, Y7, Y4
   931	VPERM2I128   $0x31, Y9, Y7, Y9
   932	VMOVDQA      Y4, Y7
   933	VPERM2I128   $0x20, Y10, Y8, Y4
   934	VPERM2I128   $0x31, Y10, Y8, Y10
   935	VMOVDQA      Y4, Y8
   936	VPERM2I128   $0x20, Y13, Y11, Y4
   937	VPERM2I128   $0x31, Y13, Y11, Y13
   938	VMOVDQA      Y4, Y11
   939	VPERM2I128   $0x20, Y14, Y12, Y4
   940	VPERM2I128   $0x31, Y14, Y12, Y14
   941	VMOVDQA      Y4, Y12
   942	VPSUBW       Y7, Y9, Y4
   943	VPSUBW       Y8, Y10, Y5
   944	VPSUBW       Y11, Y13, Y6
   945	VPADDW       Y7, Y9, Y7
   946	VPADDW       Y8, Y10, Y8
   947	VPADDW       Y11, Y13, Y11
   948	VPMULLW      Y4, Y0, Y9
   949	VPMULLW      Y5, Y0, Y10
   950	VPSUBW       Y12, Y14, Y0
   951	VPMULLW      Y6, Y2, Y13
   952	VPADDW       Y12, Y14, Y12
   953	VPMULLW      Y0, Y2, Y14
   954	VPMULHW      Y4, Y1, Y4
   955	VPMULHW      Y5, Y1, Y5
   956	VPMULHW      Y6, Y3, Y6
   957	VPMULHW      Y0, Y3, Y0
   958	VPMULHW      Y9, Y15, Y9
   959	VPMULHW      Y10, Y15, Y10
   960	VPMULHW      Y13, Y15, Y13
   961	VPMULHW      Y14, Y15, Y14
   962	VPSUBW       Y9, Y4, Y9
   963	VPSUBW       Y10, Y5, Y10
   964	VPSUBW       Y13, Y6, Y13
   965	VPSUBW       Y14, Y0, Y14
   966	MOVL         $0x00004ebf, DX
   967	VMOVD        DX, X0
   968	VPBROADCASTW X0, Y4
   969	VPMULHW      Y4, Y7, Y5
   970	VPSRAW       $0x0a, Y5, Y5
   971	VPMULLW      Y15, Y5, Y5
   972	VPSUBW       Y5, Y7, Y7
   973	VPMULHW      Y4, Y11, Y5
   974	VPSRAW       $0x0a, Y5, Y5
   975	VPMULLW      Y15, Y5, Y5
   976	VPSUBW       Y5, Y11, Y11
   977	VPBROADCASTW 2096(CX), Y0
   978	VPBROADCASTW 2098(CX), Y1
   979	VPSUBW       Y7, Y11, Y4
   980	VPSUBW       Y8, Y12, Y5
   981	VPSUBW       Y9, Y13, Y6
   982	VPADDW       Y7, Y11, Y7
   983	VPADDW       Y8, Y12, Y8
   984	VPADDW       Y9, Y13, Y9
   985	VPMULLW      Y4, Y0, Y11
   986	VPMULLW      Y5, Y0, Y12
   987	VPSUBW       Y10, Y14, Y2
   988	VPMULLW      Y6, Y0, Y13
   989	VPADDW       Y10, Y14, Y10
   990	VPMULLW      Y2, Y0, Y14
   991	VPMULHW      Y4, Y1, Y4
   992	VPMULHW      Y5, Y1, Y5
   993	VPMULHW      Y6, Y1, Y6
   994	VPMULHW      Y2, Y1, Y2
   995	VPMULHW      Y11, Y15, Y11
   996	VPMULHW      Y12, Y15, Y12
   997	VPMULHW      Y13, Y15, Y13
   998	VPMULHW      Y14, Y15, Y14
   999	VPSUBW       Y11, Y4, Y11
  1000	VPSUBW       Y12, Y5, Y12
  1001	VPSUBW       Y13, Y6, Y13
  1002	VPSUBW       Y14, Y2, Y14
  1003	VMOVDQU      Y7, (AX)
  1004	VMOVDQU      Y8, 32(AX)
  1005	VMOVDQU      Y9, 64(AX)
  1006	VMOVDQU      Y10, 96(AX)
  1007	VMOVDQU      Y11, 128(AX)
  1008	VMOVDQU      Y12, 160(AX)
  1009	VMOVDQU      Y13, 192(AX)
  1010	VMOVDQU      Y14, 224(AX)
  1011	VMOVDQU      256(AX), Y7
  1012	VMOVDQU      288(AX), Y8
  1013	VMOVDQU      320(AX), Y9
  1014	VMOVDQU      352(AX), Y10
  1015	VMOVDQU      384(AX), Y11
  1016	VMOVDQU      416(AX), Y12
  1017	VMOVDQU      448(AX), Y13
  1018	VMOVDQU      480(AX), Y14
  1019	VMOVDQU      1184(CX), Y0
  1020	VMOVDQU      1216(CX), Y1
  1021	VMOVDQU      1248(CX), Y2
  1022	VMOVDQU      1280(CX), Y3
  1023	VPSUBW       Y7, Y9, Y4
  1024	VPSUBW       Y8, Y10, Y5
  1025	VPSUBW       Y11, Y13, Y6
  1026	VPADDW       Y7, Y9, Y7
  1027	VPADDW       Y8, Y10, Y8
  1028	VPADDW       Y11, Y13, Y11
  1029	VPMULLW      Y4, Y0, Y9
  1030	VPMULLW      Y5, Y0, Y10
  1031	VPSUBW       Y12, Y14, Y0
  1032	VPMULLW      Y6, Y2, Y13
  1033	VPADDW       Y12, Y14, Y12
  1034	VPMULLW      Y0, Y2, Y14
  1035	VPMULHW      Y4, Y1, Y4
  1036	VPMULHW      Y5, Y1, Y5
  1037	VPMULHW      Y6, Y3, Y6
  1038	VPMULHW      Y0, Y3, Y0
  1039	VPMULHW      Y9, Y15, Y9
  1040	VPMULHW      Y10, Y15, Y10
  1041	VPMULHW      Y13, Y15, Y13
  1042	VPMULHW      Y14, Y15, Y14
  1043	VPSUBW       Y9, Y4, Y9
  1044	VPSUBW       Y10, Y5, Y10
  1045	VPSUBW       Y13, Y6, Y13
  1046	VPSUBW       Y14, Y0, Y14
  1047	VMOVDQU      1440(CX), Y0
  1048	VMOVDQU      1472(CX), Y1
  1049	VMOVDQU      1504(CX), Y2
  1050	VMOVDQU      1536(CX), Y3
  1051	VPSLLD       $0x10, Y8, Y4
  1052	VPBLENDW     $0xaa, Y4, Y7, Y4
  1053	VPSRLD       $0x10, Y7, Y7
  1054	VPBLENDW     $0xaa, Y8, Y7, Y8
  1055	VMOVDQA      Y4, Y7
  1056	VPSLLD       $0x10, Y10, Y4
  1057	VPBLENDW     $0xaa, Y4, Y9, Y4
  1058	VPSRLD       $0x10, Y9, Y9
  1059	VPBLENDW     $0xaa, Y10, Y9, Y10
  1060	VMOVDQA      Y4, Y9
  1061	VPSLLD       $0x10, Y12, Y4
  1062	VPBLENDW     $0xaa, Y4, Y11, Y4
  1063	VPSRLD       $0x10, Y11, Y11
  1064	VPBLENDW     $0xaa, Y12, Y11, Y12
  1065	VMOVDQA      Y4, Y11
  1066	VPSLLD       $0x10, Y14, Y4
  1067	VPBLENDW     $0xaa, Y4, Y13, Y4
  1068	VPSRLD       $0x10, Y13, Y13
  1069	VPBLENDW     $0xaa, Y14, Y13, Y14
  1070	VMOVDQA      Y4, Y13
  1071	VPSUBW       Y7, Y8, Y4
  1072	VPSUBW       Y9, Y10, Y5
  1073	VPSUBW       Y11, Y12, Y6
  1074	VPADDW       Y7, Y8, Y7
  1075	VPADDW       Y9, Y10, Y9
  1076	VPADDW       Y11, Y12, Y11
  1077	VPMULLW      Y4, Y0, Y8
  1078	VPMULLW      Y5, Y0, Y10
  1079	VPSUBW       Y13, Y14, Y0
  1080	VPMULLW      Y6, Y2, Y12
  1081	VPADDW       Y13, Y14, Y13
  1082	VPMULLW      Y0, Y2, Y14
  1083	VPMULHW      Y4, Y1, Y4
  1084	VPMULHW      Y5, Y1, Y5
  1085	VPMULHW      Y6, Y3, Y6
  1086	VPMULHW      Y0, Y3, Y0
  1087	VPMULHW      Y8, Y15, Y8
  1088	VPMULHW      Y10, Y15, Y10
  1089	VPMULHW      Y12, Y15, Y12
  1090	VPMULHW      Y14, Y15, Y14
  1091	VPSUBW       Y8, Y4, Y8
  1092	VPSUBW       Y10, Y5, Y10
  1093	VPSUBW       Y12, Y6, Y12
  1094	VPSUBW       Y14, Y0, Y14
  1095	VMOVDQU      1696(CX), Y0
  1096	VMOVDQU      1728(CX), Y1
  1097	VMOVDQU      1760(CX), Y2
  1098	VMOVDQU      1792(CX), Y3
  1099	VMOVSLDUP    Y9, Y4
  1100	VPBLENDD     $0xaa, Y4, Y7, Y4
  1101	VPSRLQ       $0x20, Y7, Y7
  1102	VPBLENDD     $0xaa, Y9, Y7, Y9
  1103	VMOVDQA      Y4, Y7
  1104	VMOVSLDUP    Y10, Y4
  1105	VPBLENDD     $0xaa, Y4, Y8, Y4
  1106	VPSRLQ       $0x20, Y8, Y8
  1107	VPBLENDD     $0xaa, Y10, Y8, Y10
  1108	VMOVDQA      Y4, Y8
  1109	VMOVSLDUP    Y13, Y4
  1110	VPBLENDD     $0xaa, Y4, Y11, Y4
  1111	VPSRLQ       $0x20, Y11, Y11
  1112	VPBLENDD     $0xaa, Y13, Y11, Y13
  1113	VMOVDQA      Y4, Y11
  1114	VMOVSLDUP    Y14, Y4
  1115	VPBLENDD     $0xaa, Y4, Y12, Y4
  1116	VPSRLQ       $0x20, Y12, Y12
  1117	VPBLENDD     $0xaa, Y14, Y12, Y14
  1118	VMOVDQA      Y4, Y12
  1119	VPSUBW       Y7, Y9, Y4
  1120	VPSUBW       Y8, Y10, Y5
  1121	VPSUBW       Y11, Y13, Y6
  1122	VPADDW       Y7, Y9, Y7
  1123	VPADDW       Y8, Y10, Y8
  1124	VPADDW       Y11, Y13, Y11
  1125	VPMULLW      Y4, Y0, Y9
  1126	VPMULLW      Y5, Y0, Y10
  1127	VPSUBW       Y12, Y14, Y0
  1128	VPMULLW      Y6, Y2, Y13
  1129	VPADDW       Y12, Y14, Y12
  1130	VPMULLW      Y0, Y2, Y14
  1131	VPMULHW      Y4, Y1, Y4
  1132	VPMULHW      Y5, Y1, Y5
  1133	VPMULHW      Y6, Y3, Y6
  1134	VPMULHW      Y0, Y3, Y0
  1135	VPMULHW      Y9, Y15, Y9
  1136	VPMULHW      Y10, Y15, Y10
  1137	VPMULHW      Y13, Y15, Y13
  1138	VPMULHW      Y14, Y15, Y14
  1139	VPSUBW       Y9, Y4, Y9
  1140	VPSUBW       Y10, Y5, Y10
  1141	VPSUBW       Y13, Y6, Y13
  1142	VPSUBW       Y14, Y0, Y14
  1143	MOVL         $0x00004ebf, DX
  1144	VMOVD        DX, X0
  1145	VPBROADCASTW X0, Y4
  1146	VPMULHW      Y4, Y7, Y5
  1147	VPSRAW       $0x0a, Y5, Y5
  1148	VPMULLW      Y15, Y5, Y5
  1149	VPSUBW       Y5, Y7, Y7
  1150	VPMULHW      Y4, Y11, Y5
  1151	VPSRAW       $0x0a, Y5, Y5
  1152	VPMULLW      Y15, Y5, Y5
  1153	VPSUBW       Y5, Y11, Y11
  1154	VMOVDQU      1952(CX), Y0
  1155	VMOVDQU      1984(CX), Y1
  1156	VMOVDQU      2016(CX), Y2
  1157	VMOVDQU      2048(CX), Y3
  1158	VPUNPCKLQDQ  Y8, Y7, Y4
  1159	VPUNPCKHQDQ  Y8, Y7, Y8
  1160	VMOVDQA      Y4, Y7
  1161	VPUNPCKLQDQ  Y10, Y9, Y4
  1162	VPUNPCKHQDQ  Y10, Y9, Y10
  1163	VMOVDQA      Y4, Y9
  1164	VPUNPCKLQDQ  Y12, Y11, Y4
  1165	VPUNPCKHQDQ  Y12, Y11, Y12
  1166	VMOVDQA      Y4, Y11
  1167	VPUNPCKLQDQ  Y14, Y13, Y4
  1168	VPUNPCKHQDQ  Y14, Y13, Y14
  1169	VMOVDQA      Y4, Y13
  1170	VPSUBW       Y7, Y8, Y4
  1171	VPSUBW       Y9, Y10, Y5
  1172	VPSUBW       Y11, Y12, Y6
  1173	VPADDW       Y7, Y8, Y7
  1174	VPADDW       Y9, Y10, Y9
  1175	VPADDW       Y11, Y12, Y11
  1176	VPMULLW      Y4, Y0, Y8
  1177	VPMULLW      Y5, Y0, Y10
  1178	VPSUBW       Y13, Y14, Y0
  1179	VPMULLW      Y6, Y2, Y12
  1180	VPADDW       Y13, Y14, Y13
  1181	VPMULLW      Y0, Y2, Y14
  1182	VPMULHW      Y4, Y1, Y4
  1183	VPMULHW      Y5, Y1, Y5
  1184	VPMULHW      Y6, Y3, Y6
  1185	VPMULHW      Y0, Y3, Y0
  1186	VPMULHW      Y8, Y15, Y8
  1187	VPMULHW      Y10, Y15, Y10
  1188	VPMULHW      Y12, Y15, Y12
  1189	VPMULHW      Y14, Y15, Y14
  1190	VPSUBW       Y8, Y4, Y8
  1191	VPSUBW       Y10, Y5, Y10
  1192	VPSUBW       Y12, Y6, Y12
  1193	VPSUBW       Y14, Y0, Y14
  1194	VPBROADCASTW 2088(CX), Y0
  1195	VPBROADCASTW 2090(CX), Y1
  1196	VPBROADCASTW 2092(CX), Y2
  1197	VPBROADCASTW 2094(CX), Y3
  1198	VPERM2I128   $0x20, Y9, Y7, Y4
  1199	VPERM2I128   $0x31, Y9, Y7, Y9
  1200	VMOVDQA      Y4, Y7
  1201	VPERM2I128   $0x20, Y10, Y8, Y4
  1202	VPERM2I128   $0x31, Y10, Y8, Y10
  1203	VMOVDQA      Y4, Y8
  1204	VPERM2I128   $0x20, Y13, Y11, Y4
  1205	VPERM2I128   $0x31, Y13, Y11, Y13
  1206	VMOVDQA      Y4, Y11
  1207	VPERM2I128   $0x20, Y14, Y12, Y4
  1208	VPERM2I128   $0x31, Y14, Y12, Y14
  1209	VMOVDQA      Y4, Y12
  1210	VPSUBW       Y7, Y9, Y4
  1211	VPSUBW       Y8, Y10, Y5
  1212	VPSUBW       Y11, Y13, Y6
  1213	VPADDW       Y7, Y9, Y7
  1214	VPADDW       Y8, Y10, Y8
  1215	VPADDW       Y11, Y13, Y11
  1216	VPMULLW      Y4, Y0, Y9
  1217	VPMULLW      Y5, Y0, Y10
  1218	VPSUBW       Y12, Y14, Y0
  1219	VPMULLW      Y6, Y2, Y13
  1220	VPADDW       Y12, Y14, Y12
  1221	VPMULLW      Y0, Y2, Y14
  1222	VPMULHW      Y4, Y1, Y4
  1223	VPMULHW      Y5, Y1, Y5
  1224	VPMULHW      Y6, Y3, Y6
  1225	VPMULHW      Y0, Y3, Y0
  1226	VPMULHW      Y9, Y15, Y9
  1227	VPMULHW      Y10, Y15, Y10
  1228	VPMULHW      Y13, Y15, Y13
  1229	VPMULHW      Y14, Y15, Y14
  1230	VPSUBW       Y9, Y4, Y9
  1231	VPSUBW       Y10, Y5, Y10
  1232	VPSUBW       Y13, Y6, Y13
  1233	VPSUBW       Y14, Y0, Y14
  1234	MOVL         $0x00004ebf, DX
  1235	VMOVD        DX, X0
  1236	VPBROADCASTW X0, Y4
  1237	VPMULHW      Y4, Y7, Y5
  1238	VPSRAW       $0x0a, Y5, Y5
  1239	VPMULLW      Y15, Y5, Y5
  1240	VPSUBW       Y5, Y7, Y7
  1241	VPMULHW      Y4, Y11, Y5
  1242	VPSRAW       $0x0a, Y5, Y5
  1243	VPMULLW      Y15, Y5, Y5
  1244	VPSUBW       Y5, Y11, Y11
  1245	VPBROADCASTW 2100(CX), Y0
  1246	VPBROADCASTW 2102(CX), Y1
  1247	VPSUBW       Y7, Y11, Y4
  1248	VPSUBW       Y8, Y12, Y5
  1249	VPSUBW       Y9, Y13, Y6
  1250	VPADDW       Y7, Y11, Y7
  1251	VPADDW       Y8, Y12, Y8
  1252	VPADDW       Y9, Y13, Y9
  1253	VPMULLW      Y4, Y0, Y11
  1254	VPMULLW      Y5, Y0, Y12
  1255	VPSUBW       Y10, Y14, Y2
  1256	VPMULLW      Y6, Y0, Y13
  1257	VPADDW       Y10, Y14, Y10
  1258	VPMULLW      Y2, Y0, Y14
  1259	VPMULHW      Y4, Y1, Y4
  1260	VPMULHW      Y5, Y1, Y5
  1261	VPMULHW      Y6, Y1, Y6
  1262	VPMULHW      Y2, Y1, Y2
  1263	VPMULHW      Y11, Y15, Y11
  1264	VPMULHW      Y12, Y15, Y12
  1265	VPMULHW      Y13, Y15, Y13
  1266	VPMULHW      Y14, Y15, Y14
  1267	VPSUBW       Y11, Y4, Y11
  1268	VPSUBW       Y12, Y5, Y12
  1269	VPSUBW       Y13, Y6, Y13
  1270	VPSUBW       Y14, Y2, Y14
  1271	VMOVDQU      Y7, 256(AX)
  1272	VMOVDQU      Y8, 288(AX)
  1273	VMOVDQU      Y9, 320(AX)
  1274	VMOVDQU      Y10, 352(AX)
  1275	VMOVDQU      Y11, 384(AX)
  1276	VMOVDQU      Y12, 416(AX)
  1277	VMOVDQU      Y13, 448(AX)
  1278	VMOVDQU      Y14, 480(AX)
  1279	VPBROADCASTW 2104(CX), Y0
  1280	VPBROADCASTW 2106(CX), Y1
  1281	VMOVDQU      (AX), Y7
  1282	VMOVDQU      32(AX), Y8
  1283	VMOVDQU      64(AX), Y9
  1284	VMOVDQU      96(AX), Y10
  1285	VMOVDQU      256(AX), Y11
  1286	VMOVDQU      288(AX), Y12
  1287	VMOVDQU      320(AX), Y13
  1288	VMOVDQU      352(AX), Y14
  1289	VPSUBW       Y7, Y11, Y2
  1290	VPSUBW       Y8, Y12, Y3
  1291	VPSUBW       Y9, Y13, Y4
  1292	VPADDW       Y7, Y11, Y7
  1293	VPADDW       Y8, Y12, Y8
  1294	VPADDW       Y9, Y13, Y9
  1295	VPMULLW      Y2, Y0, Y11
  1296	VPMULLW      Y3, Y0, Y12
  1297	VPSUBW       Y10, Y14, Y5
  1298	VPMULLW      Y4, Y0, Y13
  1299	VPADDW       Y10, Y14, Y10
  1300	VPMULLW      Y5, Y0, Y14
  1301	VPMULHW      Y2, Y1, Y2
  1302	VPMULHW      Y3, Y1, Y3
  1303	VPMULHW      Y4, Y1, Y4
  1304	VPMULHW      Y5, Y1, Y5
  1305	VPMULHW      Y11, Y15, Y11
  1306	VPMULHW      Y12, Y15, Y12
  1307	VPMULHW      Y13, Y15, Y13
  1308	VPMULHW      Y14, Y15, Y14
  1309	VPSUBW       Y11, Y2, Y11
  1310	VPSUBW       Y12, Y3, Y12
  1311	VPSUBW       Y13, Y4, Y13
  1312	VPSUBW       Y14, Y5, Y14
  1313	MOVL         $0xffffd8a1, DX
  1314	VMOVD        DX, X0
  1315	VPBROADCASTW X0, Y0
  1316	MOVL         $0x000005a1, DX
  1317	VMOVD        DX, X1
  1318	VPBROADCASTW X1, Y1
  1319	VPMULLW      Y7, Y0, Y2
  1320	VPMULLW      Y8, Y0, Y3
  1321	VPMULLW      Y9, Y0, Y4
  1322	VPMULLW      Y10, Y0, Y5
  1323	VPMULHW      Y7, Y1, Y7
  1324	VPMULHW      Y8, Y1, Y8
  1325	VPMULHW      Y9, Y1, Y9
  1326	VPMULHW      Y10, Y1, Y10
  1327	VPMULHW      Y2, Y15, Y2
  1328	VPMULHW      Y3, Y15, Y3
  1329	VPMULHW      Y4, Y15, Y4
  1330	VPMULHW      Y5, Y15, Y5
  1331	VPSUBW       Y2, Y7, Y7
  1332	VPSUBW       Y3, Y8, Y8
  1333	VPSUBW       Y4, Y9, Y9
  1334	VPSUBW       Y5, Y10, Y10
  1335	VPMULLW      Y11, Y0, Y2
  1336	VPMULLW      Y12, Y0, Y3
  1337	VPMULLW      Y13, Y0, Y4
  1338	VPMULLW      Y14, Y0, Y5
  1339	VPMULHW      Y11, Y1, Y11
  1340	VPMULHW      Y12, Y1, Y12
  1341	VPMULHW      Y13, Y1, Y13
  1342	VPMULHW      Y14, Y1, Y14
  1343	VPMULHW      Y2, Y15, Y2
  1344	VPMULHW      Y3, Y15, Y3
  1345	VPMULHW      Y4, Y15, Y4
  1346	VPMULHW      Y5, Y15, Y5
  1347	VPSUBW       Y2, Y11, Y11
  1348	VPSUBW       Y3, Y12, Y12
  1349	VPSUBW       Y4, Y13, Y13
  1350	VPSUBW       Y5, Y14, Y14
  1351	VMOVDQU      Y7, (AX)
  1352	VMOVDQU      Y8, 32(AX)
  1353	VMOVDQU      Y9, 64(AX)
  1354	VMOVDQU      Y10, 96(AX)
  1355	VMOVDQU      Y11, 256(AX)
  1356	VMOVDQU      Y12, 288(AX)
  1357	VMOVDQU      Y13, 320(AX)
  1358	VMOVDQU      Y14, 352(AX)
  1359	VPBROADCASTW 2104(CX), Y0
  1360	VPBROADCASTW 2106(CX), Y1
  1361	VMOVDQU      128(AX), Y7
  1362	VMOVDQU      160(AX), Y8
  1363	VMOVDQU      192(AX), Y9
  1364	VMOVDQU      224(AX), Y10
  1365	VMOVDQU      384(AX), Y11
  1366	VMOVDQU      416(AX), Y12
  1367	VMOVDQU      448(AX), Y13
  1368	VMOVDQU      480(AX), Y14
  1369	VPSUBW       Y7, Y11, Y2
  1370	VPSUBW       Y8, Y12, Y3
  1371	VPSUBW       Y9, Y13, Y4
  1372	VPADDW       Y7, Y11, Y7
  1373	VPADDW       Y8, Y12, Y8
  1374	VPADDW       Y9, Y13, Y9
  1375	VPMULLW      Y2, Y0, Y11
  1376	VPMULLW      Y3, Y0, Y12
  1377	VPSUBW       Y10, Y14, Y5
  1378	VPMULLW      Y4, Y0, Y13
  1379	VPADDW       Y10, Y14, Y10
  1380	VPMULLW      Y5, Y0, Y14
  1381	VPMULHW      Y2, Y1, Y2
  1382	VPMULHW      Y3, Y1, Y3
  1383	VPMULHW      Y4, Y1, Y4
  1384	VPMULHW      Y5, Y1, Y5
  1385	VPMULHW      Y11, Y15, Y11
  1386	VPMULHW      Y12, Y15, Y12
  1387	VPMULHW      Y13, Y15, Y13
  1388	VPMULHW      Y14, Y15, Y14
  1389	VPSUBW       Y11, Y2, Y11
  1390	VPSUBW       Y12, Y3, Y12
  1391	VPSUBW       Y13, Y4, Y13
  1392	VPSUBW       Y14, Y5, Y14
  1393	MOVL         $0xffffd8a1, CX
  1394	VMOVD        CX, X0
  1395	VPBROADCASTW X0, Y0
  1396	MOVL         $0x000005a1, CX
  1397	VMOVD        CX, X1
  1398	VPBROADCASTW X1, Y1
  1399	VPMULLW      Y7, Y0, Y2
  1400	VPMULLW      Y8, Y0, Y3
  1401	VPMULLW      Y9, Y0, Y4
  1402	VPMULLW      Y10, Y0, Y5
  1403	VPMULHW      Y7, Y1, Y7
  1404	VPMULHW      Y8, Y1, Y8
  1405	VPMULHW      Y9, Y1, Y9
  1406	VPMULHW      Y10, Y1, Y10
  1407	VPMULHW      Y2, Y15, Y2
  1408	VPMULHW      Y3, Y15, Y3
  1409	VPMULHW      Y4, Y15, Y4
  1410	VPMULHW      Y5, Y15, Y5
  1411	VPSUBW       Y2, Y7, Y7
  1412	VPSUBW       Y3, Y8, Y8
  1413	VPSUBW       Y4, Y9, Y9
  1414	VPSUBW       Y5, Y10, Y10
  1415	VPMULLW      Y11, Y0, Y2
  1416	VPMULLW      Y12, Y0, Y3
  1417	VPMULLW      Y13, Y0, Y4
  1418	VPMULLW      Y14, Y0, Y5
  1419	VPMULHW      Y11, Y1, Y11
  1420	VPMULHW      Y12, Y1, Y12
  1421	VPMULHW      Y13, Y1, Y13
  1422	VPMULHW      Y14, Y1, Y14
  1423	VPMULHW      Y2, Y15, Y2
  1424	VPMULHW      Y3, Y15, Y3
  1425	VPMULHW      Y4, Y15, Y4
  1426	VPMULHW      Y5, Y15, Y5
  1427	VPSUBW       Y2, Y11, Y11
  1428	VPSUBW       Y3, Y12, Y12
  1429	VPSUBW       Y4, Y13, Y13
  1430	VPSUBW       Y5, Y14, Y14
  1431	VMOVDQU      Y7, 128(AX)
  1432	VMOVDQU      Y8, 160(AX)
  1433	VMOVDQU      Y9, 192(AX)
  1434	VMOVDQU      Y10, 224(AX)
  1435	VMOVDQU      Y11, 384(AX)
  1436	VMOVDQU      Y12, 416(AX)
  1437	VMOVDQU      Y13, 448(AX)
  1438	VMOVDQU      Y14, 480(AX)
  1439	RET
  1440
  1441// func mulHatAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
  1442// Requires: AVX, AVX2. Fully unrolled: for each of four 128-byte chunks, multiplies lane-wise pairs of 16-bit values from a and b (Montgomery reductions modulo 3329, per-chunk constants from ZetasAVX2) and stores the results at the same offsets in p.
  1443TEXT ·mulHatAVX2(SB), NOSPLIT, $8-24 // $8-24: 8-byte frame, 24 bytes of arguments (three pointers)
  1444	MOVQ         p+0(FP), AX // AX = p (destination)
  1445	MOVQ         a+8(FP), CX // CX = a
  1446	MOVQ         b+16(FP), DX // DX = b
  1447	LEAQ         ·ZetasAVX2+0(SB), BX // BX = base address of the ZetasAVX2 table
  1448	MOVL         $0xfffff301, SI // only the low word 0xf301 is broadcast; 0xf301 * 0x0d01 == 1 (mod 2^16)
  1449	VMOVD        SI, X0
  1450	VPBROADCASTW X0, Y14 // Y14 = 0xf301 in every 16-bit lane (inverse of 3329 mod 2^16)
  1451	MOVL         $0x00000d01, SI // 0x0d01 = 3329
  1452	VMOVD        SI, X0
  1453	VPBROADCASTW X0, Y15 // Y15 = 3329 in every 16-bit lane
  1454	VMOVDQU      (CX), Y0 // chunk 0 (bytes 0..127): Y0..Y3 = a[0:64], called a0..a3 below
  1455	VMOVDQU      32(CX), Y1
  1456	VMOVDQU      64(CX), Y2
  1457	VMOVDQU      96(CX), Y3
  1458	VMOVDQU      (DX), Y4 // Y4..Y7 = b[0:64], called b0..b3 below
  1459	VMOVDQU      32(DX), Y5
  1460	VMOVDQU      64(DX), Y6
  1461	VMOVDQU      96(DX), Y7
  1462	VPMULLW      Y1, Y5, Y8 // low 16 bits of the four lane-wise products: Y8 = a1*b1
  1463	VPMULLW      Y0, Y4, Y9 // Y9 = a0*b0
  1464	VPMULLW      Y0, Y5, Y10 // Y10 = a0*b1
  1465	VPMULLW      Y1, Y4, Y11 // Y11 = a1*b0
  1466	VPMULLW      Y8, Y14, Y8 // multiply each low half by 0xf301 (Y14), keeping the low 16 bits
  1467	VPMULLW      Y9, Y14, Y9
  1468	VPMULLW      Y10, Y14, Y10
  1469	VPMULLW      Y11, Y14, Y11
  1470	VPMULHW      Y1, Y5, Y12 // signed high 16 bits of the same four products: Y12 = a1*b1
  1471	VPMULHW      Y0, Y4, Y13 // Y13 = a0*b0
  1472	VPMULHW      Y0, Y5, Y0 // Y0 = a0*b1 (a0 no longer needed)
  1473	VPMULHW      Y1, Y4, Y1 // Y1 = a1*b0 (a1 no longer needed)
  1474	VMOVDQA      Y12, Y4 // b0/b1 are dead now; reuse Y4 for the high half of a1*b1
  1475	VMOVDQA      Y13, Y5 // and Y5 for the high half of a0*b0
  1476	VPMULHW      Y8, Y15, Y8 // correction terms: high 16 bits of (low * 0xf301) * 3329
  1477	VPMULHW      Y9, Y15, Y9
  1478	VPMULHW      Y10, Y15, Y10
  1479	VPMULHW      Y11, Y15, Y11
  1480	VPSUBW       Y8, Y4, Y4 // high half minus correction = Montgomery-reduced product: Y4 = redc(a1*b1)
  1481	VPSUBW       Y9, Y5, Y5 // Y5 = redc(a0*b0)
  1482	VPSUBW       Y10, Y0, Y0 // Y0 = redc(a0*b1)
  1483	VPSUBW       Y11, Y1, Y1 // Y1 = redc(a1*b0)
  1484	VMOVDQU      800(BX), Y12 // table vector used for the low-half multiply below
  1485	VMOVDQU      832(BX), Y13 // table vector used for the high-half multiply below
  1486	VPMULLW      Y4, Y12, Y8 // low 16 bits of Y4 * 800(BX)
  1487	VPMULHW      Y4, Y13, Y4 // high 16 bits of Y4 * 832(BX); NOTE(review): presumably 800(BX) = 832(BX) * 0xf301 mod 2^16, making this a Montgomery multiply by 832(BX) - confirm against the table generator
  1488	VPMULHW      Y8, Y15, Y8 // correction term: high 16 bits of Y8 * 3329
  1489	VPSUBW       Y8, Y4, Y4 // Y4 = redc(a1*b1) scaled by the table factor
  1490	VPADDW       Y4, Y5, Y4 // Y4 = redc(a0*b0) + scaled redc(a1*b1)
  1491	VPADDW       Y0, Y1, Y5 // Y5 = redc(a0*b1) + redc(a1*b0)
  1492	VPMULLW      Y3, Y7, Y8 // second pair (a2,a3),(b2,b3): same sequence as for (a0,a1),(b0,b1)
  1493	VPMULLW      Y2, Y6, Y9
  1494	VPMULLW      Y2, Y7, Y10
  1495	VPMULLW      Y3, Y6, Y11
  1496	VPMULLW      Y8, Y14, Y8
  1497	VPMULLW      Y9, Y14, Y9
  1498	VPMULLW      Y10, Y14, Y10
  1499	VPMULLW      Y11, Y14, Y11
  1500	VPMULHW      Y3, Y7, Y12
  1501	VPMULHW      Y2, Y6, Y13
  1502	VPMULHW      Y2, Y7, Y2
  1503	VPMULHW      Y3, Y6, Y3
  1504	VMOVDQA      Y12, Y6
  1505	VMOVDQA      Y13, Y7
  1506	VPMULHW      Y8, Y15, Y8
  1507	VPMULHW      Y9, Y15, Y9
  1508	VPMULHW      Y10, Y15, Y10
  1509	VPMULHW      Y11, Y15, Y11
  1510	VPSUBW       Y8, Y6, Y6 // Y6 = redc(a3*b3)
  1511	VPSUBW       Y9, Y7, Y7 // Y7 = redc(a2*b2)
  1512	VPSUBW       Y10, Y2, Y2 // Y2 = redc(a2*b3)
  1513	VPSUBW       Y11, Y3, Y3 // Y3 = redc(a3*b2)
  1514	VMOVDQU      800(BX), Y12 // reload: Y12/Y13 were overwritten by the high-half products above
  1515	VMOVDQU      832(BX), Y13
  1516	VPMULLW      Y6, Y12, Y8
  1517	VPMULHW      Y6, Y13, Y6
  1518	VPMULHW      Y8, Y15, Y8
  1519	VPSUBW       Y8, Y6, Y6 // Y6 = redc(a3*b3) scaled by the same table factor
  1520	VPSUBW       Y6, Y7, Y6 // Y6 = redc(a2*b2) - scaled redc(a3*b3) (subtracted here; the first pair adds)
  1521	VPADDW       Y2, Y3, Y7 // Y7 = redc(a2*b3) + redc(a3*b2)
  1522	VMOVDQU      Y4, (AX) // store the four result vectors to p[0:64]
  1523	VMOVDQU      Y5, 32(AX)
  1524	VMOVDQU      Y6, 64(AX)
  1525	VMOVDQU      Y7, 96(AX)
  1526	VMOVDQU      128(CX), Y0 // chunk 1 (bytes 128..255): Y0..Y3 = a[64:128]; same computation as chunk 0
  1527	VMOVDQU      160(CX), Y1
  1528	VMOVDQU      192(CX), Y2
  1529	VMOVDQU      224(CX), Y3
  1530	VMOVDQU      128(DX), Y4 // Y4..Y7 = b[64:128]
  1531	VMOVDQU      160(DX), Y5
  1532	VMOVDQU      192(DX), Y6
  1533	VMOVDQU      224(DX), Y7
  1534	VPMULLW      Y1, Y5, Y8 // first pair: low halves of the four products
  1535	VPMULLW      Y0, Y4, Y9
  1536	VPMULLW      Y0, Y5, Y10
  1537	VPMULLW      Y1, Y4, Y11
  1538	VPMULLW      Y8, Y14, Y8
  1539	VPMULLW      Y9, Y14, Y9
  1540	VPMULLW      Y10, Y14, Y10
  1541	VPMULLW      Y11, Y14, Y11
  1542	VPMULHW      Y1, Y5, Y12 // high halves of the four products
  1543	VPMULHW      Y0, Y4, Y13
  1544	VPMULHW      Y0, Y5, Y0
  1545	VPMULHW      Y1, Y4, Y1
  1546	VMOVDQA      Y12, Y4
  1547	VMOVDQA      Y13, Y5
  1548	VPMULHW      Y8, Y15, Y8
  1549	VPMULHW      Y9, Y15, Y9
  1550	VPMULHW      Y10, Y15, Y10
  1551	VPMULHW      Y11, Y15, Y11
  1552	VPSUBW       Y8, Y4, Y4 // Montgomery-reduced products, as in chunk 0
  1553	VPSUBW       Y9, Y5, Y5
  1554	VPSUBW       Y10, Y0, Y0
  1555	VPSUBW       Y11, Y1, Y1
  1556	VMOVDQU      864(BX), Y12 // table vectors for this chunk
  1557	VMOVDQU      896(BX), Y13
  1558	VPMULLW      Y4, Y12, Y8
  1559	VPMULHW      Y4, Y13, Y4
  1560	VPMULHW      Y8, Y15, Y8
  1561	VPSUBW       Y8, Y4, Y4
  1562	VPADDW       Y4, Y5, Y4 // first pair adds the scaled term
  1563	VPADDW       Y0, Y1, Y5
  1564	VPMULLW      Y3, Y7, Y8 // second pair
  1565	VPMULLW      Y2, Y6, Y9
  1566	VPMULLW      Y2, Y7, Y10
  1567	VPMULLW      Y3, Y6, Y11
  1568	VPMULLW      Y8, Y14, Y8
  1569	VPMULLW      Y9, Y14, Y9
  1570	VPMULLW      Y10, Y14, Y10
  1571	VPMULLW      Y11, Y14, Y11
  1572	VPMULHW      Y3, Y7, Y12
  1573	VPMULHW      Y2, Y6, Y13
  1574	VPMULHW      Y2, Y7, Y2
  1575	VPMULHW      Y3, Y6, Y3
  1576	VMOVDQA      Y12, Y6
  1577	VMOVDQA      Y13, Y7
  1578	VPMULHW      Y8, Y15, Y8
  1579	VPMULHW      Y9, Y15, Y9
  1580	VPMULHW      Y10, Y15, Y10
  1581	VPMULHW      Y11, Y15, Y11
  1582	VPSUBW       Y8, Y6, Y6
  1583	VPSUBW       Y9, Y7, Y7
  1584	VPSUBW       Y10, Y2, Y2
  1585	VPSUBW       Y11, Y3, Y3
  1586	VMOVDQU      864(BX), Y12 // reload after Y12/Y13 were overwritten
  1587	VMOVDQU      896(BX), Y13
  1588	VPMULLW      Y6, Y12, Y8
  1589	VPMULHW      Y6, Y13, Y6
  1590	VPMULHW      Y8, Y15, Y8
  1591	VPSUBW       Y8, Y6, Y6
  1592	VPSUBW       Y6, Y7, Y6 // second pair subtracts the scaled term
  1593	VPADDW       Y2, Y3, Y7
  1594	VMOVDQU      Y4, 128(AX) // store results to p[64:128]
  1595	VMOVDQU      Y5, 160(AX)
  1596	VMOVDQU      Y6, 192(AX)
  1597	VMOVDQU      Y7, 224(AX)
  1598	VMOVDQU      256(CX), Y0 // chunk 2 (bytes 256..383): Y0..Y3 = a[128:192]; same computation as chunk 0
  1599	VMOVDQU      288(CX), Y1
  1600	VMOVDQU      320(CX), Y2
  1601	VMOVDQU      352(CX), Y3
  1602	VMOVDQU      256(DX), Y4 // Y4..Y7 = b[128:192]
  1603	VMOVDQU      288(DX), Y5
  1604	VMOVDQU      320(DX), Y6
  1605	VMOVDQU      352(DX), Y7
  1606	VPMULLW      Y1, Y5, Y8 // first pair: low halves of the four products
  1607	VPMULLW      Y0, Y4, Y9
  1608	VPMULLW      Y0, Y5, Y10
  1609	VPMULLW      Y1, Y4, Y11
  1610	VPMULLW      Y8, Y14, Y8
  1611	VPMULLW      Y9, Y14, Y9
  1612	VPMULLW      Y10, Y14, Y10
  1613	VPMULLW      Y11, Y14, Y11
  1614	VPMULHW      Y1, Y5, Y12 // high halves of the four products
  1615	VPMULHW      Y0, Y4, Y13
  1616	VPMULHW      Y0, Y5, Y0
  1617	VPMULHW      Y1, Y4, Y1
  1618	VMOVDQA      Y12, Y4
  1619	VMOVDQA      Y13, Y5
  1620	VPMULHW      Y8, Y15, Y8
  1621	VPMULHW      Y9, Y15, Y9
  1622	VPMULHW      Y10, Y15, Y10
  1623	VPMULHW      Y11, Y15, Y11
  1624	VPSUBW       Y8, Y4, Y4 // Montgomery-reduced products, as in chunk 0
  1625	VPSUBW       Y9, Y5, Y5
  1626	VPSUBW       Y10, Y0, Y0
  1627	VPSUBW       Y11, Y1, Y1
  1628	VMOVDQU      928(BX), Y12 // table vectors for this chunk
  1629	VMOVDQU      960(BX), Y13
  1630	VPMULLW      Y4, Y12, Y8
  1631	VPMULHW      Y4, Y13, Y4
  1632	VPMULHW      Y8, Y15, Y8
  1633	VPSUBW       Y8, Y4, Y4
  1634	VPADDW       Y4, Y5, Y4 // first pair adds the scaled term
  1635	VPADDW       Y0, Y1, Y5
  1636	VPMULLW      Y3, Y7, Y8 // second pair
  1637	VPMULLW      Y2, Y6, Y9
  1638	VPMULLW      Y2, Y7, Y10
  1639	VPMULLW      Y3, Y6, Y11
  1640	VPMULLW      Y8, Y14, Y8
  1641	VPMULLW      Y9, Y14, Y9
  1642	VPMULLW      Y10, Y14, Y10
  1643	VPMULLW      Y11, Y14, Y11
  1644	VPMULHW      Y3, Y7, Y12
  1645	VPMULHW      Y2, Y6, Y13
  1646	VPMULHW      Y2, Y7, Y2
  1647	VPMULHW      Y3, Y6, Y3
  1648	VMOVDQA      Y12, Y6
  1649	VMOVDQA      Y13, Y7
  1650	VPMULHW      Y8, Y15, Y8
  1651	VPMULHW      Y9, Y15, Y9
  1652	VPMULHW      Y10, Y15, Y10
  1653	VPMULHW      Y11, Y15, Y11
  1654	VPSUBW       Y8, Y6, Y6
  1655	VPSUBW       Y9, Y7, Y7
  1656	VPSUBW       Y10, Y2, Y2
  1657	VPSUBW       Y11, Y3, Y3
  1658	VMOVDQU      928(BX), Y12 // reload after Y12/Y13 were overwritten
  1659	VMOVDQU      960(BX), Y13
  1660	VPMULLW      Y6, Y12, Y8
  1661	VPMULHW      Y6, Y13, Y6
  1662	VPMULHW      Y8, Y15, Y8
  1663	VPSUBW       Y8, Y6, Y6
  1664	VPSUBW       Y6, Y7, Y6 // second pair subtracts the scaled term
  1665	VPADDW       Y2, Y3, Y7
  1666	VMOVDQU      Y4, 256(AX) // store results to p[128:192]
  1667	VMOVDQU      Y5, 288(AX)
  1668	VMOVDQU      Y6, 320(AX)
  1669	VMOVDQU      Y7, 352(AX)
  1670	VMOVDQU      384(CX), Y0 // chunk 3 (bytes 384..511): Y0..Y3 = a[192:256]; same computation as chunk 0
  1671	VMOVDQU      416(CX), Y1
  1672	VMOVDQU      448(CX), Y2
  1673	VMOVDQU      480(CX), Y3
  1674	VMOVDQU      384(DX), Y4 // Y4..Y7 = b[192:256]
  1675	VMOVDQU      416(DX), Y5
  1676	VMOVDQU      448(DX), Y6
  1677	VMOVDQU      480(DX), Y7
  1678	VPMULLW      Y1, Y5, Y8 // first pair: low halves of the four products
  1679	VPMULLW      Y0, Y4, Y9
  1680	VPMULLW      Y0, Y5, Y10
  1681	VPMULLW      Y1, Y4, Y11
  1682	VPMULLW      Y8, Y14, Y8
  1683	VPMULLW      Y9, Y14, Y9
  1684	VPMULLW      Y10, Y14, Y10
  1685	VPMULLW      Y11, Y14, Y11
  1686	VPMULHW      Y1, Y5, Y12 // high halves of the four products
  1687	VPMULHW      Y0, Y4, Y13
  1688	VPMULHW      Y0, Y5, Y0
  1689	VPMULHW      Y1, Y4, Y1
  1690	VMOVDQA      Y12, Y4
  1691	VMOVDQA      Y13, Y5
  1692	VPMULHW      Y8, Y15, Y8
  1693	VPMULHW      Y9, Y15, Y9
  1694	VPMULHW      Y10, Y15, Y10
  1695	VPMULHW      Y11, Y15, Y11
  1696	VPSUBW       Y8, Y4, Y4 // Montgomery-reduced products, as in chunk 0
  1697	VPSUBW       Y9, Y5, Y5
  1698	VPSUBW       Y10, Y0, Y0
  1699	VPSUBW       Y11, Y1, Y1
  1700	VMOVDQU      992(BX), Y12 // table vectors for this chunk
  1701	VMOVDQU      1024(BX), Y13
  1702	VPMULLW      Y4, Y12, Y8
  1703	VPMULHW      Y4, Y13, Y4
  1704	VPMULHW      Y8, Y15, Y8
  1705	VPSUBW       Y8, Y4, Y4
  1706	VPADDW       Y4, Y5, Y4 // first pair adds the scaled term
  1707	VPADDW       Y0, Y1, Y5
  1708	VPMULLW      Y3, Y7, Y8 // second pair
  1709	VPMULLW      Y2, Y6, Y9
  1710	VPMULLW      Y2, Y7, Y10
  1711	VPMULLW      Y3, Y6, Y11
  1712	VPMULLW      Y8, Y14, Y8
  1713	VPMULLW      Y9, Y14, Y9
  1714	VPMULLW      Y10, Y14, Y10
  1715	VPMULLW      Y11, Y14, Y11
  1716	VPMULHW      Y3, Y7, Y12
  1717	VPMULHW      Y2, Y6, Y13
  1718	VPMULHW      Y2, Y7, Y2
  1719	VPMULHW      Y3, Y6, Y3
  1720	VMOVDQA      Y12, Y6
  1721	VMOVDQA      Y13, Y7
  1722	VPMULHW      Y8, Y15, Y8
  1723	VPMULHW      Y9, Y15, Y9
  1724	VPMULHW      Y10, Y15, Y10
  1725	VPMULHW      Y11, Y15, Y11
  1726	VPSUBW       Y8, Y6, Y6
  1727	VPSUBW       Y9, Y7, Y7
  1728	VPSUBW       Y10, Y2, Y2
  1729	VPSUBW       Y11, Y3, Y3
  1730	VMOVDQU      992(BX), Y12 // reload after Y12/Y13 were overwritten
  1731	VMOVDQU      1024(BX), Y13
  1732	VPMULLW      Y6, Y12, Y8
  1733	VPMULHW      Y6, Y13, Y6
  1734	VPMULHW      Y8, Y15, Y8
  1735	VPSUBW       Y8, Y6, Y6
  1736	VPSUBW       Y6, Y7, Y6 // second pair subtracts the scaled term
  1737	VPADDW       Y2, Y3, Y7
  1738	VMOVDQU      Y4, 384(AX) // store results to p[192:256]
  1739	VMOVDQU      Y5, 416(AX)
  1740	VMOVDQU      Y6, 448(AX)
  1741	VMOVDQU      Y7, 480(AX)
  1742	RET
  1743
  1744// func detangleAVX2(p *[256]int16)
  1745// Requires: AVX, AVX2. Rearranges p in place, in two independent 256-byte halves, using only shift/blend/unpack/permute instructions (values are moved, never changed): four stages interleave register pairs at 16-, 32-, 64- and 128-bit granularity.
  1746TEXT ·detangleAVX2(SB), NOSPLIT, $0-8 // $0-8: no frame, 8 bytes of arguments (one pointer)
  1747	MOVQ        p+0(FP), AX // AX = p
  1748	VMOVDQU     (AX), Y0 // first half: Y0..Y7 = p[0:128]
  1749	VMOVDQU     32(AX), Y1
  1750	VMOVDQU     64(AX), Y2
  1751	VMOVDQU     96(AX), Y3
  1752	VMOVDQU     128(AX), Y4
  1753	VMOVDQU     160(AX), Y5
  1754	VMOVDQU     192(AX), Y6
  1755	VMOVDQU     224(AX), Y7
  1756	VPSLLD      $0x10, Y1, Y8 // stage 1 (16-bit words), pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7): even words of Y1 moved to odd slots
  1757	VPBLENDW    $0xaa, Y8, Y0, Y8 // Y8 = even words of Y0 in even slots, even words of Y1 in odd slots
  1758	VPSRLD      $0x10, Y0, Y0 // odd words of Y0 moved to even slots
  1759	VPBLENDW    $0xaa, Y1, Y0, Y1 // Y1 = odd words of Y0 in even slots, odd words of Y1 in odd slots
  1760	VMOVDQA     Y8, Y0 // Y0 = the even-word interleave
  1761	VPSLLD      $0x10, Y3, Y8 // same for (Y2,Y3)
  1762	VPBLENDW    $0xaa, Y8, Y2, Y8
  1763	VPSRLD      $0x10, Y2, Y2
  1764	VPBLENDW    $0xaa, Y3, Y2, Y3
  1765	VMOVDQA     Y8, Y2
  1766	VPSLLD      $0x10, Y5, Y8 // same for (Y4,Y5)
  1767	VPBLENDW    $0xaa, Y8, Y4, Y8
  1768	VPSRLD      $0x10, Y4, Y4
  1769	VPBLENDW    $0xaa, Y5, Y4, Y5
  1770	VMOVDQA     Y8, Y4
  1771	VPSLLD      $0x10, Y7, Y8 // same for (Y6,Y7)
  1772	VPBLENDW    $0xaa, Y8, Y6, Y8
  1773	VPSRLD      $0x10, Y6, Y6
  1774	VPBLENDW    $0xaa, Y7, Y6, Y7
  1775	VMOVDQA     Y8, Y6
  1776	VMOVSLDUP   Y2, Y8 // stage 2 (32-bit dwords), pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7): even dwords of Y2 copied into odd slots
  1777	VPBLENDD    $0xaa, Y8, Y0, Y8 // Y8 = even dwords of Y0 in even slots, even dwords of Y2 in odd slots
  1778	VPSRLQ      $0x20, Y0, Y0 // odd dwords of Y0 moved to even slots
  1779	VPBLENDD    $0xaa, Y2, Y0, Y2 // Y2 = odd dwords of Y0 in even slots, odd dwords of Y2 in odd slots
  1780	VMOVDQA     Y8, Y0 // Y0 = the even-dword interleave
  1781	VMOVSLDUP   Y3, Y8 // same for (Y1,Y3)
  1782	VPBLENDD    $0xaa, Y8, Y1, Y8
  1783	VPSRLQ      $0x20, Y1, Y1
  1784	VPBLENDD    $0xaa, Y3, Y1, Y3
  1785	VMOVDQA     Y8, Y1
  1786	VMOVSLDUP   Y6, Y8 // same for (Y4,Y6)
  1787	VPBLENDD    $0xaa, Y8, Y4, Y8
  1788	VPSRLQ      $0x20, Y4, Y4
  1789	VPBLENDD    $0xaa, Y6, Y4, Y6
  1790	VMOVDQA     Y8, Y4
  1791	VMOVSLDUP   Y7, Y8 // same for (Y5,Y7)
  1792	VPBLENDD    $0xaa, Y8, Y5, Y8
  1793	VPSRLQ      $0x20, Y5, Y5
  1794	VPBLENDD    $0xaa, Y7, Y5, Y7
  1795	VMOVDQA     Y8, Y5
  1796	VPUNPCKLQDQ Y1, Y0, Y8 // stage 3 (64-bit qwords), pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7): per 128-bit lane, low qword of Y0 then low qword of Y1
  1797	VPUNPCKHQDQ Y1, Y0, Y1 // Y1 = per 128-bit lane, high qword of Y0 then high qword of Y1
  1798	VMOVDQA     Y8, Y0 // Y0 = the low-qword interleave
  1799	VPUNPCKLQDQ Y3, Y2, Y8 // same for (Y2,Y3)
  1800	VPUNPCKHQDQ Y3, Y2, Y3
  1801	VMOVDQA     Y8, Y2
  1802	VPUNPCKLQDQ Y5, Y4, Y8 // same for (Y4,Y5)
  1803	VPUNPCKHQDQ Y5, Y4, Y5
  1804	VMOVDQA     Y8, Y4
  1805	VPUNPCKLQDQ Y7, Y6, Y8 // same for (Y6,Y7)
  1806	VPUNPCKHQDQ Y7, Y6, Y7
  1807	VMOVDQA     Y8, Y6
  1808	VPERM2I128  $0x20, Y2, Y0, Y8 // stage 4 (128-bit lanes), pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7): Y8 = low lane of Y0, then low lane of Y2
  1809	VPERM2I128  $0x31, Y2, Y0, Y2 // Y2 = high lane of Y0, then high lane of Y2
  1810	VMOVDQA     Y8, Y0 // Y0 = the low-lane combination
  1811	VPERM2I128  $0x20, Y3, Y1, Y8 // same for (Y1,Y3)
  1812	VPERM2I128  $0x31, Y3, Y1, Y3
  1813	VMOVDQA     Y8, Y1
  1814	VPERM2I128  $0x20, Y6, Y4, Y8 // same for (Y4,Y6)
  1815	VPERM2I128  $0x31, Y6, Y4, Y6
  1816	VMOVDQA     Y8, Y4
  1817	VPERM2I128  $0x20, Y7, Y5, Y8 // same for (Y5,Y7)
  1818	VPERM2I128  $0x31, Y7, Y5, Y7
  1819	VMOVDQA     Y8, Y5
  1820	VMOVDQU     Y0, (AX) // write the rearranged first half back to p[0:128]
  1821	VMOVDQU     Y1, 32(AX)
  1822	VMOVDQU     Y2, 64(AX)
  1823	VMOVDQU     Y3, 96(AX)
  1824	VMOVDQU     Y4, 128(AX)
  1825	VMOVDQU     Y5, 160(AX)
  1826	VMOVDQU     Y6, 192(AX)
  1827	VMOVDQU     Y7, 224(AX)
  1828	VMOVDQU     256(AX), Y0 // second half: Y0..Y7 = p[128:256]; the same four stages follow
  1829	VMOVDQU     288(AX), Y1
  1830	VMOVDQU     320(AX), Y2
  1831	VMOVDQU     352(AX), Y3
  1832	VMOVDQU     384(AX), Y4
  1833	VMOVDQU     416(AX), Y5
  1834	VMOVDQU     448(AX), Y6
  1835	VMOVDQU     480(AX), Y7
  1836	VPSLLD      $0x10, Y1, Y8 // stage 1 (16-bit words), pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7)
  1837	VPBLENDW    $0xaa, Y8, Y0, Y8
  1838	VPSRLD      $0x10, Y0, Y0
  1839	VPBLENDW    $0xaa, Y1, Y0, Y1
  1840	VMOVDQA     Y8, Y0
  1841	VPSLLD      $0x10, Y3, Y8
  1842	VPBLENDW    $0xaa, Y8, Y2, Y8
  1843	VPSRLD      $0x10, Y2, Y2
  1844	VPBLENDW    $0xaa, Y3, Y2, Y3
  1845	VMOVDQA     Y8, Y2
  1846	VPSLLD      $0x10, Y5, Y8
  1847	VPBLENDW    $0xaa, Y8, Y4, Y8
  1848	VPSRLD      $0x10, Y4, Y4
  1849	VPBLENDW    $0xaa, Y5, Y4, Y5
  1850	VMOVDQA     Y8, Y4
  1851	VPSLLD      $0x10, Y7, Y8
  1852	VPBLENDW    $0xaa, Y8, Y6, Y8
  1853	VPSRLD      $0x10, Y6, Y6
  1854	VPBLENDW    $0xaa, Y7, Y6, Y7
  1855	VMOVDQA     Y8, Y6
  1856	VMOVSLDUP   Y2, Y8 // stage 2 (32-bit dwords), pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7)
  1857	VPBLENDD    $0xaa, Y8, Y0, Y8
  1858	VPSRLQ      $0x20, Y0, Y0
  1859	VPBLENDD    $0xaa, Y2, Y0, Y2
  1860	VMOVDQA     Y8, Y0
  1861	VMOVSLDUP   Y3, Y8
  1862	VPBLENDD    $0xaa, Y8, Y1, Y8
  1863	VPSRLQ      $0x20, Y1, Y1
  1864	VPBLENDD    $0xaa, Y3, Y1, Y3
  1865	VMOVDQA     Y8, Y1
  1866	VMOVSLDUP   Y6, Y8
  1867	VPBLENDD    $0xaa, Y8, Y4, Y8
  1868	VPSRLQ      $0x20, Y4, Y4
  1869	VPBLENDD    $0xaa, Y6, Y4, Y6
  1870	VMOVDQA     Y8, Y4
  1871	VMOVSLDUP   Y7, Y8
  1872	VPBLENDD    $0xaa, Y8, Y5, Y8
  1873	VPSRLQ      $0x20, Y5, Y5
  1874	VPBLENDD    $0xaa, Y7, Y5, Y7
  1875	VMOVDQA     Y8, Y5
  1876	VPUNPCKLQDQ Y1, Y0, Y8 // stage 3 (64-bit qwords), pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7)
  1877	VPUNPCKHQDQ Y1, Y0, Y1
  1878	VMOVDQA     Y8, Y0
  1879	VPUNPCKLQDQ Y3, Y2, Y8
  1880	VPUNPCKHQDQ Y3, Y2, Y3
  1881	VMOVDQA     Y8, Y2
  1882	VPUNPCKLQDQ Y5, Y4, Y8
  1883	VPUNPCKHQDQ Y5, Y4, Y5
  1884	VMOVDQA     Y8, Y4
  1885	VPUNPCKLQDQ Y7, Y6, Y8
  1886	VPUNPCKHQDQ Y7, Y6, Y7
  1887	VMOVDQA     Y8, Y6
  1888	VPERM2I128  $0x20, Y2, Y0, Y8 // stage 4 (128-bit lanes), pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7)
  1889	VPERM2I128  $0x31, Y2, Y0, Y2
  1890	VMOVDQA     Y8, Y0
  1891	VPERM2I128  $0x20, Y3, Y1, Y8
  1892	VPERM2I128  $0x31, Y3, Y1, Y3
  1893	VMOVDQA     Y8, Y1
  1894	VPERM2I128  $0x20, Y6, Y4, Y8
  1895	VPERM2I128  $0x31, Y6, Y4, Y6
  1896	VMOVDQA     Y8, Y4
  1897	VPERM2I128  $0x20, Y7, Y5, Y8
  1898	VPERM2I128  $0x31, Y7, Y5, Y7
  1899	VMOVDQA     Y8, Y5
  1900	VMOVDQU     Y0, 256(AX) // write the rearranged second half back to p[128:256]
  1901	VMOVDQU     Y1, 288(AX)
  1902	VMOVDQU     Y2, 320(AX)
  1903	VMOVDQU     Y3, 352(AX)
  1904	VMOVDQU     Y4, 384(AX)
  1905	VMOVDQU     Y5, 416(AX)
  1906	VMOVDQU     Y6, 448(AX)
  1907	VMOVDQU     Y7, 480(AX)
  1908	RET
  1909
  1910// func tangleAVX2(p *[256]int16)
  1911// Requires: AVX, AVX2
  1912TEXT ·tangleAVX2(SB), NOSPLIT, $0-8
  1913	MOVQ        p+0(FP), AX
  1914	VMOVDQU     (AX), Y0
  1915	VMOVDQU     32(AX), Y1
  1916	VMOVDQU     64(AX), Y2
  1917	VMOVDQU     96(AX), Y3
  1918	VMOVDQU     128(AX), Y4
  1919	VMOVDQU     160(AX), Y5
  1920	VMOVDQU     192(AX), Y6
  1921	VMOVDQU     224(AX), Y7
  1922	VPERM2I128  $0x20, Y2, Y0, Y8
  1923	VPERM2I128  $0x31, Y2, Y0, Y2
  1924	VMOVDQA     Y8, Y0
  1925	VPERM2I128  $0x20, Y3, Y1, Y8
  1926	VPERM2I128  $0x31, Y3, Y1, Y3
  1927	VMOVDQA     Y8, Y1
  1928	VPERM2I128  $0x20, Y6, Y4, Y8
  1929	VPERM2I128  $0x31, Y6, Y4, Y6
  1930	VMOVDQA     Y8, Y4
  1931	VPERM2I128  $0x20, Y7, Y5, Y8
  1932	VPERM2I128  $0x31, Y7, Y5, Y7
  1933	VMOVDQA     Y8, Y5
  1934	VPUNPCKLQDQ Y1, Y0, Y8
  1935	VPUNPCKHQDQ Y1, Y0, Y1
  1936	VMOVDQA     Y8, Y0
  1937	VPUNPCKLQDQ Y3, Y2, Y8
  1938	VPUNPCKHQDQ Y3, Y2, Y3
  1939	VMOVDQA     Y8, Y2
  1940	VPUNPCKLQDQ Y5, Y4, Y8
  1941	VPUNPCKHQDQ Y5, Y4, Y5
  1942	VMOVDQA     Y8, Y4
  1943	VPUNPCKLQDQ Y7, Y6, Y8
  1944	VPUNPCKHQDQ Y7, Y6, Y7
  1945	VMOVDQA     Y8, Y6
  1946	VMOVSLDUP   Y2, Y8
  1947	VPBLENDD    $0xaa, Y8, Y0, Y8
  1948	VPSRLQ      $0x20, Y0, Y0
  1949	VPBLENDD    $0xaa, Y2, Y0, Y2
  1950	VMOVDQA     Y8, Y0
  1951	VMOVSLDUP   Y3, Y8
  1952	VPBLENDD    $0xaa, Y8, Y1, Y8
  1953	VPSRLQ      $0x20, Y1, Y1
  1954	VPBLENDD    $0xaa, Y3, Y1, Y3
  1955	VMOVDQA     Y8, Y1
  1956	VMOVSLDUP   Y6, Y8
  1957	VPBLENDD    $0xaa, Y8, Y4, Y8
  1958	VPSRLQ      $0x20, Y4, Y4
  1959	VPBLENDD    $0xaa, Y6, Y4, Y6
  1960	VMOVDQA     Y8, Y4
  1961	VMOVSLDUP   Y7, Y8
  1962	VPBLENDD    $0xaa, Y8, Y5, Y8
  1963	VPSRLQ      $0x20, Y5, Y5
  1964	VPBLENDD    $0xaa, Y7, Y5, Y7
  1965	VMOVDQA     Y8, Y5
  1966	VPSLLD      $0x10, Y1, Y8
  1967	VPBLENDW    $0xaa, Y8, Y0, Y8
  1968	VPSRLD      $0x10, Y0, Y0
  1969	VPBLENDW    $0xaa, Y1, Y0, Y1
  1970	VMOVDQA     Y8, Y0
  1971	VPSLLD      $0x10, Y3, Y8
  1972	VPBLENDW    $0xaa, Y8, Y2, Y8
  1973	VPSRLD      $0x10, Y2, Y2
  1974	VPBLENDW    $0xaa, Y3, Y2, Y3
  1975	VMOVDQA     Y8, Y2
  1976	VPSLLD      $0x10, Y5, Y8
  1977	VPBLENDW    $0xaa, Y8, Y4, Y8
  1978	VPSRLD      $0x10, Y4, Y4
  1979	VPBLENDW    $0xaa, Y5, Y4, Y5
  1980	VMOVDQA     Y8, Y4
  1981	VPSLLD      $0x10, Y7, Y8
  1982	VPBLENDW    $0xaa, Y8, Y6, Y8
  1983	VPSRLD      $0x10, Y6, Y6
  1984	VPBLENDW    $0xaa, Y7, Y6, Y7
  1985	VMOVDQA     Y8, Y6
  1986	VMOVDQU     Y0, (AX)
  1987	VMOVDQU     Y1, 32(AX)
  1988	VMOVDQU     Y2, 64(AX)
  1989	VMOVDQU     Y3, 96(AX)
  1990	VMOVDQU     Y4, 128(AX)
  1991	VMOVDQU     Y5, 160(AX)
  1992	VMOVDQU     Y6, 192(AX)
  1993	VMOVDQU     Y7, 224(AX)
  1994	VMOVDQU     256(AX), Y0
  1995	VMOVDQU     288(AX), Y1
  1996	VMOVDQU     320(AX), Y2
  1997	VMOVDQU     352(AX), Y3
  1998	VMOVDQU     384(AX), Y4
  1999	VMOVDQU     416(AX), Y5
  2000	VMOVDQU     448(AX), Y6
  2001	VMOVDQU     480(AX), Y7
  2002	VPERM2I128  $0x20, Y2, Y0, Y8
  2003	VPERM2I128  $0x31, Y2, Y0, Y2
  2004	VMOVDQA     Y8, Y0
  2005	VPERM2I128  $0x20, Y3, Y1, Y8
  2006	VPERM2I128  $0x31, Y3, Y1, Y3
  2007	VMOVDQA     Y8, Y1
  2008	VPERM2I128  $0x20, Y6, Y4, Y8
  2009	VPERM2I128  $0x31, Y6, Y4, Y6
  2010	VMOVDQA     Y8, Y4
  2011	VPERM2I128  $0x20, Y7, Y5, Y8
  2012	VPERM2I128  $0x31, Y7, Y5, Y7
  2013	VMOVDQA     Y8, Y5
  2014	VPUNPCKLQDQ Y1, Y0, Y8
  2015	VPUNPCKHQDQ Y1, Y0, Y1
  2016	VMOVDQA     Y8, Y0
  2017	VPUNPCKLQDQ Y3, Y2, Y8
  2018	VPUNPCKHQDQ Y3, Y2, Y3
  2019	VMOVDQA     Y8, Y2
  2020	VPUNPCKLQDQ Y5, Y4, Y8
  2021	VPUNPCKHQDQ Y5, Y4, Y5
  2022	VMOVDQA     Y8, Y4
  2023	VPUNPCKLQDQ Y7, Y6, Y8
  2024	VPUNPCKHQDQ Y7, Y6, Y7
  2025	VMOVDQA     Y8, Y6
  2026	VMOVSLDUP   Y2, Y8
  2027	VPBLENDD    $0xaa, Y8, Y0, Y8
  2028	VPSRLQ      $0x20, Y0, Y0
  2029	VPBLENDD    $0xaa, Y2, Y0, Y2
  2030	VMOVDQA     Y8, Y0
  2031	VMOVSLDUP   Y3, Y8
  2032	VPBLENDD    $0xaa, Y8, Y1, Y8
  2033	VPSRLQ      $0x20, Y1, Y1
  2034	VPBLENDD    $0xaa, Y3, Y1, Y3
  2035	VMOVDQA     Y8, Y1
  2036	VMOVSLDUP   Y6, Y8
  2037	VPBLENDD    $0xaa, Y8, Y4, Y8
  2038	VPSRLQ      $0x20, Y4, Y4
  2039	VPBLENDD    $0xaa, Y6, Y4, Y6
  2040	VMOVDQA     Y8, Y4
  2041	VMOVSLDUP   Y7, Y8
  2042	VPBLENDD    $0xaa, Y8, Y5, Y8
  2043	VPSRLQ      $0x20, Y5, Y5
  2044	VPBLENDD    $0xaa, Y7, Y5, Y7
  2045	VMOVDQA     Y8, Y5
  2046	VPSLLD      $0x10, Y1, Y8
  2047	VPBLENDW    $0xaa, Y8, Y0, Y8
  2048	VPSRLD      $0x10, Y0, Y0
  2049	VPBLENDW    $0xaa, Y1, Y0, Y1
  2050	VMOVDQA     Y8, Y0
  2051	VPSLLD      $0x10, Y3, Y8
  2052	VPBLENDW    $0xaa, Y8, Y2, Y8
  2053	VPSRLD      $0x10, Y2, Y2
  2054	VPBLENDW    $0xaa, Y3, Y2, Y3
  2055	VMOVDQA     Y8, Y2
  2056	VPSLLD      $0x10, Y5, Y8
  2057	VPBLENDW    $0xaa, Y8, Y4, Y8
  2058	VPSRLD      $0x10, Y4, Y4
  2059	VPBLENDW    $0xaa, Y5, Y4, Y5
  2060	VMOVDQA     Y8, Y4
  2061	VPSLLD      $0x10, Y7, Y8
  2062	VPBLENDW    $0xaa, Y8, Y6, Y8
  2063	VPSRLD      $0x10, Y6, Y6
  2064	VPBLENDW    $0xaa, Y7, Y6, Y7
  2065	VMOVDQA     Y8, Y6
  2066	VMOVDQU     Y0, 256(AX)
  2067	VMOVDQU     Y1, 288(AX)
  2068	VMOVDQU     Y2, 320(AX)
  2069	VMOVDQU     Y3, 352(AX)
  2070	VMOVDQU     Y4, 384(AX)
  2071	VMOVDQU     Y5, 416(AX)
  2072	VMOVDQU     Y6, 448(AX)
  2073	VMOVDQU     Y7, 480(AX)
  2074	RET
  2075
  2076// func barrettReduceAVX2(p *[256]int16)
  2077// Requires: AVX, AVX2
  2078TEXT ·barrettReduceAVX2(SB), NOSPLIT, $0-8
  2079	MOVQ         p+0(FP), AX
  2080	MOVL         $0x00000d01, CX
  2081	VMOVD        CX, X0
  2082	VPBROADCASTW X0, Y9
  2083	MOVL         $0x00004ebf, CX
  2084	VMOVD        CX, X0
  2085	VPBROADCASTW X0, Y8
  2086	VMOVDQU      (AX), Y0
  2087	VMOVDQU      32(AX), Y1
  2088	VMOVDQU      64(AX), Y2
  2089	VMOVDQU      96(AX), Y3
  2090	VPMULHW      Y8, Y0, Y4
  2091	VPMULHW      Y8, Y1, Y5
  2092	VPMULHW      Y8, Y2, Y6
  2093	VPMULHW      Y8, Y3, Y7
  2094	VPSRAW       $0x0a, Y4, Y4
  2095	VPSRAW       $0x0a, Y5, Y5
  2096	VPSRAW       $0x0a, Y6, Y6
  2097	VPSRAW       $0x0a, Y7, Y7
  2098	VPMULLW      Y9, Y4, Y4
  2099	VPMULLW      Y9, Y5, Y5
  2100	VPMULLW      Y9, Y6, Y6
  2101	VPMULLW      Y9, Y7, Y7
  2102	VPSUBW       Y4, Y0, Y0
  2103	VPSUBW       Y5, Y1, Y1
  2104	VPSUBW       Y6, Y2, Y2
  2105	VPSUBW       Y7, Y3, Y3
  2106	VMOVDQU      Y0, (AX)
  2107	VMOVDQU      Y1, 32(AX)
  2108	VMOVDQU      Y2, 64(AX)
  2109	VMOVDQU      Y3, 96(AX)
  2110	VMOVDQU      128(AX), Y0
  2111	VMOVDQU      160(AX), Y1
  2112	VMOVDQU      192(AX), Y2
  2113	VMOVDQU      224(AX), Y3
  2114	VPMULHW      Y8, Y0, Y4
  2115	VPMULHW      Y8, Y1, Y5
  2116	VPMULHW      Y8, Y2, Y6
  2117	VPMULHW      Y8, Y3, Y7
  2118	VPSRAW       $0x0a, Y4, Y4
  2119	VPSRAW       $0x0a, Y5, Y5
  2120	VPSRAW       $0x0a, Y6, Y6
  2121	VPSRAW       $0x0a, Y7, Y7
  2122	VPMULLW      Y9, Y4, Y4
  2123	VPMULLW      Y9, Y5, Y5
  2124	VPMULLW      Y9, Y6, Y6
  2125	VPMULLW      Y9, Y7, Y7
  2126	VPSUBW       Y4, Y0, Y0
  2127	VPSUBW       Y5, Y1, Y1
  2128	VPSUBW       Y6, Y2, Y2
  2129	VPSUBW       Y7, Y3, Y3
  2130	VMOVDQU      Y0, 128(AX)
  2131	VMOVDQU      Y1, 160(AX)
  2132	VMOVDQU      Y2, 192(AX)
  2133	VMOVDQU      Y3, 224(AX)
  2134	VMOVDQU      256(AX), Y0
  2135	VMOVDQU      288(AX), Y1
  2136	VMOVDQU      320(AX), Y2
  2137	VMOVDQU      352(AX), Y3
  2138	VPMULHW      Y8, Y0, Y4
  2139	VPMULHW      Y8, Y1, Y5
  2140	VPMULHW      Y8, Y2, Y6
  2141	VPMULHW      Y8, Y3, Y7
  2142	VPSRAW       $0x0a, Y4, Y4
  2143	VPSRAW       $0x0a, Y5, Y5
  2144	VPSRAW       $0x0a, Y6, Y6
  2145	VPSRAW       $0x0a, Y7, Y7
  2146	VPMULLW      Y9, Y4, Y4
  2147	VPMULLW      Y9, Y5, Y5
  2148	VPMULLW      Y9, Y6, Y6
  2149	VPMULLW      Y9, Y7, Y7
  2150	VPSUBW       Y4, Y0, Y0
  2151	VPSUBW       Y5, Y1, Y1
  2152	VPSUBW       Y6, Y2, Y2
  2153	VPSUBW       Y7, Y3, Y3
  2154	VMOVDQU      Y0, 256(AX)
  2155	VMOVDQU      Y1, 288(AX)
  2156	VMOVDQU      Y2, 320(AX)
  2157	VMOVDQU      Y3, 352(AX)
  2158	VMOVDQU      384(AX), Y0
  2159	VMOVDQU      416(AX), Y1
  2160	VMOVDQU      448(AX), Y2
  2161	VMOVDQU      480(AX), Y3
  2162	VPMULHW      Y8, Y0, Y4
  2163	VPMULHW      Y8, Y1, Y5
  2164	VPMULHW      Y8, Y2, Y6
  2165	VPMULHW      Y8, Y3, Y7
  2166	VPSRAW       $0x0a, Y4, Y4
  2167	VPSRAW       $0x0a, Y5, Y5
  2168	VPSRAW       $0x0a, Y6, Y6
  2169	VPSRAW       $0x0a, Y7, Y7
  2170	VPMULLW      Y9, Y4, Y4
  2171	VPMULLW      Y9, Y5, Y5
  2172	VPMULLW      Y9, Y6, Y6
  2173	VPMULLW      Y9, Y7, Y7
  2174	VPSUBW       Y4, Y0, Y0
  2175	VPSUBW       Y5, Y1, Y1
  2176	VPSUBW       Y6, Y2, Y2
  2177	VPSUBW       Y7, Y3, Y3
  2178	VMOVDQU      Y0, 384(AX)
  2179	VMOVDQU      Y1, 416(AX)
  2180	VMOVDQU      Y2, 448(AX)
  2181	VMOVDQU      Y3, 480(AX)
  2182	RET
  2183
  2184// func normalizeAVX2(p *[256]int16)
  2185// Requires: AVX, AVX2
  2186TEXT ·normalizeAVX2(SB), NOSPLIT, $0-8
  2187	MOVQ         p+0(FP), AX
  2188	MOVL         $0x00000d01, CX
  2189	VMOVD        CX, X0
  2190	VPBROADCASTW X0, Y9
  2191	MOVL         $0x00004ebf, CX
  2192	VMOVD        CX, X0
  2193	VPBROADCASTW X0, Y8
  2194	VMOVDQU      (AX), Y0
  2195	VMOVDQU      32(AX), Y1
  2196	VMOVDQU      64(AX), Y2
  2197	VMOVDQU      96(AX), Y3
  2198	VPMULHW      Y8, Y0, Y4
  2199	VPMULHW      Y8, Y1, Y5
  2200	VPMULHW      Y8, Y2, Y6
  2201	VPMULHW      Y8, Y3, Y7
  2202	VPSRAW       $0x0a, Y4, Y4
  2203	VPSRAW       $0x0a, Y5, Y5
  2204	VPSRAW       $0x0a, Y6, Y6
  2205	VPSRAW       $0x0a, Y7, Y7
  2206	VPMULLW      Y9, Y4, Y4
  2207	VPMULLW      Y9, Y5, Y5
  2208	VPMULLW      Y9, Y6, Y6
  2209	VPMULLW      Y9, Y7, Y7
  2210	VPSUBW       Y4, Y0, Y0
  2211	VPSUBW       Y5, Y1, Y1
  2212	VPSUBW       Y6, Y2, Y2
  2213	VPSUBW       Y7, Y3, Y3
  2214	VPSUBW       Y9, Y0, Y0
  2215	VPSUBW       Y9, Y1, Y1
  2216	VPSUBW       Y9, Y2, Y2
  2217	VPSUBW       Y9, Y3, Y3
  2218	VPSRAW       $0x0f, Y0, Y4
  2219	VPSRAW       $0x0f, Y1, Y5
  2220	VPSRAW       $0x0f, Y2, Y6
  2221	VPSRAW       $0x0f, Y3, Y7
  2222	VPAND        Y4, Y9, Y4
  2223	VPAND        Y5, Y9, Y5
  2224	VPAND        Y6, Y9, Y6
  2225	VPAND        Y7, Y9, Y7
  2226	VPADDW       Y0, Y4, Y0
  2227	VPADDW       Y1, Y5, Y1
  2228	VPADDW       Y2, Y6, Y2
  2229	VPADDW       Y3, Y7, Y3
  2230	VMOVDQU      Y0, (AX)
  2231	VMOVDQU      Y1, 32(AX)
  2232	VMOVDQU      Y2, 64(AX)
  2233	VMOVDQU      Y3, 96(AX)
  2234	VMOVDQU      128(AX), Y0
  2235	VMOVDQU      160(AX), Y1
  2236	VMOVDQU      192(AX), Y2
  2237	VMOVDQU      224(AX), Y3
  2238	VPMULHW      Y8, Y0, Y4
  2239	VPMULHW      Y8, Y1, Y5
  2240	VPMULHW      Y8, Y2, Y6
  2241	VPMULHW      Y8, Y3, Y7
  2242	VPSRAW       $0x0a, Y4, Y4
  2243	VPSRAW       $0x0a, Y5, Y5
  2244	VPSRAW       $0x0a, Y6, Y6
  2245	VPSRAW       $0x0a, Y7, Y7
  2246	VPMULLW      Y9, Y4, Y4
  2247	VPMULLW      Y9, Y5, Y5
  2248	VPMULLW      Y9, Y6, Y6
  2249	VPMULLW      Y9, Y7, Y7
  2250	VPSUBW       Y4, Y0, Y0
  2251	VPSUBW       Y5, Y1, Y1
  2252	VPSUBW       Y6, Y2, Y2
  2253	VPSUBW       Y7, Y3, Y3
  2254	VPSUBW       Y9, Y0, Y0
  2255	VPSUBW       Y9, Y1, Y1
  2256	VPSUBW       Y9, Y2, Y2
  2257	VPSUBW       Y9, Y3, Y3
  2258	VPSRAW       $0x0f, Y0, Y4
  2259	VPSRAW       $0x0f, Y1, Y5
  2260	VPSRAW       $0x0f, Y2, Y6
  2261	VPSRAW       $0x0f, Y3, Y7
  2262	VPAND        Y4, Y9, Y4
  2263	VPAND        Y5, Y9, Y5
  2264	VPAND        Y6, Y9, Y6
  2265	VPAND        Y7, Y9, Y7
  2266	VPADDW       Y0, Y4, Y0
  2267	VPADDW       Y1, Y5, Y1
  2268	VPADDW       Y2, Y6, Y2
  2269	VPADDW       Y3, Y7, Y3
  2270	VMOVDQU      Y0, 128(AX)
  2271	VMOVDQU      Y1, 160(AX)
  2272	VMOVDQU      Y2, 192(AX)
  2273	VMOVDQU      Y3, 224(AX)
  2274	VMOVDQU      256(AX), Y0
  2275	VMOVDQU      288(AX), Y1
  2276	VMOVDQU      320(AX), Y2
  2277	VMOVDQU      352(AX), Y3
  2278	VPMULHW      Y8, Y0, Y4
  2279	VPMULHW      Y8, Y1, Y5
  2280	VPMULHW      Y8, Y2, Y6
  2281	VPMULHW      Y8, Y3, Y7
  2282	VPSRAW       $0x0a, Y4, Y4
  2283	VPSRAW       $0x0a, Y5, Y5
  2284	VPSRAW       $0x0a, Y6, Y6
  2285	VPSRAW       $0x0a, Y7, Y7
  2286	VPMULLW      Y9, Y4, Y4
  2287	VPMULLW      Y9, Y5, Y5
  2288	VPMULLW      Y9, Y6, Y6
  2289	VPMULLW      Y9, Y7, Y7
  2290	VPSUBW       Y4, Y0, Y0
  2291	VPSUBW       Y5, Y1, Y1
  2292	VPSUBW       Y6, Y2, Y2
  2293	VPSUBW       Y7, Y3, Y3
  2294	VPSUBW       Y9, Y0, Y0
  2295	VPSUBW       Y9, Y1, Y1
  2296	VPSUBW       Y9, Y2, Y2
  2297	VPSUBW       Y9, Y3, Y3
  2298	VPSRAW       $0x0f, Y0, Y4
  2299	VPSRAW       $0x0f, Y1, Y5
  2300	VPSRAW       $0x0f, Y2, Y6
  2301	VPSRAW       $0x0f, Y3, Y7
  2302	VPAND        Y4, Y9, Y4
  2303	VPAND        Y5, Y9, Y5
  2304	VPAND        Y6, Y9, Y6
  2305	VPAND        Y7, Y9, Y7
  2306	VPADDW       Y0, Y4, Y0
  2307	VPADDW       Y1, Y5, Y1
  2308	VPADDW       Y2, Y6, Y2
  2309	VPADDW       Y3, Y7, Y3
  2310	VMOVDQU      Y0, 256(AX)
  2311	VMOVDQU      Y1, 288(AX)
  2312	VMOVDQU      Y2, 320(AX)
  2313	VMOVDQU      Y3, 352(AX)
  2314	VMOVDQU      384(AX), Y0
  2315	VMOVDQU      416(AX), Y1
  2316	VMOVDQU      448(AX), Y2
  2317	VMOVDQU      480(AX), Y3
  2318	VPMULHW      Y8, Y0, Y4
  2319	VPMULHW      Y8, Y1, Y5
  2320	VPMULHW      Y8, Y2, Y6
  2321	VPMULHW      Y8, Y3, Y7
  2322	VPSRAW       $0x0a, Y4, Y4
  2323	VPSRAW       $0x0a, Y5, Y5
  2324	VPSRAW       $0x0a, Y6, Y6
  2325	VPSRAW       $0x0a, Y7, Y7
  2326	VPMULLW      Y9, Y4, Y4
  2327	VPMULLW      Y9, Y5, Y5
  2328	VPMULLW      Y9, Y6, Y6
  2329	VPMULLW      Y9, Y7, Y7
  2330	VPSUBW       Y4, Y0, Y0
  2331	VPSUBW       Y5, Y1, Y1
  2332	VPSUBW       Y6, Y2, Y2
  2333	VPSUBW       Y7, Y3, Y3
  2334	VPSUBW       Y9, Y0, Y0
  2335	VPSUBW       Y9, Y1, Y1
  2336	VPSUBW       Y9, Y2, Y2
  2337	VPSUBW       Y9, Y3, Y3
  2338	VPSRAW       $0x0f, Y0, Y4
  2339	VPSRAW       $0x0f, Y1, Y5
  2340	VPSRAW       $0x0f, Y2, Y6
  2341	VPSRAW       $0x0f, Y3, Y7
  2342	VPAND        Y4, Y9, Y4
  2343	VPAND        Y5, Y9, Y5
  2344	VPAND        Y6, Y9, Y6
  2345	VPAND        Y7, Y9, Y7
  2346	VPADDW       Y0, Y4, Y0
  2347	VPADDW       Y1, Y5, Y1
  2348	VPADDW       Y2, Y6, Y2
  2349	VPADDW       Y3, Y7, Y3
  2350	VMOVDQU      Y0, 384(AX)
  2351	VMOVDQU      Y1, 416(AX)
  2352	VMOVDQU      Y2, 448(AX)
  2353	VMOVDQU      Y3, 480(AX)
  2354	RET

View as plain text