...

Text file src/github.com/zeebo/xxh3/accum_vector_avx_amd64.s

Documentation: github.com/zeebo/xxh3

     1// Code generated by command: go run gen.go -avx -out ../accum_vector_avx_amd64.s -pkg xxh3. DO NOT EDIT.
     2
     3#include "textflag.h"
     4
     5DATA prime_avx<>+0(SB)/8, $0x000000009e3779b1 // 64-bit lane 0: 0x9e3779b1 in the low 32 bits, high 32 bits zero
     6DATA prime_avx<>+8(SB)/8, $0x000000009e3779b1 // 64-bit lane 1: same value
     7DATA prime_avx<>+16(SB)/8, $0x000000009e3779b1 // 64-bit lane 2: same value
     8DATA prime_avx<>+24(SB)/8, $0x000000009e3779b1 // 64-bit lane 3: same value
     9GLOBL prime_avx<>(SB), RODATA|NOPTR, $32 // file-local, read-only, pointer-free, 32 bytes (one YMM load); multiplier for the scramble steps below
    10
    11// func accumAVX2(acc *[8]uint64, data *byte, key *byte, len uint64)
    12// Requires: AVX, AVX2, MMX+
      //
      // Accumulates len bytes at data into the eight 64-bit lanes at acc.
      // Input is consumed in 64-byte stripes (two 32-byte YMM loads each):
      //   accum_large: while more than 1024 bytes remain, run 16 unrolled
      //                stripes and then scramble the accumulators.
      //   accum:       while more than 64 bytes remain, run one stripe.
      //   finalize:    if any bytes remain, run one stripe over the 64 bytes
      //                that end at the end of the input.
      // Stripe i of a 1024-byte block is XORed with key bytes 8*i .. 8*i+63,
      // the scramble uses key bytes 128..191 and the final stripe uses key
      // bytes 121..184, so all key reads fall in the range 0..191.
      //
      // Registers: AX = acc, CX = data cursor, DX = key base (never advanced),
      // BX = key cursor for the accum loop, SI = bytes remaining,
      // Y1 = acc[0:4], Y2 = acc[4:8], Y0 = prime_avx (reused as scratch from
      // accum onwards), Y3-Y8 = scratch.
      //
      // NOTE(review): finalize loads from data+len-64, which lies before data
      // when len < 64; presumably callers only pass longer inputs - confirm.
      // NOTE(review): JLE is a signed comparison although len is declared
      // uint64; this only differs from an unsigned compare for len >= 1<<63.
    13TEXT ·accumAVX2(SB), NOSPLIT, $0-32
    14	MOVQ    acc+0(FP), AX // AX = acc
    15	MOVQ    data+8(FP), CX // CX = data cursor
    16	MOVQ    key+16(FP), DX // DX = key base, used with fixed offsets
    17	MOVQ    key+16(FP), BX // BX = second copy of key, advanced 8 bytes per stripe in accum
    18	MOVQ    len+24(FP), SI // SI = bytes remaining
    19	VMOVDQU (AX), Y1 // Y1 = acc[0:4]
    20	VMOVDQU 32(AX), Y2 // Y2 = acc[4:8]
    21	VMOVDQU prime_avx<>+0(SB), Y0 // Y0 = multiplier for the scramble
    22
    23accum_large:
    24	CMPQ       SI, $0x00000400 // a full block is processed only while more than 1024 bytes remain
    25	JLE        accum // SI <= 1024 (signed compare): continue with the per-stripe loop
    26	VMOVDQU    (CX), Y3 // stripe 0: data bytes 0..31
    27	VMOVDQU    32(CX), Y6 // stripe 0: data bytes 32..63
    28	PREFETCHT0 512(CX) // prefetch 512 bytes past the start of this stripe
    29	VPXOR      (DX), Y3, Y4 // Y4 = data ^ key[0..31]
    30	VPXOR      32(DX), Y6, Y7 // Y7 = data ^ key[32..63]
    31	VPSHUFD    $0x31, Y4, Y5 // low dword of each qword of Y5 = high dword of the same qword of Y4
    32	VPSHUFD    $0x31, Y7, Y8 // same for the upper half of the stripe
    33	VPMULUDQ   Y4, Y5, Y4 // per qword: Y4 = lo32(Y4) * hi32(Y4)
    34	VPMULUDQ   Y7, Y8, Y7 // per qword: Y7 = lo32(Y7) * hi32(Y7)
    35	VPSHUFD    $0x4e, Y3, Y3 // swap the two qwords in each 128-bit lane of the raw data
    36	VPSHUFD    $0x4e, Y6, Y6 // same for the upper half of the stripe
    37	VPADDQ     Y1, Y3, Y1 // acc[0:4] += swapped data
    38	VPADDQ     Y1, Y4, Y1 // acc[0:4] += product
    39	VPADDQ     Y2, Y6, Y2 // acc[4:8] += swapped data
    40	VPADDQ     Y2, Y7, Y2 // acc[4:8] += product
    41	VMOVDQU    64(CX), Y3 // stripe 1: data 64..127, key 8..71
    42	VMOVDQU    96(CX), Y6
    43	PREFETCHT0 576(CX)
    44	VPXOR      8(DX), Y3, Y4
    45	VPXOR      40(DX), Y6, Y7
    46	VPSHUFD    $0x31, Y4, Y5
    47	VPSHUFD    $0x31, Y7, Y8
    48	VPMULUDQ   Y4, Y5, Y4
    49	VPMULUDQ   Y7, Y8, Y7
    50	VPSHUFD    $0x4e, Y3, Y3
    51	VPSHUFD    $0x4e, Y6, Y6
    52	VPADDQ     Y1, Y3, Y1
    53	VPADDQ     Y1, Y4, Y1
    54	VPADDQ     Y2, Y6, Y2
    55	VPADDQ     Y2, Y7, Y2
    56	VMOVDQU    128(CX), Y3 // stripe 2: data 128..191, key 16..79
    57	VMOVDQU    160(CX), Y6
    58	PREFETCHT0 640(CX)
    59	VPXOR      16(DX), Y3, Y4
    60	VPXOR      48(DX), Y6, Y7
    61	VPSHUFD    $0x31, Y4, Y5
    62	VPSHUFD    $0x31, Y7, Y8
    63	VPMULUDQ   Y4, Y5, Y4
    64	VPMULUDQ   Y7, Y8, Y7
    65	VPSHUFD    $0x4e, Y3, Y3
    66	VPSHUFD    $0x4e, Y6, Y6
    67	VPADDQ     Y1, Y3, Y1
    68	VPADDQ     Y1, Y4, Y1
    69	VPADDQ     Y2, Y6, Y2
    70	VPADDQ     Y2, Y7, Y2
    71	VMOVDQU    192(CX), Y3 // stripe 3: data 192..255, key 24..87
    72	VMOVDQU    224(CX), Y6
    73	PREFETCHT0 704(CX)
    74	VPXOR      24(DX), Y3, Y4
    75	VPXOR      56(DX), Y6, Y7
    76	VPSHUFD    $0x31, Y4, Y5
    77	VPSHUFD    $0x31, Y7, Y8
    78	VPMULUDQ   Y4, Y5, Y4
    79	VPMULUDQ   Y7, Y8, Y7
    80	VPSHUFD    $0x4e, Y3, Y3
    81	VPSHUFD    $0x4e, Y6, Y6
    82	VPADDQ     Y1, Y3, Y1
    83	VPADDQ     Y1, Y4, Y1
    84	VPADDQ     Y2, Y6, Y2
    85	VPADDQ     Y2, Y7, Y2
    86	VMOVDQU    256(CX), Y3 // stripe 4: data 256..319, key 32..95
    87	VMOVDQU    288(CX), Y6
    88	PREFETCHT0 768(CX)
    89	VPXOR      32(DX), Y3, Y4
    90	VPXOR      64(DX), Y6, Y7
    91	VPSHUFD    $0x31, Y4, Y5
    92	VPSHUFD    $0x31, Y7, Y8
    93	VPMULUDQ   Y4, Y5, Y4
    94	VPMULUDQ   Y7, Y8, Y7
    95	VPSHUFD    $0x4e, Y3, Y3
    96	VPSHUFD    $0x4e, Y6, Y6
    97	VPADDQ     Y1, Y3, Y1
    98	VPADDQ     Y1, Y4, Y1
    99	VPADDQ     Y2, Y6, Y2
   100	VPADDQ     Y2, Y7, Y2
   101	VMOVDQU    320(CX), Y3 // stripe 5: data 320..383, key 40..103
   102	VMOVDQU    352(CX), Y6
   103	PREFETCHT0 832(CX)
   104	VPXOR      40(DX), Y3, Y4
   105	VPXOR      72(DX), Y6, Y7
   106	VPSHUFD    $0x31, Y4, Y5
   107	VPSHUFD    $0x31, Y7, Y8
   108	VPMULUDQ   Y4, Y5, Y4
   109	VPMULUDQ   Y7, Y8, Y7
   110	VPSHUFD    $0x4e, Y3, Y3
   111	VPSHUFD    $0x4e, Y6, Y6
   112	VPADDQ     Y1, Y3, Y1
   113	VPADDQ     Y1, Y4, Y1
   114	VPADDQ     Y2, Y6, Y2
   115	VPADDQ     Y2, Y7, Y2
   116	VMOVDQU    384(CX), Y3 // stripe 6: data 384..447, key 48..111
   117	VMOVDQU    416(CX), Y6
   118	PREFETCHT0 896(CX)
   119	VPXOR      48(DX), Y3, Y4
   120	VPXOR      80(DX), Y6, Y7
   121	VPSHUFD    $0x31, Y4, Y5
   122	VPSHUFD    $0x31, Y7, Y8
   123	VPMULUDQ   Y4, Y5, Y4
   124	VPMULUDQ   Y7, Y8, Y7
   125	VPSHUFD    $0x4e, Y3, Y3
   126	VPSHUFD    $0x4e, Y6, Y6
   127	VPADDQ     Y1, Y3, Y1
   128	VPADDQ     Y1, Y4, Y1
   129	VPADDQ     Y2, Y6, Y2
   130	VPADDQ     Y2, Y7, Y2
   131	VMOVDQU    448(CX), Y3 // stripe 7: data 448..511, key 56..119
   132	VMOVDQU    480(CX), Y6
   133	PREFETCHT0 960(CX)
   134	VPXOR      56(DX), Y3, Y4
   135	VPXOR      88(DX), Y6, Y7
   136	VPSHUFD    $0x31, Y4, Y5
   137	VPSHUFD    $0x31, Y7, Y8
   138	VPMULUDQ   Y4, Y5, Y4
   139	VPMULUDQ   Y7, Y8, Y7
   140	VPSHUFD    $0x4e, Y3, Y3
   141	VPSHUFD    $0x4e, Y6, Y6
   142	VPADDQ     Y1, Y3, Y1
   143	VPADDQ     Y1, Y4, Y1
   144	VPADDQ     Y2, Y6, Y2
   145	VPADDQ     Y2, Y7, Y2
   146	VMOVDQU    512(CX), Y3 // stripe 8: data 512..575, key 64..127
   147	VMOVDQU    544(CX), Y6
   148	PREFETCHT0 1024(CX)
   149	VPXOR      64(DX), Y3, Y4
   150	VPXOR      96(DX), Y6, Y7
   151	VPSHUFD    $0x31, Y4, Y5
   152	VPSHUFD    $0x31, Y7, Y8
   153	VPMULUDQ   Y4, Y5, Y4
   154	VPMULUDQ   Y7, Y8, Y7
   155	VPSHUFD    $0x4e, Y3, Y3
   156	VPSHUFD    $0x4e, Y6, Y6
   157	VPADDQ     Y1, Y3, Y1
   158	VPADDQ     Y1, Y4, Y1
   159	VPADDQ     Y2, Y6, Y2
   160	VPADDQ     Y2, Y7, Y2
   161	VMOVDQU    576(CX), Y3 // stripe 9: data 576..639, key 72..135
   162	VMOVDQU    608(CX), Y6
   163	PREFETCHT0 1088(CX)
   164	VPXOR      72(DX), Y3, Y4
   165	VPXOR      104(DX), Y6, Y7
   166	VPSHUFD    $0x31, Y4, Y5
   167	VPSHUFD    $0x31, Y7, Y8
   168	VPMULUDQ   Y4, Y5, Y4
   169	VPMULUDQ   Y7, Y8, Y7
   170	VPSHUFD    $0x4e, Y3, Y3
   171	VPSHUFD    $0x4e, Y6, Y6
   172	VPADDQ     Y1, Y3, Y1
   173	VPADDQ     Y1, Y4, Y1
   174	VPADDQ     Y2, Y6, Y2
   175	VPADDQ     Y2, Y7, Y2
   176	VMOVDQU    640(CX), Y3 // stripe 10: data 640..703, key 80..143
   177	VMOVDQU    672(CX), Y6
   178	PREFETCHT0 1152(CX)
   179	VPXOR      80(DX), Y3, Y4
   180	VPXOR      112(DX), Y6, Y7
   181	VPSHUFD    $0x31, Y4, Y5
   182	VPSHUFD    $0x31, Y7, Y8
   183	VPMULUDQ   Y4, Y5, Y4
   184	VPMULUDQ   Y7, Y8, Y7
   185	VPSHUFD    $0x4e, Y3, Y3
   186	VPSHUFD    $0x4e, Y6, Y6
   187	VPADDQ     Y1, Y3, Y1
   188	VPADDQ     Y1, Y4, Y1
   189	VPADDQ     Y2, Y6, Y2
   190	VPADDQ     Y2, Y7, Y2
   191	VMOVDQU    704(CX), Y3 // stripe 11: data 704..767, key 88..151
   192	VMOVDQU    736(CX), Y6
   193	PREFETCHT0 1216(CX)
   194	VPXOR      88(DX), Y3, Y4
   195	VPXOR      120(DX), Y6, Y7
   196	VPSHUFD    $0x31, Y4, Y5
   197	VPSHUFD    $0x31, Y7, Y8
   198	VPMULUDQ   Y4, Y5, Y4
   199	VPMULUDQ   Y7, Y8, Y7
   200	VPSHUFD    $0x4e, Y3, Y3
   201	VPSHUFD    $0x4e, Y6, Y6
   202	VPADDQ     Y1, Y3, Y1
   203	VPADDQ     Y1, Y4, Y1
   204	VPADDQ     Y2, Y6, Y2
   205	VPADDQ     Y2, Y7, Y2
   206	VMOVDQU    768(CX), Y3 // stripe 12: data 768..831, key 96..159
   207	VMOVDQU    800(CX), Y6
   208	PREFETCHT0 1280(CX)
   209	VPXOR      96(DX), Y3, Y4
   210	VPXOR      128(DX), Y6, Y7
   211	VPSHUFD    $0x31, Y4, Y5
   212	VPSHUFD    $0x31, Y7, Y8
   213	VPMULUDQ   Y4, Y5, Y4
   214	VPMULUDQ   Y7, Y8, Y7
   215	VPSHUFD    $0x4e, Y3, Y3
   216	VPSHUFD    $0x4e, Y6, Y6
   217	VPADDQ     Y1, Y3, Y1
   218	VPADDQ     Y1, Y4, Y1
   219	VPADDQ     Y2, Y6, Y2
   220	VPADDQ     Y2, Y7, Y2
   221	VMOVDQU    832(CX), Y3 // stripe 13: data 832..895, key 104..167
   222	VMOVDQU    864(CX), Y6
   223	PREFETCHT0 1344(CX)
   224	VPXOR      104(DX), Y3, Y4
   225	VPXOR      136(DX), Y6, Y7
   226	VPSHUFD    $0x31, Y4, Y5
   227	VPSHUFD    $0x31, Y7, Y8
   228	VPMULUDQ   Y4, Y5, Y4
   229	VPMULUDQ   Y7, Y8, Y7
   230	VPSHUFD    $0x4e, Y3, Y3
   231	VPSHUFD    $0x4e, Y6, Y6
   232	VPADDQ     Y1, Y3, Y1
   233	VPADDQ     Y1, Y4, Y1
   234	VPADDQ     Y2, Y6, Y2
   235	VPADDQ     Y2, Y7, Y2
   236	VMOVDQU    896(CX), Y3 // stripe 14: data 896..959, key 112..175
   237	VMOVDQU    928(CX), Y6
   238	PREFETCHT0 1408(CX)
   239	VPXOR      112(DX), Y3, Y4
   240	VPXOR      144(DX), Y6, Y7
   241	VPSHUFD    $0x31, Y4, Y5
   242	VPSHUFD    $0x31, Y7, Y8
   243	VPMULUDQ   Y4, Y5, Y4
   244	VPMULUDQ   Y7, Y8, Y7
   245	VPSHUFD    $0x4e, Y3, Y3
   246	VPSHUFD    $0x4e, Y6, Y6
   247	VPADDQ     Y1, Y3, Y1
   248	VPADDQ     Y1, Y4, Y1
   249	VPADDQ     Y2, Y6, Y2
   250	VPADDQ     Y2, Y7, Y2
   251	VMOVDQU    960(CX), Y3 // stripe 15: data 960..1023, key 120..183
   252	VMOVDQU    992(CX), Y6
   253	PREFETCHT0 1472(CX)
   254	VPXOR      120(DX), Y3, Y4
   255	VPXOR      152(DX), Y6, Y7
   256	VPSHUFD    $0x31, Y4, Y5
   257	VPSHUFD    $0x31, Y7, Y8
   258	VPMULUDQ   Y4, Y5, Y4
   259	VPMULUDQ   Y7, Y8, Y7
   260	VPSHUFD    $0x4e, Y3, Y3
   261	VPSHUFD    $0x4e, Y6, Y6
   262	VPADDQ     Y1, Y3, Y1
   263	VPADDQ     Y1, Y4, Y1
   264	VPADDQ     Y2, Y6, Y2
   265	VPADDQ     Y2, Y7, Y2
   266	ADDQ       $0x00000400, CX // advance the data cursor past the 1024-byte block
   267	SUBQ       $0x00000400, SI // 1024 fewer bytes remain; SI stays positive because it was > 1024
   268	VPSRLQ     $0x2f, Y1, Y3 // scramble acc[0:4]: Y3 = acc >> 47
   269	VPXOR      Y1, Y3, Y3 // Y3 = acc ^ (acc >> 47)
   270	VPXOR      128(DX), Y3, Y3 // Y3 ^= key[128..159]
   271	VPMULUDQ   Y0, Y3, Y1 // Y1 = lo32(Y3) * prime
   272	VPSHUFD    $0xf5, Y3, Y3 // copy the high dword of each qword into its low dword
   273	VPMULUDQ   Y0, Y3, Y3 // Y3 = hi32 * prime
   274	VPSLLQ     $0x20, Y3, Y3 // shift the high partial product up by 32 bits
   275	VPADDQ     Y1, Y3, Y1 // acc[0:4] = low 64 bits of (scrambled value * prime)
   276	VPSRLQ     $0x2f, Y2, Y3 // same scramble for acc[4:8]
   277	VPXOR      Y2, Y3, Y3
   278	VPXOR      160(DX), Y3, Y3 // Y3 ^= key[160..191]
   279	VPMULUDQ   Y0, Y3, Y2
   280	VPSHUFD    $0xf5, Y3, Y3
   281	VPMULUDQ   Y0, Y3, Y3
   282	VPSLLQ     $0x20, Y3, Y3
   283	VPADDQ     Y2, Y3, Y2
   284	JMP        accum_large // re-check whether another full block remains
   285
   286accum:
   287	CMPQ     SI, $0x40 // one stripe per iteration while more than 64 bytes remain
   288	JLE      finalize // SI <= 64 (signed compare): handle the last stripe
   289	VMOVDQU  (CX), Y0 // data bytes 0..31; overwrites the prime, which is not used again on this path
   290	VMOVDQU  32(CX), Y5 // data bytes 32..63
   291	VPXOR    (BX), Y0, Y3 // Y3 = data ^ key at the current key cursor
   292	VPXOR    32(BX), Y5, Y6
   293	VPSHUFD  $0x31, Y3, Y4 // bring the high dword of each qword down for the 32x32 multiply
   294	VPSHUFD  $0x31, Y6, Y7
   295	VPMULUDQ Y3, Y4, Y3 // per qword: lo32 * hi32
   296	VPMULUDQ Y6, Y7, Y6
   297	VPSHUFD  $0x4e, Y0, Y0 // swap the two qwords in each 128-bit lane of the raw data
   298	VPSHUFD  $0x4e, Y5, Y5
   299	VPADDQ   Y1, Y0, Y1 // acc[0:4] += swapped data
   300	VPADDQ   Y1, Y3, Y1 // acc[0:4] += product
   301	VPADDQ   Y2, Y5, Y2 // acc[4:8] += swapped data
   302	VPADDQ   Y2, Y6, Y2 // acc[4:8] += product
   303	ADDQ     $0x00000040, CX // next 64-byte stripe
   304	SUBQ     $0x00000040, SI // SI stays positive because it was > 64
   305	ADDQ     $0x00000008, BX // the key window slides 8 bytes per stripe
   306	JMP      accum
   307
   308finalize:
   309	CMPQ     SI, $0x00 // SI is zero here only when len was zero on entry
   310	JE       return
   311	SUBQ     $0x40, CX // CX = CX - 64 + SI: start of the 64 bytes that end at the end of the input
   312	ADDQ     SI, CX // (this stripe may overlap bytes already consumed)
   313	VMOVDQU  (CX), Y0
   314	VMOVDQU  32(CX), Y5
   315	VPXOR    121(DX), Y0, Y3 // the last stripe always uses key bytes 121..152
   316	VPXOR    153(DX), Y5, Y6 // and key bytes 153..184
   317	VPSHUFD  $0x31, Y3, Y4
   318	VPSHUFD  $0x31, Y6, Y7
   319	VPMULUDQ Y3, Y4, Y3
   320	VPMULUDQ Y6, Y7, Y6
   321	VPSHUFD  $0x4e, Y0, Y0
   322	VPSHUFD  $0x4e, Y5, Y5
   323	VPADDQ   Y1, Y0, Y1
   324	VPADDQ   Y1, Y3, Y1
   325	VPADDQ   Y2, Y5, Y2
   326	VPADDQ   Y2, Y6, Y2
   327
   328return:
   329	VMOVDQU Y1, (AX) // store acc[0:4]
   330	VMOVDQU Y2, 32(AX) // store acc[4:8]
   331	VZEROUPPER // zero the upper halves of the YMM registers before returning
   332	RET
   333
   334// func accumBlockAVX2(acc *[8]uint64, data *byte, key *byte)
   335// Requires: AVX, AVX2
      //
      // Processes exactly one 1024-byte block: 16 unrolled 64-byte stripes
      // read from data, followed by a scramble of the accumulators, which are
      // then stored back to acc. Stripe i is XORed with key bytes
      // 8*i .. 8*i+63 and the scramble uses key bytes 128..191.
      //
      // Registers: AX = acc, CX = data, DX = key (none of them is advanced),
      // Y1 = acc[0:4], Y2 = acc[4:8], Y0 = prime_avx, Y3-Y8 = scratch.
   336TEXT ·accumBlockAVX2(SB), NOSPLIT, $0-24
   337	MOVQ     acc+0(FP), AX // AX = acc
   338	MOVQ     data+8(FP), CX // CX = data
   339	MOVQ     key+16(FP), DX // DX = key
   340	VMOVDQU  (AX), Y1 // Y1 = acc[0:4]
   341	VMOVDQU  32(AX), Y2 // Y2 = acc[4:8]
   342	VMOVDQU  prime_avx<>+0(SB), Y0 // Y0 = multiplier for the scramble
   343	VMOVDQU  (CX), Y3 // stripe 0: data bytes 0..31
   344	VMOVDQU  32(CX), Y6 // stripe 0: data bytes 32..63
   345	VPXOR    (DX), Y3, Y4 // Y4 = data ^ key[0..31]
   346	VPXOR    32(DX), Y6, Y7 // Y7 = data ^ key[32..63]
   347	VPSHUFD  $0x31, Y4, Y5 // low dword of each qword of Y5 = high dword of the same qword of Y4
   348	VPSHUFD  $0x31, Y7, Y8 // same for the upper half of the stripe
   349	VPMULUDQ Y4, Y5, Y4 // per qword: Y4 = lo32(Y4) * hi32(Y4)
   350	VPMULUDQ Y7, Y8, Y7 // per qword: Y7 = lo32(Y7) * hi32(Y7)
   351	VPSHUFD  $0x4e, Y3, Y3 // swap the two qwords in each 128-bit lane of the raw data
   352	VPSHUFD  $0x4e, Y6, Y6 // same for the upper half of the stripe
   353	VPADDQ   Y1, Y3, Y1 // acc[0:4] += swapped data
   354	VPADDQ   Y1, Y4, Y1 // acc[0:4] += product
   355	VPADDQ   Y2, Y6, Y2 // acc[4:8] += swapped data
   356	VPADDQ   Y2, Y7, Y2 // acc[4:8] += product
   357	VMOVDQU  64(CX), Y3 // stripe 1: data 64..127, key 8..71
   358	VMOVDQU  96(CX), Y6
   359	VPXOR    8(DX), Y3, Y4
   360	VPXOR    40(DX), Y6, Y7
   361	VPSHUFD  $0x31, Y4, Y5
   362	VPSHUFD  $0x31, Y7, Y8
   363	VPMULUDQ Y4, Y5, Y4
   364	VPMULUDQ Y7, Y8, Y7
   365	VPSHUFD  $0x4e, Y3, Y3
   366	VPSHUFD  $0x4e, Y6, Y6
   367	VPADDQ   Y1, Y3, Y1
   368	VPADDQ   Y1, Y4, Y1
   369	VPADDQ   Y2, Y6, Y2
   370	VPADDQ   Y2, Y7, Y2
   371	VMOVDQU  128(CX), Y3 // stripe 2: data 128..191, key 16..79
   372	VMOVDQU  160(CX), Y6
   373	VPXOR    16(DX), Y3, Y4
   374	VPXOR    48(DX), Y6, Y7
   375	VPSHUFD  $0x31, Y4, Y5
   376	VPSHUFD  $0x31, Y7, Y8
   377	VPMULUDQ Y4, Y5, Y4
   378	VPMULUDQ Y7, Y8, Y7
   379	VPSHUFD  $0x4e, Y3, Y3
   380	VPSHUFD  $0x4e, Y6, Y6
   381	VPADDQ   Y1, Y3, Y1
   382	VPADDQ   Y1, Y4, Y1
   383	VPADDQ   Y2, Y6, Y2
   384	VPADDQ   Y2, Y7, Y2
   385	VMOVDQU  192(CX), Y3 // stripe 3: data 192..255, key 24..87
   386	VMOVDQU  224(CX), Y6
   387	VPXOR    24(DX), Y3, Y4
   388	VPXOR    56(DX), Y6, Y7
   389	VPSHUFD  $0x31, Y4, Y5
   390	VPSHUFD  $0x31, Y7, Y8
   391	VPMULUDQ Y4, Y5, Y4
   392	VPMULUDQ Y7, Y8, Y7
   393	VPSHUFD  $0x4e, Y3, Y3
   394	VPSHUFD  $0x4e, Y6, Y6
   395	VPADDQ   Y1, Y3, Y1
   396	VPADDQ   Y1, Y4, Y1
   397	VPADDQ   Y2, Y6, Y2
   398	VPADDQ   Y2, Y7, Y2
   399	VMOVDQU  256(CX), Y3 // stripe 4: data 256..319, key 32..95
   400	VMOVDQU  288(CX), Y6
   401	VPXOR    32(DX), Y3, Y4
   402	VPXOR    64(DX), Y6, Y7
   403	VPSHUFD  $0x31, Y4, Y5
   404	VPSHUFD  $0x31, Y7, Y8
   405	VPMULUDQ Y4, Y5, Y4
   406	VPMULUDQ Y7, Y8, Y7
   407	VPSHUFD  $0x4e, Y3, Y3
   408	VPSHUFD  $0x4e, Y6, Y6
   409	VPADDQ   Y1, Y3, Y1
   410	VPADDQ   Y1, Y4, Y1
   411	VPADDQ   Y2, Y6, Y2
   412	VPADDQ   Y2, Y7, Y2
   413	VMOVDQU  320(CX), Y3 // stripe 5: data 320..383, key 40..103
   414	VMOVDQU  352(CX), Y6
   415	VPXOR    40(DX), Y3, Y4
   416	VPXOR    72(DX), Y6, Y7
   417	VPSHUFD  $0x31, Y4, Y5
   418	VPSHUFD  $0x31, Y7, Y8
   419	VPMULUDQ Y4, Y5, Y4
   420	VPMULUDQ Y7, Y8, Y7
   421	VPSHUFD  $0x4e, Y3, Y3
   422	VPSHUFD  $0x4e, Y6, Y6
   423	VPADDQ   Y1, Y3, Y1
   424	VPADDQ   Y1, Y4, Y1
   425	VPADDQ   Y2, Y6, Y2
   426	VPADDQ   Y2, Y7, Y2
   427	VMOVDQU  384(CX), Y3 // stripe 6: data 384..447, key 48..111
   428	VMOVDQU  416(CX), Y6
   429	VPXOR    48(DX), Y3, Y4
   430	VPXOR    80(DX), Y6, Y7
   431	VPSHUFD  $0x31, Y4, Y5
   432	VPSHUFD  $0x31, Y7, Y8
   433	VPMULUDQ Y4, Y5, Y4
   434	VPMULUDQ Y7, Y8, Y7
   435	VPSHUFD  $0x4e, Y3, Y3
   436	VPSHUFD  $0x4e, Y6, Y6
   437	VPADDQ   Y1, Y3, Y1
   438	VPADDQ   Y1, Y4, Y1
   439	VPADDQ   Y2, Y6, Y2
   440	VPADDQ   Y2, Y7, Y2
   441	VMOVDQU  448(CX), Y3 // stripe 7: data 448..511, key 56..119
   442	VMOVDQU  480(CX), Y6
   443	VPXOR    56(DX), Y3, Y4
   444	VPXOR    88(DX), Y6, Y7
   445	VPSHUFD  $0x31, Y4, Y5
   446	VPSHUFD  $0x31, Y7, Y8
   447	VPMULUDQ Y4, Y5, Y4
   448	VPMULUDQ Y7, Y8, Y7
   449	VPSHUFD  $0x4e, Y3, Y3
   450	VPSHUFD  $0x4e, Y6, Y6
   451	VPADDQ   Y1, Y3, Y1
   452	VPADDQ   Y1, Y4, Y1
   453	VPADDQ   Y2, Y6, Y2
   454	VPADDQ   Y2, Y7, Y2
   455	VMOVDQU  512(CX), Y3 // stripe 8: data 512..575, key 64..127
   456	VMOVDQU  544(CX), Y6
   457	VPXOR    64(DX), Y3, Y4
   458	VPXOR    96(DX), Y6, Y7
   459	VPSHUFD  $0x31, Y4, Y5
   460	VPSHUFD  $0x31, Y7, Y8
   461	VPMULUDQ Y4, Y5, Y4
   462	VPMULUDQ Y7, Y8, Y7
   463	VPSHUFD  $0x4e, Y3, Y3
   464	VPSHUFD  $0x4e, Y6, Y6
   465	VPADDQ   Y1, Y3, Y1
   466	VPADDQ   Y1, Y4, Y1
   467	VPADDQ   Y2, Y6, Y2
   468	VPADDQ   Y2, Y7, Y2
   469	VMOVDQU  576(CX), Y3 // stripe 9: data 576..639, key 72..135
   470	VMOVDQU  608(CX), Y6
   471	VPXOR    72(DX), Y3, Y4
   472	VPXOR    104(DX), Y6, Y7
   473	VPSHUFD  $0x31, Y4, Y5
   474	VPSHUFD  $0x31, Y7, Y8
   475	VPMULUDQ Y4, Y5, Y4
   476	VPMULUDQ Y7, Y8, Y7
   477	VPSHUFD  $0x4e, Y3, Y3
   478	VPSHUFD  $0x4e, Y6, Y6
   479	VPADDQ   Y1, Y3, Y1
   480	VPADDQ   Y1, Y4, Y1
   481	VPADDQ   Y2, Y6, Y2
   482	VPADDQ   Y2, Y7, Y2
   483	VMOVDQU  640(CX), Y3 // stripe 10: data 640..703, key 80..143
   484	VMOVDQU  672(CX), Y6
   485	VPXOR    80(DX), Y3, Y4
   486	VPXOR    112(DX), Y6, Y7
   487	VPSHUFD  $0x31, Y4, Y5
   488	VPSHUFD  $0x31, Y7, Y8
   489	VPMULUDQ Y4, Y5, Y4
   490	VPMULUDQ Y7, Y8, Y7
   491	VPSHUFD  $0x4e, Y3, Y3
   492	VPSHUFD  $0x4e, Y6, Y6
   493	VPADDQ   Y1, Y3, Y1
   494	VPADDQ   Y1, Y4, Y1
   495	VPADDQ   Y2, Y6, Y2
   496	VPADDQ   Y2, Y7, Y2
   497	VMOVDQU  704(CX), Y3 // stripe 11: data 704..767, key 88..151
   498	VMOVDQU  736(CX), Y6
   499	VPXOR    88(DX), Y3, Y4
   500	VPXOR    120(DX), Y6, Y7
   501	VPSHUFD  $0x31, Y4, Y5
   502	VPSHUFD  $0x31, Y7, Y8
   503	VPMULUDQ Y4, Y5, Y4
   504	VPMULUDQ Y7, Y8, Y7
   505	VPSHUFD  $0x4e, Y3, Y3
   506	VPSHUFD  $0x4e, Y6, Y6
   507	VPADDQ   Y1, Y3, Y1
   508	VPADDQ   Y1, Y4, Y1
   509	VPADDQ   Y2, Y6, Y2
   510	VPADDQ   Y2, Y7, Y2
   511	VMOVDQU  768(CX), Y3 // stripe 12: data 768..831, key 96..159
   512	VMOVDQU  800(CX), Y6
   513	VPXOR    96(DX), Y3, Y4
   514	VPXOR    128(DX), Y6, Y7
   515	VPSHUFD  $0x31, Y4, Y5
   516	VPSHUFD  $0x31, Y7, Y8
   517	VPMULUDQ Y4, Y5, Y4
   518	VPMULUDQ Y7, Y8, Y7
   519	VPSHUFD  $0x4e, Y3, Y3
   520	VPSHUFD  $0x4e, Y6, Y6
   521	VPADDQ   Y1, Y3, Y1
   522	VPADDQ   Y1, Y4, Y1
   523	VPADDQ   Y2, Y6, Y2
   524	VPADDQ   Y2, Y7, Y2
   525	VMOVDQU  832(CX), Y3 // stripe 13: data 832..895, key 104..167
   526	VMOVDQU  864(CX), Y6
   527	VPXOR    104(DX), Y3, Y4
   528	VPXOR    136(DX), Y6, Y7
   529	VPSHUFD  $0x31, Y4, Y5
   530	VPSHUFD  $0x31, Y7, Y8
   531	VPMULUDQ Y4, Y5, Y4
   532	VPMULUDQ Y7, Y8, Y7
   533	VPSHUFD  $0x4e, Y3, Y3
   534	VPSHUFD  $0x4e, Y6, Y6
   535	VPADDQ   Y1, Y3, Y1
   536	VPADDQ   Y1, Y4, Y1
   537	VPADDQ   Y2, Y6, Y2
   538	VPADDQ   Y2, Y7, Y2
   539	VMOVDQU  896(CX), Y3 // stripe 14: data 896..959, key 112..175
   540	VMOVDQU  928(CX), Y6
   541	VPXOR    112(DX), Y3, Y4
   542	VPXOR    144(DX), Y6, Y7
   543	VPSHUFD  $0x31, Y4, Y5
   544	VPSHUFD  $0x31, Y7, Y8
   545	VPMULUDQ Y4, Y5, Y4
   546	VPMULUDQ Y7, Y8, Y7
   547	VPSHUFD  $0x4e, Y3, Y3
   548	VPSHUFD  $0x4e, Y6, Y6
   549	VPADDQ   Y1, Y3, Y1
   550	VPADDQ   Y1, Y4, Y1
   551	VPADDQ   Y2, Y6, Y2
   552	VPADDQ   Y2, Y7, Y2
   553	VMOVDQU  960(CX), Y3 // stripe 15: data 960..1023, key 120..183
   554	VMOVDQU  992(CX), Y6
   555	VPXOR    120(DX), Y3, Y4
   556	VPXOR    152(DX), Y6, Y7
   557	VPSHUFD  $0x31, Y4, Y5
   558	VPSHUFD  $0x31, Y7, Y8
   559	VPMULUDQ Y4, Y5, Y4
   560	VPMULUDQ Y7, Y8, Y7
   561	VPSHUFD  $0x4e, Y3, Y3
   562	VPSHUFD  $0x4e, Y6, Y6
   563	VPADDQ   Y1, Y3, Y1
   564	VPADDQ   Y1, Y4, Y1
   565	VPADDQ   Y2, Y6, Y2
   566	VPADDQ   Y2, Y7, Y2
   567	VPSRLQ   $0x2f, Y1, Y3 // scramble acc[0:4]: Y3 = acc >> 47
   568	VPXOR    Y1, Y3, Y3 // Y3 = acc ^ (acc >> 47)
   569	VPXOR    128(DX), Y3, Y3 // Y3 ^= key[128..159]
   570	VPMULUDQ Y0, Y3, Y1 // Y1 = lo32(Y3) * prime
   571	VPSHUFD  $0xf5, Y3, Y3 // copy the high dword of each qword into its low dword
   572	VPMULUDQ Y0, Y3, Y3 // Y3 = hi32 * prime
   573	VPSLLQ   $0x20, Y3, Y3 // shift the high partial product up by 32 bits
   574	VPADDQ   Y1, Y3, Y1 // acc[0:4] = low 64 bits of (scrambled value * prime)
   575	VPSRLQ   $0x2f, Y2, Y3 // same scramble for acc[4:8]
   576	VPXOR    Y2, Y3, Y3
   577	VPXOR    160(DX), Y3, Y3 // Y3 ^= key[160..191]
   578	VPMULUDQ Y0, Y3, Y2
   579	VPSHUFD  $0xf5, Y3, Y3
   580	VPMULUDQ Y0, Y3, Y3
   581	VPSLLQ   $0x20, Y3, Y3
   582	VPADDQ   Y2, Y3, Y2
   583	VMOVDQU  Y1, (AX) // store acc[0:4]
   584	VMOVDQU  Y2, 32(AX) // store acc[4:8]
   585	VZEROUPPER // zero the upper halves of the YMM registers before returning
   586	RET

View as plain text