...

Text file src/github.com/apache/arrow/go/v15/parquet/internal/utils/_lib/bit_packing_avx2.s

Documentation: github.com/apache/arrow/go/v15/parquet/internal/utils/_lib

     1	.text
     2	.intel_syntax noprefix
     3	.file	"bit_packing_avx2.c"
     4	.section	.rodata.cst8,"aM",@progbits,8
     5	.p2align	3                               # -- Begin function unpack32_avx2
     6.LCPI0_0:
     7	.quad	9223372034707292159             # 0x7fffffff7fffffff
     8.LCPI0_8:
     9	.quad	4611686015206162431             # 0x3fffffff3fffffff
    10.LCPI0_12:
    11	.quad	2305843005455597567             # 0x1fffffff1fffffff
    12.LCPI0_23:
    13	.quad	1152921500580315135             # 0xfffffff0fffffff
    14.LCPI0_25:
    15	.quad	576460748142673919              # 0x7ffffff07ffffff
    16.LCPI0_34:
    17	.quad	288230371923853311              # 0x3ffffff03ffffff
    18.LCPI0_35:
    19	.quad	42949672976                     # 0xa00000010
    20.LCPI0_36:
    21	.quad	94489280528                     # 0x1600000010
    22.LCPI0_38:
    23	.quad	144115183814443007              # 0x1ffffff01ffffff
    24.LCPI0_49:
    25	.quad	36028792732385279               # 0x7fffff007fffff
    26.LCPI0_56:
    27	.quad	18014394218708991               # 0x3fffff003fffff
    28.LCPI0_59:
    29	.quad	9007194961870847                # 0x1fffff001fffff
    30.LCPI0_66:
    31	.quad	4503595333451775                # 0xfffff000fffff
    32.LCPI0_68:
    33	.quad	2251795519242239                # 0x7ffff0007ffff
    34.LCPI0_73:
    35	.quad	1125895612137471                # 0x3ffff0003ffff
    36.LCPI0_76:
    37	.quad	562945658585087                 # 0x1ffff0001ffff
    38.LCPI0_80:
    39	.quad	68719476736                     # 0x1000000000
    40.LCPI0_82:
    41	.quad	140733193420799                 # 0x7fff00007fff
    42.LCPI0_87:
    43	.quad	70364449226751                  # 0x3fff00003fff
    44.LCPI0_90:
    45	.quad	35180077129727                  # 0x1fff00001fff
    46.LCPI0_95:
    47	.quad	17587891081215                  # 0xfff00000fff
    48.LCPI0_97:
    49	.quad	8791798056959                   # 0x7ff000007ff
    50.LCPI0_102:
    51	.quad	4393751544831                   # 0x3ff000003ff
    52.LCPI0_105:
    53	.quad	2194728288767                   # 0x1ff000001ff
    54.LCPI0_112:
    55	.quad	545460846719                    # 0x7f0000007f
    56.LCPI0_117:
    57	.quad	270582939711                    # 0x3f0000003f
    58.LCPI0_120:
    59	.quad	133143986207                    # 0x1f0000001f
    60.LCPI0_125:
    61	.quad	64424509455                     # 0xf0000000f
    62.LCPI0_127:
    63	.quad	30064771079                     # 0x700000007
    64.LCPI0_132:
    65	.quad	12884901891                     # 0x300000003
    66.LCPI0_135:
    67	.quad	4294967297                      # 0x100000001
    68	.section	.rodata.cst32,"aM",@progbits,32
    69	.p2align	5
    70.LCPI0_1:
    71	.long	24                              # 0x18
    72	.long	23                              # 0x17
    73	.long	22                              # 0x16
    74	.long	21                              # 0x15
    75	.long	20                              # 0x14
    76	.long	19                              # 0x13
    77	.long	18                              # 0x12
    78	.long	17                              # 0x11
    79.LCPI0_2:
    80	.long	8                               # 0x8
    81	.long	9                               # 0x9
    82	.long	10                              # 0xa
    83	.long	11                              # 0xb
    84	.long	12                              # 0xc
    85	.long	13                              # 0xd
    86	.long	14                              # 0xe
    87	.long	15                              # 0xf
    88.LCPI0_3:
    89	.long	16                              # 0x10
    90	.long	15                              # 0xf
    91	.long	14                              # 0xe
    92	.long	13                              # 0xd
    93	.long	12                              # 0xc
    94	.long	11                              # 0xb
    95	.long	10                              # 0xa
    96	.long	9                               # 0x9
    97.LCPI0_4:
    98	.long	16                              # 0x10
    99	.long	17                              # 0x11
   100	.long	18                              # 0x12
   101	.long	19                              # 0x13
   102	.long	20                              # 0x14
   103	.long	21                              # 0x15
   104	.long	22                              # 0x16
   105	.long	23                              # 0x17
   106.LCPI0_7:
   107	.long	0                               # 0x0
   108	.long	0                               # 0x0
   109	.long	0                               # 0x0
   110	.long	0                               # 0x0
   111	.long	0                               # 0x0
   112	.long	0                               # 0x0
   113	.long	0                               # 0x0
   114	.long	1                               # 0x1
   115.LCPI0_11:
   116	.long	0                               # 0x0
   117	.long	0                               # 0x0
   118	.long	0                               # 0x0
   119	.long	0                               # 0x0
   120	.long	0                               # 0x0
   121	.long	0                               # 0x0
   122	.long	0                               # 0x0
   123	.long	2                               # 0x2
   124.LCPI0_15:
   125	.long	0                               # 0x0
   126	.long	0                               # 0x0
   127	.long	2                               # 0x2
   128	.long	0                               # 0x0
   129	.long	0                               # 0x0
   130	.long	0                               # 0x0
   131	.long	0                               # 0x0
   132	.long	0                               # 0x0
   133.LCPI0_18:
   134	.long	0                               # 0x0
   135	.long	0                               # 0x0
   136	.long	0                               # 0x0
   137	.long	0                               # 0x0
   138	.long	0                               # 0x0
   139	.long	1                               # 0x1
   140	.long	0                               # 0x0
   141	.long	0                               # 0x0
   142.LCPI0_21:
   143	.long	0                               # 0x0
   144	.long	0                               # 0x0
   145	.long	0                               # 0x0
   146	.long	0                               # 0x0
   147	.long	0                               # 0x0
   148	.long	0                               # 0x0
   149	.long	0                               # 0x0
   150	.long	3                               # 0x3
   151.LCPI0_22:
   152	.long	0                               # 0x0
   153	.long	0                               # 0x0
   154	.long	0                               # 0x0
   155	.long	0                               # 0x0
   156	.long	0                               # 0x0
   157	.long	0                               # 0x0
   158	.long	0                               # 0x0
   159	.long	4                               # 0x4
   160.LCPI0_24:
   161	.long	0                               # 0x0
   162	.long	0                               # 0x0
   163	.long	0                               # 0x0
   164	.long	0                               # 0x0
   165	.long	0                               # 0x0
   166	.long	0                               # 0x0
   167	.long	2                               # 0x2
   168	.long	0                               # 0x0
   169.LCPI0_28:
   170	.long	0                               # 0x0
   171	.long	0                               # 0x0
   172	.long	0                               # 0x0
   173	.long	0                               # 0x0
   174	.long	4                               # 0x4
   175	.long	0                               # 0x0
   176	.long	0                               # 0x0
   177	.long	0                               # 0x0
   178.LCPI0_31:
   179	.long	0                               # 0x0
   180	.long	0                               # 0x0
   181	.long	0                               # 0x0
   182	.long	1                               # 0x1
   183	.long	0                               # 0x0
   184	.long	0                               # 0x0
   185	.long	0                               # 0x0
   186	.long	0                               # 0x0
   187.LCPI0_32:
   188	.long	0                               # 0x0
   189	.long	3                               # 0x3
   190	.long	0                               # 0x0
   191	.long	0                               # 0x0
   192	.long	0                               # 0x0
   193	.long	0                               # 0x0
   194	.long	0                               # 0x0
   195	.long	5                               # 0x5
   196.LCPI0_33:
   197	.long	0                               # 0x0
   198	.long	0                               # 0x0
   199	.long	0                               # 0x0
   200	.long	0                               # 0x0
   201	.long	0                               # 0x0
   202	.long	2                               # 0x2
   203	.long	0                               # 0x0
   204	.long	0                               # 0x0
   205.LCPI0_37:
   206	.long	0                               # 0x0
   207	.long	0                               # 0x0
   208	.long	4                               # 0x4
   209	.long	0                               # 0x0
   210	.long	0                               # 0x0
   211	.long	0                               # 0x0
   212	.long	0                               # 0x0
   213	.long	6                               # 0x6
   214.LCPI0_39:
   215	.long	0                               # 0x0
   216	.long	1                               # 0x1
   217	.long	0                               # 0x0
   218	.long	0                               # 0x0
   219	.long	0                               # 0x0
   220	.long	5                               # 0x5
   221	.long	0                               # 0x0
   222	.long	0                               # 0x0
   223.LCPI0_42:
   224	.long	0                               # 0x0
   225	.long	0                               # 0x0
   226	.long	2                               # 0x2
   227	.long	0                               # 0x0
   228	.long	0                               # 0x0
   229	.long	0                               # 0x0
   230	.long	6                               # 0x6
   231	.long	0                               # 0x0
   232.LCPI0_45:
   233	.long	0                               # 0x0
   234	.long	0                               # 0x0
   235	.long	0                               # 0x0
   236	.long	3                               # 0x3
   237	.long	0                               # 0x0
   238	.long	0                               # 0x0
   239	.long	0                               # 0x0
   240	.long	7                               # 0x7
   241.LCPI0_48:
   242	.long	0                               # 0x0
   243	.long	0                               # 0x0
   244	.long	0                               # 0x0
   245	.long	5                               # 0x5
   246	.long	0                               # 0x0
   247	.long	0                               # 0x0
   248	.long	0                               # 0x0
   249	.long	1                               # 0x1
   250.LCPI0_52:
   251	.long	0                               # 0x0
   252	.long	0                               # 0x0
   253	.long	6                               # 0x6
   254	.long	0                               # 0x0
   255	.long	0                               # 0x0
   256	.long	0                               # 0x0
   257	.long	2                               # 0x2
   258	.long	0                               # 0x0
   259.LCPI0_53:
   260	.long	0                               # 0x0
   261	.long	7                               # 0x7
   262	.long	0                               # 0x0
   263	.long	0                               # 0x0
   264	.long	0                               # 0x0
   265	.long	3                               # 0x3
   266	.long	0                               # 0x0
   267	.long	0                               # 0x0
   268.LCPI0_54:
   269	.long	8                               # 0x8
   270	.long	0                               # 0x0
   271	.long	0                               # 0x0
   272	.long	0                               # 0x0
   273	.long	4                               # 0x4
   274	.long	0                               # 0x0
   275	.long	0                               # 0x0
   276	.long	9                               # 0x9
   277.LCPI0_55:
   278	.long	0                               # 0x0
   279	.long	0                               # 0x0
   280	.long	0                               # 0x0
   281	.long	2                               # 0x2
   282	.long	0                               # 0x0
   283	.long	0                               # 0x0
   284	.long	4                               # 0x4
   285	.long	0                               # 0x0
   286.LCPI0_57:
   287	.long	0                               # 0x0
   288	.long	6                               # 0x6
   289	.long	0                               # 0x0
   290	.long	0                               # 0x0
   291	.long	8                               # 0x8
   292	.long	0                               # 0x0
   293	.long	0                               # 0x0
   294	.long	10                              # 0xa
   295.LCPI0_58:
   296	.long	0                               # 0x0
   297	.long	0                               # 0x0
   298	.long	10                              # 0xa
   299	.long	0                               # 0x0
   300	.long	0                               # 0x0
   301	.long	9                               # 0x9
   302	.long	0                               # 0x0
   303	.long	0                               # 0x0
   304.LCPI0_60:
   305	.long	8                               # 0x8
   306	.long	0                               # 0x0
   307	.long	0                               # 0x0
   308	.long	7                               # 0x7
   309	.long	0                               # 0x0
   310	.long	0                               # 0x0
   311	.long	6                               # 0x6
   312	.long	0                               # 0x0
   313.LCPI0_61:
   314	.long	0                               # 0x0
   315	.long	5                               # 0x5
   316	.long	0                               # 0x0
   317	.long	0                               # 0x0
   318	.long	4                               # 0x4
   319	.long	0                               # 0x0
   320	.long	0                               # 0x0
   321	.long	3                               # 0x3
   322.LCPI0_64:
   323	.long	0                               # 0x0
   324	.long	0                               # 0x0
   325	.long	2                               # 0x2
   326	.long	0                               # 0x0
   327	.long	0                               # 0x0
   328	.long	1                               # 0x1
   329	.long	0                               # 0x0
   330	.long	11                              # 0xb
   331.LCPI0_65:
   332	.long	0                               # 0x0
   333	.long	0                               # 0x0
   334	.long	8                               # 0x8
   335	.long	0                               # 0x0
   336	.long	0                               # 0x0
   337	.long	4                               # 0x4
   338	.long	0                               # 0x0
   339	.long	12                              # 0xc
   340.LCPI0_67:
   341	.long	0                               # 0x0
   342	.long	0                               # 0x0
   343	.long	6                               # 0x6
   344	.long	0                               # 0x0
   345	.long	12                              # 0xc
   346	.long	0                               # 0x0
   347	.long	0                               # 0x0
   348	.long	5                               # 0x5
   349.LCPI0_69:
   350	.long	0                               # 0x0
   351	.long	11                              # 0xb
   352	.long	0                               # 0x0
   353	.long	0                               # 0x0
   354	.long	4                               # 0x4
   355	.long	0                               # 0x0
   356	.long	10                              # 0xa
   357	.long	0                               # 0x0
   358.LCPI0_70:
   359	.long	0                               # 0x0
   360	.long	3                               # 0x3
   361	.long	0                               # 0x0
   362	.long	9                               # 0x9
   363	.long	0                               # 0x0
   364	.long	0                               # 0x0
   365	.long	2                               # 0x2
   366	.long	0                               # 0x0
   367.LCPI0_71:
   368	.long	8                               # 0x8
   369	.long	0                               # 0x0
   370	.long	0                               # 0x0
   371	.long	1                               # 0x1
   372	.long	0                               # 0x0
   373	.long	7                               # 0x7
   374	.long	0                               # 0x0
   375	.long	13                              # 0xd
   376.LCPI0_72:
   377	.long	0                               # 0x0
   378	.long	0                               # 0x0
   379	.long	4                               # 0x4
   380	.long	0                               # 0x0
   381	.long	8                               # 0x8
   382	.long	0                               # 0x0
   383	.long	12                              # 0xc
   384	.long	0                               # 0x0
   385.LCPI0_74:
   386	.long	0                               # 0x0
   387	.long	2                               # 0x2
   388	.long	0                               # 0x0
   389	.long	6                               # 0x6
   390	.long	0                               # 0x0
   391	.long	10                              # 0xa
   392	.long	0                               # 0x0
   393	.long	14                              # 0xe
   394.LCPI0_75:
   395	.long	0                               # 0x0
   396	.long	0                               # 0x0
   397	.long	2                               # 0x2
   398	.long	0                               # 0x0
   399	.long	4                               # 0x4
   400	.long	0                               # 0x0
   401	.long	6                               # 0x6
   402	.long	0                               # 0x0
   403.LCPI0_77:
   404	.long	8                               # 0x8
   405	.long	0                               # 0x0
   406	.long	10                              # 0xa
   407	.long	0                               # 0x0
   408	.long	12                              # 0xc
   409	.long	0                               # 0x0
   410	.long	14                              # 0xe
   411	.long	0                               # 0x0
   412.LCPI0_78:
   413	.long	0                               # 0x0
   414	.long	1                               # 0x1
   415	.long	0                               # 0x0
   416	.long	3                               # 0x3
   417	.long	0                               # 0x0
   418	.long	5                               # 0x5
   419	.long	0                               # 0x0
   420	.long	7                               # 0x7
   421.LCPI0_79:
   422	.long	0                               # 0x0
   423	.long	9                               # 0x9
   424	.long	0                               # 0x0
   425	.long	11                              # 0xb
   426	.long	0                               # 0x0
   427	.long	13                              # 0xd
   428	.long	0                               # 0x0
   429	.long	15                              # 0xf
   430.LCPI0_81:
   431	.long	0                               # 0x0
   432	.long	15                              # 0xf
   433	.long	0                               # 0x0
   434	.long	13                              # 0xd
   435	.long	0                               # 0x0
   436	.long	11                              # 0xb
   437	.long	0                               # 0x0
   438	.long	9                               # 0x9
   439.LCPI0_83:
   440	.long	0                               # 0x0
   441	.long	7                               # 0x7
   442	.long	0                               # 0x0
   443	.long	5                               # 0x5
   444	.long	0                               # 0x0
   445	.long	3                               # 0x3
   446	.long	0                               # 0x0
   447	.long	1                               # 0x1
   448.LCPI0_84:
   449	.long	16                              # 0x10
   450	.long	0                               # 0x0
   451	.long	14                              # 0xe
   452	.long	0                               # 0x0
   453	.long	12                              # 0xc
   454	.long	0                               # 0x0
   455	.long	10                              # 0xa
   456	.long	0                               # 0x0
   457.LCPI0_85:
   458	.long	8                               # 0x8
   459	.long	0                               # 0x0
   460	.long	6                               # 0x6
   461	.long	0                               # 0x0
   462	.long	4                               # 0x4
   463	.long	0                               # 0x0
   464	.long	2                               # 0x2
   465	.long	17                              # 0x11
   466.LCPI0_86:
   467	.long	0                               # 0x0
   468	.long	14                              # 0xe
   469	.long	0                               # 0x0
   470	.long	10                              # 0xa
   471	.long	0                               # 0x0
   472	.long	6                               # 0x6
   473	.long	0                               # 0x0
   474	.long	2                               # 0x2
   475.LCPI0_88:
   476	.long	16                              # 0x10
   477	.long	0                               # 0x0
   478	.long	12                              # 0xc
   479	.long	0                               # 0x0
   480	.long	8                               # 0x8
   481	.long	0                               # 0x0
   482	.long	4                               # 0x4
   483	.long	18                              # 0x12
   484.LCPI0_89:
   485	.long	0                               # 0x0
   486	.long	13                              # 0xd
   487	.long	0                               # 0x0
   488	.long	7                               # 0x7
   489	.long	0                               # 0x0
   490	.long	1                               # 0x1
   491	.long	14                              # 0xe
   492	.long	0                               # 0x0
   493.LCPI0_91:
   494	.long	8                               # 0x8
   495	.long	0                               # 0x0
   496	.long	2                               # 0x2
   497	.long	15                              # 0xf
   498	.long	0                               # 0x0
   499	.long	9                               # 0x9
   500	.long	0                               # 0x0
   501	.long	3                               # 0x3
   502.LCPI0_92:
   503	.long	16                              # 0x10
   504	.long	0                               # 0x0
   505	.long	10                              # 0xa
   506	.long	0                               # 0x0
   507	.long	4                               # 0x4
   508	.long	17                              # 0x11
   509	.long	0                               # 0x0
   510	.long	11                              # 0xb
   511.LCPI0_93:
   512	.long	0                               # 0x0
   513	.long	5                               # 0x5
   514	.long	18                              # 0x12
   515	.long	0                               # 0x0
   516	.long	12                              # 0xc
   517	.long	0                               # 0x0
   518	.long	6                               # 0x6
   519	.long	19                              # 0x13
   520.LCPI0_94:
   521	.long	0                               # 0x0
   522	.long	12                              # 0xc
   523	.long	0                               # 0x0
   524	.long	4                               # 0x4
   525	.long	16                              # 0x10
   526	.long	0                               # 0x0
   527	.long	8                               # 0x8
   528	.long	20                              # 0x14
   529.LCPI0_96:
   530	.long	0                               # 0x0
   531	.long	11                              # 0xb
   532	.long	0                               # 0x0
   533	.long	1                               # 0x1
   534	.long	12                              # 0xc
   535	.long	0                               # 0x0
   536	.long	2                               # 0x2
   537	.long	13                              # 0xd
   538.LCPI0_98:
   539	.long	0                               # 0x0
   540	.long	3                               # 0x3
   541	.long	14                              # 0xe
   542	.long	0                               # 0x0
   543	.long	4                               # 0x4
   544	.long	15                              # 0xf
   545	.long	0                               # 0x0
   546	.long	5                               # 0x5
   547.LCPI0_99:
   548	.long	16                              # 0x10
   549	.long	0                               # 0x0
   550	.long	6                               # 0x6
   551	.long	17                              # 0x11
   552	.long	0                               # 0x0
   553	.long	7                               # 0x7
   554	.long	18                              # 0x12
   555	.long	0                               # 0x0
   556.LCPI0_100:
   557	.long	8                               # 0x8
   558	.long	19                              # 0x13
   559	.long	0                               # 0x0
   560	.long	9                               # 0x9
   561	.long	20                              # 0x14
   562	.long	0                               # 0x0
   563	.long	10                              # 0xa
   564	.long	21                              # 0x15
   565.LCPI0_101:
   566	.long	0                               # 0x0
   567	.long	10                              # 0xa
   568	.long	20                              # 0x14
   569	.long	0                               # 0x0
   570	.long	8                               # 0x8
   571	.long	18                              # 0x12
   572	.long	0                               # 0x0
   573	.long	6                               # 0x6
   574.LCPI0_103:
   575	.long	16                              # 0x10
   576	.long	0                               # 0x0
   577	.long	4                               # 0x4
   578	.long	14                              # 0xe
   579	.long	0                               # 0x0
   580	.long	2                               # 0x2
   581	.long	12                              # 0xc
   582	.long	22                              # 0x16
   583.LCPI0_104:
   584	.long	0                               # 0x0
   585	.long	9                               # 0x9
   586	.long	18                              # 0x12
   587	.long	0                               # 0x0
   588	.long	4                               # 0x4
   589	.long	13                              # 0xd
   590	.long	22                              # 0x16
   591	.long	0                               # 0x0
   592.LCPI0_106:
   593	.long	8                               # 0x8
   594	.long	17                              # 0x11
   595	.long	0                               # 0x0
   596	.long	3                               # 0x3
   597	.long	12                              # 0xc
   598	.long	21                              # 0x15
   599	.long	0                               # 0x0
   600	.long	7                               # 0x7
   601.LCPI0_107:
   602	.long	16                              # 0x10
   603	.long	0                               # 0x0
   604	.long	2                               # 0x2
   605	.long	11                              # 0xb
   606	.long	20                              # 0x14
   607	.long	0                               # 0x0
   608	.long	6                               # 0x6
   609	.long	15                              # 0xf
   610.LCPI0_108:
   611	.long	0                               # 0x0
   612	.long	1                               # 0x1
   613	.long	10                              # 0xa
   614	.long	19                              # 0x13
   615	.long	0                               # 0x0
   616	.long	5                               # 0x5
   617	.long	14                              # 0xe
   618	.long	23                              # 0x17
   619.LCPI0_111:
   620	.long	0                               # 0x0
   621	.long	7                               # 0x7
   622	.long	14                              # 0xe
   623	.long	21                              # 0x15
   624	.long	0                               # 0x0
   625	.long	3                               # 0x3
   626	.long	10                              # 0xa
   627	.long	17                              # 0x11
   628.LCPI0_113:
   629	.long	24                              # 0x18
   630	.long	0                               # 0x0
   631	.long	6                               # 0x6
   632	.long	13                              # 0xd
   633	.long	20                              # 0x14
   634	.long	0                               # 0x0
   635	.long	2                               # 0x2
   636	.long	9                               # 0x9
   637.LCPI0_114:
   638	.long	16                              # 0x10
   639	.long	23                              # 0x17
   640	.long	0                               # 0x0
   641	.long	5                               # 0x5
   642	.long	12                              # 0xc
   643	.long	19                              # 0x13
   644	.long	0                               # 0x0
   645	.long	1                               # 0x1
   646.LCPI0_115:
   647	.long	8                               # 0x8
   648	.long	15                              # 0xf
   649	.long	22                              # 0x16
   650	.long	0                               # 0x0
   651	.long	4                               # 0x4
   652	.long	11                              # 0xb
   653	.long	18                              # 0x12
   654	.long	25                              # 0x19
   655.LCPI0_116:
   656	.long	0                               # 0x0
   657	.long	6                               # 0x6
   658	.long	12                              # 0xc
   659	.long	18                              # 0x12
   660	.long	24                              # 0x18
   661	.long	0                               # 0x0
   662	.long	4                               # 0x4
   663	.long	10                              # 0xa
   664.LCPI0_118:
   665	.long	16                              # 0x10
   666	.long	22                              # 0x16
   667	.long	0                               # 0x0
   668	.long	2                               # 0x2
   669	.long	8                               # 0x8
   670	.long	14                              # 0xe
   671	.long	20                              # 0x14
   672	.long	26                              # 0x1a
   673.LCPI0_119:                             # vpsrlvd counts, ecx==5 loop (.LBB0_19), store 1 of 4
   674	.long	0                               # 0x0
   675	.long	5                               # 0x5
   676	.long	10                              # 0xa
   677	.long	15                              # 0xf
   678	.long	20                              # 0x14
   679	.long	25                              # 0x19
   680	.long	0                               # 0x0
   681	.long	3                               # 0x3
   682.LCPI0_121:                             # vpsrlvd counts, ecx==5 loop (.LBB0_19), store 2 of 4
   683	.long	8                               # 0x8
   684	.long	13                              # 0xd
   685	.long	18                              # 0x12
   686	.long	23                              # 0x17
   687	.long	0                               # 0x0
   688	.long	1                               # 0x1
   689	.long	6                               # 0x6
   690	.long	11                              # 0xb
   691.LCPI0_122:                             # vpsrlvd counts, ecx==5 loop (.LBB0_19), store 3 of 4
   692	.long	16                              # 0x10
   693	.long	21                              # 0x15
   694	.long	26                              # 0x1a
   695	.long	0                               # 0x0
   696	.long	4                               # 0x4
   697	.long	9                               # 0x9
   698	.long	14                              # 0xe
   699	.long	19                              # 0x13
   700.LCPI0_123:                             # vpsrlvd counts, ecx==5 loop (.LBB0_19), store 4 of 4
   701	.long	24                              # 0x18
   702	.long	0                               # 0x0
   703	.long	2                               # 0x2
   704	.long	7                               # 0x7
   705	.long	12                              # 0xc
   706	.long	17                              # 0x11
   707	.long	22                              # 0x16
   708	.long	27                              # 0x1b
   709.LCPI0_124:                             # 0..28 step 4; load site outside this chunk - presumably vpsrlvd counts, confirm
   710	.long	0                               # 0x0
   711	.long	4                               # 0x4
   712	.long	8                               # 0x8
   713	.long	12                              # 0xc
   714	.long	16                              # 0x10
   715	.long	20                              # 0x14
   716	.long	24                              # 0x18
   717	.long	28                              # 0x1c
   718.LCPI0_126:                             # 0..21 step 3; load site outside this chunk - presumably vpsrlvd counts, confirm
   719	.long	0                               # 0x0
   720	.long	3                               # 0x3
   721	.long	6                               # 0x6
   722	.long	9                               # 0x9
   723	.long	12                              # 0xc
   724	.long	15                              # 0xf
   725	.long	18                              # 0x12
   726	.long	21                              # 0x15
   727.LCPI0_128:                             # 8 x .long; load site outside this chunk - presumably vpsrlvd counts, confirm
   728	.long	24                              # 0x18
   729	.long	27                              # 0x1b
   730	.long	0                               # 0x0
   731	.long	1                               # 0x1
   732	.long	4                               # 0x4
   733	.long	7                               # 0x7
   734	.long	10                              # 0xa
   735	.long	13                              # 0xd
   736.LCPI0_129:                             # 8 x .long; load site outside this chunk - presumably vpsrlvd counts, confirm
   737	.long	16                              # 0x10
   738	.long	19                              # 0x13
   739	.long	22                              # 0x16
   740	.long	25                              # 0x19
   741	.long	28                              # 0x1c
   742	.long	0                               # 0x0
   743	.long	2                               # 0x2
   744	.long	5                               # 0x5
   745.LCPI0_130:                             # 8..29 step 3; load site outside this chunk - presumably vpsrlvd counts, confirm
   746	.long	8                               # 0x8
   747	.long	11                              # 0xb
   748	.long	14                              # 0xe
   749	.long	17                              # 0x11
   750	.long	20                              # 0x14
   751	.long	23                              # 0x17
   752	.long	26                              # 0x1a
   753	.long	29                              # 0x1d
   754.LCPI0_131:                             # 0..14 step 2; load site outside this chunk - presumably vpsrlvd counts, confirm
   755	.long	0                               # 0x0
   756	.long	2                               # 0x2
   757	.long	4                               # 0x4
   758	.long	6                               # 0x6
   759	.long	8                               # 0x8
   760	.long	10                              # 0xa
   761	.long	12                              # 0xc
   762	.long	14                              # 0xe
   763.LCPI0_133:                             # 16..30 step 2; load site outside this chunk - presumably vpsrlvd counts, confirm
   764	.long	16                              # 0x10
   765	.long	18                              # 0x12
   766	.long	20                              # 0x14
   767	.long	22                              # 0x16
   768	.long	24                              # 0x18
   769	.long	26                              # 0x1a
   770	.long	28                              # 0x1c
   771	.long	30                              # 0x1e
   772.LCPI0_134:                             # vpsrlvd counts, ecx==1 loop (.LBB0_8), store 1 of 4
   773	.long	0                               # 0x0
   774	.long	1                               # 0x1
   775	.long	2                               # 0x2
   776	.long	3                               # 0x3
   777	.long	4                               # 0x4
   778	.long	5                               # 0x5
   779	.long	6                               # 0x6
   780	.long	7                               # 0x7
   781.LCPI0_136:                             # vpsrlvd counts, ecx==1 loop (.LBB0_8), store 4 of 4
   782	.long	24                              # 0x18
   783	.long	25                              # 0x19
   784	.long	26                              # 0x1a
   785	.long	27                              # 0x1b
   786	.long	28                              # 0x1c
   787	.long	29                              # 0x1d
   788	.long	30                              # 0x1e
   789	.long	31                              # 0x1f
   790	.section	.rodata.cst16,"aM",@progbits,16 # allocatable + mergeable section, 16-byte entries
   791	.p2align	4                               # 16-byte alignment
   792.LCPI0_5:                               # NOTE(review): lanes sum to 32 with .LCPI0_6 - presumably a vpsrlvd/vpsllvd count pair; load site outside this chunk
   793	.long	8                               # 0x8
   794	.long	7                               # 0x7
   795	.long	6                               # 0x6
   796	.long	5                               # 0x5
   797.LCPI0_6:
   798	.long	24                              # 0x18
   799	.long	25                              # 0x19
   800	.long	26                              # 0x1a
   801	.long	27                              # 0x1b
   802.LCPI0_9:                               # NOTE(review): lanes sum to 32 with .LCPI0_10 - presumably a vpsrlvd/vpsllvd count pair; load site outside this chunk
   803	.long	16                              # 0x10
   804	.long	14                              # 0xe
   805	.long	12                              # 0xc
   806	.long	10                              # 0xa
   807.LCPI0_10:
   808	.long	16                              # 0x10
   809	.long	18                              # 0x12
   810	.long	20                              # 0x14
   811	.long	22                              # 0x16
   812.LCPI0_13:                              # NOTE(review): lanes 0-1 sum to 32 with .LCPI0_14, lanes 2-3 zero-filled; load site outside this chunk
   813	.long	8                               # 0x8
   814	.long	5                               # 0x5
   815	.zero	4
   816	.zero	4
   817.LCPI0_14:
   818	.long	24                              # 0x18
   819	.long	27                              # 0x1b
   820	.zero	4
   821	.zero	4
   822.LCPI0_16:                              # NOTE(review): lanes sum to 32 with .LCPI0_17 - presumably a vpsrlvd/vpsllvd count pair; load site outside this chunk
   823	.long	16                              # 0x10
   824	.long	13                              # 0xd
   825	.long	10                              # 0xa
   826	.long	7                               # 0x7
   827.LCPI0_17:
   828	.long	16                              # 0x10
   829	.long	19                              # 0x13
   830	.long	22                              # 0x16
   831	.long	25                              # 0x19
   832.LCPI0_19:                              # NOTE(review): lanes sum to 32 with .LCPI0_20 - presumably a vpsrlvd/vpsllvd count pair; load site outside this chunk
   833	.long	24                              # 0x18
   834	.long	21                              # 0x15
   835	.long	18                              # 0x12
   836	.long	15                              # 0xf
   837.LCPI0_20:
   838	.long	8                               # 0x8
   839	.long	11                              # 0xb
   840	.long	14                              # 0xe
   841	.long	17                              # 0x11
   842.LCPI0_26:                              # NOTE(review): lanes sum to 32 with .LCPI0_27 - presumably a vpsrlvd/vpsllvd count pair; load site outside this chunk
   843	.long	24                              # 0x18
   844	.long	19                              # 0x13
   845	.long	14                              # 0xe
   846	.long	9                               # 0x9
   847.LCPI0_27:
   848	.long	8                               # 0x8
   849	.long	13                              # 0xd
   850	.long	18                              # 0x12
   851	.long	23                              # 0x17
   852.LCPI0_29:                              # NOTE(review): lanes 0-1 sum to 32 with .LCPI0_30, lanes 2-3 zero-filled; load site outside this chunk
   853	.long	16                              # 0x10
   854	.long	11                              # 0xb
   855	.zero	4
   856	.zero	4
   857.LCPI0_30:
   858	.long	16                              # 0x10
   859	.long	21                              # 0x15
   860	.zero	4
   861	.zero	4
   862.LCPI0_40:                              # vpsrlvd counts (lanes 0-1), ecx==25 loop (.LBB0_78); lanes 2-3 zero-filled
   863	.long	16                              # 0x10
   864	.long	9                               # 0x9
   865	.zero	4
   866	.zero	4
   867.LCPI0_41:                              # vpsllvd counts paired with .LCPI0_40 (results are OR-ed; lanes 0-1 sum to 32)
   868	.long	16                              # 0x10
   869	.long	23                              # 0x17
   870	.zero	4
   871	.zero	4
   872.LCPI0_43:                              # vpsrlvd counts (lanes 0-1), ecx==25 loop (.LBB0_78); lanes 2-3 zero-filled
   873	.long	24                              # 0x18
   874	.long	17                              # 0x11
   875	.zero	4
   876	.zero	4
   877.LCPI0_44:                              # vpsllvd counts paired with .LCPI0_43 (results are OR-ed; lanes 0-1 sum to 32)
   878	.long	8                               # 0x8
   879	.long	15                              # 0xf
   880	.zero	4
   881	.zero	4
   882.LCPI0_46:                              # 4 x .long; load site outside this chunk - purpose not determinable here
   883	.long	0                               # 0x0
   884	.long	0                               # 0x0
   885	.long	0                               # 0x0
   886	.long	8                               # 0x8
   887.LCPI0_50:                              # NOTE(review): lanes 0-1 sum to 32 with .LCPI0_51, lanes 2-3 zero-filled; load site outside this chunk
   888	.long	24                              # 0x18
   889	.long	15                              # 0xf
   890	.zero	4
   891	.zero	4
   892.LCPI0_51:
   893	.long	8                               # 0x8
   894	.long	17                              # 0x11
   895	.zero	4
   896	.zero	4
   897.LCPI0_62:                              # vpsrlvd counts (lanes 0-1), ecx==21 loop (.LBB0_66); lanes 2-3 zero-filled
   898	.long	24                              # 0x18
   899	.long	13                              # 0xd
   900	.zero	4
   901	.zero	4
   902.LCPI0_63:                              # vpsllvd counts paired with .LCPI0_62 (results are OR-ed; lanes 0-1 sum to 32)
   903	.long	8                               # 0x8
   904	.long	19                              # 0x13
   905	.zero	4
   906	.zero	4
   907.LCPI0_109:                             # 0..24 step 8; load site outside this chunk - presumably per-lane shift counts, confirm
   908	.long	0                               # 0x0
   909	.long	8                               # 0x8
   910	.long	16                              # 0x10
   911	.long	24                              # 0x18
   912	.section	.rodata.cst4,"aM",@progbits,4 # allocatable + mergeable section, 4-byte entries
   913	.p2align	2                               # 4-byte alignment
   914.LCPI0_47:                              # low 24 bits set; load site outside this chunk - presumably a lane mask, confirm
   915	.long	16777215                        # 0xffffff
   916.LCPI0_110:                             # low 8 bits set; load site outside this chunk - presumably a lane mask, confirm
   917	.long	255                             # 0xff
   918	.text
   919	.globl	unpack32_avx2
   920	.p2align	4, 0x90
   921	.type	unpack32_avx2,@function
   922unpack32_avx2:                          # @unpack32_avx2
   923# %bb.0:
   924	push	rbp
   925	mov	rbp, rsp
   926	push	r15
   927	push	r14
   928	push	r12
   929	push	rbx
   930	and	rsp, -16
   931                                        # kill: def $edx killed $edx def $rdx
   932	mov	r15, rsi
   933	mov	rbx, rdi
   934	lea	r14d, [rdx + 31]
   935	test	edx, edx
   936	cmovns	r14d, edx
   937	sar	r14d, 5
   938	cmp	ecx, 15
   939	jle	.LBB0_1
   940# %bb.48:
   941	cmp	ecx, 23
   942	jle	.LBB0_49
   943# %bb.72:
   944	cmp	ecx, 27
   945	jle	.LBB0_73
   946# %bb.84:
   947	cmp	ecx, 29
   948	jle	.LBB0_85
   949# %bb.90:
   950	cmp	ecx, 30
   951	je	.LBB0_99
   952# %bb.91:
   953	cmp	ecx, 31
   954	je	.LBB0_96
   955# %bb.92:
   956	cmp	ecx, 32
   957	jne	.LBB0_147
   958# %bb.93:
   959	cmp	edx, 32
   960	jl	.LBB0_147
   961# %bb.94:
   962	mov	r12d, r14d
   963	.p2align	4, 0x90
   964.LBB0_95:                               # =>This Inner Loop Header: Depth=1
   965	mov	edx, 128
   966	mov	rdi, r15
   967	mov	rsi, rbx
   968	call	clib·_memcpy(SB)
   969	sub	rbx, -128
   970	sub	r15, -128
   971	add	r12, -1
   972	jne	.LBB0_95
   973	jmp	.LBB0_147
   974.LBB0_1:
   975	cmp	ecx, 7
   976	jg	.LBB0_25
   977# %bb.2:
   978	cmp	ecx, 3
   979	jg	.LBB0_14
   980# %bb.3:
   981	cmp	ecx, 1
   982	jg	.LBB0_9
   983# %bb.4:
   984	test	ecx, ecx
   985	je	.LBB0_144
   986# %bb.5:
   987	cmp	ecx, 1
   988	jne	.LBB0_147
   989# %bb.6:
   990	cmp	edx, 32
   991	jl	.LBB0_147
   992# %bb.7:
   993	mov	eax, r14d
   994	add	r15, 96
   995	xor	ecx, ecx
   996	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_135] # ymm0 = [4294967297,4294967297,4294967297,4294967297]
   997	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_134] # ymm1 = [0,1,2,3,4,5,6,7]
   998	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_2] # ymm2 = [8,9,10,11,12,13,14,15]
   999	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_4] # ymm3 = [16,17,18,19,20,21,22,23]
  1000	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_136] # ymm4 = [24,25,26,27,28,29,30,31]
  1001	.p2align	4, 0x90
  1002.LBB0_8:                                # =>This Inner Loop Header: Depth=1
  1003	vpbroadcastd	ymm5, dword ptr [rbx + 4*rcx]
  1004	vpsrlvd	ymm5, ymm5, ymm1
  1005	vpand	ymm5, ymm5, ymm0
  1006	vmovdqu	ymmword ptr [r15 - 96], ymm5
  1007	vpbroadcastd	ymm5, dword ptr [rbx + 4*rcx]
  1008	vpsrlvd	ymm5, ymm5, ymm2
  1009	vpand	ymm5, ymm5, ymm0
  1010	vmovdqu	ymmword ptr [r15 - 64], ymm5
  1011	vpbroadcastd	ymm5, dword ptr [rbx + 4*rcx]
  1012	vpsrlvd	ymm5, ymm5, ymm3
  1013	vpand	ymm5, ymm5, ymm0
  1014	vmovdqu	ymmword ptr [r15 - 32], ymm5
  1015	vpbroadcastd	ymm5, dword ptr [rbx + 4*rcx]
  1016	vpsrlvd	ymm5, ymm5, ymm4
  1017	vpand	ymm5, ymm5, ymm0
  1018	vmovdqu	ymmword ptr [r15], ymm5
  1019	add	rcx, 1
  1020	sub	r15, -128
  1021	cmp	rax, rcx
  1022	jne	.LBB0_8
  1023	jmp	.LBB0_147
  1024.LBB0_49:
  1025	cmp	ecx, 19
  1026	jg	.LBB0_61
  1027# %bb.50:
  1028	cmp	ecx, 17
  1029	jg	.LBB0_56
  1030# %bb.51:
  1031	cmp	ecx, 16
  1032	je	.LBB0_120
  1033# %bb.52:
  1034	cmp	ecx, 17
  1035	jne	.LBB0_147
  1036# %bb.53:
  1037	cmp	edx, 32
  1038	jl	.LBB0_147
  1039# %bb.54:
  1040	mov	r8d, r14d
  1041	add	r15, 96
  1042	add	rbx, 64
  1043	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_76] # ymm0 = [562945658585087,562945658585087,562945658585087,562945658585087]
  1044	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_75] # ymm1 = [0,0,2,0,4,0,6,0]
  1045	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_77] # ymm2 = [8,0,10,0,12,0,14,0]
  1046	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_78] # ymm3 = [0,1,0,3,0,5,0,7]
  1047	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_79] # ymm4 = [0,9,0,11,0,13,0,15]
  1048	.p2align	4, 0x90
  1049.LBB0_55:                               # =>This Inner Loop Header: Depth=1
  1050	mov	ecx, dword ptr [rbx - 52]
  1051	mov	r10d, dword ptr [rbx - 48]
  1052	shld	r10d, ecx, 9
  1053	mov	esi, dword ptr [rbx - 56]
  1054	mov	edi, ecx
  1055	shld	edi, esi, 11
  1056	mov	r9d, dword ptr [rbx - 64]
  1057	mov	edx, dword ptr [rbx - 60]
  1058	mov	eax, edx
  1059	shld	eax, r9d, 15
  1060	vmovd	xmm5, esi
  1061	shld	esi, edx, 13
  1062	vpinsrd	xmm5, xmm5, edi, 1
  1063	vpinsrd	xmm5, xmm5, ecx, 2
  1064	vpinsrd	xmm5, xmm5, r10d, 3
  1065	vmovd	xmm6, r9d
  1066	vpinsrd	xmm6, xmm6, eax, 1
  1067	vpinsrd	xmm6, xmm6, edx, 2
  1068	vpinsrd	xmm6, xmm6, esi, 3
  1069	vinserti128	ymm5, ymm6, xmm5, 1
  1070	vpsrlvd	ymm5, ymm5, ymm1
  1071	vpand	ymm5, ymm5, ymm0
  1072	vmovdqu	ymmword ptr [r15 - 96], ymm5
  1073	mov	eax, dword ptr [rbx - 36]
  1074	mov	r10d, dword ptr [rbx - 32]
  1075	shld	r10d, eax, 1
  1076	mov	edx, dword ptr [rbx - 40]
  1077	mov	esi, eax
  1078	shld	esi, edx, 3
  1079	mov	r9d, dword ptr [rbx - 48]
  1080	mov	ecx, dword ptr [rbx - 44]
  1081	mov	edi, ecx
  1082	shld	edi, r9d, 7
  1083	vmovd	xmm5, edx
  1084	shld	edx, ecx, 5
  1085	vpinsrd	xmm5, xmm5, esi, 1
  1086	vpinsrd	xmm5, xmm5, eax, 2
  1087	vpinsrd	xmm5, xmm5, r10d, 3
  1088	vmovd	xmm6, r9d
  1089	vpinsrd	xmm6, xmm6, edi, 1
  1090	vpinsrd	xmm6, xmm6, ecx, 2
  1091	vpinsrd	xmm6, xmm6, edx, 3
  1092	vinserti128	ymm5, ymm6, xmm5, 1
  1093	vpsrlvd	ymm5, ymm5, ymm2
  1094	vpand	ymm5, ymm5, ymm0
  1095	vmovdqu	ymmword ptr [r15 - 64], ymm5
  1096	mov	r9d, dword ptr [rbx - 16]
  1097	mov	r11d, dword ptr [rbx - 20]
  1098	mov	edx, r9d
  1099	shld	edx, r11d, 10
  1100	mov	r10d, dword ptr [rbx - 24]
  1101	mov	edi, r11d
  1102	shld	edi, r10d, 12
  1103	mov	eax, dword ptr [rbx - 28]
  1104	mov	esi, r10d
  1105	shld	esi, eax, 14
  1106	mov	ecx, dword ptr [rbx - 32]
  1107	shrd	ecx, eax, 16
  1108	vmovd	xmm5, edi
  1109	vpinsrd	xmm5, xmm5, r11d, 1
  1110	vpinsrd	xmm5, xmm5, edx, 2
  1111	vpinsrd	xmm5, xmm5, r9d, 3
  1112	vmovd	xmm6, ecx
  1113	vpinsrd	xmm6, xmm6, eax, 1
  1114	vpinsrd	xmm6, xmm6, esi, 2
  1115	vpinsrd	xmm6, xmm6, r10d, 3
  1116	vinserti128	ymm5, ymm6, xmm5, 1
  1117	vpsrlvd	ymm5, ymm5, ymm3
  1118	vpand	ymm5, ymm5, ymm0
  1119	vmovdqu	ymmword ptr [r15 - 32], ymm5
  1120	mov	r9d, dword ptr [rbx]
  1121	mov	r11d, dword ptr [rbx - 4]
  1122	mov	edx, r9d
  1123	shld	edx, r11d, 2
  1124	mov	r10d, dword ptr [rbx - 8]
  1125	mov	edi, r11d
  1126	shld	edi, r10d, 4
  1127	mov	eax, dword ptr [rbx - 16]
  1128	mov	esi, dword ptr [rbx - 12]
  1129	mov	ecx, r10d
  1130	shld	ecx, esi, 6
  1131	shrd	eax, esi, 24
  1132	vmovd	xmm5, edi
  1133	vpinsrd	xmm5, xmm5, r11d, 1
  1134	vpinsrd	xmm5, xmm5, edx, 2
  1135	vpinsrd	xmm5, xmm5, r9d, 3
  1136	vmovd	xmm6, eax
  1137	vpinsrd	xmm6, xmm6, esi, 1
  1138	vpinsrd	xmm6, xmm6, ecx, 2
  1139	vpinsrd	xmm6, xmm6, r10d, 3
  1140	vinserti128	ymm5, ymm6, xmm5, 1
  1141	vpsrlvd	ymm5, ymm5, ymm4
  1142	vpand	ymm5, ymm5, ymm0
  1143	vmovdqu	ymmword ptr [r15], ymm5
  1144	sub	r15, -128
  1145	add	rbx, 68
  1146	add	r8, -1
  1147	jne	.LBB0_55
  1148	jmp	.LBB0_147
  1149.LBB0_25:
  1150	cmp	ecx, 11
  1151	jg	.LBB0_37
  1152# %bb.26:
  1153	cmp	ecx, 9
  1154	jg	.LBB0_32
  1155# %bb.27:
  1156	cmp	ecx, 8
  1157	je	.LBB0_132
  1158# %bb.28:
  1159	cmp	ecx, 9
  1160	jne	.LBB0_147
  1161# %bb.29:
  1162	cmp	edx, 32
  1163	jl	.LBB0_147
  1164# %bb.30:
  1165	mov	r8d, r14d
  1166	add	r15, 96
  1167	add	rbx, 32
  1168	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_105] # ymm0 = [2194728288767,2194728288767,2194728288767,2194728288767]
  1169	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_104] # ymm1 = [0,9,18,0,4,13,22,0]
  1170	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_106] # ymm2 = [8,17,0,3,12,21,0,7]
  1171	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_107] # ymm3 = [16,0,2,11,20,0,6,15]
  1172	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_108] # ymm4 = [0,1,10,19,0,5,14,23]
  1173	.p2align	4, 0x90
  1174.LBB0_31:                               # =>This Inner Loop Header: Depth=1
  1175	mov	ecx, dword ptr [rbx - 32]
  1176	mov	edx, dword ptr [rbx - 28]
  1177	mov	esi, dword ptr [rbx - 24]
  1178	shld	esi, edx, 1
  1179	vmovd	xmm5, edx
  1180	vpinsrd	xmm5, xmm5, edx, 1
  1181	vpinsrd	xmm5, xmm5, edx, 2
  1182	shld	edx, ecx, 5
  1183	vpinsrd	xmm5, xmm5, esi, 3
  1184	vmovd	xmm6, ecx
  1185	vpinsrd	xmm6, xmm6, ecx, 1
  1186	vpinsrd	xmm6, xmm6, ecx, 2
  1187	vpinsrd	xmm6, xmm6, edx, 3
  1188	vinserti128	ymm5, ymm6, xmm5, 1
  1189	vpsrlvd	ymm5, ymm5, ymm1
  1190	vpand	ymm5, ymm5, ymm0
  1191	vmovdqu	ymmword ptr [r15 - 96], ymm5
  1192	mov	ecx, dword ptr [rbx - 16]
  1193	mov	edx, dword ptr [rbx - 24]
  1194	mov	esi, dword ptr [rbx - 20]
  1195	mov	edi, ecx
  1196	shld	edi, esi, 2
  1197	mov	eax, esi
  1198	shld	eax, edx, 6
  1199	vmovd	xmm5, esi
  1200	vpinsrd	xmm5, xmm5, esi, 1
  1201	vpinsrd	xmm5, xmm5, edi, 2
  1202	vpinsrd	xmm5, xmm5, ecx, 3
  1203	vmovd	xmm6, edx
  1204	vpinsrd	xmm6, xmm6, edx, 1
  1205	vpinsrd	xmm6, xmm6, eax, 2
  1206	vpinsrd	xmm6, xmm6, esi, 3
  1207	vinserti128	ymm5, ymm6, xmm5, 1
  1208	vpsrlvd	ymm5, ymm5, ymm2
  1209	vpand	ymm5, ymm5, ymm0
  1210	vmovdqu	ymmword ptr [r15 - 64], ymm5
  1211	mov	eax, dword ptr [rbx - 8]
  1212	mov	ecx, dword ptr [rbx - 16]
  1213	mov	edx, dword ptr [rbx - 12]
  1214	mov	esi, eax
  1215	shld	esi, edx, 3
  1216	mov	edi, edx
  1217	shld	edi, ecx, 7
  1218	vmovd	xmm5, edx
  1219	vpinsrd	xmm5, xmm5, esi, 1
  1220	vpinsrd	xmm5, xmm5, eax, 2
  1221	vpinsrd	xmm5, xmm5, eax, 3
  1222	vmovd	xmm6, ecx
  1223	vpinsrd	xmm6, xmm6, edi, 1
  1224	vpinsrd	xmm6, xmm6, edx, 2
  1225	vpinsrd	xmm6, xmm6, edx, 3
  1226	vinserti128	ymm5, ymm6, xmm5, 1
  1227	vpsrlvd	ymm5, ymm5, ymm3
  1228	vpand	ymm5, ymm5, ymm0
  1229	vmovdqu	ymmword ptr [r15 - 32], ymm5
  1230	mov	eax, dword ptr [rbx]
  1231	mov	ecx, dword ptr [rbx - 8]
  1232	mov	edx, dword ptr [rbx - 4]
  1233	mov	esi, eax
  1234	shld	esi, edx, 4
  1235	shrd	ecx, edx, 24
  1236	vmovd	xmm5, esi
  1237	vpinsrd	xmm5, xmm5, eax, 1
  1238	vpinsrd	xmm5, xmm5, eax, 2
  1239	vpinsrd	xmm5, xmm5, eax, 3
  1240	vmovd	xmm6, ecx
  1241	vpinsrd	xmm6, xmm6, edx, 1
  1242	vpinsrd	xmm6, xmm6, edx, 2
  1243	vpinsrd	xmm6, xmm6, edx, 3
  1244	vinserti128	ymm5, ymm6, xmm5, 1
  1245	vpsrlvd	ymm5, ymm5, ymm4
  1246	vpand	ymm5, ymm5, ymm0
  1247	vmovdqu	ymmword ptr [r15], ymm5
  1248	sub	r15, -128
  1249	add	rbx, 36
  1250	add	r8, -1
  1251	jne	.LBB0_31
  1252	jmp	.LBB0_147
  1253.LBB0_73:
  1254	cmp	ecx, 25
  1255	jg	.LBB0_79
  1256# %bb.74:
  1257	cmp	ecx, 24
  1258	je	.LBB0_108
  1259# %bb.75:
  1260	cmp	ecx, 25
  1261	jne	.LBB0_147
  1262# %bb.76:
  1263	cmp	edx, 32
  1264	jl	.LBB0_147
  1265# %bb.77:
  1266	mov	r8d, r14d
  1267	add	r15, 96
  1268	add	rbx, 96
  1269	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_38] # ymm0 = [144115183814443007,144115183814443007,144115183814443007,144115183814443007]
  1270	vmovdqa	ymm9, ymmword ptr [rip + .LCPI0_28] # ymm9 = [0,0,0,0,4,0,0,0]
  1271	vmovdqa	ymm10, ymmword ptr [rip + .LCPI0_39] # ymm10 = [0,1,0,0,0,5,0,0]
  1272	vmovdqa	xmm11, xmmword ptr [rip + .LCPI0_40] # xmm11 = <16,9,u,u>
  1273	vmovdqa	xmm4, xmmword ptr [rip + .LCPI0_41] # xmm4 = <16,23,u,u>
  1274	vmovdqa	ymm5, ymmword ptr [rip + .LCPI0_42] # ymm5 = [0,0,2,0,0,0,6,0]
  1275	vmovdqa	xmm6, xmmword ptr [rip + .LCPI0_43] # xmm6 = <24,17,u,u>
  1276	vmovdqa	xmm7, xmmword ptr [rip + .LCPI0_44] # xmm7 = <8,15,u,u>
  1277	vmovdqa	ymm8, ymmword ptr [rip + .LCPI0_45] # ymm8 = [0,0,0,3,0,0,0,7]
  1278	.p2align	4, 0x90
  1279.LBB0_78:                               # =>This Inner Loop Header: Depth=1
  1280	mov	ecx, dword ptr [rbx - 76]
  1281	mov	r9d, dword ptr [rbx - 72]
  1282	shld	r9d, ecx, 17
  1283	mov	esi, dword ptr [rbx - 80]
  1284	shld	ecx, esi, 10
  1285	mov	edi, dword ptr [rbx - 84]
  1286	shld	esi, edi, 3
  1287	mov	eax, dword ptr [rbx - 88]
  1288	vmovd	xmm1, edi
  1289	shld	edi, eax, 21
  1290	mov	r10d, dword ptr [rbx - 96]
  1291	mov	edx, dword ptr [rbx - 92]
  1292	shld	eax, edx, 14
  1293	shld	edx, r10d, 7
  1294	vpinsrd	xmm1, xmm1, esi, 1
  1295	vmovd	xmm2, r10d
  1296	vpinsrd	xmm1, xmm1, ecx, 2
  1297	vpinsrd	xmm2, xmm2, edx, 1
  1298	vpinsrd	xmm1, xmm1, r9d, 3
  1299	vpinsrd	xmm2, xmm2, eax, 2
  1300	vpinsrd	xmm2, xmm2, edi, 3
  1301	vinserti128	ymm1, ymm2, xmm1, 1
  1302	vpsrlvd	ymm1, ymm1, ymm9
  1303	vpand	ymm1, ymm1, ymm0
  1304	vmovdqu	ymmword ptr [r15 - 96], ymm1
  1305	mov	r11d, dword ptr [rbx - 52]
  1306	mov	r9d, dword ptr [rbx - 48]
  1307	shld	r9d, r11d, 9
  1308	mov	r10d, dword ptr [rbx - 56]
  1309	shld	r11d, r10d, 2
  1310	mov	esi, dword ptr [rbx - 60]
  1311	mov	edi, r10d
  1312	mov	ecx, dword ptr [rbx - 64]
  1313	shld	edi, esi, 20
  1314	mov	edx, dword ptr [rbx - 72]
  1315	mov	eax, dword ptr [rbx - 68]
  1316	shld	esi, ecx, 13
  1317	shrd	edx, eax, 8
  1318	shld	ecx, eax, 6
  1319	vmovd	xmm1, edi
  1320	vpinsrd	xmm1, xmm1, r10d, 1
  1321	vmovd	xmm2, edx
  1322	vpinsrd	xmm1, xmm1, r11d, 2
  1323	vpinsrd	xmm2, xmm2, eax, 1
  1324	vpinsrd	xmm1, xmm1, r9d, 3
  1325	vpinsrd	xmm2, xmm2, ecx, 2
  1326	vpinsrd	xmm2, xmm2, esi, 3
  1327	vinserti128	ymm1, ymm2, xmm1, 1
  1328	vpsrlvd	ymm1, ymm1, ymm10
  1329	vpand	ymm1, ymm1, ymm0
  1330	vmovdqu	ymmword ptr [r15 - 64], ymm1
  1331	mov	eax, dword ptr [rbx - 28]
  1332	mov	r9d, dword ptr [rbx - 24]
  1333	shld	r9d, eax, 1
  1334	mov	edx, dword ptr [rbx - 32]
  1335	mov	esi, eax
  1336	shld	esi, edx, 19
  1337	mov	edi, dword ptr [rbx - 40]
  1338	mov	ecx, dword ptr [rbx - 36]
  1339	shld	edx, ecx, 12
  1340	shld	ecx, edi, 5
  1341	vmovq	xmm1, qword ptr [rbx - 48]      # xmm1 = mem[0],zero
  1342	vpsrlvd	xmm2, xmm1, xmm11
  1343	vpshufd	xmm1, xmm1, 229                 # xmm1 = xmm1[1,1,2,3]
  1344	vpinsrd	xmm1, xmm1, edi, 1
  1345	vpsllvd	xmm1, xmm1, xmm4
  1346	vpor	xmm1, xmm2, xmm1
  1347	vmovd	xmm2, edx
  1348	vpinsrd	xmm2, xmm2, esi, 1
  1349	vpinsrd	xmm2, xmm2, eax, 2
  1350	vpinsrd	xmm2, xmm2, r9d, 3
  1351	vpinsrd	xmm1, xmm1, edi, 2
  1352	vpinsrd	xmm1, xmm1, ecx, 3
  1353	vinserti128	ymm1, ymm1, xmm2, 1
  1354	vpsrlvd	ymm1, ymm1, ymm5
  1355	vpand	ymm1, ymm1, ymm0
  1356	vmovdqu	ymmword ptr [r15 - 32], ymm1
  1357	mov	r9d, dword ptr [rbx]
  1358	mov	ecx, dword ptr [rbx - 4]
  1359	mov	edx, r9d
  1360	shld	edx, ecx, 18
  1361	mov	esi, dword ptr [rbx - 8]
  1362	shld	ecx, esi, 11
  1363	mov	r10d, dword ptr [rbx - 16]
  1364	mov	edi, dword ptr [rbx - 12]
  1365	shld	esi, edi, 4
  1366	mov	eax, edi
  1367	shld	eax, r10d, 22
  1368	vmovq	xmm1, qword ptr [rbx - 24]      # xmm1 = mem[0],zero
  1369	vpsrlvd	xmm2, xmm1, xmm6
  1370	vpshufd	xmm1, xmm1, 229                 # xmm1 = xmm1[1,1,2,3]
  1371	vpinsrd	xmm1, xmm1, r10d, 1
  1372	vpsllvd	xmm1, xmm1, xmm7
  1373	vmovd	xmm3, esi
  1374	vpinsrd	xmm3, xmm3, ecx, 1
  1375	vpor	xmm1, xmm2, xmm1
  1376	vpinsrd	xmm2, xmm3, edx, 2
  1377	vpinsrd	xmm2, xmm2, r9d, 3
  1378	vpinsrd	xmm1, xmm1, eax, 2
  1379	vpinsrd	xmm1, xmm1, edi, 3
  1380	vinserti128	ymm1, ymm1, xmm2, 1
  1381	vpsrlvd	ymm1, ymm1, ymm8
  1382	vpand	ymm1, ymm1, ymm0
  1383	vmovdqu	ymmword ptr [r15], ymm1
  1384	sub	r15, -128
  1385	add	rbx, 100
  1386	add	r8, -1
  1387	jne	.LBB0_78
  1388	jmp	.LBB0_147
  1389.LBB0_14:
  1390	cmp	ecx, 5
  1391	jg	.LBB0_20
  1392# %bb.15:
  1393	cmp	ecx, 4
  1394	je	.LBB0_138
  1395# %bb.16:
  1396	cmp	ecx, 5
  1397	jne	.LBB0_147
  1398# %bb.17:
  1399	cmp	edx, 32
  1400	jl	.LBB0_147
  1401# %bb.18:
  1402	mov	eax, r14d
  1403	add	r15, 96
  1404	add	rbx, 16
  1405	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_120] # ymm0 = [133143986207,133143986207,133143986207,133143986207]
  1406	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_119] # ymm1 = [0,5,10,15,20,25,0,3]
  1407	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_121] # ymm2 = [8,13,18,23,0,1,6,11]
  1408	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_122] # ymm3 = [16,21,26,0,4,9,14,19]
  1409	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_123] # ymm4 = [24,0,2,7,12,17,22,27]
  1410	.p2align	4, 0x90
  1411.LBB0_19:                               # =>This Inner Loop Header: Depth=1
  1412	mov	ecx, dword ptr [rbx - 16]
  1413	mov	edx, dword ptr [rbx - 12]
  1414	mov	esi, edx
  1415	shld	esi, ecx, 2
  1416	vmovd	xmm5, ecx
  1417	vpbroadcastd	xmm6, xmm5
  1418	vpinsrd	xmm5, xmm5, ecx, 1
  1419	vpinsrd	xmm5, xmm5, esi, 2
  1420	vpinsrd	xmm5, xmm5, edx, 3
  1421	vinserti128	ymm5, ymm6, xmm5, 1
  1422	vpsrlvd	ymm5, ymm5, ymm1
  1423	vpand	ymm5, ymm5, ymm0
  1424	vmovdqu	ymmword ptr [r15 - 96], ymm5
  1425	mov	ecx, dword ptr [rbx - 12]
  1426	mov	edx, dword ptr [rbx - 8]
  1427	mov	esi, edx
  1428	shld	esi, ecx, 4
  1429	vmovd	xmm5, ecx
  1430	vpbroadcastd	xmm5, xmm5
  1431	vmovd	xmm6, esi
  1432	vpinsrd	xmm6, xmm6, edx, 1
  1433	vpinsrd	xmm6, xmm6, edx, 2
  1434	vpinsrd	xmm6, xmm6, edx, 3
  1435	vinserti128	ymm5, ymm5, xmm6, 1
  1436	vpsrlvd	ymm5, ymm5, ymm2
  1437	vpand	ymm5, ymm5, ymm0
  1438	vmovdqu	ymmword ptr [r15 - 64], ymm5
  1439	mov	ecx, dword ptr [rbx - 8]
  1440	mov	edx, dword ptr [rbx - 4]
  1441	vmovd	xmm5, edx
  1442	shld	edx, ecx, 1
  1443	vmovd	xmm6, ecx
  1444	vpinsrd	xmm6, xmm6, ecx, 1
  1445	vpinsrd	xmm6, xmm6, ecx, 2
  1446	vpinsrd	xmm6, xmm6, edx, 3
  1447	vpbroadcastd	xmm5, xmm5
  1448	vinserti128	ymm5, ymm6, xmm5, 1
  1449	vpsrlvd	ymm5, ymm5, ymm3
  1450	vpand	ymm5, ymm5, ymm0
  1451	vmovdqu	ymmword ptr [r15 - 32], ymm5
  1452	mov	ecx, dword ptr [rbx - 4]
  1453	mov	edx, dword ptr [rbx]
  1454	mov	esi, edx
  1455	shld	esi, ecx, 3
  1456	vmovd	xmm5, ecx
  1457	vpinsrd	xmm5, xmm5, esi, 1
  1458	vpinsrd	xmm5, xmm5, edx, 2
  1459	vpinsrd	xmm5, xmm5, edx, 3
  1460	vmovd	xmm6, edx
  1461	vpbroadcastd	xmm6, xmm6
  1462	vinserti128	ymm5, ymm5, xmm6, 1
  1463	vpsrlvd	ymm5, ymm5, ymm4
  1464	vpand	ymm5, ymm5, ymm0
  1465	vmovdqu	ymmword ptr [r15], ymm5
  1466	sub	r15, -128
  1467	add	rbx, 20
  1468	add	rax, -1
  1469	jne	.LBB0_19
  1470	jmp	.LBB0_147
  1471.LBB0_61:
  1472	cmp	ecx, 21
  1473	jg	.LBB0_67
  1474# %bb.62:
  1475	cmp	ecx, 20
  1476	je	.LBB0_114
  1477# %bb.63:
  1478	cmp	ecx, 21
  1479	jne	.LBB0_147
  1480# %bb.64:
  1481	cmp	edx, 32
  1482	jl	.LBB0_147
  1483# %bb.65:
  1484	mov	r8d, r14d
  1485	add	r15, 96
  1486	add	rbx, 80
  1487	vmovdqa	ymm8, ymmword ptr [rip + .LCPI0_58] # ymm8 = [0,0,10,0,0,9,0,0]
  1488	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_59] # ymm1 = [9007194961870847,9007194961870847,9007194961870847,9007194961870847]
  1489	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_60] # ymm2 = [8,0,0,7,0,0,6,0]
  1490	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_61] # ymm3 = [0,5,0,0,4,0,0,3]
  1491	vmovdqa	xmm4, xmmword ptr [rip + .LCPI0_62] # xmm4 = <24,13,u,u>
  1492	vmovdqa	xmm5, xmmword ptr [rip + .LCPI0_63] # xmm5 = <8,19,u,u>
  1493	vmovdqa	ymm6, ymmword ptr [rip + .LCPI0_64] # ymm6 = [0,0,2,0,0,1,0,11]
  1494	.p2align	4, 0x90
  1495.LBB0_66:                               # =>This Inner Loop Header: Depth=1
  1496	mov	ecx, dword ptr [rbx - 64]
  1497	mov	r9d, dword ptr [rbx - 60]
  1498	shld	r9d, ecx, 13
  1499	mov	r11d, dword ptr [rbx - 68]
  1500	shld	ecx, r11d, 2
  1501	mov	edi, dword ptr [rbx - 72]
  1502	mov	esi, r11d
  1503	shld	esi, edi, 12
  1504	mov	r10d, dword ptr [rbx - 80]
  1505	mov	eax, dword ptr [rbx - 76]
  1506	shld	edi, eax, 1
  1507	mov	edx, eax
  1508	shld	edx, r10d, 11
  1509	vmovd	xmm7, r10d
  1510	vmovd	xmm0, esi
  1511	vpinsrd	xmm7, xmm7, edx, 1
  1512	vpinsrd	xmm0, xmm0, r11d, 1
  1513	vpinsrd	xmm7, xmm7, eax, 2
  1514	vpinsrd	xmm0, xmm0, ecx, 2
  1515	vpinsrd	xmm7, xmm7, edi, 3
  1516	vpinsrd	xmm0, xmm0, r9d, 3
  1517	vinserti128	ymm0, ymm7, xmm0, 1
  1518	vpsrlvd	ymm0, ymm0, ymm8
  1519	vpand	ymm0, ymm0, ymm1
  1520	vmovdqu	ymmword ptr [r15 - 96], ymm0
  1521	mov	r10d, dword ptr [rbx - 44]
  1522	mov	r9d, dword ptr [rbx - 40]
  1523	shld	r9d, r10d, 5
  1524	mov	edx, dword ptr [rbx - 48]
  1525	mov	esi, r10d
  1526	shld	esi, edx, 15
  1527	mov	ecx, dword ptr [rbx - 52]
  1528	shld	edx, ecx, 4
  1529	mov	r11d, dword ptr [rbx - 60]
  1530	mov	eax, dword ptr [rbx - 56]
  1531	mov	edi, ecx
  1532	shld	edi, eax, 14
  1533	shld	eax, r11d, 3
  1534	vmovd	xmm0, r11d
  1535	vmovd	xmm7, edx
  1536	vpinsrd	xmm0, xmm0, eax, 1
  1537	vpinsrd	xmm7, xmm7, esi, 1
  1538	vpinsrd	xmm0, xmm0, edi, 2
  1539	vpinsrd	xmm7, xmm7, r10d, 2
  1540	vpinsrd	xmm0, xmm0, ecx, 3
  1541	vpinsrd	xmm7, xmm7, r9d, 3
  1542	vinserti128	ymm0, ymm0, xmm7, 1
  1543	vpsrlvd	ymm0, ymm0, ymm2
  1544	vpand	ymm0, ymm0, ymm1
  1545	vmovdqu	ymmword ptr [r15 - 64], ymm0
  1546	mov	r9d, dword ptr [rbx - 20]
  1547	mov	ecx, dword ptr [rbx - 24]
  1548	mov	r10d, r9d
  1549	shld	r10d, ecx, 18
  1550	mov	esi, dword ptr [rbx - 28]
  1551	shld	ecx, esi, 7
  1552	mov	edi, dword ptr [rbx - 32]
  1553	vmovd	xmm0, esi
  1554	shld	esi, edi, 17
  1555	mov	eax, dword ptr [rbx - 40]
  1556	mov	edx, dword ptr [rbx - 36]
  1557	shld	edi, edx, 6
  1558	shrd	eax, edx, 16
  1559	vpinsrd	xmm0, xmm0, ecx, 1
  1560	vmovd	xmm7, eax
  1561	vpinsrd	xmm0, xmm0, r10d, 2
  1562	vpinsrd	xmm7, xmm7, edx, 1
  1563	vpinsrd	xmm0, xmm0, r9d, 3
  1564	vpinsrd	xmm7, xmm7, edi, 2
  1565	vpinsrd	xmm7, xmm7, esi, 3
  1566	vinserti128	ymm0, ymm7, xmm0, 1
  1567	vpsrlvd	ymm0, ymm0, ymm3
  1568	vpand	ymm0, ymm0, ymm1
  1569	vmovdqu	ymmword ptr [r15 - 32], ymm0
  1570	mov	r9d, dword ptr [rbx]
  1571	mov	eax, dword ptr [rbx - 4]
  1572	mov	edx, r9d
  1573	shld	edx, eax, 10
  1574	mov	esi, dword ptr [rbx - 12]
  1575	mov	edi, dword ptr [rbx - 8]
  1576	mov	ecx, eax
  1577	shld	ecx, edi, 20
  1578	shld	edi, esi, 9
  1579	vmovq	xmm0, qword ptr [rbx - 20]      # xmm0 = mem[0],zero
  1580	vpsrlvd	xmm7, xmm0, xmm4
  1581	vpshufd	xmm0, xmm0, 229                 # xmm0 = xmm0[1,1,2,3]
  1582	vpinsrd	xmm0, xmm0, esi, 1
  1583	vpsllvd	xmm0, xmm0, xmm5
  1584	vpor	xmm0, xmm7, xmm0
  1585	vmovd	xmm7, ecx
  1586	vpinsrd	xmm7, xmm7, eax, 1
  1587	vpinsrd	xmm7, xmm7, edx, 2
  1588	vpinsrd	xmm7, xmm7, r9d, 3
  1589	vpinsrd	xmm0, xmm0, esi, 2
  1590	vpinsrd	xmm0, xmm0, edi, 3
  1591	vinserti128	ymm0, ymm0, xmm7, 1
  1592	vpsrlvd	ymm0, ymm0, ymm6
  1593	vpand	ymm0, ymm0, ymm1
  1594	vmovdqu	ymmword ptr [r15], ymm0
  1595	sub	r15, -128
  1596	add	rbx, 84
  1597	add	r8, -1
  1598	jne	.LBB0_66
  1599	jmp	.LBB0_147
  1600.LBB0_37:                               # Dispatch on ecx (presumably the bit width -- confirm at function entry): >13 -> .LBB0_43, 12 -> .LBB0_126, 13 -> loop below, else -> .LBB0_147
  1601	cmp	ecx, 13
  1602	jg	.LBB0_43
  1603# %bb.38:
  1604	cmp	ecx, 12
  1605	je	.LBB0_126
  1606# %bb.39:
  1607	cmp	ecx, 13
  1608	jne	.LBB0_147                       # neither 12 nor 13: nothing to do on this path
  1609# %bb.40:
  1610	cmp	edx, 32
  1611	jl	.LBB0_147                       # signed compare: edx < 32 skips the loop entirely
  1612# %bb.41:
  1613	mov	r8d, r14d                       # r8 = loop trip count (r14d is set outside this view); the 32-bit write zero-extends into r8
  1614	add	r15, 96                         # bias r15 so the four 32-byte stores use offsets -96..0
  1615	add	rbx, 48                         # bias rbx so the dword loads use offsets -48..0
  1616	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_90] # ymm0 = [35180077129727,35180077129727,35180077129727,35180077129727]
  1617	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_89] # ymm1 = [0,13,0,7,0,1,14,0]
  1618	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_91] # ymm2 = [8,0,2,15,0,9,0,3]
  1619	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_92] # ymm3 = [16,0,10,0,4,17,0,11]
  1620	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_93] # ymm4 = [0,5,18,0,12,0,6,19]
  1621	.p2align	4, 0x90
  1622.LBB0_42:                               # =>This Inner Loop Header: Depth=1
  1623	mov	eax, dword ptr [rbx - 40]
  1624	mov	r9d, dword ptr [rbx - 36]
  1625	shld	r9d, eax, 5                     # r9d = (r9d << 5) | (eax >> 27): combines bits from two adjacent source dwords
  1626	mov	esi, dword ptr [rbx - 48]
  1627	mov	edx, dword ptr [rbx - 44]
  1628	mov	ecx, eax
  1629	shld	ecx, edx, 12
  1630	mov	edi, edx
  1631	shld	edi, esi, 6
  1632	vmovd	xmm5, ecx                       # xmm5 collects lanes 4-7 of the first output vector
  1633	vpinsrd	xmm5, xmm5, eax, 1
  1634	vpinsrd	xmm5, xmm5, eax, 2
  1635	vpinsrd	xmm5, xmm5, r9d, 3
  1636	vmovd	xmm6, esi                       # xmm6 collects lanes 0-3 of the first output vector
  1637	vpinsrd	xmm6, xmm6, esi, 1
  1638	vpinsrd	xmm6, xmm6, edi, 2
  1639	vpinsrd	xmm6, xmm6, edx, 3
  1640	vinserti128	ymm5, ymm6, xmm5, 1     # ymm5 = xmm6 (low half) : xmm5 (high half)
  1641	vpsrlvd	ymm5, ymm5, ymm1                # per-lane logical right shift by the counts in ymm1
  1642	vpand	ymm5, ymm5, ymm0                # keep the low 13 bits of every dword lane (mask 0x1fff)
  1643	vmovdqu	ymmword ptr [r15 - 96], ymm5    # store output dwords 0-7 of this iteration
  1644	mov	r9d, dword ptr [rbx - 24]
  1645	mov	ecx, dword ptr [rbx - 28]
  1646	mov	edx, r9d
  1647	shld	edx, ecx, 10
  1648	mov	esi, dword ptr [rbx - 32]
  1649	mov	edi, ecx
  1650	shld	edi, esi, 4
  1651	mov	r10d, dword ptr [rbx - 36]
  1652	mov	eax, esi
  1653	shld	eax, r10d, 11
  1654	vmovd	xmm5, edi
  1655	vpinsrd	xmm5, xmm5, ecx, 1
  1656	vpinsrd	xmm5, xmm5, edx, 2
  1657	vpinsrd	xmm5, xmm5, r9d, 3
  1658	vmovd	xmm6, r10d
  1659	vpinsrd	xmm6, xmm6, eax, 1
  1660	vpinsrd	xmm6, xmm6, esi, 2
  1661	vpinsrd	xmm6, xmm6, esi, 3
  1662	vinserti128	ymm5, ymm6, xmm5, 1
  1663	vpsrlvd	ymm5, ymm5, ymm2
  1664	vpand	ymm5, ymm5, ymm0
  1665	vmovdqu	ymmword ptr [r15 - 64], ymm5    # store output dwords 8-15
  1666	mov	r9d, dword ptr [rbx - 12]
  1667	mov	ecx, dword ptr [rbx - 16]
  1668	mov	edx, r9d
  1669	shld	edx, ecx, 2
  1670	mov	esi, dword ptr [rbx - 24]
  1671	mov	eax, dword ptr [rbx - 20]
  1672	vmovd	xmm5, ecx                       # unshifted ecx is captured before the shld below overwrites it
  1673	vpinsrd	xmm5, xmm5, ecx, 1
  1674	shld	ecx, eax, 9
  1675	mov	edi, eax
  1676	shld	edi, esi, 3
  1677	vpinsrd	xmm5, xmm5, edx, 2
  1678	vpinsrd	xmm5, xmm5, r9d, 3
  1679	vmovd	xmm6, esi
  1680	vpinsrd	xmm6, xmm6, edi, 1
  1681	vpinsrd	xmm6, xmm6, eax, 2
  1682	vpinsrd	xmm6, xmm6, ecx, 3
  1683	vinserti128	ymm5, ymm6, xmm5, 1
  1684	vpsrlvd	ymm5, ymm5, ymm3
  1685	vpand	ymm5, ymm5, ymm0
  1686	vmovdqu	ymmword ptr [r15 - 32], ymm5    # store output dwords 16-23
  1687	mov	eax, dword ptr [rbx]
  1688	mov	ecx, dword ptr [rbx - 4]
  1689	mov	edx, eax
  1690	shld	edx, ecx, 7
  1691	mov	esi, dword ptr [rbx - 8]
  1692	vmovd	xmm5, ecx
  1693	shld	ecx, esi, 1
  1694	mov	edi, dword ptr [rbx - 12]
  1695	shrd	edi, esi, 24                    # edi = (edi >> 24) | (esi << 8)
  1696	vmovd	xmm6, edi
  1697	vpinsrd	xmm6, xmm6, esi, 1
  1698	vpinsrd	xmm6, xmm6, esi, 2
  1699	vpinsrd	xmm6, xmm6, ecx, 3
  1700	vpinsrd	xmm5, xmm5, edx, 1
  1701	vpinsrd	xmm5, xmm5, eax, 2
  1702	vpinsrd	xmm5, xmm5, eax, 3
  1703	vinserti128	ymm5, ymm6, xmm5, 1
  1704	vpsrlvd	ymm5, ymm5, ymm4
  1705	vpand	ymm5, ymm5, ymm0
  1706	vmovdqu	ymmword ptr [r15], ymm5         # store output dwords 24-31
  1707	sub	r15, -128                       # r15 += 128 (32 output dwords); -128 fits a sign-extended imm8, +128 does not
  1708	add	rbx, 52                         # advance the source by 13 dwords (52 bytes)
  1709	add	r8, -1                          # decrement the trip count; ZF drives the jne below
  1710	jne	.LBB0_42
  1711	jmp	.LBB0_147
  1712.LBB0_85:                               # Dispatch on ecx (presumably the bit width -- confirm at function entry): 28 -> .LBB0_102, 29 -> loop below, else -> .LBB0_147
  1713	cmp	ecx, 28
  1714	je	.LBB0_102
  1715# %bb.86:
  1716	cmp	ecx, 29
  1717	jne	.LBB0_147
  1718# %bb.87:
  1719	cmp	edx, 32
  1720	jl	.LBB0_147                       # signed compare: edx < 32 skips the loop entirely
  1721# %bb.88:
  1722	mov	r8d, r14d                       # r8 = loop trip count (r14d is set outside this view)
  1723	add	r15, 96                         # bias r15 so the four 32-byte stores use offsets -96..0; rbx is left unbiased here
  1724	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_12] # ymm0 = [2305843005455597567,2305843005455597567,2305843005455597567,2305843005455597567]
  1725	vmovdqa	xmm8, xmmword ptr [rip + .LCPI0_13] # xmm8 = <8,5,u,u>
  1726	vmovdqa	xmm10, xmmword ptr [rip + .LCPI0_14] # xmm10 = <24,27,u,u>
  1727	vmovdqa	ymm11, ymmword ptr [rip + .LCPI0_15] # ymm11 = [0,0,2,0,0,0,0,0]
  1728	vmovdqa	xmm12, xmmword ptr [rip + .LCPI0_16] # xmm12 = [16,13,10,7]
  1729	vmovdqa	xmm5, xmmword ptr [rip + .LCPI0_17] # xmm5 = [16,19,22,25]
  1730	vmovdqa	ymm6, ymmword ptr [rip + .LCPI0_18] # ymm6 = [0,0,0,0,0,1,0,0]
  1731	vmovdqa	xmm7, xmmword ptr [rip + .LCPI0_19] # xmm7 = [24,21,18,15]
  1732	vmovdqa	xmm1, xmmword ptr [rip + .LCPI0_20] # xmm1 = [8,11,14,17]
  1733	vmovdqa	ymm9, ymmword ptr [rip + .LCPI0_21] # ymm9 = [0,0,0,0,0,0,0,3]
  1734	.p2align	4, 0x90
  1735.LBB0_89:                               # =>This Inner Loop Header: Depth=1
  1736	mov	r11d, dword ptr [rbx + 24]
  1737	mov	r9d, dword ptr [rbx + 28]
  1738	shld	r9d, r11d, 21                   # r9d = (r9d << 21) | (r11d >> 11): combines bits from two adjacent source dwords
  1739	mov	esi, dword ptr [rbx + 20]
  1740	shld	r11d, esi, 18
  1741	mov	edi, dword ptr [rbx + 16]
  1742	shld	esi, edi, 15
  1743	mov	eax, dword ptr [rbx + 12]
  1744	shld	edi, eax, 12
  1745	mov	edx, dword ptr [rbx + 8]
  1746	shld	eax, edx, 9
  1747	mov	r10d, dword ptr [rbx]
  1748	mov	ecx, dword ptr [rbx + 4]
  1749	shld	edx, ecx, 6
  1750	shld	ecx, r10d, 3
  1751	vmovd	xmm2, r10d                      # xmm2 collects lanes 0-3, xmm3 lanes 4-7 of the first output vector
  1752	vmovd	xmm3, edi
  1753	vpinsrd	xmm2, xmm2, ecx, 1
  1754	vpinsrd	xmm3, xmm3, esi, 1
  1755	vpinsrd	xmm2, xmm2, edx, 2
  1756	vpinsrd	xmm3, xmm3, r11d, 2
  1757	vpinsrd	xmm2, xmm2, eax, 3
  1758	vpinsrd	xmm3, xmm3, r9d, 3
  1759	vinserti128	ymm2, ymm2, xmm3, 1
  1760	vpand	ymm2, ymm2, ymm0                # keep the low 29 bits of every dword lane (mask 0x1fffffff); no vector shift is applied to this group
  1761	vmovdqu	ymmword ptr [r15 - 96], ymm2    # store output dwords 0-7 of this iteration
  1762	mov	eax, dword ptr [rbx + 52]
  1763	mov	r9d, dword ptr [rbx + 56]
  1764	shld	r9d, eax, 13
  1765	mov	edx, dword ptr [rbx + 48]
  1766	shld	eax, edx, 10
  1767	mov	esi, dword ptr [rbx + 44]
  1768	shld	edx, esi, 7
  1769	mov	edi, dword ptr [rbx + 36]
  1770	mov	ecx, dword ptr [rbx + 40]
  1771	shld	esi, ecx, 4
  1772	shld	ecx, edi, 1
  1773	vmovq	xmm2, qword ptr [rbx + 28]      # xmm2 = mem[0],zero
  1774	vpsrlvd	xmm3, xmm2, xmm8                # low parts: source dwords shifted right per lane
  1775	vpshufd	xmm2, xmm2, 229                 # xmm2 = xmm2[1,1,2,3]
  1776	vpinsrd	xmm2, xmm2, edi, 1
  1777	vpsllvd	xmm2, xmm2, xmm10               # high parts: the following source dwords shifted left per lane
  1778	vpor	xmm2, xmm3, xmm2                # merge low and high parts (only lanes 0-1 are kept; lanes 2-3 are overwritten below)
  1779	vmovd	xmm3, esi
  1780	vpinsrd	xmm3, xmm3, edx, 1
  1781	vpinsrd	xmm3, xmm3, eax, 2
  1782	vpinsrd	xmm3, xmm3, r9d, 3
  1783	vpinsrd	xmm2, xmm2, edi, 2
  1784	vpinsrd	xmm2, xmm2, ecx, 3
  1785	vinserti128	ymm2, ymm2, xmm3, 1
  1786	vpsrlvd	ymm2, ymm2, ymm11               # per-lane logical right shift by the counts in ymm11
  1787	vpand	ymm2, ymm2, ymm0
  1788	vmovdqu	ymmword ptr [r15 - 64], ymm2    # store output dwords 8-15
  1789	mov	eax, dword ptr [rbx + 80]
  1790	mov	ecx, dword ptr [rbx + 84]
  1791	shld	ecx, eax, 5
  1792	mov	edx, dword ptr [rbx + 76]
  1793	mov	esi, dword ptr [rbx + 72]
  1794	shld	eax, edx, 2
  1795	mov	edi, edx
  1796	shld	edi, esi, 28
  1797	vmovdqu	xmm2, xmmword ptr [rbx + 56]    # four consecutive source dwords handled with vector shifts instead of shld
  1798	vpsrlvd	xmm3, xmm2, xmm12
  1799	vpshufd	xmm2, xmm2, 249                 # xmm2 = xmm2[1,2,3,3]
  1800	vpinsrd	xmm2, xmm2, esi, 3
  1801	vmovd	xmm4, edi
  1802	vpinsrd	xmm4, xmm4, edx, 1
  1803	vpinsrd	xmm4, xmm4, eax, 2
  1804	vpsllvd	xmm2, xmm2, xmm5
  1805	vpinsrd	xmm4, xmm4, ecx, 3
  1806	vpor	xmm2, xmm3, xmm2
  1807	vinserti128	ymm2, ymm2, xmm4, 1
  1808	vpsrlvd	ymm2, ymm2, ymm6
  1809	vpand	ymm2, ymm2, ymm0
  1810	vmovdqu	ymmword ptr [r15 - 32], ymm2    # store output dwords 16-23
  1811	mov	eax, dword ptr [rbx + 112]
  1812	mov	ecx, dword ptr [rbx + 108]
  1813	mov	edx, eax
  1814	shld	edx, ecx, 26
  1815	mov	esi, dword ptr [rbx + 104]
  1816	shld	ecx, esi, 23
  1817	mov	edi, dword ptr [rbx + 100]
  1818	vmovdqu	xmm2, xmmword ptr [rbx + 84]
  1819	shld	esi, edi, 20
  1820	vpsrlvd	xmm3, xmm2, xmm7
  1821	vpshufd	xmm2, xmm2, 249                 # xmm2 = xmm2[1,2,3,3]
  1822	vpinsrd	xmm2, xmm2, edi, 3
  1823	vmovd	xmm4, esi
  1824	vpinsrd	xmm4, xmm4, ecx, 1
  1825	vpsllvd	xmm2, xmm2, xmm1
  1826	vpinsrd	xmm4, xmm4, edx, 2
  1827	vpinsrd	xmm4, xmm4, eax, 3
  1828	vpor	xmm2, xmm3, xmm2
  1829	vinserti128	ymm2, ymm2, xmm4, 1
  1830	vpsrlvd	ymm2, ymm2, ymm9
  1831	vpand	ymm2, ymm2, ymm0
  1832	vmovdqu	ymmword ptr [r15], ymm2         # store output dwords 24-31
  1833	add	rbx, 116                        # advance the source by 29 dwords (116 bytes)
  1834	sub	r15, -128                       # r15 += 128 (32 output dwords); -128 fits a sign-extended imm8, +128 does not
  1835	add	r8, -1                          # decrement the trip count; ZF drives the jne below
  1836	jne	.LBB0_89
  1837	jmp	.LBB0_147
  1838.LBB0_9:                                # Dispatch on ecx (presumably the bit width -- confirm at function entry): 2 -> .LBB0_141, 3 -> loop below, else -> .LBB0_147
  1839	cmp	ecx, 2
  1840	je	.LBB0_141
  1841# %bb.10:
  1842	cmp	ecx, 3
  1843	jne	.LBB0_147
  1844# %bb.11:
  1845	cmp	edx, 32
  1846	jl	.LBB0_147                       # signed compare: edx < 32 skips the loop entirely
  1847# %bb.12:
  1848	mov	eax, r14d                       # rax = loop trip count (r14d is set outside this view); this block counts in rax, not r8
  1849	add	r15, 96                         # bias r15 so the four 32-byte stores use offsets -96..0
  1850	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_127] # ymm0 = [30064771079,30064771079,30064771079,30064771079]
  1851	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_126] # ymm1 = [0,3,6,9,12,15,18,21]
  1852	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_128] # ymm2 = [24,27,0,1,4,7,10,13]
  1853	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_129] # ymm3 = [16,19,22,25,28,0,2,5]
  1854	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_130] # ymm4 = [8,11,14,17,20,23,26,29]
  1855	.p2align	4, 0x90
  1856.LBB0_13:                               # =>This Inner Loop Header: Depth=1
  1857	vpbroadcastd	ymm5, dword ptr [rbx]   # all 8 lanes = source dword 0
  1858	vpsrlvd	ymm5, ymm5, ymm1                # per-lane logical right shift by 0,3,...,21
  1859	vpand	ymm5, ymm5, ymm0                # keep the low 3 bits of every dword lane (30064771079 = 0x7 in each 32-bit half)
  1860	vmovdqu	ymmword ptr [r15 - 96], ymm5    # store output dwords 0-7 of this iteration
  1861	mov	ecx, dword ptr [rbx]
  1862	mov	edx, dword ptr [rbx + 4]
  1863	mov	esi, edx
  1864	shld	esi, ecx, 2                     # esi = (dword1 << 2) | (dword0 >> 30): combines bits from source dwords 0 and 1
  1865	vmovd	xmm5, ecx
  1866	vpinsrd	xmm5, xmm5, ecx, 1
  1867	vpinsrd	xmm5, xmm5, esi, 2
  1868	vpinsrd	xmm5, xmm5, edx, 3
  1869	vmovd	xmm6, edx
  1870	vpbroadcastd	xmm6, xmm6              # lanes 4-7 all come from source dword 1
  1871	vinserti128	ymm5, ymm5, xmm6, 1
  1872	vpsrlvd	ymm5, ymm5, ymm2
  1873	vpand	ymm5, ymm5, ymm0
  1874	vmovdqu	ymmword ptr [r15 - 64], ymm5    # store output dwords 8-15
  1875	mov	ecx, dword ptr [rbx + 4]
  1876	mov	edx, dword ptr [rbx + 8]
  1877	mov	esi, edx
  1878	shld	esi, ecx, 1                     # esi = (dword2 << 1) | (dword1 >> 31): combines bits from source dwords 1 and 2
  1879	vmovd	xmm5, ecx
  1880	vpbroadcastd	xmm6, xmm5              # lanes 0-3 all come from source dword 1
  1881	vpinsrd	xmm5, xmm5, esi, 1
  1882	vpinsrd	xmm5, xmm5, edx, 2
  1883	vpinsrd	xmm5, xmm5, edx, 3
  1884	vinserti128	ymm5, ymm6, xmm5, 1
  1885	vpsrlvd	ymm5, ymm5, ymm3
  1886	vpand	ymm5, ymm5, ymm0
  1887	vmovdqu	ymmword ptr [r15 - 32], ymm5    # store output dwords 16-23
  1888	vpbroadcastd	ymm5, dword ptr [rbx + 8] # all 8 lanes = source dword 2
  1889	vpsrlvd	ymm5, ymm5, ymm4
  1890	vpand	ymm5, ymm5, ymm0
  1891	vmovdqu	ymmword ptr [r15], ymm5         # store output dwords 24-31
  1892	sub	r15, -128                       # r15 += 128 (32 output dwords); -128 fits a sign-extended imm8, +128 does not
  1893	add	rbx, 12                         # advance the source by 3 dwords (12 bytes)
  1894	add	rax, -1                         # decrement the trip count; ZF drives the jne below
  1895	jne	.LBB0_13
  1896	jmp	.LBB0_147
  1897.LBB0_56:                               # Dispatch on ecx (presumably the bit width -- confirm at function entry): 18 -> .LBB0_117, 19 -> loop below, else -> .LBB0_147
  1898	cmp	ecx, 18
  1899	je	.LBB0_117
  1900# %bb.57:
  1901	cmp	ecx, 19
  1902	jne	.LBB0_147
  1903# %bb.58:
  1904	cmp	edx, 32
  1905	jl	.LBB0_147                       # signed compare: edx < 32 skips the loop entirely
  1906# %bb.59:
  1907	mov	r8d, r14d                       # r8 = loop trip count (r14d is set outside this view)
  1908	add	r15, 96                         # bias r15 so the four 32-byte stores use offsets -96..0
  1909	add	rbx, 72                         # bias rbx so the dword loads use offsets -72..0
  1910	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_68] # ymm0 = [2251795519242239,2251795519242239,2251795519242239,2251795519242239]
  1911	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_67] # ymm1 = [0,0,6,0,12,0,0,5]
  1912	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_69] # ymm2 = [0,11,0,0,4,0,10,0]
  1913	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_70] # ymm3 = [0,3,0,9,0,0,2,0]
  1914	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_71] # ymm4 = [8,0,0,1,0,7,0,13]
  1915	.p2align	4, 0x90
  1916.LBB0_60:                               # =>This Inner Loop Header: Depth=1
  1917	mov	r9d, dword ptr [rbx - 56]
  1918	mov	edx, dword ptr [rbx - 60]
  1919	mov	esi, r9d
  1920	shld	esi, edx, 14                    # esi = (esi << 14) | (edx >> 18): combines bits from two adjacent source dwords
  1921	mov	edi, dword ptr [rbx - 64]
  1922	mov	r10d, dword ptr [rbx - 72]
  1923	shld	edx, edi, 1
  1924	mov	eax, dword ptr [rbx - 68]
  1925	mov	ecx, eax
  1926	shld	ecx, r10d, 13
  1927	vmovd	xmm5, edi                       # unshifted edi is captured before the shld below overwrites it
  1928	shld	edi, eax, 7
  1929	vpinsrd	xmm5, xmm5, edx, 1
  1930	vmovd	xmm6, r10d
  1931	vpinsrd	xmm5, xmm5, esi, 2
  1932	vpinsrd	xmm6, xmm6, ecx, 1
  1933	vpinsrd	xmm5, xmm5, r9d, 3
  1934	vpinsrd	xmm6, xmm6, eax, 2
  1935	vpinsrd	xmm6, xmm6, edi, 3
  1936	vinserti128	ymm5, ymm6, xmm5, 1     # ymm5 = xmm6 (low half) : xmm5 (high half)
  1937	vpsrlvd	ymm5, ymm5, ymm1                # per-lane logical right shift by the counts in ymm1
  1938	vpand	ymm5, ymm5, ymm0                # keep the low 19 bits of every dword lane (mask 0x7ffff)
  1939	vmovdqu	ymmword ptr [r15 - 96], ymm5    # store output dwords 0-7 of this iteration
  1940	mov	r10d, dword ptr [rbx - 40]
  1941	mov	r9d, dword ptr [rbx - 36]
  1942	shld	r9d, r10d, 3
  1943	mov	edx, dword ptr [rbx - 44]
  1944	mov	esi, r10d
  1945	shld	esi, edx, 9
  1946	mov	edi, dword ptr [rbx - 48]
  1947	vmovd	xmm5, edx
  1948	shld	edx, edi, 15
  1949	mov	ecx, dword ptr [rbx - 56]
  1950	mov	eax, dword ptr [rbx - 52]
  1951	shld	edi, eax, 2
  1952	shrd	ecx, eax, 24                    # ecx = (ecx >> 24) | (eax << 8)
  1953	vpinsrd	xmm5, xmm5, esi, 1
  1954	vmovd	xmm6, ecx
  1955	vpinsrd	xmm5, xmm5, r10d, 2
  1956	vpinsrd	xmm6, xmm6, eax, 1
  1957	vpinsrd	xmm5, xmm5, r9d, 3
  1958	vpinsrd	xmm6, xmm6, edi, 2
  1959	vpinsrd	xmm6, xmm6, edx, 3
  1960	vinserti128	ymm5, ymm6, xmm5, 1
  1961	vpsrlvd	ymm5, ymm5, ymm2
  1962	vpand	ymm5, ymm5, ymm0
  1963	vmovdqu	ymmword ptr [r15 - 64], ymm5    # store output dwords 8-15
  1964	mov	r10d, dword ptr [rbx - 20]
  1965	mov	r9d, dword ptr [rbx - 16]
  1966	shld	r9d, r10d, 11
  1967	mov	edx, dword ptr [rbx - 24]
  1968	mov	esi, r10d
  1969	mov	r11d, dword ptr [rbx - 28]
  1970	shld	esi, edx, 17
  1971	mov	ecx, dword ptr [rbx - 36]
  1972	mov	eax, dword ptr [rbx - 32]
  1973	shld	edx, r11d, 4
  1974	mov	edi, r11d
  1975	shld	edi, eax, 10
  1976	shrd	ecx, eax, 16                    # ecx = (ecx >> 16) | (eax << 16)
  1977	vmovd	xmm5, edx
  1978	vpinsrd	xmm5, xmm5, esi, 1
  1979	vmovd	xmm6, ecx
  1980	vpinsrd	xmm5, xmm5, r10d, 2
  1981	vpinsrd	xmm6, xmm6, eax, 1
  1982	vpinsrd	xmm5, xmm5, r9d, 3
  1983	vpinsrd	xmm6, xmm6, edi, 2
  1984	vpinsrd	xmm6, xmm6, r11d, 3
  1985	vinserti128	ymm5, ymm6, xmm5, 1
  1986	vpsrlvd	ymm5, ymm5, ymm3
  1987	vpand	ymm5, ymm5, ymm0
  1988	vmovdqu	ymmword ptr [r15 - 32], ymm5    # store output dwords 16-23
  1989	mov	r9d, dword ptr [rbx]
  1990	mov	r11d, dword ptr [rbx - 4]
  1991	mov	edx, r9d
  1992	shld	edx, r11d, 6
  1993	mov	ecx, dword ptr [rbx - 8]
  1994	mov	edi, r11d
  1995	shld	edi, ecx, 12
  1996	mov	r10d, dword ptr [rbx - 16]
  1997	mov	eax, dword ptr [rbx - 12]
  1998	mov	esi, ecx
  1999	shld	esi, eax, 18
  2000	shld	eax, r10d, 5
  2001	vmovd	xmm5, r10d
  2002	vmovd	xmm6, edi
  2003	vpinsrd	xmm5, xmm5, eax, 1
  2004	vpinsrd	xmm6, xmm6, r11d, 1
  2005	vpinsrd	xmm5, xmm5, esi, 2
  2006	vpinsrd	xmm6, xmm6, edx, 2
  2007	vpinsrd	xmm5, xmm5, ecx, 3
  2008	vpinsrd	xmm6, xmm6, r9d, 3
  2009	vinserti128	ymm5, ymm5, xmm6, 1     # here xmm5 is the low half and xmm6 the high half (reverse of the groups above)
  2010	vpsrlvd	ymm5, ymm5, ymm4
  2011	vpand	ymm5, ymm5, ymm0
  2012	vmovdqu	ymmword ptr [r15], ymm5         # store output dwords 24-31
  2013	sub	r15, -128                       # r15 += 128 (32 output dwords); -128 fits a sign-extended imm8, +128 does not
  2014	add	rbx, 76                         # advance the source by 19 dwords (76 bytes)
  2015	add	r8, -1                          # decrement the trip count; ZF drives the jne below
  2016	jne	.LBB0_60
  2017	jmp	.LBB0_147
  2018.LBB0_32:                               # Dispatch on ecx (presumably the bit width -- confirm at function entry): 10 -> .LBB0_129, 11 -> loop below, else -> .LBB0_147
  2019	cmp	ecx, 10
  2020	je	.LBB0_129
  2021# %bb.33:
  2022	cmp	ecx, 11
  2023	jne	.LBB0_147
  2024# %bb.34:
  2025	cmp	edx, 32
  2026	jl	.LBB0_147                       # signed compare: edx < 32 skips the loop entirely
  2027# %bb.35:
  2028	mov	r8d, r14d                       # r8 = loop trip count (r14d is set outside this view)
  2029	add	r15, 96                         # bias r15 so the four 32-byte stores use offsets -96..0
  2030	add	rbx, 40                         # bias rbx so the dword loads use offsets -40..0
  2031	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_97] # ymm0 = [8791798056959,8791798056959,8791798056959,8791798056959]
  2032	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_96] # ymm1 = [0,11,0,1,12,0,2,13]
  2033	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_98] # ymm2 = [0,3,14,0,4,15,0,5]
  2034	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_99] # ymm3 = [16,0,6,17,0,7,18,0]
  2035	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_100] # ymm4 = [8,19,0,9,20,0,10,21]
  2036	.p2align	4, 0x90
  2037.LBB0_36:                               # =>This Inner Loop Header: Depth=1
  2038	mov	ecx, dword ptr [rbx - 32]
  2039	mov	edx, dword ptr [rbx - 40]
  2040	mov	esi, dword ptr [rbx - 36]
  2041	mov	edi, ecx
  2042	shld	edi, esi, 9                     # edi = (edi << 9) | (esi >> 23): combines bits from two adjacent source dwords
  2043	mov	eax, esi
  2044	shld	eax, edx, 10
  2045	vmovd	xmm5, esi                       # xmm5 collects lanes 4-7 of the first output vector
  2046	vpinsrd	xmm5, xmm5, edi, 1
  2047	vpinsrd	xmm5, xmm5, ecx, 2
  2048	vpinsrd	xmm5, xmm5, ecx, 3
  2049	vmovd	xmm6, edx                       # xmm6 collects lanes 0-3 of the first output vector
  2050	vpinsrd	xmm6, xmm6, edx, 1
  2051	vpinsrd	xmm6, xmm6, eax, 2
  2052	vpinsrd	xmm6, xmm6, esi, 3
  2053	vinserti128	ymm5, ymm6, xmm5, 1     # ymm5 = xmm6 (low half) : xmm5 (high half)
  2054	vpsrlvd	ymm5, ymm5, ymm1                # per-lane logical right shift by the counts in ymm1
  2055	vpand	ymm5, ymm5, ymm0                # keep the low 11 bits of every dword lane (8791798056959 = 0x7ff in each 32-bit half)
  2056	vmovdqu	ymmword ptr [r15 - 96], ymm5    # store output dwords 0-7 of this iteration
  2057	mov	eax, dword ptr [rbx - 20]
  2058	mov	ecx, dword ptr [rbx - 24]
  2059	mov	edx, eax
  2060	shld	edx, ecx, 6
  2061	mov	esi, dword ptr [rbx - 32]
  2062	mov	edi, dword ptr [rbx - 28]
  2063	vmovd	xmm5, ecx                       # unshifted ecx is captured before the shld below overwrites it
  2064	vpinsrd	xmm5, xmm5, ecx, 1
  2065	shld	ecx, edi, 7
  2066	shrd	esi, edi, 24                    # esi = (esi >> 24) | (edi << 8)
  2067	vpinsrd	xmm5, xmm5, edx, 2
  2068	vpinsrd	xmm5, xmm5, eax, 3
  2069	vmovd	xmm6, esi
  2070	vpinsrd	xmm6, xmm6, edi, 1
  2071	vpinsrd	xmm6, xmm6, edi, 2
  2072	vpinsrd	xmm6, xmm6, ecx, 3
  2073	vinserti128	ymm5, ymm6, xmm5, 1
  2074	vpsrlvd	ymm5, ymm5, ymm2
  2075	vpand	ymm5, ymm5, ymm0
  2076	vmovdqu	ymmword ptr [r15 - 64], ymm5    # store output dwords 8-15
  2077	mov	eax, dword ptr [rbx - 12]
  2078	mov	ecx, dword ptr [rbx - 8]
  2079	shld	ecx, eax, 3
  2080	mov	r9d, dword ptr [rbx - 20]
  2081	mov	esi, dword ptr [rbx - 16]
  2082	mov	edi, eax
  2083	shld	edi, esi, 4
  2084	mov	edx, esi
  2085	shld	edx, r9d, 5
  2086	vmovd	xmm5, edi
  2087	vpinsrd	xmm5, xmm5, eax, 1
  2088	vpinsrd	xmm5, xmm5, eax, 2
  2089	vpinsrd	xmm5, xmm5, ecx, 3
  2090	vmovd	xmm6, r9d
  2091	vpinsrd	xmm6, xmm6, edx, 1
  2092	vpinsrd	xmm6, xmm6, esi, 2
  2093	vpinsrd	xmm6, xmm6, esi, 3
  2094	vinserti128	ymm5, ymm6, xmm5, 1
  2095	vpsrlvd	ymm5, ymm5, ymm3
  2096	vpand	ymm5, ymm5, ymm0
  2097	vmovdqu	ymmword ptr [r15 - 32], ymm5    # store output dwords 16-23
  2098	mov	eax, dword ptr [rbx]
  2099	mov	ecx, dword ptr [rbx - 8]
  2100	mov	edx, dword ptr [rbx - 4]
  2101	mov	esi, eax
  2102	shld	esi, edx, 1
  2103	mov	edi, edx
  2104	shld	edi, ecx, 2
  2105	vmovd	xmm5, edx
  2106	vpinsrd	xmm5, xmm5, esi, 1
  2107	vpinsrd	xmm5, xmm5, eax, 2
  2108	vpinsrd	xmm5, xmm5, eax, 3
  2109	vmovd	xmm6, ecx
  2110	vpinsrd	xmm6, xmm6, ecx, 1
  2111	vpinsrd	xmm6, xmm6, edi, 2
  2112	vpinsrd	xmm6, xmm6, edx, 3
  2113	vinserti128	ymm5, ymm6, xmm5, 1
  2114	vpsrlvd	ymm5, ymm5, ymm4
  2115	vpand	ymm5, ymm5, ymm0
  2116	vmovdqu	ymmword ptr [r15], ymm5         # store output dwords 24-31
  2117	sub	r15, -128                       # r15 += 128 (32 output dwords); -128 fits a sign-extended imm8, +128 does not
  2118	add	rbx, 44                         # advance the source by 11 dwords (44 bytes)
  2119	add	r8, -1                          # decrement the trip count; ZF drives the jne below
  2120	jne	.LBB0_36
  2121	jmp	.LBB0_147
  2122.LBB0_79:                               # Dispatch on ecx (presumably the bit width -- confirm at function entry): 26 -> .LBB0_105, 27 -> loop below, else -> .LBB0_147
  2123	cmp	ecx, 26
  2124	je	.LBB0_105
  2125# %bb.80:
  2126	cmp	ecx, 27
  2127	jne	.LBB0_147
  2128# %bb.81:
  2129	cmp	edx, 32
  2130	jl	.LBB0_147                       # signed compare: edx < 32 skips the loop entirely
  2131# %bb.82:
  2132	mov	r8d, r14d                       # r8 = loop trip count (r14d is set outside this view)
  2133	add	r15, 96                         # bias r15 so the four 32-byte stores use offsets -96..0
  2134	add	rbx, 104                        # bias rbx so the loads use offsets -104..0
  2135	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_25] # ymm0 = [576460748142673919,576460748142673919,576460748142673919,576460748142673919]
  2136	vmovdqa	ymm9, ymmword ptr [rip + .LCPI0_24] # ymm9 = [0,0,0,0,0,0,2,0]
  2137	vmovdqa	xmm10, xmmword ptr [rip + .LCPI0_26] # xmm10 = [24,19,14,9]
  2138	vmovdqa	xmm11, xmmword ptr [rip + .LCPI0_27] # xmm11 = [8,13,18,23]
  2139	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_28] # ymm4 = [0,0,0,0,4,0,0,0]
  2140	vmovdqa	xmm5, xmmword ptr [rip + .LCPI0_29] # xmm5 = <16,11,u,u>
  2141	vmovdqa	xmm6, xmmword ptr [rip + .LCPI0_30] # xmm6 = <16,21,u,u>
  2142	vmovdqa	ymm7, ymmword ptr [rip + .LCPI0_31] # ymm7 = [0,0,0,1,0,0,0,0]
  2143	vmovdqa	ymm8, ymmword ptr [rip + .LCPI0_32] # ymm8 = [0,3,0,0,0,0,0,5]
  2144	.p2align	4, 0x90
  2145.LBB0_83:                               # =>This Inner Loop Header: Depth=1
  2146	mov	r10d, dword ptr [rbx - 84]
  2147	mov	r9d, dword ptr [rbx - 80]
  2148	shld	r9d, r10d, 3                    # r9d = (r9d << 3) | (r10d >> 29): combines bits from two adjacent source dwords
  2149	mov	esi, dword ptr [rbx - 88]
  2150	mov	edi, r10d
  2151	shld	edi, esi, 25
  2152	mov	eax, dword ptr [rbx - 92]
  2153	shld	esi, eax, 20
  2154	mov	edx, dword ptr [rbx - 96]
  2155	shld	eax, edx, 15
  2156	mov	r11d, dword ptr [rbx - 104]
  2157	mov	ecx, dword ptr [rbx - 100]
  2158	shld	edx, ecx, 10
  2159	shld	ecx, r11d, 5
  2160	vmovd	xmm1, r11d                      # xmm1 collects lanes 0-3, xmm2 lanes 4-7 of the first output vector
  2161	vmovd	xmm2, esi
  2162	vpinsrd	xmm1, xmm1, ecx, 1
  2163	vpinsrd	xmm2, xmm2, edi, 1
  2164	vpinsrd	xmm1, xmm1, edx, 2
  2165	vpinsrd	xmm2, xmm2, r10d, 2
  2166	vpinsrd	xmm1, xmm1, eax, 3
  2167	vpinsrd	xmm2, xmm2, r9d, 3
  2168	vinserti128	ymm1, ymm1, xmm2, 1
  2169	vpsrlvd	ymm1, ymm1, ymm9                # per-lane logical right shift by the counts in ymm9
  2170	vpand	ymm1, ymm1, ymm0                # keep the low 27 bits of every dword lane (mask 0x7ffffff)
  2171	vmovdqu	ymmword ptr [r15 - 96], ymm1    # store output dwords 0-7 of this iteration
  2172	mov	eax, dword ptr [rbx - 56]
  2173	mov	ecx, dword ptr [rbx - 52]
  2174	shld	ecx, eax, 11
  2175	mov	edx, dword ptr [rbx - 60]
  2176	mov	esi, dword ptr [rbx - 64]
  2177	shld	eax, edx, 6
  2178	shld	edx, esi, 1
  2179	vmovdqu	xmm1, xmmword ptr [rbx - 80]    # four consecutive source dwords handled with vector shifts instead of shld
  2180	vpsrlvd	xmm2, xmm1, xmm10               # low parts: source dwords shifted right per lane
  2181	vpshufd	xmm1, xmm1, 249                 # xmm1 = xmm1[1,2,3,3]
  2182	vmovd	xmm3, esi
  2183	vpinsrd	xmm1, xmm1, esi, 3
  2184	vpinsrd	xmm3, xmm3, edx, 1
  2185	vpinsrd	xmm3, xmm3, eax, 2
  2186	vpsllvd	xmm1, xmm1, xmm11               # high parts: the following source dwords shifted left per lane
  2187	vpinsrd	xmm3, xmm3, ecx, 3
  2188	vpor	xmm1, xmm2, xmm1                # merge low and high parts
  2189	vinserti128	ymm1, ymm1, xmm3, 1
  2190	vpsrlvd	ymm1, ymm1, ymm4
  2191	vpand	ymm1, ymm1, ymm0
  2192	vmovdqu	ymmword ptr [r15 - 64], ymm1    # store output dwords 8-15
  2193	mov	eax, dword ptr [rbx - 28]
  2194	mov	r9d, dword ptr [rbx - 24]
  2195	shld	r9d, eax, 19
  2196	mov	edx, dword ptr [rbx - 32]
  2197	shld	eax, edx, 14
  2198	mov	esi, dword ptr [rbx - 36]
  2199	shld	edx, esi, 9
  2200	mov	r10d, dword ptr [rbx - 44]
  2201	mov	edi, dword ptr [rbx - 40]
  2202	shld	esi, edi, 4
  2203	mov	ecx, edi
  2204	shld	ecx, r10d, 26
  2205	vmovq	xmm1, qword ptr [rbx - 52]      # xmm1 = mem[0],zero
  2206	vpsrlvd	xmm2, xmm1, xmm5
  2207	vpshufd	xmm1, xmm1, 229                 # xmm1 = xmm1[1,1,2,3]
  2208	vpinsrd	xmm1, xmm1, r10d, 1
  2209	vpsllvd	xmm1, xmm1, xmm6
  2210	vmovd	xmm3, esi
  2211	vpinsrd	xmm3, xmm3, edx, 1
  2212	vpor	xmm1, xmm2, xmm1                # only lanes 0-1 are kept; lanes 2-3 are overwritten below
  2213	vpinsrd	xmm2, xmm3, eax, 2
  2214	vpinsrd	xmm2, xmm2, r9d, 3
  2215	vpinsrd	xmm1, xmm1, ecx, 2
  2216	vpinsrd	xmm1, xmm1, edi, 3
  2217	vinserti128	ymm1, ymm1, xmm2, 1
  2218	vpsrlvd	ymm1, ymm1, ymm7
  2219	vpand	ymm1, ymm1, ymm0
  2220	vmovdqu	ymmword ptr [r15 - 32], ymm1    # store output dwords 16-23
  2221	mov	r9d, dword ptr [rbx]
  2222	mov	r11d, dword ptr [rbx - 4]
  2223	mov	r10d, r9d
  2224	shld	r10d, r11d, 22
  2225	mov	esi, dword ptr [rbx - 8]
  2226	shld	r11d, esi, 17
  2227	mov	edi, dword ptr [rbx - 12]
  2228	mov	eax, dword ptr [rbx - 16]
  2229	shld	esi, edi, 12
  2230	mov	edx, dword ptr [rbx - 24]
  2231	mov	ecx, dword ptr [rbx - 20]
  2232	shld	edi, eax, 7
  2233	shrd	edx, ecx, 8                     # edx = (edx >> 8) | (ecx << 24)
  2234	shld	eax, ecx, 2
  2235	vmovd	xmm1, esi
  2236	vpinsrd	xmm1, xmm1, r11d, 1
  2237	vmovd	xmm2, edx
  2238	vpinsrd	xmm1, xmm1, r10d, 2
  2239	vpinsrd	xmm2, xmm2, ecx, 1
  2240	vpinsrd	xmm1, xmm1, r9d, 3
  2241	vpinsrd	xmm2, xmm2, eax, 2
  2242	vpinsrd	xmm2, xmm2, edi, 3
  2243	vinserti128	ymm1, ymm2, xmm1, 1     # here xmm2 is the low half and xmm1 the high half
  2244	vpsrlvd	ymm1, ymm1, ymm8
  2245	vpand	ymm1, ymm1, ymm0
  2246	vmovdqu	ymmword ptr [r15], ymm1         # store output dwords 24-31
  2247	sub	r15, -128                       # r15 += 128 (32 output dwords); -128 fits a sign-extended imm8, +128 does not
  2248	add	rbx, 108                        # advance the source by 27 dwords (108 bytes)
  2249	add	r8, -1                          # decrement the trip count; ZF drives the jne below
  2250	jne	.LBB0_83
  2251	jmp	.LBB0_147
  2252.LBB0_20:                               # Dispatch on ecx (presumably the bit width -- confirm at function entry): 6 -> .LBB0_135, 7 -> loop below, else -> .LBB0_147
  2253	cmp	ecx, 6
  2254	je	.LBB0_135
  2255# %bb.21:
  2256	cmp	ecx, 7
  2257	jne	.LBB0_147
  2258# %bb.22:
  2259	cmp	edx, 32
  2260	jl	.LBB0_147                       # signed compare: edx < 32 skips the loop entirely
  2261# %bb.23:
  2262	mov	r8d, r14d                       # r8 = loop trip count (r14d is set outside this view)
  2263	add	r15, 96                         # bias r15 so the four 32-byte stores use offsets -96..0
  2264	add	rbx, 24                         # bias rbx so the dword loads use offsets -24..0
  2265	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_112] # ymm0 = [545460846719,545460846719,545460846719,545460846719]
  2266	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_111] # ymm1 = [0,7,14,21,0,3,10,17]
  2267	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_113] # ymm2 = [24,0,6,13,20,0,2,9]
  2268	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_114] # ymm3 = [16,23,0,5,12,19,0,1]
  2269	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_115] # ymm4 = [8,15,22,0,4,11,18,25]
  2270	.p2align	4, 0x90
  2271.LBB0_24:                               # =>This Inner Loop Header: Depth=1
  2272	mov	ecx, dword ptr [rbx - 24]
  2273	mov	edx, dword ptr [rbx - 20]
  2274	mov	esi, edx
  2275	shld	esi, ecx, 4                     # esi = (esi << 4) | (ecx >> 28): combines bits from source dwords 0 and 1
  2276	vmovd	xmm5, ecx
  2277	vmovd	xmm6, esi
  2278	vpinsrd	xmm6, xmm6, edx, 1
  2279	vpinsrd	xmm6, xmm6, edx, 2
  2280	vpinsrd	xmm6, xmm6, edx, 3
  2281	vpbroadcastd	xmm5, xmm5              # lanes 0-3 all come from source dword 0
  2282	vinserti128	ymm5, ymm5, xmm6, 1
  2283	vpsrlvd	ymm5, ymm5, ymm1                # per-lane logical right shift by the counts in ymm1
  2284	vpand	ymm5, ymm5, ymm0                # keep the low 7 bits of every dword lane (545460846719 = 0x7f in each 32-bit half)
  2285	vmovdqu	ymmword ptr [r15 - 96], ymm5    # store output dwords 0-7 of this iteration
  2286	mov	ecx, dword ptr [rbx - 12]
  2287	mov	edx, dword ptr [rbx - 20]
  2288	mov	esi, dword ptr [rbx - 16]
  2289	mov	edi, ecx
  2290	shld	edi, esi, 5
  2291	mov	eax, esi
  2292	shld	eax, edx, 1
  2293	vmovd	xmm5, esi
  2294	vpinsrd	xmm5, xmm5, edi, 1
  2295	vpinsrd	xmm5, xmm5, ecx, 2
  2296	vpinsrd	xmm5, xmm5, ecx, 3
  2297	vmovd	xmm6, edx
  2298	vpinsrd	xmm6, xmm6, eax, 1
  2299	vpinsrd	xmm6, xmm6, esi, 2
  2300	vpinsrd	xmm6, xmm6, esi, 3
  2301	vinserti128	ymm5, ymm6, xmm5, 1     # ymm5 = xmm6 (low half) : xmm5 (high half)
  2302	vpsrlvd	ymm5, ymm5, ymm2
  2303	vpand	ymm5, ymm5, ymm0
  2304	vmovdqu	ymmword ptr [r15 - 64], ymm5    # store output dwords 8-15
  2305	mov	eax, dword ptr [rbx - 4]
  2306	mov	ecx, dword ptr [rbx - 12]
  2307	mov	edx, dword ptr [rbx - 8]
  2308	mov	esi, eax
  2309	shld	esi, edx, 6
  2310	mov	edi, edx
  2311	shld	edi, ecx, 2
  2312	vmovd	xmm5, edx
  2313	vpinsrd	xmm5, xmm5, edx, 1
  2314	vpinsrd	xmm5, xmm5, esi, 2
  2315	vpinsrd	xmm5, xmm5, eax, 3
  2316	vmovd	xmm6, ecx
  2317	vpinsrd	xmm6, xmm6, ecx, 1
  2318	vpinsrd	xmm6, xmm6, edi, 2
  2319	vpinsrd	xmm6, xmm6, edx, 3
  2320	vinserti128	ymm5, ymm6, xmm5, 1
  2321	vpsrlvd	ymm5, ymm5, ymm3
  2322	vpand	ymm5, ymm5, ymm0
  2323	vmovdqu	ymmword ptr [r15 - 32], ymm5    # store output dwords 16-23
  2324	mov	eax, dword ptr [rbx - 4]
  2325	mov	ecx, dword ptr [rbx]
  2326	mov	edx, ecx
  2327	shld	edx, eax, 3
  2328	vmovd	xmm5, ecx
  2329	vmovd	xmm6, eax
  2330	vpinsrd	xmm6, xmm6, eax, 1
  2331	vpinsrd	xmm6, xmm6, eax, 2
  2332	vpinsrd	xmm6, xmm6, edx, 3
  2333	vpbroadcastd	xmm5, xmm5              # lanes 4-7 all come from the last source dword
  2334	vinserti128	ymm5, ymm6, xmm5, 1
  2335	vpsrlvd	ymm5, ymm5, ymm4
  2336	vpand	ymm5, ymm5, ymm0
  2337	vmovdqu	ymmword ptr [r15], ymm5         # store output dwords 24-31
  2338	sub	r15, -128                       # r15 += 128 (32 output dwords); -128 fits a sign-extended imm8, +128 does not
  2339	add	rbx, 28                         # advance the source by 7 dwords (28 bytes)
  2340	add	r8, -1                          # decrement the trip count; ZF drives the jne below
  2341	jne	.LBB0_24
  2342	jmp	.LBB0_147
  2343.LBB0_67:                               # Dispatch on ecx (presumably the bit width -- confirm at function entry): 22 -> .LBB0_111, 23 -> loop below, else -> .LBB0_147
  2344	cmp	ecx, 22
  2345	je	.LBB0_111
  2346# %bb.68:
  2347	cmp	ecx, 23
  2348	jne	.LBB0_147
  2349# %bb.69:
  2350	cmp	edx, 32
  2351	jl	.LBB0_147                       # signed compare: edx < 32 skips the loop entirely
  2352# %bb.70:
  2353	mov	r8d, r14d                       # r8 = loop trip count (r14d is set outside this view)
  2354	add	r15, 96                         # bias r15 so the four 32-byte stores use offsets -96..0
  2355	add	rbx, 88                         # bias rbx so the loads use offsets -88..0
  2356	vmovdqa	ymm8, ymmword ptr [rip + .LCPI0_48] # ymm8 = [0,0,0,5,0,0,0,1]
  2357	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_49] # ymm1 = [36028792732385279,36028792732385279,36028792732385279,36028792732385279]
  2358	vmovdqa	xmm2, xmmword ptr [rip + .LCPI0_50] # xmm2 = <24,15,u,u>
  2359	vmovdqa	xmm3, xmmword ptr [rip + .LCPI0_51] # xmm3 = <8,17,u,u>
  2360	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_52] # ymm4 = [0,0,6,0,0,0,2,0]
  2361	vmovdqa	ymm5, ymmword ptr [rip + .LCPI0_53] # ymm5 = [0,7,0,0,0,3,0,0]
  2362	vmovdqa	ymm6, ymmword ptr [rip + .LCPI0_54] # ymm6 = [8,0,0,0,4,0,0,9]
  2363	.p2align	4, 0x90
  2364.LBB0_71:                               # =>This Inner Loop Header: Depth=1
  2365	mov	r9d, dword ptr [rbx - 68]
  2366	mov	edx, dword ptr [rbx - 72]
  2367	mov	r11d, r9d
  2368	shld	r11d, edx, 22                   # r11d = (r11d << 22) | (edx >> 10): combines bits from two adjacent source dwords
  2369	mov	edi, dword ptr [rbx - 76]
  2370	shld	edx, edi, 13
  2371	mov	esi, dword ptr [rbx - 80]
  2372	shld	edi, esi, 4
  2373	mov	r10d, dword ptr [rbx - 88]
  2374	mov	ecx, dword ptr [rbx - 84]
  2375	mov	eax, esi
  2376	shld	eax, ecx, 18
  2377	shld	ecx, r10d, 9
  2378	vmovd	xmm7, r10d                      # xmm7 collects lanes 0-3, xmm0 lanes 4-7 of the first output vector
  2379	vmovd	xmm0, edi
  2380	vpinsrd	xmm7, xmm7, ecx, 1
  2381	vpinsrd	xmm0, xmm0, edx, 1
  2382	vpinsrd	xmm7, xmm7, eax, 2
  2383	vpinsrd	xmm0, xmm0, r11d, 2
  2384	vpinsrd	xmm7, xmm7, esi, 3
  2385	vpinsrd	xmm0, xmm0, r9d, 3
  2386	vinserti128	ymm0, ymm7, xmm0, 1     # ymm0 = xmm7 (low half) : xmm0 (high half)
  2387	vpsrlvd	ymm0, ymm0, ymm8                # per-lane logical right shift by the counts in ymm8
  2388	vpand	ymm0, ymm0, ymm1                # keep the low 23 bits of every dword lane (mask 0x7fffff); the mask lives in ymm1 in this block
  2389	vmovdqu	ymmword ptr [r15 - 96], ymm0    # store output dwords 0-7 of this iteration
  2390	mov	eax, dword ptr [rbx - 48]
  2391	mov	r9d, dword ptr [rbx - 44]
  2392	shld	r9d, eax, 7
  2393	mov	edx, dword ptr [rbx - 52]
  2394	mov	esi, eax
  2395	shld	esi, edx, 21
  2396	mov	edi, dword ptr [rbx - 60]
  2397	mov	ecx, dword ptr [rbx - 56]
  2398	shld	edx, ecx, 12
  2399	shld	ecx, edi, 3
  2400	vmovq	xmm0, qword ptr [rbx - 68]      # xmm0 = mem[0],zero
  2401	vpsrlvd	xmm7, xmm0, xmm2                # low parts: source dwords shifted right per lane
  2402	vpshufd	xmm0, xmm0, 229                 # xmm0 = xmm0[1,1,2,3]
  2403	vpinsrd	xmm0, xmm0, edi, 1
  2404	vpsllvd	xmm0, xmm0, xmm3                # high parts: the following source dwords shifted left per lane
  2405	vpor	xmm0, xmm7, xmm0                # merge low and high parts (only lanes 0-1 are kept; lanes 2-3 are overwritten below)
  2406	vmovd	xmm7, edx
  2407	vpinsrd	xmm7, xmm7, esi, 1
  2408	vpinsrd	xmm7, xmm7, eax, 2
  2409	vpinsrd	xmm7, xmm7, r9d, 3
  2410	vpinsrd	xmm0, xmm0, edi, 2
  2411	vpinsrd	xmm0, xmm0, ecx, 3
  2412	vinserti128	ymm0, ymm0, xmm7, 1
  2413	vpsrlvd	ymm0, ymm0, ymm4
  2414	vpand	ymm0, ymm0, ymm1
  2415	vmovdqu	ymmword ptr [r15 - 64], ymm0    # store output dwords 8-15
  2416	mov	r11d, dword ptr [rbx - 24]
  2417	mov	r9d, dword ptr [rbx - 20]
  2418	shld	r9d, r11d, 15
  2419	mov	r10d, dword ptr [rbx - 28]
  2420	shld	r11d, r10d, 6
  2421	mov	esi, dword ptr [rbx - 32]
  2422	mov	edi, r10d
  2423	mov	ecx, dword ptr [rbx - 36]
  2424	shld	edi, esi, 20
  2425	mov	edx, dword ptr [rbx - 44]
  2426	mov	eax, dword ptr [rbx - 40]
  2427	shld	esi, ecx, 11
  2428	shrd	edx, eax, 16                    # edx = (edx >> 16) | (eax << 16)
  2429	shld	ecx, eax, 2
  2430	vmovd	xmm0, edi
  2431	vpinsrd	xmm0, xmm0, r10d, 1
  2432	vmovd	xmm7, edx
  2433	vpinsrd	xmm0, xmm0, r11d, 2
  2434	vpinsrd	xmm7, xmm7, eax, 1
  2435	vpinsrd	xmm0, xmm0, r9d, 3
  2436	vpinsrd	xmm7, xmm7, ecx, 2
  2437	vpinsrd	xmm7, xmm7, esi, 3
  2438	vinserti128	ymm0, ymm7, xmm0, 1
  2439	vpsrlvd	ymm0, ymm0, ymm5
  2440	vpand	ymm0, ymm0, ymm1
  2441	vmovdqu	ymmword ptr [r15 - 32], ymm0    # store output dwords 16-23
  2442	mov	r9d, dword ptr [rbx]
  2443	mov	ecx, dword ptr [rbx - 4]
  2444	mov	edx, r9d
  2445	shld	edx, ecx, 14
  2446	mov	esi, dword ptr [rbx - 8]
  2447	shld	ecx, esi, 5
  2448	mov	edi, dword ptr [rbx - 12]
  2449	vmovd	xmm0, esi                       # unshifted esi is captured before the shld below overwrites it
  2450	shld	esi, edi, 19
  2451	mov	r10d, dword ptr [rbx - 20]
  2452	mov	eax, dword ptr [rbx - 16]
  2453	shld	edi, eax, 10
  2454	shld	eax, r10d, 1
  2455	vpinsrd	xmm0, xmm0, ecx, 1
  2456	vmovd	xmm7, r10d
  2457	vpinsrd	xmm0, xmm0, edx, 2
  2458	vpinsrd	xmm7, xmm7, eax, 1
  2459	vpinsrd	xmm0, xmm0, r9d, 3
  2460	vpinsrd	xmm7, xmm7, edi, 2
  2461	vpinsrd	xmm7, xmm7, esi, 3
  2462	vinserti128	ymm0, ymm7, xmm0, 1
  2463	vpsrlvd	ymm0, ymm0, ymm6
  2464	vpand	ymm0, ymm0, ymm1
  2465	vmovdqu	ymmword ptr [r15], ymm0         # store output dwords 24-31
  2466	sub	r15, -128                       # r15 += 128 (32 output dwords); -128 fits a sign-extended imm8, +128 does not
  2467	add	rbx, 92                         # advance the source by 23 dwords (92 bytes)
  2468	add	r8, -1                          # decrement the trip count; ZF drives the jne below
  2469	jne	.LBB0_71
  2470	jmp	.LBB0_147
  2471.LBB0_43:                               # Dispatch on ecx (presumably the bit width -- confirm at function entry): 14 -> .LBB0_123, 15 -> loop below, else -> .LBB0_147
  2472	cmp	ecx, 14
  2473	je	.LBB0_123
  2474# %bb.44:
  2475	cmp	ecx, 15
  2476	jne	.LBB0_147
  2477# %bb.45:
  2478	cmp	edx, 32
  2479	jl	.LBB0_147                       # signed compare: edx < 32 skips the loop entirely
  2480# %bb.46:
  2481	mov	r8d, r14d                       # r8 = loop trip count (r14d is set outside this view)
  2482	add	r15, 96                         # bias r15 so the four 32-byte stores use offsets -96..0
  2483	add	rbx, 56                         # bias rbx so the dword loads use offsets -56..0
  2484	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_82] # ymm0 = [140733193420799,140733193420799,140733193420799,140733193420799]
  2485	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_81] # ymm1 = [0,15,0,13,0,11,0,9]
  2486	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_83] # ymm2 = [0,7,0,5,0,3,0,1]
  2487	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_84] # ymm3 = [16,0,14,0,12,0,10,0]
  2488	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_85] # ymm4 = [8,0,6,0,4,0,2,17]
  2489	.p2align	4, 0x90
  2490.LBB0_47:                               # =>This Inner Loop Header: Depth=1
  2491	mov	r9d, dword ptr [rbx - 44]
  2492	mov	eax, dword ptr [rbx - 48]
  2493	mov	esi, r9d
  2494	shld	esi, eax, 6                     # esi = (esi << 6) | (eax >> 26): combines bits from two adjacent source dwords
  2495	mov	r10d, dword ptr [rbx - 52]
  2496	mov	edx, eax
  2497	shld	edx, r10d, 4
  2498	mov	ecx, dword ptr [rbx - 56]
  2499	mov	edi, r10d
  2500	shld	edi, ecx, 2
  2501	vmovd	xmm5, edx                       # xmm5 collects lanes 4-7 of the first output vector
  2502	vpinsrd	xmm5, xmm5, eax, 1
  2503	vpinsrd	xmm5, xmm5, esi, 2
  2504	vpinsrd	xmm5, xmm5, r9d, 3
  2505	vmovd	xmm6, ecx                       # xmm6 collects lanes 0-3 of the first output vector
  2506	vpinsrd	xmm6, xmm6, ecx, 1
  2507	vpinsrd	xmm6, xmm6, edi, 2
  2508	vpinsrd	xmm6, xmm6, r10d, 3
  2509	vinserti128	ymm5, ymm6, xmm5, 1     # ymm5 = xmm6 (low half) : xmm5 (high half)
  2510	vpsrlvd	ymm5, ymm5, ymm1                # per-lane logical right shift by the counts in ymm1
  2511	vpand	ymm5, ymm5, ymm0                # keep the low 15 bits of every dword lane (mask 0x7fff)
  2512	vmovdqu	ymmword ptr [r15 - 96], ymm5    # store output dwords 0-7 of this iteration
  2513	mov	r9d, dword ptr [rbx - 28]
  2514	mov	r11d, dword ptr [rbx - 32]
  2515	mov	edx, r9d
  2516	shld	edx, r11d, 14
  2517	mov	r10d, dword ptr [rbx - 36]
  2518	mov	edi, r11d
  2519	shld	edi, r10d, 12
  2520	mov	eax, dword ptr [rbx - 44]
  2521	mov	esi, dword ptr [rbx - 40]
  2522	mov	ecx, r10d
  2523	shld	ecx, esi, 10
  2524	shrd	eax, esi, 24                    # eax = (eax >> 24) | (esi << 8)
  2525	vmovd	xmm5, edi
  2526	vpinsrd	xmm5, xmm5, r11d, 1
  2527	vpinsrd	xmm5, xmm5, edx, 2
  2528	vpinsrd	xmm5, xmm5, r9d, 3
  2529	vmovd	xmm6, eax
  2530	vpinsrd	xmm6, xmm6, esi, 1
  2531	vpinsrd	xmm6, xmm6, ecx, 2
  2532	vpinsrd	xmm6, xmm6, r10d, 3
  2533	vinserti128	ymm5, ymm6, xmm5, 1
  2534	vpsrlvd	ymm5, ymm5, ymm2
  2535	vpand	ymm5, ymm5, ymm0
  2536	vmovdqu	ymmword ptr [r15 - 64], ymm5    # store output dwords 8-15
  2537	mov	eax, dword ptr [rbx - 16]
  2538	mov	r10d, dword ptr [rbx - 12]
  2539	shld	r10d, eax, 7
  2540	mov	edx, dword ptr [rbx - 20]
  2541	mov	esi, eax
  2542	shld	esi, edx, 5
  2543	mov	r9d, dword ptr [rbx - 28]
  2544	mov	ecx, dword ptr [rbx - 24]
  2545	mov	edi, ecx
  2546	shld	edi, r9d, 1
  2547	vmovd	xmm5, edx                       # unshifted edx is captured before the shld below overwrites it
  2548	shld	edx, ecx, 3
  2549	vpinsrd	xmm5, xmm5, esi, 1
  2550	vpinsrd	xmm5, xmm5, eax, 2
  2551	vpinsrd	xmm5, xmm5, r10d, 3
  2552	vmovd	xmm6, r9d
  2553	vpinsrd	xmm6, xmm6, edi, 1
  2554	vpinsrd	xmm6, xmm6, ecx, 2
  2555	vpinsrd	xmm6, xmm6, edx, 3
  2556	vinserti128	ymm5, ymm6, xmm5, 1
  2557	vpsrlvd	ymm5, ymm5, ymm3
  2558	vpand	ymm5, ymm5, ymm0
  2559	vmovdqu	ymmword ptr [r15 - 32], ymm5    # store output dwords 16-23
  2560	mov	r9d, dword ptr [rbx]
  2561	mov	ecx, dword ptr [rbx - 4]
  2562	mov	edx, r9d
  2563	shld	edx, ecx, 13
  2564	mov	eax, dword ptr [rbx - 8]
  2565	vmovd	xmm5, ecx                       # unshifted ecx is captured before the shld below overwrites it
  2566	shld	ecx, eax, 11
  2567	mov	edi, dword ptr [rbx - 12]
  2568	mov	esi, eax
  2569	shld	esi, edi, 9
  2570	vmovd	xmm6, edi
  2571	vpinsrd	xmm6, xmm6, esi, 1
  2572	vpinsrd	xmm6, xmm6, eax, 2
  2573	vpinsrd	xmm6, xmm6, ecx, 3
  2574	vpinsrd	xmm5, xmm5, edx, 1
  2575	vpinsrd	xmm5, xmm5, r9d, 2
  2576	vpinsrd	xmm5, xmm5, r9d, 3
  2577	vinserti128	ymm5, ymm6, xmm5, 1
  2578	vpsrlvd	ymm5, ymm5, ymm4
  2579	vpand	ymm5, ymm5, ymm0
  2580	vmovdqu	ymmword ptr [r15], ymm5         # store output dwords 24-31
  2581	sub	r15, -128                       # r15 += 128 (32 output dwords); -128 fits a sign-extended imm8, +128 does not
  2582	add	rbx, 60                         # advance the source by 15 dwords (60 bytes)
  2583	add	r8, -1                          # decrement the trip count; ZF drives the jne below
  2584	jne	.LBB0_47
  2585	jmp	.LBB0_147
  2586.LBB0_96:
      # Case handler for 31-bit fields: every loop iteration consumes 124 bytes
      # (31 dwords) at rbx and stores 32 dwords (128 bytes) at r15, each masked
      # with 0x7fffffff. The whole case is skipped when edx < 32 (signed compare).
      # NOTE(review): edx is presumably the value count and .LBB0_147 the shared
      # exit; both are established outside this view - confirm against the caller.
  2587	cmp	edx, 32
  2588	jl	.LBB0_147
  2589# %bb.97:
      # r8 = trip count (zero-extended r14d). r15 is biased by +96 so the four
      # 32-byte stores in the loop use displacements -96, -64, -32 and 0.
  2590	mov	r8d, r14d
  2591	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_0] # ymm0 = [9223372034707292159,9223372034707292159,9223372034707292159,9223372034707292159]
  2592	add	r15, 96
  2593	vmovdqa	ymm8, ymmword ptr [rip + .LCPI0_1] # ymm8 = [24,23,22,21,20,19,18,17]
  2594	vmovdqa	ymm9, ymmword ptr [rip + .LCPI0_2] # ymm9 = [8,9,10,11,12,13,14,15]
  2595	vmovdqa	ymm10, ymmword ptr [rip + .LCPI0_3] # ymm10 = [16,15,14,13,12,11,10,9]
  2596	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_4] # ymm4 = [16,17,18,19,20,21,22,23]
  2597	vmovdqa	xmm5, xmmword ptr [rip + .LCPI0_5] # xmm5 = [8,7,6,5]
  2598	vmovdqa	xmm6, xmmword ptr [rip + .LCPI0_6] # xmm6 = [24,25,26,27]
  2599	vmovdqa	ymm7, ymmword ptr [rip + .LCPI0_7] # ymm7 = [0,0,0,0,0,0,0,1]
  2600	.p2align	4, 0x90
  2601.LBB0_98:                               # =>This Inner Loop Header: Depth=1
      # Outputs 0-7 (scalar path): output k = (dword[k] << k) | (dword[k-1] >> (32-k)),
      # built with shld. Each shld consumes its low-word source before that register
      # is itself rewritten by the next shld, so the instruction order is significant.
  2602	mov	r10d, dword ptr [rbx + 24]
  2603	mov	r9d, dword ptr [rbx + 28]
  2604	shld	r9d, r10d, 7
  2605	mov	esi, dword ptr [rbx + 20]
  2606	shld	r10d, esi, 6
  2607	mov	edi, dword ptr [rbx + 16]
  2608	shld	esi, edi, 5
  2609	mov	eax, dword ptr [rbx + 12]
  2610	shld	edi, eax, 4
  2611	mov	edx, dword ptr [rbx + 8]
  2612	shld	eax, edx, 3
  2613	mov	ecx, dword ptr [rbx + 4]
  2614	shld	edx, ecx, 2
  2615	mov	r11d, dword ptr [rbx]
  2616	shld	ecx, r11d, 1
      # xmm1 = outputs 4-7 (high lane), xmm2 = outputs 0-3 (low lane).
  2617	vmovd	xmm1, edi
  2618	vpinsrd	xmm1, xmm1, esi, 1
  2619	vpinsrd	xmm1, xmm1, r10d, 2
  2620	vpinsrd	xmm1, xmm1, r9d, 3
  2621	vmovd	xmm2, r11d
  2622	vpinsrd	xmm2, xmm2, ecx, 1
  2623	vpinsrd	xmm2, xmm2, edx, 2
  2624	vpinsrd	xmm2, xmm2, eax, 3
  2625	vinserti128	ymm1, ymm2, xmm1, 1
  2626	vpand	ymm1, ymm1, ymm0
  2627	vmovdqu	ymmword ptr [r15 - 96], ymm1
      # Outputs 8-15 (vector path): dwords 7..14 shifted right by ymm8, OR'ed with
      # dwords 8..15 shifted left by ymm9. The "next dword" vector is assembled with
      # vpalignr (low lane) and vpshufd + vpinsrd (high lane).
  2628	vmovdqu	ymm1, ymmword ptr [rbx + 28]
  2629	vpsrlvd	ymm1, ymm1, ymm8
  2630	vmovdqu	xmm2, xmmword ptr [rbx + 44]
  2631	vpshufd	xmm3, xmm2, 249                 # xmm3 = xmm2[1,2,3,3]
  2632	vpinsrd	xmm3, xmm3, dword ptr [rbx + 60], 3
  2633	vpalignr	xmm2, xmm2, xmmword ptr [rbx + 28], 4 # xmm2 = mem[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
  2634	vinserti128	ymm2, ymm2, xmm3, 1
  2635	vpsllvd	ymm2, ymm2, ymm9
  2636	vpor	ymm1, ymm1, ymm2
  2637	vpand	ymm1, ymm1, ymm0
  2638	vmovdqu	ymmword ptr [r15 - 64], ymm1
      # Outputs 16-23: same scheme on dwords 15..23 with shift vectors ymm10 / ymm4.
  2639	vmovdqu	ymm1, ymmword ptr [rbx + 60]
  2640	vmovdqu	xmm2, xmmword ptr [rbx + 76]
  2641	vpshufd	xmm3, xmm2, 249                 # xmm3 = xmm2[1,2,3,3]
  2642	vpinsrd	xmm3, xmm3, dword ptr [rbx + 92], 3
  2643	vpsrlvd	ymm1, ymm1, ymm10
  2644	vpalignr	xmm2, xmm2, xmmword ptr [rbx + 60], 4 # xmm2 = mem[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
  2645	vinserti128	ymm2, ymm2, xmm3, 1
  2646	vpsllvd	ymm2, ymm2, ymm4
  2647	vpor	ymm1, ymm1, ymm2
  2648	vpand	ymm1, ymm1, ymm0
  2649	vmovdqu	ymmword ptr [r15 - 32], ymm1
      # Outputs 24-31: outputs 28-30 are stitched with scalar shld (dwords 27..30),
      # outputs 24-27 with the 128-bit vector shift/OR (dwords 23..27); output 31 is
      # dword 30 shifted right by 1, which is the only non-zero lane of ymm7.
  2650	mov	eax, dword ptr [rbx + 120]
  2651	mov	ecx, dword ptr [rbx + 116]
  2652	mov	edx, eax
  2653	shld	edx, ecx, 30
  2654	mov	esi, dword ptr [rbx + 112]
  2655	shld	ecx, esi, 29
  2656	mov	edi, dword ptr [rbx + 108]
  2657	shld	esi, edi, 28
  2658	vmovdqu	xmm1, xmmword ptr [rbx + 92]
  2659	vpsrlvd	xmm2, xmm1, xmm5
  2660	vpshufd	xmm1, xmm1, 249                 # xmm1 = xmm1[1,2,3,3]
  2661	vpinsrd	xmm1, xmm1, edi, 3
  2662	vpsllvd	xmm1, xmm1, xmm6
  2663	vmovd	xmm3, esi
  2664	vpinsrd	xmm3, xmm3, ecx, 1
  2665	vpinsrd	xmm3, xmm3, edx, 2
  2666	vpinsrd	xmm3, xmm3, eax, 3
  2667	vpor	xmm1, xmm2, xmm1
  2668	vinserti128	ymm1, ymm1, xmm3, 1
  2669	vpsrlvd	ymm1, ymm1, ymm7
  2670	vpand	ymm1, ymm1, ymm0
  2671	vmovdqu	ymmword ptr [r15], ymm1
      # Advance input by 124 bytes and output by 128 bytes; loop until r8 reaches 0.
  2672	add	rbx, 124
  2673	sub	r15, -128
  2674	add	r8, -1
  2675	jne	.LBB0_98
  2676	jmp	.LBB0_147
  2677.LBB0_144:
      # Case handler that zero-fills the output: each iteration calls the memset
      # shim with (rdi = r15, esi = 0, edx = 128) and then advances r15 by 128 bytes.
      # Skipped when edx < 32 (signed compare).
      # NOTE(review): presumably the zero-bit-width case, where every decoded value
      # is 0 - the dispatch that selects this label is outside this view.
  2678	cmp	edx, 32
  2679	jl	.LBB0_147
  2680# %bb.145:
      # rbx is reused as the loop counter here (loaded from r14d); the input
      # pointer it held is not read in this case.
  2681	mov	ebx, r14d
  2682	.p2align	4, 0x90
  2683.LBB0_146:                              # =>This Inner Loop Header: Depth=1
  2684	mov	edx, 128
  2685	mov	rdi, r15
  2686	xor	esi, esi
      # rbx and r15 are live across the call - assumes the callee preserves them
      # (both are callee-saved under SysV); TODO confirm for this memset shim.
  2687	call	clib·_memset(SB)
  2688	sub	r15, -128
  2689	add	rbx, -1
  2690	jne	.LBB0_146
  2691	jmp	.LBB0_147
  2692.LBB0_120:
      # Case handler for 16-bit fields: every iteration reads 64 bytes from
      # rbx + rcx (four 16-byte loads) and stores 128 bytes at r15 + 2*rcx, i.e.
      # each 16-bit half of an input dword becomes one zero-extended output dword.
      # Skipped when edx < 32 (signed compare).
  2693	cmp	edx, 32
  2694	jl	.LBB0_147
  2695# %bb.121:
      # rax = trip count (from r14d), rcx = running input byte offset,
      # ymm0 = per-dword shift counts {0,16} repeated, ymm1 = all zeros.
  2696	mov	eax, r14d
  2697	xor	ecx, ecx
  2698	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_80] # ymm0 = [68719476736,68719476736,68719476736,68719476736]
  2699	vpxor	xmm1, xmm1, xmm1
  2700	.p2align	4, 0x90
  2701.LBB0_122:                              # =>This Inner Loop Header: Depth=1
      # Per 16-byte load: vpermq + vpshufd duplicate each input dword into two
      # adjacent lanes, vpsrlvd shifts the second copy right by 16, and vpblendw
      # replaces the upper 16-bit word of every dword with zero (from ymm1).
  2702	vmovdqu	xmm2, xmmword ptr [rbx + rcx]
  2703	vpermq	ymm2, ymm2, 216                 # ymm2 = ymm2[0,2,1,3]
  2704	vpshufd	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1,4,4,5,5]
  2705	vpsrlvd	ymm2, ymm2, ymm0
  2706	vpblendw	ymm2, ymm2, ymm1, 170           # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
  2707	vmovdqu	ymmword ptr [r15 + 2*rcx], ymm2
  2708	vmovdqu	xmm2, xmmword ptr [rbx + rcx + 16]
  2709	vpermq	ymm2, ymm2, 216                 # ymm2 = ymm2[0,2,1,3]
  2710	vpshufd	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1,4,4,5,5]
  2711	vpsrlvd	ymm2, ymm2, ymm0
  2712	vpblendw	ymm2, ymm2, ymm1, 170           # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
  2713	vmovdqu	ymmword ptr [r15 + 2*rcx + 32], ymm2
  2714	vmovdqu	xmm2, xmmword ptr [rbx + rcx + 32]
  2715	vpermq	ymm2, ymm2, 216                 # ymm2 = ymm2[0,2,1,3]
  2716	vpshufd	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1,4,4,5,5]
  2717	vpsrlvd	ymm2, ymm2, ymm0
  2718	vpblendw	ymm2, ymm2, ymm1, 170           # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
  2719	vmovdqu	ymmword ptr [r15 + 2*rcx + 64], ymm2
  2720	vmovdqu	xmm2, xmmword ptr [rbx + rcx + 48]
  2721	vpermq	ymm2, ymm2, 216                 # ymm2 = ymm2[0,2,1,3]
  2722	vpshufd	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1,4,4,5,5]
  2723	vpsrlvd	ymm2, ymm2, ymm0
  2724	vpblendw	ymm2, ymm2, ymm1, 170           # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
  2725	vmovdqu	ymmword ptr [r15 + 2*rcx + 96], ymm2
      # 64 input bytes consumed; the 2*rcx scaling makes the output advance 128.
  2726	add	rcx, 64
  2727	add	rax, -1
  2728	jne	.LBB0_122
  2729	jmp	.LBB0_147
  2730.LBB0_132:
      # Case handler for 8-bit fields: every iteration reads 32 bytes from
      # rbx + rcx (four 8-byte loads) and stores 128 bytes at r15 + 4*rcx, i.e.
      # each input byte becomes one zero-extended output dword.
      # Skipped when edx < 32 (signed compare).
  2731	cmp	edx, 32
  2732	jl	.LBB0_147
  2733# %bb.133:
      # rax = trip count (from r14d), rcx = running input byte offset,
      # ymm0 = per-lane shift counts, ymm1 = 0xff mask in every dword.
  2734	mov	eax, r14d
  2735	xor	ecx, ecx
  2736	vbroadcasti128	ymm0, xmmword ptr [rip + .LCPI0_109] # ymm0 = [0,8,16,24,0,8,16,24]
  2737                                        # ymm0 = mem[0,1,0,1]
  2738	vpbroadcastd	ymm1, dword ptr [rip + .LCPI0_110] # ymm1 = [255,255,255,255,255,255,255,255]
  2739	.p2align	4, 0x90
  2740.LBB0_134:                              # =>This Inner Loop Header: Depth=1
      # Per 8-byte load: vpshufd + vpermq replicate the first dword across the low
      # four lanes and the second dword across the high four lanes; the variable
      # shift then brings byte 0..3 of each dword down to bit 0 before masking.
  2741	vmovq	xmm2, qword ptr [rbx + rcx]     # xmm2 = mem[0],zero
  2742	vpshufd	xmm2, xmm2, 80                  # xmm2 = xmm2[0,0,1,1]
  2743	vpermq	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1]
  2744	vpsrlvd	ymm2, ymm2, ymm0
  2745	vpand	ymm2, ymm2, ymm1
  2746	vmovdqu	ymmword ptr [r15 + 4*rcx], ymm2
  2747	vmovq	xmm2, qword ptr [rbx + rcx + 8] # xmm2 = mem[0],zero
  2748	vpshufd	xmm2, xmm2, 80                  # xmm2 = xmm2[0,0,1,1]
  2749	vpermq	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1]
  2750	vpsrlvd	ymm2, ymm2, ymm0
  2751	vpand	ymm2, ymm2, ymm1
  2752	vmovdqu	ymmword ptr [r15 + 4*rcx + 32], ymm2
  2753	vmovq	xmm2, qword ptr [rbx + rcx + 16] # xmm2 = mem[0],zero
  2754	vpshufd	xmm2, xmm2, 80                  # xmm2 = xmm2[0,0,1,1]
  2755	vpermq	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1]
  2756	vpsrlvd	ymm2, ymm2, ymm0
  2757	vpand	ymm2, ymm2, ymm1
  2758	vmovdqu	ymmword ptr [r15 + 4*rcx + 64], ymm2
  2759	vmovq	xmm2, qword ptr [rbx + rcx + 24] # xmm2 = mem[0],zero
  2760	vpshufd	xmm2, xmm2, 80                  # xmm2 = xmm2[0,0,1,1]
  2761	vpermq	ymm2, ymm2, 80                  # ymm2 = ymm2[0,0,1,1]
  2762	vpsrlvd	ymm2, ymm2, ymm0
  2763	vpand	ymm2, ymm2, ymm1
  2764	vmovdqu	ymmword ptr [r15 + 4*rcx + 96], ymm2
      # 32 input bytes consumed; the 4*rcx scaling makes the output advance 128.
  2765	add	rcx, 32
  2766	add	rax, -1
  2767	jne	.LBB0_134
  2768	jmp	.LBB0_147
  2769.LBB0_108:
      # Case handler for 24-bit fields: every iteration consumes 96 bytes
      # (24 dwords) at rbx and stores 32 dwords (128 bytes) at r15, each masked
      # with 0xffffff. Skipped when edx < 32 (signed compare).
  2770	cmp	edx, 32
  2771	jl	.LBB0_147
  2772# %bb.109:
      # r8 = trip count (from r14d). Both pointers are pre-biased (r15 += 96,
      # rbx += 92) so the loop body addresses with non-positive displacements.
  2773	mov	r8d, r14d
  2774	add	r15, 96
  2775	add	rbx, 92
  2776	vbroadcasti128	ymm0, xmmword ptr [rip + .LCPI0_46] # ymm0 = [0,0,0,8,0,0,0,8]
  2777                                        # ymm0 = mem[0,1,0,1]
  2778	vpbroadcastd	ymm1, dword ptr [rip + .LCPI0_47] # ymm1 = [16777215,16777215,16777215,16777215,16777215,16777215,16777215,16777215]
  2779	.p2align	4, 0x90
  2780.LBB0_110:                              # =>This Inner Loop Header: Depth=1
      # The body is four copies of the same pattern, each turning 6 input dwords
      # into 8 outputs. Within every 3 dwords (w0,w1,w2) the four fields are:
      # w0, shld(w1,w0,8), shld(w2,w1,16), w2 >> 8 (the shift by 8 comes from ymm0).
      # shld on a register that is also a later low-word source is issued after
      # that use, so the instruction order is significant.
      # Group 1: input dwords at [rbx-92 .. rbx-72] -> outputs 0-7.
  2781	mov	r9d, dword ptr [rbx - 72]
  2782	mov	edx, dword ptr [rbx - 76]
  2783	mov	esi, r9d
  2784	mov	edi, dword ptr [rbx - 80]
  2785	mov	r10d, dword ptr [rbx - 84]
  2786	shld	esi, edx, 16
  2787	mov	r11d, dword ptr [rbx - 92]
  2788	mov	eax, dword ptr [rbx - 88]
  2789	shld	edx, edi, 8
  2790	mov	ecx, r10d
  2791	shld	ecx, eax, 16
  2792	shld	eax, r11d, 8
  2793	vmovd	xmm2, edi
  2794	vmovd	xmm3, r11d
  2795	vpinsrd	xmm2, xmm2, edx, 1
  2796	vpinsrd	xmm3, xmm3, eax, 1
  2797	vpinsrd	xmm2, xmm2, esi, 2
  2798	vpinsrd	xmm3, xmm3, ecx, 2
  2799	vpinsrd	xmm2, xmm2, r9d, 3
  2800	vpinsrd	xmm3, xmm3, r10d, 3
  2801	vinserti128	ymm2, ymm3, xmm2, 1
  2802	vpsrlvd	ymm2, ymm2, ymm0
  2803	vpand	ymm2, ymm2, ymm1
  2804	vmovdqu	ymmword ptr [r15 - 96], ymm2
      # Group 2: input dwords at [rbx-68 .. rbx-48] -> outputs 8-15.
  2805	mov	r9d, dword ptr [rbx - 48]
  2806	mov	ecx, dword ptr [rbx - 52]
  2807	mov	edx, r9d
  2808	mov	esi, dword ptr [rbx - 56]
  2809	mov	r10d, dword ptr [rbx - 60]
  2810	shld	edx, ecx, 16
  2811	mov	r11d, dword ptr [rbx - 68]
  2812	mov	edi, dword ptr [rbx - 64]
  2813	shld	ecx, esi, 8
  2814	mov	eax, r10d
  2815	shld	eax, edi, 16
  2816	shld	edi, r11d, 8
  2817	vmovd	xmm2, esi
  2818	vmovd	xmm3, r11d
  2819	vpinsrd	xmm2, xmm2, ecx, 1
  2820	vpinsrd	xmm3, xmm3, edi, 1
  2821	vpinsrd	xmm2, xmm2, edx, 2
  2822	vpinsrd	xmm3, xmm3, eax, 2
  2823	vpinsrd	xmm2, xmm2, r9d, 3
  2824	vpinsrd	xmm3, xmm3, r10d, 3
  2825	vinserti128	ymm2, ymm3, xmm2, 1
  2826	vpsrlvd	ymm2, ymm2, ymm0
  2827	vpand	ymm2, ymm2, ymm1
  2828	vmovdqu	ymmword ptr [r15 - 64], ymm2
      # Group 3: input dwords at [rbx-44 .. rbx-24] -> outputs 16-23.
  2829	mov	r9d, dword ptr [rbx - 24]
  2830	mov	ecx, dword ptr [rbx - 28]
  2831	mov	edx, r9d
  2832	mov	esi, dword ptr [rbx - 32]
  2833	mov	r10d, dword ptr [rbx - 36]
  2834	shld	edx, ecx, 16
  2835	mov	r11d, dword ptr [rbx - 44]
  2836	mov	edi, dword ptr [rbx - 40]
  2837	shld	ecx, esi, 8
  2838	mov	eax, r10d
  2839	shld	eax, edi, 16
  2840	shld	edi, r11d, 8
  2841	vmovd	xmm2, esi
  2842	vmovd	xmm3, r11d
  2843	vpinsrd	xmm2, xmm2, ecx, 1
  2844	vpinsrd	xmm3, xmm3, edi, 1
  2845	vpinsrd	xmm2, xmm2, edx, 2
  2846	vpinsrd	xmm3, xmm3, eax, 2
  2847	vpinsrd	xmm2, xmm2, r9d, 3
  2848	vpinsrd	xmm3, xmm3, r10d, 3
  2849	vinserti128	ymm2, ymm3, xmm2, 1
  2850	vpsrlvd	ymm2, ymm2, ymm0
  2851	vpand	ymm2, ymm2, ymm1
  2852	vmovdqu	ymmword ptr [r15 - 32], ymm2
      # Group 4: input dwords at [rbx-20 .. rbx] -> outputs 24-31.
  2853	mov	r9d, dword ptr [rbx]
  2854	mov	ecx, dword ptr [rbx - 4]
  2855	mov	edx, r9d
  2856	mov	esi, dword ptr [rbx - 8]
  2857	mov	r10d, dword ptr [rbx - 12]
  2858	shld	edx, ecx, 16
  2859	mov	r11d, dword ptr [rbx - 20]
  2860	mov	edi, dword ptr [rbx - 16]
  2861	shld	ecx, esi, 8
  2862	mov	eax, r10d
  2863	shld	eax, edi, 16
  2864	shld	edi, r11d, 8
  2865	vmovd	xmm2, esi
  2866	vpinsrd	xmm2, xmm2, ecx, 1
  2867	vmovd	xmm3, r11d
  2868	vpinsrd	xmm2, xmm2, edx, 2
  2869	vpinsrd	xmm3, xmm3, edi, 1
  2870	vpinsrd	xmm2, xmm2, r9d, 3
  2871	vpinsrd	xmm3, xmm3, eax, 2
  2872	vpinsrd	xmm3, xmm3, r10d, 3
  2873	vinserti128	ymm2, ymm3, xmm2, 1
  2874	vpsrlvd	ymm2, ymm2, ymm0
  2875	vpand	ymm2, ymm2, ymm1
  2876	vmovdqu	ymmword ptr [r15], ymm2
      # Advance output by 128 bytes and input by 96 bytes; loop until r8 reaches 0.
  2877	sub	r15, -128
  2878	add	rbx, 96
  2879	add	r8, -1
  2880	jne	.LBB0_110
  2881	jmp	.LBB0_147
  2882.LBB0_138:
      # Case handler for 4-bit fields: every iteration reads 16 bytes (4 dwords)
      # from rbx + rcx and stores 128 bytes at r15 + 8*rcx, i.e. each input dword
      # expands to eight output dwords. Skipped when edx < 32 (signed compare).
  2883	cmp	edx, 32
  2884	jl	.LBB0_147
  2885# %bb.139:
      # rax = trip count (from r14d), rcx = running input byte offset.
      # ymm1 broadcasts 0x0000000f0000000f, i.e. a 0xf mask in every dword.
  2886	mov	eax, r14d
  2887	xor	ecx, ecx
  2888	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_124] # ymm0 = [0,4,8,12,16,20,24,28]
  2889	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_125] # ymm1 = [64424509455,64424509455,64424509455,64424509455]
  2890	.p2align	4, 0x90
  2891.LBB0_140:                              # =>This Inner Loop Header: Depth=1
      # Per input dword: broadcast it to all 8 lanes, shift lane i right by 4*i,
      # and keep the low nibble.
  2892	vpbroadcastd	ymm2, dword ptr [rbx + rcx]
  2893	vpsrlvd	ymm2, ymm2, ymm0
  2894	vpand	ymm2, ymm2, ymm1
  2895	vmovdqu	ymmword ptr [r15 + 8*rcx], ymm2
  2896	vpbroadcastd	ymm2, dword ptr [rbx + rcx + 4]
  2897	vpsrlvd	ymm2, ymm2, ymm0
  2898	vpand	ymm2, ymm2, ymm1
  2899	vmovdqu	ymmword ptr [r15 + 8*rcx + 32], ymm2
  2900	vpbroadcastd	ymm2, dword ptr [rbx + rcx + 8]
  2901	vpsrlvd	ymm2, ymm2, ymm0
  2902	vpand	ymm2, ymm2, ymm1
  2903	vmovdqu	ymmword ptr [r15 + 8*rcx + 64], ymm2
  2904	vpbroadcastd	ymm2, dword ptr [rbx + rcx + 12]
  2905	vpsrlvd	ymm2, ymm2, ymm0
  2906	vpand	ymm2, ymm2, ymm1
  2907	vmovdqu	ymmword ptr [r15 + 8*rcx + 96], ymm2
      # 16 input bytes consumed; the 8*rcx scaling makes the output advance 128.
  2908	add	rcx, 16
  2909	add	rax, -1
  2910	jne	.LBB0_140
  2911	jmp	.LBB0_147
  2912.LBB0_114:
      # Case handler for 20-bit fields: every iteration consumes 80 bytes
      # (20 dwords) at rbx and stores 32 dwords (128 bytes) at r15, each masked
      # with 0xfffff. Skipped when edx < 32 (signed compare).
  2913	cmp	edx, 32
  2914	jl	.LBB0_147
  2915# %bb.115:
      # r8 = trip count (from r14d). Both pointers are pre-biased (r15 += 96,
      # rbx += 76) so the loop body addresses with non-positive displacements.
  2916	mov	r8d, r14d
  2917	add	r15, 96
  2918	add	rbx, 76
  2919	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_65] # ymm0 = [0,0,8,0,0,4,0,12]
  2920	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_66] # ymm1 = [4503595333451775,4503595333451775,4503595333451775,4503595333451775]
  2921	.p2align	4, 0x90
  2922.LBB0_116:                              # =>This Inner Loop Header: Depth=1
      # The body is four copies of the same pattern, each turning 5 input dwords
      # (w0..w4) into 8 outputs:
      #   w0, shld(w1,w0,12), w1>>8, shld(w2,w1,4), shld(w3,w2,16), w3>>4,
      #   shld(w4,w3,8), w4>>12
      # The right shifts are applied afterwards by vpsrlvd with ymm0. Fields that
      # straddle two dwords are pre-stitched with shld; each shld reads its
      # low-word source before that register is rewritten.
      # Group 1: input dwords at [rbx-76 .. rbx-60] -> outputs 0-7.
  2923	mov	r9d, dword ptr [rbx - 60]
  2924	mov	r11d, dword ptr [rbx - 64]
  2925	mov	esi, r9d
  2926	shld	esi, r11d, 8
  2927	mov	edi, dword ptr [rbx - 68]
  2928	mov	edx, r11d
  2929	shld	edx, edi, 16
  2930	mov	eax, dword ptr [rbx - 72]
  2931	shld	edi, eax, 4
  2932	mov	r10d, dword ptr [rbx - 76]
  2933	mov	ecx, eax
  2934	shld	ecx, r10d, 12
  2935	vmovd	xmm2, edx
  2936	vpinsrd	xmm2, xmm2, r11d, 1
  2937	vpinsrd	xmm2, xmm2, esi, 2
  2938	vpinsrd	xmm2, xmm2, r9d, 3
  2939	vmovd	xmm3, r10d
  2940	vpinsrd	xmm3, xmm3, ecx, 1
  2941	vpinsrd	xmm3, xmm3, eax, 2
  2942	vpinsrd	xmm3, xmm3, edi, 3
  2943	vinserti128	ymm2, ymm3, xmm2, 1
  2944	vpsrlvd	ymm2, ymm2, ymm0
  2945	vpand	ymm2, ymm2, ymm1
  2946	vmovdqu	ymmword ptr [r15 - 96], ymm2
      # Group 2: input dwords at [rbx-56 .. rbx-40] -> outputs 8-15.
  2947	mov	r9d, dword ptr [rbx - 40]
  2948	mov	r11d, dword ptr [rbx - 44]
  2949	mov	edx, r9d
  2950	shld	edx, r11d, 8
  2951	mov	esi, dword ptr [rbx - 48]
  2952	mov	edi, r11d
  2953	shld	edi, esi, 16
  2954	mov	r10d, dword ptr [rbx - 56]
  2955	mov	ecx, dword ptr [rbx - 52]
  2956	shld	esi, ecx, 4
  2957	mov	eax, ecx
  2958	shld	eax, r10d, 12
  2959	vmovd	xmm2, edi
  2960	vpinsrd	xmm2, xmm2, r11d, 1
  2961	vpinsrd	xmm2, xmm2, edx, 2
  2962	vpinsrd	xmm2, xmm2, r9d, 3
  2963	vmovd	xmm3, r10d
  2964	vpinsrd	xmm3, xmm3, eax, 1
  2965	vpinsrd	xmm3, xmm3, ecx, 2
  2966	vpinsrd	xmm3, xmm3, esi, 3
  2967	vinserti128	ymm2, ymm3, xmm2, 1
  2968	vpsrlvd	ymm2, ymm2, ymm0
  2969	vpand	ymm2, ymm2, ymm1
  2970	vmovdqu	ymmword ptr [r15 - 64], ymm2
      # Group 3: input dwords at [rbx-36 .. rbx-20] -> outputs 16-23.
  2971	mov	r9d, dword ptr [rbx - 20]
  2972	mov	r11d, dword ptr [rbx - 24]
  2973	mov	edx, r9d
  2974	shld	edx, r11d, 8
  2975	mov	esi, dword ptr [rbx - 28]
  2976	mov	edi, r11d
  2977	shld	edi, esi, 16
  2978	mov	ecx, dword ptr [rbx - 32]
  2979	shld	esi, ecx, 4
  2980	mov	r10d, dword ptr [rbx - 36]
  2981	mov	eax, ecx
  2982	shld	eax, r10d, 12
  2983	vmovd	xmm2, edi
  2984	vpinsrd	xmm2, xmm2, r11d, 1
  2985	vpinsrd	xmm2, xmm2, edx, 2
  2986	vpinsrd	xmm2, xmm2, r9d, 3
  2987	vmovd	xmm3, r10d
  2988	vpinsrd	xmm3, xmm3, eax, 1
  2989	vpinsrd	xmm3, xmm3, ecx, 2
  2990	vpinsrd	xmm3, xmm3, esi, 3
  2991	vinserti128	ymm2, ymm3, xmm2, 1
  2992	vpsrlvd	ymm2, ymm2, ymm0
  2993	vpand	ymm2, ymm2, ymm1
  2994	vmovdqu	ymmword ptr [r15 - 32], ymm2
      # Group 4: input dwords at [rbx-16 .. rbx] -> outputs 24-31.
  2995	mov	r9d, dword ptr [rbx]
  2996	mov	r11d, dword ptr [rbx - 4]
  2997	mov	edx, r9d
  2998	shld	edx, r11d, 8
  2999	mov	esi, dword ptr [rbx - 8]
  3000	mov	edi, r11d
  3001	shld	edi, esi, 16
  3002	mov	r10d, dword ptr [rbx - 16]
  3003	mov	ecx, dword ptr [rbx - 12]
  3004	shld	esi, ecx, 4
  3005	mov	eax, ecx
  3006	shld	eax, r10d, 12
  3007	vmovd	xmm2, edi
  3008	vpinsrd	xmm2, xmm2, r11d, 1
  3009	vpinsrd	xmm2, xmm2, edx, 2
  3010	vpinsrd	xmm2, xmm2, r9d, 3
  3011	vmovd	xmm3, r10d
  3012	vpinsrd	xmm3, xmm3, eax, 1
  3013	vpinsrd	xmm3, xmm3, ecx, 2
  3014	vpinsrd	xmm3, xmm3, esi, 3
  3015	vinserti128	ymm2, ymm3, xmm2, 1
  3016	vpsrlvd	ymm2, ymm2, ymm0
  3017	vpand	ymm2, ymm2, ymm1
  3018	vmovdqu	ymmword ptr [r15], ymm2
      # Advance output by 128 bytes and input by 80 bytes; loop until r8 reaches 0.
  3019	sub	r15, -128
  3020	add	rbx, 80
  3021	add	r8, -1
  3022	jne	.LBB0_116
  3023	jmp	.LBB0_147
  3024.LBB0_126:
      # Case handler for 12-bit fields: every iteration consumes 48 bytes
      # (12 dwords) at rbx and stores 32 dwords (128 bytes) at r15, each masked
      # with 0xfff. Skipped when edx < 32 (signed compare).
  3025	cmp	edx, 32
  3026	jl	.LBB0_147
  3027# %bb.127:
      # r8 = trip count (from r14d). Both pointers are pre-biased (r15 += 96,
      # rbx += 44) so the loop body addresses with non-positive displacements.
      # ymm1 broadcasts 0x00000fff00000fff, i.e. a 0xfff mask in every dword.
  3028	mov	r8d, r14d
  3029	add	r15, 96
  3030	add	rbx, 44
  3031	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_94] # ymm0 = [0,12,0,4,16,0,8,20]
  3032	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_95] # ymm1 = [17587891081215,17587891081215,17587891081215,17587891081215]
  3033	.p2align	4, 0x90
  3034.LBB0_128:                              # =>This Inner Loop Header: Depth=1
      # The body is four copies of the same pattern, each turning 3 input dwords
      # (w0,w1,w2) into 8 outputs:
      #   w0, w0>>12, shld(w1,w0,8), w1>>4, w1>>16, shld(w2,w1,4), w2>>8, w2>>20
      # Only the two fields that straddle a dword boundary need shld; the right
      # shifts are applied afterwards by vpsrlvd with ymm0.
      # Group 1: input dwords at [rbx-44 .. rbx-36] -> outputs 0-7.
  3035	mov	ecx, dword ptr [rbx - 36]
  3036	mov	edx, dword ptr [rbx - 44]
  3037	mov	esi, dword ptr [rbx - 40]
  3038	mov	edi, ecx
  3039	shld	edi, esi, 4
  3040	mov	eax, esi
  3041	shld	eax, edx, 8
  3042	vmovd	xmm2, esi
  3043	vpinsrd	xmm2, xmm2, edi, 1
  3044	vpinsrd	xmm2, xmm2, ecx, 2
  3045	vpinsrd	xmm2, xmm2, ecx, 3
  3046	vmovd	xmm3, edx
  3047	vpinsrd	xmm3, xmm3, edx, 1
  3048	vpinsrd	xmm3, xmm3, eax, 2
  3049	vpinsrd	xmm3, xmm3, esi, 3
  3050	vinserti128	ymm2, ymm3, xmm2, 1
  3051	vpsrlvd	ymm2, ymm2, ymm0
  3052	vpand	ymm2, ymm2, ymm1
  3053	vmovdqu	ymmword ptr [r15 - 96], ymm2
      # Group 2: input dwords at [rbx-32 .. rbx-24] -> outputs 8-15.
  3054	mov	eax, dword ptr [rbx - 24]
  3055	mov	ecx, dword ptr [rbx - 32]
  3056	mov	edx, dword ptr [rbx - 28]
  3057	mov	esi, eax
  3058	shld	esi, edx, 4
  3059	mov	edi, edx
  3060	shld	edi, ecx, 8
  3061	vmovd	xmm2, edx
  3062	vpinsrd	xmm2, xmm2, esi, 1
  3063	vpinsrd	xmm2, xmm2, eax, 2
  3064	vpinsrd	xmm2, xmm2, eax, 3
  3065	vmovd	xmm3, ecx
  3066	vpinsrd	xmm3, xmm3, ecx, 1
  3067	vpinsrd	xmm3, xmm3, edi, 2
  3068	vpinsrd	xmm3, xmm3, edx, 3
  3069	vinserti128	ymm2, ymm3, xmm2, 1
  3070	vpsrlvd	ymm2, ymm2, ymm0
  3071	vpand	ymm2, ymm2, ymm1
  3072	vmovdqu	ymmword ptr [r15 - 64], ymm2
      # Group 3: input dwords at [rbx-20 .. rbx-12] -> outputs 16-23.
  3073	mov	eax, dword ptr [rbx - 12]
  3074	mov	ecx, dword ptr [rbx - 20]
  3075	mov	edx, dword ptr [rbx - 16]
  3076	mov	esi, eax
  3077	shld	esi, edx, 4
  3078	mov	edi, edx
  3079	shld	edi, ecx, 8
  3080	vmovd	xmm2, edx
  3081	vpinsrd	xmm2, xmm2, esi, 1
  3082	vpinsrd	xmm2, xmm2, eax, 2
  3083	vpinsrd	xmm2, xmm2, eax, 3
  3084	vmovd	xmm3, ecx
  3085	vpinsrd	xmm3, xmm3, ecx, 1
  3086	vpinsrd	xmm3, xmm3, edi, 2
  3087	vpinsrd	xmm3, xmm3, edx, 3
  3088	vinserti128	ymm2, ymm3, xmm2, 1
  3089	vpsrlvd	ymm2, ymm2, ymm0
  3090	vpand	ymm2, ymm2, ymm1
  3091	vmovdqu	ymmword ptr [r15 - 32], ymm2
      # Group 4: input dwords at [rbx-8 .. rbx] -> outputs 24-31.
  3092	mov	eax, dword ptr [rbx]
  3093	mov	ecx, dword ptr [rbx - 8]
  3094	mov	edx, dword ptr [rbx - 4]
  3095	mov	esi, eax
  3096	shld	esi, edx, 4
  3097	mov	edi, edx
  3098	shld	edi, ecx, 8
  3099	vmovd	xmm2, edx
  3100	vpinsrd	xmm2, xmm2, esi, 1
  3101	vpinsrd	xmm2, xmm2, eax, 2
  3102	vpinsrd	xmm2, xmm2, eax, 3
  3103	vmovd	xmm3, ecx
  3104	vpinsrd	xmm3, xmm3, ecx, 1
  3105	vpinsrd	xmm3, xmm3, edi, 2
  3106	vpinsrd	xmm3, xmm3, edx, 3
  3107	vinserti128	ymm2, ymm3, xmm2, 1
  3108	vpsrlvd	ymm2, ymm2, ymm0
  3109	vpand	ymm2, ymm2, ymm1
  3110	vmovdqu	ymmword ptr [r15], ymm2
      # Advance output by 128 bytes and input by 48 bytes; loop until r8 reaches 0.
  3111	sub	r15, -128
  3112	add	rbx, 48
  3113	add	r8, -1
  3114	jne	.LBB0_128
  3115	jmp	.LBB0_147
  3116.LBB0_102:
      # Case handler for 28-bit fields: every iteration consumes 112 bytes
      # (28 dwords) at rbx and stores 32 dwords (128 bytes) at r15, each masked
      # with 0xfffffff. Skipped when edx < 32 (signed compare).
  3117	cmp	edx, 32
  3118	jl	.LBB0_147
  3119# %bb.103:
      # r8 = trip count (from r14d). Both pointers are pre-biased (r15 += 96,
      # rbx += 108) so the loop body addresses with non-positive displacements.
  3120	mov	r8d, r14d
  3121	add	r15, 96
  3122	add	rbx, 108
  3123	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_22] # ymm0 = [0,0,0,0,0,0,0,4]
  3124	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_23] # ymm1 = [1152921500580315135,1152921500580315135,1152921500580315135,1152921500580315135]
  3125	.p2align	4, 0x90
  3126.LBB0_104:                              # =>This Inner Loop Header: Depth=1
      # The body is four copies of the same pattern, each turning 7 input dwords
      # (w0..w6) into 8 outputs:
      #   w0, shld(w1,w0,4), shld(w2,w1,8), shld(w3,w2,12), shld(w4,w3,16),
      #   shld(w5,w4,20), shld(w6,w5,24), w6>>4
      # The shld chain runs from the highest dword down so that each low-word
      # source is still unmodified when it is read; only the last lane needs the
      # right shift supplied by ymm0.
      # Group 1: input dwords at [rbx-108 .. rbx-84] -> outputs 0-7.
  3127	mov	r9d, dword ptr [rbx - 84]
  3128	mov	edx, dword ptr [rbx - 88]
  3129	mov	r10d, r9d
  3130	shld	r10d, edx, 24
  3131	mov	edi, dword ptr [rbx - 92]
  3132	shld	edx, edi, 20
  3133	mov	eax, dword ptr [rbx - 96]
  3134	shld	edi, eax, 16
  3135	mov	ecx, dword ptr [rbx - 100]
  3136	shld	eax, ecx, 12
  3137	mov	r11d, dword ptr [rbx - 108]
  3138	mov	esi, dword ptr [rbx - 104]
  3139	shld	ecx, esi, 8
  3140	shld	esi, r11d, 4
  3141	vmovd	xmm2, r11d
  3142	vmovd	xmm3, edi
  3143	vpinsrd	xmm2, xmm2, esi, 1
  3144	vpinsrd	xmm3, xmm3, edx, 1
  3145	vpinsrd	xmm2, xmm2, ecx, 2
  3146	vpinsrd	xmm3, xmm3, r10d, 2
  3147	vpinsrd	xmm2, xmm2, eax, 3
  3148	vpinsrd	xmm3, xmm3, r9d, 3
  3149	vinserti128	ymm2, ymm2, xmm3, 1
  3150	vpsrlvd	ymm2, ymm2, ymm0
  3151	vpand	ymm2, ymm2, ymm1
  3152	vmovdqu	ymmword ptr [r15 - 96], ymm2
      # Group 2: input dwords at [rbx-80 .. rbx-56] -> outputs 8-15.
  3153	mov	r9d, dword ptr [rbx - 56]
  3154	mov	ecx, dword ptr [rbx - 60]
  3155	mov	r10d, r9d
  3156	shld	r10d, ecx, 24
  3157	mov	esi, dword ptr [rbx - 64]
  3158	shld	ecx, esi, 20
  3159	mov	edi, dword ptr [rbx - 68]
  3160	shld	esi, edi, 16
  3161	mov	eax, dword ptr [rbx - 72]
  3162	shld	edi, eax, 12
  3163	mov	r11d, dword ptr [rbx - 80]
  3164	mov	edx, dword ptr [rbx - 76]
  3165	shld	eax, edx, 8
  3166	shld	edx, r11d, 4
  3167	vmovd	xmm2, r11d
  3168	vmovd	xmm3, esi
  3169	vpinsrd	xmm2, xmm2, edx, 1
  3170	vpinsrd	xmm3, xmm3, ecx, 1
  3171	vpinsrd	xmm2, xmm2, eax, 2
  3172	vpinsrd	xmm3, xmm3, r10d, 2
  3173	vpinsrd	xmm2, xmm2, edi, 3
  3174	vpinsrd	xmm3, xmm3, r9d, 3
  3175	vinserti128	ymm2, ymm2, xmm3, 1
  3176	vpsrlvd	ymm2, ymm2, ymm0
  3177	vpand	ymm2, ymm2, ymm1
  3178	vmovdqu	ymmword ptr [r15 - 64], ymm2
      # Group 3: input dwords at [rbx-52 .. rbx-28] -> outputs 16-23.
  3179	mov	r9d, dword ptr [rbx - 28]
  3180	mov	ecx, dword ptr [rbx - 32]
  3181	mov	r10d, r9d
  3182	shld	r10d, ecx, 24
  3183	mov	esi, dword ptr [rbx - 36]
  3184	shld	ecx, esi, 20
  3185	mov	edi, dword ptr [rbx - 40]
  3186	shld	esi, edi, 16
  3187	mov	eax, dword ptr [rbx - 44]
  3188	shld	edi, eax, 12
  3189	mov	r11d, dword ptr [rbx - 52]
  3190	mov	edx, dword ptr [rbx - 48]
  3191	shld	eax, edx, 8
  3192	shld	edx, r11d, 4
  3193	vmovd	xmm2, r11d
  3194	vmovd	xmm3, esi
  3195	vpinsrd	xmm2, xmm2, edx, 1
  3196	vpinsrd	xmm3, xmm3, ecx, 1
  3197	vpinsrd	xmm2, xmm2, eax, 2
  3198	vpinsrd	xmm3, xmm3, r10d, 2
  3199	vpinsrd	xmm2, xmm2, edi, 3
  3200	vpinsrd	xmm3, xmm3, r9d, 3
  3201	vinserti128	ymm2, ymm2, xmm3, 1
  3202	vpsrlvd	ymm2, ymm2, ymm0
  3203	vpand	ymm2, ymm2, ymm1
  3204	vmovdqu	ymmword ptr [r15 - 32], ymm2
      # Group 4: input dwords at [rbx-24 .. rbx] -> outputs 24-31.
  3205	mov	r9d, dword ptr [rbx]
  3206	mov	ecx, dword ptr [rbx - 4]
  3207	mov	r10d, r9d
  3208	shld	r10d, ecx, 24
  3209	mov	esi, dword ptr [rbx - 8]
  3210	shld	ecx, esi, 20
  3211	mov	edi, dword ptr [rbx - 12]
  3212	shld	esi, edi, 16
  3213	mov	eax, dword ptr [rbx - 16]
  3214	shld	edi, eax, 12
  3215	mov	r11d, dword ptr [rbx - 24]
  3216	mov	edx, dword ptr [rbx - 20]
  3217	shld	eax, edx, 8
  3218	shld	edx, r11d, 4
  3219	vmovd	xmm2, r11d
  3220	vmovd	xmm3, esi
  3221	vpinsrd	xmm2, xmm2, edx, 1
  3222	vpinsrd	xmm3, xmm3, ecx, 1
  3223	vpinsrd	xmm2, xmm2, eax, 2
  3224	vpinsrd	xmm3, xmm3, r10d, 2
  3225	vpinsrd	xmm2, xmm2, edi, 3
  3226	vpinsrd	xmm3, xmm3, r9d, 3
  3227	vinserti128	ymm2, ymm2, xmm3, 1
  3228	vpsrlvd	ymm2, ymm2, ymm0
  3229	vpand	ymm2, ymm2, ymm1
  3230	vmovdqu	ymmword ptr [r15], ymm2
      # Advance output by 128 bytes and input by 112 bytes; loop until r8 reaches 0.
  3231	sub	r15, -128
  3232	add	rbx, 112
  3233	add	r8, -1
  3234	jne	.LBB0_104
  3235	jmp	.LBB0_147
  3236.LBB0_141:
      # Case handler for 2-bit fields: every iteration reads 8 bytes (2 dwords)
      # at rbx + 8*rcx and stores 32 dwords (128 bytes) at r15, each masked with 3.
      # Skipped when edx < 32 (signed compare).
  3237	cmp	edx, 32
  3238	jl	.LBB0_147
  3239# %bb.142:
      # rax = trip count (from r14d); rcx counts completed iterations upward and
      # doubles as the input index. r15 is biased by +96 so the four stores use
      # displacements -96..0. ymm1 broadcasts 0x0000000300000003.
  3240	mov	eax, r14d
  3241	add	r15, 96
  3242	xor	ecx, ecx
  3243	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_131] # ymm0 = [0,2,4,6,8,10,12,14]
  3244	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_132] # ymm1 = [12884901891,12884901891,12884901891,12884901891]
  3245	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_133] # ymm2 = [16,18,20,22,24,26,28,30]
  3246	.p2align	4, 0x90
  3247.LBB0_143:                              # =>This Inner Loop Header: Depth=1
      # Each input dword is broadcast twice: once shifted by ymm0 (fields in bits
      # 0-15) and once by ymm2 (fields in bits 16-31), giving 16 outputs per dword.
  3248	vpbroadcastd	ymm3, dword ptr [rbx + 8*rcx]
  3249	vpsrlvd	ymm3, ymm3, ymm0
  3250	vpand	ymm3, ymm3, ymm1
  3251	vmovdqu	ymmword ptr [r15 - 96], ymm3
  3252	vpbroadcastd	ymm3, dword ptr [rbx + 8*rcx]
  3253	vpsrlvd	ymm3, ymm3, ymm2
  3254	vpand	ymm3, ymm3, ymm1
  3255	vmovdqu	ymmword ptr [r15 - 64], ymm3
  3256	vpbroadcastd	ymm3, dword ptr [rbx + 8*rcx + 4]
  3257	vpsrlvd	ymm3, ymm3, ymm0
  3258	vpand	ymm3, ymm3, ymm1
  3259	vmovdqu	ymmword ptr [r15 - 32], ymm3
  3260	vpbroadcastd	ymm3, dword ptr [rbx + 8*rcx + 4]
  3261	vpsrlvd	ymm3, ymm3, ymm2
  3262	vpand	ymm3, ymm3, ymm1
  3263	vmovdqu	ymmword ptr [r15], ymm3
      # Next 8 input bytes / 128 output bytes; stop once rcx equals the trip count.
  3264	add	rcx, 1
  3265	sub	r15, -128
  3266	cmp	rax, rcx
  3267	jne	.LBB0_143
  3268	jmp	.LBB0_147
  3269.LBB0_117:
      # Case handler for 18-bit fields: every iteration consumes 72 bytes
      # (18 dwords) at rbx and stores 32 dwords (128 bytes) at r15, each masked
      # with 0x3ffff. Skipped when edx < 32 (signed compare).
  3270	cmp	edx, 32
  3271	jl	.LBB0_147
  3272# %bb.118:
      # r8 = trip count (from r14d). Both pointers are pre-biased (r15 += 96,
      # rbx += 68) so the loop body addresses with non-positive displacements.
      # Eight 18-bit fields span 4.5 dwords, so groups alternate between a
      # dword-aligned layout (shift vector ymm0) and one that starts at bit 16 of
      # a dword (shift vector ymm2).
  3273	mov	r8d, r14d
  3274	add	r15, 96
  3275	add	rbx, 68
  3276	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_72] # ymm0 = [0,0,4,0,8,0,12,0]
  3277	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_73] # ymm1 = [1125895612137471,1125895612137471,1125895612137471,1125895612137471]
  3278	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_74] # ymm2 = [0,2,0,6,0,10,0,14]
  3279	.p2align	4, 0x90
  3280.LBB0_119:                              # =>This Inner Loop Header: Depth=1
      # Group 1 (aligned layout), dwords w0..w4 at [rbx-68 .. rbx-52] -> outputs 0-7:
      #   w0, shld(w1,w0,14), w1>>4, shld(w2,w1,10), w2>>8, shld(w3,w2,6),
      #   w3>>12, shld(w4,w3,2)
      # Note: vmovd xmm3, esi captures the raw w2 before esi is rewritten by shld.
  3281	mov	ecx, dword ptr [rbx - 56]
  3282	mov	r10d, dword ptr [rbx - 52]
  3283	shld	r10d, ecx, 2
  3284	mov	esi, dword ptr [rbx - 60]
  3285	mov	edi, ecx
  3286	shld	edi, esi, 6
  3287	mov	r9d, dword ptr [rbx - 68]
  3288	mov	edx, dword ptr [rbx - 64]
  3289	mov	eax, edx
  3290	shld	eax, r9d, 14
  3291	vmovd	xmm3, esi
  3292	shld	esi, edx, 10
  3293	vpinsrd	xmm3, xmm3, edi, 1
  3294	vpinsrd	xmm3, xmm3, ecx, 2
  3295	vpinsrd	xmm3, xmm3, r10d, 3
  3296	vmovd	xmm4, r9d
  3297	vpinsrd	xmm4, xmm4, eax, 1
  3298	vpinsrd	xmm4, xmm4, edx, 2
  3299	vpinsrd	xmm4, xmm4, esi, 3
  3300	vinserti128	ymm3, ymm4, xmm3, 1
  3301	vpsrlvd	ymm3, ymm3, ymm0
  3302	vpand	ymm3, ymm3, ymm1
  3303	vmovdqu	ymmword ptr [r15 - 96], ymm3
      # Group 2 (starts at bit 16), dwords w4..w8 at [rbx-52 .. rbx-36] -> outputs 8-15:
      #   shrd(w4,w5,16), w5>>2, shld(w6,w5,12), w6>>6, shld(w7,w6,8), w7>>10,
      #   shld(w8,w7,4), w8>>14
      # The first field is taken with shrd because it begins in the upper half
      # of w4 (which group 1 also read).
  3304	mov	r9d, dword ptr [rbx - 36]
  3305	mov	r11d, dword ptr [rbx - 40]
  3306	mov	edx, r9d
  3307	shld	edx, r11d, 4
  3308	mov	r10d, dword ptr [rbx - 44]
  3309	mov	edi, r11d
  3310	shld	edi, r10d, 8
  3311	mov	eax, dword ptr [rbx - 52]
  3312	mov	esi, dword ptr [rbx - 48]
  3313	mov	ecx, r10d
  3314	shld	ecx, esi, 12
  3315	shrd	eax, esi, 16
  3316	vmovd	xmm3, edi
  3317	vpinsrd	xmm3, xmm3, r11d, 1
  3318	vpinsrd	xmm3, xmm3, edx, 2
  3319	vpinsrd	xmm3, xmm3, r9d, 3
  3320	vmovd	xmm4, eax
  3321	vpinsrd	xmm4, xmm4, esi, 1
  3322	vpinsrd	xmm4, xmm4, ecx, 2
  3323	vpinsrd	xmm4, xmm4, r10d, 3
  3324	vinserti128	ymm3, ymm4, xmm3, 1
  3325	vpsrlvd	ymm3, ymm3, ymm2
  3326	vpand	ymm3, ymm3, ymm1
  3327	vmovdqu	ymmword ptr [r15 - 64], ymm3
      # Group 3 (aligned layout again), dwords at [rbx-32 .. rbx-16] -> outputs 16-23.
  3328	mov	eax, dword ptr [rbx - 20]
  3329	mov	r10d, dword ptr [rbx - 16]
  3330	shld	r10d, eax, 2
  3331	mov	edx, dword ptr [rbx - 24]
  3332	mov	esi, eax
  3333	shld	esi, edx, 6
  3334	mov	r9d, dword ptr [rbx - 32]
  3335	mov	ecx, dword ptr [rbx - 28]
  3336	mov	edi, ecx
  3337	shld	edi, r9d, 14
  3338	vmovd	xmm3, edx
  3339	shld	edx, ecx, 10
  3340	vpinsrd	xmm3, xmm3, esi, 1
  3341	vpinsrd	xmm3, xmm3, eax, 2
  3342	vpinsrd	xmm3, xmm3, r10d, 3
  3343	vmovd	xmm4, r9d
  3344	vpinsrd	xmm4, xmm4, edi, 1
  3345	vpinsrd	xmm4, xmm4, ecx, 2
  3346	vpinsrd	xmm4, xmm4, edx, 3
  3347	vinserti128	ymm3, ymm4, xmm3, 1
  3348	vpsrlvd	ymm3, ymm3, ymm0
  3349	vpand	ymm3, ymm3, ymm1
  3350	vmovdqu	ymmword ptr [r15 - 32], ymm3
      # Group 4 (starts at bit 16), dwords at [rbx-16 .. rbx] -> outputs 24-31.
  3351	mov	r9d, dword ptr [rbx]
  3352	mov	r11d, dword ptr [rbx - 4]
  3353	mov	edx, r9d
  3354	shld	edx, r11d, 4
  3355	mov	r10d, dword ptr [rbx - 8]
  3356	mov	edi, r11d
  3357	shld	edi, r10d, 8
  3358	mov	eax, dword ptr [rbx - 16]
  3359	mov	esi, dword ptr [rbx - 12]
  3360	mov	ecx, r10d
  3361	shld	ecx, esi, 12
  3362	shrd	eax, esi, 16
  3363	vmovd	xmm3, edi
  3364	vpinsrd	xmm3, xmm3, r11d, 1
  3365	vpinsrd	xmm3, xmm3, edx, 2
  3366	vpinsrd	xmm3, xmm3, r9d, 3
  3367	vmovd	xmm4, eax
  3368	vpinsrd	xmm4, xmm4, esi, 1
  3369	vpinsrd	xmm4, xmm4, ecx, 2
  3370	vpinsrd	xmm4, xmm4, r10d, 3
  3371	vinserti128	ymm3, ymm4, xmm3, 1
  3372	vpsrlvd	ymm3, ymm3, ymm2
  3373	vpand	ymm3, ymm3, ymm1
  3374	vmovdqu	ymmword ptr [r15], ymm3
      # Advance output by 128 bytes and input by 72 bytes; loop until r8 reaches 0.
  3375	sub	r15, -128
  3376	add	rbx, 72
  3377	add	r8, -1
  3378	jne	.LBB0_119
  3379	jmp	.LBB0_147
  3380.LBB0_129:
      # Case handler for 10-bit fields: every iteration consumes 40 bytes
      # (10 dwords) at rbx and stores 32 dwords (128 bytes) at r15, each masked
      # with 0x3ff. Skipped when edx < 32 (signed compare).
  3381	cmp	edx, 32
  3382	jl	.LBB0_147
  3383# %bb.130:
      # r8 = trip count (from r14d). Both pointers are pre-biased (r15 += 96,
      # rbx += 36) so the loop body addresses with non-positive displacements.
      # Eight 10-bit fields span 2.5 dwords, so groups alternate between a
      # dword-aligned layout (shift vector ymm0) and one that starts at bit 16 of
      # a dword (shift vector ymm2). ymm1 broadcasts 0x000003ff000003ff.
  3384	mov	r8d, r14d
  3385	add	r15, 96
  3386	add	rbx, 36
  3387	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_101] # ymm0 = [0,10,20,0,8,18,0,6]
  3388	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_102] # ymm1 = [4393751544831,4393751544831,4393751544831,4393751544831]
  3389	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_103] # ymm2 = [16,0,4,14,0,2,12,22]
  3390	.p2align	4, 0x90
  3391.LBB0_131:                              # =>This Inner Loop Header: Depth=1
      # Group 1 (aligned layout), dwords w0..w2 at [rbx-36 .. rbx-28] -> outputs 0-7:
      #   w0, w0>>10, w0>>20, shld(w1,w0,2), w1>>8, w1>>18, shld(w2,w1,4), w2>>6
      # Note: the raw w1 is copied into xmm3 lanes 0-1 before esi is rewritten by shld.
  3392	mov	ecx, dword ptr [rbx - 28]
  3393	mov	edx, dword ptr [rbx - 36]
  3394	mov	esi, dword ptr [rbx - 32]
  3395	mov	edi, ecx
  3396	shld	edi, esi, 4
  3397	vmovd	xmm3, esi
  3398	vpinsrd	xmm3, xmm3, esi, 1
  3399	shld	esi, edx, 2
  3400	vpinsrd	xmm3, xmm3, edi, 2
  3401	vpinsrd	xmm3, xmm3, ecx, 3
  3402	vmovd	xmm4, edx
  3403	vpinsrd	xmm4, xmm4, edx, 1
  3404	vpinsrd	xmm4, xmm4, edx, 2
  3405	vpinsrd	xmm4, xmm4, esi, 3
  3406	vinserti128	ymm3, ymm4, xmm3, 1
  3407	vpsrlvd	ymm3, ymm3, ymm0
  3408	vpand	ymm3, ymm3, ymm1
  3409	vmovdqu	ymmword ptr [r15 - 96], ymm3
      # Group 2 (starts at bit 16), dwords w2..w4 at [rbx-28 .. rbx-20] -> outputs 8-15:
      #   w2>>16, shld(w3,w2,6), w3>>4, w3>>14, shld(w4,w3,8), w4>>2, w4>>12, w4>>22
      # w2 at [rbx-28] is re-read here because group 1 only used its low 16 bits.
  3410	mov	ecx, dword ptr [rbx - 20]
  3411	mov	edx, dword ptr [rbx - 24]
  3412	mov	esi, ecx
  3413	shld	esi, edx, 8
  3414	mov	edi, dword ptr [rbx - 28]
  3415	mov	eax, edx
  3416	shld	eax, edi, 6
  3417	vmovd	xmm3, esi
  3418	vpinsrd	xmm3, xmm3, ecx, 1
  3419	vpinsrd	xmm3, xmm3, ecx, 2
  3420	vpinsrd	xmm3, xmm3, ecx, 3
  3421	vmovd	xmm4, edi
  3422	vpinsrd	xmm4, xmm4, eax, 1
  3423	vpinsrd	xmm4, xmm4, edx, 2
  3424	vpinsrd	xmm4, xmm4, edx, 3
  3425	vinserti128	ymm3, ymm4, xmm3, 1
  3426	vpsrlvd	ymm3, ymm3, ymm2
  3427	vpand	ymm3, ymm3, ymm1
  3428	vmovdqu	ymmword ptr [r15 - 64], ymm3
      # Group 3 (aligned layout again), dwords at [rbx-16 .. rbx-8] -> outputs 16-23.
  3429	mov	eax, dword ptr [rbx - 8]
  3430	mov	ecx, dword ptr [rbx - 16]
  3431	mov	edx, dword ptr [rbx - 12]
  3432	mov	esi, eax
  3433	shld	esi, edx, 4
  3434	vmovd	xmm3, edx
  3435	vpinsrd	xmm3, xmm3, edx, 1
  3436	shld	edx, ecx, 2
  3437	vpinsrd	xmm3, xmm3, esi, 2
  3438	vpinsrd	xmm3, xmm3, eax, 3
  3439	vmovd	xmm4, ecx
  3440	vpinsrd	xmm4, xmm4, ecx, 1
  3441	vpinsrd	xmm4, xmm4, ecx, 2
  3442	vpinsrd	xmm4, xmm4, edx, 3
  3443	vinserti128	ymm3, ymm4, xmm3, 1
  3444	vpsrlvd	ymm3, ymm3, ymm0
  3445	vpand	ymm3, ymm3, ymm1
  3446	vmovdqu	ymmword ptr [r15 - 32], ymm3
      # Group 4 (starts at bit 16), dwords at [rbx-8 .. rbx] -> outputs 24-31.
  3447	mov	eax, dword ptr [rbx]
  3448	mov	ecx, dword ptr [rbx - 8]
  3449	mov	edx, dword ptr [rbx - 4]
  3450	mov	esi, eax
  3451	shld	esi, edx, 8
  3452	mov	edi, edx
  3453	shld	edi, ecx, 6
  3454	vmovd	xmm3, esi
  3455	vpinsrd	xmm3, xmm3, eax, 1
  3456	vpinsrd	xmm3, xmm3, eax, 2
  3457	vpinsrd	xmm3, xmm3, eax, 3
  3458	vmovd	xmm4, ecx
  3459	vpinsrd	xmm4, xmm4, edi, 1
  3460	vpinsrd	xmm4, xmm4, edx, 2
  3461	vpinsrd	xmm4, xmm4, edx, 3
  3462	vinserti128	ymm3, ymm4, xmm3, 1
  3463	vpsrlvd	ymm3, ymm3, ymm2
  3464	vpand	ymm3, ymm3, ymm1
  3465	vmovdqu	ymmword ptr [r15], ymm3
      # Advance output by 128 bytes and input by 40 bytes; loop until r8 reaches 0.
  3466	sub	r15, -128
  3467	add	rbx, 40
  3468	add	r8, -1
  3469	jne	.LBB0_131
  3470	jmp	.LBB0_147
  3471.LBB0_105:
      # Unpack case that appears to handle 26-bit values: every output lane is
      # masked with 0x03ffffff (.LCPI0_34), and each loop pass reads 26 dwords
      # (104 bytes) through rbx while storing 32 dwords (128 bytes) through r15.
      # The whole case is skipped when edx < 32 (signed compare).
      # NOTE(review): edx presumably holds the requested value count - confirm
      # against the function entry, which is outside this excerpt.
  3472	cmp	edx, 32
  3473	jl	.LBB0_147
  3474# %bb.106:
      # Loop setup: r8 = pass counter (zero-extended r14d). rbx and r15 are
      # pre-biased by +100 / +96 so the body addresses the current input and
      # output windows with displacements [rbx-100 .. rbx] and [r15-96 .. r15].
      # ymm0 = lane mask; ymm2 / ymm4 = per-lane right-shift counts;
      # xmm1 / xmm3 = right / left shift counts used to splice adjacent dwords.
  3475	mov	r8d, r14d
  3476	add	r15, 96
  3477	add	rbx, 100
  3478	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_34] # ymm0 = [288230371923853311,288230371923853311,288230371923853311,288230371923853311]
  3479	vpbroadcastq	xmm1, qword ptr [rip + .LCPI0_35] # xmm1 = [42949672976,42949672976]
  3480	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_33] # ymm2 = [0,0,0,0,0,2,0,0]
  3481	vpbroadcastq	xmm3, qword ptr [rip + .LCPI0_36] # xmm3 = [94489280528,94489280528]
  3482	vmovdqa	ymm4, ymmword ptr [rip + .LCPI0_37] # ymm4 = [0,0,4,0,0,0,0,6]
  3483	.p2align	4, 0x90
  3484.LBB0_107:                              # =>This Inner Loop Header: Depth=1
      # Group 1 of 4 (stored at [r15 - 96]): input dwords [rbx-100 .. rbx-76].
      # shld dst, src, n leaves (dst << n) | (src >> (32 - n)) in dst, i.e. it
      # glues the top bits of the lower dword under the next dword so a value
      # that straddles the dword boundary ends up in a single register.
  3485	mov	ecx, dword ptr [rbx - 80]
  3486	mov	r9d, dword ptr [rbx - 76]
  3487	shld	r9d, ecx, 10
  3488	mov	r11d, dword ptr [rbx - 84]
  3489	shld	ecx, r11d, 4
  3490	mov	edi, dword ptr [rbx - 88]
  3491	mov	esi, r11d
  3492	shld	esi, edi, 24
  3493	mov	edx, dword ptr [rbx - 92]
  3494	shld	edi, edx, 18
  3495	mov	r10d, dword ptr [rbx - 100]
  3496	mov	eax, dword ptr [rbx - 96]
  3497	shld	edx, eax, 12
  3498	shld	eax, r10d, 6
      # Build the 8-lane vector: xmm5 = low four lanes, xmm6 = high four lanes.
  3499	vmovd	xmm5, r10d
  3500	vmovd	xmm6, esi
  3501	vpinsrd	xmm5, xmm5, eax, 1
  3502	vpinsrd	xmm6, xmm6, r11d, 1
  3503	vpinsrd	xmm5, xmm5, edx, 2
  3504	vpinsrd	xmm6, xmm6, ecx, 2
  3505	vpinsrd	xmm5, xmm5, edi, 3
  3506	vpinsrd	xmm6, xmm6, r9d, 3
  3507	vinserti128	ymm5, ymm5, xmm6, 1
      # Per-lane right shift (ymm2), then mask each lane and store 8 outputs.
  3508	vpsrlvd	ymm5, ymm5, ymm2
  3509	vpand	ymm5, ymm5, ymm0
  3510	vmovdqu	ymmword ptr [r15 - 96], ymm5
      # Group 2 of 4 (stored at [r15 - 64]): input dwords [rbx-76 .. rbx-52].
  3511	mov	r9d, dword ptr [rbx - 52]
  3512	mov	ecx, dword ptr [rbx - 56]
  3513	mov	edx, r9d
  3514	shld	edx, ecx, 20
  3515	mov	esi, dword ptr [rbx - 60]
  3516	shld	ecx, esi, 14
  3517	mov	edi, dword ptr [rbx - 68]
  3518	mov	eax, dword ptr [rbx - 64]
  3519	shld	esi, eax, 8
  3520	shld	eax, edi, 2
      # The first two lanes are spliced in vector registers instead of with
      # shld: load two dwords, shift them right by xmm1, shift the following
      # dwords left by xmm3, and OR the halves together.
  3521	vmovq	xmm5, qword ptr [rbx - 76]      # xmm5 = mem[0],zero
  3522	vpsrlvd	xmm6, xmm5, xmm1
  3523	vpshufd	xmm5, xmm5, 229                 # xmm5 = xmm5[1,1,2,3]
  3524	vpinsrd	xmm5, xmm5, edi, 1
  3525	vpsllvd	xmm5, xmm5, xmm3
  3526	vpor	xmm5, xmm6, xmm5
  3527	vmovd	xmm6, esi
  3528	vpinsrd	xmm6, xmm6, ecx, 1
  3529	vpinsrd	xmm6, xmm6, edx, 2
  3530	vpinsrd	xmm6, xmm6, r9d, 3
  3531	vpinsrd	xmm5, xmm5, edi, 2
  3532	vpinsrd	xmm5, xmm5, eax, 3
  3533	vinserti128	ymm5, ymm5, xmm6, 1
  3534	vpsrlvd	ymm5, ymm5, ymm4
  3535	vpand	ymm5, ymm5, ymm0
  3536	vmovdqu	ymmword ptr [r15 - 64], ymm5
      # Group 3 of 4 (stored at [r15 - 32]): same shift pattern as group 1,
      # applied to input dwords [rbx-48 .. rbx-24].
  3537	mov	eax, dword ptr [rbx - 28]
  3538	mov	r9d, dword ptr [rbx - 24]
  3539	shld	r9d, eax, 10
  3540	mov	r11d, dword ptr [rbx - 32]
  3541	shld	eax, r11d, 4
  3542	mov	esi, dword ptr [rbx - 36]
  3543	mov	edi, r11d
  3544	shld	edi, esi, 24
  3545	mov	ecx, dword ptr [rbx - 40]
  3546	shld	esi, ecx, 18
  3547	mov	r10d, dword ptr [rbx - 48]
  3548	mov	edx, dword ptr [rbx - 44]
  3549	shld	ecx, edx, 12
  3550	shld	edx, r10d, 6
  3551	vmovd	xmm5, r10d
  3552	vmovd	xmm6, edi
  3553	vpinsrd	xmm5, xmm5, edx, 1
  3554	vpinsrd	xmm6, xmm6, r11d, 1
  3555	vpinsrd	xmm5, xmm5, ecx, 2
  3556	vpinsrd	xmm6, xmm6, eax, 2
  3557	vpinsrd	xmm5, xmm5, esi, 3
  3558	vpinsrd	xmm6, xmm6, r9d, 3
  3559	vinserti128	ymm5, ymm5, xmm6, 1
  3560	vpsrlvd	ymm5, ymm5, ymm2
  3561	vpand	ymm5, ymm5, ymm0
  3562	vmovdqu	ymmword ptr [r15 - 32], ymm5
      # Group 4 of 4 (stored at [r15]): same shift pattern as group 2,
      # applied to input dwords [rbx-24 .. rbx].
  3563	mov	r9d, dword ptr [rbx]
  3564	mov	ecx, dword ptr [rbx - 4]
  3565	mov	edx, r9d
  3566	shld	edx, ecx, 20
  3567	mov	esi, dword ptr [rbx - 8]
  3568	shld	ecx, esi, 14
  3569	mov	edi, dword ptr [rbx - 16]
  3570	mov	eax, dword ptr [rbx - 12]
  3571	shld	esi, eax, 8
  3572	shld	eax, edi, 2
  3573	vmovq	xmm5, qword ptr [rbx - 24]      # xmm5 = mem[0],zero
  3574	vpsrlvd	xmm6, xmm5, xmm1
  3575	vpshufd	xmm5, xmm5, 229                 # xmm5 = xmm5[1,1,2,3]
  3576	vpinsrd	xmm5, xmm5, edi, 1
  3577	vpsllvd	xmm5, xmm5, xmm3
  3578	vpor	xmm5, xmm6, xmm5
  3579	vmovd	xmm6, esi
  3580	vpinsrd	xmm6, xmm6, ecx, 1
  3581	vpinsrd	xmm6, xmm6, edx, 2
  3582	vpinsrd	xmm6, xmm6, r9d, 3
  3583	vpinsrd	xmm5, xmm5, edi, 2
  3584	vpinsrd	xmm5, xmm5, eax, 3
  3585	vinserti128	ymm5, ymm5, xmm6, 1
  3586	vpsrlvd	ymm5, ymm5, ymm4
  3587	vpand	ymm5, ymm5, ymm0
  3588	vmovdqu	ymmword ptr [r15], ymm5
      # Advance: output += 128 bytes (sub of -128), input += 104 bytes, then
      # count r8 down; loop until it reaches zero and leave via the common exit.
  3589	sub	r15, -128
  3590	add	rbx, 104
  3591	add	r8, -1
  3592	jne	.LBB0_107
  3593	jmp	.LBB0_147
  3594.LBB0_135:
      # Unpack case that appears to handle 6-bit values: every output lane is
      # masked with 0x3f (.LCPI0_117 = 0x0000003f0000003f), and each loop pass
      # reads 6 dwords (24 bytes) through rbx while storing 32 dwords
      # (128 bytes) through r15. Skipped when edx < 32 (signed compare).
      # NOTE(review): edx presumably holds the requested value count - confirm
      # against the function entry, which is outside this excerpt.
  3595	cmp	edx, 32
  3596	jl	.LBB0_147
  3597# %bb.136:
      # Loop setup: rax = pass counter (zero-extended r14d). rbx and r15 are
      # pre-biased by +20 / +96 so the body uses displacements
      # [rbx-20 .. rbx] and [r15-96 .. r15].
      # ymm0 / ymm2 = per-lane right-shift counts, ymm1 = lane mask.
  3598	mov	eax, r14d
  3599	add	r15, 96
  3600	add	rbx, 20
  3601	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_116] # ymm0 = [0,6,12,18,24,0,4,10]
  3602	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_117] # ymm1 = [270582939711,270582939711,270582939711,270582939711]
  3603	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_118] # ymm2 = [16,22,0,2,8,14,20,26]
  3604	.p2align	4, 0x90
  3605.LBB0_137:                              # =>This Inner Loop Header: Depth=1
      # Group 1 of 4 (stored at [r15 - 96]): dwords at [rbx-20] and [rbx-16].
      # Low four lanes are a broadcast of the first dword; the shld result
      # ((second << 2) | (first >> 30)) supplies the one lane whose value
      # straddles the boundary between the two dwords.
  3606	mov	ecx, dword ptr [rbx - 20]
  3607	mov	edx, dword ptr [rbx - 16]
  3608	mov	esi, edx
  3609	shld	esi, ecx, 2
  3610	vmovd	xmm3, ecx
  3611	vpbroadcastd	xmm4, xmm3
  3612	vpinsrd	xmm3, xmm3, esi, 1
  3613	vpinsrd	xmm3, xmm3, edx, 2
  3614	vpinsrd	xmm3, xmm3, edx, 3
  3615	vinserti128	ymm3, ymm4, xmm3, 1
      # Per-lane right shift, mask, store 8 outputs.
  3616	vpsrlvd	ymm3, ymm3, ymm0
  3617	vpand	ymm3, ymm3, ymm1
  3618	vmovdqu	ymmword ptr [r15 - 96], ymm3
      # Group 2 of 4 (stored at [r15 - 64]): dwords at [rbx-16] and [rbx-12];
      # the straddling lane is built with shld by 4 and sits in lane 2.
  3619	mov	ecx, dword ptr [rbx - 16]
  3620	mov	edx, dword ptr [rbx - 12]
  3621	mov	esi, edx
  3622	shld	esi, ecx, 4
  3623	vmovd	xmm3, ecx
  3624	vpinsrd	xmm3, xmm3, ecx, 1
  3625	vpinsrd	xmm3, xmm3, esi, 2
  3626	vpinsrd	xmm3, xmm3, edx, 3
  3627	vmovd	xmm4, edx
  3628	vpbroadcastd	xmm4, xmm4
  3629	vinserti128	ymm3, ymm3, xmm4, 1
  3630	vpsrlvd	ymm3, ymm3, ymm2
  3631	vpand	ymm3, ymm3, ymm1
  3632	vmovdqu	ymmword ptr [r15 - 64], ymm3
      # Group 3 of 4 (stored at [r15 - 32]): same lane layout and shift vector
      # as group 1, applied to the dwords at [rbx-8] and [rbx-4].
  3633	mov	ecx, dword ptr [rbx - 8]
  3634	mov	edx, dword ptr [rbx - 4]
  3635	mov	esi, edx
  3636	shld	esi, ecx, 2
  3637	vmovd	xmm3, ecx
  3638	vpinsrd	xmm4, xmm3, esi, 1
  3639	vpinsrd	xmm4, xmm4, edx, 2
  3640	vpbroadcastd	xmm3, xmm3
  3641	vpinsrd	xmm4, xmm4, edx, 3
  3642	vinserti128	ymm3, ymm3, xmm4, 1
  3643	vpsrlvd	ymm3, ymm3, ymm0
  3644	vpand	ymm3, ymm3, ymm1
  3645	vmovdqu	ymmword ptr [r15 - 32], ymm3
      # Group 4 of 4 (stored at [r15]): same lane layout and shift vector as
      # group 2, applied to the dwords at [rbx-4] and [rbx].
  3646	mov	ecx, dword ptr [rbx - 4]
  3647	mov	edx, dword ptr [rbx]
  3648	mov	esi, edx
  3649	shld	esi, ecx, 4
  3650	vmovd	xmm3, ecx
  3651	vpinsrd	xmm3, xmm3, ecx, 1
  3652	vpinsrd	xmm3, xmm3, esi, 2
  3653	vpinsrd	xmm3, xmm3, edx, 3
  3654	vmovd	xmm4, edx
  3655	vpbroadcastd	xmm4, xmm4
  3656	vinserti128	ymm3, ymm3, xmm4, 1
  3657	vpsrlvd	ymm3, ymm3, ymm2
  3658	vpand	ymm3, ymm3, ymm1
  3659	vmovdqu	ymmword ptr [r15], ymm3
      # Advance: output += 128 bytes (sub of -128), input += 24 bytes, then
      # count rax down; loop until it reaches zero and leave via the common exit.
  3660	sub	r15, -128
  3661	add	rbx, 24
  3662	add	rax, -1
  3663	jne	.LBB0_137
  3664	jmp	.LBB0_147
  3665.LBB0_111:
      # Unpack case that appears to handle 22-bit values: every output lane is
      # masked with 0x003fffff (.LCPI0_56), and each loop pass reads 22 dwords
      # (88 bytes) through rbx while storing 32 dwords (128 bytes) through r15.
      # Skipped when edx < 32 (signed compare).
      # NOTE(review): edx presumably holds the requested value count - confirm
      # against the function entry, which is outside this excerpt.
  3666	cmp	edx, 32
  3667	jl	.LBB0_147
  3668# %bb.112:
      # Loop setup: r8 = pass counter (zero-extended r14d). rbx and r15 are
      # pre-biased by +84 / +96 so the body uses displacements
      # [rbx-84 .. rbx] and [r15-96 .. r15].
      # ymm0 / ymm2 = per-lane right-shift counts, ymm1 = lane mask.
  3669	mov	r8d, r14d
  3670	add	r15, 96
  3671	add	rbx, 84
  3672	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_55] # ymm0 = [0,0,0,2,0,0,4,0]
  3673	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_56] # ymm1 = [18014394218708991,18014394218708991,18014394218708991,18014394218708991]
  3674	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_57] # ymm2 = [0,6,0,0,8,0,0,10]
  3675	.p2align	4, 0x90
  3676.LBB0_113:                              # =>This Inner Loop Header: Depth=1
      # Group 1 of 4 (stored at [r15 - 96]): input dwords [rbx-84 .. rbx-64].
      # shld dst, src, n leaves (dst << n) | (src >> (32 - n)) in dst, joining
      # a value that straddles two adjacent input dwords into one register.
  3677	mov	r10d, dword ptr [rbx - 68]
  3678	mov	r9d, dword ptr [rbx - 64]
  3679	shld	r9d, r10d, 6
  3680	mov	esi, dword ptr [rbx - 72]
  3681	mov	edi, r10d
  3682	shld	edi, esi, 18
  3683	mov	edx, dword ptr [rbx - 76]
  3684	shld	esi, edx, 8
  3685	mov	r11d, dword ptr [rbx - 84]
  3686	mov	ecx, dword ptr [rbx - 80]
  3687	mov	eax, edx
  3688	shld	eax, ecx, 20
  3689	shld	ecx, r11d, 10
      # Build the 8-lane vector: xmm3 = low four lanes, xmm4 = high four lanes.
  3690	vmovd	xmm3, r11d
  3691	vmovd	xmm4, esi
  3692	vpinsrd	xmm3, xmm3, ecx, 1
  3693	vpinsrd	xmm4, xmm4, edi, 1
  3694	vpinsrd	xmm3, xmm3, eax, 2
  3695	vpinsrd	xmm4, xmm4, r10d, 2
  3696	vpinsrd	xmm3, xmm3, edx, 3
  3697	vpinsrd	xmm4, xmm4, r9d, 3
  3698	vinserti128	ymm3, ymm3, xmm4, 1
      # Per-lane right shift (ymm0), mask, store 8 outputs.
  3699	vpsrlvd	ymm3, ymm3, ymm0
  3700	vpand	ymm3, ymm3, ymm1
  3701	vmovdqu	ymmword ptr [r15 - 96], ymm3
      # Group 2 of 4 (stored at [r15 - 64]): input dwords [rbx-64 .. rbx-44].
      # Here xmm4 holds the low lanes and xmm3 the high lanes (see the operand
      # order of vinserti128 below).
  3702	mov	r9d, dword ptr [rbx - 44]
  3703	mov	ecx, dword ptr [rbx - 48]
  3704	mov	r10d, r9d
  3705	shld	r10d, ecx, 12
  3706	mov	esi, dword ptr [rbx - 52]
  3707	shld	ecx, esi, 2
  3708	mov	edi, dword ptr [rbx - 56]
      # esi is captured into xmm3 before the following shld overwrites it.
  3709	vmovd	xmm3, esi
  3710	shld	esi, edi, 14
  3711	mov	eax, dword ptr [rbx - 64]
  3712	mov	edx, dword ptr [rbx - 60]
  3713	shld	edi, edx, 4
      # shrd: eax = (eax >> 16) | (edx << 16), the right-shifting counterpart
      # of shld for a value starting in the upper half of the first dword.
  3714	shrd	eax, edx, 16
  3715	vpinsrd	xmm3, xmm3, ecx, 1
  3716	vmovd	xmm4, eax
  3717	vpinsrd	xmm3, xmm3, r10d, 2
  3718	vpinsrd	xmm4, xmm4, edx, 1
  3719	vpinsrd	xmm3, xmm3, r9d, 3
  3720	vpinsrd	xmm4, xmm4, edi, 2
  3721	vpinsrd	xmm4, xmm4, esi, 3
  3722	vinserti128	ymm3, ymm4, xmm3, 1
  3723	vpsrlvd	ymm3, ymm3, ymm2
  3724	vpand	ymm3, ymm3, ymm1
  3725	vmovdqu	ymmword ptr [r15 - 64], ymm3
      # Group 3 of 4 (stored at [r15 - 32]): same shift pattern as group 1,
      # applied to input dwords [rbx-40 .. rbx-20].
  3726	mov	r10d, dword ptr [rbx - 24]
  3727	mov	r9d, dword ptr [rbx - 20]
  3728	shld	r9d, r10d, 6
  3729	mov	edx, dword ptr [rbx - 28]
  3730	mov	esi, r10d
  3731	shld	esi, edx, 18
  3732	mov	ecx, dword ptr [rbx - 32]
  3733	shld	edx, ecx, 8
  3734	mov	r11d, dword ptr [rbx - 40]
  3735	mov	eax, dword ptr [rbx - 36]
  3736	mov	edi, ecx
  3737	shld	edi, eax, 20
  3738	shld	eax, r11d, 10
  3739	vmovd	xmm3, r11d
  3740	vmovd	xmm4, edx
  3741	vpinsrd	xmm3, xmm3, eax, 1
  3742	vpinsrd	xmm4, xmm4, esi, 1
  3743	vpinsrd	xmm3, xmm3, edi, 2
  3744	vpinsrd	xmm4, xmm4, r10d, 2
  3745	vpinsrd	xmm3, xmm3, ecx, 3
  3746	vpinsrd	xmm4, xmm4, r9d, 3
  3747	vinserti128	ymm3, ymm3, xmm4, 1
  3748	vpsrlvd	ymm3, ymm3, ymm0
  3749	vpand	ymm3, ymm3, ymm1
  3750	vmovdqu	ymmword ptr [r15 - 32], ymm3
      # Group 4 of 4 (stored at [r15]): same shift pattern as group 2,
      # applied to input dwords [rbx-20 .. rbx].
  3751	mov	r9d, dword ptr [rbx]
  3752	mov	ecx, dword ptr [rbx - 4]
  3753	mov	r10d, r9d
  3754	shld	r10d, ecx, 12
  3755	mov	esi, dword ptr [rbx - 8]
  3756	shld	ecx, esi, 2
  3757	mov	edi, dword ptr [rbx - 12]
  3758	vmovd	xmm3, esi
  3759	shld	esi, edi, 14
  3760	mov	eax, dword ptr [rbx - 20]
  3761	mov	edx, dword ptr [rbx - 16]
  3762	shld	edi, edx, 4
  3763	shrd	eax, edx, 16
  3764	vpinsrd	xmm3, xmm3, ecx, 1
  3765	vmovd	xmm4, eax
  3766	vpinsrd	xmm3, xmm3, r10d, 2
  3767	vpinsrd	xmm4, xmm4, edx, 1
  3768	vpinsrd	xmm3, xmm3, r9d, 3
  3769	vpinsrd	xmm4, xmm4, edi, 2
  3770	vpinsrd	xmm4, xmm4, esi, 3
  3771	vinserti128	ymm3, ymm4, xmm3, 1
  3772	vpsrlvd	ymm3, ymm3, ymm2
  3773	vpand	ymm3, ymm3, ymm1
  3774	vmovdqu	ymmword ptr [r15], ymm3
      # Advance: output += 128 bytes (sub of -128), input += 88 bytes, then
      # count r8 down; loop until it reaches zero and leave via the common exit.
  3775	sub	r15, -128
  3776	add	rbx, 88
  3777	add	r8, -1
  3778	jne	.LBB0_113
  3779	jmp	.LBB0_147
  3780.LBB0_123:
      # Unpack case that appears to handle 14-bit values: every output lane is
      # masked with 0x3fff (.LCPI0_87), and each loop pass reads 14 dwords
      # (56 bytes) through rbx while storing 32 dwords (128 bytes) through r15.
      # Skipped when edx < 32 (signed compare).
      # NOTE(review): edx presumably holds the requested value count - confirm
      # against the function entry, which is outside this excerpt.
  3781	cmp	edx, 32
  3782	jl	.LBB0_147
  3783# %bb.124:
      # Loop setup: r8 = pass counter (zero-extended r14d). rbx and r15 are
      # pre-biased by +52 / +96 so the body uses displacements
      # [rbx-52 .. rbx] and [r15-96 .. r15].
      # ymm0 / ymm2 = per-lane right-shift counts, ymm1 = lane mask.
  3784	mov	r8d, r14d
  3785	add	r15, 96
  3786	add	rbx, 52
  3787	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_86] # ymm0 = [0,14,0,10,0,6,0,2]
  3788	vpbroadcastq	ymm1, qword ptr [rip + .LCPI0_87] # ymm1 = [70364449226751,70364449226751,70364449226751,70364449226751]
  3789	vmovdqa	ymm2, ymmword ptr [rip + .LCPI0_88] # ymm2 = [16,0,12,0,8,0,4,18]
  3790	.p2align	4, 0x90
  3791.LBB0_125:                              # =>This Inner Loop Header: Depth=1
      # Group 1 of 4 (stored at [r15 - 96]): input dwords [rbx-52 .. rbx-40].
      # shld dst, src, n leaves (dst << n) | (src >> (32 - n)) in dst, joining
      # a value that straddles two adjacent input dwords into one register.
  3792	mov	r9d, dword ptr [rbx - 40]
  3793	mov	ecx, dword ptr [rbx - 44]
  3794	mov	esi, r9d
  3795	shld	esi, ecx, 12
  3796	mov	edi, dword ptr [rbx - 52]
  3797	mov	r10d, dword ptr [rbx - 48]
  3798	mov	edx, ecx
  3799	shld	edx, r10d, 8
  3800	mov	eax, r10d
  3801	shld	eax, edi, 4
      # xmm4 = low four lanes, xmm3 = high four lanes (see vinserti128 below).
  3802	vmovd	xmm3, edx
  3803	vpinsrd	xmm3, xmm3, ecx, 1
  3804	vpinsrd	xmm3, xmm3, esi, 2
  3805	vpinsrd	xmm3, xmm3, r9d, 3
  3806	vmovd	xmm4, edi
  3807	vpinsrd	xmm4, xmm4, edi, 1
  3808	vpinsrd	xmm4, xmm4, eax, 2
  3809	vpinsrd	xmm4, xmm4, r10d, 3
  3810	vinserti128	ymm3, ymm4, xmm3, 1
      # Per-lane right shift (ymm0), mask, store 8 outputs.
  3811	vpsrlvd	ymm3, ymm3, ymm0
  3812	vpand	ymm3, ymm3, ymm1
  3813	vmovdqu	ymmword ptr [r15 - 96], ymm3
      # Group 2 of 4 (stored at [r15 - 64]): input dwords [rbx-40 .. rbx-28].
  3814	mov	eax, dword ptr [rbx - 28]
  3815	mov	ecx, dword ptr [rbx - 32]
  3816	mov	edx, eax
  3817	shld	edx, ecx, 10
  3818	mov	r9d, dword ptr [rbx - 40]
  3819	mov	esi, dword ptr [rbx - 36]
      # ecx is captured into xmm3 before the following shld overwrites it.
  3820	vmovd	xmm3, ecx
  3821	shld	ecx, esi, 6
  3822	mov	edi, esi
  3823	shld	edi, r9d, 2
  3824	vmovd	xmm4, r9d
  3825	vpinsrd	xmm4, xmm4, edi, 1
  3826	vpinsrd	xmm4, xmm4, esi, 2
  3827	vpinsrd	xmm4, xmm4, ecx, 3
  3828	vpinsrd	xmm3, xmm3, edx, 1
  3829	vpinsrd	xmm3, xmm3, eax, 2
  3830	vpinsrd	xmm3, xmm3, eax, 3
  3831	vinserti128	ymm3, ymm4, xmm3, 1
  3832	vpsrlvd	ymm3, ymm3, ymm2
  3833	vpand	ymm3, ymm3, ymm1
  3834	vmovdqu	ymmword ptr [r15 - 64], ymm3
      # Group 3 of 4 (stored at [r15 - 32]): same shift pattern as group 1,
      # applied to input dwords [rbx-24 .. rbx-12].
  3835	mov	r9d, dword ptr [rbx - 12]
  3836	mov	eax, dword ptr [rbx - 16]
  3837	mov	edx, r9d
  3838	shld	edx, eax, 12
  3839	mov	esi, dword ptr [rbx - 24]
  3840	mov	r10d, dword ptr [rbx - 20]
  3841	mov	ecx, eax
  3842	shld	ecx, r10d, 8
  3843	mov	edi, r10d
  3844	shld	edi, esi, 4
  3845	vmovd	xmm3, ecx
  3846	vpinsrd	xmm3, xmm3, eax, 1
  3847	vpinsrd	xmm3, xmm3, edx, 2
  3848	vpinsrd	xmm3, xmm3, r9d, 3
  3849	vmovd	xmm4, esi
  3850	vpinsrd	xmm4, xmm4, esi, 1
  3851	vpinsrd	xmm4, xmm4, edi, 2
  3852	vpinsrd	xmm4, xmm4, r10d, 3
  3853	vinserti128	ymm3, ymm4, xmm3, 1
  3854	vpsrlvd	ymm3, ymm3, ymm0
  3855	vpand	ymm3, ymm3, ymm1
  3856	vmovdqu	ymmword ptr [r15 - 32], ymm3
      # Group 4 of 4 (stored at [r15]): same shift pattern as group 2,
      # applied to input dwords [rbx-12 .. rbx].
  3857	mov	r9d, dword ptr [rbx]
  3858	mov	ecx, dword ptr [rbx - 4]
  3859	mov	edx, r9d
  3860	shld	edx, ecx, 10
  3861	mov	eax, dword ptr [rbx - 8]
  3862	vmovd	xmm3, ecx
  3863	shld	ecx, eax, 6
  3864	mov	edi, dword ptr [rbx - 12]
  3865	mov	esi, eax
  3866	shld	esi, edi, 2
  3867	vmovd	xmm4, edi
  3868	vpinsrd	xmm4, xmm4, esi, 1
  3869	vpinsrd	xmm4, xmm4, eax, 2
  3870	vpinsrd	xmm4, xmm4, ecx, 3
  3871	vpinsrd	xmm3, xmm3, edx, 1
  3872	vpinsrd	xmm3, xmm3, r9d, 2
  3873	vpinsrd	xmm3, xmm3, r9d, 3
  3874	vinserti128	ymm3, ymm4, xmm3, 1
  3875	vpsrlvd	ymm3, ymm3, ymm2
  3876	vpand	ymm3, ymm3, ymm1
  3877	vmovdqu	ymmword ptr [r15], ymm3
      # Advance: output += 128 bytes (sub of -128), input += 56 bytes, then
      # count r8 down; loop until it reaches zero and leave via the common exit.
  3878	sub	r15, -128
  3879	add	rbx, 56
  3880	add	r8, -1
  3881	jne	.LBB0_125
  3882	jmp	.LBB0_147
  3883.LBB0_99:
      # Unpack case that appears to handle 30-bit values: every output lane is
      # masked with 0x3fffffff (.LCPI0_8), and each loop pass reads 30 dwords
      # (120 bytes) through rbx while storing 32 dwords (128 bytes) through r15.
      # Skipped when edx < 32 (signed compare).
      # NOTE(review): edx presumably holds the requested value count - confirm
      # against the function entry, which is outside this excerpt.
  3884	cmp	edx, 32
  3885	jl	.LBB0_147
  3886# %bb.100:
      # Loop setup: r8 = pass counter (zero-extended r14d). rbx and r15 are
      # pre-biased by +116 / +96 so the body uses displacements
      # [rbx-116 .. rbx] and [r15-96 .. r15].
      # ymm0 = lane mask; xmm1 / xmm2 = right / left shift counts used to
      # splice adjacent dwords in vector registers; ymm3 = per-lane right-shift
      # counts for groups 2 and 4.
  3887	mov	r8d, r14d
  3888	add	r15, 96
  3889	vpbroadcastq	ymm0, qword ptr [rip + .LCPI0_8] # ymm0 = [4611686015206162431,4611686015206162431,4611686015206162431,4611686015206162431]
  3890	add	rbx, 116
  3891	vmovdqa	xmm1, xmmword ptr [rip + .LCPI0_9] # xmm1 = [16,14,12,10]
  3892	vmovdqa	xmm2, xmmword ptr [rip + .LCPI0_10] # xmm2 = [16,18,20,22]
  3893	vmovdqa	ymm3, ymmword ptr [rip + .LCPI0_11] # ymm3 = [0,0,0,0,0,0,0,2]
  3894	.p2align	4, 0x90
  3895.LBB0_101:                              # =>This Inner Loop Header: Depth=1
      # Group 1 of 4 (stored at [r15 - 96]): input dwords [rbx-116 .. rbx-88].
      # shld dst, src, n leaves (dst << n) | (src >> (32 - n)) in dst, joining
      # a value that straddles two adjacent input dwords into one register.
  3896	mov	r11d, dword ptr [rbx - 92]
  3897	mov	r9d, dword ptr [rbx - 88]
  3898	shld	r9d, r11d, 14
  3899	mov	esi, dword ptr [rbx - 96]
  3900	shld	r11d, esi, 12
  3901	mov	edi, dword ptr [rbx - 100]
  3902	shld	esi, edi, 10
  3903	mov	eax, dword ptr [rbx - 104]
  3904	shld	edi, eax, 8
  3905	mov	edx, dword ptr [rbx - 108]
  3906	shld	eax, edx, 6
  3907	mov	r10d, dword ptr [rbx - 116]
  3908	mov	ecx, dword ptr [rbx - 112]
  3909	shld	edx, ecx, 4
  3910	shld	ecx, r10d, 2
      # Build the 8-lane vector: xmm4 = low four lanes, xmm5 = high four lanes.
  3911	vmovd	xmm4, r10d
  3912	vmovd	xmm5, edi
  3913	vpinsrd	xmm4, xmm4, ecx, 1
  3914	vpinsrd	xmm5, xmm5, esi, 1
  3915	vpinsrd	xmm4, xmm4, edx, 2
  3916	vpinsrd	xmm5, xmm5, r11d, 2
  3917	vpinsrd	xmm4, xmm4, eax, 3
  3918	vpinsrd	xmm5, xmm5, r9d, 3
  3919	vinserti128	ymm4, ymm4, xmm5, 1
      # No variable shift in this group: the lanes are only masked and stored.
  3920	vpand	ymm4, ymm4, ymm0
  3921	vmovdqu	ymmword ptr [r15 - 96], ymm4
      # Group 2 of 4 (stored at [r15 - 64]): input dwords [rbx-88 .. rbx-60].
  3922	mov	eax, dword ptr [rbx - 60]
  3923	mov	ecx, dword ptr [rbx - 64]
  3924	mov	edx, eax
  3925	shld	edx, ecx, 28
  3926	mov	esi, dword ptr [rbx - 68]
  3927	mov	edi, dword ptr [rbx - 72]
  3928	shld	ecx, esi, 26
  3929	shld	esi, edi, 24
      # The low four lanes are spliced in vector registers instead of with
      # shld: load four dwords, shift them right by xmm1, shift the four
      # following dwords left by xmm2, and OR the halves together.
  3930	vmovdqu	xmm4, xmmword ptr [rbx - 88]
  3931	vpsrlvd	xmm5, xmm4, xmm1
  3932	vpshufd	xmm4, xmm4, 249                 # xmm4 = xmm4[1,2,3,3]
  3933	vpinsrd	xmm4, xmm4, edi, 3
  3934	vmovd	xmm6, esi
  3935	vpinsrd	xmm6, xmm6, ecx, 1
  3936	vpinsrd	xmm6, xmm6, edx, 2
  3937	vpsllvd	xmm4, xmm4, xmm2
  3938	vpinsrd	xmm6, xmm6, eax, 3
  3939	vpor	xmm4, xmm5, xmm4
  3940	vinserti128	ymm4, ymm4, xmm6, 1
      # Per-lane right shift (ymm3: only the last lane moves, by 2), mask, store.
  3941	vpsrlvd	ymm4, ymm4, ymm3
  3942	vpand	ymm4, ymm4, ymm0
  3943	vmovdqu	ymmword ptr [r15 - 64], ymm4
      # Group 3 of 4 (stored at [r15 - 32]): same shift pattern as group 1,
      # applied to input dwords [rbx-56 .. rbx-28].
  3944	mov	r11d, dword ptr [rbx - 32]
  3945	mov	r9d, dword ptr [rbx - 28]
  3946	shld	r9d, r11d, 14
  3947	mov	edx, dword ptr [rbx - 36]
  3948	shld	r11d, edx, 12
  3949	mov	esi, dword ptr [rbx - 40]
  3950	shld	edx, esi, 10
  3951	mov	edi, dword ptr [rbx - 44]
  3952	shld	esi, edi, 8
  3953	mov	ecx, dword ptr [rbx - 48]
  3954	shld	edi, ecx, 6
  3955	mov	r10d, dword ptr [rbx - 56]
  3956	mov	eax, dword ptr [rbx - 52]
  3957	shld	ecx, eax, 4
  3958	shld	eax, r10d, 2
  3959	vmovd	xmm4, r10d
  3960	vmovd	xmm5, esi
  3961	vpinsrd	xmm4, xmm4, eax, 1
  3962	vpinsrd	xmm5, xmm5, edx, 1
  3963	vpinsrd	xmm4, xmm4, ecx, 2
  3964	vpinsrd	xmm5, xmm5, r11d, 2
  3965	vpinsrd	xmm4, xmm4, edi, 3
  3966	vpinsrd	xmm5, xmm5, r9d, 3
  3967	vinserti128	ymm4, ymm4, xmm5, 1
  3968	vpand	ymm4, ymm4, ymm0
  3969	vmovdqu	ymmword ptr [r15 - 32], ymm4
      # Group 4 of 4 (stored at [r15]): same shift pattern as group 2,
      # applied to input dwords [rbx-28 .. rbx].
  3970	mov	eax, dword ptr [rbx]
  3971	mov	ecx, dword ptr [rbx - 4]
  3972	mov	edx, eax
  3973	shld	edx, ecx, 28
  3974	mov	esi, dword ptr [rbx - 8]
  3975	shld	ecx, esi, 26
  3976	mov	edi, dword ptr [rbx - 12]
  3977	vmovdqu	xmm4, xmmword ptr [rbx - 28]
  3978	shld	esi, edi, 24
  3979	vpsrlvd	xmm5, xmm4, xmm1
  3980	vpshufd	xmm4, xmm4, 249                 # xmm4 = xmm4[1,2,3,3]
  3981	vpinsrd	xmm4, xmm4, edi, 3
  3982	vmovd	xmm6, esi
  3983	vpinsrd	xmm6, xmm6, ecx, 1
  3984	vpsllvd	xmm4, xmm4, xmm2
  3985	vpinsrd	xmm6, xmm6, edx, 2
  3986	vpinsrd	xmm6, xmm6, eax, 3
  3987	vpor	xmm4, xmm5, xmm4
  3988	vinserti128	ymm4, ymm4, xmm6, 1
  3989	vpsrlvd	ymm4, ymm4, ymm3
  3990	vpand	ymm4, ymm4, ymm0
  3991	vmovdqu	ymmword ptr [r15], ymm4
      # Advance: output += 128 bytes (sub of -128), input += 120 bytes, then
      # count r8 down. When the counter hits zero, execution falls through to
      # the label that follows this loop (no explicit jump is needed).
  3992	sub	r15, -128
  3993	add	rbx, 120
  3994	add	r8, -1
  3995	jne	.LBB0_101
  3996.LBB0_147:
      # Common exit of unpack32_avx2, reached by jump or fall-through from the
      # per-case loops and from their "edx < 32" early-outs.
      # Return value: eax = r14d << 5, i.e. r14d * 32 (r14d is also the value
      # the case loops use as their pass counter).
  3997	shl	r14d, 5
  3998	mov	eax, r14d
      # Reset rsp to the saved-register area below rbp, then restore the saved
      # registers rbx, r12, r14, r15 and rbp in that order.
  3999	lea	rsp, [rbp - 32]
  4000	pop	rbx
  4001	pop	r12
  4002	pop	r14
  4003	pop	r15
  4004	pop	rbp
      # Zero the upper halves of the ymm registers before returning, since the
      # body above used 256-bit AVX2 instructions.
  4005	vzeroupper
  4006	ret
  4007.Lfunc_end0:
      # Record the symbol size as the distance from the function's start label
      # to the end label just above.
  4008	.size	unpack32_avx2, .Lfunc_end0-unpack32_avx2
  4009                                        # -- End function
  4010	.ident	"Debian clang version 11.1.0-++20210428103820+1fdec59bffc1-1~exp1~20210428204437.162"
  4011	.section	".note.GNU-stack","",@progbits
  4012	.addrsig

View as plain text