1 .text
2 .intel_syntax noprefix
3 .file "bit_packing_avx2.c"
# ---------------------------------------------------------------------------
# 8-byte constant pool for unpack32_avx2.
#
# Except for the three entries called out below, every quad is the same
# 32-bit mask repeated in both halves, with the low k bits set
# (0x7fffffff = 31 bits ... 0x1 = 1 bit).  unpack32_avx2 replicates a quad
# across a ymm register with vpbroadcastq and applies it with vpand after a
# vpsrlvd, e.g. .LCPI0_135 (k = 1) on the ecx == 1 path, .LCPI0_120 (k = 5)
# on ecx == 5, .LCPI0_105 (k = 9) on ecx == 9, .LCPI0_76 (k = 17) on
# ecx == 17 and .LCPI0_38 (k = 25) on ecx == 25.
# NOTE(review): the remaining masks presumably serve the other values of
# ecx in the same way -- confirm against the rest of the function.
# There is no 8-, 16- or 24-bit mask in this section.
# ---------------------------------------------------------------------------
4 .section .rodata.cst8,"aM",@progbits,8
5 .p2align 3 # -- Begin function unpack32_avx2
6.LCPI0_0:
7 .quad 9223372034707292159 # 0x7fffffff7fffffff
8.LCPI0_8:
9 .quad 4611686015206162431 # 0x3fffffff3fffffff
10.LCPI0_12:
11 .quad 2305843005455597567 # 0x1fffffff1fffffff
12.LCPI0_23:
13 .quad 1152921500580315135 # 0xfffffff0fffffff
14.LCPI0_25:
15 .quad 576460748142673919 # 0x7ffffff07ffffff
16.LCPI0_34:
17 .quad 288230371923853311 # 0x3ffffff03ffffff
# Not a mask: the two dwords are 16 (low) and 10 (high).
# NOTE(review): looks like a pair of shift counts -- confirm at the use site.
18.LCPI0_35:
19 .quad 42949672976 # 0xa00000010
# Not a mask: the two dwords are 16 (low) and 22 (high).
# NOTE(review): looks like a pair of shift counts -- confirm at the use site.
20.LCPI0_36:
21 .quad 94489280528 # 0x1600000010
22.LCPI0_38:
23 .quad 144115183814443007 # 0x1ffffff01ffffff
24.LCPI0_49:
25 .quad 36028792732385279 # 0x7fffff007fffff
26.LCPI0_56:
27 .quad 18014394218708991 # 0x3fffff003fffff
# Broadcast into ymm1 at the top of the ecx == 21 path.
28.LCPI0_59:
29 .quad 9007194961870847 # 0x1fffff001fffff
30.LCPI0_66:
31 .quad 4503595333451775 # 0xfffff000fffff
32.LCPI0_68:
33 .quad 2251795519242239 # 0x7ffff0007ffff
34.LCPI0_73:
35 .quad 1125895612137471 # 0x3ffff0003ffff
36.LCPI0_76:
37 .quad 562945658585087 # 0x1ffff0001ffff
# Not a mask: the two dwords are 0 (low) and 16 (high).
# NOTE(review): looks like a pair of shift counts -- confirm at the use site.
38.LCPI0_80:
39 .quad 68719476736 # 0x1000000000
40.LCPI0_82:
41 .quad 140733193420799 # 0x7fff00007fff
42.LCPI0_87:
43 .quad 70364449226751 # 0x3fff00003fff
44.LCPI0_90:
45 .quad 35180077129727 # 0x1fff00001fff
46.LCPI0_95:
47 .quad 17587891081215 # 0xfff00000fff
48.LCPI0_97:
49 .quad 8791798056959 # 0x7ff000007ff
50.LCPI0_102:
51 .quad 4393751544831 # 0x3ff000003ff
52.LCPI0_105:
53 .quad 2194728288767 # 0x1ff000001ff
54.LCPI0_112:
55 .quad 545460846719 # 0x7f0000007f
56.LCPI0_117:
57 .quad 270582939711 # 0x3f0000003f
58.LCPI0_120:
59 .quad 133143986207 # 0x1f0000001f
60.LCPI0_125:
61 .quad 64424509455 # 0xf0000000f
62.LCPI0_127:
63 .quad 30064771079 # 0x700000007
64.LCPI0_132:
65 .quad 12884901891 # 0x300000003
66.LCPI0_135:
67 .quad 4294967297 # 0x100000001
# ---------------------------------------------------------------------------
# 32-byte constant pool for unpack32_avx2.
#
# Each label holds eight 32-bit values, one per dword lane of a ymm
# register, and the section is 32-byte aligned (.p2align 5) so the tables
# can be fetched with vmovdqa.  The tables annotated below are the ones
# unpack32_avx2 loads on its ecx == 1, 5, 9, 17, 21 and 25 paths; on all of
# those except ecx == 21 the table is used as the per-lane shift count of a
# vpsrlvd whose result is masked and written with one 32-byte store.
# NOTE(review): the unannotated tables presumably play the same role for
# the other values of ecx -- confirm against the rest of the function.
# ---------------------------------------------------------------------------
68 .section .rodata.cst32,"aM",@progbits,32
69 .p2align 5
70.LCPI0_1:
71 .long 24 # 0x18
72 .long 23 # 0x17
73 .long 22 # 0x16
74 .long 21 # 0x15
75 .long 20 # 0x14
76 .long 19 # 0x13
77 .long 18 # 0x12
78 .long 17 # 0x11
# ecx == 1 path: vpsrlvd shift counts for the 2nd 32-byte store.
79.LCPI0_2:
80 .long 8 # 0x8
81 .long 9 # 0x9
82 .long 10 # 0xa
83 .long 11 # 0xb
84 .long 12 # 0xc
85 .long 13 # 0xd
86 .long 14 # 0xe
87 .long 15 # 0xf
88.LCPI0_3:
89 .long 16 # 0x10
90 .long 15 # 0xf
91 .long 14 # 0xe
92 .long 13 # 0xd
93 .long 12 # 0xc
94 .long 11 # 0xb
95 .long 10 # 0xa
96 .long 9 # 0x9
# ecx == 1 path: vpsrlvd shift counts for the 3rd 32-byte store.
97.LCPI0_4:
98 .long 16 # 0x10
99 .long 17 # 0x11
100 .long 18 # 0x12
101 .long 19 # 0x13
102 .long 20 # 0x14
103 .long 21 # 0x15
104 .long 22 # 0x16
105 .long 23 # 0x17
106.LCPI0_7:
107 .long 0 # 0x0
108 .long 0 # 0x0
109 .long 0 # 0x0
110 .long 0 # 0x0
111 .long 0 # 0x0
112 .long 0 # 0x0
113 .long 0 # 0x0
114 .long 1 # 0x1
115.LCPI0_11:
116 .long 0 # 0x0
117 .long 0 # 0x0
118 .long 0 # 0x0
119 .long 0 # 0x0
120 .long 0 # 0x0
121 .long 0 # 0x0
122 .long 0 # 0x0
123 .long 2 # 0x2
124.LCPI0_15:
125 .long 0 # 0x0
126 .long 0 # 0x0
127 .long 2 # 0x2
128 .long 0 # 0x0
129 .long 0 # 0x0
130 .long 0 # 0x0
131 .long 0 # 0x0
132 .long 0 # 0x0
133.LCPI0_18:
134 .long 0 # 0x0
135 .long 0 # 0x0
136 .long 0 # 0x0
137 .long 0 # 0x0
138 .long 0 # 0x0
139 .long 1 # 0x1
140 .long 0 # 0x0
141 .long 0 # 0x0
142.LCPI0_21:
143 .long 0 # 0x0
144 .long 0 # 0x0
145 .long 0 # 0x0
146 .long 0 # 0x0
147 .long 0 # 0x0
148 .long 0 # 0x0
149 .long 0 # 0x0
150 .long 3 # 0x3
151.LCPI0_22:
152 .long 0 # 0x0
153 .long 0 # 0x0
154 .long 0 # 0x0
155 .long 0 # 0x0
156 .long 0 # 0x0
157 .long 0 # 0x0
158 .long 0 # 0x0
159 .long 4 # 0x4
160.LCPI0_24:
161 .long 0 # 0x0
162 .long 0 # 0x0
163 .long 0 # 0x0
164 .long 0 # 0x0
165 .long 0 # 0x0
166 .long 0 # 0x0
167 .long 2 # 0x2
168 .long 0 # 0x0
# ecx == 25 path: vpsrlvd shift counts for the 1st 32-byte store.
169.LCPI0_28:
170 .long 0 # 0x0
171 .long 0 # 0x0
172 .long 0 # 0x0
173 .long 0 # 0x0
174 .long 4 # 0x4
175 .long 0 # 0x0
176 .long 0 # 0x0
177 .long 0 # 0x0
178.LCPI0_31:
179 .long 0 # 0x0
180 .long 0 # 0x0
181 .long 0 # 0x0
182 .long 1 # 0x1
183 .long 0 # 0x0
184 .long 0 # 0x0
185 .long 0 # 0x0
186 .long 0 # 0x0
187.LCPI0_32:
188 .long 0 # 0x0
189 .long 3 # 0x3
190 .long 0 # 0x0
191 .long 0 # 0x0
192 .long 0 # 0x0
193 .long 0 # 0x0
194 .long 0 # 0x0
195 .long 5 # 0x5
196.LCPI0_33:
197 .long 0 # 0x0
198 .long 0 # 0x0
199 .long 0 # 0x0
200 .long 0 # 0x0
201 .long 0 # 0x0
202 .long 2 # 0x2
203 .long 0 # 0x0
204 .long 0 # 0x0
205.LCPI0_37:
206 .long 0 # 0x0
207 .long 0 # 0x0
208 .long 4 # 0x4
209 .long 0 # 0x0
210 .long 0 # 0x0
211 .long 0 # 0x0
212 .long 0 # 0x0
213 .long 6 # 0x6
# ecx == 25 path: vpsrlvd shift counts for the 2nd 32-byte store.
214.LCPI0_39:
215 .long 0 # 0x0
216 .long 1 # 0x1
217 .long 0 # 0x0
218 .long 0 # 0x0
219 .long 0 # 0x0
220 .long 5 # 0x5
221 .long 0 # 0x0
222 .long 0 # 0x0
# ecx == 25 path: vpsrlvd shift counts for the 3rd 32-byte store.
223.LCPI0_42:
224 .long 0 # 0x0
225 .long 0 # 0x0
226 .long 2 # 0x2
227 .long 0 # 0x0
228 .long 0 # 0x0
229 .long 0 # 0x0
230 .long 6 # 0x6
231 .long 0 # 0x0
# ecx == 25 path: vpsrlvd shift counts for the 4th 32-byte store.
232.LCPI0_45:
233 .long 0 # 0x0
234 .long 0 # 0x0
235 .long 0 # 0x0
236 .long 3 # 0x3
237 .long 0 # 0x0
238 .long 0 # 0x0
239 .long 0 # 0x0
240 .long 7 # 0x7
241.LCPI0_48:
242 .long 0 # 0x0
243 .long 0 # 0x0
244 .long 0 # 0x0
245 .long 5 # 0x5
246 .long 0 # 0x0
247 .long 0 # 0x0
248 .long 0 # 0x0
249 .long 1 # 0x1
250.LCPI0_52:
251 .long 0 # 0x0
252 .long 0 # 0x0
253 .long 6 # 0x6
254 .long 0 # 0x0
255 .long 0 # 0x0
256 .long 0 # 0x0
257 .long 2 # 0x2
258 .long 0 # 0x0
259.LCPI0_53:
260 .long 0 # 0x0
261 .long 7 # 0x7
262 .long 0 # 0x0
263 .long 0 # 0x0
264 .long 0 # 0x0
265 .long 3 # 0x3
266 .long 0 # 0x0
267 .long 0 # 0x0
268.LCPI0_54:
269 .long 8 # 0x8
270 .long 0 # 0x0
271 .long 0 # 0x0
272 .long 0 # 0x0
273 .long 4 # 0x4
274 .long 0 # 0x0
275 .long 0 # 0x0
276 .long 9 # 0x9
277.LCPI0_55:
278 .long 0 # 0x0
279 .long 0 # 0x0
280 .long 0 # 0x0
281 .long 2 # 0x2
282 .long 0 # 0x0
283 .long 0 # 0x0
284 .long 4 # 0x4
285 .long 0 # 0x0
286.LCPI0_57:
287 .long 0 # 0x0
288 .long 6 # 0x6
289 .long 0 # 0x0
290 .long 0 # 0x0
291 .long 8 # 0x8
292 .long 0 # 0x0
293 .long 0 # 0x0
294 .long 10 # 0xa
# Loaded into ymm8 at the top of the ecx == 21 path.
295.LCPI0_58:
296 .long 0 # 0x0
297 .long 0 # 0x0
298 .long 10 # 0xa
299 .long 0 # 0x0
300 .long 0 # 0x0
301 .long 9 # 0x9
302 .long 0 # 0x0
303 .long 0 # 0x0
# Loaded into ymm2 at the top of the ecx == 21 path.
304.LCPI0_60:
305 .long 8 # 0x8
306 .long 0 # 0x0
307 .long 0 # 0x0
308 .long 7 # 0x7
309 .long 0 # 0x0
310 .long 0 # 0x0
311 .long 6 # 0x6
312 .long 0 # 0x0
# Loaded into ymm3 at the top of the ecx == 21 path.
313.LCPI0_61:
314 .long 0 # 0x0
315 .long 5 # 0x5
316 .long 0 # 0x0
317 .long 0 # 0x0
318 .long 4 # 0x4
319 .long 0 # 0x0
320 .long 0 # 0x0
321 .long 3 # 0x3
# Loaded into ymm6 at the top of the ecx == 21 path.
322.LCPI0_64:
323 .long 0 # 0x0
324 .long 0 # 0x0
325 .long 2 # 0x2
326 .long 0 # 0x0
327 .long 0 # 0x0
328 .long 1 # 0x1
329 .long 0 # 0x0
330 .long 11 # 0xb
331.LCPI0_65:
332 .long 0 # 0x0
333 .long 0 # 0x0
334 .long 8 # 0x8
335 .long 0 # 0x0
336 .long 0 # 0x0
337 .long 4 # 0x4
338 .long 0 # 0x0
339 .long 12 # 0xc
340.LCPI0_67:
341 .long 0 # 0x0
342 .long 0 # 0x0
343 .long 6 # 0x6
344 .long 0 # 0x0
345 .long 12 # 0xc
346 .long 0 # 0x0
347 .long 0 # 0x0
348 .long 5 # 0x5
349.LCPI0_69:
350 .long 0 # 0x0
351 .long 11 # 0xb
352 .long 0 # 0x0
353 .long 0 # 0x0
354 .long 4 # 0x4
355 .long 0 # 0x0
356 .long 10 # 0xa
357 .long 0 # 0x0
358.LCPI0_70:
359 .long 0 # 0x0
360 .long 3 # 0x3
361 .long 0 # 0x0
362 .long 9 # 0x9
363 .long 0 # 0x0
364 .long 0 # 0x0
365 .long 2 # 0x2
366 .long 0 # 0x0
367.LCPI0_71:
368 .long 8 # 0x8
369 .long 0 # 0x0
370 .long 0 # 0x0
371 .long 1 # 0x1
372 .long 0 # 0x0
373 .long 7 # 0x7
374 .long 0 # 0x0
375 .long 13 # 0xd
376.LCPI0_72:
377 .long 0 # 0x0
378 .long 0 # 0x0
379 .long 4 # 0x4
380 .long 0 # 0x0
381 .long 8 # 0x8
382 .long 0 # 0x0
383 .long 12 # 0xc
384 .long 0 # 0x0
385.LCPI0_74:
386 .long 0 # 0x0
387 .long 2 # 0x2
388 .long 0 # 0x0
389 .long 6 # 0x6
390 .long 0 # 0x0
391 .long 10 # 0xa
392 .long 0 # 0x0
393 .long 14 # 0xe
# ecx == 17 path: vpsrlvd shift counts for the 1st 32-byte store.
394.LCPI0_75:
395 .long 0 # 0x0
396 .long 0 # 0x0
397 .long 2 # 0x2
398 .long 0 # 0x0
399 .long 4 # 0x4
400 .long 0 # 0x0
401 .long 6 # 0x6
402 .long 0 # 0x0
# ecx == 17 path: vpsrlvd shift counts for the 2nd 32-byte store.
403.LCPI0_77:
404 .long 8 # 0x8
405 .long 0 # 0x0
406 .long 10 # 0xa
407 .long 0 # 0x0
408 .long 12 # 0xc
409 .long 0 # 0x0
410 .long 14 # 0xe
411 .long 0 # 0x0
# ecx == 17 path: vpsrlvd shift counts for the 3rd 32-byte store.
412.LCPI0_78:
413 .long 0 # 0x0
414 .long 1 # 0x1
415 .long 0 # 0x0
416 .long 3 # 0x3
417 .long 0 # 0x0
418 .long 5 # 0x5
419 .long 0 # 0x0
420 .long 7 # 0x7
# ecx == 17 path: vpsrlvd shift counts for the 4th 32-byte store.
421.LCPI0_79:
422 .long 0 # 0x0
423 .long 9 # 0x9
424 .long 0 # 0x0
425 .long 11 # 0xb
426 .long 0 # 0x0
427 .long 13 # 0xd
428 .long 0 # 0x0
429 .long 15 # 0xf
430.LCPI0_81:
431 .long 0 # 0x0
432 .long 15 # 0xf
433 .long 0 # 0x0
434 .long 13 # 0xd
435 .long 0 # 0x0
436 .long 11 # 0xb
437 .long 0 # 0x0
438 .long 9 # 0x9
439.LCPI0_83:
440 .long 0 # 0x0
441 .long 7 # 0x7
442 .long 0 # 0x0
443 .long 5 # 0x5
444 .long 0 # 0x0
445 .long 3 # 0x3
446 .long 0 # 0x0
447 .long 1 # 0x1
448.LCPI0_84:
449 .long 16 # 0x10
450 .long 0 # 0x0
451 .long 14 # 0xe
452 .long 0 # 0x0
453 .long 12 # 0xc
454 .long 0 # 0x0
455 .long 10 # 0xa
456 .long 0 # 0x0
457.LCPI0_85:
458 .long 8 # 0x8
459 .long 0 # 0x0
460 .long 6 # 0x6
461 .long 0 # 0x0
462 .long 4 # 0x4
463 .long 0 # 0x0
464 .long 2 # 0x2
465 .long 17 # 0x11
466.LCPI0_86:
467 .long 0 # 0x0
468 .long 14 # 0xe
469 .long 0 # 0x0
470 .long 10 # 0xa
471 .long 0 # 0x0
472 .long 6 # 0x6
473 .long 0 # 0x0
474 .long 2 # 0x2
475.LCPI0_88:
476 .long 16 # 0x10
477 .long 0 # 0x0
478 .long 12 # 0xc
479 .long 0 # 0x0
480 .long 8 # 0x8
481 .long 0 # 0x0
482 .long 4 # 0x4
483 .long 18 # 0x12
484.LCPI0_89:
485 .long 0 # 0x0
486 .long 13 # 0xd
487 .long 0 # 0x0
488 .long 7 # 0x7
489 .long 0 # 0x0
490 .long 1 # 0x1
491 .long 14 # 0xe
492 .long 0 # 0x0
493.LCPI0_91:
494 .long 8 # 0x8
495 .long 0 # 0x0
496 .long 2 # 0x2
497 .long 15 # 0xf
498 .long 0 # 0x0
499 .long 9 # 0x9
500 .long 0 # 0x0
501 .long 3 # 0x3
502.LCPI0_92:
503 .long 16 # 0x10
504 .long 0 # 0x0
505 .long 10 # 0xa
506 .long 0 # 0x0
507 .long 4 # 0x4
508 .long 17 # 0x11
509 .long 0 # 0x0
510 .long 11 # 0xb
511.LCPI0_93:
512 .long 0 # 0x0
513 .long 5 # 0x5
514 .long 18 # 0x12
515 .long 0 # 0x0
516 .long 12 # 0xc
517 .long 0 # 0x0
518 .long 6 # 0x6
519 .long 19 # 0x13
520.LCPI0_94:
521 .long 0 # 0x0
522 .long 12 # 0xc
523 .long 0 # 0x0
524 .long 4 # 0x4
525 .long 16 # 0x10
526 .long 0 # 0x0
527 .long 8 # 0x8
528 .long 20 # 0x14
529.LCPI0_96:
530 .long 0 # 0x0
531 .long 11 # 0xb
532 .long 0 # 0x0
533 .long 1 # 0x1
534 .long 12 # 0xc
535 .long 0 # 0x0
536 .long 2 # 0x2
537 .long 13 # 0xd
538.LCPI0_98:
539 .long 0 # 0x0
540 .long 3 # 0x3
541 .long 14 # 0xe
542 .long 0 # 0x0
543 .long 4 # 0x4
544 .long 15 # 0xf
545 .long 0 # 0x0
546 .long 5 # 0x5
547.LCPI0_99:
548 .long 16 # 0x10
549 .long 0 # 0x0
550 .long 6 # 0x6
551 .long 17 # 0x11
552 .long 0 # 0x0
553 .long 7 # 0x7
554 .long 18 # 0x12
555 .long 0 # 0x0
556.LCPI0_100:
557 .long 8 # 0x8
558 .long 19 # 0x13
559 .long 0 # 0x0
560 .long 9 # 0x9
561 .long 20 # 0x14
562 .long 0 # 0x0
563 .long 10 # 0xa
564 .long 21 # 0x15
565.LCPI0_101:
566 .long 0 # 0x0
567 .long 10 # 0xa
568 .long 20 # 0x14
569 .long 0 # 0x0
570 .long 8 # 0x8
571 .long 18 # 0x12
572 .long 0 # 0x0
573 .long 6 # 0x6
574.LCPI0_103:
575 .long 16 # 0x10
576 .long 0 # 0x0
577 .long 4 # 0x4
578 .long 14 # 0xe
579 .long 0 # 0x0
580 .long 2 # 0x2
581 .long 12 # 0xc
582 .long 22 # 0x16
# ecx == 9 path: vpsrlvd shift counts for the 1st 32-byte store.
583.LCPI0_104:
584 .long 0 # 0x0
585 .long 9 # 0x9
586 .long 18 # 0x12
587 .long 0 # 0x0
588 .long 4 # 0x4
589 .long 13 # 0xd
590 .long 22 # 0x16
591 .long 0 # 0x0
# ecx == 9 path: vpsrlvd shift counts for the 2nd 32-byte store.
592.LCPI0_106:
593 .long 8 # 0x8
594 .long 17 # 0x11
595 .long 0 # 0x0
596 .long 3 # 0x3
597 .long 12 # 0xc
598 .long 21 # 0x15
599 .long 0 # 0x0
600 .long 7 # 0x7
# ecx == 9 path: vpsrlvd shift counts for the 3rd 32-byte store.
601.LCPI0_107:
602 .long 16 # 0x10
603 .long 0 # 0x0
604 .long 2 # 0x2
605 .long 11 # 0xb
606 .long 20 # 0x14
607 .long 0 # 0x0
608 .long 6 # 0x6
609 .long 15 # 0xf
# ecx == 9 path: vpsrlvd shift counts for the 4th 32-byte store.
610.LCPI0_108:
611 .long 0 # 0x0
612 .long 1 # 0x1
613 .long 10 # 0xa
614 .long 19 # 0x13
615 .long 0 # 0x0
616 .long 5 # 0x5
617 .long 14 # 0xe
618 .long 23 # 0x17
619.LCPI0_111:
620 .long 0 # 0x0
621 .long 7 # 0x7
622 .long 14 # 0xe
623 .long 21 # 0x15
624 .long 0 # 0x0
625 .long 3 # 0x3
626 .long 10 # 0xa
627 .long 17 # 0x11
628.LCPI0_113:
629 .long 24 # 0x18
630 .long 0 # 0x0
631 .long 6 # 0x6
632 .long 13 # 0xd
633 .long 20 # 0x14
634 .long 0 # 0x0
635 .long 2 # 0x2
636 .long 9 # 0x9
637.LCPI0_114:
638 .long 16 # 0x10
639 .long 23 # 0x17
640 .long 0 # 0x0
641 .long 5 # 0x5
642 .long 12 # 0xc
643 .long 19 # 0x13
644 .long 0 # 0x0
645 .long 1 # 0x1
646.LCPI0_115:
647 .long 8 # 0x8
648 .long 15 # 0xf
649 .long 22 # 0x16
650 .long 0 # 0x0
651 .long 4 # 0x4
652 .long 11 # 0xb
653 .long 18 # 0x12
654 .long 25 # 0x19
655.LCPI0_116:
656 .long 0 # 0x0
657 .long 6 # 0x6
658 .long 12 # 0xc
659 .long 18 # 0x12
660 .long 24 # 0x18
661 .long 0 # 0x0
662 .long 4 # 0x4
663 .long 10 # 0xa
664.LCPI0_118:
665 .long 16 # 0x10
666 .long 22 # 0x16
667 .long 0 # 0x0
668 .long 2 # 0x2
669 .long 8 # 0x8
670 .long 14 # 0xe
671 .long 20 # 0x14
672 .long 26 # 0x1a
# ecx == 5 path: vpsrlvd shift counts for the 1st 32-byte store.
673.LCPI0_119:
674 .long 0 # 0x0
675 .long 5 # 0x5
676 .long 10 # 0xa
677 .long 15 # 0xf
678 .long 20 # 0x14
679 .long 25 # 0x19
680 .long 0 # 0x0
681 .long 3 # 0x3
# ecx == 5 path: vpsrlvd shift counts for the 2nd 32-byte store.
682.LCPI0_121:
683 .long 8 # 0x8
684 .long 13 # 0xd
685 .long 18 # 0x12
686 .long 23 # 0x17
687 .long 0 # 0x0
688 .long 1 # 0x1
689 .long 6 # 0x6
690 .long 11 # 0xb
# ecx == 5 path: vpsrlvd shift counts for the 3rd 32-byte store.
691.LCPI0_122:
692 .long 16 # 0x10
693 .long 21 # 0x15
694 .long 26 # 0x1a
695 .long 0 # 0x0
696 .long 4 # 0x4
697 .long 9 # 0x9
698 .long 14 # 0xe
699 .long 19 # 0x13
# ecx == 5 path: vpsrlvd shift counts for the 4th 32-byte store.
700.LCPI0_123:
701 .long 24 # 0x18
702 .long 0 # 0x0
703 .long 2 # 0x2
704 .long 7 # 0x7
705 .long 12 # 0xc
706 .long 17 # 0x11
707 .long 22 # 0x16
708 .long 27 # 0x1b
709.LCPI0_124:
710 .long 0 # 0x0
711 .long 4 # 0x4
712 .long 8 # 0x8
713 .long 12 # 0xc
714 .long 16 # 0x10
715 .long 20 # 0x14
716 .long 24 # 0x18
717 .long 28 # 0x1c
718.LCPI0_126:
719 .long 0 # 0x0
720 .long 3 # 0x3
721 .long 6 # 0x6
722 .long 9 # 0x9
723 .long 12 # 0xc
724 .long 15 # 0xf
725 .long 18 # 0x12
726 .long 21 # 0x15
727.LCPI0_128:
728 .long 24 # 0x18
729 .long 27 # 0x1b
730 .long 0 # 0x0
731 .long 1 # 0x1
732 .long 4 # 0x4
733 .long 7 # 0x7
734 .long 10 # 0xa
735 .long 13 # 0xd
736.LCPI0_129:
737 .long 16 # 0x10
738 .long 19 # 0x13
739 .long 22 # 0x16
740 .long 25 # 0x19
741 .long 28 # 0x1c
742 .long 0 # 0x0
743 .long 2 # 0x2
744 .long 5 # 0x5
745.LCPI0_130:
746 .long 8 # 0x8
747 .long 11 # 0xb
748 .long 14 # 0xe
749 .long 17 # 0x11
750 .long 20 # 0x14
751 .long 23 # 0x17
752 .long 26 # 0x1a
753 .long 29 # 0x1d
754.LCPI0_131:
755 .long 0 # 0x0
756 .long 2 # 0x2
757 .long 4 # 0x4
758 .long 6 # 0x6
759 .long 8 # 0x8
760 .long 10 # 0xa
761 .long 12 # 0xc
762 .long 14 # 0xe
763.LCPI0_133:
764 .long 16 # 0x10
765 .long 18 # 0x12
766 .long 20 # 0x14
767 .long 22 # 0x16
768 .long 24 # 0x18
769 .long 26 # 0x1a
770 .long 28 # 0x1c
771 .long 30 # 0x1e
# ecx == 1 path: vpsrlvd shift counts for the 1st 32-byte store.
772.LCPI0_134:
773 .long 0 # 0x0
774 .long 1 # 0x1
775 .long 2 # 0x2
776 .long 3 # 0x3
777 .long 4 # 0x4
778 .long 5 # 0x5
779 .long 6 # 0x6
780 .long 7 # 0x7
# ecx == 1 path: vpsrlvd shift counts for the 4th 32-byte store.
781.LCPI0_136:
782 .long 24 # 0x18
783 .long 25 # 0x19
784 .long 26 # 0x1a
785 .long 27 # 0x1b
786 .long 28 # 0x1c
787 .long 29 # 0x1d
788 .long 30 # 0x1e
789 .long 31 # 0x1f
# ---------------------------------------------------------------------------
# 16-byte constant pool for unpack32_avx2.
#
# Each label holds four 32-bit values, one per dword lane of an xmm
# register, and the section is 16-byte aligned (.p2align 4) so the tables
# can be fetched with vmovdqa.  Entries whose upper two lanes are
# `.zero 4` appear as <a,b,u,u> in the compiler's annotations at the load
# sites, i.e. only the low two lanes are meaningful.
#
# The ecx == 25 path uses such entries in pairs: one table feeds vpsrlvd,
# the other feeds vpsllvd, and the two results are combined with vpor.
# Within a pair the counts of each lane add up to 32 (.LCPI0_40/.LCPI0_41:
# 16+16 and 9+23; .LCPI0_43/.LCPI0_44: 24+8 and 17+15).
# NOTE(review): the unannotated entries presumably serve other values of
# ecx in the same way -- confirm against the rest of the function.
# ---------------------------------------------------------------------------
790 .section .rodata.cst16,"aM",@progbits,16
791 .p2align 4
792.LCPI0_5:
793 .long 8 # 0x8
794 .long 7 # 0x7
795 .long 6 # 0x6
796 .long 5 # 0x5
797.LCPI0_6:
798 .long 24 # 0x18
799 .long 25 # 0x19
800 .long 26 # 0x1a
801 .long 27 # 0x1b
802.LCPI0_9:
803 .long 16 # 0x10
804 .long 14 # 0xe
805 .long 12 # 0xc
806 .long 10 # 0xa
807.LCPI0_10:
808 .long 16 # 0x10
809 .long 18 # 0x12
810 .long 20 # 0x14
811 .long 22 # 0x16
812.LCPI0_13:
813 .long 8 # 0x8
814 .long 5 # 0x5
815 .zero 4
816 .zero 4
817.LCPI0_14:
818 .long 24 # 0x18
819 .long 27 # 0x1b
820 .zero 4
821 .zero 4
822.LCPI0_16:
823 .long 16 # 0x10
824 .long 13 # 0xd
825 .long 10 # 0xa
826 .long 7 # 0x7
827.LCPI0_17:
828 .long 16 # 0x10
829 .long 19 # 0x13
830 .long 22 # 0x16
831 .long 25 # 0x19
832.LCPI0_19:
833 .long 24 # 0x18
834 .long 21 # 0x15
835 .long 18 # 0x12
836 .long 15 # 0xf
837.LCPI0_20:
838 .long 8 # 0x8
839 .long 11 # 0xb
840 .long 14 # 0xe
841 .long 17 # 0x11
842.LCPI0_26:
843 .long 24 # 0x18
844 .long 19 # 0x13
845 .long 14 # 0xe
846 .long 9 # 0x9
847.LCPI0_27:
848 .long 8 # 0x8
849 .long 13 # 0xd
850 .long 18 # 0x12
851 .long 23 # 0x17
852.LCPI0_29:
853 .long 16 # 0x10
854 .long 11 # 0xb
855 .zero 4
856 .zero 4
857.LCPI0_30:
858 .long 16 # 0x10
859 .long 21 # 0x15
860 .zero 4
861 .zero 4
# ecx == 25 path: vpsrlvd (right-shift) counts, paired with .LCPI0_41.
862.LCPI0_40:
863 .long 16 # 0x10
864 .long 9 # 0x9
865 .zero 4
866 .zero 4
# ecx == 25 path: vpsllvd (left-shift) counts, paired with .LCPI0_40.
867.LCPI0_41:
868 .long 16 # 0x10
869 .long 23 # 0x17
870 .zero 4
871 .zero 4
# ecx == 25 path: vpsrlvd (right-shift) counts, paired with .LCPI0_44.
872.LCPI0_43:
873 .long 24 # 0x18
874 .long 17 # 0x11
875 .zero 4
876 .zero 4
# ecx == 25 path: vpsllvd (left-shift) counts, paired with .LCPI0_43.
877.LCPI0_44:
878 .long 8 # 0x8
879 .long 15 # 0xf
880 .zero 4
881 .zero 4
882.LCPI0_46:
883 .long 0 # 0x0
884 .long 0 # 0x0
885 .long 0 # 0x0
886 .long 8 # 0x8
887.LCPI0_50:
888 .long 24 # 0x18
889 .long 15 # 0xf
890 .zero 4
891 .zero 4
892.LCPI0_51:
893 .long 8 # 0x8
894 .long 17 # 0x11
895 .zero 4
896 .zero 4
# Loaded into xmm4 at the top of the ecx == 21 path.
897.LCPI0_62:
898 .long 24 # 0x18
899 .long 13 # 0xd
900 .zero 4
901 .zero 4
# Loaded into xmm5 at the top of the ecx == 21 path.
902.LCPI0_63:
903 .long 8 # 0x8
904 .long 19 # 0x13
905 .zero 4
906 .zero 4
907.LCPI0_109:
908 .long 0 # 0x0
909 .long 8 # 0x8
910 .long 16 # 0x10
911 .long 24 # 0x18
# ---------------------------------------------------------------------------
# 4-byte constant pool for unpack32_avx2.
#
# Two single-dword constants with the low 24 bits (0xffffff) and the low
# 8 bits (0xff) set.
# NOTE(review): presumably the value masks for 24-bit and 8-bit packed
# input, matching the masks kept in .rodata.cst8 for the other widths --
# confirm at the use sites.
# ---------------------------------------------------------------------------
912 .section .rodata.cst4,"aM",@progbits,4
913 .p2align 2
914.LCPI0_47:
915 .long 16777215 # 0xffffff
916.LCPI0_110:
917 .long 255 # 0xff
918 .text
919 .globl unpack32_avx2
920 .p2align 4, 0x90
921 .type unpack32_avx2,@function
922unpack32_avx2: # @unpack32_avx2
923# %bb.0:
924 push rbp
925 mov rbp, rsp
926 push r15
927 push r14
928 push r12
929 push rbx
930 and rsp, -16
931 # kill: def $edx killed $edx def $rdx
932 mov r15, rsi
933 mov rbx, rdi
934 lea r14d, [rdx + 31]
935 test edx, edx
936 cmovns r14d, edx
937 sar r14d, 5
938 cmp ecx, 15
939 jle .LBB0_1
940# %bb.48:
941 cmp ecx, 23
942 jle .LBB0_49
943# %bb.72:
944 cmp ecx, 27
945 jle .LBB0_73
946# %bb.84:
947 cmp ecx, 29
948 jle .LBB0_85
949# %bb.90:
950 cmp ecx, 30
951 je .LBB0_99
952# %bb.91:
953 cmp ecx, 31
954 je .LBB0_96
955# %bb.92:
956 cmp ecx, 32
957 jne .LBB0_147
958# %bb.93:
959 cmp edx, 32
960 jl .LBB0_147
961# %bb.94:
962 mov r12d, r14d
963 .p2align 4, 0x90
964.LBB0_95: # =>This Inner Loop Header: Depth=1
965 mov edx, 128
966 mov rdi, r15
967 mov rsi, rbx
968 call clib·_memcpy(SB)
969 sub rbx, -128
970 sub r15, -128
971 add r12, -1
972 jne .LBB0_95
973 jmp .LBB0_147
974.LBB0_1:
975 cmp ecx, 7
976 jg .LBB0_25
977# %bb.2:
978 cmp ecx, 3
979 jg .LBB0_14
980# %bb.3:
981 cmp ecx, 1
982 jg .LBB0_9
983# %bb.4:
984 test ecx, ecx
985 je .LBB0_144
986# %bb.5:
987 cmp ecx, 1
988 jne .LBB0_147
989# %bb.6:
990 cmp edx, 32
991 jl .LBB0_147
992# %bb.7:
993 mov eax, r14d
994 add r15, 96
995 xor ecx, ecx
996 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_135] # ymm0 = [4294967297,4294967297,4294967297,4294967297]
997 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_134] # ymm1 = [0,1,2,3,4,5,6,7]
998 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_2] # ymm2 = [8,9,10,11,12,13,14,15]
999 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_4] # ymm3 = [16,17,18,19,20,21,22,23]
1000 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_136] # ymm4 = [24,25,26,27,28,29,30,31]
1001 .p2align 4, 0x90
1002.LBB0_8: # =>This Inner Loop Header: Depth=1
1003 vpbroadcastd ymm5, dword ptr [rbx + 4*rcx]
1004 vpsrlvd ymm5, ymm5, ymm1
1005 vpand ymm5, ymm5, ymm0
1006 vmovdqu ymmword ptr [r15 - 96], ymm5
1007 vpbroadcastd ymm5, dword ptr [rbx + 4*rcx]
1008 vpsrlvd ymm5, ymm5, ymm2
1009 vpand ymm5, ymm5, ymm0
1010 vmovdqu ymmword ptr [r15 - 64], ymm5
1011 vpbroadcastd ymm5, dword ptr [rbx + 4*rcx]
1012 vpsrlvd ymm5, ymm5, ymm3
1013 vpand ymm5, ymm5, ymm0
1014 vmovdqu ymmword ptr [r15 - 32], ymm5
1015 vpbroadcastd ymm5, dword ptr [rbx + 4*rcx]
1016 vpsrlvd ymm5, ymm5, ymm4
1017 vpand ymm5, ymm5, ymm0
1018 vmovdqu ymmword ptr [r15], ymm5
1019 add rcx, 1
1020 sub r15, -128
1021 cmp rax, rcx
1022 jne .LBB0_8
1023 jmp .LBB0_147
1024.LBB0_49:
1025 cmp ecx, 19
1026 jg .LBB0_61
1027# %bb.50:
1028 cmp ecx, 17
1029 jg .LBB0_56
1030# %bb.51:
1031 cmp ecx, 16
1032 je .LBB0_120
1033# %bb.52:
1034 cmp ecx, 17
1035 jne .LBB0_147
1036# %bb.53:
1037 cmp edx, 32
1038 jl .LBB0_147
1039# %bb.54:
1040 mov r8d, r14d
1041 add r15, 96
1042 add rbx, 64
1043 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_76] # ymm0 = [562945658585087,562945658585087,562945658585087,562945658585087]
1044 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_75] # ymm1 = [0,0,2,0,4,0,6,0]
1045 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_77] # ymm2 = [8,0,10,0,12,0,14,0]
1046 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_78] # ymm3 = [0,1,0,3,0,5,0,7]
1047 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_79] # ymm4 = [0,9,0,11,0,13,0,15]
1048 .p2align 4, 0x90
1049.LBB0_55: # =>This Inner Loop Header: Depth=1
1050 mov ecx, dword ptr [rbx - 52]
1051 mov r10d, dword ptr [rbx - 48]
1052 shld r10d, ecx, 9
1053 mov esi, dword ptr [rbx - 56]
1054 mov edi, ecx
1055 shld edi, esi, 11
1056 mov r9d, dword ptr [rbx - 64]
1057 mov edx, dword ptr [rbx - 60]
1058 mov eax, edx
1059 shld eax, r9d, 15
1060 vmovd xmm5, esi
1061 shld esi, edx, 13
1062 vpinsrd xmm5, xmm5, edi, 1
1063 vpinsrd xmm5, xmm5, ecx, 2
1064 vpinsrd xmm5, xmm5, r10d, 3
1065 vmovd xmm6, r9d
1066 vpinsrd xmm6, xmm6, eax, 1
1067 vpinsrd xmm6, xmm6, edx, 2
1068 vpinsrd xmm6, xmm6, esi, 3
1069 vinserti128 ymm5, ymm6, xmm5, 1
1070 vpsrlvd ymm5, ymm5, ymm1
1071 vpand ymm5, ymm5, ymm0
1072 vmovdqu ymmword ptr [r15 - 96], ymm5
1073 mov eax, dword ptr [rbx - 36]
1074 mov r10d, dword ptr [rbx - 32]
1075 shld r10d, eax, 1
1076 mov edx, dword ptr [rbx - 40]
1077 mov esi, eax
1078 shld esi, edx, 3
1079 mov r9d, dword ptr [rbx - 48]
1080 mov ecx, dword ptr [rbx - 44]
1081 mov edi, ecx
1082 shld edi, r9d, 7
1083 vmovd xmm5, edx
1084 shld edx, ecx, 5
1085 vpinsrd xmm5, xmm5, esi, 1
1086 vpinsrd xmm5, xmm5, eax, 2
1087 vpinsrd xmm5, xmm5, r10d, 3
1088 vmovd xmm6, r9d
1089 vpinsrd xmm6, xmm6, edi, 1
1090 vpinsrd xmm6, xmm6, ecx, 2
1091 vpinsrd xmm6, xmm6, edx, 3
1092 vinserti128 ymm5, ymm6, xmm5, 1
1093 vpsrlvd ymm5, ymm5, ymm2
1094 vpand ymm5, ymm5, ymm0
1095 vmovdqu ymmword ptr [r15 - 64], ymm5
1096 mov r9d, dword ptr [rbx - 16]
1097 mov r11d, dword ptr [rbx - 20]
1098 mov edx, r9d
1099 shld edx, r11d, 10
1100 mov r10d, dword ptr [rbx - 24]
1101 mov edi, r11d
1102 shld edi, r10d, 12
1103 mov eax, dword ptr [rbx - 28]
1104 mov esi, r10d
1105 shld esi, eax, 14
1106 mov ecx, dword ptr [rbx - 32]
1107 shrd ecx, eax, 16
1108 vmovd xmm5, edi
1109 vpinsrd xmm5, xmm5, r11d, 1
1110 vpinsrd xmm5, xmm5, edx, 2
1111 vpinsrd xmm5, xmm5, r9d, 3
1112 vmovd xmm6, ecx
1113 vpinsrd xmm6, xmm6, eax, 1
1114 vpinsrd xmm6, xmm6, esi, 2
1115 vpinsrd xmm6, xmm6, r10d, 3
1116 vinserti128 ymm5, ymm6, xmm5, 1
1117 vpsrlvd ymm5, ymm5, ymm3
1118 vpand ymm5, ymm5, ymm0
1119 vmovdqu ymmword ptr [r15 - 32], ymm5
1120 mov r9d, dword ptr [rbx]
1121 mov r11d, dword ptr [rbx - 4]
1122 mov edx, r9d
1123 shld edx, r11d, 2
1124 mov r10d, dword ptr [rbx - 8]
1125 mov edi, r11d
1126 shld edi, r10d, 4
1127 mov eax, dword ptr [rbx - 16]
1128 mov esi, dword ptr [rbx - 12]
1129 mov ecx, r10d
1130 shld ecx, esi, 6
1131 shrd eax, esi, 24
1132 vmovd xmm5, edi
1133 vpinsrd xmm5, xmm5, r11d, 1
1134 vpinsrd xmm5, xmm5, edx, 2
1135 vpinsrd xmm5, xmm5, r9d, 3
1136 vmovd xmm6, eax
1137 vpinsrd xmm6, xmm6, esi, 1
1138 vpinsrd xmm6, xmm6, ecx, 2
1139 vpinsrd xmm6, xmm6, r10d, 3
1140 vinserti128 ymm5, ymm6, xmm5, 1
1141 vpsrlvd ymm5, ymm5, ymm4
1142 vpand ymm5, ymm5, ymm0
1143 vmovdqu ymmword ptr [r15], ymm5
1144 sub r15, -128
1145 add rbx, 68
1146 add r8, -1
1147 jne .LBB0_55
1148 jmp .LBB0_147
1149.LBB0_25:
1150 cmp ecx, 11
1151 jg .LBB0_37
1152# %bb.26:
1153 cmp ecx, 9
1154 jg .LBB0_32
1155# %bb.27:
1156 cmp ecx, 8
1157 je .LBB0_132
1158# %bb.28:
1159 cmp ecx, 9
1160 jne .LBB0_147
1161# %bb.29:
1162 cmp edx, 32
1163 jl .LBB0_147
1164# %bb.30:
1165 mov r8d, r14d
1166 add r15, 96
1167 add rbx, 32
1168 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_105] # ymm0 = [2194728288767,2194728288767,2194728288767,2194728288767]
1169 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_104] # ymm1 = [0,9,18,0,4,13,22,0]
1170 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_106] # ymm2 = [8,17,0,3,12,21,0,7]
1171 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_107] # ymm3 = [16,0,2,11,20,0,6,15]
1172 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_108] # ymm4 = [0,1,10,19,0,5,14,23]
1173 .p2align 4, 0x90
1174.LBB0_31: # =>This Inner Loop Header: Depth=1
1175 mov ecx, dword ptr [rbx - 32]
1176 mov edx, dword ptr [rbx - 28]
1177 mov esi, dword ptr [rbx - 24]
1178 shld esi, edx, 1
1179 vmovd xmm5, edx
1180 vpinsrd xmm5, xmm5, edx, 1
1181 vpinsrd xmm5, xmm5, edx, 2
1182 shld edx, ecx, 5
1183 vpinsrd xmm5, xmm5, esi, 3
1184 vmovd xmm6, ecx
1185 vpinsrd xmm6, xmm6, ecx, 1
1186 vpinsrd xmm6, xmm6, ecx, 2
1187 vpinsrd xmm6, xmm6, edx, 3
1188 vinserti128 ymm5, ymm6, xmm5, 1
1189 vpsrlvd ymm5, ymm5, ymm1
1190 vpand ymm5, ymm5, ymm0
1191 vmovdqu ymmword ptr [r15 - 96], ymm5
1192 mov ecx, dword ptr [rbx - 16]
1193 mov edx, dword ptr [rbx - 24]
1194 mov esi, dword ptr [rbx - 20]
1195 mov edi, ecx
1196 shld edi, esi, 2
1197 mov eax, esi
1198 shld eax, edx, 6
1199 vmovd xmm5, esi
1200 vpinsrd xmm5, xmm5, esi, 1
1201 vpinsrd xmm5, xmm5, edi, 2
1202 vpinsrd xmm5, xmm5, ecx, 3
1203 vmovd xmm6, edx
1204 vpinsrd xmm6, xmm6, edx, 1
1205 vpinsrd xmm6, xmm6, eax, 2
1206 vpinsrd xmm6, xmm6, esi, 3
1207 vinserti128 ymm5, ymm6, xmm5, 1
1208 vpsrlvd ymm5, ymm5, ymm2
1209 vpand ymm5, ymm5, ymm0
1210 vmovdqu ymmword ptr [r15 - 64], ymm5
1211 mov eax, dword ptr [rbx - 8]
1212 mov ecx, dword ptr [rbx - 16]
1213 mov edx, dword ptr [rbx - 12]
1214 mov esi, eax
1215 shld esi, edx, 3
1216 mov edi, edx
1217 shld edi, ecx, 7
1218 vmovd xmm5, edx
1219 vpinsrd xmm5, xmm5, esi, 1
1220 vpinsrd xmm5, xmm5, eax, 2
1221 vpinsrd xmm5, xmm5, eax, 3
1222 vmovd xmm6, ecx
1223 vpinsrd xmm6, xmm6, edi, 1
1224 vpinsrd xmm6, xmm6, edx, 2
1225 vpinsrd xmm6, xmm6, edx, 3
1226 vinserti128 ymm5, ymm6, xmm5, 1
1227 vpsrlvd ymm5, ymm5, ymm3
1228 vpand ymm5, ymm5, ymm0
1229 vmovdqu ymmword ptr [r15 - 32], ymm5
1230 mov eax, dword ptr [rbx]
1231 mov ecx, dword ptr [rbx - 8]
1232 mov edx, dword ptr [rbx - 4]
1233 mov esi, eax
1234 shld esi, edx, 4
1235 shrd ecx, edx, 24
1236 vmovd xmm5, esi
1237 vpinsrd xmm5, xmm5, eax, 1
1238 vpinsrd xmm5, xmm5, eax, 2
1239 vpinsrd xmm5, xmm5, eax, 3
1240 vmovd xmm6, ecx
1241 vpinsrd xmm6, xmm6, edx, 1
1242 vpinsrd xmm6, xmm6, edx, 2
1243 vpinsrd xmm6, xmm6, edx, 3
1244 vinserti128 ymm5, ymm6, xmm5, 1
1245 vpsrlvd ymm5, ymm5, ymm4
1246 vpand ymm5, ymm5, ymm0
1247 vmovdqu ymmword ptr [r15], ymm5
1248 sub r15, -128
1249 add rbx, 36
1250 add r8, -1
1251 jne .LBB0_31
1252 jmp .LBB0_147
1253.LBB0_73:
1254 cmp ecx, 25
1255 jg .LBB0_79
1256# %bb.74:
1257 cmp ecx, 24
1258 je .LBB0_108
1259# %bb.75:
1260 cmp ecx, 25
1261 jne .LBB0_147
1262# %bb.76:
1263 cmp edx, 32
1264 jl .LBB0_147
1265# %bb.77:
1266 mov r8d, r14d
1267 add r15, 96
1268 add rbx, 96
1269 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_38] # ymm0 = [144115183814443007,144115183814443007,144115183814443007,144115183814443007]
1270 vmovdqa ymm9, ymmword ptr [rip + .LCPI0_28] # ymm9 = [0,0,0,0,4,0,0,0]
1271 vmovdqa ymm10, ymmword ptr [rip + .LCPI0_39] # ymm10 = [0,1,0,0,0,5,0,0]
1272 vmovdqa xmm11, xmmword ptr [rip + .LCPI0_40] # xmm11 = <16,9,u,u>
1273 vmovdqa xmm4, xmmword ptr [rip + .LCPI0_41] # xmm4 = <16,23,u,u>
1274 vmovdqa ymm5, ymmword ptr [rip + .LCPI0_42] # ymm5 = [0,0,2,0,0,0,6,0]
1275 vmovdqa xmm6, xmmword ptr [rip + .LCPI0_43] # xmm6 = <24,17,u,u>
1276 vmovdqa xmm7, xmmword ptr [rip + .LCPI0_44] # xmm7 = <8,15,u,u>
1277 vmovdqa ymm8, ymmword ptr [rip + .LCPI0_45] # ymm8 = [0,0,0,3,0,0,0,7]
1278 .p2align 4, 0x90
1279.LBB0_78: # =>This Inner Loop Header: Depth=1
1280 mov ecx, dword ptr [rbx - 76]
1281 mov r9d, dword ptr [rbx - 72]
1282 shld r9d, ecx, 17
1283 mov esi, dword ptr [rbx - 80]
1284 shld ecx, esi, 10
1285 mov edi, dword ptr [rbx - 84]
1286 shld esi, edi, 3
1287 mov eax, dword ptr [rbx - 88]
1288 vmovd xmm1, edi
1289 shld edi, eax, 21
1290 mov r10d, dword ptr [rbx - 96]
1291 mov edx, dword ptr [rbx - 92]
1292 shld eax, edx, 14
1293 shld edx, r10d, 7
1294 vpinsrd xmm1, xmm1, esi, 1
1295 vmovd xmm2, r10d
1296 vpinsrd xmm1, xmm1, ecx, 2
1297 vpinsrd xmm2, xmm2, edx, 1
1298 vpinsrd xmm1, xmm1, r9d, 3
1299 vpinsrd xmm2, xmm2, eax, 2
1300 vpinsrd xmm2, xmm2, edi, 3
1301 vinserti128 ymm1, ymm2, xmm1, 1
1302 vpsrlvd ymm1, ymm1, ymm9
1303 vpand ymm1, ymm1, ymm0
1304 vmovdqu ymmword ptr [r15 - 96], ymm1
1305 mov r11d, dword ptr [rbx - 52]
1306 mov r9d, dword ptr [rbx - 48]
1307 shld r9d, r11d, 9
1308 mov r10d, dword ptr [rbx - 56]
1309 shld r11d, r10d, 2
1310 mov esi, dword ptr [rbx - 60]
1311 mov edi, r10d
1312 mov ecx, dword ptr [rbx - 64]
1313 shld edi, esi, 20
1314 mov edx, dword ptr [rbx - 72]
1315 mov eax, dword ptr [rbx - 68]
1316 shld esi, ecx, 13
1317 shrd edx, eax, 8
1318 shld ecx, eax, 6
1319 vmovd xmm1, edi
1320 vpinsrd xmm1, xmm1, r10d, 1
1321 vmovd xmm2, edx
1322 vpinsrd xmm1, xmm1, r11d, 2
1323 vpinsrd xmm2, xmm2, eax, 1
1324 vpinsrd xmm1, xmm1, r9d, 3
1325 vpinsrd xmm2, xmm2, ecx, 2
1326 vpinsrd xmm2, xmm2, esi, 3
1327 vinserti128 ymm1, ymm2, xmm1, 1
1328 vpsrlvd ymm1, ymm1, ymm10
1329 vpand ymm1, ymm1, ymm0
1330 vmovdqu ymmword ptr [r15 - 64], ymm1
1331 mov eax, dword ptr [rbx - 28]
1332 mov r9d, dword ptr [rbx - 24]
1333 shld r9d, eax, 1
1334 mov edx, dword ptr [rbx - 32]
1335 mov esi, eax
1336 shld esi, edx, 19
1337 mov edi, dword ptr [rbx - 40]
1338 mov ecx, dword ptr [rbx - 36]
1339 shld edx, ecx, 12
1340 shld ecx, edi, 5
1341 vmovq xmm1, qword ptr [rbx - 48] # xmm1 = mem[0],zero
1342 vpsrlvd xmm2, xmm1, xmm11
1343 vpshufd xmm1, xmm1, 229 # xmm1 = xmm1[1,1,2,3]
1344 vpinsrd xmm1, xmm1, edi, 1
1345 vpsllvd xmm1, xmm1, xmm4
1346 vpor xmm1, xmm2, xmm1
1347 vmovd xmm2, edx
1348 vpinsrd xmm2, xmm2, esi, 1
1349 vpinsrd xmm2, xmm2, eax, 2
1350 vpinsrd xmm2, xmm2, r9d, 3
1351 vpinsrd xmm1, xmm1, edi, 2
1352 vpinsrd xmm1, xmm1, ecx, 3
1353 vinserti128 ymm1, ymm1, xmm2, 1
1354 vpsrlvd ymm1, ymm1, ymm5
1355 vpand ymm1, ymm1, ymm0
1356 vmovdqu ymmword ptr [r15 - 32], ymm1
1357 mov r9d, dword ptr [rbx]
1358 mov ecx, dword ptr [rbx - 4]
1359 mov edx, r9d
1360 shld edx, ecx, 18
1361 mov esi, dword ptr [rbx - 8]
1362 shld ecx, esi, 11
1363 mov r10d, dword ptr [rbx - 16]
1364 mov edi, dword ptr [rbx - 12]
1365 shld esi, edi, 4
1366 mov eax, edi
1367 shld eax, r10d, 22
1368 vmovq xmm1, qword ptr [rbx - 24] # xmm1 = mem[0],zero
1369 vpsrlvd xmm2, xmm1, xmm6
1370 vpshufd xmm1, xmm1, 229 # xmm1 = xmm1[1,1,2,3]
1371 vpinsrd xmm1, xmm1, r10d, 1
1372 vpsllvd xmm1, xmm1, xmm7
1373 vmovd xmm3, esi
1374 vpinsrd xmm3, xmm3, ecx, 1
1375 vpor xmm1, xmm2, xmm1
1376 vpinsrd xmm2, xmm3, edx, 2
1377 vpinsrd xmm2, xmm2, r9d, 3
1378 vpinsrd xmm1, xmm1, eax, 2
1379 vpinsrd xmm1, xmm1, edi, 3
1380 vinserti128 ymm1, ymm1, xmm2, 1
1381 vpsrlvd ymm1, ymm1, ymm8
1382 vpand ymm1, ymm1, ymm0
1383 vmovdqu ymmword ptr [r15], ymm1
1384 sub r15, -128
1385 add rbx, 100
1386 add r8, -1
1387 jne .LBB0_78
1388 jmp .LBB0_147
# Case ecx == 5: expand 5-bit fields into one dword each.
# ecx > 5 is routed to .LBB0_20 and ecx == 4 to .LBB0_138; anything else
# that reaches here leaves through .LBB0_147.
# NOTE(review): ecx looks like the packed bit width and edx the element
# count; both are set up outside this chunk -- confirm against the prologue.
1389.LBB0_14:
1390 cmp ecx, 5
1391 jg .LBB0_20
1392# %bb.15:
1393 cmp ecx, 4
1394 je .LBB0_138
1395# %bb.16:
1396 cmp ecx, 5
1397 jne .LBB0_147
1398# %bb.17:
# Signed compare: with edx below 32 the loop is skipped entirely.
1399 cmp edx, 32
1400 jl .LBB0_147
1401# %bb.18:
# eax = loop trip count taken from r14d (computed outside this chunk;
# presumably the number of 32-element groups -- TODO confirm).
1402 mov eax, r14d
# Bias both pointers so the loop body addresses [r15 - 96 .. r15] for the
# stores and [rbx - 16 .. rbx] for the loads.
1403 add r15, 96
1404 add rbx, 16
# ymm0 = 0x1f in every dword lane (field mask); ymm1..ymm4 = per-lane right
# shift counts for the four 8-lane groups.
1405 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_120] # ymm0 = [133143986207,133143986207,133143986207,133143986207]
1406 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_119] # ymm1 = [0,5,10,15,20,25,0,3]
1407 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_121] # ymm2 = [8,13,18,23,0,1,6,11]
1408 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_122] # ymm3 = [16,21,26,0,4,9,14,19]
1409 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_123] # ymm4 = [24,0,2,7,12,17,22,27]
1410 .p2align 4, 0x90
# Each iteration reads 5 dwords (w0..w4 at [rbx - 16] .. [rbx]) and stores
# 32 dwords (4 x 32 bytes).
1411.LBB0_19: # =>This Inner Loop Header: Depth=1
# Group 0 -> [r15 - 96]: lanes 0-5 come from w0, lane 6 straddles w0/w1
# (shld gives (w1 << 2) | (w0 >> 30)), lane 7 comes from w1.
1412 mov ecx, dword ptr [rbx - 16]
1413 mov edx, dword ptr [rbx - 12]
1414 mov esi, edx
1415 shld esi, ecx, 2
1416 vmovd xmm5, ecx
# Broadcast w0 into xmm6 before xmm5 is overwritten lane by lane.
1417 vpbroadcastd xmm6, xmm5
1418 vpinsrd xmm5, xmm5, ecx, 1
1419 vpinsrd xmm5, xmm5, esi, 2
1420 vpinsrd xmm5, xmm5, edx, 3
1421 vinserti128 ymm5, ymm6, xmm5, 1
1422 vpsrlvd ymm5, ymm5, ymm1
1423 vpand ymm5, ymm5, ymm0
1424 vmovdqu ymmword ptr [r15 - 96], ymm5
# Group 1 -> [r15 - 64]: lanes 0-3 from w1, lane 4 straddles w1/w2
# ((w2 << 4) | (w1 >> 28)), lanes 5-7 from w2.
1425 mov ecx, dword ptr [rbx - 12]
1426 mov edx, dword ptr [rbx - 8]
1427 mov esi, edx
1428 shld esi, ecx, 4
1429 vmovd xmm5, ecx
1430 vpbroadcastd xmm5, xmm5
1431 vmovd xmm6, esi
1432 vpinsrd xmm6, xmm6, edx, 1
1433 vpinsrd xmm6, xmm6, edx, 2
1434 vpinsrd xmm6, xmm6, edx, 3
1435 vinserti128 ymm5, ymm5, xmm6, 1
1436 vpsrlvd ymm5, ymm5, ymm2
1437 vpand ymm5, ymm5, ymm0
1438 vmovdqu ymmword ptr [r15 - 64], ymm5
# Group 2 -> [r15 - 32]: lanes 0-2 from w2, lane 3 straddles w2/w3
# ((w3 << 1) | (w2 >> 31)), lanes 4-7 from w3.
1439 mov ecx, dword ptr [rbx - 8]
1440 mov edx, dword ptr [rbx - 4]
# Capture the unmodified w3 in xmm5 before shld overwrites edx.
1441 vmovd xmm5, edx
1442 shld edx, ecx, 1
1443 vmovd xmm6, ecx
1444 vpinsrd xmm6, xmm6, ecx, 1
1445 vpinsrd xmm6, xmm6, ecx, 2
1446 vpinsrd xmm6, xmm6, edx, 3
1447 vpbroadcastd xmm5, xmm5
1448 vinserti128 ymm5, ymm6, xmm5, 1
1449 vpsrlvd ymm5, ymm5, ymm3
1450 vpand ymm5, ymm5, ymm0
1451 vmovdqu ymmword ptr [r15 - 32], ymm5
# Group 3 -> [r15]: lane 0 from w3, lane 1 straddles w3/w4
# ((w4 << 3) | (w3 >> 29)), lanes 2-7 from w4.
1452 mov ecx, dword ptr [rbx - 4]
1453 mov edx, dword ptr [rbx]
1454 mov esi, edx
1455 shld esi, ecx, 3
1456 vmovd xmm5, ecx
1457 vpinsrd xmm5, xmm5, esi, 1
1458 vpinsrd xmm5, xmm5, edx, 2
1459 vpinsrd xmm5, xmm5, edx, 3
1460 vmovd xmm6, edx
1461 vpbroadcastd xmm6, xmm6
1462 vinserti128 ymm5, ymm5, xmm6, 1
1463 vpsrlvd ymm5, ymm5, ymm4
1464 vpand ymm5, ymm5, ymm0
1465 vmovdqu ymmword ptr [r15], ymm5
# r15 += 128 (written as sub -128 because -128 fits a sign-extended 8-bit
# immediate and +128 does not); rbx += 20 bytes = 5 input dwords.
1466 sub r15, -128
1467 add rbx, 20
1468 add rax, -1
1469 jne .LBB0_19
1470 jmp .LBB0_147
# Case ecx == 21: expand 21-bit fields into one dword each.
# ecx > 21 is routed to .LBB0_67 and ecx == 20 to .LBB0_114; anything else
# that reaches here leaves through .LBB0_147.
# NOTE(review): ecx looks like the packed bit width and edx the element
# count; both are set up outside this chunk -- confirm against the prologue.
1471.LBB0_61:
1472 cmp ecx, 21
1473 jg .LBB0_67
1474# %bb.62:
1475 cmp ecx, 20
1476 je .LBB0_114
1477# %bb.63:
1478 cmp ecx, 21
1479 jne .LBB0_147
1480# %bb.64:
# Signed compare: with edx below 32 the loop is skipped entirely.
1481 cmp edx, 32
1482 jl .LBB0_147
1483# %bb.65:
# r8d = loop trip count taken from r14d (computed outside this chunk).
1484 mov r8d, r14d
# Bias pointers: stores use [r15 - 96 .. r15], loads use [rbx - 80 .. rbx].
1485 add r15, 96
1486 add rbx, 80
# ymm1 = 0x1fffff in every dword lane (field mask). ymm8/ymm2/ymm3/ymm6 are
# the per-lane right-shift counts of the four groups; xmm4/xmm5 are the
# right/left counts used to merge two straddling lanes in vector form
# (only their low two lanes are defined).
1487 vmovdqa ymm8, ymmword ptr [rip + .LCPI0_58] # ymm8 = [0,0,10,0,0,9,0,0]
1488 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_59] # ymm1 = [9007194961870847,9007194961870847,9007194961870847,9007194961870847]
1489 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_60] # ymm2 = [8,0,0,7,0,0,6,0]
1490 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_61] # ymm3 = [0,5,0,0,4,0,0,3]
1491 vmovdqa xmm4, xmmword ptr [rip + .LCPI0_62] # xmm4 = <24,13,u,u>
1492 vmovdqa xmm5, xmmword ptr [rip + .LCPI0_63] # xmm5 = <8,19,u,u>
1493 vmovdqa ymm6, ymmword ptr [rip + .LCPI0_64] # ymm6 = [0,0,2,0,0,1,0,11]
1494 .p2align 4, 0x90
# Each iteration reads 21 dwords (w0..w20 at [rbx - 80] .. [rbx]) and stores
# 32 dwords. Lanes whose field crosses a dword boundary are pre-merged with
# shld/shrd (shift count 0 in the vpsrlvd vector); the remaining lanes hold
# a raw source dword and are aligned by vpsrlvd. Every shld that overwrites
# a register is scheduled after the last use of that register's raw value.
1495.LBB0_66: # =>This Inner Loop Header: Depth=1
# Group 0 -> [r15 - 96], sources w0..w5 ([rbx - 80] .. [rbx - 60]).
# Lanes: w0 | w1:w0 | w1>>10 | w2:w1 | w3:w2 | w3>>9 | w4:w3 | w5:w4.
1496 mov ecx, dword ptr [rbx - 64]
1497 mov r9d, dword ptr [rbx - 60]
1498 shld r9d, ecx, 13
1499 mov r11d, dword ptr [rbx - 68]
1500 shld ecx, r11d, 2
1501 mov edi, dword ptr [rbx - 72]
1502 mov esi, r11d
1503 shld esi, edi, 12
1504 mov r10d, dword ptr [rbx - 80]
1505 mov eax, dword ptr [rbx - 76]
1506 shld edi, eax, 1
1507 mov edx, eax
1508 shld edx, r10d, 11
1509 vmovd xmm7, r10d
1510 vmovd xmm0, esi
1511 vpinsrd xmm7, xmm7, edx, 1
1512 vpinsrd xmm0, xmm0, r11d, 1
1513 vpinsrd xmm7, xmm7, eax, 2
1514 vpinsrd xmm0, xmm0, ecx, 2
1515 vpinsrd xmm7, xmm7, edi, 3
1516 vpinsrd xmm0, xmm0, r9d, 3
1517 vinserti128 ymm0, ymm7, xmm0, 1
1518 vpsrlvd ymm0, ymm0, ymm8
1519 vpand ymm0, ymm0, ymm1
1520 vmovdqu ymmword ptr [r15 - 96], ymm0
# Group 1 -> [r15 - 64], sources w5..w10 ([rbx - 60] .. [rbx - 40]).
# Lanes: w5>>8 | w6:w5 | w7:w6 | w7>>7 | w8:w7 | w9:w8 | w9>>6 | w10:w9.
1521 mov r10d, dword ptr [rbx - 44]
1522 mov r9d, dword ptr [rbx - 40]
1523 shld r9d, r10d, 5
1524 mov edx, dword ptr [rbx - 48]
1525 mov esi, r10d
1526 shld esi, edx, 15
1527 mov ecx, dword ptr [rbx - 52]
1528 shld edx, ecx, 4
1529 mov r11d, dword ptr [rbx - 60]
1530 mov eax, dword ptr [rbx - 56]
1531 mov edi, ecx
1532 shld edi, eax, 14
1533 shld eax, r11d, 3
1534 vmovd xmm0, r11d
1535 vmovd xmm7, edx
1536 vpinsrd xmm0, xmm0, eax, 1
1537 vpinsrd xmm7, xmm7, esi, 1
1538 vpinsrd xmm0, xmm0, edi, 2
1539 vpinsrd xmm7, xmm7, r10d, 2
1540 vpinsrd xmm0, xmm0, ecx, 3
1541 vpinsrd xmm7, xmm7, r9d, 3
1542 vinserti128 ymm0, ymm0, xmm7, 1
1543 vpsrlvd ymm0, ymm0, ymm2
1544 vpand ymm0, ymm0, ymm1
1545 vmovdqu ymmword ptr [r15 - 64], ymm0
# Group 2 -> [r15 - 32], sources w10..w15 ([rbx - 40] .. [rbx - 20]).
# Lane 0 is merged with shrd ((w10 >> 16) | (w11 << 16)); lanes 1, 4 and 7
# are raw dwords shifted by 5, 4 and 3.
1546 mov r9d, dword ptr [rbx - 20]
1547 mov ecx, dword ptr [rbx - 24]
1548 mov r10d, r9d
1549 shld r10d, ecx, 18
1550 mov esi, dword ptr [rbx - 28]
1551 shld ecx, esi, 7
1552 mov edi, dword ptr [rbx - 32]
# Capture the raw dword in xmm0 lane 0 before shld overwrites esi.
1553 vmovd xmm0, esi
1554 shld esi, edi, 17
1555 mov eax, dword ptr [rbx - 40]
1556 mov edx, dword ptr [rbx - 36]
1557 shld edi, edx, 6
1558 shrd eax, edx, 16
1559 vpinsrd xmm0, xmm0, ecx, 1
1560 vmovd xmm7, eax
1561 vpinsrd xmm0, xmm0, r10d, 2
1562 vpinsrd xmm7, xmm7, edx, 1
1563 vpinsrd xmm0, xmm0, r9d, 3
1564 vpinsrd xmm7, xmm7, edi, 2
1565 vpinsrd xmm7, xmm7, esi, 3
1566 vinserti128 ymm0, ymm7, xmm0, 1
1567 vpsrlvd ymm0, ymm0, ymm3
1568 vpand ymm0, ymm0, ymm1
1569 vmovdqu ymmword ptr [r15 - 32], ymm0
# Group 3 -> [r15], sources w15..w20 ([rbx - 20] .. [rbx]).
# Lanes 0 and 1 both straddle and are merged in vector form:
# (lo >> xmm4) | (hi << xmm5), where lo = {w15, w16} and hi = {w16, w17}.
1570 mov r9d, dword ptr [rbx]
1571 mov eax, dword ptr [rbx - 4]
1572 mov edx, r9d
1573 shld edx, eax, 10
1574 mov esi, dword ptr [rbx - 12]
1575 mov edi, dword ptr [rbx - 8]
1576 mov ecx, eax
1577 shld ecx, edi, 20
1578 shld edi, esi, 9
1579 vmovq xmm0, qword ptr [rbx - 20] # xmm0 = mem[0],zero
1580 vpsrlvd xmm7, xmm0, xmm4
1581 vpshufd xmm0, xmm0, 229 # xmm0 = xmm0[1,1,2,3]
1582 vpinsrd xmm0, xmm0, esi, 1
1583 vpsllvd xmm0, xmm0, xmm5
1584 vpor xmm0, xmm7, xmm0
1585 vmovd xmm7, ecx
1586 vpinsrd xmm7, xmm7, eax, 1
1587 vpinsrd xmm7, xmm7, edx, 2
1588 vpinsrd xmm7, xmm7, r9d, 3
# Lanes 2 and 3 of xmm0 (undefined after the variable shifts) are replaced.
1589 vpinsrd xmm0, xmm0, esi, 2
1590 vpinsrd xmm0, xmm0, edi, 3
1591 vinserti128 ymm0, ymm0, xmm7, 1
1592 vpsrlvd ymm0, ymm0, ymm6
1593 vpand ymm0, ymm0, ymm1
1594 vmovdqu ymmword ptr [r15], ymm0
# r15 += 128 (sub of -128 fits an 8-bit immediate); rbx += 84 bytes = 21
# input dwords.
1595 sub r15, -128
1596 add rbx, 84
1597 add r8, -1
1598 jne .LBB0_66
1599 jmp .LBB0_147
# Case ecx == 13: expand 13-bit fields into one dword each.
# ecx > 13 is routed to .LBB0_43 and ecx == 12 to .LBB0_126; anything else
# that reaches here leaves through .LBB0_147.
# NOTE(review): ecx looks like the packed bit width and edx the element
# count; both are set up outside this chunk -- confirm against the prologue.
1600.LBB0_37:
1601 cmp ecx, 13
1602 jg .LBB0_43
1603# %bb.38:
1604 cmp ecx, 12
1605 je .LBB0_126
1606# %bb.39:
1607 cmp ecx, 13
1608 jne .LBB0_147
1609# %bb.40:
# Signed compare: with edx below 32 the loop is skipped entirely.
1610 cmp edx, 32
1611 jl .LBB0_147
1612# %bb.41:
# r8d = loop trip count taken from r14d (computed outside this chunk).
1613 mov r8d, r14d
# Bias pointers: stores use [r15 - 96 .. r15], loads use [rbx - 48 .. rbx].
1614 add r15, 96
1615 add rbx, 48
# ymm0 = 0x1fff in every dword lane (field mask); ymm1..ymm4 = per-lane
# right-shift counts for the four groups.
1616 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_90] # ymm0 = [35180077129727,35180077129727,35180077129727,35180077129727]
1617 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_89] # ymm1 = [0,13,0,7,0,1,14,0]
1618 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_91] # ymm2 = [8,0,2,15,0,9,0,3]
1619 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_92] # ymm3 = [16,0,10,0,4,17,0,11]
1620 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_93] # ymm4 = [0,5,18,0,12,0,6,19]
1621 .p2align 4, 0x90
# Each iteration reads 13 dwords (w0..w12 at [rbx - 48] .. [rbx]) and stores
# 32 dwords. Straddling lanes are pre-merged with shld/shrd and carry a
# zero shift count; the other lanes hold a raw dword aligned by vpsrlvd.
1622.LBB0_42: # =>This Inner Loop Header: Depth=1
# Group 0 -> [r15 - 96], sources w0..w3 ([rbx - 48] .. [rbx - 36]).
# Lanes: w0 | w0>>13 | w1:w0 | w1>>7 | w2:w1 | w2>>1 | w2>>14 | w3:w2.
1623 mov eax, dword ptr [rbx - 40]
1624 mov r9d, dword ptr [rbx - 36]
1625 shld r9d, eax, 5
1626 mov esi, dword ptr [rbx - 48]
1627 mov edx, dword ptr [rbx - 44]
1628 mov ecx, eax
1629 shld ecx, edx, 12
1630 mov edi, edx
1631 shld edi, esi, 6
1632 vmovd xmm5, ecx
1633 vpinsrd xmm5, xmm5, eax, 1
1634 vpinsrd xmm5, xmm5, eax, 2
1635 vpinsrd xmm5, xmm5, r9d, 3
1636 vmovd xmm6, esi
1637 vpinsrd xmm6, xmm6, esi, 1
1638 vpinsrd xmm6, xmm6, edi, 2
1639 vpinsrd xmm6, xmm6, edx, 3
1640 vinserti128 ymm5, ymm6, xmm5, 1
1641 vpsrlvd ymm5, ymm5, ymm1
1642 vpand ymm5, ymm5, ymm0
1643 vmovdqu ymmword ptr [r15 - 96], ymm5
# Group 1 -> [r15 - 64], sources w3..w6 ([rbx - 36] .. [rbx - 24]).
# Lanes: w3>>8 | w4:w3 | w4>>2 | w4>>15 | w5:w4 | w5>>9 | w6:w5 | w6>>3.
1644 mov r9d, dword ptr [rbx - 24]
1645 mov ecx, dword ptr [rbx - 28]
1646 mov edx, r9d
1647 shld edx, ecx, 10
1648 mov esi, dword ptr [rbx - 32]
1649 mov edi, ecx
1650 shld edi, esi, 4
1651 mov r10d, dword ptr [rbx - 36]
1652 mov eax, esi
1653 shld eax, r10d, 11
1654 vmovd xmm5, edi
1655 vpinsrd xmm5, xmm5, ecx, 1
1656 vpinsrd xmm5, xmm5, edx, 2
1657 vpinsrd xmm5, xmm5, r9d, 3
1658 vmovd xmm6, r10d
1659 vpinsrd xmm6, xmm6, eax, 1
1660 vpinsrd xmm6, xmm6, esi, 2
1661 vpinsrd xmm6, xmm6, esi, 3
1662 vinserti128 ymm5, ymm6, xmm5, 1
1663 vpsrlvd ymm5, ymm5, ymm2
1664 vpand ymm5, ymm5, ymm0
1665 vmovdqu ymmword ptr [r15 - 64], ymm5
# Group 2 -> [r15 - 32], sources w6..w9 ([rbx - 24] .. [rbx - 12]).
# Lanes: w6>>16 | w7:w6 | w7>>10 | w8:w7 | w8>>4 | w8>>17 | w9:w8 | w9>>11.
1666 mov r9d, dword ptr [rbx - 12]
1667 mov ecx, dword ptr [rbx - 16]
1668 mov edx, r9d
1669 shld edx, ecx, 2
1670 mov esi, dword ptr [rbx - 24]
1671 mov eax, dword ptr [rbx - 20]
# Place the raw w8 in xmm5 lanes 0 and 1 before shld overwrites ecx.
1672 vmovd xmm5, ecx
1673 vpinsrd xmm5, xmm5, ecx, 1
1674 shld ecx, eax, 9
1675 mov edi, eax
1676 shld edi, esi, 3
1677 vpinsrd xmm5, xmm5, edx, 2
1678 vpinsrd xmm5, xmm5, r9d, 3
1679 vmovd xmm6, esi
1680 vpinsrd xmm6, xmm6, edi, 1
1681 vpinsrd xmm6, xmm6, eax, 2
1682 vpinsrd xmm6, xmm6, ecx, 3
1683 vinserti128 ymm5, ymm6, xmm5, 1
1684 vpsrlvd ymm5, ymm5, ymm3
1685 vpand ymm5, ymm5, ymm0
1686 vmovdqu ymmword ptr [r15 - 32], ymm5
# Group 3 -> [r15], sources w9..w12 ([rbx - 12] .. [rbx]).
# Lane 0 is merged with shrd ((w9 >> 24) | (w10 << 8)).
1687 mov eax, dword ptr [rbx]
1688 mov ecx, dword ptr [rbx - 4]
1689 mov edx, eax
1690 shld edx, ecx, 7
1691 mov esi, dword ptr [rbx - 8]
# Capture the raw w11 in xmm5 lane 0 before shld overwrites ecx.
1692 vmovd xmm5, ecx
1693 shld ecx, esi, 1
1694 mov edi, dword ptr [rbx - 12]
1695 shrd edi, esi, 24
1696 vmovd xmm6, edi
1697 vpinsrd xmm6, xmm6, esi, 1
1698 vpinsrd xmm6, xmm6, esi, 2
1699 vpinsrd xmm6, xmm6, ecx, 3
1700 vpinsrd xmm5, xmm5, edx, 1
1701 vpinsrd xmm5, xmm5, eax, 2
1702 vpinsrd xmm5, xmm5, eax, 3
1703 vinserti128 ymm5, ymm6, xmm5, 1
1704 vpsrlvd ymm5, ymm5, ymm4
1705 vpand ymm5, ymm5, ymm0
1706 vmovdqu ymmword ptr [r15], ymm5
# r15 += 128 (sub of -128 fits an 8-bit immediate); rbx += 52 bytes = 13
# input dwords.
1707 sub r15, -128
1708 add rbx, 52
1709 add r8, -1
1710 jne .LBB0_42
1711 jmp .LBB0_147
# Case ecx == 29: expand 29-bit fields into one dword each.
# ecx == 28 is routed to .LBB0_102; anything other than 29 that reaches here
# leaves through .LBB0_147.
# NOTE(review): ecx looks like the packed bit width and edx the element
# count; both are set up outside this chunk -- confirm against the prologue.
1712.LBB0_85:
1713 cmp ecx, 28
1714 je .LBB0_102
1715# %bb.86:
1716 cmp ecx, 29
1717 jne .LBB0_147
1718# %bb.87:
# Signed compare: with edx below 32 the loop is skipped entirely.
1719 cmp edx, 32
1720 jl .LBB0_147
1721# %bb.88:
# r8d = loop trip count taken from r14d (computed outside this chunk).
1722 mov r8d, r14d
# Only the store pointer is biased here; loads use positive displacements
# [rbx] .. [rbx + 112].
1723 add r15, 96
# ymm0 = 0x1fffffff in every dword lane (field mask). The xmm pairs
# (xmm8,xmm10), (xmm12,xmm5), (xmm7,xmm1) are right/left shift counts used
# to merge straddling lanes in vector form; ymm11/ymm6/ymm9 are the final
# per-lane right-shift counts of groups 1-3.
1724 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_12] # ymm0 = [2305843005455597567,2305843005455597567,2305843005455597567,2305843005455597567]
1725 vmovdqa xmm8, xmmword ptr [rip + .LCPI0_13] # xmm8 = <8,5,u,u>
1726 vmovdqa xmm10, xmmword ptr [rip + .LCPI0_14] # xmm10 = <24,27,u,u>
1727 vmovdqa ymm11, ymmword ptr [rip + .LCPI0_15] # ymm11 = [0,0,2,0,0,0,0,0]
1728 vmovdqa xmm12, xmmword ptr [rip + .LCPI0_16] # xmm12 = [16,13,10,7]
1729 vmovdqa xmm5, xmmword ptr [rip + .LCPI0_17] # xmm5 = [16,19,22,25]
1730 vmovdqa ymm6, ymmword ptr [rip + .LCPI0_18] # ymm6 = [0,0,0,0,0,1,0,0]
1731 vmovdqa xmm7, xmmword ptr [rip + .LCPI0_19] # xmm7 = [24,21,18,15]
1732 vmovdqa xmm1, xmmword ptr [rip + .LCPI0_20] # xmm1 = [8,11,14,17]
1733 vmovdqa ymm9, ymmword ptr [rip + .LCPI0_21] # ymm9 = [0,0,0,0,0,0,0,3]
1734 .p2align 4, 0x90
# Each iteration reads 29 dwords (w0..w28 at [rbx] .. [rbx + 112]) and
# stores 32 dwords. Almost every lane straddles two dwords, so lanes are
# pre-merged either with scalar shld or with a vector
# (lo >> r) | (hi << l) sequence.
1735.LBB0_89: # =>This Inner Loop Header: Depth=1
# Group 0 -> [r15 - 96], sources w0..w7 ([rbx] .. [rbx + 28]).
# Lane 0 is w0 itself; lanes 1-7 are shld merges of adjacent dwords. The
# shld chain runs from the highest dword down so each source is still raw
# when consumed. No vpsrlvd is needed: every lane is already aligned.
1736 mov r11d, dword ptr [rbx + 24]
1737 mov r9d, dword ptr [rbx + 28]
1738 shld r9d, r11d, 21
1739 mov esi, dword ptr [rbx + 20]
1740 shld r11d, esi, 18
1741 mov edi, dword ptr [rbx + 16]
1742 shld esi, edi, 15
1743 mov eax, dword ptr [rbx + 12]
1744 shld edi, eax, 12
1745 mov edx, dword ptr [rbx + 8]
1746 shld eax, edx, 9
1747 mov r10d, dword ptr [rbx]
1748 mov ecx, dword ptr [rbx + 4]
1749 shld edx, ecx, 6
1750 shld ecx, r10d, 3
1751 vmovd xmm2, r10d
1752 vmovd xmm3, edi
1753 vpinsrd xmm2, xmm2, ecx, 1
1754 vpinsrd xmm3, xmm3, esi, 1
1755 vpinsrd xmm2, xmm2, edx, 2
1756 vpinsrd xmm3, xmm3, r11d, 2
1757 vpinsrd xmm2, xmm2, eax, 3
1758 vpinsrd xmm3, xmm3, r9d, 3
1759 vinserti128 ymm2, ymm2, xmm3, 1
1760 vpand ymm2, ymm2, ymm0
1761 vmovdqu ymmword ptr [r15 - 96], ymm2
# Group 1 -> [r15 - 64], sources w7..w14 ([rbx + 28] .. [rbx + 56]).
# Lanes 0-1 are merged in vector form from {w7,w8} and {w8,w9}; lane 2 is
# the raw w9 (shifted right by 2 via ymm11); lanes 3-7 are shld merges.
1762 mov eax, dword ptr [rbx + 52]
1763 mov r9d, dword ptr [rbx + 56]
1764 shld r9d, eax, 13
1765 mov edx, dword ptr [rbx + 48]
1766 shld eax, edx, 10
1767 mov esi, dword ptr [rbx + 44]
1768 shld edx, esi, 7
1769 mov edi, dword ptr [rbx + 36]
1770 mov ecx, dword ptr [rbx + 40]
1771 shld esi, ecx, 4
1772 shld ecx, edi, 1
1773 vmovq xmm2, qword ptr [rbx + 28] # xmm2 = mem[0],zero
1774 vpsrlvd xmm3, xmm2, xmm8
1775 vpshufd xmm2, xmm2, 229 # xmm2 = xmm2[1,1,2,3]
1776 vpinsrd xmm2, xmm2, edi, 1
1777 vpsllvd xmm2, xmm2, xmm10
1778 vpor xmm2, xmm3, xmm2
1779 vmovd xmm3, esi
1780 vpinsrd xmm3, xmm3, edx, 1
1781 vpinsrd xmm3, xmm3, eax, 2
1782 vpinsrd xmm3, xmm3, r9d, 3
# Lanes 2 and 3 of xmm2 (undefined after the variable shifts) are replaced.
1783 vpinsrd xmm2, xmm2, edi, 2
1784 vpinsrd xmm2, xmm2, ecx, 3
1785 vinserti128 ymm2, ymm2, xmm3, 1
1786 vpsrlvd ymm2, ymm2, ymm11
1787 vpand ymm2, ymm2, ymm0
1788 vmovdqu ymmword ptr [r15 - 64], ymm2
# Group 2 -> [r15 - 32], sources w14..w21 ([rbx + 56] .. [rbx + 84]).
# Lanes 0-3 are merged in vector form from {w14..w17} and {w15..w18};
# lane 5 is the raw w19 (shifted right by 1 via ymm6); lanes 4, 6, 7 are
# shld merges.
1789 mov eax, dword ptr [rbx + 80]
1790 mov ecx, dword ptr [rbx + 84]
1791 shld ecx, eax, 5
1792 mov edx, dword ptr [rbx + 76]
1793 mov esi, dword ptr [rbx + 72]
1794 shld eax, edx, 2
1795 mov edi, edx
1796 shld edi, esi, 28
1797 vmovdqu xmm2, xmmword ptr [rbx + 56]
1798 vpsrlvd xmm3, xmm2, xmm12
1799 vpshufd xmm2, xmm2, 249 # xmm2 = xmm2[1,2,3,3]
1800 vpinsrd xmm2, xmm2, esi, 3
1801 vmovd xmm4, edi
1802 vpinsrd xmm4, xmm4, edx, 1
1803 vpinsrd xmm4, xmm4, eax, 2
1804 vpsllvd xmm2, xmm2, xmm5
1805 vpinsrd xmm4, xmm4, ecx, 3
1806 vpor xmm2, xmm3, xmm2
1807 vinserti128 ymm2, ymm2, xmm4, 1
1808 vpsrlvd ymm2, ymm2, ymm6
1809 vpand ymm2, ymm2, ymm0
1810 vmovdqu ymmword ptr [r15 - 32], ymm2
# Group 3 -> [r15], sources w21..w28 ([rbx + 84] .. [rbx + 112]).
# Lanes 0-3 are merged in vector form from {w21..w24} and {w22..w25};
# lanes 4-6 are shld merges; lane 7 is the raw w28 (shifted right by 3).
1811 mov eax, dword ptr [rbx + 112]
1812 mov ecx, dword ptr [rbx + 108]
1813 mov edx, eax
1814 shld edx, ecx, 26
1815 mov esi, dword ptr [rbx + 104]
1816 shld ecx, esi, 23
1817 mov edi, dword ptr [rbx + 100]
1818 vmovdqu xmm2, xmmword ptr [rbx + 84]
1819 shld esi, edi, 20
1820 vpsrlvd xmm3, xmm2, xmm7
1821 vpshufd xmm2, xmm2, 249 # xmm2 = xmm2[1,2,3,3]
1822 vpinsrd xmm2, xmm2, edi, 3
1823 vmovd xmm4, esi
1824 vpinsrd xmm4, xmm4, ecx, 1
1825 vpsllvd xmm2, xmm2, xmm1
1826 vpinsrd xmm4, xmm4, edx, 2
1827 vpinsrd xmm4, xmm4, eax, 3
1828 vpor xmm2, xmm3, xmm2
1829 vinserti128 ymm2, ymm2, xmm4, 1
1830 vpsrlvd ymm2, ymm2, ymm9
1831 vpand ymm2, ymm2, ymm0
1832 vmovdqu ymmword ptr [r15], ymm2
# rbx += 116 bytes = 29 input dwords; r15 += 128 (sub of -128 fits an
# 8-bit immediate).
1833 add rbx, 116
1834 sub r15, -128
1835 add r8, -1
1836 jne .LBB0_89
1837 jmp .LBB0_147
# Case ecx == 3: expand 3-bit fields into one dword each.
# ecx == 2 is routed to .LBB0_141; anything other than 3 that reaches here
# leaves through .LBB0_147.
# NOTE(review): ecx looks like the packed bit width and edx the element
# count; both are set up outside this chunk -- confirm against the prologue.
1838.LBB0_9:
1839 cmp ecx, 2
1840 je .LBB0_141
1841# %bb.10:
1842 cmp ecx, 3
1843 jne .LBB0_147
1844# %bb.11:
# Signed compare: with edx below 32 the loop is skipped entirely.
1845 cmp edx, 32
1846 jl .LBB0_147
1847# %bb.12:
# eax = loop trip count taken from r14d (computed outside this chunk).
1848 mov eax, r14d
# Only the store pointer is biased; loads use [rbx], [rbx + 4], [rbx + 8].
1849 add r15, 96
# ymm0 = 0x7 in every dword lane (field mask); ymm1..ymm4 = per-lane
# right-shift counts for the four groups.
1850 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_127] # ymm0 = [30064771079,30064771079,30064771079,30064771079]
1851 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_126] # ymm1 = [0,3,6,9,12,15,18,21]
1852 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_128] # ymm2 = [24,27,0,1,4,7,10,13]
1853 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_129] # ymm3 = [16,19,22,25,28,0,2,5]
1854 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_130] # ymm4 = [8,11,14,17,20,23,26,29]
1855 .p2align 4, 0x90
# Each iteration reads 3 dwords (w0..w2) and stores 32 dwords.
1856.LBB0_13: # =>This Inner Loop Header: Depth=1
# Group 0 -> [r15 - 96]: all eight lanes come from w0, so a memory
# broadcast plus variable shift is enough.
1857 vpbroadcastd ymm5, dword ptr [rbx]
1858 vpsrlvd ymm5, ymm5, ymm1
1859 vpand ymm5, ymm5, ymm0
1860 vmovdqu ymmword ptr [r15 - 96], ymm5
# Group 1 -> [r15 - 64]: lanes 0-1 from w0, lane 2 straddles w0/w1
# ((w1 << 2) | (w0 >> 30)), lanes 3-7 from w1.
1861 mov ecx, dword ptr [rbx]
1862 mov edx, dword ptr [rbx + 4]
1863 mov esi, edx
1864 shld esi, ecx, 2
1865 vmovd xmm5, ecx
1866 vpinsrd xmm5, xmm5, ecx, 1
1867 vpinsrd xmm5, xmm5, esi, 2
1868 vpinsrd xmm5, xmm5, edx, 3
1869 vmovd xmm6, edx
1870 vpbroadcastd xmm6, xmm6
1871 vinserti128 ymm5, ymm5, xmm6, 1
1872 vpsrlvd ymm5, ymm5, ymm2
1873 vpand ymm5, ymm5, ymm0
1874 vmovdqu ymmword ptr [r15 - 64], ymm5
# Group 2 -> [r15 - 32]: lanes 0-4 from w1, lane 5 straddles w1/w2
# ((w2 << 1) | (w1 >> 31)), lanes 6-7 from w2.
1875 mov ecx, dword ptr [rbx + 4]
1876 mov edx, dword ptr [rbx + 8]
1877 mov esi, edx
1878 shld esi, ecx, 1
1879 vmovd xmm5, ecx
# Broadcast w1 into xmm6 before xmm5 is overwritten lane by lane.
1880 vpbroadcastd xmm6, xmm5
1881 vpinsrd xmm5, xmm5, esi, 1
1882 vpinsrd xmm5, xmm5, edx, 2
1883 vpinsrd xmm5, xmm5, edx, 3
1884 vinserti128 ymm5, ymm6, xmm5, 1
1885 vpsrlvd ymm5, ymm5, ymm3
1886 vpand ymm5, ymm5, ymm0
1887 vmovdqu ymmword ptr [r15 - 32], ymm5
# Group 3 -> [r15]: all eight lanes come from w2.
1888 vpbroadcastd ymm5, dword ptr [rbx + 8]
1889 vpsrlvd ymm5, ymm5, ymm4
1890 vpand ymm5, ymm5, ymm0
1891 vmovdqu ymmword ptr [r15], ymm5
# r15 += 128 (sub of -128 fits an 8-bit immediate); rbx += 12 bytes = 3
# input dwords.
1892 sub r15, -128
1893 add rbx, 12
1894 add rax, -1
1895 jne .LBB0_13
1896 jmp .LBB0_147
# Case ecx == 19: expand 19-bit fields into one dword each.
# ecx == 18 is routed to .LBB0_117; anything other than 19 that reaches here
# leaves through .LBB0_147.
# NOTE(review): ecx looks like the packed bit width and edx the element
# count; both are set up outside this chunk -- confirm against the prologue.
1897.LBB0_56:
1898 cmp ecx, 18
1899 je .LBB0_117
1900# %bb.57:
1901 cmp ecx, 19
1902 jne .LBB0_147
1903# %bb.58:
# Signed compare: with edx below 32 the loop is skipped entirely.
1904 cmp edx, 32
1905 jl .LBB0_147
1906# %bb.59:
# r8d = loop trip count taken from r14d (computed outside this chunk).
1907 mov r8d, r14d
# Bias pointers: stores use [r15 - 96 .. r15], loads use [rbx - 72 .. rbx].
1908 add r15, 96
1909 add rbx, 72
# ymm0 = 0x7ffff in every dword lane (field mask); ymm1..ymm4 = per-lane
# right-shift counts for the four groups.
1910 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_68] # ymm0 = [2251795519242239,2251795519242239,2251795519242239,2251795519242239]
1911 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_67] # ymm1 = [0,0,6,0,12,0,0,5]
1912 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_69] # ymm2 = [0,11,0,0,4,0,10,0]
1913 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_70] # ymm3 = [0,3,0,9,0,0,2,0]
1914 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_71] # ymm4 = [8,0,0,1,0,7,0,13]
1915 .p2align 4, 0x90
# Each iteration reads 19 dwords (w0..w18 at [rbx - 72] .. [rbx]) and stores
# 32 dwords. Straddling lanes are pre-merged with shld/shrd and carry a
# zero shift count; the other lanes hold a raw dword aligned by vpsrlvd.
1916.LBB0_60: # =>This Inner Loop Header: Depth=1
# Group 0 -> [r15 - 96], sources w0..w4 ([rbx - 72] .. [rbx - 56]).
# Lanes: w0 | w1:w0 | w1>>6 | w2:w1 | w2>>12 | w3:w2 | w4:w3 | w4>>5.
1917 mov r9d, dword ptr [rbx - 56]
1918 mov edx, dword ptr [rbx - 60]
1919 mov esi, r9d
1920 shld esi, edx, 14
1921 mov edi, dword ptr [rbx - 64]
1922 mov r10d, dword ptr [rbx - 72]
1923 shld edx, edi, 1
1924 mov eax, dword ptr [rbx - 68]
1925 mov ecx, eax
1926 shld ecx, r10d, 13
# Capture the raw w2 in xmm5 lane 0 before shld overwrites edi.
1927 vmovd xmm5, edi
1928 shld edi, eax, 7
1929 vpinsrd xmm5, xmm5, edx, 1
1930 vmovd xmm6, r10d
1931 vpinsrd xmm5, xmm5, esi, 2
1932 vpinsrd xmm6, xmm6, ecx, 1
1933 vpinsrd xmm5, xmm5, r9d, 3
1934 vpinsrd xmm6, xmm6, eax, 2
1935 vpinsrd xmm6, xmm6, edi, 3
1936 vinserti128 ymm5, ymm6, xmm5, 1
1937 vpsrlvd ymm5, ymm5, ymm1
1938 vpand ymm5, ymm5, ymm0
1939 vmovdqu ymmword ptr [r15 - 96], ymm5
# Group 1 -> [r15 - 64], sources w4..w9 ([rbx - 56] .. [rbx - 36]).
# Lane 0 is merged with shrd ((w4 >> 24) | (w5 << 8)).
1940 mov r10d, dword ptr [rbx - 40]
1941 mov r9d, dword ptr [rbx - 36]
1942 shld r9d, r10d, 3
1943 mov edx, dword ptr [rbx - 44]
1944 mov esi, r10d
1945 shld esi, edx, 9
1946 mov edi, dword ptr [rbx - 48]
# Capture the raw w7 in xmm5 lane 0 before shld overwrites edx.
1947 vmovd xmm5, edx
1948 shld edx, edi, 15
1949 mov ecx, dword ptr [rbx - 56]
1950 mov eax, dword ptr [rbx - 52]
1951 shld edi, eax, 2
1952 shrd ecx, eax, 24
1953 vpinsrd xmm5, xmm5, esi, 1
1954 vmovd xmm6, ecx
1955 vpinsrd xmm5, xmm5, r10d, 2
1956 vpinsrd xmm6, xmm6, eax, 1
1957 vpinsrd xmm5, xmm5, r9d, 3
1958 vpinsrd xmm6, xmm6, edi, 2
1959 vpinsrd xmm6, xmm6, edx, 3
1960 vinserti128 ymm5, ymm6, xmm5, 1
1961 vpsrlvd ymm5, ymm5, ymm2
1962 vpand ymm5, ymm5, ymm0
1963 vmovdqu ymmword ptr [r15 - 64], ymm5
# Group 2 -> [r15 - 32], sources w9..w14 ([rbx - 36] .. [rbx - 16]).
# Lane 0 is merged with shrd ((w9 >> 16) | (w10 << 16)).
1964 mov r10d, dword ptr [rbx - 20]
1965 mov r9d, dword ptr [rbx - 16]
1966 shld r9d, r10d, 11
1967 mov edx, dword ptr [rbx - 24]
1968 mov esi, r10d
1969 mov r11d, dword ptr [rbx - 28]
1970 shld esi, edx, 17
1971 mov ecx, dword ptr [rbx - 36]
1972 mov eax, dword ptr [rbx - 32]
1973 shld edx, r11d, 4
1974 mov edi, r11d
1975 shld edi, eax, 10
1976 shrd ecx, eax, 16
1977 vmovd xmm5, edx
1978 vpinsrd xmm5, xmm5, esi, 1
1979 vmovd xmm6, ecx
1980 vpinsrd xmm5, xmm5, r10d, 2
1981 vpinsrd xmm6, xmm6, eax, 1
1982 vpinsrd xmm5, xmm5, r9d, 3
1983 vpinsrd xmm6, xmm6, edi, 2
1984 vpinsrd xmm6, xmm6, r11d, 3
1985 vinserti128 ymm5, ymm6, xmm5, 1
1986 vpsrlvd ymm5, ymm5, ymm3
1987 vpand ymm5, ymm5, ymm0
1988 vmovdqu ymmword ptr [r15 - 32], ymm5
# Group 3 -> [r15], sources w14..w18 ([rbx - 16] .. [rbx]).
# Lanes: w14>>8 | w15:w14 | w16:w15 | w16>>1 | w17:w16 | w17>>7 | w18:w17
# | w18>>13.
1989 mov r9d, dword ptr [rbx]
1990 mov r11d, dword ptr [rbx - 4]
1991 mov edx, r9d
1992 shld edx, r11d, 6
1993 mov ecx, dword ptr [rbx - 8]
1994 mov edi, r11d
1995 shld edi, ecx, 12
1996 mov r10d, dword ptr [rbx - 16]
1997 mov eax, dword ptr [rbx - 12]
1998 mov esi, ecx
1999 shld esi, eax, 18
2000 shld eax, r10d, 5
2001 vmovd xmm5, r10d
2002 vmovd xmm6, edi
2003 vpinsrd xmm5, xmm5, eax, 1
2004 vpinsrd xmm6, xmm6, r11d, 1
2005 vpinsrd xmm5, xmm5, esi, 2
2006 vpinsrd xmm6, xmm6, edx, 2
2007 vpinsrd xmm5, xmm5, ecx, 3
2008 vpinsrd xmm6, xmm6, r9d, 3
2009 vinserti128 ymm5, ymm5, xmm6, 1
2010 vpsrlvd ymm5, ymm5, ymm4
2011 vpand ymm5, ymm5, ymm0
2012 vmovdqu ymmword ptr [r15], ymm5
# r15 += 128 (sub of -128 fits an 8-bit immediate); rbx += 76 bytes = 19
# input dwords.
2013 sub r15, -128
2014 add rbx, 76
2015 add r8, -1
2016 jne .LBB0_60
2017 jmp .LBB0_147
# Case ecx == 11: expand 11-bit fields into one dword each.
# ecx == 10 is routed to .LBB0_129; anything other than 11 that reaches here
# leaves through .LBB0_147.
# NOTE(review): ecx looks like the packed bit width and edx the element
# count; both are set up outside this chunk -- confirm against the prologue.
2018.LBB0_32:
2019 cmp ecx, 10
2020 je .LBB0_129
2021# %bb.33:
2022 cmp ecx, 11
2023 jne .LBB0_147
2024# %bb.34:
# Signed compare: with edx below 32 the loop is skipped entirely.
2025 cmp edx, 32
2026 jl .LBB0_147
2027# %bb.35:
# r8d = loop trip count taken from r14d (computed outside this chunk).
2028 mov r8d, r14d
# Bias pointers: stores use [r15 - 96 .. r15], loads use [rbx - 40 .. rbx].
2029 add r15, 96
2030 add rbx, 40
# ymm0 = 0x7ff in every dword lane (field mask); ymm1..ymm4 = per-lane
# right-shift counts for the four groups.
2031 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_97] # ymm0 = [8791798056959,8791798056959,8791798056959,8791798056959]
2032 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_96] # ymm1 = [0,11,0,1,12,0,2,13]
2033 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_98] # ymm2 = [0,3,14,0,4,15,0,5]
2034 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_99] # ymm3 = [16,0,6,17,0,7,18,0]
2035 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_100] # ymm4 = [8,19,0,9,20,0,10,21]
2036 .p2align 4, 0x90
# Each iteration reads 11 dwords (w0..w10 at [rbx - 40] .. [rbx]) and stores
# 32 dwords. Straddling lanes are pre-merged with shld/shrd and carry a
# zero shift count; the other lanes hold a raw dword aligned by vpsrlvd.
2037.LBB0_36: # =>This Inner Loop Header: Depth=1
# Group 0 -> [r15 - 96], sources w0..w2 ([rbx - 40] .. [rbx - 32]).
# Lanes: w0 | w0>>11 | w1:w0 | w1>>1 | w1>>12 | w2:w1 | w2>>2 | w2>>13.
2038 mov ecx, dword ptr [rbx - 32]
2039 mov edx, dword ptr [rbx - 40]
2040 mov esi, dword ptr [rbx - 36]
2041 mov edi, ecx
2042 shld edi, esi, 9
2043 mov eax, esi
2044 shld eax, edx, 10
2045 vmovd xmm5, esi
2046 vpinsrd xmm5, xmm5, edi, 1
2047 vpinsrd xmm5, xmm5, ecx, 2
2048 vpinsrd xmm5, xmm5, ecx, 3
2049 vmovd xmm6, edx
2050 vpinsrd xmm6, xmm6, edx, 1
2051 vpinsrd xmm6, xmm6, eax, 2
2052 vpinsrd xmm6, xmm6, esi, 3
2053 vinserti128 ymm5, ymm6, xmm5, 1
2054 vpsrlvd ymm5, ymm5, ymm1
2055 vpand ymm5, ymm5, ymm0
2056 vmovdqu ymmword ptr [r15 - 96], ymm5
# Group 1 -> [r15 - 64], sources w2..w5 ([rbx - 32] .. [rbx - 20]).
# Lane 0 is merged with shrd ((w2 >> 24) | (w3 << 8)).
2057 mov eax, dword ptr [rbx - 20]
2058 mov ecx, dword ptr [rbx - 24]
2059 mov edx, eax
2060 shld edx, ecx, 6
2061 mov esi, dword ptr [rbx - 32]
2062 mov edi, dword ptr [rbx - 28]
# Place the raw w4 in xmm5 lanes 0 and 1 before shld overwrites ecx.
2063 vmovd xmm5, ecx
2064 vpinsrd xmm5, xmm5, ecx, 1
2065 shld ecx, edi, 7
2066 shrd esi, edi, 24
2067 vpinsrd xmm5, xmm5, edx, 2
2068 vpinsrd xmm5, xmm5, eax, 3
2069 vmovd xmm6, esi
2070 vpinsrd xmm6, xmm6, edi, 1
2071 vpinsrd xmm6, xmm6, edi, 2
2072 vpinsrd xmm6, xmm6, ecx, 3
2073 vinserti128 ymm5, ymm6, xmm5, 1
2074 vpsrlvd ymm5, ymm5, ymm2
2075 vpand ymm5, ymm5, ymm0
2076 vmovdqu ymmword ptr [r15 - 64], ymm5
# Group 2 -> [r15 - 32], sources w5..w8 ([rbx - 20] .. [rbx - 8]).
# Lanes: w5>>16 | w6:w5 | w6>>6 | w6>>17 | w7:w6 | w7>>7 | w7>>18 | w8:w7.
2077 mov eax, dword ptr [rbx - 12]
2078 mov ecx, dword ptr [rbx - 8]
2079 shld ecx, eax, 3
2080 mov r9d, dword ptr [rbx - 20]
2081 mov esi, dword ptr [rbx - 16]
2082 mov edi, eax
2083 shld edi, esi, 4
2084 mov edx, esi
2085 shld edx, r9d, 5
2086 vmovd xmm5, edi
2087 vpinsrd xmm5, xmm5, eax, 1
2088 vpinsrd xmm5, xmm5, eax, 2
2089 vpinsrd xmm5, xmm5, ecx, 3
2090 vmovd xmm6, r9d
2091 vpinsrd xmm6, xmm6, edx, 1
2092 vpinsrd xmm6, xmm6, esi, 2
2093 vpinsrd xmm6, xmm6, esi, 3
2094 vinserti128 ymm5, ymm6, xmm5, 1
2095 vpsrlvd ymm5, ymm5, ymm3
2096 vpand ymm5, ymm5, ymm0
2097 vmovdqu ymmword ptr [r15 - 32], ymm5
# Group 3 -> [r15], sources w8..w10 ([rbx - 8] .. [rbx]).
# Lanes: w8>>8 | w8>>19 | w9:w8 | w9>>9 | w9>>20 | w10:w9 | w10>>10
# | w10>>21.
2098 mov eax, dword ptr [rbx]
2099 mov ecx, dword ptr [rbx - 8]
2100 mov edx, dword ptr [rbx - 4]
2101 mov esi, eax
2102 shld esi, edx, 1
2103 mov edi, edx
2104 shld edi, ecx, 2
2105 vmovd xmm5, edx
2106 vpinsrd xmm5, xmm5, esi, 1
2107 vpinsrd xmm5, xmm5, eax, 2
2108 vpinsrd xmm5, xmm5, eax, 3
2109 vmovd xmm6, ecx
2110 vpinsrd xmm6, xmm6, ecx, 1
2111 vpinsrd xmm6, xmm6, edi, 2
2112 vpinsrd xmm6, xmm6, edx, 3
2113 vinserti128 ymm5, ymm6, xmm5, 1
2114 vpsrlvd ymm5, ymm5, ymm4
2115 vpand ymm5, ymm5, ymm0
2116 vmovdqu ymmword ptr [r15], ymm5
# r15 += 128 (sub of -128 fits an 8-bit immediate); rbx += 44 bytes = 11
# input dwords.
2117 sub r15, -128
2118 add rbx, 44
2119 add r8, -1
2120 jne .LBB0_36
2121 jmp .LBB0_147
# Case ecx == 27: expand 27-bit fields into one dword each.
# ecx == 26 is routed to .LBB0_105; anything other than 27 that reaches here
# leaves through .LBB0_147.
# NOTE(review): ecx looks like the packed bit width and edx the element
# count; both are set up outside this chunk -- confirm against the prologue.
2122.LBB0_79:
2123 cmp ecx, 26
2124 je .LBB0_105
2125# %bb.80:
2126 cmp ecx, 27
2127 jne .LBB0_147
2128# %bb.81:
# Signed compare: with edx below 32 the loop is skipped entirely.
2129 cmp edx, 32
2130 jl .LBB0_147
2131# %bb.82:
# r8d = loop trip count taken from r14d (computed outside this chunk).
2132 mov r8d, r14d
# Bias pointers: stores use [r15 - 96 .. r15], loads use [rbx - 104 .. rbx].
2133 add r15, 96
2134 add rbx, 104
# ymm0 = 0x7ffffff in every dword lane (field mask). ymm9/ymm4/ymm7/ymm8 are
# the per-lane right-shift counts of the four groups; (xmm10,xmm11) and
# (xmm5,xmm6) are right/left counts used to merge straddling lanes in
# vector form.
2135 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_25] # ymm0 = [576460748142673919,576460748142673919,576460748142673919,576460748142673919]
2136 vmovdqa ymm9, ymmword ptr [rip + .LCPI0_24] # ymm9 = [0,0,0,0,0,0,2,0]
2137 vmovdqa xmm10, xmmword ptr [rip + .LCPI0_26] # xmm10 = [24,19,14,9]
2138 vmovdqa xmm11, xmmword ptr [rip + .LCPI0_27] # xmm11 = [8,13,18,23]
2139 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_28] # ymm4 = [0,0,0,0,4,0,0,0]
2140 vmovdqa xmm5, xmmword ptr [rip + .LCPI0_29] # xmm5 = <16,11,u,u>
2141 vmovdqa xmm6, xmmword ptr [rip + .LCPI0_30] # xmm6 = <16,21,u,u>
2142 vmovdqa ymm7, ymmword ptr [rip + .LCPI0_31] # ymm7 = [0,0,0,1,0,0,0,0]
2143 vmovdqa ymm8, ymmword ptr [rip + .LCPI0_32] # ymm8 = [0,3,0,0,0,0,0,5]
2144 .p2align 4, 0x90
# Each iteration reads 27 dwords (w0..w26 at [rbx - 104] .. [rbx]) and
# stores 32 dwords. Most lanes straddle two dwords and are pre-merged with
# shld/shrd or a vector (lo >> r) | (hi << l) sequence; the few lanes that
# fit in one dword are aligned by the non-zero vpsrlvd counts.
2145.LBB0_83: # =>This Inner Loop Header: Depth=1
# Group 0 -> [r15 - 96], sources w0..w6 ([rbx - 104] .. [rbx - 80]).
# Lane 0 is w0; lane 6 is the raw w5 shifted right by 2; all other lanes
# are shld merges.
2146 mov r10d, dword ptr [rbx - 84]
2147 mov r9d, dword ptr [rbx - 80]
2148 shld r9d, r10d, 3
2149 mov esi, dword ptr [rbx - 88]
2150 mov edi, r10d
2151 shld edi, esi, 25
2152 mov eax, dword ptr [rbx - 92]
2153 shld esi, eax, 20
2154 mov edx, dword ptr [rbx - 96]
2155 shld eax, edx, 15
2156 mov r11d, dword ptr [rbx - 104]
2157 mov ecx, dword ptr [rbx - 100]
2158 shld edx, ecx, 10
2159 shld ecx, r11d, 5
2160 vmovd xmm1, r11d
2161 vmovd xmm2, esi
2162 vpinsrd xmm1, xmm1, ecx, 1
2163 vpinsrd xmm2, xmm2, edi, 1
2164 vpinsrd xmm1, xmm1, edx, 2
2165 vpinsrd xmm2, xmm2, r10d, 2
2166 vpinsrd xmm1, xmm1, eax, 3
2167 vpinsrd xmm2, xmm2, r9d, 3
2168 vinserti128 ymm1, ymm1, xmm2, 1
2169 vpsrlvd ymm1, ymm1, ymm9
2170 vpand ymm1, ymm1, ymm0
2171 vmovdqu ymmword ptr [r15 - 96], ymm1
# Group 1 -> [r15 - 64], sources w6..w13 ([rbx - 80] .. [rbx - 52]).
# Lanes 0-3 are merged in vector form from {w6..w9} and {w7..w10}; lane 4
# is the raw w10 shifted right by 4; lanes 5-7 are shld merges.
2172 mov eax, dword ptr [rbx - 56]
2173 mov ecx, dword ptr [rbx - 52]
2174 shld ecx, eax, 11
2175 mov edx, dword ptr [rbx - 60]
2176 mov esi, dword ptr [rbx - 64]
2177 shld eax, edx, 6
2178 shld edx, esi, 1
2179 vmovdqu xmm1, xmmword ptr [rbx - 80]
2180 vpsrlvd xmm2, xmm1, xmm10
2181 vpshufd xmm1, xmm1, 249 # xmm1 = xmm1[1,2,3,3]
2182 vmovd xmm3, esi
2183 vpinsrd xmm1, xmm1, esi, 3
2184 vpinsrd xmm3, xmm3, edx, 1
2185 vpinsrd xmm3, xmm3, eax, 2
2186 vpsllvd xmm1, xmm1, xmm11
2187 vpinsrd xmm3, xmm3, ecx, 3
2188 vpor xmm1, xmm2, xmm1
2189 vinserti128 ymm1, ymm1, xmm3, 1
2190 vpsrlvd ymm1, ymm1, ymm4
2191 vpand ymm1, ymm1, ymm0
2192 vmovdqu ymmword ptr [r15 - 64], ymm1
# Group 2 -> [r15 - 32], sources w13..w20 ([rbx - 52] .. [rbx - 24]).
# Lanes 0-1 are merged in vector form from {w13,w14} and {w14,w15}; lane 3
# is the raw w16 shifted right by 1; the rest are shld merges.
2193 mov eax, dword ptr [rbx - 28]
2194 mov r9d, dword ptr [rbx - 24]
2195 shld r9d, eax, 19
2196 mov edx, dword ptr [rbx - 32]
2197 shld eax, edx, 14
2198 mov esi, dword ptr [rbx - 36]
2199 shld edx, esi, 9
2200 mov r10d, dword ptr [rbx - 44]
2201 mov edi, dword ptr [rbx - 40]
2202 shld esi, edi, 4
2203 mov ecx, edi
2204 shld ecx, r10d, 26
2205 vmovq xmm1, qword ptr [rbx - 52] # xmm1 = mem[0],zero
2206 vpsrlvd xmm2, xmm1, xmm5
2207 vpshufd xmm1, xmm1, 229 # xmm1 = xmm1[1,1,2,3]
2208 vpinsrd xmm1, xmm1, r10d, 1
2209 vpsllvd xmm1, xmm1, xmm6
2210 vmovd xmm3, esi
2211 vpinsrd xmm3, xmm3, edx, 1
2212 vpor xmm1, xmm2, xmm1
2213 vpinsrd xmm2, xmm3, eax, 2
2214 vpinsrd xmm2, xmm2, r9d, 3
# Lanes 2 and 3 of xmm1 (undefined after the variable shifts) are replaced.
2215 vpinsrd xmm1, xmm1, ecx, 2
2216 vpinsrd xmm1, xmm1, edi, 3
2217 vinserti128 ymm1, ymm1, xmm2, 1
2218 vpsrlvd ymm1, ymm1, ymm7
2219 vpand ymm1, ymm1, ymm0
2220 vmovdqu ymmword ptr [r15 - 32], ymm1
# Group 3 -> [r15], sources w20..w26 ([rbx - 24] .. [rbx]).
# Lane 0 is merged with shrd ((w20 >> 8) | (w21 << 24)); lanes 1 and 7 are
# raw dwords shifted right by 3 and 5; the rest are shld merges.
2221 mov r9d, dword ptr [rbx]
2222 mov r11d, dword ptr [rbx - 4]
2223 mov r10d, r9d
2224 shld r10d, r11d, 22
2225 mov esi, dword ptr [rbx - 8]
2226 shld r11d, esi, 17
2227 mov edi, dword ptr [rbx - 12]
2228 mov eax, dword ptr [rbx - 16]
2229 shld esi, edi, 12
2230 mov edx, dword ptr [rbx - 24]
2231 mov ecx, dword ptr [rbx - 20]
2232 shld edi, eax, 7
2233 shrd edx, ecx, 8
2234 shld eax, ecx, 2
2235 vmovd xmm1, esi
2236 vpinsrd xmm1, xmm1, r11d, 1
2237 vmovd xmm2, edx
2238 vpinsrd xmm1, xmm1, r10d, 2
2239 vpinsrd xmm2, xmm2, ecx, 1
2240 vpinsrd xmm1, xmm1, r9d, 3
2241 vpinsrd xmm2, xmm2, eax, 2
2242 vpinsrd xmm2, xmm2, edi, 3
2243 vinserti128 ymm1, ymm2, xmm1, 1
2244 vpsrlvd ymm1, ymm1, ymm8
2245 vpand ymm1, ymm1, ymm0
2246 vmovdqu ymmword ptr [r15], ymm1
# r15 += 128 (sub of -128 fits an 8-bit immediate); rbx += 108 bytes = 27
# input dwords.
2247 sub r15, -128
2248 add rbx, 108
2249 add r8, -1
2250 jne .LBB0_83
2251 jmp .LBB0_147
# Case ecx == 7: expand 7-bit fields into one dword each.
# ecx == 6 is routed to .LBB0_135; anything other than 7 that reaches here
# leaves through .LBB0_147.
# NOTE(review): ecx looks like the packed bit width and edx the element
# count; both are set up outside this chunk -- confirm against the prologue.
2252.LBB0_20:
2253 cmp ecx, 6
2254 je .LBB0_135
2255# %bb.21:
2256 cmp ecx, 7
2257 jne .LBB0_147
2258# %bb.22:
# Signed compare: with edx below 32 the loop is skipped entirely.
2259 cmp edx, 32
2260 jl .LBB0_147
2261# %bb.23:
# r8d = loop trip count taken from r14d (computed outside this chunk).
2262 mov r8d, r14d
# Bias pointers: stores use [r15 - 96 .. r15], loads use [rbx - 24 .. rbx].
2263 add r15, 96
2264 add rbx, 24
# ymm0 = 0x7f in every dword lane (field mask); ymm1..ymm4 = per-lane
# right-shift counts for the four groups.
2265 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_112] # ymm0 = [545460846719,545460846719,545460846719,545460846719]
2266 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_111] # ymm1 = [0,7,14,21,0,3,10,17]
2267 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_113] # ymm2 = [24,0,6,13,20,0,2,9]
2268 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_114] # ymm3 = [16,23,0,5,12,19,0,1]
2269 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_115] # ymm4 = [8,15,22,0,4,11,18,25]
2270 .p2align 4, 0x90
# Each iteration reads 7 dwords (w0..w6 at [rbx - 24] .. [rbx]) and stores
# 32 dwords. Straddling lanes are pre-merged with shld and carry a zero
# shift count; the other lanes hold a raw dword aligned by vpsrlvd.
2271.LBB0_24: # =>This Inner Loop Header: Depth=1
# Group 0 -> [r15 - 96]: lanes 0-3 from w0, lane 4 straddles w0/w1
# ((w1 << 4) | (w0 >> 28)), lanes 5-7 from w1.
2272 mov ecx, dword ptr [rbx - 24]
2273 mov edx, dword ptr [rbx - 20]
2274 mov esi, edx
2275 shld esi, ecx, 4
2276 vmovd xmm5, ecx
2277 vmovd xmm6, esi
2278 vpinsrd xmm6, xmm6, edx, 1
2279 vpinsrd xmm6, xmm6, edx, 2
2280 vpinsrd xmm6, xmm6, edx, 3
2281 vpbroadcastd xmm5, xmm5
2282 vinserti128 ymm5, ymm5, xmm6, 1
2283 vpsrlvd ymm5, ymm5, ymm1
2284 vpand ymm5, ymm5, ymm0
2285 vmovdqu ymmword ptr [r15 - 96], ymm5
# Group 1 -> [r15 - 64], sources w1..w3 ([rbx - 20] .. [rbx - 12]).
# Lanes: w1>>24 | w2:w1 | w2>>6 | w2>>13 | w2>>20 | w3:w2 | w3>>2 | w3>>9.
2286 mov ecx, dword ptr [rbx - 12]
2287 mov edx, dword ptr [rbx - 20]
2288 mov esi, dword ptr [rbx - 16]
2289 mov edi, ecx
2290 shld edi, esi, 5
2291 mov eax, esi
2292 shld eax, edx, 1
2293 vmovd xmm5, esi
2294 vpinsrd xmm5, xmm5, edi, 1
2295 vpinsrd xmm5, xmm5, ecx, 2
2296 vpinsrd xmm5, xmm5, ecx, 3
2297 vmovd xmm6, edx
2298 vpinsrd xmm6, xmm6, eax, 1
2299 vpinsrd xmm6, xmm6, esi, 2
2300 vpinsrd xmm6, xmm6, esi, 3
2301 vinserti128 ymm5, ymm6, xmm5, 1
2302 vpsrlvd ymm5, ymm5, ymm2
2303 vpand ymm5, ymm5, ymm0
2304 vmovdqu ymmword ptr [r15 - 64], ymm5
# Group 2 -> [r15 - 32], sources w3..w5 ([rbx - 12] .. [rbx - 4]).
# Lanes: w3>>16 | w3>>23 | w4:w3 | w4>>5 | w4>>12 | w4>>19 | w5:w4 | w5>>1.
2305 mov eax, dword ptr [rbx - 4]
2306 mov ecx, dword ptr [rbx - 12]
2307 mov edx, dword ptr [rbx - 8]
2308 mov esi, eax
2309 shld esi, edx, 6
2310 mov edi, edx
2311 shld edi, ecx, 2
2312 vmovd xmm5, edx
2313 vpinsrd xmm5, xmm5, edx, 1
2314 vpinsrd xmm5, xmm5, esi, 2
2315 vpinsrd xmm5, xmm5, eax, 3
2316 vmovd xmm6, ecx
2317 vpinsrd xmm6, xmm6, ecx, 1
2318 vpinsrd xmm6, xmm6, edi, 2
2319 vpinsrd xmm6, xmm6, edx, 3
2320 vinserti128 ymm5, ymm6, xmm5, 1
2321 vpsrlvd ymm5, ymm5, ymm3
2322 vpand ymm5, ymm5, ymm0
2323 vmovdqu ymmword ptr [r15 - 32], ymm5
# Group 3 -> [r15]: lanes 0-2 from w5, lane 3 straddles w5/w6
# ((w6 << 3) | (w5 >> 29)), lanes 4-7 from w6.
2324 mov eax, dword ptr [rbx - 4]
2325 mov ecx, dword ptr [rbx]
2326 mov edx, ecx
2327 shld edx, eax, 3
2328 vmovd xmm5, ecx
2329 vmovd xmm6, eax
2330 vpinsrd xmm6, xmm6, eax, 1
2331 vpinsrd xmm6, xmm6, eax, 2
2332 vpinsrd xmm6, xmm6, edx, 3
2333 vpbroadcastd xmm5, xmm5
2334 vinserti128 ymm5, ymm6, xmm5, 1
2335 vpsrlvd ymm5, ymm5, ymm4
2336 vpand ymm5, ymm5, ymm0
2337 vmovdqu ymmword ptr [r15], ymm5
# r15 += 128 (sub of -128 fits an 8-bit immediate); rbx += 28 bytes = 7
# input dwords.
2338 sub r15, -128
2339 add rbx, 28
2340 add r8, -1
2341 jne .LBB0_24
2342 jmp .LBB0_147
# Case ecx == 23: expand 23-bit fields into one dword each.
# ecx == 22 is routed to .LBB0_111; anything other than 23 that reaches here
# leaves through .LBB0_147.
# NOTE(review): ecx looks like the packed bit width and edx the element
# count; both are set up outside this chunk -- confirm against the prologue.
2343.LBB0_67:
2344 cmp ecx, 22
2345 je .LBB0_111
2346# %bb.68:
2347 cmp ecx, 23
2348 jne .LBB0_147
2349# %bb.69:
# Signed compare: with edx below 32 the loop is skipped entirely.
2350 cmp edx, 32
2351 jl .LBB0_147
2352# %bb.70:
# r8d = loop trip count taken from r14d (computed outside this chunk).
2353 mov r8d, r14d
# Bias pointers: stores use [r15 - 96 .. r15], loads use [rbx - 88 .. rbx].
2354 add r15, 96
2355 add rbx, 88
# ymm1 = 0x7fffff in every dword lane (field mask). ymm8/ymm4/ymm5/ymm6 are
# the per-lane right-shift counts of the four groups; xmm2/xmm3 are the
# right/left counts used to merge two straddling lanes in vector form
# (only their low two lanes are defined).
2356 vmovdqa ymm8, ymmword ptr [rip + .LCPI0_48] # ymm8 = [0,0,0,5,0,0,0,1]
2357 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_49] # ymm1 = [36028792732385279,36028792732385279,36028792732385279,36028792732385279]
2358 vmovdqa xmm2, xmmword ptr [rip + .LCPI0_50] # xmm2 = <24,15,u,u>
2359 vmovdqa xmm3, xmmword ptr [rip + .LCPI0_51] # xmm3 = <8,17,u,u>
2360 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_52] # ymm4 = [0,0,6,0,0,0,2,0]
2361 vmovdqa ymm5, ymmword ptr [rip + .LCPI0_53] # ymm5 = [0,7,0,0,0,3,0,0]
2362 vmovdqa ymm6, ymmword ptr [rip + .LCPI0_54] # ymm6 = [8,0,0,0,4,0,0,9]
2363 .p2align 4, 0x90
# Each iteration reads 23 dwords (w0..w22 at [rbx - 88] .. [rbx]) and stores
# 32 dwords. Straddling lanes are pre-merged with shld/shrd (or in vector
# form) and carry a zero shift count; the other lanes hold a raw dword
# aligned by vpsrlvd.
2364.LBB0_71: # =>This Inner Loop Header: Depth=1
# Group 0 -> [r15 - 96], sources w0..w5 ([rbx - 88] .. [rbx - 68]).
# Lanes: w0 | w1:w0 | w2:w1 | w2>>5 | w3:w2 | w4:w3 | w5:w4 | w5>>1.
2365 mov r9d, dword ptr [rbx - 68]
2366 mov edx, dword ptr [rbx - 72]
2367 mov r11d, r9d
2368 shld r11d, edx, 22
2369 mov edi, dword ptr [rbx - 76]
2370 shld edx, edi, 13
2371 mov esi, dword ptr [rbx - 80]
2372 shld edi, esi, 4
2373 mov r10d, dword ptr [rbx - 88]
2374 mov ecx, dword ptr [rbx - 84]
2375 mov eax, esi
2376 shld eax, ecx, 18
2377 shld ecx, r10d, 9
2378 vmovd xmm7, r10d
2379 vmovd xmm0, edi
2380 vpinsrd xmm7, xmm7, ecx, 1
2381 vpinsrd xmm0, xmm0, edx, 1
2382 vpinsrd xmm7, xmm7, eax, 2
2383 vpinsrd xmm0, xmm0, r11d, 2
2384 vpinsrd xmm7, xmm7, esi, 3
2385 vpinsrd xmm0, xmm0, r9d, 3
2386 vinserti128 ymm0, ymm7, xmm0, 1
2387 vpsrlvd ymm0, ymm0, ymm8
2388 vpand ymm0, ymm0, ymm1
2389 vmovdqu ymmword ptr [r15 - 96], ymm0
# Group 1 -> [r15 - 64], sources w5..w11 ([rbx - 68] .. [rbx - 44]).
# Lanes 0-1 are merged in vector form from {w5,w6} and {w6,w7}; lanes 2
# and 6 are raw dwords shifted right by 6 and 2; the rest are shld merges.
2390 mov eax, dword ptr [rbx - 48]
2391 mov r9d, dword ptr [rbx - 44]
2392 shld r9d, eax, 7
2393 mov edx, dword ptr [rbx - 52]
2394 mov esi, eax
2395 shld esi, edx, 21
2396 mov edi, dword ptr [rbx - 60]
2397 mov ecx, dword ptr [rbx - 56]
2398 shld edx, ecx, 12
2399 shld ecx, edi, 3
2400 vmovq xmm0, qword ptr [rbx - 68] # xmm0 = mem[0],zero
2401 vpsrlvd xmm7, xmm0, xmm2
2402 vpshufd xmm0, xmm0, 229 # xmm0 = xmm0[1,1,2,3]
2403 vpinsrd xmm0, xmm0, edi, 1
2404 vpsllvd xmm0, xmm0, xmm3
2405 vpor xmm0, xmm7, xmm0
2406 vmovd xmm7, edx
2407 vpinsrd xmm7, xmm7, esi, 1
2408 vpinsrd xmm7, xmm7, eax, 2
2409 vpinsrd xmm7, xmm7, r9d, 3
# Lanes 2 and 3 of xmm0 (undefined after the variable shifts) are replaced.
2410 vpinsrd xmm0, xmm0, edi, 2
2411 vpinsrd xmm0, xmm0, ecx, 3
2412 vinserti128 ymm0, ymm0, xmm7, 1
2413 vpsrlvd ymm0, ymm0, ymm4
2414 vpand ymm0, ymm0, ymm1
2415 vmovdqu ymmword ptr [r15 - 64], ymm0
# Group 2 -> [r15 - 32], sources w11..w17 ([rbx - 44] .. [rbx - 20]).
# Lane 0 is merged with shrd ((w11 >> 16) | (w12 << 16)); lanes 1 and 5
# are raw dwords shifted right by 7 and 3; the rest are shld merges.
2416 mov r11d, dword ptr [rbx - 24]
2417 mov r9d, dword ptr [rbx - 20]
2418 shld r9d, r11d, 15
2419 mov r10d, dword ptr [rbx - 28]
2420 shld r11d, r10d, 6
2421 mov esi, dword ptr [rbx - 32]
2422 mov edi, r10d
2423 mov ecx, dword ptr [rbx - 36]
2424 shld edi, esi, 20
2425 mov edx, dword ptr [rbx - 44]
2426 mov eax, dword ptr [rbx - 40]
2427 shld esi, ecx, 11
2428 shrd edx, eax, 16
2429 shld ecx, eax, 2
2430 vmovd xmm0, edi
2431 vpinsrd xmm0, xmm0, r10d, 1
2432 vmovd xmm7, edx
2433 vpinsrd xmm0, xmm0, r11d, 2
2434 vpinsrd xmm7, xmm7, eax, 1
2435 vpinsrd xmm0, xmm0, r9d, 3
2436 vpinsrd xmm7, xmm7, ecx, 2
2437 vpinsrd xmm7, xmm7, esi, 3
2438 vinserti128 ymm0, ymm7, xmm0, 1
2439 vpsrlvd ymm0, ymm0, ymm5
2440 vpand ymm0, ymm0, ymm1
2441 vmovdqu ymmword ptr [r15 - 32], ymm0
# Group 3 -> [r15], sources w17..w22 ([rbx - 20] .. [rbx]).
# Lanes 0, 4 and 7 are raw dwords shifted right by 8, 4 and 9; the rest
# are shld merges.
2442 mov r9d, dword ptr [rbx]
2443 mov ecx, dword ptr [rbx - 4]
2444 mov edx, r9d
2445 shld edx, ecx, 14
2446 mov esi, dword ptr [rbx - 8]
2447 shld ecx, esi, 5
2448 mov edi, dword ptr [rbx - 12]
# Capture the raw w20 in xmm0 lane 0 before shld overwrites esi.
2449 vmovd xmm0, esi
2450 shld esi, edi, 19
2451 mov r10d, dword ptr [rbx - 20]
2452 mov eax, dword ptr [rbx - 16]
2453 shld edi, eax, 10
2454 shld eax, r10d, 1
2455 vpinsrd xmm0, xmm0, ecx, 1
2456 vmovd xmm7, r10d
2457 vpinsrd xmm0, xmm0, edx, 2
2458 vpinsrd xmm7, xmm7, eax, 1
2459 vpinsrd xmm0, xmm0, r9d, 3
2460 vpinsrd xmm7, xmm7, edi, 2
2461 vpinsrd xmm7, xmm7, esi, 3
2462 vinserti128 ymm0, ymm7, xmm0, 1
2463 vpsrlvd ymm0, ymm0, ymm6
2464 vpand ymm0, ymm0, ymm1
2465 vmovdqu ymmword ptr [r15], ymm0
# r15 += 128 (sub of -128 fits an 8-bit immediate); rbx += 92 bytes = 23
# input dwords.
2466 sub r15, -128
2467 add rbx, 92
2468 add r8, -1
2469 jne .LBB0_71
2470 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_43: second-level dispatch on ecx, followed by the 15-bit loop.
#   ecx == 14 -> .LBB0_123
#   ecx == 15 -> .LBB0_47 below (skipped when edx < 32, signed compare)
#   otherwise -> .LBB0_147, the label every loop in this region exits to
# NOTE(review): ecx looks like the bit width and edx the value count, with
# r14d holding the number of 32-value groups; all three are set up outside
# this chunk -- confirm against the function prologue.
#
# Loop .LBB0_47, per iteration:
#   reads  15 dwords (60 bytes) at [rbx-56 .. rbx+3]   (rbx pre-biased by 56)
#   writes 32 dwords (128 bytes) at [r15-96 .. r15+31] (r15 pre-biased by 96)
#   r8   = iteration counter (zero-extended r14d)
#   ymm0 = per-dword mask 0x7fff
#   ymm1..ymm4 = per-lane right-shift counts for the four 8-lane groups
# Technique: a field that straddles two input dwords is first aligned with
# shld/shrd (shld hi, lo, n == (hi << n) | (lo >> (32 - n))); fields wholly
# inside one dword are shifted down by vpsrlvd; vpand then keeps 15 bits.
# ---------------------------------------------------------------------
2471.LBB0_43:
2472 cmp ecx, 14
2473 je .LBB0_123
2474# %bb.44:
2475 cmp ecx, 15
2476 jne .LBB0_147
2477# %bb.45:
2478 cmp edx, 32
2479 jl .LBB0_147
2480# %bb.46:
# Loop setup: bias the pointers and load the loop-invariant constants.
2481 mov r8d, r14d
2482 add r15, 96
2483 add rbx, 56
2484 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_82] # ymm0 = [140733193420799,140733193420799,140733193420799,140733193420799]
2485 vmovdqa ymm1, ymmword ptr [rip + .LCPI0_81] # ymm1 = [0,15,0,13,0,11,0,9]
2486 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_83] # ymm2 = [0,7,0,5,0,3,0,1]
2487 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_84] # ymm3 = [16,0,14,0,12,0,10,0]
2488 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_85] # ymm4 = [8,0,6,0,4,0,2,17]
2489 .p2align 4, 0x90
2490.LBB0_47: # =>This Inner Loop Header: Depth=1
# Group 0: input dwords at rbx-56 .. rbx-44 -> first 32 output bytes.
2491 mov r9d, dword ptr [rbx - 44]
2492 mov eax, dword ptr [rbx - 48]
2493 mov esi, r9d
2494 shld esi, eax, 6
2495 mov r10d, dword ptr [rbx - 52]
2496 mov edx, eax
2497 shld edx, r10d, 4
2498 mov ecx, dword ptr [rbx - 56]
2499 mov edi, r10d
2500 shld edi, ecx, 2
# xmm5 becomes the upper four lanes, xmm6 the lower four lanes.
2501 vmovd xmm5, edx
2502 vpinsrd xmm5, xmm5, eax, 1
2503 vpinsrd xmm5, xmm5, esi, 2
2504 vpinsrd xmm5, xmm5, r9d, 3
2505 vmovd xmm6, ecx
2506 vpinsrd xmm6, xmm6, ecx, 1
2507 vpinsrd xmm6, xmm6, edi, 2
2508 vpinsrd xmm6, xmm6, r10d, 3
2509 vinserti128 ymm5, ymm6, xmm5, 1
2510 vpsrlvd ymm5, ymm5, ymm1
2511 vpand ymm5, ymm5, ymm0
2512 vmovdqu ymmword ptr [r15 - 96], ymm5
# Group 1: input dwords at rbx-44 .. rbx-28 -> second 32 output bytes.
2513 mov r9d, dword ptr [rbx - 28]
2514 mov r11d, dword ptr [rbx - 32]
2515 mov edx, r9d
2516 shld edx, r11d, 14
2517 mov r10d, dword ptr [rbx - 36]
2518 mov edi, r11d
2519 shld edi, r10d, 12
2520 mov eax, dword ptr [rbx - 44]
2521 mov esi, dword ptr [rbx - 40]
2522 mov ecx, r10d
2523 shld ecx, esi, 10
# shrd: eax = (eax >> 24) | (esi << 8), a field crossing the dword boundary.
2524 shrd eax, esi, 24
2525 vmovd xmm5, edi
2526 vpinsrd xmm5, xmm5, r11d, 1
2527 vpinsrd xmm5, xmm5, edx, 2
2528 vpinsrd xmm5, xmm5, r9d, 3
2529 vmovd xmm6, eax
2530 vpinsrd xmm6, xmm6, esi, 1
2531 vpinsrd xmm6, xmm6, ecx, 2
2532 vpinsrd xmm6, xmm6, r10d, 3
2533 vinserti128 ymm5, ymm6, xmm5, 1
2534 vpsrlvd ymm5, ymm5, ymm2
2535 vpand ymm5, ymm5, ymm0
2536 vmovdqu ymmword ptr [r15 - 64], ymm5
# Group 2: input dwords at rbx-28 .. rbx-12 -> third 32 output bytes.
2537 mov eax, dword ptr [rbx - 16]
2538 mov r10d, dword ptr [rbx - 12]
2539 shld r10d, eax, 7
2540 mov edx, dword ptr [rbx - 20]
2541 mov esi, eax
2542 shld esi, edx, 5
2543 mov r9d, dword ptr [rbx - 28]
2544 mov ecx, dword ptr [rbx - 24]
2545 mov edi, ecx
2546 shld edi, r9d, 1
# The raw dword in edx is captured in xmm5 before edx is overwritten by shld.
2547 vmovd xmm5, edx
2548 shld edx, ecx, 3
2549 vpinsrd xmm5, xmm5, esi, 1
2550 vpinsrd xmm5, xmm5, eax, 2
2551 vpinsrd xmm5, xmm5, r10d, 3
2552 vmovd xmm6, r9d
2553 vpinsrd xmm6, xmm6, edi, 1
2554 vpinsrd xmm6, xmm6, ecx, 2
2555 vpinsrd xmm6, xmm6, edx, 3
2556 vinserti128 ymm5, ymm6, xmm5, 1
2557 vpsrlvd ymm5, ymm5, ymm3
2558 vpand ymm5, ymm5, ymm0
2559 vmovdqu ymmword ptr [r15 - 32], ymm5
# Group 3: input dwords at rbx-12 .. rbx -> last 32 output bytes.
2560 mov r9d, dword ptr [rbx]
2561 mov ecx, dword ptr [rbx - 4]
2562 mov edx, r9d
2563 shld edx, ecx, 13
2564 mov eax, dword ptr [rbx - 8]
# The raw dword in ecx is captured in xmm5 before ecx is overwritten by shld.
2565 vmovd xmm5, ecx
2566 shld ecx, eax, 11
2567 mov edi, dword ptr [rbx - 12]
2568 mov esi, eax
2569 shld esi, edi, 9
2570 vmovd xmm6, edi
2571 vpinsrd xmm6, xmm6, esi, 1
2572 vpinsrd xmm6, xmm6, eax, 2
2573 vpinsrd xmm6, xmm6, ecx, 3
2574 vpinsrd xmm5, xmm5, edx, 1
2575 vpinsrd xmm5, xmm5, r9d, 2
2576 vpinsrd xmm5, xmm5, r9d, 3
2577 vinserti128 ymm5, ymm6, xmm5, 1
2578 vpsrlvd ymm5, ymm5, ymm4
2579 vpand ymm5, ymm5, ymm0
2580 vmovdqu ymmword ptr [r15], ymm5
# Advance: output += 128 bytes (written as sub of -128), input += 60 bytes.
2581 sub r15, -128
2582 add rbx, 60
2583 add r8, -1
2584 jne .LBB0_47
2585 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_96: 31-bit loop (mask 0x7fffffff per dword, 31 input dwords per
# 32 outputs). Skipped entirely when edx < 32 (signed compare).
# NOTE(review): edx is presumably the value count and r14d the number of
# 32-value groups; both are set up outside this chunk -- confirm.
#
# Loop .LBB0_98, per iteration:
#   reads  31 dwords (124 bytes) at [rbx .. rbx+123]   (rbx is not biased)
#   writes 32 dwords (128 bytes) at [r15-96 .. r15+31] (r15 pre-biased by 96)
#   r8   = iteration counter (zero-extended r14d)
#   ymm0 = per-dword mask 0x7fffffff
#   ymm8/ymm9, ymm10/ymm4, xmm5/xmm6 = paired right/left shift counts whose
#   per-lane sum is 32, used to merge a dword with its successor
#   ymm7 = right-shift counts for the final group
# ---------------------------------------------------------------------
2586.LBB0_96:
2587 cmp edx, 32
2588 jl .LBB0_147
2589# %bb.97:
2590 mov r8d, r14d
2591 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_0] # ymm0 = [9223372034707292159,9223372034707292159,9223372034707292159,9223372034707292159]
2592 add r15, 96
2593 vmovdqa ymm8, ymmword ptr [rip + .LCPI0_1] # ymm8 = [24,23,22,21,20,19,18,17]
2594 vmovdqa ymm9, ymmword ptr [rip + .LCPI0_2] # ymm9 = [8,9,10,11,12,13,14,15]
2595 vmovdqa ymm10, ymmword ptr [rip + .LCPI0_3] # ymm10 = [16,15,14,13,12,11,10,9]
2596 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_4] # ymm4 = [16,17,18,19,20,21,22,23]
2597 vmovdqa xmm5, xmmword ptr [rip + .LCPI0_5] # xmm5 = [8,7,6,5]
2598 vmovdqa xmm6, xmmword ptr [rip + .LCPI0_6] # xmm6 = [24,25,26,27]
2599 vmovdqa ymm7, ymmword ptr [rip + .LCPI0_7] # ymm7 = [0,0,0,0,0,0,0,1]
2600 .p2align 4, 0x90
2601.LBB0_98: # =>This Inner Loop Header: Depth=1
# Group 0: scalar path over dwords rbx+0 .. rbx+28. Each shld merges a dword
# with its predecessor (shift counts 1..7); no vector shift is needed here,
# only the final mask.
2602 mov r10d, dword ptr [rbx + 24]
2603 mov r9d, dword ptr [rbx + 28]
2604 shld r9d, r10d, 7
2605 mov esi, dword ptr [rbx + 20]
2606 shld r10d, esi, 6
2607 mov edi, dword ptr [rbx + 16]
2608 shld esi, edi, 5
2609 mov eax, dword ptr [rbx + 12]
2610 shld edi, eax, 4
2611 mov edx, dword ptr [rbx + 8]
2612 shld eax, edx, 3
2613 mov ecx, dword ptr [rbx + 4]
2614 shld edx, ecx, 2
2615 mov r11d, dword ptr [rbx]
2616 shld ecx, r11d, 1
2617 vmovd xmm1, edi
2618 vpinsrd xmm1, xmm1, esi, 1
2619 vpinsrd xmm1, xmm1, r10d, 2
2620 vpinsrd xmm1, xmm1, r9d, 3
2621 vmovd xmm2, r11d
2622 vpinsrd xmm2, xmm2, ecx, 1
2623 vpinsrd xmm2, xmm2, edx, 2
2624 vpinsrd xmm2, xmm2, eax, 3
2625 vinserti128 ymm1, ymm2, xmm1, 1
2626 vpand ymm1, ymm1, ymm0
2627 vmovdqu ymmword ptr [r15 - 96], ymm1
# Group 1: vector path. ymm1 = dwords rbx+28..rbx+56 shifted right (ymm8);
# ymm2 = the same window advanced by one dword (rbx+32..rbx+60) shifted left
# (ymm9); OR-ing the two rebuilds each straddling field.
2628 vmovdqu ymm1, ymmword ptr [rbx + 28]
2629 vpsrlvd ymm1, ymm1, ymm8
2630 vmovdqu xmm2, xmmword ptr [rbx + 44]
2631 vpshufd xmm3, xmm2, 249 # xmm3 = xmm2[1,2,3,3]
2632 vpinsrd xmm3, xmm3, dword ptr [rbx + 60], 3
2633 vpalignr xmm2, xmm2, xmmword ptr [rbx + 28], 4 # xmm2 = mem[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
2634 vinserti128 ymm2, ymm2, xmm3, 1
2635 vpsllvd ymm2, ymm2, ymm9
2636 vpor ymm1, ymm1, ymm2
2637 vpand ymm1, ymm1, ymm0
2638 vmovdqu ymmword ptr [r15 - 64], ymm1
# Group 2: same scheme over dwords rbx+60..rbx+92 with shift pair ymm10/ymm4.
2639 vmovdqu ymm1, ymmword ptr [rbx + 60]
2640 vmovdqu xmm2, xmmword ptr [rbx + 76]
2641 vpshufd xmm3, xmm2, 249 # xmm3 = xmm2[1,2,3,3]
2642 vpinsrd xmm3, xmm3, dword ptr [rbx + 92], 3
2643 vpsrlvd ymm1, ymm1, ymm10
2644 vpalignr xmm2, xmm2, xmmword ptr [rbx + 60], 4 # xmm2 = mem[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
2645 vinserti128 ymm2, ymm2, xmm3, 1
2646 vpsllvd ymm2, ymm2, ymm4
2647 vpor ymm1, ymm1, ymm2
2648 vpand ymm1, ymm1, ymm0
2649 vmovdqu ymmword ptr [r15 - 32], ymm1
# Group 3: lower four lanes use the vector shift pair xmm5/xmm6 over dwords
# rbx+92..rbx+108; upper four lanes are built with scalar shld from
# rbx+108..rbx+120, then the whole vector is shifted by ymm7.
2650 mov eax, dword ptr [rbx + 120]
2651 mov ecx, dword ptr [rbx + 116]
2652 mov edx, eax
2653 shld edx, ecx, 30
2654 mov esi, dword ptr [rbx + 112]
2655 shld ecx, esi, 29
2656 mov edi, dword ptr [rbx + 108]
2657 shld esi, edi, 28
2658 vmovdqu xmm1, xmmword ptr [rbx + 92]
2659 vpsrlvd xmm2, xmm1, xmm5
2660 vpshufd xmm1, xmm1, 249 # xmm1 = xmm1[1,2,3,3]
2661 vpinsrd xmm1, xmm1, edi, 3
2662 vpsllvd xmm1, xmm1, xmm6
2663 vmovd xmm3, esi
2664 vpinsrd xmm3, xmm3, ecx, 1
2665 vpinsrd xmm3, xmm3, edx, 2
2666 vpinsrd xmm3, xmm3, eax, 3
2667 vpor xmm1, xmm2, xmm1
2668 vinserti128 ymm1, ymm1, xmm3, 1
2669 vpsrlvd ymm1, ymm1, ymm7
2670 vpand ymm1, ymm1, ymm0
2671 vmovdqu ymmword ptr [r15], ymm1
# Advance: input += 124 bytes, output += 128 bytes (written as sub of -128).
2672 add rbx, 124
2673 sub r15, -128
2674 add r8, -1
2675 jne .LBB0_98
2676 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_144: zero-fill loop. Skipped when edx < 32 (signed compare).
# Each iteration calls memset(r15, 0, 128) and advances r15 by 128 bytes;
# rbx (zero-extended r14d) is the iteration counter. Both rbx and r15 are
# callee-saved in the System V AMD64 ABI, so they survive the call.
# NOTE(review): presumably the bit-width-0 case, where every output is 0;
# the dispatch that reaches this label is outside this chunk -- confirm.
# NOTE(review): the call operand uses Go-assembler symbol syntax
# (`clib·_memset(SB)`) although the file is otherwise GAS Intel syntax;
# presumably rewritten by a C-to-Go-asm conversion step -- confirm.
# ---------------------------------------------------------------------
2677.LBB0_144:
2678 cmp edx, 32
2679 jl .LBB0_147
2680# %bb.145:
2681 mov ebx, r14d
2682 .p2align 4, 0x90
2683.LBB0_146: # =>This Inner Loop Header: Depth=1
# Arguments: rdi = destination (r15), esi = fill byte 0, edx = length 128.
2684 mov edx, 128
2685 mov rdi, r15
2686 xor esi, esi
2687 call clib·_memset(SB)
# Advance the output by 128 bytes (written as sub of -128).
2688 sub r15, -128
2689 add rbx, -1
2690 jne .LBB0_146
2691 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_120: 16-bit loop. Skipped when edx < 32 (signed compare).
# NOTE(review): edx is presumably the value count and r14d the number of
# 32-value groups; both are set up outside this chunk -- confirm.
#
# Loop .LBB0_122, per iteration:
#   reads  16 dwords (64 bytes) at [rbx+rcx .. rbx+rcx+63]
#   writes 32 dwords (128 bytes) at [r15+2*rcx .. r15+2*rcx+127]
#   rax  = iteration counter (zero-extended r14d)
#   rcx  = input byte offset, +64 per iteration (output offset is 2*rcx)
#   ymm0 = shift counts [0,16] repeated per qword (0x0000001000000000)
#   ymm1 = zero
# Each of the four steps loads 4 dwords, duplicates each dword into two
# adjacent lanes (vpermq + vpshufd), shifts the second copy right by 16, and
# uses vpblendw to zero the upper 16 bits of every lane.
# ---------------------------------------------------------------------
2692.LBB0_120:
2693 cmp edx, 32
2694 jl .LBB0_147
2695# %bb.121:
2696 mov eax, r14d
2697 xor ecx, ecx
2698 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_80] # ymm0 = [68719476736,68719476736,68719476736,68719476736]
2699 vpxor xmm1, xmm1, xmm1
2700 .p2align 4, 0x90
2701.LBB0_122: # =>This Inner Loop Header: Depth=1
# Step 0: input bytes 0..15 -> output bytes 0..31.
2702 vmovdqu xmm2, xmmword ptr [rbx + rcx]
2703 vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3]
2704 vpshufd ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1,4,4,5,5]
2705 vpsrlvd ymm2, ymm2, ymm0
2706 vpblendw ymm2, ymm2, ymm1, 170 # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
2707 vmovdqu ymmword ptr [r15 + 2*rcx], ymm2
# Step 1: input bytes 16..31 -> output bytes 32..63.
2708 vmovdqu xmm2, xmmword ptr [rbx + rcx + 16]
2709 vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3]
2710 vpshufd ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1,4,4,5,5]
2711 vpsrlvd ymm2, ymm2, ymm0
2712 vpblendw ymm2, ymm2, ymm1, 170 # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
2713 vmovdqu ymmword ptr [r15 + 2*rcx + 32], ymm2
# Step 2: input bytes 32..47 -> output bytes 64..95.
2714 vmovdqu xmm2, xmmword ptr [rbx + rcx + 32]
2715 vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3]
2716 vpshufd ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1,4,4,5,5]
2717 vpsrlvd ymm2, ymm2, ymm0
2718 vpblendw ymm2, ymm2, ymm1, 170 # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
2719 vmovdqu ymmword ptr [r15 + 2*rcx + 64], ymm2
# Step 3: input bytes 48..63 -> output bytes 96..127.
2720 vmovdqu xmm2, xmmword ptr [rbx + rcx + 48]
2721 vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3]
2722 vpshufd ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1,4,4,5,5]
2723 vpsrlvd ymm2, ymm2, ymm0
2724 vpblendw ymm2, ymm2, ymm1, 170 # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
2725 vmovdqu ymmword ptr [r15 + 2*rcx + 96], ymm2
2726 add rcx, 64
2727 add rax, -1
2728 jne .LBB0_122
2729 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_132: 8-bit loop. Skipped when edx < 32 (signed compare).
# NOTE(review): edx is presumably the value count and r14d the number of
# 32-value groups; both are set up outside this chunk -- confirm.
#
# Loop .LBB0_134, per iteration:
#   reads  8 dwords (32 bytes) at [rbx+rcx .. rbx+rcx+31]
#   writes 32 dwords (128 bytes) at [r15+4*rcx .. r15+4*rcx+127]
#   rax  = iteration counter (zero-extended r14d)
#   rcx  = input byte offset, +32 per iteration (output offset is 4*rcx)
#   ymm0 = shift counts [0,8,16,24] in both 128-bit halves
#   ymm1 = per-dword mask 255
# Each of the four steps loads 2 dwords, replicates each into four lanes
# (vpshufd + vpermq), shifts each copy to bring one byte to bit 0, and masks.
# ---------------------------------------------------------------------
2730.LBB0_132:
2731 cmp edx, 32
2732 jl .LBB0_147
2733# %bb.133:
2734 mov eax, r14d
2735 xor ecx, ecx
2736 vbroadcasti128 ymm0, xmmword ptr [rip + .LCPI0_109] # ymm0 = [0,8,16,24,0,8,16,24]
2737 # ymm0 = mem[0,1,0,1]
2738 vpbroadcastd ymm1, dword ptr [rip + .LCPI0_110] # ymm1 = [255,255,255,255,255,255,255,255]
2739 .p2align 4, 0x90
2740.LBB0_134: # =>This Inner Loop Header: Depth=1
# Step 0: input bytes 0..7 -> output bytes 0..31.
2741 vmovq xmm2, qword ptr [rbx + rcx] # xmm2 = mem[0],zero
2742 vpshufd xmm2, xmm2, 80 # xmm2 = xmm2[0,0,1,1]
2743 vpermq ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1]
2744 vpsrlvd ymm2, ymm2, ymm0
2745 vpand ymm2, ymm2, ymm1
2746 vmovdqu ymmword ptr [r15 + 4*rcx], ymm2
# Step 1: input bytes 8..15 -> output bytes 32..63.
2747 vmovq xmm2, qword ptr [rbx + rcx + 8] # xmm2 = mem[0],zero
2748 vpshufd xmm2, xmm2, 80 # xmm2 = xmm2[0,0,1,1]
2749 vpermq ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1]
2750 vpsrlvd ymm2, ymm2, ymm0
2751 vpand ymm2, ymm2, ymm1
2752 vmovdqu ymmword ptr [r15 + 4*rcx + 32], ymm2
# Step 2: input bytes 16..23 -> output bytes 64..95.
2753 vmovq xmm2, qword ptr [rbx + rcx + 16] # xmm2 = mem[0],zero
2754 vpshufd xmm2, xmm2, 80 # xmm2 = xmm2[0,0,1,1]
2755 vpermq ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1]
2756 vpsrlvd ymm2, ymm2, ymm0
2757 vpand ymm2, ymm2, ymm1
2758 vmovdqu ymmword ptr [r15 + 4*rcx + 64], ymm2
# Step 3: input bytes 24..31 -> output bytes 96..127.
2759 vmovq xmm2, qword ptr [rbx + rcx + 24] # xmm2 = mem[0],zero
2760 vpshufd xmm2, xmm2, 80 # xmm2 = xmm2[0,0,1,1]
2761 vpermq ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1]
2762 vpsrlvd ymm2, ymm2, ymm0
2763 vpand ymm2, ymm2, ymm1
2764 vmovdqu ymmword ptr [r15 + 4*rcx + 96], ymm2
2765 add rcx, 32
2766 add rax, -1
2767 jne .LBB0_134
2768 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_108: 24-bit loop. Skipped when edx < 32 (signed compare).
# NOTE(review): edx is presumably the value count and r14d the number of
# 32-value groups; both are set up outside this chunk -- confirm.
#
# Loop .LBB0_110, per iteration:
#   reads  24 dwords (96 bytes) at [rbx-92 .. rbx+3]   (rbx pre-biased by 92)
#   writes 32 dwords (128 bytes) at [r15-96 .. r15+31] (r15 pre-biased by 96)
#   r8   = iteration counter (zero-extended r14d)
#   ymm0 = right-shift counts [0,0,0,8] in both 128-bit halves
#   ymm1 = per-dword mask 16777215 (0xffffff)
# The four groups are identical apart from their input offsets: each takes
# 6 dwords, and every 3 dwords yield 4 outputs (raw dword, shld by 8,
# shld by 16, raw dword shifted right by 8).
# ---------------------------------------------------------------------
2769.LBB0_108:
2770 cmp edx, 32
2771 jl .LBB0_147
2772# %bb.109:
2773 mov r8d, r14d
2774 add r15, 96
2775 add rbx, 92
2776 vbroadcasti128 ymm0, xmmword ptr [rip + .LCPI0_46] # ymm0 = [0,0,0,8,0,0,0,8]
2777 # ymm0 = mem[0,1,0,1]
2778 vpbroadcastd ymm1, dword ptr [rip + .LCPI0_47] # ymm1 = [16777215,16777215,16777215,16777215,16777215,16777215,16777215,16777215]
2779 .p2align 4, 0x90
2780.LBB0_110: # =>This Inner Loop Header: Depth=1
# Group 0: input dwords at rbx-92 .. rbx-72 -> first 32 output bytes.
2781 mov r9d, dword ptr [rbx - 72]
2782 mov edx, dword ptr [rbx - 76]
2783 mov esi, r9d
2784 mov edi, dword ptr [rbx - 80]
2785 mov r10d, dword ptr [rbx - 84]
2786 shld esi, edx, 16
2787 mov r11d, dword ptr [rbx - 92]
2788 mov eax, dword ptr [rbx - 88]
2789 shld edx, edi, 8
2790 mov ecx, r10d
2791 shld ecx, eax, 16
2792 shld eax, r11d, 8
# xmm2 becomes the upper four lanes, xmm3 the lower four lanes.
2793 vmovd xmm2, edi
2794 vmovd xmm3, r11d
2795 vpinsrd xmm2, xmm2, edx, 1
2796 vpinsrd xmm3, xmm3, eax, 1
2797 vpinsrd xmm2, xmm2, esi, 2
2798 vpinsrd xmm3, xmm3, ecx, 2
2799 vpinsrd xmm2, xmm2, r9d, 3
2800 vpinsrd xmm3, xmm3, r10d, 3
2801 vinserti128 ymm2, ymm3, xmm2, 1
2802 vpsrlvd ymm2, ymm2, ymm0
2803 vpand ymm2, ymm2, ymm1
2804 vmovdqu ymmword ptr [r15 - 96], ymm2
# Group 1: input dwords at rbx-68 .. rbx-48 -> second 32 output bytes.
2805 mov r9d, dword ptr [rbx - 48]
2806 mov ecx, dword ptr [rbx - 52]
2807 mov edx, r9d
2808 mov esi, dword ptr [rbx - 56]
2809 mov r10d, dword ptr [rbx - 60]
2810 shld edx, ecx, 16
2811 mov r11d, dword ptr [rbx - 68]
2812 mov edi, dword ptr [rbx - 64]
2813 shld ecx, esi, 8
2814 mov eax, r10d
2815 shld eax, edi, 16
2816 shld edi, r11d, 8
2817 vmovd xmm2, esi
2818 vmovd xmm3, r11d
2819 vpinsrd xmm2, xmm2, ecx, 1
2820 vpinsrd xmm3, xmm3, edi, 1
2821 vpinsrd xmm2, xmm2, edx, 2
2822 vpinsrd xmm3, xmm3, eax, 2
2823 vpinsrd xmm2, xmm2, r9d, 3
2824 vpinsrd xmm3, xmm3, r10d, 3
2825 vinserti128 ymm2, ymm3, xmm2, 1
2826 vpsrlvd ymm2, ymm2, ymm0
2827 vpand ymm2, ymm2, ymm1
2828 vmovdqu ymmword ptr [r15 - 64], ymm2
# Group 2: input dwords at rbx-44 .. rbx-24 -> third 32 output bytes.
2829 mov r9d, dword ptr [rbx - 24]
2830 mov ecx, dword ptr [rbx - 28]
2831 mov edx, r9d
2832 mov esi, dword ptr [rbx - 32]
2833 mov r10d, dword ptr [rbx - 36]
2834 shld edx, ecx, 16
2835 mov r11d, dword ptr [rbx - 44]
2836 mov edi, dword ptr [rbx - 40]
2837 shld ecx, esi, 8
2838 mov eax, r10d
2839 shld eax, edi, 16
2840 shld edi, r11d, 8
2841 vmovd xmm2, esi
2842 vmovd xmm3, r11d
2843 vpinsrd xmm2, xmm2, ecx, 1
2844 vpinsrd xmm3, xmm3, edi, 1
2845 vpinsrd xmm2, xmm2, edx, 2
2846 vpinsrd xmm3, xmm3, eax, 2
2847 vpinsrd xmm2, xmm2, r9d, 3
2848 vpinsrd xmm3, xmm3, r10d, 3
2849 vinserti128 ymm2, ymm3, xmm2, 1
2850 vpsrlvd ymm2, ymm2, ymm0
2851 vpand ymm2, ymm2, ymm1
2852 vmovdqu ymmword ptr [r15 - 32], ymm2
# Group 3: input dwords at rbx-20 .. rbx -> last 32 output bytes.
2853 mov r9d, dword ptr [rbx]
2854 mov ecx, dword ptr [rbx - 4]
2855 mov edx, r9d
2856 mov esi, dword ptr [rbx - 8]
2857 mov r10d, dword ptr [rbx - 12]
2858 shld edx, ecx, 16
2859 mov r11d, dword ptr [rbx - 20]
2860 mov edi, dword ptr [rbx - 16]
2861 shld ecx, esi, 8
2862 mov eax, r10d
2863 shld eax, edi, 16
2864 shld edi, r11d, 8
2865 vmovd xmm2, esi
2866 vpinsrd xmm2, xmm2, ecx, 1
2867 vmovd xmm3, r11d
2868 vpinsrd xmm2, xmm2, edx, 2
2869 vpinsrd xmm3, xmm3, edi, 1
2870 vpinsrd xmm2, xmm2, r9d, 3
2871 vpinsrd xmm3, xmm3, eax, 2
2872 vpinsrd xmm3, xmm3, r10d, 3
2873 vinserti128 ymm2, ymm3, xmm2, 1
2874 vpsrlvd ymm2, ymm2, ymm0
2875 vpand ymm2, ymm2, ymm1
2876 vmovdqu ymmword ptr [r15], ymm2
# Advance: output += 128 bytes (written as sub of -128), input += 96 bytes.
2877 sub r15, -128
2878 add rbx, 96
2879 add r8, -1
2880 jne .LBB0_110
2881 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_138: 4-bit loop. Skipped when edx < 32 (signed compare).
# NOTE(review): edx is presumably the value count and r14d the number of
# 32-value groups; both are set up outside this chunk -- confirm.
#
# Loop .LBB0_140, per iteration:
#   reads  4 dwords (16 bytes) at [rbx+rcx .. rbx+rcx+15]
#   writes 32 dwords (128 bytes) at [r15+8*rcx .. r15+8*rcx+127]
#   rax  = iteration counter (zero-extended r14d)
#   rcx  = input byte offset, +16 per iteration (output offset is 8*rcx)
#   ymm0 = right-shift counts [0,4,...,28]
#   ymm1 = per-dword mask 0xf
# Each step broadcasts one input dword to all eight lanes, shifts every lane
# by a different multiple of 4, and masks to 4 bits.
# ---------------------------------------------------------------------
2882.LBB0_138:
2883 cmp edx, 32
2884 jl .LBB0_147
2885# %bb.139:
2886 mov eax, r14d
2887 xor ecx, ecx
2888 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_124] # ymm0 = [0,4,8,12,16,20,24,28]
2889 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_125] # ymm1 = [64424509455,64424509455,64424509455,64424509455]
2890 .p2align 4, 0x90
2891.LBB0_140: # =>This Inner Loop Header: Depth=1
# Input dword 0 -> output bytes 0..31.
2892 vpbroadcastd ymm2, dword ptr [rbx + rcx]
2893 vpsrlvd ymm2, ymm2, ymm0
2894 vpand ymm2, ymm2, ymm1
2895 vmovdqu ymmword ptr [r15 + 8*rcx], ymm2
# Input dword 1 -> output bytes 32..63.
2896 vpbroadcastd ymm2, dword ptr [rbx + rcx + 4]
2897 vpsrlvd ymm2, ymm2, ymm0
2898 vpand ymm2, ymm2, ymm1
2899 vmovdqu ymmword ptr [r15 + 8*rcx + 32], ymm2
# Input dword 2 -> output bytes 64..95.
2900 vpbroadcastd ymm2, dword ptr [rbx + rcx + 8]
2901 vpsrlvd ymm2, ymm2, ymm0
2902 vpand ymm2, ymm2, ymm1
2903 vmovdqu ymmword ptr [r15 + 8*rcx + 64], ymm2
# Input dword 3 -> output bytes 96..127.
2904 vpbroadcastd ymm2, dword ptr [rbx + rcx + 12]
2905 vpsrlvd ymm2, ymm2, ymm0
2906 vpand ymm2, ymm2, ymm1
2907 vmovdqu ymmword ptr [r15 + 8*rcx + 96], ymm2
2908 add rcx, 16
2909 add rax, -1
2910 jne .LBB0_140
2911 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_114: 20-bit loop. Skipped when edx < 32 (signed compare).
# NOTE(review): edx is presumably the value count and r14d the number of
# 32-value groups; both are set up outside this chunk -- confirm.
#
# Loop .LBB0_116, per iteration:
#   reads  20 dwords (80 bytes) at [rbx-76 .. rbx+3]   (rbx pre-biased by 76)
#   writes 32 dwords (128 bytes) at [r15-96 .. r15+31] (r15 pre-biased by 96)
#   r8   = iteration counter (zero-extended r14d)
#   ymm0 = per-lane right-shift counts [0,0,8,0,0,4,0,12]
#   ymm1 = per-dword mask 0xfffff
# The four groups are identical apart from their input offsets: each turns
# 5 input dwords into 8 outputs, using shld (counts 12, 4, 16, 8) for the
# fields that straddle a dword boundary.
# ---------------------------------------------------------------------
2912.LBB0_114:
2913 cmp edx, 32
2914 jl .LBB0_147
2915# %bb.115:
2916 mov r8d, r14d
2917 add r15, 96
2918 add rbx, 76
2919 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_65] # ymm0 = [0,0,8,0,0,4,0,12]
2920 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_66] # ymm1 = [4503595333451775,4503595333451775,4503595333451775,4503595333451775]
2921 .p2align 4, 0x90
2922.LBB0_116: # =>This Inner Loop Header: Depth=1
# Group 0: input dwords at rbx-76 .. rbx-60 -> first 32 output bytes.
2923 mov r9d, dword ptr [rbx - 60]
2924 mov r11d, dword ptr [rbx - 64]
2925 mov esi, r9d
2926 shld esi, r11d, 8
2927 mov edi, dword ptr [rbx - 68]
2928 mov edx, r11d
2929 shld edx, edi, 16
2930 mov eax, dword ptr [rbx - 72]
2931 shld edi, eax, 4
2932 mov r10d, dword ptr [rbx - 76]
2933 mov ecx, eax
2934 shld ecx, r10d, 12
# xmm2 becomes the upper four lanes, xmm3 the lower four lanes.
2935 vmovd xmm2, edx
2936 vpinsrd xmm2, xmm2, r11d, 1
2937 vpinsrd xmm2, xmm2, esi, 2
2938 vpinsrd xmm2, xmm2, r9d, 3
2939 vmovd xmm3, r10d
2940 vpinsrd xmm3, xmm3, ecx, 1
2941 vpinsrd xmm3, xmm3, eax, 2
2942 vpinsrd xmm3, xmm3, edi, 3
2943 vinserti128 ymm2, ymm3, xmm2, 1
2944 vpsrlvd ymm2, ymm2, ymm0
2945 vpand ymm2, ymm2, ymm1
2946 vmovdqu ymmword ptr [r15 - 96], ymm2
# Group 1: input dwords at rbx-56 .. rbx-40 -> second 32 output bytes.
2947 mov r9d, dword ptr [rbx - 40]
2948 mov r11d, dword ptr [rbx - 44]
2949 mov edx, r9d
2950 shld edx, r11d, 8
2951 mov esi, dword ptr [rbx - 48]
2952 mov edi, r11d
2953 shld edi, esi, 16
2954 mov r10d, dword ptr [rbx - 56]
2955 mov ecx, dword ptr [rbx - 52]
2956 shld esi, ecx, 4
2957 mov eax, ecx
2958 shld eax, r10d, 12
2959 vmovd xmm2, edi
2960 vpinsrd xmm2, xmm2, r11d, 1
2961 vpinsrd xmm2, xmm2, edx, 2
2962 vpinsrd xmm2, xmm2, r9d, 3
2963 vmovd xmm3, r10d
2964 vpinsrd xmm3, xmm3, eax, 1
2965 vpinsrd xmm3, xmm3, ecx, 2
2966 vpinsrd xmm3, xmm3, esi, 3
2967 vinserti128 ymm2, ymm3, xmm2, 1
2968 vpsrlvd ymm2, ymm2, ymm0
2969 vpand ymm2, ymm2, ymm1
2970 vmovdqu ymmword ptr [r15 - 64], ymm2
# Group 2: input dwords at rbx-36 .. rbx-20 -> third 32 output bytes.
2971 mov r9d, dword ptr [rbx - 20]
2972 mov r11d, dword ptr [rbx - 24]
2973 mov edx, r9d
2974 shld edx, r11d, 8
2975 mov esi, dword ptr [rbx - 28]
2976 mov edi, r11d
2977 shld edi, esi, 16
2978 mov ecx, dword ptr [rbx - 32]
2979 shld esi, ecx, 4
2980 mov r10d, dword ptr [rbx - 36]
2981 mov eax, ecx
2982 shld eax, r10d, 12
2983 vmovd xmm2, edi
2984 vpinsrd xmm2, xmm2, r11d, 1
2985 vpinsrd xmm2, xmm2, edx, 2
2986 vpinsrd xmm2, xmm2, r9d, 3
2987 vmovd xmm3, r10d
2988 vpinsrd xmm3, xmm3, eax, 1
2989 vpinsrd xmm3, xmm3, ecx, 2
2990 vpinsrd xmm3, xmm3, esi, 3
2991 vinserti128 ymm2, ymm3, xmm2, 1
2992 vpsrlvd ymm2, ymm2, ymm0
2993 vpand ymm2, ymm2, ymm1
2994 vmovdqu ymmword ptr [r15 - 32], ymm2
# Group 3: input dwords at rbx-16 .. rbx -> last 32 output bytes.
2995 mov r9d, dword ptr [rbx]
2996 mov r11d, dword ptr [rbx - 4]
2997 mov edx, r9d
2998 shld edx, r11d, 8
2999 mov esi, dword ptr [rbx - 8]
3000 mov edi, r11d
3001 shld edi, esi, 16
3002 mov r10d, dword ptr [rbx - 16]
3003 mov ecx, dword ptr [rbx - 12]
3004 shld esi, ecx, 4
3005 mov eax, ecx
3006 shld eax, r10d, 12
3007 vmovd xmm2, edi
3008 vpinsrd xmm2, xmm2, r11d, 1
3009 vpinsrd xmm2, xmm2, edx, 2
3010 vpinsrd xmm2, xmm2, r9d, 3
3011 vmovd xmm3, r10d
3012 vpinsrd xmm3, xmm3, eax, 1
3013 vpinsrd xmm3, xmm3, ecx, 2
3014 vpinsrd xmm3, xmm3, esi, 3
3015 vinserti128 ymm2, ymm3, xmm2, 1
3016 vpsrlvd ymm2, ymm2, ymm0
3017 vpand ymm2, ymm2, ymm1
3018 vmovdqu ymmword ptr [r15], ymm2
# Advance: output += 128 bytes (written as sub of -128), input += 80 bytes.
3019 sub r15, -128
3020 add rbx, 80
3021 add r8, -1
3022 jne .LBB0_116
3023 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_126: 12-bit loop. Skipped when edx < 32 (signed compare).
# NOTE(review): edx is presumably the value count and r14d the number of
# 32-value groups; both are set up outside this chunk -- confirm.
#
# Loop .LBB0_128, per iteration:
#   reads  12 dwords (48 bytes) at [rbx-44 .. rbx+3]   (rbx pre-biased by 44)
#   writes 32 dwords (128 bytes) at [r15-96 .. r15+31] (r15 pre-biased by 96)
#   r8   = iteration counter (zero-extended r14d)
#   ymm0 = per-lane right-shift counts [0,12,0,4,16,0,8,20]
#   ymm1 = per-dword mask 0xfff
# The four groups are identical apart from their input offsets: each turns
# 3 input dwords into 8 outputs; the two fields that straddle a dword
# boundary are aligned with shld (counts 8 and 4).
# ---------------------------------------------------------------------
3024.LBB0_126:
3025 cmp edx, 32
3026 jl .LBB0_147
3027# %bb.127:
3028 mov r8d, r14d
3029 add r15, 96
3030 add rbx, 44
3031 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_94] # ymm0 = [0,12,0,4,16,0,8,20]
3032 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_95] # ymm1 = [17587891081215,17587891081215,17587891081215,17587891081215]
3033 .p2align 4, 0x90
3034.LBB0_128: # =>This Inner Loop Header: Depth=1
# Group 0: input dwords at rbx-44 .. rbx-36 -> first 32 output bytes.
3035 mov ecx, dword ptr [rbx - 36]
3036 mov edx, dword ptr [rbx - 44]
3037 mov esi, dword ptr [rbx - 40]
3038 mov edi, ecx
3039 shld edi, esi, 4
3040 mov eax, esi
3041 shld eax, edx, 8
# xmm2 becomes the upper four lanes, xmm3 the lower four lanes.
3042 vmovd xmm2, esi
3043 vpinsrd xmm2, xmm2, edi, 1
3044 vpinsrd xmm2, xmm2, ecx, 2
3045 vpinsrd xmm2, xmm2, ecx, 3
3046 vmovd xmm3, edx
3047 vpinsrd xmm3, xmm3, edx, 1
3048 vpinsrd xmm3, xmm3, eax, 2
3049 vpinsrd xmm3, xmm3, esi, 3
3050 vinserti128 ymm2, ymm3, xmm2, 1
3051 vpsrlvd ymm2, ymm2, ymm0
3052 vpand ymm2, ymm2, ymm1
3053 vmovdqu ymmword ptr [r15 - 96], ymm2
# Group 1: input dwords at rbx-32 .. rbx-24 -> second 32 output bytes.
3054 mov eax, dword ptr [rbx - 24]
3055 mov ecx, dword ptr [rbx - 32]
3056 mov edx, dword ptr [rbx - 28]
3057 mov esi, eax
3058 shld esi, edx, 4
3059 mov edi, edx
3060 shld edi, ecx, 8
3061 vmovd xmm2, edx
3062 vpinsrd xmm2, xmm2, esi, 1
3063 vpinsrd xmm2, xmm2, eax, 2
3064 vpinsrd xmm2, xmm2, eax, 3
3065 vmovd xmm3, ecx
3066 vpinsrd xmm3, xmm3, ecx, 1
3067 vpinsrd xmm3, xmm3, edi, 2
3068 vpinsrd xmm3, xmm3, edx, 3
3069 vinserti128 ymm2, ymm3, xmm2, 1
3070 vpsrlvd ymm2, ymm2, ymm0
3071 vpand ymm2, ymm2, ymm1
3072 vmovdqu ymmword ptr [r15 - 64], ymm2
# Group 2: input dwords at rbx-20 .. rbx-12 -> third 32 output bytes.
3073 mov eax, dword ptr [rbx - 12]
3074 mov ecx, dword ptr [rbx - 20]
3075 mov edx, dword ptr [rbx - 16]
3076 mov esi, eax
3077 shld esi, edx, 4
3078 mov edi, edx
3079 shld edi, ecx, 8
3080 vmovd xmm2, edx
3081 vpinsrd xmm2, xmm2, esi, 1
3082 vpinsrd xmm2, xmm2, eax, 2
3083 vpinsrd xmm2, xmm2, eax, 3
3084 vmovd xmm3, ecx
3085 vpinsrd xmm3, xmm3, ecx, 1
3086 vpinsrd xmm3, xmm3, edi, 2
3087 vpinsrd xmm3, xmm3, edx, 3
3088 vinserti128 ymm2, ymm3, xmm2, 1
3089 vpsrlvd ymm2, ymm2, ymm0
3090 vpand ymm2, ymm2, ymm1
3091 vmovdqu ymmword ptr [r15 - 32], ymm2
# Group 3: input dwords at rbx-8 .. rbx -> last 32 output bytes.
3092 mov eax, dword ptr [rbx]
3093 mov ecx, dword ptr [rbx - 8]
3094 mov edx, dword ptr [rbx - 4]
3095 mov esi, eax
3096 shld esi, edx, 4
3097 mov edi, edx
3098 shld edi, ecx, 8
3099 vmovd xmm2, edx
3100 vpinsrd xmm2, xmm2, esi, 1
3101 vpinsrd xmm2, xmm2, eax, 2
3102 vpinsrd xmm2, xmm2, eax, 3
3103 vmovd xmm3, ecx
3104 vpinsrd xmm3, xmm3, ecx, 1
3105 vpinsrd xmm3, xmm3, edi, 2
3106 vpinsrd xmm3, xmm3, edx, 3
3107 vinserti128 ymm2, ymm3, xmm2, 1
3108 vpsrlvd ymm2, ymm2, ymm0
3109 vpand ymm2, ymm2, ymm1
3110 vmovdqu ymmword ptr [r15], ymm2
# Advance: output += 128 bytes (written as sub of -128), input += 48 bytes.
3111 sub r15, -128
3112 add rbx, 48
3113 add r8, -1
3114 jne .LBB0_128
3115 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_102: 28-bit loop. Skipped when edx < 32 (signed compare).
# NOTE(review): edx is presumably the value count and r14d the number of
# 32-value groups; both are set up outside this chunk -- confirm.
#
# Loop .LBB0_104, per iteration:
#   reads  28 dwords (112 bytes) at [rbx-108 .. rbx+3] (rbx pre-biased by 108)
#   writes 32 dwords (128 bytes) at [r15-96 .. r15+31] (r15 pre-biased by 96)
#   r8   = iteration counter (zero-extended r14d)
#   ymm0 = per-lane right-shift counts [0,0,0,0,0,0,0,4]
#   ymm1 = per-dword mask 0xfffffff
# The four groups are identical apart from their input offsets: each turns
# 7 input dwords into 8 outputs. Lanes 1..6 are shld merges of adjacent
# dwords (counts 4,8,12,16,20,24); lane 0 is the first raw dword and lane 7
# is the last raw dword shifted right by 4.
# ---------------------------------------------------------------------
3116.LBB0_102:
3117 cmp edx, 32
3118 jl .LBB0_147
3119# %bb.103:
3120 mov r8d, r14d
3121 add r15, 96
3122 add rbx, 108
3123 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_22] # ymm0 = [0,0,0,0,0,0,0,4]
3124 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_23] # ymm1 = [1152921500580315135,1152921500580315135,1152921500580315135,1152921500580315135]
3125 .p2align 4, 0x90
3126.LBB0_104: # =>This Inner Loop Header: Depth=1
# Group 0: input dwords at rbx-108 .. rbx-84 -> first 32 output bytes.
3127 mov r9d, dword ptr [rbx - 84]
3128 mov edx, dword ptr [rbx - 88]
3129 mov r10d, r9d
3130 shld r10d, edx, 24
3131 mov edi, dword ptr [rbx - 92]
3132 shld edx, edi, 20
3133 mov eax, dword ptr [rbx - 96]
3134 shld edi, eax, 16
3135 mov ecx, dword ptr [rbx - 100]
3136 shld eax, ecx, 12
3137 mov r11d, dword ptr [rbx - 108]
3138 mov esi, dword ptr [rbx - 104]
3139 shld ecx, esi, 8
3140 shld esi, r11d, 4
# xmm2 becomes the lower four lanes, xmm3 the upper four lanes.
3141 vmovd xmm2, r11d
3142 vmovd xmm3, edi
3143 vpinsrd xmm2, xmm2, esi, 1
3144 vpinsrd xmm3, xmm3, edx, 1
3145 vpinsrd xmm2, xmm2, ecx, 2
3146 vpinsrd xmm3, xmm3, r10d, 2
3147 vpinsrd xmm2, xmm2, eax, 3
3148 vpinsrd xmm3, xmm3, r9d, 3
3149 vinserti128 ymm2, ymm2, xmm3, 1
3150 vpsrlvd ymm2, ymm2, ymm0
3151 vpand ymm2, ymm2, ymm1
3152 vmovdqu ymmword ptr [r15 - 96], ymm2
# Group 1: input dwords at rbx-80 .. rbx-56 -> second 32 output bytes.
3153 mov r9d, dword ptr [rbx - 56]
3154 mov ecx, dword ptr [rbx - 60]
3155 mov r10d, r9d
3156 shld r10d, ecx, 24
3157 mov esi, dword ptr [rbx - 64]
3158 shld ecx, esi, 20
3159 mov edi, dword ptr [rbx - 68]
3160 shld esi, edi, 16
3161 mov eax, dword ptr [rbx - 72]
3162 shld edi, eax, 12
3163 mov r11d, dword ptr [rbx - 80]
3164 mov edx, dword ptr [rbx - 76]
3165 shld eax, edx, 8
3166 shld edx, r11d, 4
3167 vmovd xmm2, r11d
3168 vmovd xmm3, esi
3169 vpinsrd xmm2, xmm2, edx, 1
3170 vpinsrd xmm3, xmm3, ecx, 1
3171 vpinsrd xmm2, xmm2, eax, 2
3172 vpinsrd xmm3, xmm3, r10d, 2
3173 vpinsrd xmm2, xmm2, edi, 3
3174 vpinsrd xmm3, xmm3, r9d, 3
3175 vinserti128 ymm2, ymm2, xmm3, 1
3176 vpsrlvd ymm2, ymm2, ymm0
3177 vpand ymm2, ymm2, ymm1
3178 vmovdqu ymmword ptr [r15 - 64], ymm2
# Group 2: input dwords at rbx-52 .. rbx-28 -> third 32 output bytes.
3179 mov r9d, dword ptr [rbx - 28]
3180 mov ecx, dword ptr [rbx - 32]
3181 mov r10d, r9d
3182 shld r10d, ecx, 24
3183 mov esi, dword ptr [rbx - 36]
3184 shld ecx, esi, 20
3185 mov edi, dword ptr [rbx - 40]
3186 shld esi, edi, 16
3187 mov eax, dword ptr [rbx - 44]
3188 shld edi, eax, 12
3189 mov r11d, dword ptr [rbx - 52]
3190 mov edx, dword ptr [rbx - 48]
3191 shld eax, edx, 8
3192 shld edx, r11d, 4
3193 vmovd xmm2, r11d
3194 vmovd xmm3, esi
3195 vpinsrd xmm2, xmm2, edx, 1
3196 vpinsrd xmm3, xmm3, ecx, 1
3197 vpinsrd xmm2, xmm2, eax, 2
3198 vpinsrd xmm3, xmm3, r10d, 2
3199 vpinsrd xmm2, xmm2, edi, 3
3200 vpinsrd xmm3, xmm3, r9d, 3
3201 vinserti128 ymm2, ymm2, xmm3, 1
3202 vpsrlvd ymm2, ymm2, ymm0
3203 vpand ymm2, ymm2, ymm1
3204 vmovdqu ymmword ptr [r15 - 32], ymm2
# Group 3: input dwords at rbx-24 .. rbx -> last 32 output bytes.
3205 mov r9d, dword ptr [rbx]
3206 mov ecx, dword ptr [rbx - 4]
3207 mov r10d, r9d
3208 shld r10d, ecx, 24
3209 mov esi, dword ptr [rbx - 8]
3210 shld ecx, esi, 20
3211 mov edi, dword ptr [rbx - 12]
3212 shld esi, edi, 16
3213 mov eax, dword ptr [rbx - 16]
3214 shld edi, eax, 12
3215 mov r11d, dword ptr [rbx - 24]
3216 mov edx, dword ptr [rbx - 20]
3217 shld eax, edx, 8
3218 shld edx, r11d, 4
3219 vmovd xmm2, r11d
3220 vmovd xmm3, esi
3221 vpinsrd xmm2, xmm2, edx, 1
3222 vpinsrd xmm3, xmm3, ecx, 1
3223 vpinsrd xmm2, xmm2, eax, 2
3224 vpinsrd xmm3, xmm3, r10d, 2
3225 vpinsrd xmm2, xmm2, edi, 3
3226 vpinsrd xmm3, xmm3, r9d, 3
3227 vinserti128 ymm2, ymm2, xmm3, 1
3228 vpsrlvd ymm2, ymm2, ymm0
3229 vpand ymm2, ymm2, ymm1
3230 vmovdqu ymmword ptr [r15], ymm2
# Advance: output += 128 bytes (written as sub of -128), input += 112 bytes.
3231 sub r15, -128
3232 add rbx, 112
3233 add r8, -1
3234 jne .LBB0_104
3235 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_141: 2-bit loop. Skipped when edx < 32 (signed compare).
# NOTE(review): edx is presumably the value count and r14d the number of
# 32-value groups; both are set up outside this chunk -- confirm.
#
# Loop .LBB0_143, per iteration:
#   reads  2 dwords (8 bytes) at [rbx+8*rcx .. rbx+8*rcx+7]
#   writes 32 dwords (128 bytes) at [r15-96 .. r15+31] (r15 pre-biased by 96)
#   rax  = iteration limit (zero-extended r14d)
#   rcx  = iteration index, counts up from 0 until it equals rax
#   ymm0 = right-shift counts [0,2,...,14]   (low half of a dword)
#   ymm2 = right-shift counts [16,18,...,30] (high half of a dword)
#   ymm1 = per-dword mask 3
# Each input dword is broadcast twice, once per shift table, giving 16
# outputs per dword.
# ---------------------------------------------------------------------
3236.LBB0_141:
3237 cmp edx, 32
3238 jl .LBB0_147
3239# %bb.142:
3240 mov eax, r14d
3241 add r15, 96
3242 xor ecx, ecx
3243 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_131] # ymm0 = [0,2,4,6,8,10,12,14]
3244 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_132] # ymm1 = [12884901891,12884901891,12884901891,12884901891]
3245 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_133] # ymm2 = [16,18,20,22,24,26,28,30]
3246 .p2align 4, 0x90
3247.LBB0_143: # =>This Inner Loop Header: Depth=1
# First dword, bits 0..15 -> first 32 output bytes.
3248 vpbroadcastd ymm3, dword ptr [rbx + 8*rcx]
3249 vpsrlvd ymm3, ymm3, ymm0
3250 vpand ymm3, ymm3, ymm1
3251 vmovdqu ymmword ptr [r15 - 96], ymm3
# First dword, bits 16..31 -> second 32 output bytes.
3252 vpbroadcastd ymm3, dword ptr [rbx + 8*rcx]
3253 vpsrlvd ymm3, ymm3, ymm2
3254 vpand ymm3, ymm3, ymm1
3255 vmovdqu ymmword ptr [r15 - 64], ymm3
# Second dword, bits 0..15 -> third 32 output bytes.
3256 vpbroadcastd ymm3, dword ptr [rbx + 8*rcx + 4]
3257 vpsrlvd ymm3, ymm3, ymm0
3258 vpand ymm3, ymm3, ymm1
3259 vmovdqu ymmword ptr [r15 - 32], ymm3
# Second dword, bits 16..31 -> last 32 output bytes.
3260 vpbroadcastd ymm3, dword ptr [rbx + 8*rcx + 4]
3261 vpsrlvd ymm3, ymm3, ymm2
3262 vpand ymm3, ymm3, ymm1
3263 vmovdqu ymmword ptr [r15], ymm3
# Advance: index += 1, output += 128 bytes (written as sub of -128).
3264 add rcx, 1
3265 sub r15, -128
3266 cmp rax, rcx
3267 jne .LBB0_143
3268 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_117: 18-bit loop. Skipped when edx < 32 (signed compare).
# NOTE(review): edx is presumably the value count and r14d the number of
# 32-value groups; both are set up outside this chunk -- confirm.
#
# Loop .LBB0_119, per iteration:
#   reads  18 dwords (72 bytes) at [rbx-68 .. rbx+3]   (rbx pre-biased by 68)
#   writes 32 dwords (128 bytes) at [r15-96 .. r15+31] (r15 pre-biased by 96)
#   r8   = iteration counter (zero-extended r14d)
#   ymm1 = per-dword mask 0x3ffff
#   ymm0 = right-shift counts for groups 0 and 2: [0,0,4,0,8,0,12,0]
#   ymm2 = right-shift counts for groups 1 and 3: [0,2,0,6,0,10,0,14]
# Groups 0/2 and groups 1/3 share the same instruction pattern with
# different input offsets; straddling fields are aligned with shld/shrd.
# ---------------------------------------------------------------------
3269.LBB0_117:
3270 cmp edx, 32
3271 jl .LBB0_147
3272# %bb.118:
3273 mov r8d, r14d
3274 add r15, 96
3275 add rbx, 68
3276 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_72] # ymm0 = [0,0,4,0,8,0,12,0]
3277 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_73] # ymm1 = [1125895612137471,1125895612137471,1125895612137471,1125895612137471]
3278 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_74] # ymm2 = [0,2,0,6,0,10,0,14]
3279 .p2align 4, 0x90
3280.LBB0_119: # =>This Inner Loop Header: Depth=1
# Group 0: input dwords at rbx-68 .. rbx-52 -> first 32 output bytes.
3281 mov ecx, dword ptr [rbx - 56]
3282 mov r10d, dword ptr [rbx - 52]
3283 shld r10d, ecx, 2
3284 mov esi, dword ptr [rbx - 60]
3285 mov edi, ecx
3286 shld edi, esi, 6
3287 mov r9d, dword ptr [rbx - 68]
3288 mov edx, dword ptr [rbx - 64]
3289 mov eax, edx
3290 shld eax, r9d, 14
# The raw dword in esi is captured in xmm3 before esi is overwritten by shld.
3291 vmovd xmm3, esi
3292 shld esi, edx, 10
3293 vpinsrd xmm3, xmm3, edi, 1
3294 vpinsrd xmm3, xmm3, ecx, 2
3295 vpinsrd xmm3, xmm3, r10d, 3
3296 vmovd xmm4, r9d
3297 vpinsrd xmm4, xmm4, eax, 1
3298 vpinsrd xmm4, xmm4, edx, 2
3299 vpinsrd xmm4, xmm4, esi, 3
3300 vinserti128 ymm3, ymm4, xmm3, 1
3301 vpsrlvd ymm3, ymm3, ymm0
3302 vpand ymm3, ymm3, ymm1
3303 vmovdqu ymmword ptr [r15 - 96], ymm3
# Group 1: input dwords at rbx-52 .. rbx-36 -> second 32 output bytes.
3304 mov r9d, dword ptr [rbx - 36]
3305 mov r11d, dword ptr [rbx - 40]
3306 mov edx, r9d
3307 shld edx, r11d, 4
3308 mov r10d, dword ptr [rbx - 44]
3309 mov edi, r11d
3310 shld edi, r10d, 8
3311 mov eax, dword ptr [rbx - 52]
3312 mov esi, dword ptr [rbx - 48]
3313 mov ecx, r10d
3314 shld ecx, esi, 12
# shrd: eax = (eax >> 16) | (esi << 16), a field crossing the dword boundary.
3315 shrd eax, esi, 16
3316 vmovd xmm3, edi
3317 vpinsrd xmm3, xmm3, r11d, 1
3318 vpinsrd xmm3, xmm3, edx, 2
3319 vpinsrd xmm3, xmm3, r9d, 3
3320 vmovd xmm4, eax
3321 vpinsrd xmm4, xmm4, esi, 1
3322 vpinsrd xmm4, xmm4, ecx, 2
3323 vpinsrd xmm4, xmm4, r10d, 3
3324 vinserti128 ymm3, ymm4, xmm3, 1
3325 vpsrlvd ymm3, ymm3, ymm2
3326 vpand ymm3, ymm3, ymm1
3327 vmovdqu ymmword ptr [r15 - 64], ymm3
# Group 2: input dwords at rbx-32 .. rbx-16 -> third 32 output bytes
# (same pattern as group 0).
3328 mov eax, dword ptr [rbx - 20]
3329 mov r10d, dword ptr [rbx - 16]
3330 shld r10d, eax, 2
3331 mov edx, dword ptr [rbx - 24]
3332 mov esi, eax
3333 shld esi, edx, 6
3334 mov r9d, dword ptr [rbx - 32]
3335 mov ecx, dword ptr [rbx - 28]
3336 mov edi, ecx
3337 shld edi, r9d, 14
3338 vmovd xmm3, edx
3339 shld edx, ecx, 10
3340 vpinsrd xmm3, xmm3, esi, 1
3341 vpinsrd xmm3, xmm3, eax, 2
3342 vpinsrd xmm3, xmm3, r10d, 3
3343 vmovd xmm4, r9d
3344 vpinsrd xmm4, xmm4, edi, 1
3345 vpinsrd xmm4, xmm4, ecx, 2
3346 vpinsrd xmm4, xmm4, edx, 3
3347 vinserti128 ymm3, ymm4, xmm3, 1
3348 vpsrlvd ymm3, ymm3, ymm0
3349 vpand ymm3, ymm3, ymm1
3350 vmovdqu ymmword ptr [r15 - 32], ymm3
# Group 3: input dwords at rbx-16 .. rbx -> last 32 output bytes
# (same pattern as group 1).
3351 mov r9d, dword ptr [rbx]
3352 mov r11d, dword ptr [rbx - 4]
3353 mov edx, r9d
3354 shld edx, r11d, 4
3355 mov r10d, dword ptr [rbx - 8]
3356 mov edi, r11d
3357 shld edi, r10d, 8
3358 mov eax, dword ptr [rbx - 16]
3359 mov esi, dword ptr [rbx - 12]
3360 mov ecx, r10d
3361 shld ecx, esi, 12
3362 shrd eax, esi, 16
3363 vmovd xmm3, edi
3364 vpinsrd xmm3, xmm3, r11d, 1
3365 vpinsrd xmm3, xmm3, edx, 2
3366 vpinsrd xmm3, xmm3, r9d, 3
3367 vmovd xmm4, eax
3368 vpinsrd xmm4, xmm4, esi, 1
3369 vpinsrd xmm4, xmm4, ecx, 2
3370 vpinsrd xmm4, xmm4, r10d, 3
3371 vinserti128 ymm3, ymm4, xmm3, 1
3372 vpsrlvd ymm3, ymm3, ymm2
3373 vpand ymm3, ymm3, ymm1
3374 vmovdqu ymmword ptr [r15], ymm3
# Advance: output += 128 bytes (written as sub of -128), input += 72 bytes.
3375 sub r15, -128
3376 add rbx, 72
3377 add r8, -1
3378 jne .LBB0_119
3379 jmp .LBB0_147
# ---------------------------------------------------------------------
# .LBB0_129: 10-bit loop. Skipped when edx < 32 (signed compare).
# NOTE(review): edx is presumably the value count and r14d the number of
# 32-value groups; both are set up outside this chunk -- confirm.
#
# Loop .LBB0_131, per iteration:
#   reads  10 dwords (40 bytes) at [rbx-36 .. rbx+3]   (rbx pre-biased by 36)
#   writes 32 dwords (128 bytes) at [r15-96 .. r15+31] (r15 pre-biased by 96)
#   r8   = iteration counter (zero-extended r14d)
#   ymm1 = per-dword mask 0x3ff
#   ymm0 = right-shift counts for groups 0 and 2: [0,10,20,0,8,18,0,6]
#   ymm2 = right-shift counts for groups 1 and 3: [16,0,4,14,0,2,12,22]
# Groups 0/2 and groups 1/3 share the same instruction pattern with
# different input offsets. Each group reads 3 dwords; the middle boundary
# dword (rbx-28 and rbx-8) is shared between two adjacent groups.
# ---------------------------------------------------------------------
3380.LBB0_129:
3381 cmp edx, 32
3382 jl .LBB0_147
3383# %bb.130:
3384 mov r8d, r14d
3385 add r15, 96
3386 add rbx, 36
3387 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_101] # ymm0 = [0,10,20,0,8,18,0,6]
3388 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_102] # ymm1 = [4393751544831,4393751544831,4393751544831,4393751544831]
3389 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_103] # ymm2 = [16,0,4,14,0,2,12,22]
3390 .p2align 4, 0x90
3391.LBB0_131: # =>This Inner Loop Header: Depth=1
# Group 0: input dwords at rbx-36 .. rbx-28 -> first 32 output bytes.
3392 mov ecx, dword ptr [rbx - 28]
3393 mov edx, dword ptr [rbx - 36]
3394 mov esi, dword ptr [rbx - 32]
3395 mov edi, ecx
3396 shld edi, esi, 4
# The raw dword in esi is captured in two lanes of xmm3 before esi is
# overwritten by the shld below.
3397 vmovd xmm3, esi
3398 vpinsrd xmm3, xmm3, esi, 1
3399 shld esi, edx, 2
3400 vpinsrd xmm3, xmm3, edi, 2
3401 vpinsrd xmm3, xmm3, ecx, 3
3402 vmovd xmm4, edx
3403 vpinsrd xmm4, xmm4, edx, 1
3404 vpinsrd xmm4, xmm4, edx, 2
3405 vpinsrd xmm4, xmm4, esi, 3
3406 vinserti128 ymm3, ymm4, xmm3, 1
3407 vpsrlvd ymm3, ymm3, ymm0
3408 vpand ymm3, ymm3, ymm1
3409 vmovdqu ymmword ptr [r15 - 96], ymm3
# Group 1: input dwords at rbx-28 .. rbx-20 -> second 32 output bytes.
3410 mov ecx, dword ptr [rbx - 20]
3411 mov edx, dword ptr [rbx - 24]
3412 mov esi, ecx
3413 shld esi, edx, 8
3414 mov edi, dword ptr [rbx - 28]
3415 mov eax, edx
3416 shld eax, edi, 6
3417 vmovd xmm3, esi
3418 vpinsrd xmm3, xmm3, ecx, 1
3419 vpinsrd xmm3, xmm3, ecx, 2
3420 vpinsrd xmm3, xmm3, ecx, 3
3421 vmovd xmm4, edi
3422 vpinsrd xmm4, xmm4, eax, 1
3423 vpinsrd xmm4, xmm4, edx, 2
3424 vpinsrd xmm4, xmm4, edx, 3
3425 vinserti128 ymm3, ymm4, xmm3, 1
3426 vpsrlvd ymm3, ymm3, ymm2
3427 vpand ymm3, ymm3, ymm1
3428 vmovdqu ymmword ptr [r15 - 64], ymm3
# Group 2: input dwords at rbx-16 .. rbx-8 -> third 32 output bytes
# (same pattern as group 0).
3429 mov eax, dword ptr [rbx - 8]
3430 mov ecx, dword ptr [rbx - 16]
3431 mov edx, dword ptr [rbx - 12]
3432 mov esi, eax
3433 shld esi, edx, 4
3434 vmovd xmm3, edx
3435 vpinsrd xmm3, xmm3, edx, 1
3436 shld edx, ecx, 2
3437 vpinsrd xmm3, xmm3, esi, 2
3438 vpinsrd xmm3, xmm3, eax, 3
3439 vmovd xmm4, ecx
3440 vpinsrd xmm4, xmm4, ecx, 1
3441 vpinsrd xmm4, xmm4, ecx, 2
3442 vpinsrd xmm4, xmm4, edx, 3
3443 vinserti128 ymm3, ymm4, xmm3, 1
3444 vpsrlvd ymm3, ymm3, ymm0
3445 vpand ymm3, ymm3, ymm1
3446 vmovdqu ymmword ptr [r15 - 32], ymm3
# Group 3: input dwords at rbx-8 .. rbx -> last 32 output bytes
# (same pattern as group 1).
3447 mov eax, dword ptr [rbx]
3448 mov ecx, dword ptr [rbx - 8]
3449 mov edx, dword ptr [rbx - 4]
3450 mov esi, eax
3451 shld esi, edx, 8
3452 mov edi, edx
3453 shld edi, ecx, 6
3454 vmovd xmm3, esi
3455 vpinsrd xmm3, xmm3, eax, 1
3456 vpinsrd xmm3, xmm3, eax, 2
3457 vpinsrd xmm3, xmm3, eax, 3
3458 vmovd xmm4, ecx
3459 vpinsrd xmm4, xmm4, edi, 1
3460 vpinsrd xmm4, xmm4, edx, 2
3461 vpinsrd xmm4, xmm4, edx, 3
3462 vinserti128 ymm3, ymm4, xmm3, 1
3463 vpsrlvd ymm3, ymm3, ymm2
3464 vpand ymm3, ymm3, ymm1
3465 vmovdqu ymmword ptr [r15], ymm3
# Advance: output += 128 bytes (written as sub of -128), input += 40 bytes.
3466 sub r15, -128
3467 add rbx, 40
3468 add r8, -1
3469 jne .LBB0_131
3470 jmp .LBB0_147
# .LBB0_105 / .LBB0_107: unpack loop for the 26-bit case.
# Each iteration reads 26 input dwords (104 bytes) through rbx and stores
# 32 output dwords (128 bytes) through r15, every output masked to its low
# 26 bits. Fields are taken LSB-first from the input dword stream: output k
# is stream bits [26k, 26k + 26).
# Registers as used here: rbx = input cursor, r15 = output cursor,
# r8 = remaining iterations, ymm0 = field mask, xmm1/xmm3/ymm2/ymm4 = shift
# counts. eax/ecx/edx/esi/edi/r9d/r10d/r11d and ymm5/ymm6 are scratch.
# NOTE(review): edx and r14d are set before this chunk; presumably edx is the
# value count and r14d = edx / 32 (so r14d >= 1 past the guard) - confirm.
3471.LBB0_105:
3472 cmp edx, 32
3473 jl .LBB0_147 # signed edx < 32: skip the loop, go to the common exit
3474# %bb.106:
3475 mov r8d, r14d # r8 = loop trip count (r14d, zero-extended)
3476 add r15, 96 # bias output cursor: stores below use [r15 - 96] .. [r15]
3477 add rbx, 100 # bias input cursor: loads below use [rbx - 100] .. [rbx]
# ymm0: per-dword mask 0x03ffffff (low 26 bits).
3478 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_34] # ymm0 = [288230371923853311,288230371923853311,288230371923853311,288230371923853311]
# xmm1: as dwords [16,10,16,10] - right-shift counts for the in-vector merge.
3479 vpbroadcastq xmm1, qword ptr [rip + .LCPI0_35] # xmm1 = [42949672976,42949672976]
# ymm2: per-lane right shifts applied to output groups 0-7 and 16-23.
3480 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_33] # ymm2 = [0,0,0,0,0,2,0,0]
# xmm3: as dwords [16,22,16,22] - left-shift counts for the in-vector merge.
3481 vpbroadcastq xmm3, qword ptr [rip + .LCPI0_36] # xmm3 = [94489280528,94489280528]
# ymm4: per-lane right shifts applied to output groups 8-15 and 24-31.
3482 vmovdqa ymm4, ymmword ptr [rip + .LCPI0_37] # ymm4 = [0,0,4,0,0,0,0,6]
3483 .p2align 4, 0x90
3484.LBB0_107: # =>This Inner Loop Header: Depth=1
# Outputs 0-7, from input dwords 0-6 ([rbx - 100] .. [rbx - 76]).
# shld dst, src, n leaves (dst << n) | (src >> (32 - n)) in dst, which gathers
# a field straddling two adjacent input dwords into one register. The shld
# order matters: each source register is consumed before it is overwritten.
3485 mov ecx, dword ptr [rbx - 80]
3486 mov r9d, dword ptr [rbx - 76]
3487 shld r9d, ecx, 10
3488 mov r11d, dword ptr [rbx - 84]
3489 shld ecx, r11d, 4
3490 mov edi, dword ptr [rbx - 88]
3491 mov esi, r11d
3492 shld esi, edi, 24
3493 mov edx, dword ptr [rbx - 92]
3494 shld edi, edx, 18
3495 mov r10d, dword ptr [rbx - 100]
3496 mov eax, dword ptr [rbx - 96]
3497 shld edx, eax, 12
3498 shld eax, r10d, 6
# Build lanes 0-3 in xmm5 and lanes 4-7 in xmm6, then join them.
3499 vmovd xmm5, r10d
3500 vmovd xmm6, esi
3501 vpinsrd xmm5, xmm5, eax, 1
3502 vpinsrd xmm6, xmm6, r11d, 1
3503 vpinsrd xmm5, xmm5, edx, 2
3504 vpinsrd xmm6, xmm6, ecx, 2
3505 vpinsrd xmm5, xmm5, edi, 3
3506 vpinsrd xmm6, xmm6, r9d, 3
3507 vinserti128 ymm5, ymm5, xmm6, 1
3508 vpsrlvd ymm5, ymm5, ymm2 # per-lane shift brings each field down to bit 0
3509 vpand ymm5, ymm5, ymm0 # keep the low 26 bits of every lane
3510 vmovdqu ymmword ptr [r15 - 96], ymm5
# Outputs 8-15, from input dwords 6-12 ([rbx - 76] .. [rbx - 52]).
3511 mov r9d, dword ptr [rbx - 52]
3512 mov ecx, dword ptr [rbx - 56]
3513 mov edx, r9d
3514 shld edx, ecx, 20
3515 mov esi, dword ptr [rbx - 60]
3516 shld ecx, esi, 14
3517 mov edi, dword ptr [rbx - 68]
3518 mov eax, dword ptr [rbx - 64]
3519 shld esi, eax, 8
3520 shld eax, edi, 2
# Lanes 0-1 are merged in-vector instead of with shld:
# (dword6 >> 16) | (dword7 << 16) and (dword7 >> 10) | (dword8 << 22).
3521 vmovq xmm5, qword ptr [rbx - 76] # xmm5 = mem[0],zero
3522 vpsrlvd xmm6, xmm5, xmm1
3523 vpshufd xmm5, xmm5, 229 # xmm5 = xmm5[1,1,2,3]
3524 vpinsrd xmm5, xmm5, edi, 1
3525 vpsllvd xmm5, xmm5, xmm3
3526 vpor xmm5, xmm6, xmm5
3527 vmovd xmm6, esi
3528 vpinsrd xmm6, xmm6, ecx, 1
3529 vpinsrd xmm6, xmm6, edx, 2
3530 vpinsrd xmm6, xmm6, r9d, 3
3531 vpinsrd xmm5, xmm5, edi, 2
3532 vpinsrd xmm5, xmm5, eax, 3
3533 vinserti128 ymm5, ymm5, xmm6, 1
3534 vpsrlvd ymm5, ymm5, ymm4
3535 vpand ymm5, ymm5, ymm0
3536 vmovdqu ymmword ptr [r15 - 64], ymm5
# Outputs 16-23, from input dwords 13-19 ([rbx - 48] .. [rbx - 24]);
# same lane layout and shifts as outputs 0-7.
3537 mov eax, dword ptr [rbx - 28]
3538 mov r9d, dword ptr [rbx - 24]
3539 shld r9d, eax, 10
3540 mov r11d, dword ptr [rbx - 32]
3541 shld eax, r11d, 4
3542 mov esi, dword ptr [rbx - 36]
3543 mov edi, r11d
3544 shld edi, esi, 24
3545 mov ecx, dword ptr [rbx - 40]
3546 shld esi, ecx, 18
3547 mov r10d, dword ptr [rbx - 48]
3548 mov edx, dword ptr [rbx - 44]
3549 shld ecx, edx, 12
3550 shld edx, r10d, 6
3551 vmovd xmm5, r10d
3552 vmovd xmm6, edi
3553 vpinsrd xmm5, xmm5, edx, 1
3554 vpinsrd xmm6, xmm6, r11d, 1
3555 vpinsrd xmm5, xmm5, ecx, 2
3556 vpinsrd xmm6, xmm6, eax, 2
3557 vpinsrd xmm5, xmm5, esi, 3
3558 vpinsrd xmm6, xmm6, r9d, 3
3559 vinserti128 ymm5, ymm5, xmm6, 1
3560 vpsrlvd ymm5, ymm5, ymm2
3561 vpand ymm5, ymm5, ymm0
3562 vmovdqu ymmword ptr [r15 - 32], ymm5
# Outputs 24-31, from input dwords 19-25 ([rbx - 24] .. [rbx]);
# same lane layout and shifts as outputs 8-15.
3563 mov r9d, dword ptr [rbx]
3564 mov ecx, dword ptr [rbx - 4]
3565 mov edx, r9d
3566 shld edx, ecx, 20
3567 mov esi, dword ptr [rbx - 8]
3568 shld ecx, esi, 14
3569 mov edi, dword ptr [rbx - 16]
3570 mov eax, dword ptr [rbx - 12]
3571 shld esi, eax, 8
3572 shld eax, edi, 2
3573 vmovq xmm5, qword ptr [rbx - 24] # xmm5 = mem[0],zero
3574 vpsrlvd xmm6, xmm5, xmm1
3575 vpshufd xmm5, xmm5, 229 # xmm5 = xmm5[1,1,2,3]
3576 vpinsrd xmm5, xmm5, edi, 1
3577 vpsllvd xmm5, xmm5, xmm3
3578 vpor xmm5, xmm6, xmm5
3579 vmovd xmm6, esi
3580 vpinsrd xmm6, xmm6, ecx, 1
3581 vpinsrd xmm6, xmm6, edx, 2
3582 vpinsrd xmm6, xmm6, r9d, 3
3583 vpinsrd xmm5, xmm5, edi, 2
3584 vpinsrd xmm5, xmm5, eax, 3
3585 vinserti128 ymm5, ymm5, xmm6, 1
3586 vpsrlvd ymm5, ymm5, ymm4
3587 vpand ymm5, ymm5, ymm0
3588 vmovdqu ymmword ptr [r15], ymm5
3589 sub r15, -128 # output cursor += 128 bytes (32 dwords)
3590 add rbx, 104 # input cursor += 104 bytes (26 dwords)
3591 add r8, -1 # one iteration done
3592 jne .LBB0_107 # loop until r8 reaches zero
3593 jmp .LBB0_147
# .LBB0_135 / .LBB0_137: unpack loop for the 6-bit case.
# Each iteration reads 6 input dwords (24 bytes) through rbx and stores
# 32 output dwords (128 bytes) through r15, every output masked to its low
# 6 bits. Fields are taken LSB-first from the input dword stream: output k
# is stream bits [6k, 6k + 6).
# Registers as used here: rbx = input cursor, r15 = output cursor,
# rax = remaining iterations, ymm1 = field mask, ymm0/ymm2 = per-lane shifts.
# ecx/edx/esi and ymm3/ymm4 are scratch.
# NOTE(review): edx and r14d are set before this chunk; presumably edx is the
# value count and r14d = edx / 32 (so r14d >= 1 past the guard) - confirm.
3594.LBB0_135:
3595 cmp edx, 32
3596 jl .LBB0_147 # signed edx < 32: skip the loop, go to the common exit
3597# %bb.136:
3598 mov eax, r14d # rax = loop trip count (r14d, zero-extended)
3599 add r15, 96 # bias output cursor: stores below use [r15 - 96] .. [r15]
3600 add rbx, 20 # bias input cursor: loads below use [rbx - 20] .. [rbx]
# ymm0: per-lane right shifts for output groups 0-7 and 16-23.
3601 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_116] # ymm0 = [0,6,12,18,24,0,4,10]
# ymm1: per-dword mask 0x3f (low 6 bits).
3602 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_117] # ymm1 = [270582939711,270582939711,270582939711,270582939711]
# ymm2: per-lane right shifts for output groups 8-15 and 24-31.
3603 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_118] # ymm2 = [16,22,0,2,8,14,20,26]
3604 .p2align 4, 0x90
3605.LBB0_137: # =>This Inner Loop Header: Depth=1
# Outputs 0-7, from input dwords 0-1 ([rbx - 20], [rbx - 16]).
# shld dst, src, n leaves (dst << n) | (src >> (32 - n)) in dst; here it
# gathers the one field that straddles the two dwords (lane 5).
3606 mov ecx, dword ptr [rbx - 20]
3607 mov edx, dword ptr [rbx - 16]
3608 mov esi, edx
3609 shld esi, ecx, 2
3610 vmovd xmm3, ecx
3611 vpbroadcastd xmm4, xmm3 # lanes 0-3: four copies of dword 0
3612 vpinsrd xmm3, xmm3, esi, 1
3613 vpinsrd xmm3, xmm3, edx, 2
3614 vpinsrd xmm3, xmm3, edx, 3
3615 vinserti128 ymm3, ymm4, xmm3, 1 # low half = xmm4, high half = xmm3
3616 vpsrlvd ymm3, ymm3, ymm0 # per-lane shift brings each field down to bit 0
3617 vpand ymm3, ymm3, ymm1 # keep the low 6 bits of every lane
3618 vmovdqu ymmword ptr [r15 - 96], ymm3
# Outputs 8-15, from input dwords 1-2 ([rbx - 16], [rbx - 12]);
# the straddling field lands in lane 2.
3619 mov ecx, dword ptr [rbx - 16]
3620 mov edx, dword ptr [rbx - 12]
3621 mov esi, edx
3622 shld esi, ecx, 4
3623 vmovd xmm3, ecx
3624 vpinsrd xmm3, xmm3, ecx, 1
3625 vpinsrd xmm3, xmm3, esi, 2
3626 vpinsrd xmm3, xmm3, edx, 3
3627 vmovd xmm4, edx
3628 vpbroadcastd xmm4, xmm4 # lanes 4-7: four copies of dword 2
3629 vinserti128 ymm3, ymm3, xmm4, 1
3630 vpsrlvd ymm3, ymm3, ymm2
3631 vpand ymm3, ymm3, ymm1
3632 vmovdqu ymmword ptr [r15 - 64], ymm3
# Outputs 16-23, from input dwords 3-4 ([rbx - 8], [rbx - 4]);
# same lane layout and shifts as outputs 0-7.
3633 mov ecx, dword ptr [rbx - 8]
3634 mov edx, dword ptr [rbx - 4]
3635 mov esi, edx
3636 shld esi, ecx, 2
3637 vmovd xmm3, ecx
3638 vpinsrd xmm4, xmm3, esi, 1
3639 vpinsrd xmm4, xmm4, edx, 2
3640 vpbroadcastd xmm3, xmm3
3641 vpinsrd xmm4, xmm4, edx, 3
3642 vinserti128 ymm3, ymm3, xmm4, 1
3643 vpsrlvd ymm3, ymm3, ymm0
3644 vpand ymm3, ymm3, ymm1
3645 vmovdqu ymmword ptr [r15 - 32], ymm3
# Outputs 24-31, from input dwords 4-5 ([rbx - 4], [rbx]);
# same lane layout and shifts as outputs 8-15.
3646 mov ecx, dword ptr [rbx - 4]
3647 mov edx, dword ptr [rbx]
3648 mov esi, edx
3649 shld esi, ecx, 4
3650 vmovd xmm3, ecx
3651 vpinsrd xmm3, xmm3, ecx, 1
3652 vpinsrd xmm3, xmm3, esi, 2
3653 vpinsrd xmm3, xmm3, edx, 3
3654 vmovd xmm4, edx
3655 vpbroadcastd xmm4, xmm4
3656 vinserti128 ymm3, ymm3, xmm4, 1
3657 vpsrlvd ymm3, ymm3, ymm2
3658 vpand ymm3, ymm3, ymm1
3659 vmovdqu ymmword ptr [r15], ymm3
3660 sub r15, -128 # output cursor += 128 bytes (32 dwords)
3661 add rbx, 24 # input cursor += 24 bytes (6 dwords)
3662 add rax, -1 # one iteration done
3663 jne .LBB0_137 # loop until rax reaches zero
3664 jmp .LBB0_147
# .LBB0_111 / .LBB0_113: unpack loop for the 22-bit case.
# Each iteration reads 22 input dwords (88 bytes) through rbx and stores
# 32 output dwords (128 bytes) through r15, every output masked to its low
# 22 bits. Fields are taken LSB-first from the input dword stream: output k
# is stream bits [22k, 22k + 22).
# Registers as used here: rbx = input cursor, r15 = output cursor,
# r8 = remaining iterations, ymm1 = field mask, ymm0/ymm2 = per-lane shifts.
# eax/ecx/edx/esi/edi/r9d/r10d/r11d and ymm3/ymm4 are scratch.
# NOTE(review): edx and r14d are set before this chunk; presumably edx is the
# value count and r14d = edx / 32 (so r14d >= 1 past the guard) - confirm.
3665.LBB0_111:
3666 cmp edx, 32
3667 jl .LBB0_147 # signed edx < 32: skip the loop, go to the common exit
3668# %bb.112:
3669 mov r8d, r14d # r8 = loop trip count (r14d, zero-extended)
3670 add r15, 96 # bias output cursor: stores below use [r15 - 96] .. [r15]
3671 add rbx, 84 # bias input cursor: loads below use [rbx - 84] .. [rbx]
# ymm0: per-lane right shifts for output groups 0-7 and 16-23.
3672 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_55] # ymm0 = [0,0,0,2,0,0,4,0]
# ymm1: per-dword mask 0x003fffff (low 22 bits).
3673 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_56] # ymm1 = [18014394218708991,18014394218708991,18014394218708991,18014394218708991]
# ymm2: per-lane right shifts for output groups 8-15 and 24-31.
3674 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_57] # ymm2 = [0,6,0,0,8,0,0,10]
3675 .p2align 4, 0x90
3676.LBB0_113: # =>This Inner Loop Header: Depth=1
# Outputs 0-7, from input dwords 0-5 ([rbx - 84] .. [rbx - 64]).
# shld dst, src, n leaves (dst << n) | (src >> (32 - n)) in dst, which gathers
# a field straddling two adjacent input dwords into one register. The shld
# order matters: each source register is consumed before it is overwritten.
3677 mov r10d, dword ptr [rbx - 68]
3678 mov r9d, dword ptr [rbx - 64]
3679 shld r9d, r10d, 6
3680 mov esi, dword ptr [rbx - 72]
3681 mov edi, r10d
3682 shld edi, esi, 18
3683 mov edx, dword ptr [rbx - 76]
3684 shld esi, edx, 8
3685 mov r11d, dword ptr [rbx - 84]
3686 mov ecx, dword ptr [rbx - 80]
3687 mov eax, edx
3688 shld eax, ecx, 20
3689 shld ecx, r11d, 10
# Build lanes 0-3 in xmm3 and lanes 4-7 in xmm4, then join them.
3690 vmovd xmm3, r11d
3691 vmovd xmm4, esi
3692 vpinsrd xmm3, xmm3, ecx, 1
3693 vpinsrd xmm4, xmm4, edi, 1
3694 vpinsrd xmm3, xmm3, eax, 2
3695 vpinsrd xmm4, xmm4, r10d, 2
3696 vpinsrd xmm3, xmm3, edx, 3
3697 vpinsrd xmm4, xmm4, r9d, 3
3698 vinserti128 ymm3, ymm3, xmm4, 1
3699 vpsrlvd ymm3, ymm3, ymm0 # per-lane shift brings each field down to bit 0
3700 vpand ymm3, ymm3, ymm1 # keep the low 22 bits of every lane
3701 vmovdqu ymmword ptr [r15 - 96], ymm3
# Outputs 8-15, from input dwords 5-10 ([rbx - 64] .. [rbx - 44]).
3702 mov r9d, dword ptr [rbx - 44]
3703 mov ecx, dword ptr [rbx - 48]
3704 mov r10d, r9d
3705 shld r10d, ecx, 12
3706 mov esi, dword ptr [rbx - 52]
3707 shld ecx, esi, 2
3708 mov edi, dword ptr [rbx - 56]
3709 vmovd xmm3, esi # capture the unmerged dword before esi is overwritten
3710 shld esi, edi, 14
3711 mov eax, dword ptr [rbx - 64]
3712 mov edx, dword ptr [rbx - 60]
3713 shld edi, edx, 4
3714 shrd eax, edx, 16 # eax = (eax >> 16) | (edx << 16)
3715 vpinsrd xmm3, xmm3, ecx, 1
3716 vmovd xmm4, eax
3717 vpinsrd xmm3, xmm3, r10d, 2
3718 vpinsrd xmm4, xmm4, edx, 1
3719 vpinsrd xmm3, xmm3, r9d, 3
3720 vpinsrd xmm4, xmm4, edi, 2
3721 vpinsrd xmm4, xmm4, esi, 3
3722 vinserti128 ymm3, ymm4, xmm3, 1 # low half = xmm4, high half = xmm3
3723 vpsrlvd ymm3, ymm3, ymm2
3724 vpand ymm3, ymm3, ymm1
3725 vmovdqu ymmword ptr [r15 - 64], ymm3
# Outputs 16-23, from input dwords 11-16 ([rbx - 40] .. [rbx - 20]);
# same lane layout and shifts as outputs 0-7.
3726 mov r10d, dword ptr [rbx - 24]
3727 mov r9d, dword ptr [rbx - 20]
3728 shld r9d, r10d, 6
3729 mov edx, dword ptr [rbx - 28]
3730 mov esi, r10d
3731 shld esi, edx, 18
3732 mov ecx, dword ptr [rbx - 32]
3733 shld edx, ecx, 8
3734 mov r11d, dword ptr [rbx - 40]
3735 mov eax, dword ptr [rbx - 36]
3736 mov edi, ecx
3737 shld edi, eax, 20
3738 shld eax, r11d, 10
3739 vmovd xmm3, r11d
3740 vmovd xmm4, edx
3741 vpinsrd xmm3, xmm3, eax, 1
3742 vpinsrd xmm4, xmm4, esi, 1
3743 vpinsrd xmm3, xmm3, edi, 2
3744 vpinsrd xmm4, xmm4, r10d, 2
3745 vpinsrd xmm3, xmm3, ecx, 3
3746 vpinsrd xmm4, xmm4, r9d, 3
3747 vinserti128 ymm3, ymm3, xmm4, 1
3748 vpsrlvd ymm3, ymm3, ymm0
3749 vpand ymm3, ymm3, ymm1
3750 vmovdqu ymmword ptr [r15 - 32], ymm3
# Outputs 24-31, from input dwords 16-21 ([rbx - 20] .. [rbx]);
# same lane layout and shifts as outputs 8-15.
3751 mov r9d, dword ptr [rbx]
3752 mov ecx, dword ptr [rbx - 4]
3753 mov r10d, r9d
3754 shld r10d, ecx, 12
3755 mov esi, dword ptr [rbx - 8]
3756 shld ecx, esi, 2
3757 mov edi, dword ptr [rbx - 12]
3758 vmovd xmm3, esi
3759 shld esi, edi, 14
3760 mov eax, dword ptr [rbx - 20]
3761 mov edx, dword ptr [rbx - 16]
3762 shld edi, edx, 4
3763 shrd eax, edx, 16
3764 vpinsrd xmm3, xmm3, ecx, 1
3765 vmovd xmm4, eax
3766 vpinsrd xmm3, xmm3, r10d, 2
3767 vpinsrd xmm4, xmm4, edx, 1
3768 vpinsrd xmm3, xmm3, r9d, 3
3769 vpinsrd xmm4, xmm4, edi, 2
3770 vpinsrd xmm4, xmm4, esi, 3
3771 vinserti128 ymm3, ymm4, xmm3, 1
3772 vpsrlvd ymm3, ymm3, ymm2
3773 vpand ymm3, ymm3, ymm1
3774 vmovdqu ymmword ptr [r15], ymm3
3775 sub r15, -128 # output cursor += 128 bytes (32 dwords)
3776 add rbx, 88 # input cursor += 88 bytes (22 dwords)
3777 add r8, -1 # one iteration done
3778 jne .LBB0_113 # loop until r8 reaches zero
3779 jmp .LBB0_147
# .LBB0_123 / .LBB0_125: unpack loop for the 14-bit case.
# Each iteration reads 14 input dwords (56 bytes) through rbx and stores
# 32 output dwords (128 bytes) through r15, every output masked to its low
# 14 bits. Fields are taken LSB-first from the input dword stream: output k
# is stream bits [14k, 14k + 14).
# Registers as used here: rbx = input cursor, r15 = output cursor,
# r8 = remaining iterations, ymm1 = field mask, ymm0/ymm2 = per-lane shifts.
# eax/ecx/edx/esi/edi/r9d/r10d and ymm3/ymm4 are scratch.
# NOTE(review): edx and r14d are set before this chunk; presumably edx is the
# value count and r14d = edx / 32 (so r14d >= 1 past the guard) - confirm.
3780.LBB0_123:
3781 cmp edx, 32
3782 jl .LBB0_147 # signed edx < 32: skip the loop, go to the common exit
3783# %bb.124:
3784 mov r8d, r14d # r8 = loop trip count (r14d, zero-extended)
3785 add r15, 96 # bias output cursor: stores below use [r15 - 96] .. [r15]
3786 add rbx, 52 # bias input cursor: loads below use [rbx - 52] .. [rbx]
# ymm0: per-lane right shifts for output groups 0-7 and 16-23.
3787 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_86] # ymm0 = [0,14,0,10,0,6,0,2]
# ymm1: per-dword mask 0x00003fff (low 14 bits).
3788 vpbroadcastq ymm1, qword ptr [rip + .LCPI0_87] # ymm1 = [70364449226751,70364449226751,70364449226751,70364449226751]
# ymm2: per-lane right shifts for output groups 8-15 and 24-31.
3789 vmovdqa ymm2, ymmword ptr [rip + .LCPI0_88] # ymm2 = [16,0,12,0,8,0,4,18]
3790 .p2align 4, 0x90
3791.LBB0_125: # =>This Inner Loop Header: Depth=1
# Outputs 0-7, from input dwords 0-3 ([rbx - 52] .. [rbx - 40]).
# shld dst, src, n leaves (dst << n) | (src >> (32 - n)) in dst, which gathers
# a field straddling two adjacent input dwords into one register.
3792 mov r9d, dword ptr [rbx - 40]
3793 mov ecx, dword ptr [rbx - 44]
3794 mov esi, r9d
3795 shld esi, ecx, 12
3796 mov edi, dword ptr [rbx - 52]
3797 mov r10d, dword ptr [rbx - 48]
3798 mov edx, ecx
3799 shld edx, r10d, 8
3800 mov eax, r10d
3801 shld eax, edi, 4
# xmm3 holds lanes 4-7 and xmm4 lanes 0-3 (see the vinserti128 operand order).
3802 vmovd xmm3, edx
3803 vpinsrd xmm3, xmm3, ecx, 1
3804 vpinsrd xmm3, xmm3, esi, 2
3805 vpinsrd xmm3, xmm3, r9d, 3
3806 vmovd xmm4, edi
3807 vpinsrd xmm4, xmm4, edi, 1
3808 vpinsrd xmm4, xmm4, eax, 2
3809 vpinsrd xmm4, xmm4, r10d, 3
3810 vinserti128 ymm3, ymm4, xmm3, 1 # low half = xmm4, high half = xmm3
3811 vpsrlvd ymm3, ymm3, ymm0 # per-lane shift brings each field down to bit 0
3812 vpand ymm3, ymm3, ymm1 # keep the low 14 bits of every lane
3813 vmovdqu ymmword ptr [r15 - 96], ymm3
# Outputs 8-15, from input dwords 3-6 ([rbx - 40] .. [rbx - 28]).
3814 mov eax, dword ptr [rbx - 28]
3815 mov ecx, dword ptr [rbx - 32]
3816 mov edx, eax
3817 shld edx, ecx, 10
3818 mov r9d, dword ptr [rbx - 40]
3819 mov esi, dword ptr [rbx - 36]
3820 vmovd xmm3, ecx # capture the unmerged dword before ecx is overwritten
3821 shld ecx, esi, 6
3822 mov edi, esi
3823 shld edi, r9d, 2
3824 vmovd xmm4, r9d
3825 vpinsrd xmm4, xmm4, edi, 1
3826 vpinsrd xmm4, xmm4, esi, 2
3827 vpinsrd xmm4, xmm4, ecx, 3
3828 vpinsrd xmm3, xmm3, edx, 1
3829 vpinsrd xmm3, xmm3, eax, 2
3830 vpinsrd xmm3, xmm3, eax, 3
3831 vinserti128 ymm3, ymm4, xmm3, 1
3832 vpsrlvd ymm3, ymm3, ymm2
3833 vpand ymm3, ymm3, ymm1
3834 vmovdqu ymmword ptr [r15 - 64], ymm3
# Outputs 16-23, from input dwords 7-10 ([rbx - 24] .. [rbx - 12]);
# same lane layout and shifts as outputs 0-7.
3835 mov r9d, dword ptr [rbx - 12]
3836 mov eax, dword ptr [rbx - 16]
3837 mov edx, r9d
3838 shld edx, eax, 12
3839 mov esi, dword ptr [rbx - 24]
3840 mov r10d, dword ptr [rbx - 20]
3841 mov ecx, eax
3842 shld ecx, r10d, 8
3843 mov edi, r10d
3844 shld edi, esi, 4
3845 vmovd xmm3, ecx
3846 vpinsrd xmm3, xmm3, eax, 1
3847 vpinsrd xmm3, xmm3, edx, 2
3848 vpinsrd xmm3, xmm3, r9d, 3
3849 vmovd xmm4, esi
3850 vpinsrd xmm4, xmm4, esi, 1
3851 vpinsrd xmm4, xmm4, edi, 2
3852 vpinsrd xmm4, xmm4, r10d, 3
3853 vinserti128 ymm3, ymm4, xmm3, 1
3854 vpsrlvd ymm3, ymm3, ymm0
3855 vpand ymm3, ymm3, ymm1
3856 vmovdqu ymmword ptr [r15 - 32], ymm3
# Outputs 24-31, from input dwords 10-13 ([rbx - 12] .. [rbx]);
# same lane layout and shifts as outputs 8-15.
3857 mov r9d, dword ptr [rbx]
3858 mov ecx, dword ptr [rbx - 4]
3859 mov edx, r9d
3860 shld edx, ecx, 10
3861 mov eax, dword ptr [rbx - 8]
3862 vmovd xmm3, ecx
3863 shld ecx, eax, 6
3864 mov edi, dword ptr [rbx - 12]
3865 mov esi, eax
3866 shld esi, edi, 2
3867 vmovd xmm4, edi
3868 vpinsrd xmm4, xmm4, esi, 1
3869 vpinsrd xmm4, xmm4, eax, 2
3870 vpinsrd xmm4, xmm4, ecx, 3
3871 vpinsrd xmm3, xmm3, edx, 1
3872 vpinsrd xmm3, xmm3, r9d, 2
3873 vpinsrd xmm3, xmm3, r9d, 3
3874 vinserti128 ymm3, ymm4, xmm3, 1
3875 vpsrlvd ymm3, ymm3, ymm2
3876 vpand ymm3, ymm3, ymm1
3877 vmovdqu ymmword ptr [r15], ymm3
3878 sub r15, -128 # output cursor += 128 bytes (32 dwords)
3879 add rbx, 56 # input cursor += 56 bytes (14 dwords)
3880 add r8, -1 # one iteration done
3881 jne .LBB0_125 # loop until r8 reaches zero
3882 jmp .LBB0_147
# .LBB0_99 / .LBB0_101: unpack loop for the 30-bit case.
# Each iteration reads 30 input dwords (120 bytes) through rbx and stores
# 32 output dwords (128 bytes) through r15, every output masked to its low
# 30 bits. Fields are taken LSB-first from the input dword stream: output k
# is stream bits [30k, 30k + 30).
# Registers as used here: rbx = input cursor, r15 = output cursor,
# r8 = remaining iterations, ymm0 = field mask, xmm1/xmm2/ymm3 = shift counts.
# eax/ecx/edx/esi/edi/r9d/r10d/r11d and ymm4/ymm5/ymm6 are scratch.
# When the loop finishes, control falls through into .LBB0_147 (no jmp).
# NOTE(review): edx and r14d are set before this chunk; presumably edx is the
# value count and r14d = edx / 32 (so r14d >= 1 past the guard) - confirm.
3883.LBB0_99:
3884 cmp edx, 32
3885 jl .LBB0_147 # signed edx < 32: skip the loop, go to the common exit
3886# %bb.100:
3887 mov r8d, r14d # r8 = loop trip count (r14d, zero-extended)
3888 add r15, 96 # bias output cursor: stores below use [r15 - 96] .. [r15]
# ymm0: per-dword mask 0x3fffffff (low 30 bits).
3889 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_8] # ymm0 = [4611686015206162431,4611686015206162431,4611686015206162431,4611686015206162431]
3890 add rbx, 116 # bias input cursor: loads below use [rbx - 116] .. [rbx]
# xmm1 / xmm2: right- and left-shift counts for the in-vector merge of
# adjacent dwords used by output groups 8-15 and 24-31.
3891 vmovdqa xmm1, xmmword ptr [rip + .LCPI0_9] # xmm1 = [16,14,12,10]
3892 vmovdqa xmm2, xmmword ptr [rip + .LCPI0_10] # xmm2 = [16,18,20,22]
# ymm3: per-lane right shifts for output groups 8-15 and 24-31.
3893 vmovdqa ymm3, ymmword ptr [rip + .LCPI0_11] # ymm3 = [0,0,0,0,0,0,0,2]
3894 .p2align 4, 0x90
3895.LBB0_101: # =>This Inner Loop Header: Depth=1
# Outputs 0-7, from input dwords 0-7 ([rbx - 116] .. [rbx - 88]).
# shld dst, src, n leaves (dst << n) | (src >> (32 - n)) in dst, which gathers
# a field straddling two adjacent input dwords into one register. The shld
# order matters: each source register is consumed before it is overwritten.
3896 mov r11d, dword ptr [rbx - 92]
3897 mov r9d, dword ptr [rbx - 88]
3898 shld r9d, r11d, 14
3899 mov esi, dword ptr [rbx - 96]
3900 shld r11d, esi, 12
3901 mov edi, dword ptr [rbx - 100]
3902 shld esi, edi, 10
3903 mov eax, dword ptr [rbx - 104]
3904 shld edi, eax, 8
3905 mov edx, dword ptr [rbx - 108]
3906 shld eax, edx, 6
3907 mov r10d, dword ptr [rbx - 116]
3908 mov ecx, dword ptr [rbx - 112]
3909 shld edx, ecx, 4
3910 shld ecx, r10d, 2
# Build lanes 0-3 in xmm4 and lanes 4-7 in xmm5, then join them.
3911 vmovd xmm4, r10d
3912 vmovd xmm5, edi
3913 vpinsrd xmm4, xmm4, ecx, 1
3914 vpinsrd xmm5, xmm5, esi, 1
3915 vpinsrd xmm4, xmm4, edx, 2
3916 vpinsrd xmm5, xmm5, r11d, 2
3917 vpinsrd xmm4, xmm4, eax, 3
3918 vpinsrd xmm5, xmm5, r9d, 3
3919 vinserti128 ymm4, ymm4, xmm5, 1
# Every lane already has its field at bit 0, so only the mask is needed.
3920 vpand ymm4, ymm4, ymm0
3921 vmovdqu ymmword ptr [r15 - 96], ymm4
# Outputs 8-15, from input dwords 7-14 ([rbx - 88] .. [rbx - 60]).
3922 mov eax, dword ptr [rbx - 60]
3923 mov ecx, dword ptr [rbx - 64]
3924 mov edx, eax
3925 shld edx, ecx, 28
3926 mov esi, dword ptr [rbx - 68]
3927 mov edi, dword ptr [rbx - 72]
3928 shld ecx, esi, 26
3929 shld esi, edi, 24
# Lanes 0-3 are merged in-vector instead of with shld:
# (dwords 7..10 >> [16,14,12,10]) | (dwords 8..11 << [16,18,20,22]).
3930 vmovdqu xmm4, xmmword ptr [rbx - 88]
3931 vpsrlvd xmm5, xmm4, xmm1
3932 vpshufd xmm4, xmm4, 249 # xmm4 = xmm4[1,2,3,3]
3933 vpinsrd xmm4, xmm4, edi, 3
3934 vmovd xmm6, esi
3935 vpinsrd xmm6, xmm6, ecx, 1
3936 vpinsrd xmm6, xmm6, edx, 2
3937 vpsllvd xmm4, xmm4, xmm2
3938 vpinsrd xmm6, xmm6, eax, 3
3939 vpor xmm4, xmm5, xmm4
3940 vinserti128 ymm4, ymm4, xmm6, 1
3941 vpsrlvd ymm4, ymm4, ymm3 # only lane 7 still needs a shift (by 2)
3942 vpand ymm4, ymm4, ymm0 # keep the low 30 bits of every lane
3943 vmovdqu ymmword ptr [r15 - 64], ymm4
# Outputs 16-23, from input dwords 15-22 ([rbx - 56] .. [rbx - 28]);
# same lane layout as outputs 0-7.
3944 mov r11d, dword ptr [rbx - 32]
3945 mov r9d, dword ptr [rbx - 28]
3946 shld r9d, r11d, 14
3947 mov edx, dword ptr [rbx - 36]
3948 shld r11d, edx, 12
3949 mov esi, dword ptr [rbx - 40]
3950 shld edx, esi, 10
3951 mov edi, dword ptr [rbx - 44]
3952 shld esi, edi, 8
3953 mov ecx, dword ptr [rbx - 48]
3954 shld edi, ecx, 6
3955 mov r10d, dword ptr [rbx - 56]
3956 mov eax, dword ptr [rbx - 52]
3957 shld ecx, eax, 4
3958 shld eax, r10d, 2
3959 vmovd xmm4, r10d
3960 vmovd xmm5, esi
3961 vpinsrd xmm4, xmm4, eax, 1
3962 vpinsrd xmm5, xmm5, edx, 1
3963 vpinsrd xmm4, xmm4, ecx, 2
3964 vpinsrd xmm5, xmm5, r11d, 2
3965 vpinsrd xmm4, xmm4, edi, 3
3966 vpinsrd xmm5, xmm5, r9d, 3
3967 vinserti128 ymm4, ymm4, xmm5, 1
3968 vpand ymm4, ymm4, ymm0
3969 vmovdqu ymmword ptr [r15 - 32], ymm4
# Outputs 24-31, from input dwords 22-29 ([rbx - 28] .. [rbx]);
# same lane layout and shifts as outputs 8-15.
3970 mov eax, dword ptr [rbx]
3971 mov ecx, dword ptr [rbx - 4]
3972 mov edx, eax
3973 shld edx, ecx, 28
3974 mov esi, dword ptr [rbx - 8]
3975 shld ecx, esi, 26
3976 mov edi, dword ptr [rbx - 12]
3977 vmovdqu xmm4, xmmword ptr [rbx - 28]
3978 shld esi, edi, 24
3979 vpsrlvd xmm5, xmm4, xmm1
3980 vpshufd xmm4, xmm4, 249 # xmm4 = xmm4[1,2,3,3]
3981 vpinsrd xmm4, xmm4, edi, 3
3982 vmovd xmm6, esi
3983 vpinsrd xmm6, xmm6, ecx, 1
3984 vpsllvd xmm4, xmm4, xmm2
3985 vpinsrd xmm6, xmm6, edx, 2
3986 vpinsrd xmm6, xmm6, eax, 3
3987 vpor xmm4, xmm5, xmm4
3988 vinserti128 ymm4, ymm4, xmm6, 1
3989 vpsrlvd ymm4, ymm4, ymm3
3990 vpand ymm4, ymm4, ymm0
3991 vmovdqu ymmword ptr [r15], ymm4
3992 sub r15, -128 # output cursor += 128 bytes (32 dwords)
3993 add rbx, 120 # input cursor += 120 bytes (30 dwords)
3994 add r8, -1 # one iteration done
3995 jne .LBB0_101 # loop until r8 reaches zero, then fall through to the exit
# .LBB0_147: common exit of unpack32_avx2, reached from every unpack case
# visible in this part of the file. Returns r14d * 32 in eax, restores the
# registers popped below and returns.
# NOTE(review): with 32 outputs stored per loop iteration and r14d used as the
# iteration count, the result is presumably the number of values written -
# confirm against the function prologue, which is not visible here.
3996.LBB0_147:
3997 shl r14d, 5 # r14d *= 32
3998 mov eax, r14d # return value
3999 lea rsp, [rbp - 32] # rsp -> the four 8-byte slots popped next, just below rbp's slot
4000 pop rbx
4001 pop r12
4002 pop r14
4003 pop r15
4004 pop rbp
4005 vzeroupper # zero the upper halves of the ymm registers before returning
4006 ret
4007.Lfunc_end0:
4008 .size unpack32_avx2, .Lfunc_end0-unpack32_avx2
4009 # -- End function
4010 .ident "Debian clang version 11.1.0-++20210428103820+1fdec59bffc1-1~exp1~20210428204437.162"
4011 .section ".note.GNU-stack","",@progbits
4012 .addrsig
View as plain text