1// Code generated by command: go run gen.go -avx -out ../accum_vector_avx_amd64.s -pkg xxh3. DO NOT EDIT.
2
3#include "textflag.h"
4
// prime_avx holds the value 0x9e3779b1 in each of four consecutive 64-bit
// slots (32 bytes total), so one 256-bit load places it in every 64-bit lane
// of a YMM register. The upper 32 bits of every slot are zero.
// NOTE(review): presumably the 32-bit prime used by XXH3's scramble step --
// confirm against gen.go.
5DATA prime_avx<>+0(SB)/8, $0x000000009e3779b1
6DATA prime_avx<>+8(SB)/8, $0x000000009e3779b1
7DATA prime_avx<>+16(SB)/8, $0x000000009e3779b1
8DATA prime_avx<>+24(SB)/8, $0x000000009e3779b1
// File-local symbol (<>), 32 bytes, placed in read-only data and flagged as
// containing no pointers.
9GLOBL prime_avx<>(SB), RODATA|NOPTR, $32
10
11// func accumAVX2(acc *[8]uint64, data *byte, key *byte, len uint64)
12// Requires: AVX, AVX2, MMX+
//
// accumAVX2 folds len bytes starting at data into the eight 64-bit
// accumulators at acc, mixing with bytes read from key.
//
// Register roles:
//   AX = acc pointer (loaded at entry, stored back at return)
//   CX = data cursor (advanced as input is consumed)
//   DX = key base (never modified)
//   BX = key cursor for the 64-byte loop (starts at key, advances by 8)
//   SI = number of input bytes still to process
//   Y1 = acc[0..3], Y2 = acc[4..7]
//   Y0 = prime_avx while in accum_large; reused as scratch in accum/finalize
//
// One "step" consumes 64 data bytes (two 32-byte halves, one per accumulator
// register). Per 64-bit lane, with m = data ^ key:
//   acc += swap(data) + lo32(m) * hi32(m)
// where swap exchanges the two 64-bit lanes inside each 128-bit half.
//
// Control flow:
//   accum_large: while SI > 1024, run 16 unrolled steps (1024 bytes) and then
//                scramble the accumulators.
//   accum:       while SI > 64, run one step and slide the key by 8 bytes.
//   finalize:    if SI != 0, run one step over the 64 bytes that end at the
//                end of the remaining input, using key bytes 121..184.
//
// Frame: $0-32 (no locals, 32 bytes of arguments); NOSPLIT.
// Operand order is Go assembler order: sources first, destination last.
13TEXT ·accumAVX2(SB), NOSPLIT, $0-32
14 MOVQ acc+0(FP), AX
15 MOVQ data+8(FP), CX
16 MOVQ key+16(FP), DX
// Second copy of the key pointer: BX is advanced in the accum loop while DX
// stays at the key base for fixed-offset addressing.
17 MOVQ key+16(FP), BX
18 MOVQ len+24(FP), SI
// Load the 64 bytes of accumulator state and the multiplier constant.
19 VMOVDQU (AX), Y1
20 VMOVDQU 32(AX), Y2
21 VMOVDQU prime_avx<>+0(SB), Y0
22
23accum_large:
// Leave this loop once 1024 or fewer bytes remain. JLE is a signed compare,
// so SI is treated as a signed quantity here.
24 CMPQ SI, $0x00000400
25 JLE accum
// Step 0 of 16: data bytes [0, 64), key bytes [0, 64).
26 VMOVDQU (CX), Y3
27 VMOVDQU 32(CX), Y6
// Hint the cache about data 512 bytes ahead of this step.
28 PREFETCHT0 512(CX)
// Y4/Y7 = data ^ key.
29 VPXOR (DX), Y3, Y4
30 VPXOR 32(DX), Y6, Y7
// $0x31 moves the high dword of each 64-bit lane into the low dword position
// (the only dwords VPMULUDQ reads), so Y5/Y8 supply hi32(data ^ key).
31 VPSHUFD $0x31, Y4, Y5
32 VPSHUFD $0x31, Y7, Y8
// Y4/Y7 = lo32(data ^ key) * hi32(data ^ key), a 64-bit product per lane.
33 VPMULUDQ Y4, Y5, Y4
34 VPMULUDQ Y7, Y8, Y7
// $0x4e swaps the two 64-bit lanes within each 128-bit half of the raw data.
35 VPSHUFD $0x4e, Y3, Y3
36 VPSHUFD $0x4e, Y6, Y6
// acc += swapped data; acc += product.
37 VPADDQ Y1, Y3, Y1
38 VPADDQ Y1, Y4, Y1
39 VPADDQ Y2, Y6, Y2
40 VPADDQ Y2, Y7, Y2
// Step 1: data bytes [64, 128), key bytes [8, 72).
41 VMOVDQU 64(CX), Y3
42 VMOVDQU 96(CX), Y6
43 PREFETCHT0 576(CX)
44 VPXOR 8(DX), Y3, Y4
45 VPXOR 40(DX), Y6, Y7
46 VPSHUFD $0x31, Y4, Y5
47 VPSHUFD $0x31, Y7, Y8
48 VPMULUDQ Y4, Y5, Y4
49 VPMULUDQ Y7, Y8, Y7
50 VPSHUFD $0x4e, Y3, Y3
51 VPSHUFD $0x4e, Y6, Y6
52 VPADDQ Y1, Y3, Y1
53 VPADDQ Y1, Y4, Y1
54 VPADDQ Y2, Y6, Y2
55 VPADDQ Y2, Y7, Y2
// Step 2: data bytes [128, 192), key bytes [16, 80).
56 VMOVDQU 128(CX), Y3
57 VMOVDQU 160(CX), Y6
58 PREFETCHT0 640(CX)
59 VPXOR 16(DX), Y3, Y4
60 VPXOR 48(DX), Y6, Y7
61 VPSHUFD $0x31, Y4, Y5
62 VPSHUFD $0x31, Y7, Y8
63 VPMULUDQ Y4, Y5, Y4
64 VPMULUDQ Y7, Y8, Y7
65 VPSHUFD $0x4e, Y3, Y3
66 VPSHUFD $0x4e, Y6, Y6
67 VPADDQ Y1, Y3, Y1
68 VPADDQ Y1, Y4, Y1
69 VPADDQ Y2, Y6, Y2
70 VPADDQ Y2, Y7, Y2
// Step 3: data bytes [192, 256), key bytes [24, 88).
71 VMOVDQU 192(CX), Y3
72 VMOVDQU 224(CX), Y6
73 PREFETCHT0 704(CX)
74 VPXOR 24(DX), Y3, Y4
75 VPXOR 56(DX), Y6, Y7
76 VPSHUFD $0x31, Y4, Y5
77 VPSHUFD $0x31, Y7, Y8
78 VPMULUDQ Y4, Y5, Y4
79 VPMULUDQ Y7, Y8, Y7
80 VPSHUFD $0x4e, Y3, Y3
81 VPSHUFD $0x4e, Y6, Y6
82 VPADDQ Y1, Y3, Y1
83 VPADDQ Y1, Y4, Y1
84 VPADDQ Y2, Y6, Y2
85 VPADDQ Y2, Y7, Y2
// Step 4: data bytes [256, 320), key bytes [32, 96).
86 VMOVDQU 256(CX), Y3
87 VMOVDQU 288(CX), Y6
88 PREFETCHT0 768(CX)
89 VPXOR 32(DX), Y3, Y4
90 VPXOR 64(DX), Y6, Y7
91 VPSHUFD $0x31, Y4, Y5
92 VPSHUFD $0x31, Y7, Y8
93 VPMULUDQ Y4, Y5, Y4
94 VPMULUDQ Y7, Y8, Y7
95 VPSHUFD $0x4e, Y3, Y3
96 VPSHUFD $0x4e, Y6, Y6
97 VPADDQ Y1, Y3, Y1
98 VPADDQ Y1, Y4, Y1
99 VPADDQ Y2, Y6, Y2
100 VPADDQ Y2, Y7, Y2
// Step 5: data bytes [320, 384), key bytes [40, 104).
101 VMOVDQU 320(CX), Y3
102 VMOVDQU 352(CX), Y6
103 PREFETCHT0 832(CX)
104 VPXOR 40(DX), Y3, Y4
105 VPXOR 72(DX), Y6, Y7
106 VPSHUFD $0x31, Y4, Y5
107 VPSHUFD $0x31, Y7, Y8
108 VPMULUDQ Y4, Y5, Y4
109 VPMULUDQ Y7, Y8, Y7
110 VPSHUFD $0x4e, Y3, Y3
111 VPSHUFD $0x4e, Y6, Y6
112 VPADDQ Y1, Y3, Y1
113 VPADDQ Y1, Y4, Y1
114 VPADDQ Y2, Y6, Y2
115 VPADDQ Y2, Y7, Y2
// Step 6: data bytes [384, 448), key bytes [48, 112).
116 VMOVDQU 384(CX), Y3
117 VMOVDQU 416(CX), Y6
118 PREFETCHT0 896(CX)
119 VPXOR 48(DX), Y3, Y4
120 VPXOR 80(DX), Y6, Y7
121 VPSHUFD $0x31, Y4, Y5
122 VPSHUFD $0x31, Y7, Y8
123 VPMULUDQ Y4, Y5, Y4
124 VPMULUDQ Y7, Y8, Y7
125 VPSHUFD $0x4e, Y3, Y3
126 VPSHUFD $0x4e, Y6, Y6
127 VPADDQ Y1, Y3, Y1
128 VPADDQ Y1, Y4, Y1
129 VPADDQ Y2, Y6, Y2
130 VPADDQ Y2, Y7, Y2
// Step 7: data bytes [448, 512), key bytes [56, 120).
131 VMOVDQU 448(CX), Y3
132 VMOVDQU 480(CX), Y6
133 PREFETCHT0 960(CX)
134 VPXOR 56(DX), Y3, Y4
135 VPXOR 88(DX), Y6, Y7
136 VPSHUFD $0x31, Y4, Y5
137 VPSHUFD $0x31, Y7, Y8
138 VPMULUDQ Y4, Y5, Y4
139 VPMULUDQ Y7, Y8, Y7
140 VPSHUFD $0x4e, Y3, Y3
141 VPSHUFD $0x4e, Y6, Y6
142 VPADDQ Y1, Y3, Y1
143 VPADDQ Y1, Y4, Y1
144 VPADDQ Y2, Y6, Y2
145 VPADDQ Y2, Y7, Y2
// Step 8: data bytes [512, 576), key bytes [64, 128).
146 VMOVDQU 512(CX), Y3
147 VMOVDQU 544(CX), Y6
148 PREFETCHT0 1024(CX)
149 VPXOR 64(DX), Y3, Y4
150 VPXOR 96(DX), Y6, Y7
151 VPSHUFD $0x31, Y4, Y5
152 VPSHUFD $0x31, Y7, Y8
153 VPMULUDQ Y4, Y5, Y4
154 VPMULUDQ Y7, Y8, Y7
155 VPSHUFD $0x4e, Y3, Y3
156 VPSHUFD $0x4e, Y6, Y6
157 VPADDQ Y1, Y3, Y1
158 VPADDQ Y1, Y4, Y1
159 VPADDQ Y2, Y6, Y2
160 VPADDQ Y2, Y7, Y2
// Step 9: data bytes [576, 640), key bytes [72, 136).
161 VMOVDQU 576(CX), Y3
162 VMOVDQU 608(CX), Y6
163 PREFETCHT0 1088(CX)
164 VPXOR 72(DX), Y3, Y4
165 VPXOR 104(DX), Y6, Y7
166 VPSHUFD $0x31, Y4, Y5
167 VPSHUFD $0x31, Y7, Y8
168 VPMULUDQ Y4, Y5, Y4
169 VPMULUDQ Y7, Y8, Y7
170 VPSHUFD $0x4e, Y3, Y3
171 VPSHUFD $0x4e, Y6, Y6
172 VPADDQ Y1, Y3, Y1
173 VPADDQ Y1, Y4, Y1
174 VPADDQ Y2, Y6, Y2
175 VPADDQ Y2, Y7, Y2
// Step 10: data bytes [640, 704), key bytes [80, 144).
176 VMOVDQU 640(CX), Y3
177 VMOVDQU 672(CX), Y6
178 PREFETCHT0 1152(CX)
179 VPXOR 80(DX), Y3, Y4
180 VPXOR 112(DX), Y6, Y7
181 VPSHUFD $0x31, Y4, Y5
182 VPSHUFD $0x31, Y7, Y8
183 VPMULUDQ Y4, Y5, Y4
184 VPMULUDQ Y7, Y8, Y7
185 VPSHUFD $0x4e, Y3, Y3
186 VPSHUFD $0x4e, Y6, Y6
187 VPADDQ Y1, Y3, Y1
188 VPADDQ Y1, Y4, Y1
189 VPADDQ Y2, Y6, Y2
190 VPADDQ Y2, Y7, Y2
// Step 11: data bytes [704, 768), key bytes [88, 152).
191 VMOVDQU 704(CX), Y3
192 VMOVDQU 736(CX), Y6
193 PREFETCHT0 1216(CX)
194 VPXOR 88(DX), Y3, Y4
195 VPXOR 120(DX), Y6, Y7
196 VPSHUFD $0x31, Y4, Y5
197 VPSHUFD $0x31, Y7, Y8
198 VPMULUDQ Y4, Y5, Y4
199 VPMULUDQ Y7, Y8, Y7
200 VPSHUFD $0x4e, Y3, Y3
201 VPSHUFD $0x4e, Y6, Y6
202 VPADDQ Y1, Y3, Y1
203 VPADDQ Y1, Y4, Y1
204 VPADDQ Y2, Y6, Y2
205 VPADDQ Y2, Y7, Y2
// Step 12: data bytes [768, 832), key bytes [96, 160).
206 VMOVDQU 768(CX), Y3
207 VMOVDQU 800(CX), Y6
208 PREFETCHT0 1280(CX)
209 VPXOR 96(DX), Y3, Y4
210 VPXOR 128(DX), Y6, Y7
211 VPSHUFD $0x31, Y4, Y5
212 VPSHUFD $0x31, Y7, Y8
213 VPMULUDQ Y4, Y5, Y4
214 VPMULUDQ Y7, Y8, Y7
215 VPSHUFD $0x4e, Y3, Y3
216 VPSHUFD $0x4e, Y6, Y6
217 VPADDQ Y1, Y3, Y1
218 VPADDQ Y1, Y4, Y1
219 VPADDQ Y2, Y6, Y2
220 VPADDQ Y2, Y7, Y2
// Step 13: data bytes [832, 896), key bytes [104, 168).
221 VMOVDQU 832(CX), Y3
222 VMOVDQU 864(CX), Y6
223 PREFETCHT0 1344(CX)
224 VPXOR 104(DX), Y3, Y4
225 VPXOR 136(DX), Y6, Y7
226 VPSHUFD $0x31, Y4, Y5
227 VPSHUFD $0x31, Y7, Y8
228 VPMULUDQ Y4, Y5, Y4
229 VPMULUDQ Y7, Y8, Y7
230 VPSHUFD $0x4e, Y3, Y3
231 VPSHUFD $0x4e, Y6, Y6
232 VPADDQ Y1, Y3, Y1
233 VPADDQ Y1, Y4, Y1
234 VPADDQ Y2, Y6, Y2
235 VPADDQ Y2, Y7, Y2
// Step 14: data bytes [896, 960), key bytes [112, 176).
236 VMOVDQU 896(CX), Y3
237 VMOVDQU 928(CX), Y6
238 PREFETCHT0 1408(CX)
239 VPXOR 112(DX), Y3, Y4
240 VPXOR 144(DX), Y6, Y7
241 VPSHUFD $0x31, Y4, Y5
242 VPSHUFD $0x31, Y7, Y8
243 VPMULUDQ Y4, Y5, Y4
244 VPMULUDQ Y7, Y8, Y7
245 VPSHUFD $0x4e, Y3, Y3
246 VPSHUFD $0x4e, Y6, Y6
247 VPADDQ Y1, Y3, Y1
248 VPADDQ Y1, Y4, Y1
249 VPADDQ Y2, Y6, Y2
250 VPADDQ Y2, Y7, Y2
// Step 15: data bytes [960, 1024), key bytes [120, 184).
251 VMOVDQU 960(CX), Y3
252 VMOVDQU 992(CX), Y6
253 PREFETCHT0 1472(CX)
254 VPXOR 120(DX), Y3, Y4
255 VPXOR 152(DX), Y6, Y7
256 VPSHUFD $0x31, Y4, Y5
257 VPSHUFD $0x31, Y7, Y8
258 VPMULUDQ Y4, Y5, Y4
259 VPMULUDQ Y7, Y8, Y7
260 VPSHUFD $0x4e, Y3, Y3
261 VPSHUFD $0x4e, Y6, Y6
262 VPADDQ Y1, Y3, Y1
263 VPADDQ Y1, Y4, Y1
264 VPADDQ Y2, Y6, Y2
265 VPADDQ Y2, Y7, Y2
// 1024 bytes consumed: advance the data cursor and shrink the remaining count.
266 ADDQ $0x00000400, CX
267 SUBQ $0x00000400, SI
// Scramble Y1. Per 64-bit lane: x = acc ^ (acc >> 47) ^ key[128..160), then
// acc = x * prime (low 64 bits). The 64x32 multiply is assembled from two
// 32x32 products: lo32(x)*prime + ((hi32(x)*prime) << 32).
268 VPSRLQ $0x2f, Y1, Y3
269 VPXOR Y1, Y3, Y3
270 VPXOR 128(DX), Y3, Y3
271 VPMULUDQ Y0, Y3, Y1
// $0xf5 copies the high dword of each 64-bit lane into its low dword.
272 VPSHUFD $0xf5, Y3, Y3
273 VPMULUDQ Y0, Y3, Y3
274 VPSLLQ $0x20, Y3, Y3
275 VPADDQ Y1, Y3, Y1
// Scramble Y2 the same way, using key bytes [160, 192).
276 VPSRLQ $0x2f, Y2, Y3
277 VPXOR Y2, Y3, Y3
278 VPXOR 160(DX), Y3, Y3
279 VPMULUDQ Y0, Y3, Y2
280 VPSHUFD $0xf5, Y3, Y3
281 VPMULUDQ Y0, Y3, Y3
282 VPSLLQ $0x20, Y3, Y3
283 VPADDQ Y2, Y3, Y2
284 JMP accum_large
285
286accum:
// One 64-byte step per iteration while more than 64 bytes remain (signed
// compare). The key is read through BX, which slides forward 8 bytes per step.
287 CMPQ SI, $0x40
288 JLE finalize
// Y0 is reused as a data register from here on; the prime it held is only
// needed by the scramble in accum_large, which is not re-entered.
289 VMOVDQU (CX), Y0
290 VMOVDQU 32(CX), Y5
291 VPXOR (BX), Y0, Y3
292 VPXOR 32(BX), Y5, Y6
293 VPSHUFD $0x31, Y3, Y4
294 VPSHUFD $0x31, Y6, Y7
295 VPMULUDQ Y3, Y4, Y3
296 VPMULUDQ Y6, Y7, Y6
297 VPSHUFD $0x4e, Y0, Y0
298 VPSHUFD $0x4e, Y5, Y5
299 VPADDQ Y1, Y0, Y1
300 VPADDQ Y1, Y3, Y1
301 VPADDQ Y2, Y5, Y2
302 VPADDQ Y2, Y6, Y2
// Advance data by 64, remaining by -64, key cursor by 8.
303 ADDQ $0x00000040, CX
304 SUBQ $0x00000040, SI
305 ADDQ $0x00000008, BX
306 JMP accum
307
308finalize:
// Nothing left: skip the tail step.
309 CMPQ SI, $0x00
310 JE return
// CX = CX - 64 + SI: address of the 64 bytes that end exactly where the
// remaining input ends. When SI < 64 this window starts before CX and
// overlaps bytes already processed.
// NOTE(review): assumes those preceding bytes are readable, i.e. the caller
// never passes fewer than 64 bytes in total -- confirm against callers.
311 SUBQ $0x40, CX
312 ADDQ SI, CX
313 VMOVDQU (CX), Y0
314 VMOVDQU 32(CX), Y5
// The tail step uses a fixed key window at bytes [121, 185) from the key base.
315 VPXOR 121(DX), Y0, Y3
316 VPXOR 153(DX), Y5, Y6
317 VPSHUFD $0x31, Y3, Y4
318 VPSHUFD $0x31, Y6, Y7
319 VPMULUDQ Y3, Y4, Y3
320 VPMULUDQ Y6, Y7, Y6
321 VPSHUFD $0x4e, Y0, Y0
322 VPSHUFD $0x4e, Y5, Y5
323 VPADDQ Y1, Y0, Y1
324 VPADDQ Y1, Y3, Y1
325 VPADDQ Y2, Y5, Y2
326 VPADDQ Y2, Y6, Y2
327
328return:
// Write the accumulators back and clear the upper YMM halves before returning.
329 VMOVDQU Y1, (AX)
330 VMOVDQU Y2, 32(AX)
331 VZEROUPPER
332 RET
333
334// func accumBlockAVX2(acc *[8]uint64, data *byte, key *byte)
335// Requires: AVX, AVX2
//
// accumBlockAVX2 folds exactly 1024 bytes starting at data into the eight
// 64-bit accumulators at acc (16 unrolled 64-byte steps), then scrambles the
// accumulators and stores them back. It takes no length and has no loop.
//
// Register roles:
//   AX = acc pointer, CX = data base, DX = key base (none are modified)
//   Y0 = prime_avx, Y1 = acc[0..3], Y2 = acc[4..7]
//
// Per step i (0..15), per 64-bit lane, with m = data ^ key:
//   acc += swap(data) + lo32(m) * hi32(m)
// using data bytes [64i, 64i+64) and key bytes [8i, 8i+64); swap exchanges the
// two 64-bit lanes inside each 128-bit half.
//
// Frame: $0-24 (no locals, 24 bytes of arguments); NOSPLIT.
// Operand order is Go assembler order: sources first, destination last.
336TEXT ·accumBlockAVX2(SB), NOSPLIT, $0-24
337 MOVQ acc+0(FP), AX
338 MOVQ data+8(FP), CX
339 MOVQ key+16(FP), DX
// Load the 64 bytes of accumulator state and the multiplier constant.
340 VMOVDQU (AX), Y1
341 VMOVDQU 32(AX), Y2
342 VMOVDQU prime_avx<>+0(SB), Y0
// Step 0: data bytes [0, 64), key bytes [0, 64).
343 VMOVDQU (CX), Y3
344 VMOVDQU 32(CX), Y6
// Y4/Y7 = data ^ key.
345 VPXOR (DX), Y3, Y4
346 VPXOR 32(DX), Y6, Y7
// $0x31 moves the high dword of each 64-bit lane into the low dword position
// (the only dwords VPMULUDQ reads), so Y5/Y8 supply hi32(data ^ key).
347 VPSHUFD $0x31, Y4, Y5
348 VPSHUFD $0x31, Y7, Y8
// Y4/Y7 = lo32(data ^ key) * hi32(data ^ key), a 64-bit product per lane.
349 VPMULUDQ Y4, Y5, Y4
350 VPMULUDQ Y7, Y8, Y7
// $0x4e swaps the two 64-bit lanes within each 128-bit half of the raw data.
351 VPSHUFD $0x4e, Y3, Y3
352 VPSHUFD $0x4e, Y6, Y6
// acc += swapped data; acc += product.
353 VPADDQ Y1, Y3, Y1
354 VPADDQ Y1, Y4, Y1
355 VPADDQ Y2, Y6, Y2
356 VPADDQ Y2, Y7, Y2
// Step 1: data bytes [64, 128), key bytes [8, 72).
357 VMOVDQU 64(CX), Y3
358 VMOVDQU 96(CX), Y6
359 VPXOR 8(DX), Y3, Y4
360 VPXOR 40(DX), Y6, Y7
361 VPSHUFD $0x31, Y4, Y5
362 VPSHUFD $0x31, Y7, Y8
363 VPMULUDQ Y4, Y5, Y4
364 VPMULUDQ Y7, Y8, Y7
365 VPSHUFD $0x4e, Y3, Y3
366 VPSHUFD $0x4e, Y6, Y6
367 VPADDQ Y1, Y3, Y1
368 VPADDQ Y1, Y4, Y1
369 VPADDQ Y2, Y6, Y2
370 VPADDQ Y2, Y7, Y2
// Step 2: data bytes [128, 192), key bytes [16, 80).
371 VMOVDQU 128(CX), Y3
372 VMOVDQU 160(CX), Y6
373 VPXOR 16(DX), Y3, Y4
374 VPXOR 48(DX), Y6, Y7
375 VPSHUFD $0x31, Y4, Y5
376 VPSHUFD $0x31, Y7, Y8
377 VPMULUDQ Y4, Y5, Y4
378 VPMULUDQ Y7, Y8, Y7
379 VPSHUFD $0x4e, Y3, Y3
380 VPSHUFD $0x4e, Y6, Y6
381 VPADDQ Y1, Y3, Y1
382 VPADDQ Y1, Y4, Y1
383 VPADDQ Y2, Y6, Y2
384 VPADDQ Y2, Y7, Y2
// Step 3: data bytes [192, 256), key bytes [24, 88).
385 VMOVDQU 192(CX), Y3
386 VMOVDQU 224(CX), Y6
387 VPXOR 24(DX), Y3, Y4
388 VPXOR 56(DX), Y6, Y7
389 VPSHUFD $0x31, Y4, Y5
390 VPSHUFD $0x31, Y7, Y8
391 VPMULUDQ Y4, Y5, Y4
392 VPMULUDQ Y7, Y8, Y7
393 VPSHUFD $0x4e, Y3, Y3
394 VPSHUFD $0x4e, Y6, Y6
395 VPADDQ Y1, Y3, Y1
396 VPADDQ Y1, Y4, Y1
397 VPADDQ Y2, Y6, Y2
398 VPADDQ Y2, Y7, Y2
// Step 4: data bytes [256, 320), key bytes [32, 96).
399 VMOVDQU 256(CX), Y3
400 VMOVDQU 288(CX), Y6
401 VPXOR 32(DX), Y3, Y4
402 VPXOR 64(DX), Y6, Y7
403 VPSHUFD $0x31, Y4, Y5
404 VPSHUFD $0x31, Y7, Y8
405 VPMULUDQ Y4, Y5, Y4
406 VPMULUDQ Y7, Y8, Y7
407 VPSHUFD $0x4e, Y3, Y3
408 VPSHUFD $0x4e, Y6, Y6
409 VPADDQ Y1, Y3, Y1
410 VPADDQ Y1, Y4, Y1
411 VPADDQ Y2, Y6, Y2
412 VPADDQ Y2, Y7, Y2
// Step 5: data bytes [320, 384), key bytes [40, 104).
413 VMOVDQU 320(CX), Y3
414 VMOVDQU 352(CX), Y6
415 VPXOR 40(DX), Y3, Y4
416 VPXOR 72(DX), Y6, Y7
417 VPSHUFD $0x31, Y4, Y5
418 VPSHUFD $0x31, Y7, Y8
419 VPMULUDQ Y4, Y5, Y4
420 VPMULUDQ Y7, Y8, Y7
421 VPSHUFD $0x4e, Y3, Y3
422 VPSHUFD $0x4e, Y6, Y6
423 VPADDQ Y1, Y3, Y1
424 VPADDQ Y1, Y4, Y1
425 VPADDQ Y2, Y6, Y2
426 VPADDQ Y2, Y7, Y2
// Step 6: data bytes [384, 448), key bytes [48, 112).
427 VMOVDQU 384(CX), Y3
428 VMOVDQU 416(CX), Y6
429 VPXOR 48(DX), Y3, Y4
430 VPXOR 80(DX), Y6, Y7
431 VPSHUFD $0x31, Y4, Y5
432 VPSHUFD $0x31, Y7, Y8
433 VPMULUDQ Y4, Y5, Y4
434 VPMULUDQ Y7, Y8, Y7
435 VPSHUFD $0x4e, Y3, Y3
436 VPSHUFD $0x4e, Y6, Y6
437 VPADDQ Y1, Y3, Y1
438 VPADDQ Y1, Y4, Y1
439 VPADDQ Y2, Y6, Y2
440 VPADDQ Y2, Y7, Y2
// Step 7: data bytes [448, 512), key bytes [56, 120).
441 VMOVDQU 448(CX), Y3
442 VMOVDQU 480(CX), Y6
443 VPXOR 56(DX), Y3, Y4
444 VPXOR 88(DX), Y6, Y7
445 VPSHUFD $0x31, Y4, Y5
446 VPSHUFD $0x31, Y7, Y8
447 VPMULUDQ Y4, Y5, Y4
448 VPMULUDQ Y7, Y8, Y7
449 VPSHUFD $0x4e, Y3, Y3
450 VPSHUFD $0x4e, Y6, Y6
451 VPADDQ Y1, Y3, Y1
452 VPADDQ Y1, Y4, Y1
453 VPADDQ Y2, Y6, Y2
454 VPADDQ Y2, Y7, Y2
// Step 8: data bytes [512, 576), key bytes [64, 128).
455 VMOVDQU 512(CX), Y3
456 VMOVDQU 544(CX), Y6
457 VPXOR 64(DX), Y3, Y4
458 VPXOR 96(DX), Y6, Y7
459 VPSHUFD $0x31, Y4, Y5
460 VPSHUFD $0x31, Y7, Y8
461 VPMULUDQ Y4, Y5, Y4
462 VPMULUDQ Y7, Y8, Y7
463 VPSHUFD $0x4e, Y3, Y3
464 VPSHUFD $0x4e, Y6, Y6
465 VPADDQ Y1, Y3, Y1
466 VPADDQ Y1, Y4, Y1
467 VPADDQ Y2, Y6, Y2
468 VPADDQ Y2, Y7, Y2
// Step 9: data bytes [576, 640), key bytes [72, 136).
469 VMOVDQU 576(CX), Y3
470 VMOVDQU 608(CX), Y6
471 VPXOR 72(DX), Y3, Y4
472 VPXOR 104(DX), Y6, Y7
473 VPSHUFD $0x31, Y4, Y5
474 VPSHUFD $0x31, Y7, Y8
475 VPMULUDQ Y4, Y5, Y4
476 VPMULUDQ Y7, Y8, Y7
477 VPSHUFD $0x4e, Y3, Y3
478 VPSHUFD $0x4e, Y6, Y6
479 VPADDQ Y1, Y3, Y1
480 VPADDQ Y1, Y4, Y1
481 VPADDQ Y2, Y6, Y2
482 VPADDQ Y2, Y7, Y2
// Step 10: data bytes [640, 704), key bytes [80, 144).
483 VMOVDQU 640(CX), Y3
484 VMOVDQU 672(CX), Y6
485 VPXOR 80(DX), Y3, Y4
486 VPXOR 112(DX), Y6, Y7
487 VPSHUFD $0x31, Y4, Y5
488 VPSHUFD $0x31, Y7, Y8
489 VPMULUDQ Y4, Y5, Y4
490 VPMULUDQ Y7, Y8, Y7
491 VPSHUFD $0x4e, Y3, Y3
492 VPSHUFD $0x4e, Y6, Y6
493 VPADDQ Y1, Y3, Y1
494 VPADDQ Y1, Y4, Y1
495 VPADDQ Y2, Y6, Y2
496 VPADDQ Y2, Y7, Y2
// Step 11: data bytes [704, 768), key bytes [88, 152).
497 VMOVDQU 704(CX), Y3
498 VMOVDQU 736(CX), Y6
499 VPXOR 88(DX), Y3, Y4
500 VPXOR 120(DX), Y6, Y7
501 VPSHUFD $0x31, Y4, Y5
502 VPSHUFD $0x31, Y7, Y8
503 VPMULUDQ Y4, Y5, Y4
504 VPMULUDQ Y7, Y8, Y7
505 VPSHUFD $0x4e, Y3, Y3
506 VPSHUFD $0x4e, Y6, Y6
507 VPADDQ Y1, Y3, Y1
508 VPADDQ Y1, Y4, Y1
509 VPADDQ Y2, Y6, Y2
510 VPADDQ Y2, Y7, Y2
// Step 12: data bytes [768, 832), key bytes [96, 160).
511 VMOVDQU 768(CX), Y3
512 VMOVDQU 800(CX), Y6
513 VPXOR 96(DX), Y3, Y4
514 VPXOR 128(DX), Y6, Y7
515 VPSHUFD $0x31, Y4, Y5
516 VPSHUFD $0x31, Y7, Y8
517 VPMULUDQ Y4, Y5, Y4
518 VPMULUDQ Y7, Y8, Y7
519 VPSHUFD $0x4e, Y3, Y3
520 VPSHUFD $0x4e, Y6, Y6
521 VPADDQ Y1, Y3, Y1
522 VPADDQ Y1, Y4, Y1
523 VPADDQ Y2, Y6, Y2
524 VPADDQ Y2, Y7, Y2
// Step 13: data bytes [832, 896), key bytes [104, 168).
525 VMOVDQU 832(CX), Y3
526 VMOVDQU 864(CX), Y6
527 VPXOR 104(DX), Y3, Y4
528 VPXOR 136(DX), Y6, Y7
529 VPSHUFD $0x31, Y4, Y5
530 VPSHUFD $0x31, Y7, Y8
531 VPMULUDQ Y4, Y5, Y4
532 VPMULUDQ Y7, Y8, Y7
533 VPSHUFD $0x4e, Y3, Y3
534 VPSHUFD $0x4e, Y6, Y6
535 VPADDQ Y1, Y3, Y1
536 VPADDQ Y1, Y4, Y1
537 VPADDQ Y2, Y6, Y2
538 VPADDQ Y2, Y7, Y2
// Step 14: data bytes [896, 960), key bytes [112, 176).
539 VMOVDQU 896(CX), Y3
540 VMOVDQU 928(CX), Y6
541 VPXOR 112(DX), Y3, Y4
542 VPXOR 144(DX), Y6, Y7
543 VPSHUFD $0x31, Y4, Y5
544 VPSHUFD $0x31, Y7, Y8
545 VPMULUDQ Y4, Y5, Y4
546 VPMULUDQ Y7, Y8, Y7
547 VPSHUFD $0x4e, Y3, Y3
548 VPSHUFD $0x4e, Y6, Y6
549 VPADDQ Y1, Y3, Y1
550 VPADDQ Y1, Y4, Y1
551 VPADDQ Y2, Y6, Y2
552 VPADDQ Y2, Y7, Y2
// Step 15: data bytes [960, 1024), key bytes [120, 184).
553 VMOVDQU 960(CX), Y3
554 VMOVDQU 992(CX), Y6
555 VPXOR 120(DX), Y3, Y4
556 VPXOR 152(DX), Y6, Y7
557 VPSHUFD $0x31, Y4, Y5
558 VPSHUFD $0x31, Y7, Y8
559 VPMULUDQ Y4, Y5, Y4
560 VPMULUDQ Y7, Y8, Y7
561 VPSHUFD $0x4e, Y3, Y3
562 VPSHUFD $0x4e, Y6, Y6
563 VPADDQ Y1, Y3, Y1
564 VPADDQ Y1, Y4, Y1
565 VPADDQ Y2, Y6, Y2
566 VPADDQ Y2, Y7, Y2
// Scramble Y1. Per 64-bit lane: x = acc ^ (acc >> 47) ^ key[128..160), then
// acc = x * prime (low 64 bits). The 64x32 multiply is assembled from two
// 32x32 products: lo32(x)*prime + ((hi32(x)*prime) << 32).
567 VPSRLQ $0x2f, Y1, Y3
568 VPXOR Y1, Y3, Y3
569 VPXOR 128(DX), Y3, Y3
570 VPMULUDQ Y0, Y3, Y1
// $0xf5 copies the high dword of each 64-bit lane into its low dword.
571 VPSHUFD $0xf5, Y3, Y3
572 VPMULUDQ Y0, Y3, Y3
573 VPSLLQ $0x20, Y3, Y3
574 VPADDQ Y1, Y3, Y1
// Scramble Y2 the same way, using key bytes [160, 192).
575 VPSRLQ $0x2f, Y2, Y3
576 VPXOR Y2, Y3, Y3
577 VPXOR 160(DX), Y3, Y3
578 VPMULUDQ Y0, Y3, Y2
579 VPSHUFD $0xf5, Y3, Y3
580 VPMULUDQ Y0, Y3, Y3
581 VPSLLQ $0x20, Y3, Y3
582 VPADDQ Y2, Y3, Y2
// Write the accumulators back and clear the upper YMM halves before returning.
583 VMOVDQU Y1, (AX)
584 VMOVDQU Y2, 32(AX)
585 VZEROUPPER
586 RET
View as plain text