1// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
2
3//go:build !appengine && !noasm && gc && !noasm
4
5// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
6// Requires: CMOV
//
// Decodes sequences in a loop. Each iteration writes one 24-byte record
// through the pointer loaded from 104(ctx): literal length at +0, match
// length at +8, offset at +16. The counter at 96(ctx) is decremented once per
// iteration and the loop ends when it becomes negative.
//
// Return codes written to ret+24(FP): 0 success, 1 non-zero match length with
// a zero offset, 2 match length too big, 4 not enough literals, 6 bit reader
// overread. The block returning 3 has no label and follows a RET, so nothing
// in this function reaches it. Only the success path writes the registers
// back to s and br.
//
// Register use inside the loop:
//   DX          64-bit bit buffer, loaded from 24(br)
//   BX          number of bits already consumed from DX, byte at 32(br)
//   SI          remaining input byte count, 8(br); shrinks as the pointer moves
//   (SP), R14   read pointer, initialised to 0(br)+8(br); it moves downwards
//   DI, R8, R9  literal-length, match-length and offset state entries
//   R10         pointer to the current output record
//   R11-R13     values from 144/152/160(s), rotated by "Adjust offset";
//               R11 receives the most recently used offset
//   AX, CX, R15, BP  scratch
//
// A state entry is one 64-bit word: bits 0-7 give the bit count for the
// next-state read, bits 8-15 the extra-bit count for the value, bits 16-31
// the next-state base and bits 32-63 the value baseline.
// NOTE(review): the Go field names behind the numeric struct offsets are not
// visible here — confirm against bitReader, sequenceDecs and decodeAsmContext.
7TEXT ·sequenceDecs_decode_amd64(SB), $8-32
8 MOVQ br+8(FP), CX
9 MOVQ 24(CX), DX // bit buffer
10 MOVBQZX 32(CX), BX // bits consumed so far
11 MOVQ (CX), AX
12 MOVQ 8(CX), SI
13 ADDQ SI, AX // AX = base pointer + remaining byte count
14 MOVQ AX, (SP) // keep the read pointer in the stack slot between iterations
15 MOVQ ctx+16(FP), AX
16 MOVQ 72(AX), DI // literal-length state entry
17 MOVQ 80(AX), R8 // match-length state entry
18 MOVQ 88(AX), R9 // offset state entry
19 MOVQ 104(AX), R10 // output record pointer
20 MOVQ s+0(FP), AX
21 MOVQ 144(AX), R11
22 MOVQ 152(AX), R12
23 MOVQ 160(AX), R13
24
25sequenceDecs_decode_amd64_main_loop:
26 MOVQ (SP), R14
27
28 // Fill bitreader to have enough for the offset and match length.
29 CMPQ SI, $0x08
30 JL sequenceDecs_decode_amd64_fill_byte_by_byte // fewer than 8 bytes left: refill bytewise
31 MOVQ BX, AX
32 SHRQ $0x03, AX // AX = whole bytes consumed (bits / 8)
33 SUBQ AX, R14 // move the read pointer down by that many bytes
34 MOVQ (R14), DX // reload all 64 bits of the buffer
35 SUBQ AX, SI
36 ANDQ $0x07, BX // only the sub-byte part of the bit count remains
37 JMP sequenceDecs_decode_amd64_fill_end
38
39sequenceDecs_decode_amd64_fill_byte_by_byte:
40 CMPQ SI, $0x00
41 JLE sequenceDecs_decode_amd64_fill_check_overread // no input bytes left
42 CMPQ BX, $0x07
43 JLE sequenceDecs_decode_amd64_fill_end // less than a whole byte consumed: done
44 SHLQ $0x08, DX // make room for one byte at the low end
45 SUBQ $0x01, R14
46 SUBQ $0x01, SI
47 SUBQ $0x08, BX
48 MOVBQZX (R14), AX
49 ORQ AX, DX
50 JMP sequenceDecs_decode_amd64_fill_byte_by_byte
51
52sequenceDecs_decode_amd64_fill_check_overread:
53 CMPQ BX, $0x40
54 JA error_overread // more than 64 bits consumed and nothing left to refill
55
56sequenceDecs_decode_amd64_fill_end:
57 // Update offset
58 MOVQ R9, AX
59 MOVQ BX, CX
60 MOVQ DX, R15
61 SHLQ CL, R15 // drop the bits that were already consumed
62 MOVB AH, CL // CL = extra-bit count (bits 8-15 of the entry)
63 SHRQ $0x20, AX // AX = baseline (upper 32 bits of the entry)
64 TESTQ CX, CX
65 JZ sequenceDecs_decode_amd64_of_update_zero // CX == 0: keep the baseline as is
66 ADDQ CX, BX // count the extra bits as consumed
67 CMPQ BX, $0x40
68 JA sequenceDecs_decode_amd64_of_update_zero // past 64 consumed bits: add nothing
69 CMPQ CX, $0x40
70 JAE sequenceDecs_decode_amd64_of_update_zero
71 NEGQ CX
72 SHRQ CL, R15 // shift count is taken mod 64, so this keeps the top n bits
73 ADDQ R15, AX
74
75sequenceDecs_decode_amd64_of_update_zero:
76 MOVQ AX, 16(R10) // record.offset (still unadjusted)
77
78 // Update match length
79 MOVQ R8, AX
80 MOVQ BX, CX
81 MOVQ DX, R15
82 SHLQ CL, R15 // drop the bits that were already consumed
83 MOVB AH, CL // CL = extra-bit count
84 SHRQ $0x20, AX // AX = baseline
85 TESTQ CX, CX
86 JZ sequenceDecs_decode_amd64_ml_update_zero
87 ADDQ CX, BX
88 CMPQ BX, $0x40
89 JA sequenceDecs_decode_amd64_ml_update_zero
90 CMPQ CX, $0x40
91 JAE sequenceDecs_decode_amd64_ml_update_zero
92 NEGQ CX
93 SHRQ CL, R15
94 ADDQ R15, AX
95
96sequenceDecs_decode_amd64_ml_update_zero:
97 MOVQ AX, 8(R10) // record.matchLength
98
99 // Fill bitreader to have enough for the remaining
100 CMPQ SI, $0x08
101 JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
102 MOVQ BX, AX
103 SHRQ $0x03, AX // whole bytes consumed
104 SUBQ AX, R14
105 MOVQ (R14), DX
106 SUBQ AX, SI
107 ANDQ $0x07, BX
108 JMP sequenceDecs_decode_amd64_fill_2_end
109
110sequenceDecs_decode_amd64_fill_2_byte_by_byte:
111 CMPQ SI, $0x00
112 JLE sequenceDecs_decode_amd64_fill_2_check_overread
113 CMPQ BX, $0x07
114 JLE sequenceDecs_decode_amd64_fill_2_end
115 SHLQ $0x08, DX
116 SUBQ $0x01, R14
117 SUBQ $0x01, SI
118 SUBQ $0x08, BX
119 MOVBQZX (R14), AX
120 ORQ AX, DX
121 JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
122
123sequenceDecs_decode_amd64_fill_2_check_overread:
124 CMPQ BX, $0x40
125 JA error_overread
126
127sequenceDecs_decode_amd64_fill_2_end:
128 // Update literal length
129 MOVQ DI, AX
130 MOVQ BX, CX
131 MOVQ DX, R15
132 SHLQ CL, R15 // drop the bits that were already consumed
133 MOVB AH, CL // CL = extra-bit count
134 SHRQ $0x20, AX // AX = baseline
135 TESTQ CX, CX
136 JZ sequenceDecs_decode_amd64_ll_update_zero
137 ADDQ CX, BX
138 CMPQ BX, $0x40
139 JA sequenceDecs_decode_amd64_ll_update_zero
140 CMPQ CX, $0x40
141 JAE sequenceDecs_decode_amd64_ll_update_zero
142 NEGQ CX
143 SHRQ CL, R15
144 ADDQ R15, AX
145
146sequenceDecs_decode_amd64_ll_update_zero:
147 MOVQ AX, (R10) // record.literalLength
148
149 // Fill bitreader for state updates
150 MOVQ R14, (SP) // save the read pointer; R14 is reused as scratch below
151 MOVQ R9, AX
152 SHRQ $0x08, AX
153 MOVBQZX AL, AX // AX = offset extra-bit count, consumed by "Adjust offset"
154 MOVQ ctx+16(FP), CX
155 CMPQ 96(CX), $0x00
156 JZ sequenceDecs_decode_amd64_skip_update // counter is 0: final iteration, states are not advanced
157
158 // Update Literal Length State
159 MOVBQZX DI, R14 // R14 = bit count for the next-state read (low byte)
160 SHRL $0x10, DI // DI = next-state base (bits 16-31)
161 LEAQ (BX)(R14*1), CX
162 MOVQ DX, R15
163 MOVQ CX, BX // bits consumed += R14
164 ROLQ CL, R15 // rotate so the newly read bits end up in the low bits
165 MOVL $0x00000001, BP
166 MOVB R14, CL
167 SHLL CL, BP
168 DECL BP // BP = (1 << R14) - 1
169 ANDQ BP, R15
170 ADDQ R15, DI // DI = index of the new state
171
172 // Load ctx.llTable
173 MOVQ ctx+16(FP), CX
174 MOVQ (CX), CX
175 MOVQ (CX)(DI*8), DI // fetch the 8-byte entry for the new state
176
177 // Update Match Length State
178 MOVBQZX R8, R14 // bit count for the next-state read
179 SHRL $0x10, R8 // next-state base
180 LEAQ (BX)(R14*1), CX
181 MOVQ DX, R15
182 MOVQ CX, BX
183 ROLQ CL, R15
184 MOVL $0x00000001, BP
185 MOVB R14, CL
186 SHLL CL, BP
187 DECL BP // BP = (1 << R14) - 1
188 ANDQ BP, R15
189 ADDQ R15, R8
190
191 // Load ctx.mlTable
192 MOVQ ctx+16(FP), CX
193 MOVQ 24(CX), CX
194 MOVQ (CX)(R8*8), R8
195
196 // Update Offset State
197 MOVBQZX R9, R14 // bit count for the next-state read
198 SHRL $0x10, R9 // next-state base
199 LEAQ (BX)(R14*1), CX
200 MOVQ DX, R15
201 MOVQ CX, BX
202 ROLQ CL, R15
203 MOVL $0x00000001, BP
204 MOVB R14, CL
205 SHLL CL, BP
206 DECL BP // BP = (1 << R14) - 1
207 ANDQ BP, R15
208 ADDQ R15, R9
209
210 // Load ctx.ofTable
211 MOVQ ctx+16(FP), CX
212 MOVQ 48(CX), CX
213 MOVQ (CX)(R9*8), R9
214
215sequenceDecs_decode_amd64_skip_update:
216 // Adjust offset
217 MOVQ 16(R10), CX // CX = offset value decoded above
218 CMPQ AX, $0x01
219 JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 // at most one extra bit: pick from R11-R13 instead
220 MOVQ R12, R13 // otherwise shift R11/R12 down and store CX as the newest
221 MOVQ R11, R12
222 MOVQ CX, R11
223 JMP sequenceDecs_decode_amd64_after_adjust
224
225sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
226 CMPQ (R10), $0x00000000 // literal length == 0?
227 JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
228 INCQ CX // with no literals the selector is bumped by one
229 JMP sequenceDecs_decode_amd64_adjust_offset_nonzero
230
231sequenceDecs_decode_amd64_adjust_offset_maybezero:
232 TESTQ CX, CX
233 JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
234 MOVQ R11, CX // selector 0: reuse R11, leaving R11-R13 untouched
235 JMP sequenceDecs_decode_amd64_after_adjust
236
237sequenceDecs_decode_amd64_adjust_offset_nonzero:
238 // Dispatch on CX: below 1 -> adjust_zero, 1 -> adjust_one, 2 -> adjust_two,
239 // above 2 -> adjust_three.
238 CMPQ CX, $0x01
239 JB sequenceDecs_decode_amd64_adjust_zero
240 JEQ sequenceDecs_decode_amd64_adjust_one
241 CMPQ CX, $0x02
242 JA sequenceDecs_decode_amd64_adjust_three
243 JMP sequenceDecs_decode_amd64_adjust_two
244
245sequenceDecs_decode_amd64_adjust_zero:
246 MOVQ R11, AX
247 JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
248
249sequenceDecs_decode_amd64_adjust_one:
250 MOVQ R12, AX
251 JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
252
253sequenceDecs_decode_amd64_adjust_two:
254 MOVQ R13, AX
255 JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
256
257sequenceDecs_decode_amd64_adjust_three:
258 LEAQ -1(R11), AX // R11 minus one
259
260sequenceDecs_decode_amd64_adjust_test_temp_valid:
261 TESTQ AX, AX
262 JNZ sequenceDecs_decode_amd64_adjust_temp_valid
263 MOVQ $0x00000001, AX // a selected value of 0 is replaced by 1
264
265sequenceDecs_decode_amd64_adjust_temp_valid:
266 CMPQ CX, $0x01
267 CMOVQNE R12, R13 // unless the selector was 1, R12 moves down into R13
268 MOVQ R11, R12
269 MOVQ AX, R11 // the selected value becomes the newest
270 MOVQ AX, CX
271
272sequenceDecs_decode_amd64_after_adjust:
273 MOVQ CX, 16(R10) // record.offset = adjusted offset
274
275 // Check values
276 MOVQ 8(R10), AX // match length
277 MOVQ (R10), R14 // literal length
278 LEAQ (AX)(R14*1), R15
279 MOVQ s+0(FP), BP
280 ADDQ R15, 256(BP) // add ll+ml to the running total at 256(s)
281 MOVQ ctx+16(FP), R15
282 SUBQ R14, 128(R15) // take ll off the count at 128(ctx)
283 JS error_not_enough_literals // that count went negative
284 CMPQ AX, $0x00020002
285 JA sequenceDecs_decode_amd64_error_match_len_too_big // match length above 0x20002
286 TESTQ CX, CX
287 JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
288 TESTQ AX, AX
289 JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch // offset 0 with match length != 0
290
291sequenceDecs_decode_amd64_match_len_ofs_ok:
292 ADDQ $0x18, R10 // advance to the next 24-byte record
293 MOVQ ctx+16(FP), AX
294 DECQ 96(AX)
295 JNS sequenceDecs_decode_amd64_main_loop // continue while the counter is >= 0
296 MOVQ s+0(FP), AX
297 MOVQ R11, 144(AX) // write the three offsets back to s
298 MOVQ R12, 152(AX)
299 MOVQ R13, 160(AX)
300 MOVQ br+8(FP), AX
301 MOVQ DX, 24(AX) // write the bit reader registers back to br
302 MOVB BL, 32(AX)
303 MOVQ SI, 8(AX)
304
305 // Return success
306 MOVQ $0x00000000, ret+24(FP)
307 RET
308
309 // Return with match length error
310sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
311 MOVQ $0x00000001, ret+24(FP)
312 RET
313
314 // Return with match too long error
315sequenceDecs_decode_amd64_error_match_len_too_big:
316 MOVQ $0x00000002, ret+24(FP)
317 RET
318
319 // Return with match offset too long error
 // (no label precedes this block, so no jump in this function targets it)
320 MOVQ $0x00000003, ret+24(FP)
321 RET
322
323 // Return with not enough literals error
324error_not_enough_literals:
325 MOVQ $0x00000004, ret+24(FP)
326 RET
327
328 // Return with overread error
329error_overread:
330 MOVQ $0x00000006, ret+24(FP)
331 RET
332
333// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
334// Requires: CMOV
//
// Variant of the sequence decode loop that refills the bit buffer only once
// per iteration: offset, match length and literal length are all read after
// the single fill at the top of the loop.
// NOTE(review): presumably only selected when the extra bits of one sequence
// always fit in a single refill — confirm with the Go caller.
//
// Each iteration writes one 24-byte record through the pointer loaded from
// 104(ctx): literal length at +0, match length at +8, offset at +16. The
// counter at 96(ctx) is decremented per iteration; the loop ends when it
// becomes negative.
//
// Return codes written to ret+24(FP): 0 success, 1 non-zero match length with
// a zero offset, 2 match length too big, 4 not enough literals, 6 bit reader
// overread. The block returning 3 is unlabeled and follows a RET, so nothing
// in this function reaches it. Only the success path writes state back.
//
// Register use inside the loop:
//   DX          64-bit bit buffer, loaded from 24(br)
//   BX          number of bits already consumed from DX, byte at 32(br)
//   SI          remaining input byte count, 8(br)
//   (SP), R14   read pointer, initialised to 0(br)+8(br); it moves downwards
//   DI, R8, R9  literal-length, match-length and offset state entries
//   R10         pointer to the current output record
//   R11-R13     values from 144/152/160(s), rotated by "Adjust offset"
//   AX, CX, R15, BP  scratch
//
// State entry layout: bits 0-7 next-state bit count, bits 8-15 extra-bit
// count, bits 16-31 next-state base, bits 32-63 value baseline.
// NOTE(review): Go field names behind the numeric offsets are not visible
// here — confirm against the struct definitions.
335TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
336 MOVQ br+8(FP), CX
337 MOVQ 24(CX), DX // bit buffer
338 MOVBQZX 32(CX), BX // bits consumed so far
339 MOVQ (CX), AX
340 MOVQ 8(CX), SI
341 ADDQ SI, AX // AX = base pointer + remaining byte count
342 MOVQ AX, (SP) // read pointer lives in the stack slot between iterations
343 MOVQ ctx+16(FP), AX
344 MOVQ 72(AX), DI // literal-length state entry
345 MOVQ 80(AX), R8 // match-length state entry
346 MOVQ 88(AX), R9 // offset state entry
347 MOVQ 104(AX), R10 // output record pointer
348 MOVQ s+0(FP), AX
349 MOVQ 144(AX), R11
350 MOVQ 152(AX), R12
351 MOVQ 160(AX), R13
352
353sequenceDecs_decode_56_amd64_main_loop:
354 MOVQ (SP), R14
355
356 // Fill bitreader to have enough for the offset and match length.
357 CMPQ SI, $0x08
358 JL sequenceDecs_decode_56_amd64_fill_byte_by_byte // fewer than 8 bytes left: refill bytewise
359 MOVQ BX, AX
360 SHRQ $0x03, AX // whole bytes consumed (bits / 8)
361 SUBQ AX, R14 // move the read pointer down by that many bytes
362 MOVQ (R14), DX // reload all 64 bits of the buffer
363 SUBQ AX, SI
364 ANDQ $0x07, BX // only the sub-byte part of the bit count remains
365 JMP sequenceDecs_decode_56_amd64_fill_end
366
367sequenceDecs_decode_56_amd64_fill_byte_by_byte:
368 CMPQ SI, $0x00
369 JLE sequenceDecs_decode_56_amd64_fill_check_overread // no input bytes left
370 CMPQ BX, $0x07
371 JLE sequenceDecs_decode_56_amd64_fill_end // less than a whole byte consumed: done
372 SHLQ $0x08, DX // make room for one byte at the low end
373 SUBQ $0x01, R14
374 SUBQ $0x01, SI
375 SUBQ $0x08, BX
376 MOVBQZX (R14), AX
377 ORQ AX, DX
378 JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
379
380sequenceDecs_decode_56_amd64_fill_check_overread:
381 CMPQ BX, $0x40
382 JA error_overread // more than 64 bits consumed and nothing left to refill
383
384sequenceDecs_decode_56_amd64_fill_end:
385 // Update offset
386 MOVQ R9, AX
387 MOVQ BX, CX
388 MOVQ DX, R15
389 SHLQ CL, R15 // drop the bits that were already consumed
390 MOVB AH, CL // CL = extra-bit count (bits 8-15 of the entry)
391 SHRQ $0x20, AX // AX = baseline (upper 32 bits of the entry)
392 TESTQ CX, CX
393 JZ sequenceDecs_decode_56_amd64_of_update_zero // CX == 0: keep the baseline as is
394 ADDQ CX, BX // count the extra bits as consumed
395 CMPQ BX, $0x40
396 JA sequenceDecs_decode_56_amd64_of_update_zero // past 64 consumed bits: add nothing
397 CMPQ CX, $0x40
398 JAE sequenceDecs_decode_56_amd64_of_update_zero
399 NEGQ CX
400 SHRQ CL, R15 // shift count is taken mod 64, so this keeps the top n bits
401 ADDQ R15, AX
402
403sequenceDecs_decode_56_amd64_of_update_zero:
404 MOVQ AX, 16(R10) // record.offset (still unadjusted)
405
406 // Update match length
407 MOVQ R8, AX
408 MOVQ BX, CX
409 MOVQ DX, R15
410 SHLQ CL, R15
411 MOVB AH, CL // CL = extra-bit count
412 SHRQ $0x20, AX // AX = baseline
413 TESTQ CX, CX
414 JZ sequenceDecs_decode_56_amd64_ml_update_zero
415 ADDQ CX, BX
416 CMPQ BX, $0x40
417 JA sequenceDecs_decode_56_amd64_ml_update_zero
418 CMPQ CX, $0x40
419 JAE sequenceDecs_decode_56_amd64_ml_update_zero
420 NEGQ CX
421 SHRQ CL, R15
422 ADDQ R15, AX
423
424sequenceDecs_decode_56_amd64_ml_update_zero:
425 MOVQ AX, 8(R10) // record.matchLength
426
427 // Update literal length
 // (no refill between match length and literal length in this variant)
428 MOVQ DI, AX
429 MOVQ BX, CX
430 MOVQ DX, R15
431 SHLQ CL, R15
432 MOVB AH, CL // CL = extra-bit count
433 SHRQ $0x20, AX // AX = baseline
434 TESTQ CX, CX
435 JZ sequenceDecs_decode_56_amd64_ll_update_zero
436 ADDQ CX, BX
437 CMPQ BX, $0x40
438 JA sequenceDecs_decode_56_amd64_ll_update_zero
439 CMPQ CX, $0x40
440 JAE sequenceDecs_decode_56_amd64_ll_update_zero
441 NEGQ CX
442 SHRQ CL, R15
443 ADDQ R15, AX
444
445sequenceDecs_decode_56_amd64_ll_update_zero:
446 MOVQ AX, (R10) // record.literalLength
447
448 // Fill bitreader for state updates
449 MOVQ R14, (SP) // save the read pointer; R14 is reused as scratch below
450 MOVQ R9, AX
451 SHRQ $0x08, AX
452 MOVBQZX AL, AX // AX = offset extra-bit count, consumed by "Adjust offset"
453 MOVQ ctx+16(FP), CX
454 CMPQ 96(CX), $0x00
455 JZ sequenceDecs_decode_56_amd64_skip_update // counter is 0: final iteration, states are not advanced
456
457 // Update Literal Length State
458 MOVBQZX DI, R14 // bit count for the next-state read (low byte)
459 SHRL $0x10, DI // next-state base (bits 16-31)
460 LEAQ (BX)(R14*1), CX
461 MOVQ DX, R15
462 MOVQ CX, BX // bits consumed += R14
463 ROLQ CL, R15 // rotate so the newly read bits end up in the low bits
464 MOVL $0x00000001, BP
465 MOVB R14, CL
466 SHLL CL, BP
467 DECL BP // BP = (1 << R14) - 1
468 ANDQ BP, R15
469 ADDQ R15, DI // index of the new state
470
471 // Load ctx.llTable
472 MOVQ ctx+16(FP), CX
473 MOVQ (CX), CX
474 MOVQ (CX)(DI*8), DI // fetch the 8-byte entry for the new state
475
476 // Update Match Length State
477 MOVBQZX R8, R14
478 SHRL $0x10, R8
479 LEAQ (BX)(R14*1), CX
480 MOVQ DX, R15
481 MOVQ CX, BX
482 ROLQ CL, R15
483 MOVL $0x00000001, BP
484 MOVB R14, CL
485 SHLL CL, BP
486 DECL BP // BP = (1 << R14) - 1
487 ANDQ BP, R15
488 ADDQ R15, R8
489
490 // Load ctx.mlTable
491 MOVQ ctx+16(FP), CX
492 MOVQ 24(CX), CX
493 MOVQ (CX)(R8*8), R8
494
495 // Update Offset State
496 MOVBQZX R9, R14
497 SHRL $0x10, R9
498 LEAQ (BX)(R14*1), CX
499 MOVQ DX, R15
500 MOVQ CX, BX
501 ROLQ CL, R15
502 MOVL $0x00000001, BP
503 MOVB R14, CL
504 SHLL CL, BP
505 DECL BP // BP = (1 << R14) - 1
506 ANDQ BP, R15
507 ADDQ R15, R9
508
509 // Load ctx.ofTable
510 MOVQ ctx+16(FP), CX
511 MOVQ 48(CX), CX
512 MOVQ (CX)(R9*8), R9
513
514sequenceDecs_decode_56_amd64_skip_update:
515 // Adjust offset
516 MOVQ 16(R10), CX // CX = offset value decoded above
517 CMPQ AX, $0x01
518 JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0 // at most one extra bit: pick from R11-R13 instead
519 MOVQ R12, R13 // otherwise shift R11/R12 down and store CX as the newest
520 MOVQ R11, R12
521 MOVQ CX, R11
522 JMP sequenceDecs_decode_56_amd64_after_adjust
523
524sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
525 CMPQ (R10), $0x00000000 // literal length == 0?
526 JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
527 INCQ CX // with no literals the selector is bumped by one
528 JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero
529
530sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
531 TESTQ CX, CX
532 JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
533 MOVQ R11, CX // selector 0: reuse R11, leaving R11-R13 untouched
534 JMP sequenceDecs_decode_56_amd64_after_adjust
535
536sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
 // Dispatch on CX: below 1 -> adjust_zero, 1 -> adjust_one, 2 -> adjust_two,
 // above 2 -> adjust_three.
537 CMPQ CX, $0x01
538 JB sequenceDecs_decode_56_amd64_adjust_zero
539 JEQ sequenceDecs_decode_56_amd64_adjust_one
540 CMPQ CX, $0x02
541 JA sequenceDecs_decode_56_amd64_adjust_three
542 JMP sequenceDecs_decode_56_amd64_adjust_two
543
544sequenceDecs_decode_56_amd64_adjust_zero:
545 MOVQ R11, AX
546 JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
547
548sequenceDecs_decode_56_amd64_adjust_one:
549 MOVQ R12, AX
550 JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
551
552sequenceDecs_decode_56_amd64_adjust_two:
553 MOVQ R13, AX
554 JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
555
556sequenceDecs_decode_56_amd64_adjust_three:
557 LEAQ -1(R11), AX // R11 minus one
558
559sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
560 TESTQ AX, AX
561 JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
562 MOVQ $0x00000001, AX // a selected value of 0 is replaced by 1
563
564sequenceDecs_decode_56_amd64_adjust_temp_valid:
565 CMPQ CX, $0x01
566 CMOVQNE R12, R13 // unless the selector was 1, R12 moves down into R13
567 MOVQ R11, R12
568 MOVQ AX, R11 // the selected value becomes the newest
569 MOVQ AX, CX
570
571sequenceDecs_decode_56_amd64_after_adjust:
572 MOVQ CX, 16(R10) // record.offset = adjusted offset
573
574 // Check values
575 MOVQ 8(R10), AX // match length
576 MOVQ (R10), R14 // literal length
577 LEAQ (AX)(R14*1), R15
578 MOVQ s+0(FP), BP
579 ADDQ R15, 256(BP) // add ll+ml to the running total at 256(s)
580 MOVQ ctx+16(FP), R15
581 SUBQ R14, 128(R15) // take ll off the count at 128(ctx)
582 JS error_not_enough_literals // that count went negative
583 CMPQ AX, $0x00020002
584 JA sequenceDecs_decode_56_amd64_error_match_len_too_big // match length above 0x20002
585 TESTQ CX, CX
586 JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
587 TESTQ AX, AX
588 JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch // offset 0 with match length != 0
589
590sequenceDecs_decode_56_amd64_match_len_ofs_ok:
591 ADDQ $0x18, R10 // advance to the next 24-byte record
592 MOVQ ctx+16(FP), AX
593 DECQ 96(AX)
594 JNS sequenceDecs_decode_56_amd64_main_loop // continue while the counter is >= 0
595 MOVQ s+0(FP), AX
596 MOVQ R11, 144(AX) // write the three offsets back to s
597 MOVQ R12, 152(AX)
598 MOVQ R13, 160(AX)
599 MOVQ br+8(FP), AX
600 MOVQ DX, 24(AX) // write the bit reader registers back to br
601 MOVB BL, 32(AX)
602 MOVQ SI, 8(AX)
603
604 // Return success
605 MOVQ $0x00000000, ret+24(FP)
606 RET
607
608 // Return with match length error
609sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
610 MOVQ $0x00000001, ret+24(FP)
611 RET
612
613 // Return with match too long error
614sequenceDecs_decode_56_amd64_error_match_len_too_big:
615 MOVQ $0x00000002, ret+24(FP)
616 RET
617
618 // Return with match offset too long error
 // (no label precedes this block, so no jump in this function targets it)
619 MOVQ $0x00000003, ret+24(FP)
620 RET
621
622 // Return with not enough literals error
623error_not_enough_literals:
624 MOVQ $0x00000004, ret+24(FP)
625 RET
626
627 // Return with overread error
628error_overread:
629 MOVQ $0x00000006, ret+24(FP)
630 RET
631
632// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
633// Requires: BMI, BMI2, CMOV
//
// Sequence decode loop using BEXTR, BZHI and SHRX for the bit-field work.
// Each iteration writes one 24-byte record through the pointer loaded from
// 104(ctx): literal length at +0, match length at +8, offset at +16. The
// counter at 96(ctx) is decremented per iteration; the loop ends when it
// becomes negative.
//
// Return codes written to ret+24(FP): 0 success, 1 non-zero match length with
// a zero offset, 2 match length too big, 4 not enough literals, 6 bit reader
// overread. The block returning 3 is unlabeled and follows a RET, so nothing
// in this function reaches it. Only the success path writes state back.
//
// Register use inside the loop:
//   AX          64-bit bit buffer, loaded from 24(br)
//   DX          number of bits already consumed from AX, byte at 32(br)
//   BX          remaining input byte count, 8(br)
//   (SP), R13   read pointer, initialised to 0(br)+8(br); it moves downwards.
//               R13 is reused as scratch once the pointer is saved to (SP)
//   SI, DI, R8  literal-length, match-length and offset state entries
//   R9          pointer to the current output record
//   R10-R12     values from 144/152/160(s), rotated by "Adjust offset"
//   CX, R14, R15, BP  scratch
//
// State entry layout: bits 0-7 next-state bit count, bits 8-15 extra-bit
// count, bits 16-31 next-state base, bits 32-63 value baseline.
// NOTE(review): Go field names behind the numeric offsets are not visible
// here — confirm against the struct definitions.
634TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
635 MOVQ br+8(FP), BX
636 MOVQ 24(BX), AX // bit buffer
637 MOVBQZX 32(BX), DX // bits consumed so far
638 MOVQ (BX), CX
639 MOVQ 8(BX), BX
640 ADDQ BX, CX // CX = base pointer + remaining byte count
641 MOVQ CX, (SP) // read pointer lives in the stack slot between iterations
642 MOVQ ctx+16(FP), CX
643 MOVQ 72(CX), SI // literal-length state entry
644 MOVQ 80(CX), DI // match-length state entry
645 MOVQ 88(CX), R8 // offset state entry
646 MOVQ 104(CX), R9 // output record pointer
647 MOVQ s+0(FP), CX
648 MOVQ 144(CX), R10
649 MOVQ 152(CX), R11
650 MOVQ 160(CX), R12
651
652sequenceDecs_decode_bmi2_main_loop:
653 MOVQ (SP), R13
654
655 // Fill bitreader to have enough for the offset and match length.
656 CMPQ BX, $0x08
657 JL sequenceDecs_decode_bmi2_fill_byte_by_byte // fewer than 8 bytes left: refill bytewise
658 MOVQ DX, CX
659 SHRQ $0x03, CX // whole bytes consumed (bits / 8)
660 SUBQ CX, R13 // move the read pointer down by that many bytes
661 MOVQ (R13), AX // reload all 64 bits of the buffer
662 SUBQ CX, BX
663 ANDQ $0x07, DX // only the sub-byte part of the bit count remains
664 JMP sequenceDecs_decode_bmi2_fill_end
665
666sequenceDecs_decode_bmi2_fill_byte_by_byte:
667 CMPQ BX, $0x00
668 JLE sequenceDecs_decode_bmi2_fill_check_overread // no input bytes left
669 CMPQ DX, $0x07
670 JLE sequenceDecs_decode_bmi2_fill_end // less than a whole byte consumed: done
671 SHLQ $0x08, AX // make room for one byte at the low end
672 SUBQ $0x01, R13
673 SUBQ $0x01, BX
674 SUBQ $0x08, DX
675 MOVBQZX (R13), CX
676 ORQ CX, AX
677 JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
678
679sequenceDecs_decode_bmi2_fill_check_overread:
680 CMPQ DX, $0x40
681 JA error_overread // more than 64 bits consumed and nothing left to refill
682
683sequenceDecs_decode_bmi2_fill_end:
684 // Update offset
685 MOVQ $0x00000808, CX // BEXTR control: start bit 8, length 8
686 BEXTRQ CX, R8, R14 // R14 = extra-bit count (bits 8-15 of the entry)
687 MOVQ AX, R15
688 LEAQ (DX)(R14*1), CX
689 ROLQ CL, R15 // rotate so the newly read bits end up in the low bits
690 BZHIQ R14, R15, R15 // keep only the low R14 bits
691 MOVQ CX, DX // bits consumed += R14
692 MOVQ R8, CX
693 SHRQ $0x20, CX // baseline (upper 32 bits of the entry)
694 ADDQ R15, CX
695 MOVQ CX, 16(R9) // record.offset (still unadjusted)
696
697 // Update match length
698 MOVQ $0x00000808, CX
699 BEXTRQ CX, DI, R14 // extra-bit count
700 MOVQ AX, R15
701 LEAQ (DX)(R14*1), CX
702 ROLQ CL, R15
703 BZHIQ R14, R15, R15
704 MOVQ CX, DX
705 MOVQ DI, CX
706 SHRQ $0x20, CX // baseline
707 ADDQ R15, CX
708 MOVQ CX, 8(R9) // record.matchLength
709
710 // Fill bitreader to have enough for the remaining
711 CMPQ BX, $0x08
712 JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
713 MOVQ DX, CX
714 SHRQ $0x03, CX // whole bytes consumed
715 SUBQ CX, R13
716 MOVQ (R13), AX
717 SUBQ CX, BX
718 ANDQ $0x07, DX
719 JMP sequenceDecs_decode_bmi2_fill_2_end
720
721sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
722 CMPQ BX, $0x00
723 JLE sequenceDecs_decode_bmi2_fill_2_check_overread
724 CMPQ DX, $0x07
725 JLE sequenceDecs_decode_bmi2_fill_2_end
726 SHLQ $0x08, AX
727 SUBQ $0x01, R13
728 SUBQ $0x01, BX
729 SUBQ $0x08, DX
730 MOVBQZX (R13), CX
731 ORQ CX, AX
732 JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
733
734sequenceDecs_decode_bmi2_fill_2_check_overread:
735 CMPQ DX, $0x40
736 JA error_overread
737
738sequenceDecs_decode_bmi2_fill_2_end:
739 // Update literal length
740 MOVQ $0x00000808, CX
741 BEXTRQ CX, SI, R14 // extra-bit count
742 MOVQ AX, R15
743 LEAQ (DX)(R14*1), CX
744 ROLQ CL, R15
745 BZHIQ R14, R15, R15
746 MOVQ CX, DX
747 MOVQ SI, CX
748 SHRQ $0x20, CX // baseline
749 ADDQ R15, CX
750 MOVQ CX, (R9) // record.literalLength
751
752 // Fill bitreader for state updates
753 MOVQ R13, (SP) // save the read pointer; R13 is reused below
754 MOVQ $0x00000808, CX
755 BEXTRQ CX, R8, R13 // R13 = offset extra-bit count, consumed by "Adjust offset"
756 MOVQ ctx+16(FP), CX
757 CMPQ 96(CX), $0x00
758 JZ sequenceDecs_decode_bmi2_skip_update // counter is 0: final iteration, states are not advanced
759 LEAQ (SI)(DI*1), R14
760 ADDQ R8, R14
761 MOVBQZX R14, R14 // low byte of the summed entries = total next-state bit count
762 LEAQ (DX)(R14*1), CX
763 MOVQ AX, R15
764 MOVQ CX, DX // bits consumed += total
765 ROLQ CL, R15
766 BZHIQ R14, R15, R15 // R15 = all next-state bits for the three states
767
768 // Update Offset State
769 BZHIQ R8, R15, CX // low bits of R15, count taken from the low byte of R8
770 SHRXQ R8, R15, R15 // drop those bits from R15
771 SHRL $0x10, R8 // next-state base (bits 16-31)
772 ADDQ CX, R8 // index of the new state
773
774 // Load ctx.ofTable
775 MOVQ ctx+16(FP), CX
776 MOVQ 48(CX), CX
777 MOVQ (CX)(R8*8), R8 // fetch the 8-byte entry for the new state
778
779 // Update Match Length State
780 BZHIQ DI, R15, CX
781 SHRXQ DI, R15, R15
782 SHRL $0x10, DI
783 ADDQ CX, DI
784
785 // Load ctx.mlTable
786 MOVQ ctx+16(FP), CX
787 MOVQ 24(CX), CX
788 MOVQ (CX)(DI*8), DI
789
790 // Update Literal Length State
791 BZHIQ SI, R15, CX
792 SHRL $0x10, SI
793 ADDQ CX, SI
794
795 // Load ctx.llTable
796 MOVQ ctx+16(FP), CX
797 MOVQ (CX), CX
798 MOVQ (CX)(SI*8), SI
799
800sequenceDecs_decode_bmi2_skip_update:
801 // Adjust offset
802 MOVQ 16(R9), CX // CX = offset value decoded above
803 CMPQ R13, $0x01
804 JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 // at most one extra bit: pick from R10-R12 instead
805 MOVQ R11, R12 // otherwise shift R10/R11 down and store CX as the newest
806 MOVQ R10, R11
807 MOVQ CX, R10
808 JMP sequenceDecs_decode_bmi2_after_adjust
809
810sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
811 CMPQ (R9), $0x00000000 // literal length == 0?
812 JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
813 INCQ CX // with no literals the selector is bumped by one
814 JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero
815
816sequenceDecs_decode_bmi2_adjust_offset_maybezero:
817 TESTQ CX, CX
818 JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
819 MOVQ R10, CX // selector 0: reuse R10, leaving R10-R12 untouched
820 JMP sequenceDecs_decode_bmi2_after_adjust
821
822sequenceDecs_decode_bmi2_adjust_offset_nonzero:
 // Dispatch on CX: below 1 -> adjust_zero, 1 -> adjust_one, 2 -> adjust_two,
 // above 2 -> adjust_three.
823 CMPQ CX, $0x01
824 JB sequenceDecs_decode_bmi2_adjust_zero
825 JEQ sequenceDecs_decode_bmi2_adjust_one
826 CMPQ CX, $0x02
827 JA sequenceDecs_decode_bmi2_adjust_three
828 JMP sequenceDecs_decode_bmi2_adjust_two
829
830sequenceDecs_decode_bmi2_adjust_zero:
831 MOVQ R10, R13
832 JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
833
834sequenceDecs_decode_bmi2_adjust_one:
835 MOVQ R11, R13
836 JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
837
838sequenceDecs_decode_bmi2_adjust_two:
839 MOVQ R12, R13
840 JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
841
842sequenceDecs_decode_bmi2_adjust_three:
843 LEAQ -1(R10), R13 // R10 minus one
844
845sequenceDecs_decode_bmi2_adjust_test_temp_valid:
846 TESTQ R13, R13
847 JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
848 MOVQ $0x00000001, R13 // a selected value of 0 is replaced by 1
849
850sequenceDecs_decode_bmi2_adjust_temp_valid:
851 CMPQ CX, $0x01
852 CMOVQNE R11, R12 // unless the selector was 1, R11 moves down into R12
853 MOVQ R10, R11
854 MOVQ R13, R10 // the selected value becomes the newest
855 MOVQ R13, CX
856
857sequenceDecs_decode_bmi2_after_adjust:
858 MOVQ CX, 16(R9) // record.offset = adjusted offset
859
860 // Check values
861 MOVQ 8(R9), R13 // match length
862 MOVQ (R9), R14 // literal length
863 LEAQ (R13)(R14*1), R15
864 MOVQ s+0(FP), BP
865 ADDQ R15, 256(BP) // add ll+ml to the running total at 256(s)
866 MOVQ ctx+16(FP), R15
867 SUBQ R14, 128(R15) // take ll off the count at 128(ctx)
868 JS error_not_enough_literals // that count went negative
869 CMPQ R13, $0x00020002
870 JA sequenceDecs_decode_bmi2_error_match_len_too_big // match length above 0x20002
871 TESTQ CX, CX
872 JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
873 TESTQ R13, R13
874 JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch // offset 0 with match length != 0
875
876sequenceDecs_decode_bmi2_match_len_ofs_ok:
877 ADDQ $0x18, R9 // advance to the next 24-byte record
878 MOVQ ctx+16(FP), CX
879 DECQ 96(CX)
880 JNS sequenceDecs_decode_bmi2_main_loop // continue while the counter is >= 0
881 MOVQ s+0(FP), CX
882 MOVQ R10, 144(CX) // write the three offsets back to s
883 MOVQ R11, 152(CX)
884 MOVQ R12, 160(CX)
885 MOVQ br+8(FP), CX
886 MOVQ AX, 24(CX) // write the bit reader registers back to br
887 MOVB DL, 32(CX)
888 MOVQ BX, 8(CX)
889
890 // Return success
891 MOVQ $0x00000000, ret+24(FP)
892 RET
893
894 // Return with match length error
895sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
896 MOVQ $0x00000001, ret+24(FP)
897 RET
898
899 // Return with match too long error
900sequenceDecs_decode_bmi2_error_match_len_too_big:
901 MOVQ $0x00000002, ret+24(FP)
902 RET
903
904 // Return with match offset too long error
 // (no label precedes this block, so no jump in this function targets it)
905 MOVQ $0x00000003, ret+24(FP)
906 RET
907
908 // Return with not enough literals error
909error_not_enough_literals:
910 MOVQ $0x00000004, ret+24(FP)
911 RET
912
913 // Return with overread error
914error_overread:
915 MOVQ $0x00000006, ret+24(FP)
916 RET
917
918// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
919// Requires: BMI, BMI2, CMOV
//
// BMI/BMI2 sequence decode loop that refills the bit buffer only once per
// iteration: offset, match length and literal length are all read after the
// single fill at the top of the loop.
// NOTE(review): presumably only selected when the extra bits of one sequence
// always fit in a single refill — confirm with the Go caller.
//
// Each iteration writes one 24-byte record through the pointer loaded from
// 104(ctx): literal length at +0, match length at +8, offset at +16. The
// counter at 96(ctx) is decremented per iteration; the loop ends when it
// becomes negative.
//
// Return codes written to ret+24(FP): 0 success, 1 non-zero match length with
// a zero offset, 2 match length too big, 4 not enough literals, 6 bit reader
// overread. The block returning 3 is unlabeled and follows a RET, so nothing
// in this function reaches it. Only the success path writes state back.
//
// Register use inside the loop:
//   AX          64-bit bit buffer, loaded from 24(br)
//   DX          number of bits already consumed from AX, byte at 32(br)
//   BX          remaining input byte count, 8(br)
//   (SP), R13   read pointer, initialised to 0(br)+8(br); it moves downwards.
//               R13 is reused as scratch once the pointer is saved to (SP)
//   SI, DI, R8  literal-length, match-length and offset state entries
//   R9          pointer to the current output record
//   R10-R12     values from 144/152/160(s), rotated by "Adjust offset"
//   CX, R14, R15, BP  scratch
//
// State entry layout: bits 0-7 next-state bit count, bits 8-15 extra-bit
// count, bits 16-31 next-state base, bits 32-63 value baseline.
// NOTE(review): Go field names behind the numeric offsets are not visible
// here — confirm against the struct definitions.
920TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
921 MOVQ br+8(FP), BX
922 MOVQ 24(BX), AX // bit buffer
923 MOVBQZX 32(BX), DX // bits consumed so far
924 MOVQ (BX), CX
925 MOVQ 8(BX), BX
926 ADDQ BX, CX // CX = base pointer + remaining byte count
927 MOVQ CX, (SP) // read pointer lives in the stack slot between iterations
928 MOVQ ctx+16(FP), CX
929 MOVQ 72(CX), SI // literal-length state entry
930 MOVQ 80(CX), DI // match-length state entry
931 MOVQ 88(CX), R8 // offset state entry
932 MOVQ 104(CX), R9 // output record pointer
933 MOVQ s+0(FP), CX
934 MOVQ 144(CX), R10
935 MOVQ 152(CX), R11
936 MOVQ 160(CX), R12
937
938sequenceDecs_decode_56_bmi2_main_loop:
939 MOVQ (SP), R13
940
941 // Fill bitreader to have enough for the offset and match length.
942 CMPQ BX, $0x08
943 JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte // fewer than 8 bytes left: refill bytewise
944 MOVQ DX, CX
945 SHRQ $0x03, CX // whole bytes consumed (bits / 8)
946 SUBQ CX, R13 // move the read pointer down by that many bytes
947 MOVQ (R13), AX // reload all 64 bits of the buffer
948 SUBQ CX, BX
949 ANDQ $0x07, DX // only the sub-byte part of the bit count remains
950 JMP sequenceDecs_decode_56_bmi2_fill_end
951
952sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
953 CMPQ BX, $0x00
954 JLE sequenceDecs_decode_56_bmi2_fill_check_overread // no input bytes left
955 CMPQ DX, $0x07
956 JLE sequenceDecs_decode_56_bmi2_fill_end // less than a whole byte consumed: done
957 SHLQ $0x08, AX // make room for one byte at the low end
958 SUBQ $0x01, R13
959 SUBQ $0x01, BX
960 SUBQ $0x08, DX
961 MOVBQZX (R13), CX
962 ORQ CX, AX
963 JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
964
965sequenceDecs_decode_56_bmi2_fill_check_overread:
966 CMPQ DX, $0x40
967 JA error_overread // more than 64 bits consumed and nothing left to refill
968
969sequenceDecs_decode_56_bmi2_fill_end:
970 // Update offset
971 MOVQ $0x00000808, CX // BEXTR control: start bit 8, length 8
972 BEXTRQ CX, R8, R14 // R14 = extra-bit count (bits 8-15 of the entry)
973 MOVQ AX, R15
974 LEAQ (DX)(R14*1), CX
975 ROLQ CL, R15 // rotate so the newly read bits end up in the low bits
976 BZHIQ R14, R15, R15 // keep only the low R14 bits
977 MOVQ CX, DX // bits consumed += R14
978 MOVQ R8, CX
979 SHRQ $0x20, CX // baseline (upper 32 bits of the entry)
980 ADDQ R15, CX
981 MOVQ CX, 16(R9) // record.offset (still unadjusted)
982
983 // Update match length
984 MOVQ $0x00000808, CX
985 BEXTRQ CX, DI, R14 // extra-bit count
986 MOVQ AX, R15
987 LEAQ (DX)(R14*1), CX
988 ROLQ CL, R15
989 BZHIQ R14, R15, R15
990 MOVQ CX, DX
991 MOVQ DI, CX
992 SHRQ $0x20, CX // baseline
993 ADDQ R15, CX
994 MOVQ CX, 8(R9) // record.matchLength
995
996 // Update literal length
 // (no refill between match length and literal length in this variant)
997 MOVQ $0x00000808, CX
998 BEXTRQ CX, SI, R14 // extra-bit count
999 MOVQ AX, R15
1000 LEAQ (DX)(R14*1), CX
1001 ROLQ CL, R15
1002 BZHIQ R14, R15, R15
1003 MOVQ CX, DX
1004 MOVQ SI, CX
1005 SHRQ $0x20, CX // baseline
1006 ADDQ R15, CX
1007 MOVQ CX, (R9) // record.literalLength
1008
1009 // Fill bitreader for state updates
1010 MOVQ R13, (SP) // save the read pointer; R13 is reused below
1011 MOVQ $0x00000808, CX
1012 BEXTRQ CX, R8, R13 // R13 = offset extra-bit count, consumed by "Adjust offset"
1013 MOVQ ctx+16(FP), CX
1014 CMPQ 96(CX), $0x00
1015 JZ sequenceDecs_decode_56_bmi2_skip_update // counter is 0: final iteration, states are not advanced
1016 LEAQ (SI)(DI*1), R14
1017 ADDQ R8, R14
1018 MOVBQZX R14, R14 // low byte of the summed entries = total next-state bit count
1019 LEAQ (DX)(R14*1), CX
1020 MOVQ AX, R15
1021 MOVQ CX, DX // bits consumed += total
1022 ROLQ CL, R15
1023 BZHIQ R14, R15, R15 // R15 = all next-state bits for the three states
1024
1025 // Update Offset State
1026 BZHIQ R8, R15, CX // low bits of R15, count taken from the low byte of R8
1027 SHRXQ R8, R15, R15 // drop those bits from R15
1028 SHRL $0x10, R8 // next-state base (bits 16-31)
1029 ADDQ CX, R8 // index of the new state
1030
1031 // Load ctx.ofTable
1032 MOVQ ctx+16(FP), CX
1033 MOVQ 48(CX), CX
1034 MOVQ (CX)(R8*8), R8 // fetch the 8-byte entry for the new state
1035
1036 // Update Match Length State
1037 BZHIQ DI, R15, CX
1038 SHRXQ DI, R15, R15
1039 SHRL $0x10, DI
1040 ADDQ CX, DI
1041
1042 // Load ctx.mlTable
1043 MOVQ ctx+16(FP), CX
1044 MOVQ 24(CX), CX
1045 MOVQ (CX)(DI*8), DI
1046
1047 // Update Literal Length State
1048 BZHIQ SI, R15, CX
1049 SHRL $0x10, SI
1050 ADDQ CX, SI
1051
1052 // Load ctx.llTable
1053 MOVQ ctx+16(FP), CX
1054 MOVQ (CX), CX
1055 MOVQ (CX)(SI*8), SI
1056
1057sequenceDecs_decode_56_bmi2_skip_update:
1058 // Adjust offset
1059 MOVQ 16(R9), CX // CX = offset value decoded above
1060 CMPQ R13, $0x01
1061 JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 // at most one extra bit: pick from R10-R12 instead
1062 MOVQ R11, R12 // otherwise shift R10/R11 down and store CX as the newest
1063 MOVQ R10, R11
1064 MOVQ CX, R10
1065 JMP sequenceDecs_decode_56_bmi2_after_adjust
1066
1067sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
1068 CMPQ (R9), $0x00000000 // literal length == 0?
1069 JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
1070 INCQ CX // with no literals the selector is bumped by one
1071 JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
1072
1073sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
1074 TESTQ CX, CX
1075 JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
1076 MOVQ R10, CX // selector 0: reuse R10, leaving R10-R12 untouched
1077 JMP sequenceDecs_decode_56_bmi2_after_adjust
1078
1079sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
 // Dispatch on CX: below 1 -> adjust_zero, 1 -> adjust_one, 2 -> adjust_two,
 // above 2 -> adjust_three.
1080 CMPQ CX, $0x01
1081 JB sequenceDecs_decode_56_bmi2_adjust_zero
1082 JEQ sequenceDecs_decode_56_bmi2_adjust_one
1083 CMPQ CX, $0x02
1084 JA sequenceDecs_decode_56_bmi2_adjust_three
1085 JMP sequenceDecs_decode_56_bmi2_adjust_two
1086
1087sequenceDecs_decode_56_bmi2_adjust_zero:
1088 MOVQ R10, R13
1089 JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
1090
1091sequenceDecs_decode_56_bmi2_adjust_one:
1092 MOVQ R11, R13
1093 JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
1094
1095sequenceDecs_decode_56_bmi2_adjust_two:
1096 MOVQ R12, R13
1097 JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
1098
1099sequenceDecs_decode_56_bmi2_adjust_three:
1100 LEAQ -1(R10), R13 // R10 minus one
1101
1102sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
1103 TESTQ R13, R13
1104 JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
1105 MOVQ $0x00000001, R13 // a selected value of 0 is replaced by 1
1106
1107sequenceDecs_decode_56_bmi2_adjust_temp_valid:
1108 CMPQ CX, $0x01
1109 CMOVQNE R11, R12 // unless the selector was 1, R11 moves down into R12
1110 MOVQ R10, R11
1111 MOVQ R13, R10 // the selected value becomes the newest
1112 MOVQ R13, CX
1113
1114sequenceDecs_decode_56_bmi2_after_adjust:
1115 MOVQ CX, 16(R9) // record.offset = adjusted offset
1116
1117 // Check values
1118 MOVQ 8(R9), R13 // match length
1119 MOVQ (R9), R14 // literal length
1120 LEAQ (R13)(R14*1), R15
1121 MOVQ s+0(FP), BP
1122 ADDQ R15, 256(BP) // add ll+ml to the running total at 256(s)
1123 MOVQ ctx+16(FP), R15
1124 SUBQ R14, 128(R15) // take ll off the count at 128(ctx)
1125 JS error_not_enough_literals // that count went negative
1126 CMPQ R13, $0x00020002
1127 JA sequenceDecs_decode_56_bmi2_error_match_len_too_big // match length above 0x20002
1128 TESTQ CX, CX
1129 JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
1130 TESTQ R13, R13
1131 JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch // offset 0 with match length != 0
1132
1133sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
1134 ADDQ $0x18, R9 // advance to the next 24-byte record
1135 MOVQ ctx+16(FP), CX
1136 DECQ 96(CX)
1137 JNS sequenceDecs_decode_56_bmi2_main_loop // continue while the counter is >= 0
1138 MOVQ s+0(FP), CX
1139 MOVQ R10, 144(CX) // write the three offsets back to s
1140 MOVQ R11, 152(CX)
1141 MOVQ R12, 160(CX)
1142 MOVQ br+8(FP), CX
1143 MOVQ AX, 24(CX) // write the bit reader registers back to br
1144 MOVB DL, 32(CX)
1145 MOVQ BX, 8(CX)
1146
1147 // Return success
1148 MOVQ $0x00000000, ret+24(FP)
1149 RET
1150
1151 // Return with match length error
1152sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
1153 MOVQ $0x00000001, ret+24(FP)
1154 RET
1155
1156 // Return with match too long error
1157sequenceDecs_decode_56_bmi2_error_match_len_too_big:
1158 MOVQ $0x00000002, ret+24(FP)
1159 RET
1160
1161 // Return with match offset too long error
 // (no label precedes this block, so no jump in this function targets it)
1162 MOVQ $0x00000003, ret+24(FP)
1163 RET
1164
1165 // Return with not enough literals error
1166error_not_enough_literals:
1167 MOVQ $0x00000004, ret+24(FP)
1168 RET
1169
1170 // Return with overread error
1171error_overread:
1172 MOVQ $0x00000006, ret+24(FP)
1173 RET
1174
1175// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
1176// Requires: SSE
//
// Executes already-decoded sequences: for each 24-byte record (literal length
// at +0, match length at +8, match offset at +16) it copies the literals to
// the output and then copies the match, taking bytes from the history buffer
// and/or from output already written.
//
// Returns true when every sequence was executed, or when the sequence count
// at 8(ctx) is zero (in that case the context is not touched). Returns false
// when a match offset exceeds outPosition+len(hist) or the value at 120(ctx);
// the context is still updated so the caller can see how far execution got.
//
// Several copies work in 16-byte chunks and can therefore read and write up
// to 15 bytes beyond the requested length.
// NOTE(review): callers presumably guarantee that slack in the buffers —
// confirm with the Go wrapper.
//
// Register use inside the loop:
//   AX   pointer to the current sequence record
//   CX   number of sequences, 8(ctx)
//   DX   sequence index, 24(ctx)
//   BX   output write pointer (32(ctx) + outPosition)
//   SI   literals read pointer, starts at 80(ctx)
//   DI   outPosition, 104(ctx)
//   R8   window size limit, 120(ctx)
//   R9   end of the history buffer (56(ctx) + 64(ctx))
//   R10  history length, 64(ctx)
//   R11  literal length, then scratch
//   R12  match offset
//   R13  match length
//   R14, R15, BP, X0  scratch
// NOTE(review): Go field names behind the numeric offsets are not visible
// here — confirm against executeAsmContext.
1177TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
1178 MOVQ ctx+0(FP), R10
1179 MOVQ 8(R10), CX
1180 TESTQ CX, CX
1181 JZ empty_seqs // nothing to execute
1182 MOVQ (R10), AX
1183 MOVQ 24(R10), DX
1184 MOVQ 32(R10), BX
1185 MOVQ 80(R10), SI
1186 MOVQ 104(R10), DI
1187 MOVQ 120(R10), R8
1188 MOVQ 56(R10), R9
1189 MOVQ 64(R10), R10 // R10 stops being ctx here and becomes the history length
1190 ADDQ R10, R9 // R9 = one past the last history byte
1191
1192 // seqsBase += 24 * seqIndex
1193 LEAQ (DX)(DX*2), R11 // 3 * seqIndex
1194 SHLQ $0x03, R11 // * 8
1195 ADDQ R11, AX
1196
1197 // outBase += outPosition
1198 ADDQ DI, BX
1199
1200main_loop:
1201 MOVQ (AX), R11 // literal length
1202 MOVQ 16(AX), R12 // match offset
1203 MOVQ 8(AX), R13 // match length
1204
1205 // Copy literals
1206 TESTQ R11, R11
1207 JZ check_offset
1208 XORQ R14, R14
1209
1210copy_1:
1211 MOVUPS (SI)(R14*1), X0 // 16 bytes per step; the last step may overshoot the length
1212 MOVUPS X0, (BX)(R14*1)
1213 ADDQ $0x10, R14
1214 CMPQ R14, R11
1215 JB copy_1
1216 ADDQ R11, SI // advance by the exact literal length, not the rounded-up one
1217 ADDQ R11, BX
1218 ADDQ R11, DI
1219
1220 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
1221check_offset:
1222 LEAQ (DI)(R10*1), R11 // outPosition + len(hist)
1223 CMPQ R12, R11
1224 JG error_match_off_too_big
1225 CMPQ R12, R8
1226 JG error_match_off_too_big
1227
1228 // Copy match from history
1229 MOVQ R12, R11
1230 SUBQ DI, R11 // R11 = offset - outPosition: how far the match reaches into history
1231 JLS copy_match // offset <= outPosition: match lies entirely in the output
1232 MOVQ R9, R14
1233 SUBQ R11, R14 // R14 = source pointer inside the history buffer
1234 CMPQ R13, R11
1235 JG copy_all_from_history // match is longer than the history part: split copy
1236 MOVQ R13, R11 // whole match comes from history
1237 SUBQ $0x10, R11
1238 JB copy_4_small // fewer than 16 bytes
1239
1240copy_4_loop:
1241 MOVUPS (R14), X0
1242 MOVUPS X0, (BX)
1243 ADDQ $0x10, R14
1244 ADDQ $0x10, BX
1245 SUBQ $0x10, R11
1246 JAE copy_4_loop
1247 LEAQ 16(R14)(R11*1), R14 // R11 is now negative: step to the exact end of the match
1248 LEAQ 16(BX)(R11*1), BX
1249 MOVUPS -16(R14), X0 // copy the final 16 bytes, overlapping what was already copied
1250 MOVUPS X0, -16(BX)
1251 JMP copy_4_end
1252
1253copy_4_small:
 // NOTE(review): there is no 1-2 byte case here, unlike copy_5_small; lengths
 // below 3 would fall into copy_4_move_4through7. Presumably match lengths are
 // always at least 3 — confirm.
1254 CMPQ R13, $0x03
1255 JE copy_4_move_3
1256 CMPQ R13, $0x08
1257 JB copy_4_move_4through7
1258 JMP copy_4_move_8through16
1259
1260copy_4_move_3:
1261 MOVW (R14), R11
1262 MOVB 2(R14), R12
1263 MOVW R11, (BX)
1264 MOVB R12, 2(BX)
1265 ADDQ R13, R14
1266 ADDQ R13, BX
1267 JMP copy_4_end
1268
1269copy_4_move_4through7:
1270 MOVL (R14), R11 // first 4 bytes
1271 MOVL -4(R14)(R13*1), R12 // last 4 bytes; the two loads may overlap
1272 MOVL R11, (BX)
1273 MOVL R12, -4(BX)(R13*1)
1274 ADDQ R13, R14
1275 ADDQ R13, BX
1276 JMP copy_4_end
1277
1278copy_4_move_8through16:
1279 MOVQ (R14), R11 // first 8 bytes
1280 MOVQ -8(R14)(R13*1), R12 // last 8 bytes; the two loads may overlap
1281 MOVQ R11, (BX)
1282 MOVQ R12, -8(BX)(R13*1)
1283 ADDQ R13, R14
1284 ADDQ R13, BX
1285
1286copy_4_end:
1287 ADDQ R13, DI // outPosition += match length
1288 ADDQ $0x18, AX // next sequence record
1289 INCQ DX
1290 CMPQ DX, CX
1291 JB main_loop
1292 JMP loop_finished
1293
1294copy_all_from_history:
 // Copy the R11 bytes available in history; the rest of the match is
 // copied from the output buffer at copy_match.
1295 MOVQ R11, R15
1296 SUBQ $0x10, R15
1297 JB copy_5_small // fewer than 16 bytes
1298
1299copy_5_loop:
1300 MOVUPS (R14), X0
1301 MOVUPS X0, (BX)
1302 ADDQ $0x10, R14
1303 ADDQ $0x10, BX
1304 SUBQ $0x10, R15
1305 JAE copy_5_loop
1306 LEAQ 16(R14)(R15*1), R14 // R15 is now negative: step to the exact end
1307 LEAQ 16(BX)(R15*1), BX
1308 MOVUPS -16(R14), X0 // copy the final 16 bytes, overlapping what was already copied
1309 MOVUPS X0, -16(BX)
1310 JMP copy_5_end
1311
1312copy_5_small:
1313 CMPQ R11, $0x03
1314 JE copy_5_move_3
1315 JB copy_5_move_1or2
1316 CMPQ R11, $0x08
1317 JB copy_5_move_4through7
1318 JMP copy_5_move_8through16
1319
1320copy_5_move_1or2:
1321 MOVB (R14), R15 // first byte
1322 MOVB -1(R14)(R11*1), BP // last byte (the same byte when the length is 1)
1323 MOVB R15, (BX)
1324 MOVB BP, -1(BX)(R11*1)
1325 ADDQ R11, R14
1326 ADDQ R11, BX
1327 JMP copy_5_end
1328
1329copy_5_move_3:
1330 MOVW (R14), R15
1331 MOVB 2(R14), BP
1332 MOVW R15, (BX)
1333 MOVB BP, 2(BX)
1334 ADDQ R11, R14
1335 ADDQ R11, BX
1336 JMP copy_5_end
1337
1338copy_5_move_4through7:
1339 MOVL (R14), R15
1340 MOVL -4(R14)(R11*1), BP
1341 MOVL R15, (BX)
1342 MOVL BP, -4(BX)(R11*1)
1343 ADDQ R11, R14
1344 ADDQ R11, BX
1345 JMP copy_5_end
1346
1347copy_5_move_8through16:
1348 MOVQ (R14), R15
1349 MOVQ -8(R14)(R11*1), BP
1350 MOVQ R15, (BX)
1351 MOVQ BP, -8(BX)(R11*1)
1352 ADDQ R11, R14
1353 ADDQ R11, BX
1354
1355copy_5_end:
1356 ADDQ R11, DI // outPosition += bytes taken from history
1357 SUBQ R11, R13 // remaining match length, copied from the output below
1358
1359 // Copy match from the current buffer
1360copy_match:
1361 MOVQ BX, R11
1362 SUBQ R12, R11 // source = write pointer - match offset
1363
1364 // ml <= mo
1365 CMPQ R13, R12
1366 JA copy_overlapping_match // length > offset: source and destination overlap
1367
1368 // Copy non-overlapping match
1369 ADDQ R13, DI
1370 MOVQ BX, R12 // R12 becomes the running destination
1371 ADDQ R13, BX // BX already points past the match
1372
1373copy_2:
1374 MOVUPS (R11), X0 // 16 bytes per step; the last step may overshoot the length
1375 MOVUPS X0, (R12)
1376 ADDQ $0x10, R11
1377 ADDQ $0x10, R12
1378 SUBQ $0x10, R13
1379 JHI copy_2 // loop while more than 16 bytes were left before this step
1380 JMP handle_loop
1381
1382 // Copy overlapping match
1383copy_overlapping_match:
1384 ADDQ R13, DI
1385
1386copy_slow_3:
1387 MOVB (R11), R12 // one byte at a time so earlier writes feed later reads
1388 MOVB R12, (BX)
1389 INCQ R11
1390 INCQ BX
1391 DECQ R13
1392 JNZ copy_slow_3
1393
1394handle_loop:
1395 ADDQ $0x18, AX // next sequence record
1396 INCQ DX
1397 CMPQ DX, CX
1398 JB main_loop
1399
1400loop_finished:
1401 // Return value
1402 MOVB $0x01, ret+8(FP)
1403
1404 // Update the context
1405 MOVQ ctx+0(FP), AX
1406 MOVQ DX, 24(AX) // sequence index reached
1407 MOVQ DI, 104(AX) // outPosition
1408 SUBQ 80(AX), SI // literal bytes consumed = read pointer - start pointer
1409 MOVQ SI, 112(AX)
1410 RET
1411
1412error_match_off_too_big:
1413 // Return value
1414 MOVB $0x00, ret+8(FP)
1415
1416 // Update the context
1417 MOVQ ctx+0(FP), AX
1418 MOVQ DX, 24(AX) // index of the sequence that failed
1419 MOVQ DI, 104(AX) // outPosition
1420 SUBQ 80(AX), SI // literal bytes consumed = read pointer - start pointer
1421 MOVQ SI, 112(AX)
1422 RET
1423
1424empty_seqs:
1425 // Return value
1426 MOVB $0x01, ret+8(FP)
1427 RET
1428
1429// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
1430// Requires: SSE
//
// Executes a run of already-decoded sequences. Each sequence is a 24-byte
// record read as three quadwords: literal length at +0, match length at +8
// and match offset at +16. For every record the routine copies the literals
// to the output, validates the match offset, and copies the match from the
// history buffer and/or from the output produced so far.
//
// "Safe" variant: block copies move exactly the requested number of bytes
// (16-byte chunks finished by an overlapping 16-byte tail, or size-specific
// moves for lengths below 16), so nothing past the requested length is
// touched.
//
// Returns 1 when all sequences were executed or when there were none, and 0
// when a match offset is rejected. On both the success and the rejected-offset
// exits the sequence index is stored to 24(ctx), the output position to
// 104(ctx), and the literal pointer minus 80(ctx) to 112(ctx).
//
// Register use inside the loop:
//   AX  pointer to the current sequence record
//   BX  current output pointer (outBase + outPosition)
//   CX  loop bound for DX
//   DX  sequence index
//   SI  literal source pointer
//   DI  outPosition
//   R8  limit for the match offset (s.windowSize per the check below)
//   R9  one past the end of the history buffer
//   R10 len(hist)
//   R11 literal length, later scratch
//   R12 match offset, later scratch
//   R13 match length
1431TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
1432 MOVQ ctx+0(FP), R10
1433 MOVQ 8(R10), CX // CX = loop bound for the sequence index
1434 TESTQ CX, CX
1435 JZ empty_seqs // nothing to execute
1436 MOVQ (R10), AX // AX = seqsBase
1437 MOVQ 24(R10), DX // DX = seqIndex
1438 MOVQ 32(R10), BX // BX = outBase
1439 MOVQ 80(R10), SI // SI = source pointer for literal copies
1440 MOVQ 104(R10), DI // DI = outPosition
1441 MOVQ 120(R10), R8 // R8 = upper limit for seq.mo
1442 MOVQ 56(R10), R9 // R9 = base used for history copies
1443 MOVQ 64(R10), R10 // R10 = len(hist); ctx pointer is no longer needed in R10
1444 ADDQ R10, R9 // R9 = one past the end of history
1445
1446 // seqsBase += 24 * seqIndex
1447 LEAQ (DX)(DX*2), R11
1448 SHLQ $0x03, R11
1449 ADDQ R11, AX
1450
1451 // outBase += outPosition
1452 ADDQ DI, BX
1453
1454main_loop:
1455 MOVQ (AX), R11 // R11 = literal length
1456 MOVQ 16(AX), R12 // R12 = match offset (mo)
1457 MOVQ 8(AX), R13 // R13 = match length (ml)
1458
1459 // Copy literals
1460 TESTQ R11, R11
1461 JZ check_offset
1462 MOVQ R11, R14
1463 SUBQ $0x10, R14 // borrow means fewer than 16 literal bytes
1464 JB copy_1_small
1465
1466copy_1_loop:
1467 MOVUPS (SI), X0
1468 MOVUPS X0, (BX)
1469 ADDQ $0x10, SI
1470 ADDQ $0x10, BX
1471 SUBQ $0x10, R14
1472 JAE copy_1_loop // continue while at least 16 more bytes remain
1473 LEAQ 16(SI)(R14*1), SI // R14 is now (remaining - 16): SI = end of the literals
1474 LEAQ 16(BX)(R14*1), BX // BX = end of the copied literals
1475 MOVUPS -16(SI), X0 // copy the last 16 bytes again so the tail ends exactly at the end
1476 MOVUPS X0, -16(BX)
1477 JMP copy_1_end
1478
1479copy_1_small: // fewer than 16 bytes: dispatch on the length
1480 CMPQ R11, $0x03
1481 JE copy_1_move_3
1482 JB copy_1_move_1or2
1483 CMPQ R11, $0x08
1484 JB copy_1_move_4through7
1485 JMP copy_1_move_8through16
1486
1487copy_1_move_1or2: // first and last byte (the same byte when the length is 1)
1488 MOVB (SI), R14
1489 MOVB -1(SI)(R11*1), R15
1490 MOVB R14, (BX)
1491 MOVB R15, -1(BX)(R11*1)
1492 ADDQ R11, SI
1493 ADDQ R11, BX
1494 JMP copy_1_end
1495
1496copy_1_move_3: // one word plus one byte
1497 MOVW (SI), R14
1498 MOVB 2(SI), R15
1499 MOVW R14, (BX)
1500 MOVB R15, 2(BX)
1501 ADDQ R11, SI
1502 ADDQ R11, BX
1503 JMP copy_1_end
1504
1505copy_1_move_4through7: // two possibly overlapping 4-byte moves
1506 MOVL (SI), R14
1507 MOVL -4(SI)(R11*1), R15
1508 MOVL R14, (BX)
1509 MOVL R15, -4(BX)(R11*1)
1510 ADDQ R11, SI
1511 ADDQ R11, BX
1512 JMP copy_1_end
1513
1514copy_1_move_8through16: // two possibly overlapping 8-byte moves
1515 MOVQ (SI), R14
1516 MOVQ -8(SI)(R11*1), R15
1517 MOVQ R14, (BX)
1518 MOVQ R15, -8(BX)(R11*1)
1519 ADDQ R11, SI
1520 ADDQ R11, BX
1521
1522copy_1_end:
1523 ADDQ R11, DI // outPosition += literal length
1524
1525 // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
1526check_offset:
1527 LEAQ (DI)(R10*1), R11 // R11 = outPosition + len(hist)
1528 CMPQ R12, R11
1529 JG error_match_off_too_big
1530 CMPQ R12, R8
1531 JG error_match_off_too_big
1532
1533 // Copy match from history
1534 MOVQ R12, R11
1535 SUBQ DI, R11 // R11 = mo - outPosition: bytes that lie in history
1536 JLS copy_match // mo <= outPosition: the match starts inside the output buffer
1537 MOVQ R9, R14
1538 SUBQ R11, R14 // R14 = source pointer, R11 bytes before the end of history
1539 CMPQ R13, R11
1540 JG copy_all_from_history // ml exceeds what history supplies
1541 MOVQ R13, R11 // whole match comes from history: copy ml bytes
1542 SUBQ $0x10, R11
1543 JB copy_4_small
1544
1545copy_4_loop:
1546 MOVUPS (R14), X0
1547 MOVUPS X0, (BX)
1548 ADDQ $0x10, R14
1549 ADDQ $0x10, BX
1550 SUBQ $0x10, R11
1551 JAE copy_4_loop
1552 LEAQ 16(R14)(R11*1), R14
1553 LEAQ 16(BX)(R11*1), BX
1554 MOVUPS -16(R14), X0 // overlapping 16-byte tail, ends exactly at the end of the match
1555 MOVUPS X0, -16(BX)
1556 JMP copy_4_end
1557
1558 // NOTE(review): unlike copy_1/copy_2/copy_5 there is no 1-or-2-byte case
1559 // here; presumably a match length below 3 never reaches this point -
1560 // confirm against the caller.
1558copy_4_small:
1559 CMPQ R13, $0x03
1560 JE copy_4_move_3
1561 CMPQ R13, $0x08
1562 JB copy_4_move_4through7
1563 JMP copy_4_move_8through16
1564
1565copy_4_move_3:
1566 MOVW (R14), R11
1567 MOVB 2(R14), R12 // R12 (mo) is reloaded or unused after copy_4_end, so it serves as scratch
1568 MOVW R11, (BX)
1569 MOVB R12, 2(BX)
1570 ADDQ R13, R14
1571 ADDQ R13, BX
1572 JMP copy_4_end
1573
1574copy_4_move_4through7:
1575 MOVL (R14), R11
1576 MOVL -4(R14)(R13*1), R12
1577 MOVL R11, (BX)
1578 MOVL R12, -4(BX)(R13*1)
1579 ADDQ R13, R14
1580 ADDQ R13, BX
1581 JMP copy_4_end
1582
1583copy_4_move_8through16:
1584 MOVQ (R14), R11
1585 MOVQ -8(R14)(R13*1), R12
1586 MOVQ R11, (BX)
1587 MOVQ R12, -8(BX)(R13*1)
1588 ADDQ R13, R14
1589 ADDQ R13, BX
1590
1591copy_4_end:
1592 ADDQ R13, DI // outPosition += ml
1593 ADDQ $0x18, AX // advance to the next 24-byte sequence record
1594 INCQ DX
1595 CMPQ DX, CX
1596 JB main_loop
1597 JMP loop_finished
1598
1599copy_all_from_history: // copy the R11 bytes history supplies; the rest comes from the output buffer
1600 MOVQ R11, R15
1601 SUBQ $0x10, R15
1602 JB copy_5_small
1603
1604copy_5_loop:
1605 MOVUPS (R14), X0
1606 MOVUPS X0, (BX)
1607 ADDQ $0x10, R14
1608 ADDQ $0x10, BX
1609 SUBQ $0x10, R15
1610 JAE copy_5_loop
1611 LEAQ 16(R14)(R15*1), R14
1612 LEAQ 16(BX)(R15*1), BX
1613 MOVUPS -16(R14), X0
1614 MOVUPS X0, -16(BX)
1615 JMP copy_5_end
1616
1617copy_5_small:
1618 CMPQ R11, $0x03
1619 JE copy_5_move_3
1620 JB copy_5_move_1or2
1621 CMPQ R11, $0x08
1622 JB copy_5_move_4through7
1623 JMP copy_5_move_8through16
1624
1625copy_5_move_1or2:
1626 MOVB (R14), R15
1627 MOVB -1(R14)(R11*1), BP
1628 MOVB R15, (BX)
1629 MOVB BP, -1(BX)(R11*1)
1630 ADDQ R11, R14
1631 ADDQ R11, BX
1632 JMP copy_5_end
1633
1634copy_5_move_3:
1635 MOVW (R14), R15
1636 MOVB 2(R14), BP
1637 MOVW R15, (BX)
1638 MOVB BP, 2(BX)
1639 ADDQ R11, R14
1640 ADDQ R11, BX
1641 JMP copy_5_end
1642
1643copy_5_move_4through7:
1644 MOVL (R14), R15
1645 MOVL -4(R14)(R11*1), BP
1646 MOVL R15, (BX)
1647 MOVL BP, -4(BX)(R11*1)
1648 ADDQ R11, R14
1649 ADDQ R11, BX
1650 JMP copy_5_end
1651
1652copy_5_move_8through16:
1653 MOVQ (R14), R15
1654 MOVQ -8(R14)(R11*1), BP
1655 MOVQ R15, (BX)
1656 MOVQ BP, -8(BX)(R11*1)
1657 ADDQ R11, R14
1658 ADDQ R11, BX
1659
1660copy_5_end:
1661 ADDQ R11, DI // outPosition += bytes taken from history
1662 SUBQ R11, R13 // ml -= bytes taken from history
1663
1664 // Copy match from the current buffer
1665copy_match:
1666 MOVQ BX, R11
1667 SUBQ R12, R11 // R11 = source = output pointer - mo
1668
1669 // ml <= mo
1670 CMPQ R13, R12
1671 JA copy_overlapping_match // ml > mo: source and destination ranges overlap
1672
1673 // Copy non-overlapping match
1674 ADDQ R13, DI // outPosition += ml
1675 MOVQ R13, R12 // mo is no longer needed; R12 becomes the chunk counter
1676 SUBQ $0x10, R12
1677 JB copy_2_small
1678
1679copy_2_loop:
1680 MOVUPS (R11), X0
1681 MOVUPS X0, (BX)
1682 ADDQ $0x10, R11
1683 ADDQ $0x10, BX
1684 SUBQ $0x10, R12
1685 JAE copy_2_loop
1686 LEAQ 16(R11)(R12*1), R11
1687 LEAQ 16(BX)(R12*1), BX
1688 MOVUPS -16(R11), X0
1689 MOVUPS X0, -16(BX)
1690 JMP copy_2_end
1691
1692copy_2_small:
1693 CMPQ R13, $0x03
1694 JE copy_2_move_3
1695 JB copy_2_move_1or2
1696 CMPQ R13, $0x08
1697 JB copy_2_move_4through7
1698 JMP copy_2_move_8through16
1699
1700copy_2_move_1or2:
1701 MOVB (R11), R12
1702 MOVB -1(R11)(R13*1), R14
1703 MOVB R12, (BX)
1704 MOVB R14, -1(BX)(R13*1)
1705 ADDQ R13, R11
1706 ADDQ R13, BX
1707 JMP copy_2_end
1708
1709copy_2_move_3:
1710 MOVW (R11), R12
1711 MOVB 2(R11), R14
1712 MOVW R12, (BX)
1713 MOVB R14, 2(BX)
1714 ADDQ R13, R11
1715 ADDQ R13, BX
1716 JMP copy_2_end
1717
1718copy_2_move_4through7:
1719 MOVL (R11), R12
1720 MOVL -4(R11)(R13*1), R14
1721 MOVL R12, (BX)
1722 MOVL R14, -4(BX)(R13*1)
1723 ADDQ R13, R11
1724 ADDQ R13, BX
1725 JMP copy_2_end
1726
1727copy_2_move_8through16:
1728 MOVQ (R11), R12
1729 MOVQ -8(R11)(R13*1), R14
1730 MOVQ R12, (BX)
1731 MOVQ R14, -8(BX)(R13*1)
1732 ADDQ R13, R11
1733 ADDQ R13, BX
1734
1735copy_2_end:
1736 JMP handle_loop
1737
1738 // Copy overlapping match
1739copy_overlapping_match:
1740 ADDQ R13, DI // outPosition += ml
1741
1742copy_slow_3: // one byte at a time, in ascending address order
1743 MOVB (R11), R12
1744 MOVB R12, (BX)
1745 INCQ R11
1746 INCQ BX
1747 DECQ R13
1748 JNZ copy_slow_3
1749
1750handle_loop:
1751 ADDQ $0x18, AX // advance to the next 24-byte sequence record
1752 INCQ DX
1753 CMPQ DX, CX
1754 JB main_loop
1755
1756loop_finished:
1757 // Return value
1758 MOVB $0x01, ret+8(FP)
1759
1760 // Update the context
1761 MOVQ ctx+0(FP), AX
1762 MOVQ DX, 24(AX) // sequence index reached
1763 MOVQ DI, 104(AX) // final outPosition
1764 SUBQ 80(AX), SI // SI = literal pointer - its initial value (80(ctx))
1765 MOVQ SI, 112(AX)
1766 RET
1767
1768error_match_off_too_big:
1769 // Return value
1770 MOVB $0x00, ret+8(FP)
1771
1772 // Update the context
1773 MOVQ ctx+0(FP), AX
1774 MOVQ DX, 24(AX) // index of the sequence whose offset was rejected
1775 MOVQ DI, 104(AX)
1776 SUBQ 80(AX), SI
1777 MOVQ SI, 112(AX)
1778 RET
1779
1780empty_seqs:
1781 // Return value
1782 MOVB $0x01, ret+8(FP)
1783 RET
1784
1785// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
1786// Requires: CMOV, SSE
//
// Decodes sequences from the bit reader and executes each one immediately:
// literals are copied to the output, then the match is copied from history
// and/or from the output written so far. The loop runs until the counter at
// 96(ctx) goes negative.
//
// Literal and non-overlapping match copies in this variant move whole 16-byte
// chunks, so they may read and write up to 15 bytes beyond the requested
// length (see copy_1 and copy_2).
//
// Return codes written to ret:
//   0 success
//   1 non-zero match length with a zero offset (ml stored to 216(ctx))
//   2 match length above 0x20002 (ml stored to 216(ctx))
//   3 match offset too big (mo stored to 224(ctx), outPosition to 136(ctx))
//   4 not enough literals (ll stored to 208(ctx))
//   5 not enough output space (ll, ml, outPosition stored to 208/216/136(ctx))
//   6 bit reader overread
// The bit reader fields are written back only on success.
//
// Stack slots:
//   (SP)   current input read pointer
//   8(SP)  match offset (mo)     16(SP) match length (ml)    24(SP) literal length (ll)
//   32(SP) past-end pointer of the output buffer
//   40(SP) 184(ctx), added to outPosition in the offset check (len(hist))
//   48(SP) 176(ctx)+184(ctx), one past the end of history
//   56(SP) 200(ctx), limit for the match offset (s.windowSize per the check)
//
// Long-lived registers:
//   DX bit buffer, BX bits already consumed from DX, SI bytes not yet loaded
//   DI / R8 / R9 literal-length / match-length / offset decoder state
//   R10 output pointer, R11 literal source pointer, R12 outPosition
1787TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
1788 MOVQ br+8(FP), CX
1789 MOVQ 24(CX), DX // DX = bit buffer
1790 MOVBQZX 32(CX), BX // BX = bits already consumed (one byte, zero-extended)
1791 MOVQ (CX), AX
1792 MOVQ 8(CX), SI // SI = offset of the read position from the input base
1793 ADDQ SI, AX
1794 MOVQ AX, (SP) // (SP) = input base + offset = read pointer
1795 MOVQ ctx+16(FP), AX
1796 MOVQ 72(AX), DI // DI = literal length state
1797 MOVQ 80(AX), R8 // R8 = match length state
1798 MOVQ 88(AX), R9 // R9 = offset state
1799 XORQ CX, CX
1800 MOVQ CX, 8(SP) // mo = 0
1801 MOVQ CX, 16(SP) // ml = 0
1802 MOVQ CX, 24(SP) // ll = 0
1803 MOVQ 112(AX), R10 // R10 = output base
1804 MOVQ 128(AX), CX
1805 MOVQ CX, 32(SP) // output capacity; turned into a pointer below
1806 MOVQ 144(AX), R11 // R11 = literal source pointer
1807 MOVQ 136(AX), R12 // R12 = outPosition
1808 MOVQ 200(AX), CX
1809 MOVQ CX, 56(SP)
1810 MOVQ 176(AX), CX
1811 MOVQ CX, 48(SP)
1812 MOVQ 184(AX), AX
1813 MOVQ AX, 40(SP)
1814 MOVQ 40(SP), AX
1815 ADDQ AX, 48(SP) // 48(SP) = 176(ctx) + 184(ctx): one past the end of history
1816
1817 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
1818 ADDQ R10, 32(SP)
1819
1820 // outBase += outPosition
1821 ADDQ R12, R10
1822
1823sequenceDecs_decodeSync_amd64_main_loop:
1824 MOVQ (SP), R13 // R13 = input read pointer for this iteration
1825
1826 // Fill bitreader to have enough for the offset and match length.
1827 CMPQ SI, $0x08
1828 JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
1829 MOVQ BX, AX
1830 SHRQ $0x03, AX // AX = whole bytes already consumed
1831 SUBQ AX, R13 // the input is read backwards: step the pointer down
1832 MOVQ (R13), DX // reload 8 bytes
1833 SUBQ AX, SI
1834 ANDQ $0x07, BX // keep only the sub-byte bit position
1835 JMP sequenceDecs_decodeSync_amd64_fill_end
1836
1837sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
1838 CMPQ SI, $0x00
1839 JLE sequenceDecs_decodeSync_amd64_fill_check_overread // no input bytes left
1840 CMPQ BX, $0x07
1841 JLE sequenceDecs_decodeSync_amd64_fill_end // fewer than 8 bits consumed: nothing to refill
1842 SHLQ $0x08, DX
1843 SUBQ $0x01, R13
1844 SUBQ $0x01, SI
1845 SUBQ $0x08, BX
1846 MOVBQZX (R13), AX
1847 ORQ AX, DX // shift one more input byte into the low end of the buffer
1848 JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
1849
1850sequenceDecs_decodeSync_amd64_fill_check_overread:
1851 CMPQ BX, $0x40
1852 JA error_overread // more than 64 bits consumed with no input left
1853
1854sequenceDecs_decodeSync_amd64_fill_end:
1855 // Update offset
1856 MOVQ R9, AX
1857 MOVQ BX, CX
1858 MOVQ DX, R14
1859 SHLQ CL, R14 // discard the bits that were already consumed
1860 MOVB AH, CL // CL = byte 1 of the state: number of extra bits to read
1861 SHRQ $0x20, AX // AX = upper 32 bits of the state: base value
1862 TESTQ CX, CX
1863 JZ sequenceDecs_decodeSync_amd64_of_update_zero // no extra bits: value is the base
1864 ADDQ CX, BX // account for the bits being read
1865 CMPQ BX, $0x40
1866 JA sequenceDecs_decodeSync_amd64_of_update_zero // would read past 64 bits: use the base only
1867 CMPQ CX, $0x40
1868 JAE sequenceDecs_decodeSync_amd64_of_update_zero
1869 NEGQ CX
1870 SHRQ CL, R14 // shift right by (64 - n) mod 64: keep the top n bits
1871 ADDQ R14, AX
1872
1873sequenceDecs_decodeSync_amd64_of_update_zero:
1874 MOVQ AX, 8(SP) // mo
1875
1876 // Update match length
1877 MOVQ R8, AX
1878 MOVQ BX, CX
1879 MOVQ DX, R14
1880 SHLQ CL, R14
1881 MOVB AH, CL
1882 SHRQ $0x20, AX
1883 TESTQ CX, CX
1884 JZ sequenceDecs_decodeSync_amd64_ml_update_zero
1885 ADDQ CX, BX
1886 CMPQ BX, $0x40
1887 JA sequenceDecs_decodeSync_amd64_ml_update_zero
1888 CMPQ CX, $0x40
1889 JAE sequenceDecs_decodeSync_amd64_ml_update_zero
1890 NEGQ CX
1891 SHRQ CL, R14
1892 ADDQ R14, AX
1893
1894sequenceDecs_decodeSync_amd64_ml_update_zero:
1895 MOVQ AX, 16(SP) // ml
1896
1897 // Fill bitreader to have enough for the remaining
1898 CMPQ SI, $0x08
1899 JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
1900 MOVQ BX, AX
1901 SHRQ $0x03, AX
1902 SUBQ AX, R13
1903 MOVQ (R13), DX
1904 SUBQ AX, SI
1905 ANDQ $0x07, BX
1906 JMP sequenceDecs_decodeSync_amd64_fill_2_end
1907
1908sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
1909 CMPQ SI, $0x00
1910 JLE sequenceDecs_decodeSync_amd64_fill_2_check_overread
1911 CMPQ BX, $0x07
1912 JLE sequenceDecs_decodeSync_amd64_fill_2_end
1913 SHLQ $0x08, DX
1914 SUBQ $0x01, R13
1915 SUBQ $0x01, SI
1916 SUBQ $0x08, BX
1917 MOVBQZX (R13), AX
1918 ORQ AX, DX
1919 JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
1920
1921sequenceDecs_decodeSync_amd64_fill_2_check_overread:
1922 CMPQ BX, $0x40
1923 JA error_overread
1924
1925sequenceDecs_decodeSync_amd64_fill_2_end:
1926 // Update literal length
1927 MOVQ DI, AX
1928 MOVQ BX, CX
1929 MOVQ DX, R14
1930 SHLQ CL, R14
1931 MOVB AH, CL
1932 SHRQ $0x20, AX
1933 TESTQ CX, CX
1934 JZ sequenceDecs_decodeSync_amd64_ll_update_zero
1935 ADDQ CX, BX
1936 CMPQ BX, $0x40
1937 JA sequenceDecs_decodeSync_amd64_ll_update_zero
1938 CMPQ CX, $0x40
1939 JAE sequenceDecs_decodeSync_amd64_ll_update_zero
1940 NEGQ CX
1941 SHRQ CL, R14
1942 ADDQ R14, AX
1943
1944sequenceDecs_decodeSync_amd64_ll_update_zero:
1945 MOVQ AX, 24(SP) // ll
1946
1947 // Fill bitreader for state updates
1948 MOVQ R13, (SP) // save the read pointer; R13 is reused below
1949 MOVQ R9, AX
1950 SHRQ $0x08, AX
1951 MOVBQZX AL, AX // AX = byte 1 of the offset state (extra-bit count), needed after R9 is replaced
1952 MOVQ ctx+16(FP), CX
1953 CMPQ 96(CX), $0x00
1954 JZ sequenceDecs_decodeSync_amd64_skip_update // counter is zero: last iteration, no next state
1955
1956 // Update Literal Length State
1957 MOVBQZX DI, R13 // R13 = byte 0 of the state: number of bits to read
1958 SHRL $0x10, DI // DI = bits 16..31 of the state (32-bit shift clears the upper half)
1959 LEAQ (BX)(R13*1), CX
1960 MOVQ DX, R14
1961 MOVQ CX, BX // bits consumed += R13
1962 ROLQ CL, R14 // rotate the wanted bits into the low end
1963 MOVL $0x00000001, R15
1964 MOVB R13, CL
1965 SHLL CL, R15
1966 DECL R15 // R15 = (1 << R13) - 1
1967 ANDQ R15, R14
1968 ADDQ R14, DI // DI = index of the next state
1969
1970 // Load ctx.llTable
1971 MOVQ ctx+16(FP), CX
1972 MOVQ (CX), CX
1973 MOVQ (CX)(DI*8), DI
1974
1975 // Update Match Length State
1976 MOVBQZX R8, R13
1977 SHRL $0x10, R8
1978 LEAQ (BX)(R13*1), CX
1979 MOVQ DX, R14
1980 MOVQ CX, BX
1981 ROLQ CL, R14
1982 MOVL $0x00000001, R15
1983 MOVB R13, CL
1984 SHLL CL, R15
1985 DECL R15
1986 ANDQ R15, R14
1987 ADDQ R14, R8
1988
1989 // Load ctx.mlTable
1990 MOVQ ctx+16(FP), CX
1991 MOVQ 24(CX), CX
1992 MOVQ (CX)(R8*8), R8
1993
1994 // Update Offset State
1995 MOVBQZX R9, R13
1996 SHRL $0x10, R9
1997 LEAQ (BX)(R13*1), CX
1998 MOVQ DX, R14
1999 MOVQ CX, BX
2000 ROLQ CL, R14
2001 MOVL $0x00000001, R15
2002 MOVB R13, CL
2003 SHLL CL, R15
2004 DECL R15
2005 ANDQ R15, R14
2006 ADDQ R14, R9
2007
2008 // Load ctx.ofTable
2009 MOVQ ctx+16(FP), CX
2010 MOVQ 48(CX), CX
2011 MOVQ (CX)(R9*8), R9
2012
2013sequenceDecs_decodeSync_amd64_skip_update:
2014 // Adjust offset
2015 MOVQ s+0(FP), CX
2016 MOVQ 8(SP), R13 // R13 = decoded offset value
2017 CMPQ AX, $0x01
2018 JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
2019 MOVUPS 144(CX), X0 // more than one extra bit: push the value onto the three slots at 144/152/160(s)
2020 MOVQ R13, 144(CX)
2021 MOVUPS X0, 152(CX) // old slots 0 and 1 become slots 1 and 2
2022 JMP sequenceDecs_decodeSync_amd64_after_adjust
2023
2024sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
2025 CMPQ 24(SP), $0x00000000
2026 JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
2027 INCQ R13 // ll == 0: the value is bumped by one
2028 JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
2029
2030sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
2031 TESTQ R13, R13
2032 JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
2033 MOVQ 144(CX), R13 // value 0: reuse slot 0 unchanged
2034 JMP sequenceDecs_decodeSync_amd64_after_adjust
2035
2036sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
2037 MOVQ R13, AX
2038 XORQ R14, R14
2039 MOVQ $-1, R15
2040 CMPQ R13, $0x03
2041 CMOVQEQ R14, AX // value 3: use slot 0 ...
2042 CMOVQEQ R15, R14 // ... minus one
2043 ADDQ 144(CX)(AX*8), R14 // R14 = selected slot (+ -1 when the value was 3)
2044 JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
2045 MOVQ $0x00000001, R14 // a zero result is replaced by 1
2046
2047sequenceDecs_decodeSync_amd64_adjust_temp_valid:
2048 CMPQ R13, $0x01
2049 JZ sequenceDecs_decodeSync_amd64_adjust_skip // value 1: slot 2 is left untouched
2050 MOVQ 152(CX), AX
2051 MOVQ AX, 160(CX)
2052
2053sequenceDecs_decodeSync_amd64_adjust_skip:
2054 MOVQ 144(CX), AX
2055 MOVQ AX, 152(CX)
2056 MOVQ R14, 144(CX)
2057 MOVQ R14, R13
2058
2059sequenceDecs_decodeSync_amd64_after_adjust:
2060 MOVQ R13, 8(SP) // mo = adjusted offset
2061
2062 // Check values
2063 MOVQ 16(SP), AX
2064 MOVQ 24(SP), CX
2065 LEAQ (AX)(CX*1), R14
2066 MOVQ s+0(FP), R15
2067 ADDQ R14, 256(R15) // 256(s) += ml + ll
2068 MOVQ ctx+16(FP), R14
2069 SUBQ CX, 104(R14) // 104(ctx) -= ll; a negative result means too few literals remain
2070 JS error_not_enough_literals
2071 CMPQ AX, $0x00020002
2072 JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
2073 TESTQ R13, R13
2074 JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
2075 TESTQ AX, AX
2076 JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch // mo == 0 but ml != 0
2077
2078sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
2079 MOVQ 24(SP), AX // AX = ll
2080 MOVQ 8(SP), CX // CX = mo
2081 MOVQ 16(SP), R13 // R13 = ml
2082
2083 // Check if we have enough space in s.out
2084 LEAQ (AX)(R13*1), R14
2085 ADDQ R10, R14 // R14 = output pointer after this sequence
2086 CMPQ R14, 32(SP)
2087 JA error_not_enough_space
2088
2089 // Copy literals
2090 TESTQ AX, AX
2091 JZ check_offset
2092 XORQ R14, R14
2093
2094copy_1: // 16 bytes per iteration; the last chunk may extend past ll
2095 MOVUPS (R11)(R14*1), X0
2096 MOVUPS X0, (R10)(R14*1)
2097 ADDQ $0x10, R14
2098 CMPQ R14, AX
2099 JB copy_1
2100 ADDQ AX, R11 // pointers and outPosition advance by exactly ll
2101 ADDQ AX, R10
2102 ADDQ AX, R12
2103
2104 // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
2105check_offset:
2106 MOVQ R12, AX
2107 ADDQ 40(SP), AX // AX = outPosition + len(hist)
2108 CMPQ CX, AX
2109 JG error_match_off_too_big
2110 CMPQ CX, 56(SP)
2111 JG error_match_off_too_big
2112
2113 // Copy match from history
2114 MOVQ CX, AX
2115 SUBQ R12, AX // AX = mo - outPosition: bytes that lie in history
2116 JLS copy_match // mo <= outPosition: the match starts inside the output buffer
2117 MOVQ 48(SP), R14
2118 SUBQ AX, R14 // R14 = source pointer, AX bytes before the end of history
2119 CMPQ R13, AX
2120 JG copy_all_from_history // ml exceeds what history supplies
2121 MOVQ R13, AX // whole match comes from history: copy exactly ml bytes
2122 SUBQ $0x10, AX
2123 JB copy_4_small
2124
2125copy_4_loop:
2126 MOVUPS (R14), X0
2127 MOVUPS X0, (R10)
2128 ADDQ $0x10, R14
2129 ADDQ $0x10, R10
2130 SUBQ $0x10, AX
2131 JAE copy_4_loop
2132 LEAQ 16(R14)(AX*1), R14 // AX is now (remaining - 16): pointers land on the exact end
2133 LEAQ 16(R10)(AX*1), R10
2134 MOVUPS -16(R14), X0 // overlapping 16-byte tail
2135 MOVUPS X0, -16(R10)
2136 JMP copy_4_end
2137
2138copy_4_small: // NOTE(review): no 1-or-2-byte case; presumably ml >= 3 here - confirm
2139 CMPQ R13, $0x03
2140 JE copy_4_move_3
2141 CMPQ R13, $0x08
2142 JB copy_4_move_4through7
2143 JMP copy_4_move_8through16
2144
2145copy_4_move_3:
2146 MOVW (R14), AX
2147 MOVB 2(R14), CL
2148 MOVW AX, (R10)
2149 MOVB CL, 2(R10)
2150 ADDQ R13, R14
2151 ADDQ R13, R10
2152 JMP copy_4_end
2153
2154copy_4_move_4through7:
2155 MOVL (R14), AX
2156 MOVL -4(R14)(R13*1), CX
2157 MOVL AX, (R10)
2158 MOVL CX, -4(R10)(R13*1)
2159 ADDQ R13, R14
2160 ADDQ R13, R10
2161 JMP copy_4_end
2162
2163copy_4_move_8through16:
2164 MOVQ (R14), AX
2165 MOVQ -8(R14)(R13*1), CX
2166 MOVQ AX, (R10)
2167 MOVQ CX, -8(R10)(R13*1)
2168 ADDQ R13, R14
2169 ADDQ R13, R10
2170
2171copy_4_end:
2172 ADDQ R13, R12 // outPosition += ml
2173 JMP handle_loop
2174 JMP loop_finished // unreachable: directly follows an unconditional JMP
2175
2176copy_all_from_history: // copy the AX bytes history supplies; the rest comes from the output buffer
2177 MOVQ AX, R15
2178 SUBQ $0x10, R15
2179 JB copy_5_small
2180
2181copy_5_loop:
2182 MOVUPS (R14), X0
2183 MOVUPS X0, (R10)
2184 ADDQ $0x10, R14
2185 ADDQ $0x10, R10
2186 SUBQ $0x10, R15
2187 JAE copy_5_loop
2188 LEAQ 16(R14)(R15*1), R14
2189 LEAQ 16(R10)(R15*1), R10
2190 MOVUPS -16(R14), X0
2191 MOVUPS X0, -16(R10)
2192 JMP copy_5_end
2193
2194copy_5_small:
2195 CMPQ AX, $0x03
2196 JE copy_5_move_3
2197 JB copy_5_move_1or2
2198 CMPQ AX, $0x08
2199 JB copy_5_move_4through7
2200 JMP copy_5_move_8through16
2201
2202copy_5_move_1or2:
2203 MOVB (R14), R15
2204 MOVB -1(R14)(AX*1), BP
2205 MOVB R15, (R10)
2206 MOVB BP, -1(R10)(AX*1)
2207 ADDQ AX, R14
2208 ADDQ AX, R10
2209 JMP copy_5_end
2210
2211copy_5_move_3:
2212 MOVW (R14), R15
2213 MOVB 2(R14), BP
2214 MOVW R15, (R10)
2215 MOVB BP, 2(R10)
2216 ADDQ AX, R14
2217 ADDQ AX, R10
2218 JMP copy_5_end
2219
2220copy_5_move_4through7:
2221 MOVL (R14), R15
2222 MOVL -4(R14)(AX*1), BP
2223 MOVL R15, (R10)
2224 MOVL BP, -4(R10)(AX*1)
2225 ADDQ AX, R14
2226 ADDQ AX, R10
2227 JMP copy_5_end
2228
2229copy_5_move_8through16:
2230 MOVQ (R14), R15
2231 MOVQ -8(R14)(AX*1), BP
2232 MOVQ R15, (R10)
2233 MOVQ BP, -8(R10)(AX*1)
2234 ADDQ AX, R14
2235 ADDQ AX, R10
2236
2237copy_5_end:
2238 ADDQ AX, R12 // outPosition += bytes taken from history
2239 SUBQ AX, R13 // ml -= bytes taken from history
2240
2241 // Copy match from the current buffer
2242copy_match:
2243 MOVQ R10, AX
2244 SUBQ CX, AX // AX = source = output pointer - mo
2245
2246 // ml <= mo
2247 CMPQ R13, CX
2248 JA copy_overlapping_match // ml > mo: source and destination ranges overlap
2249
2250 // Copy non-overlapping match
2251 ADDQ R13, R12 // outPosition += ml
2252 MOVQ R10, CX // CX = destination cursor; R10 jumps straight to the end
2253 ADDQ R13, R10
2254
2255copy_2: // 16 bytes per iteration; the last chunk may extend past ml
2256 MOVUPS (AX), X0
2257 MOVUPS X0, (CX)
2258 ADDQ $0x10, AX
2259 ADDQ $0x10, CX
2260 SUBQ $0x10, R13
2261 JHI copy_2 // continue while bytes remain (no borrow and non-zero)
2262 JMP handle_loop
2263
2264 // Copy overlapping match
2265copy_overlapping_match:
2266 ADDQ R13, R12 // outPosition += ml
2267
2268copy_slow_3: // one byte at a time, in ascending address order
2269 MOVB (AX), CL
2270 MOVB CL, (R10)
2271 INCQ AX
2272 INCQ R10
2273 DECQ R13
2274 JNZ copy_slow_3
2275
2276handle_loop:
2277 MOVQ ctx+16(FP), AX
2278 DECQ 96(AX) // count down; continue until the counter goes negative
2279 JNS sequenceDecs_decodeSync_amd64_main_loop
2280
2281loop_finished:
2282 MOVQ br+8(FP), AX // write the bit reader state back
2283 MOVQ DX, 24(AX)
2284 MOVB BL, 32(AX)
2285 MOVQ SI, 8(AX)
2286
2287 // Update the context
2288 MOVQ ctx+16(FP), AX
2289 MOVQ R12, 136(AX) // final outPosition
2290 MOVQ 144(AX), CX
2291 SUBQ CX, R11 // R11 = literal pointer - its initial value (144(ctx))
2292 MOVQ R11, 168(AX)
2293
2294 // Return success
2295 MOVQ $0x00000000, ret+24(FP)
2296 RET
2297
2298 // Return with match length error
2299sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
2300 MOVQ 16(SP), AX
2301 MOVQ ctx+16(FP), CX
2302 MOVQ AX, 216(CX)
2303 MOVQ $0x00000001, ret+24(FP)
2304 RET
2305
2306 // Return with match too long error
2307sequenceDecs_decodeSync_amd64_error_match_len_too_big:
2308 MOVQ ctx+16(FP), AX
2309 MOVQ 16(SP), CX
2310 MOVQ CX, 216(AX)
2311 MOVQ $0x00000002, ret+24(FP)
2312 RET
2313
2314 // Return with match offset too long error
2315error_match_off_too_big:
2316 MOVQ ctx+16(FP), AX
2317 MOVQ 8(SP), CX
2318 MOVQ CX, 224(AX)
2319 MOVQ R12, 136(AX)
2320 MOVQ $0x00000003, ret+24(FP)
2321 RET
2322
2323 // Return with not enough literals error
2324error_not_enough_literals:
2325 MOVQ ctx+16(FP), AX
2326 MOVQ 24(SP), CX
2327 MOVQ CX, 208(AX)
2328 MOVQ $0x00000004, ret+24(FP)
2329 RET
2330
2331 // Return with overread error
2332error_overread:
2333 MOVQ $0x00000006, ret+24(FP)
2334 RET
2335
2336 // Return with not enough output space error
2337error_not_enough_space:
2338 MOVQ ctx+16(FP), AX
2339 MOVQ 24(SP), CX
2340 MOVQ CX, 208(AX)
2341 MOVQ 16(SP), CX
2342 MOVQ CX, 216(AX)
2343 MOVQ R12, 136(AX)
2344 MOVQ $0x00000005, ret+24(FP)
2345 RET
2346
2347// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
2348// Requires: BMI, BMI2, CMOV, SSE
//
// Decodes sequences from the bit reader and executes each one immediately,
// using BEXTR/BZHI/SHRX for the bit-field extraction. For every sequence the
// literals are copied to the output, then the match is copied from history
// and/or from the output written so far. The loop runs until the counter at
// 96(ctx) goes negative.
//
// Literal and non-overlapping match copies move whole 16-byte chunks, so they
// may read and write up to 15 bytes beyond the requested length (see copy_1
// and copy_2).
//
// Return codes written to ret:
//   0 success
//   1 non-zero match length with a zero offset (ml stored to 216(ctx))
//   2 match length above 0x20002 (ml stored to 216(ctx))
//   3 match offset too big (mo stored to 224(ctx), outPosition to 136(ctx))
//   4 not enough literals (ll stored to 208(ctx))
//   5 not enough output space (ll, ml, outPosition stored to 208/216/136(ctx))
//   6 bit reader overread
// The bit reader fields are written back only on success.
//
// Stack slots:
//   (SP)   current input read pointer
//   8(SP)  match offset (mo)     16(SP) match length (ml)    24(SP) literal length (ll)
//   32(SP) past-end pointer of the output buffer
//   40(SP) 184(ctx), added to outPosition in the offset check (len(hist))
//   48(SP) 176(ctx)+184(ctx), one past the end of history
//   56(SP) 200(ctx), limit for the match offset (s.windowSize per the check)
//
// Long-lived registers:
//   AX bit buffer, DX bits already consumed from AX, BX bytes not yet loaded
//   SI / DI / R8 literal-length / match-length / offset decoder state
//   R9 output pointer, R10 literal source pointer, R11 outPosition
2349TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
2350 MOVQ br+8(FP), BX
2351 MOVQ 24(BX), AX // AX = bit buffer
2352 MOVBQZX 32(BX), DX // DX = bits already consumed (one byte, zero-extended)
2353 MOVQ (BX), CX
2354 MOVQ 8(BX), BX // BX = offset of the read position from the input base
2355 ADDQ BX, CX
2356 MOVQ CX, (SP) // (SP) = input base + offset = read pointer
2357 MOVQ ctx+16(FP), CX
2358 MOVQ 72(CX), SI // SI = literal length state
2359 MOVQ 80(CX), DI // DI = match length state
2360 MOVQ 88(CX), R8 // R8 = offset state
2361 XORQ R9, R9
2362 MOVQ R9, 8(SP) // mo = 0
2363 MOVQ R9, 16(SP) // ml = 0
2364 MOVQ R9, 24(SP) // ll = 0
2365 MOVQ 112(CX), R9 // R9 = output base
2366 MOVQ 128(CX), R10
2367 MOVQ R10, 32(SP) // output capacity; turned into a pointer below
2368 MOVQ 144(CX), R10 // R10 = literal source pointer
2369 MOVQ 136(CX), R11 // R11 = outPosition
2370 MOVQ 200(CX), R12
2371 MOVQ R12, 56(SP)
2372 MOVQ 176(CX), R12
2373 MOVQ R12, 48(SP)
2374 MOVQ 184(CX), CX
2375 MOVQ CX, 40(SP)
2376 MOVQ 40(SP), CX
2377 ADDQ CX, 48(SP) // 48(SP) = 176(ctx) + 184(ctx): one past the end of history
2378
2379 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
2380 ADDQ R9, 32(SP)
2381
2382 // outBase += outPosition
2383 ADDQ R11, R9
2384
2385sequenceDecs_decodeSync_bmi2_main_loop:
2386 MOVQ (SP), R12 // R12 = input read pointer for this iteration
2387
2388 // Fill bitreader to have enough for the offset and match length.
2389 CMPQ BX, $0x08
2390 JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
2391 MOVQ DX, CX
2392 SHRQ $0x03, CX // CX = whole bytes already consumed
2393 SUBQ CX, R12 // the input is read backwards: step the pointer down
2394 MOVQ (R12), AX // reload 8 bytes
2395 SUBQ CX, BX
2396 ANDQ $0x07, DX // keep only the sub-byte bit position
2397 JMP sequenceDecs_decodeSync_bmi2_fill_end
2398
2399sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
2400 CMPQ BX, $0x00
2401 JLE sequenceDecs_decodeSync_bmi2_fill_check_overread // no input bytes left
2402 CMPQ DX, $0x07
2403 JLE sequenceDecs_decodeSync_bmi2_fill_end // fewer than 8 bits consumed: nothing to refill
2404 SHLQ $0x08, AX
2405 SUBQ $0x01, R12
2406 SUBQ $0x01, BX
2407 SUBQ $0x08, DX
2408 MOVBQZX (R12), CX
2409 ORQ CX, AX // shift one more input byte into the low end of the buffer
2410 JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
2411
2412sequenceDecs_decodeSync_bmi2_fill_check_overread:
2413 CMPQ DX, $0x40
2414 JA error_overread // more than 64 bits consumed with no input left
2415
2416sequenceDecs_decodeSync_bmi2_fill_end:
2417 // Update offset
2418 MOVQ $0x00000808, CX // BEXTR control: start bit 8, length 8
2419 BEXTRQ CX, R8, R13 // R13 = byte 1 of the state: number of extra bits
2420 MOVQ AX, R14
2421 LEAQ (DX)(R13*1), CX
2422 ROLQ CL, R14 // rotate the wanted bits into the low end
2423 BZHIQ R13, R14, R14 // keep the low R13 bits
2424 MOVQ CX, DX // bits consumed += R13
2425 MOVQ R8, CX
2426 SHRQ $0x20, CX // upper 32 bits of the state: base value
2427 ADDQ R14, CX
2428 MOVQ CX, 8(SP) // mo
2429
2430 // Update match length
2431 MOVQ $0x00000808, CX
2432 BEXTRQ CX, DI, R13
2433 MOVQ AX, R14
2434 LEAQ (DX)(R13*1), CX
2435 ROLQ CL, R14
2436 BZHIQ R13, R14, R14
2437 MOVQ CX, DX
2438 MOVQ DI, CX
2439 SHRQ $0x20, CX
2440 ADDQ R14, CX
2441 MOVQ CX, 16(SP) // ml
2442
2443 // Fill bitreader to have enough for the remaining
2444 CMPQ BX, $0x08
2445 JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
2446 MOVQ DX, CX
2447 SHRQ $0x03, CX
2448 SUBQ CX, R12
2449 MOVQ (R12), AX
2450 SUBQ CX, BX
2451 ANDQ $0x07, DX
2452 JMP sequenceDecs_decodeSync_bmi2_fill_2_end
2453
2454sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
2455 CMPQ BX, $0x00
2456 JLE sequenceDecs_decodeSync_bmi2_fill_2_check_overread
2457 CMPQ DX, $0x07
2458 JLE sequenceDecs_decodeSync_bmi2_fill_2_end
2459 SHLQ $0x08, AX
2460 SUBQ $0x01, R12
2461 SUBQ $0x01, BX
2462 SUBQ $0x08, DX
2463 MOVBQZX (R12), CX
2464 ORQ CX, AX
2465 JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
2466
2467sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
2468 CMPQ DX, $0x40
2469 JA error_overread
2470
2471sequenceDecs_decodeSync_bmi2_fill_2_end:
2472 // Update literal length
2473 MOVQ $0x00000808, CX
2474 BEXTRQ CX, SI, R13
2475 MOVQ AX, R14
2476 LEAQ (DX)(R13*1), CX
2477 ROLQ CL, R14
2478 BZHIQ R13, R14, R14
2479 MOVQ CX, DX
2480 MOVQ SI, CX
2481 SHRQ $0x20, CX
2482 ADDQ R14, CX
2483 MOVQ CX, 24(SP) // ll
2484
2485 // Fill bitreader for state updates
2486 MOVQ R12, (SP) // save the read pointer; R12 is reused below
2487 MOVQ $0x00000808, CX
2488 BEXTRQ CX, R8, R12 // R12 = byte 1 of the offset state, needed after R8 is replaced
2489 MOVQ ctx+16(FP), CX
2490 CMPQ 96(CX), $0x00
2491 JZ sequenceDecs_decodeSync_bmi2_skip_update // counter is zero: last iteration, no next state
2492 LEAQ (SI)(DI*1), R13
2493 ADDQ R8, R13
2494 MOVBQZX R13, R13 // R13 = low byte of SI+DI+R8: the three state bit counts summed (mod 256)
2495 LEAQ (DX)(R13*1), CX
2496 MOVQ AX, R14
2497 MOVQ CX, DX // bits consumed += R13
2498 ROLQ CL, R14
2499 BZHIQ R13, R14, R14 // R14 = the bits for all three state updates
2500
2501 // Update Offset State
2502 BZHIQ R8, R14, CX // CX = low bits of R14, count taken from the low byte of R8
2503 SHRXQ R8, R14, R14 // drop the bits just used
2504 SHRL $0x10, R8 // R8 = bits 16..31 of the state (32-bit shift clears the upper half)
2505 ADDQ CX, R8 // R8 = index of the next state
2506
2507 // Load ctx.ofTable
2508 MOVQ ctx+16(FP), CX
2509 MOVQ 48(CX), CX
2510 MOVQ (CX)(R8*8), R8
2511
2512 // Update Match Length State
2513 BZHIQ DI, R14, CX
2514 SHRXQ DI, R14, R14
2515 SHRL $0x10, DI
2516 ADDQ CX, DI
2517
2518 // Load ctx.mlTable
2519 MOVQ ctx+16(FP), CX
2520 MOVQ 24(CX), CX
2521 MOVQ (CX)(DI*8), DI
2522
2523 // Update Literal Length State
2524 BZHIQ SI, R14, CX
2525 SHRL $0x10, SI
2526 ADDQ CX, SI
2527
2528 // Load ctx.llTable
2529 MOVQ ctx+16(FP), CX
2530 MOVQ (CX), CX
2531 MOVQ (CX)(SI*8), SI
2532
2533sequenceDecs_decodeSync_bmi2_skip_update:
2534 // Adjust offset
2535 MOVQ s+0(FP), CX
2536 MOVQ 8(SP), R13 // R13 = decoded offset value
2537 CMPQ R12, $0x01
2538 JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
2539 MOVUPS 144(CX), X0 // more than one extra bit: push the value onto the three slots at 144/152/160(s)
2540 MOVQ R13, 144(CX)
2541 MOVUPS X0, 152(CX) // old slots 0 and 1 become slots 1 and 2
2542 JMP sequenceDecs_decodeSync_bmi2_after_adjust
2543
2544sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
2545 CMPQ 24(SP), $0x00000000
2546 JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
2547 INCQ R13 // ll == 0: the value is bumped by one
2548 JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
2549
2550sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
2551 TESTQ R13, R13
2552 JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
2553 MOVQ 144(CX), R13 // value 0: reuse slot 0 unchanged
2554 JMP sequenceDecs_decodeSync_bmi2_after_adjust
2555
2556sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
2557 MOVQ R13, R12
2558 XORQ R14, R14
2559 MOVQ $-1, R15
2560 CMPQ R13, $0x03
2561 CMOVQEQ R14, R12 // value 3: use slot 0 ...
2562 CMOVQEQ R15, R14 // ... minus one
2563 ADDQ 144(CX)(R12*8), R14 // R14 = selected slot (+ -1 when the value was 3)
2564 JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
2565 MOVQ $0x00000001, R14 // a zero result is replaced by 1
2566
2567sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
2568 CMPQ R13, $0x01
2569 JZ sequenceDecs_decodeSync_bmi2_adjust_skip // value 1: slot 2 is left untouched
2570 MOVQ 152(CX), R12
2571 MOVQ R12, 160(CX)
2572
2573sequenceDecs_decodeSync_bmi2_adjust_skip:
2574 MOVQ 144(CX), R12
2575 MOVQ R12, 152(CX)
2576 MOVQ R14, 144(CX)
2577 MOVQ R14, R13
2578
2579sequenceDecs_decodeSync_bmi2_after_adjust:
2580 MOVQ R13, 8(SP) // mo = adjusted offset
2581
2582 // Check values
2583 MOVQ 16(SP), CX
2584 MOVQ 24(SP), R12
2585 LEAQ (CX)(R12*1), R14
2586 MOVQ s+0(FP), R15
2587 ADDQ R14, 256(R15) // 256(s) += ml + ll
2588 MOVQ ctx+16(FP), R14
2589 SUBQ R12, 104(R14) // 104(ctx) -= ll; a negative result means too few literals remain
2590 JS error_not_enough_literals
2591 CMPQ CX, $0x00020002
2592 JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
2593 TESTQ R13, R13
2594 JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
2595 TESTQ CX, CX
2596 JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch // mo == 0 but ml != 0
2597
2598sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
2599 MOVQ 24(SP), CX // CX = ll
2600 MOVQ 8(SP), R12 // R12 = mo
2601 MOVQ 16(SP), R13 // R13 = ml
2602
2603 // Check if we have enough space in s.out
2604 LEAQ (CX)(R13*1), R14
2605 ADDQ R9, R14 // R14 = output pointer after this sequence
2606 CMPQ R14, 32(SP)
2607 JA error_not_enough_space
2608
2609 // Copy literals
2610 TESTQ CX, CX
2611 JZ check_offset
2612 XORQ R14, R14
2613
2614copy_1: // 16 bytes per iteration; the last chunk may extend past ll
2615 MOVUPS (R10)(R14*1), X0
2616 MOVUPS X0, (R9)(R14*1)
2617 ADDQ $0x10, R14
2618 CMPQ R14, CX
2619 JB copy_1
2620 ADDQ CX, R10 // pointers and outPosition advance by exactly ll
2621 ADDQ CX, R9
2622 ADDQ CX, R11
2623
2624 // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
2625check_offset:
2626 MOVQ R11, CX
2627 ADDQ 40(SP), CX // CX = outPosition + len(hist)
2628 CMPQ R12, CX
2629 JG error_match_off_too_big
2630 CMPQ R12, 56(SP)
2631 JG error_match_off_too_big
2632
2633 // Copy match from history
2634 MOVQ R12, CX
2635 SUBQ R11, CX // CX = mo - outPosition: bytes that lie in history
2636 JLS copy_match // mo <= outPosition: the match starts inside the output buffer
2637 MOVQ 48(SP), R14
2638 SUBQ CX, R14 // R14 = source pointer, CX bytes before the end of history
2639 CMPQ R13, CX
2640 JG copy_all_from_history // ml exceeds what history supplies
2641 MOVQ R13, CX // whole match comes from history: copy exactly ml bytes
2642 SUBQ $0x10, CX
2643 JB copy_4_small
2644
2645copy_4_loop:
2646 MOVUPS (R14), X0
2647 MOVUPS X0, (R9)
2648 ADDQ $0x10, R14
2649 ADDQ $0x10, R9
2650 SUBQ $0x10, CX
2651 JAE copy_4_loop
2652 LEAQ 16(R14)(CX*1), R14 // CX is now (remaining - 16): pointers land on the exact end
2653 LEAQ 16(R9)(CX*1), R9
2654 MOVUPS -16(R14), X0 // overlapping 16-byte tail
2655 MOVUPS X0, -16(R9)
2656 JMP copy_4_end
2657
2658copy_4_small: // NOTE(review): no 1-or-2-byte case; presumably ml >= 3 here - confirm
2659 CMPQ R13, $0x03
2660 JE copy_4_move_3
2661 CMPQ R13, $0x08
2662 JB copy_4_move_4through7
2663 JMP copy_4_move_8through16
2664
2665copy_4_move_3:
2666 MOVW (R14), CX
2667 MOVB 2(R14), R12
2668 MOVW CX, (R9)
2669 MOVB R12, 2(R9)
2670 ADDQ R13, R14
2671 ADDQ R13, R9
2672 JMP copy_4_end
2673
2674copy_4_move_4through7:
2675 MOVL (R14), CX
2676 MOVL -4(R14)(R13*1), R12
2677 MOVL CX, (R9)
2678 MOVL R12, -4(R9)(R13*1)
2679 ADDQ R13, R14
2680 ADDQ R13, R9
2681 JMP copy_4_end
2682
2683copy_4_move_8through16:
2684 MOVQ (R14), CX
2685 MOVQ -8(R14)(R13*1), R12
2686 MOVQ CX, (R9)
2687 MOVQ R12, -8(R9)(R13*1)
2688 ADDQ R13, R14
2689 ADDQ R13, R9
2690
2691copy_4_end:
2692 ADDQ R13, R11 // outPosition += ml
2693 JMP handle_loop
2694 JMP loop_finished // unreachable: directly follows an unconditional JMP
2695
2696copy_all_from_history: // copy the CX bytes history supplies; the rest comes from the output buffer
2697 MOVQ CX, R15
2698 SUBQ $0x10, R15
2699 JB copy_5_small
2700
2701copy_5_loop:
2702 MOVUPS (R14), X0
2703 MOVUPS X0, (R9)
2704 ADDQ $0x10, R14
2705 ADDQ $0x10, R9
2706 SUBQ $0x10, R15
2707 JAE copy_5_loop
2708 LEAQ 16(R14)(R15*1), R14
2709 LEAQ 16(R9)(R15*1), R9
2710 MOVUPS -16(R14), X0
2711 MOVUPS X0, -16(R9)
2712 JMP copy_5_end
2713
2714copy_5_small:
2715 CMPQ CX, $0x03
2716 JE copy_5_move_3
2717 JB copy_5_move_1or2
2718 CMPQ CX, $0x08
2719 JB copy_5_move_4through7
2720 JMP copy_5_move_8through16
2721
2722copy_5_move_1or2:
2723 MOVB (R14), R15
2724 MOVB -1(R14)(CX*1), BP
2725 MOVB R15, (R9)
2726 MOVB BP, -1(R9)(CX*1)
2727 ADDQ CX, R14
2728 ADDQ CX, R9
2729 JMP copy_5_end
2730
2731copy_5_move_3:
2732 MOVW (R14), R15
2733 MOVB 2(R14), BP
2734 MOVW R15, (R9)
2735 MOVB BP, 2(R9)
2736 ADDQ CX, R14
2737 ADDQ CX, R9
2738 JMP copy_5_end
2739
2740copy_5_move_4through7:
2741 MOVL (R14), R15
2742 MOVL -4(R14)(CX*1), BP
2743 MOVL R15, (R9)
2744 MOVL BP, -4(R9)(CX*1)
2745 ADDQ CX, R14
2746 ADDQ CX, R9
2747 JMP copy_5_end
2748
2749copy_5_move_8through16:
2750 MOVQ (R14), R15
2751 MOVQ -8(R14)(CX*1), BP
2752 MOVQ R15, (R9)
2753 MOVQ BP, -8(R9)(CX*1)
2754 ADDQ CX, R14
2755 ADDQ CX, R9
2756
2757copy_5_end:
2758 ADDQ CX, R11 // outPosition += bytes taken from history
2759 SUBQ CX, R13 // ml -= bytes taken from history
2760
2761 // Copy match from the current buffer
2762copy_match:
2763 MOVQ R9, CX
2764 SUBQ R12, CX // CX = source = output pointer - mo
2765
2766 // ml <= mo
2767 CMPQ R13, R12
2768 JA copy_overlapping_match // ml > mo: source and destination ranges overlap
2769
2770 // Copy non-overlapping match
2771 ADDQ R13, R11 // outPosition += ml
2772 MOVQ R9, R12 // R12 = destination cursor; R9 jumps straight to the end
2773 ADDQ R13, R9
2774
2775copy_2: // 16 bytes per iteration; the last chunk may extend past ml
2776 MOVUPS (CX), X0
2777 MOVUPS X0, (R12)
2778 ADDQ $0x10, CX
2779 ADDQ $0x10, R12
2780 SUBQ $0x10, R13
2781 JHI copy_2 // continue while bytes remain (no borrow and non-zero)
2782 JMP handle_loop
2783
2784 // Copy overlapping match
2785copy_overlapping_match:
2786 ADDQ R13, R11 // outPosition += ml
2787
2788copy_slow_3: // one byte at a time, in ascending address order
2789 MOVB (CX), R12
2790 MOVB R12, (R9)
2791 INCQ CX
2792 INCQ R9
2793 DECQ R13
2794 JNZ copy_slow_3
2795
2796handle_loop:
2797 MOVQ ctx+16(FP), CX
2798 DECQ 96(CX) // count down; continue until the counter goes negative
2799 JNS sequenceDecs_decodeSync_bmi2_main_loop
2800
2801loop_finished:
2802 MOVQ br+8(FP), CX // write the bit reader state back
2803 MOVQ AX, 24(CX)
2804 MOVB DL, 32(CX)
2805 MOVQ BX, 8(CX)
2806
2807 // Update the context
2808 MOVQ ctx+16(FP), AX
2809 MOVQ R11, 136(AX) // final outPosition
2810 MOVQ 144(AX), CX
2811 SUBQ CX, R10 // R10 = literal pointer - its initial value (144(ctx))
2812 MOVQ R10, 168(AX)
2813
2814 // Return success
2815 MOVQ $0x00000000, ret+24(FP)
2816 RET
2817
2818 // Return with match length error
2819sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
2820 MOVQ 16(SP), AX
2821 MOVQ ctx+16(FP), CX
2822 MOVQ AX, 216(CX)
2823 MOVQ $0x00000001, ret+24(FP)
2824 RET
2825
2826 // Return with match too long error
2827sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
2828 MOVQ ctx+16(FP), AX
2829 MOVQ 16(SP), CX
2830 MOVQ CX, 216(AX)
2831 MOVQ $0x00000002, ret+24(FP)
2832 RET
2833
2834 // Return with match offset too long error
2835error_match_off_too_big:
2836 MOVQ ctx+16(FP), AX
2837 MOVQ 8(SP), CX
2838 MOVQ CX, 224(AX)
2839 MOVQ R11, 136(AX)
2840 MOVQ $0x00000003, ret+24(FP)
2841 RET
2842
2843 // Return with not enough literals error
2844error_not_enough_literals:
2845 MOVQ ctx+16(FP), AX
2846 MOVQ 24(SP), CX
2847 MOVQ CX, 208(AX)
2848 MOVQ $0x00000004, ret+24(FP)
2849 RET
2850
2851 // Return with overread error
2852error_overread:
2853 MOVQ $0x00000006, ret+24(FP)
2854 RET
2855
2856 // Return with not enough output space error
2857error_not_enough_space:
2858 MOVQ ctx+16(FP), AX
2859 MOVQ 24(SP), CX
2860 MOVQ CX, 208(AX)
2861 MOVQ 16(SP), CX
2862 MOVQ CX, 216(AX)
2863 MOVQ R11, 136(AX)
2864 MOVQ $0x00000005, ret+24(FP)
2865 RET
2866
2867// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
2868// Requires: CMOV, SSE
//
// Decodes sequences from the bit stream and executes each one right away:
// the literals are copied to the output, then the match is copied from the
// history buffer and/or from the output produced so far. The loop is driven
// by the counter at 96(ctx); the result code is stored in ret+24(FP):
//   0 success, 1 match length given with a zero offset, 2 match length too
//   big, 3 match offset too big, 4 not enough literals, 5 not enough output
//   space, 6 bit stream overread.
//
// Registers inside the loop:
//   DX  bit container            BX  number of bits already consumed
//   SI  input bytes still unread DI/R8/R9  literal-length/match-length/offset state
//   R10 output write pointer     R11 literals read pointer
//   R12 output position
// Stack slots:
//   (SP)   input read pointer    8(SP)  match offset
//   16(SP) match length          24(SP) literal length
//   32(SP) end of output buffer  40(SP) history length
//   48(SP) end of history        56(SP) window size
// NOTE(review): struct field names are not visible here; offsets into s, br
// and ctx are described by how this code uses them - confirm against the Go
// struct definitions.
2869TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
2870 MOVQ br+8(FP), CX
2871 MOVQ 24(CX), DX // DX = bit container (written back to 24(br) on exit)
2872 MOVBQZX 32(CX), BX // BX = bits consumed (written back to 32(br) on exit)
2873 MOVQ (CX), AX
2874 MOVQ 8(CX), SI // SI = unread input bytes (written back to 8(br) on exit)
2875 ADDQ SI, AX
2876 MOVQ AX, (SP) // (SP) = input base + unread bytes; input is consumed backwards from here
2877 MOVQ ctx+16(FP), AX
2878 MOVQ 72(AX), DI // literal length state
2879 MOVQ 80(AX), R8 // match length state
2880 MOVQ 88(AX), R9 // offset state
2881 XORQ CX, CX
2882 MOVQ CX, 8(SP) // clear the mo/ml/ll scratch slots
2883 MOVQ CX, 16(SP)
2884 MOVQ CX, 24(SP)
2885 MOVQ 112(AX), R10 // output base pointer
2886 MOVQ 128(AX), CX
2887 MOVQ CX, 32(SP) // output capacity, turned into a pointer below
2888 MOVQ 144(AX), R11 // literals read pointer
2889 MOVQ 136(AX), R12 // output position
2890 MOVQ 200(AX), CX
2891 MOVQ CX, 56(SP) // window size (compared against the match offset)
2892 MOVQ 176(AX), CX
2893 MOVQ CX, 48(SP) // history base pointer
2894 MOVQ 184(AX), AX
2895 MOVQ AX, 40(SP) // history length
2896 MOVQ 40(SP), AX
2897 ADDQ AX, 48(SP) // 48(SP) = history base + history length = end of history
2898
2899 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
2900 ADDQ R10, 32(SP)
2901
2902 // outBase += outPosition
2903 ADDQ R12, R10
2904
2905sequenceDecs_decodeSync_safe_amd64_main_loop:
2906 MOVQ (SP), R13 // R13 = input read pointer for this iteration
2907
2908 // Fill bitreader to have enough for the offset and match length.
2909 CMPQ SI, $0x08
2910 JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
2911 MOVQ BX, AX // fast path: at least 8 bytes left, reload a whole quadword
2912 SHRQ $0x03, AX // whole bytes already consumed
2913 SUBQ AX, R13
2914 MOVQ (R13), DX
2915 SUBQ AX, SI
2916 ANDQ $0x07, BX // keep only the sub-byte part of the consumed-bit count
2917 JMP sequenceDecs_decodeSync_safe_amd64_fill_end
2918
2919sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
2920 CMPQ SI, $0x00
2921 JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread
2922 CMPQ BX, $0x07
2923 JLE sequenceDecs_decodeSync_safe_amd64_fill_end // fewer than 8 bits consumed: nothing to refill
2924 SHLQ $0x08, DX // make room for one more input byte
2925 SUBQ $0x01, R13
2926 SUBQ $0x01, SI
2927 SUBQ $0x08, BX
2928 MOVBQZX (R13), AX
2929 ORQ AX, DX
2930 JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
2931
2932sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
2933 CMPQ BX, $0x40 // input exhausted: more than 64 consumed bits means we read past the data
2934 JA error_overread
2935
2936sequenceDecs_decodeSync_safe_amd64_fill_end:
2937 // Update offset
2938 MOVQ R9, AX
2939 MOVQ BX, CX
2940 MOVQ DX, R14
2941 SHLQ CL, R14 // drop the bits that were already consumed
2942 MOVB AH, CL // CL = byte 1 of the state = number of extra bits to read
2943 SHRQ $0x20, AX // AX = upper 32 bits of the state = baseline value
2944 TESTQ CX, CX
2945 JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
2946 ADDQ CX, BX
2947 CMPQ BX, $0x40
2948 JA sequenceDecs_decodeSync_safe_amd64_of_update_zero // would exceed 64 bits: use the baseline only
2949 CMPQ CX, $0x40
2950 JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
2951 NEGQ CX
2952 SHRQ CL, R14 // shift count is taken mod 64, so this keeps the top n bits
2953 ADDQ R14, AX
2954
2955sequenceDecs_decodeSync_safe_amd64_of_update_zero:
2956 MOVQ AX, 8(SP) // raw match offset
2957
2958 // Update match length
2959 MOVQ R8, AX
2960 MOVQ BX, CX
2961 MOVQ DX, R14
2962 SHLQ CL, R14
2963 MOVB AH, CL
2964 SHRQ $0x20, AX
2965 TESTQ CX, CX
2966 JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
2967 ADDQ CX, BX
2968 CMPQ BX, $0x40
2969 JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
2970 CMPQ CX, $0x40
2971 JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
2972 NEGQ CX
2973 SHRQ CL, R14
2974 ADDQ R14, AX
2975
2976sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
2977 MOVQ AX, 16(SP) // match length
2978
2979 // Fill bitreader to have enough for the remaining
2980 CMPQ SI, $0x08
2981 JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
2982 MOVQ BX, AX
2983 SHRQ $0x03, AX
2984 SUBQ AX, R13
2985 MOVQ (R13), DX
2986 SUBQ AX, SI
2987 ANDQ $0x07, BX
2988 JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
2989
2990sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
2991 CMPQ SI, $0x00
2992 JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
2993 CMPQ BX, $0x07
2994 JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
2995 SHLQ $0x08, DX
2996 SUBQ $0x01, R13
2997 SUBQ $0x01, SI
2998 SUBQ $0x08, BX
2999 MOVBQZX (R13), AX
3000 ORQ AX, DX
3001 JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
3002
3003sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
3004 CMPQ BX, $0x40
3005 JA error_overread
3006
3007sequenceDecs_decodeSync_safe_amd64_fill_2_end:
3008 // Update literal length
3009 MOVQ DI, AX
3010 MOVQ BX, CX
3011 MOVQ DX, R14
3012 SHLQ CL, R14
3013 MOVB AH, CL
3014 SHRQ $0x20, AX
3015 TESTQ CX, CX
3016 JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
3017 ADDQ CX, BX
3018 CMPQ BX, $0x40
3019 JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
3020 CMPQ CX, $0x40
3021 JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
3022 NEGQ CX
3023 SHRQ CL, R14
3024 ADDQ R14, AX
3025
3026sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
3027 MOVQ AX, 24(SP) // literal length
3028
3029 // Fill bitreader for state updates
3030 MOVQ R13, (SP) // persist the input read pointer; R13 is reused below
3031 MOVQ R9, AX
3032 SHRQ $0x08, AX
3033 MOVBQZX AL, AX // AX = extra-bit count of the offset code, used by "Adjust offset"
3034 MOVQ ctx+16(FP), CX
3035 CMPQ 96(CX), $0x00
3036 JZ sequenceDecs_decodeSync_safe_amd64_skip_update // counter is zero: last sequence, states are not advanced
3037
3038 // Update Literal Length State
3039 MOVBQZX DI, R13 // R13 = low byte of the state = bits to read for the next state
3040 SHRL $0x10, DI // DI = bits 16..31 of the state = base index of the next state
3041 LEAQ (BX)(R13*1), CX
3042 MOVQ DX, R14
3043 MOVQ CX, BX // consumed bits += R13
3044 ROLQ CL, R14 // rotate the freshly consumed bits into the low end
3045 MOVL $0x00000001, R15
3046 MOVB R13, CL
3047 SHLL CL, R15
3048 DECL R15 // R15 = (1 << R13) - 1
3049 ANDQ R15, R14
3050 ADDQ R14, DI // table index = base + bits read
3051
3052 // Load ctx.llTable
3053 MOVQ ctx+16(FP), CX
3054 MOVQ (CX), CX
3055 MOVQ (CX)(DI*8), DI
3056
3057 // Update Match Length State
3058 MOVBQZX R8, R13
3059 SHRL $0x10, R8
3060 LEAQ (BX)(R13*1), CX
3061 MOVQ DX, R14
3062 MOVQ CX, BX
3063 ROLQ CL, R14
3064 MOVL $0x00000001, R15
3065 MOVB R13, CL
3066 SHLL CL, R15
3067 DECL R15
3068 ANDQ R15, R14
3069 ADDQ R14, R8
3070
3071 // Load ctx.mlTable
3072 MOVQ ctx+16(FP), CX
3073 MOVQ 24(CX), CX
3074 MOVQ (CX)(R8*8), R8
3075
3076 // Update Offset State
3077 MOVBQZX R9, R13
3078 SHRL $0x10, R9
3079 LEAQ (BX)(R13*1), CX
3080 MOVQ DX, R14
3081 MOVQ CX, BX
3082 ROLQ CL, R14
3083 MOVL $0x00000001, R15
3084 MOVB R13, CL
3085 SHLL CL, R15
3086 DECL R15
3087 ANDQ R15, R14
3088 ADDQ R14, R9
3089
3090 // Load ctx.ofTable
3091 MOVQ ctx+16(FP), CX
3092 MOVQ 48(CX), CX
3093 MOVQ (CX)(R9*8), R9
3094
3095sequenceDecs_decodeSync_safe_amd64_skip_update:
3096 // Adjust offset
3097 MOVQ s+0(FP), CX
3098 MOVQ 8(SP), R13
3099 CMPQ AX, $0x01
3100 JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
3101 MOVUPS 144(CX), X0 // more than one extra bit: a new offset; push it on the 3-entry list at 144/152/160(s)
3102 MOVQ R13, 144(CX)
3103 MOVUPS X0, 152(CX) // old entries 0 and 1 become entries 1 and 2
3104 JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
3105
3106sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
3107 CMPQ 24(SP), $0x00000000
3108 JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
3109 INCQ R13 // literal length is zero: the offset value is bumped by one
3110 JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
3111
3112sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
3113 TESTQ R13, R13
3114 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
3115 MOVQ 144(CX), R13 // value 0: reuse entry 0 of the list, list unchanged
3116 JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
3117
3118sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
3119 MOVQ R13, AX
3120 XORQ R14, R14
3121 MOVQ $-1, R15
3122 CMPQ R13, $0x03
3123 CMOVQEQ R14, AX // value 3 selects entry 0 ...
3124 CMOVQEQ R15, R14 // ... minus one
3125 ADDQ 144(CX)(AX*8), R14 // R14 = selected list entry (plus the -1 adjustment)
3126 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
3127 MOVQ $0x00000001, R14 // a resulting offset of zero is replaced by 1
3128
3129sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
3130 CMPQ R13, $0x01
3131 JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
3132 MOVQ 152(CX), AX
3133 MOVQ AX, 160(CX) // value != 1: entry 1 moves to entry 2
3134
3135sequenceDecs_decodeSync_safe_amd64_adjust_skip:
3136 MOVQ 144(CX), AX
3137 MOVQ AX, 152(CX) // entry 0 moves to entry 1
3138 MOVQ R14, 144(CX) // the chosen offset becomes entry 0
3139 MOVQ R14, R13
3140
3141sequenceDecs_decodeSync_safe_amd64_after_adjust:
3142 MOVQ R13, 8(SP) // final match offset
3143
3144 // Check values
3145 MOVQ 16(SP), AX
3146 MOVQ 24(SP), CX
3147 LEAQ (AX)(CX*1), R14
3148 MOVQ s+0(FP), R15
3149 ADDQ R14, 256(R15) // accumulate ml + ll into the counter at 256(s)
3150 MOVQ ctx+16(FP), R14
3151 SUBQ CX, 104(R14) // subtract ll from the remaining-literals counter at 104(ctx)
3152 JS error_not_enough_literals
3153 CMPQ AX, $0x00020002
3154 JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
3155 TESTQ R13, R13
3156 JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
3157 TESTQ AX, AX
3158 JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch // offset 0 with a non-zero match length
3159
3160sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
3161 MOVQ 24(SP), AX // AX = literal length
3162 MOVQ 8(SP), CX // CX = match offset
3163 MOVQ 16(SP), R13 // R13 = match length
3164
3165 // Check if we have enough space in s.out
3166 LEAQ (AX)(R13*1), R14
3167 ADDQ R10, R14
3168 CMPQ R14, 32(SP)
3169 JA error_not_enough_space
3170
3171 // Copy literals
3172 TESTQ AX, AX
3173 JZ check_offset
3174 MOVQ AX, R14
3175 SUBQ $0x10, R14
3176 JB copy_1_small // fewer than 16 bytes: size-specific copies below
3177
3178copy_1_loop:
3179 MOVUPS (R11), X0
3180 MOVUPS X0, (R10)
3181 ADDQ $0x10, R11
3182 ADDQ $0x10, R10
3183 SUBQ $0x10, R14
3184 JAE copy_1_loop
3185 LEAQ 16(R11)(R14*1), R11 // R14 is negative here: step the pointers to the exact end
3186 LEAQ 16(R10)(R14*1), R10
3187 MOVUPS -16(R11), X0 // copy the last 16 bytes, overlapping what the loop already wrote
3188 MOVUPS X0, -16(R10)
3189 JMP copy_1_end
3190
3191copy_1_small:
3192 CMPQ AX, $0x03
3193 JE copy_1_move_3
3194 JB copy_1_move_1or2
3195 CMPQ AX, $0x08
3196 JB copy_1_move_4through7
3197 JMP copy_1_move_8through16
3198
3199copy_1_move_1or2:
3200 MOVB (R11), R14 // first and last byte; they coincide when the length is 1
3201 MOVB -1(R11)(AX*1), R15
3202 MOVB R14, (R10)
3203 MOVB R15, -1(R10)(AX*1)
3204 ADDQ AX, R11
3205 ADDQ AX, R10
3206 JMP copy_1_end
3207
3208copy_1_move_3:
3209 MOVW (R11), R14
3210 MOVB 2(R11), R15
3211 MOVW R14, (R10)
3212 MOVB R15, 2(R10)
3213 ADDQ AX, R11
3214 ADDQ AX, R10
3215 JMP copy_1_end
3216
3217copy_1_move_4through7:
3218 MOVL (R11), R14 // first 4 and last 4 bytes, possibly overlapping
3219 MOVL -4(R11)(AX*1), R15
3220 MOVL R14, (R10)
3221 MOVL R15, -4(R10)(AX*1)
3222 ADDQ AX, R11
3223 ADDQ AX, R10
3224 JMP copy_1_end
3225
3226copy_1_move_8through16:
3227 MOVQ (R11), R14 // first 8 and last 8 bytes, possibly overlapping
3228 MOVQ -8(R11)(AX*1), R15
3229 MOVQ R14, (R10)
3230 MOVQ R15, -8(R10)(AX*1)
3231 ADDQ AX, R11
3232 ADDQ AX, R10
3233
3234copy_1_end:
3235 ADDQ AX, R12 // output position += literal length
3236
3237 // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
3238check_offset:
3239 MOVQ R12, AX
3240 ADDQ 40(SP), AX
3241 CMPQ CX, AX
3242 JG error_match_off_too_big
3243 CMPQ CX, 56(SP)
3244 JG error_match_off_too_big
3245
3246 // Copy match from history
3247 MOVQ CX, AX
3248 SUBQ R12, AX // AX = offset - output position = bytes that lie in the history
3249 JLS copy_match // offset <= output position: the match is entirely in the output
3250 MOVQ 48(SP), R14
3251 SUBQ AX, R14 // R14 = source pointer, AX bytes before the end of history
3252 CMPQ R13, AX
3253 JG copy_all_from_history // match is longer than the history part: finish it from the output
3254 MOVQ R13, AX
3255 SUBQ $0x10, AX
3256 JB copy_4_small
3257
3258copy_4_loop:
3259 MOVUPS (R14), X0
3260 MOVUPS X0, (R10)
3261 ADDQ $0x10, R14
3262 ADDQ $0x10, R10
3263 SUBQ $0x10, AX
3264 JAE copy_4_loop
3265 LEAQ 16(R14)(AX*1), R14
3266 LEAQ 16(R10)(AX*1), R10
3267 MOVUPS -16(R14), X0
3268 MOVUPS X0, -16(R10)
3269 JMP copy_4_end
3270
3271copy_4_small:
3272 CMPQ R13, $0x03 // no 1-or-2 byte case here, unlike the other small copies
3273 JE copy_4_move_3
3274 CMPQ R13, $0x08
3275 JB copy_4_move_4through7
3276 JMP copy_4_move_8through16
3277
3278copy_4_move_3:
3279 MOVW (R14), AX
3280 MOVB 2(R14), CL
3281 MOVW AX, (R10)
3282 MOVB CL, 2(R10)
3283 ADDQ R13, R14
3284 ADDQ R13, R10
3285 JMP copy_4_end
3286
3287copy_4_move_4through7:
3288 MOVL (R14), AX
3289 MOVL -4(R14)(R13*1), CX
3290 MOVL AX, (R10)
3291 MOVL CX, -4(R10)(R13*1)
3292 ADDQ R13, R14
3293 ADDQ R13, R10
3294 JMP copy_4_end
3295
3296copy_4_move_8through16:
3297 MOVQ (R14), AX
3298 MOVQ -8(R14)(R13*1), CX
3299 MOVQ AX, (R10)
3300 MOVQ CX, -8(R10)(R13*1)
3301 ADDQ R13, R14
3302 ADDQ R13, R10
3303
3304copy_4_end:
3305 ADDQ R13, R12 // output position += match length
3306 JMP handle_loop
3307 JMP loop_finished // not reachable: follows an unconditional jump
3308
3309copy_all_from_history:
3310 MOVQ AX, R15 // copy the AX bytes that come from the history first
3311 SUBQ $0x10, R15
3312 JB copy_5_small
3313
3314copy_5_loop:
3315 MOVUPS (R14), X0
3316 MOVUPS X0, (R10)
3317 ADDQ $0x10, R14
3318 ADDQ $0x10, R10
3319 SUBQ $0x10, R15
3320 JAE copy_5_loop
3321 LEAQ 16(R14)(R15*1), R14
3322 LEAQ 16(R10)(R15*1), R10
3323 MOVUPS -16(R14), X0
3324 MOVUPS X0, -16(R10)
3325 JMP copy_5_end
3326
3327copy_5_small:
3328 CMPQ AX, $0x03
3329 JE copy_5_move_3
3330 JB copy_5_move_1or2
3331 CMPQ AX, $0x08
3332 JB copy_5_move_4through7
3333 JMP copy_5_move_8through16
3334
3335copy_5_move_1or2:
3336 MOVB (R14), R15
3337 MOVB -1(R14)(AX*1), BP
3338 MOVB R15, (R10)
3339 MOVB BP, -1(R10)(AX*1)
3340 ADDQ AX, R14
3341 ADDQ AX, R10
3342 JMP copy_5_end
3343
3344copy_5_move_3:
3345 MOVW (R14), R15
3346 MOVB 2(R14), BP
3347 MOVW R15, (R10)
3348 MOVB BP, 2(R10)
3349 ADDQ AX, R14
3350 ADDQ AX, R10
3351 JMP copy_5_end
3352
3353copy_5_move_4through7:
3354 MOVL (R14), R15
3355 MOVL -4(R14)(AX*1), BP
3356 MOVL R15, (R10)
3357 MOVL BP, -4(R10)(AX*1)
3358 ADDQ AX, R14
3359 ADDQ AX, R10
3360 JMP copy_5_end
3361
3362copy_5_move_8through16:
3363 MOVQ (R14), R15
3364 MOVQ -8(R14)(AX*1), BP
3365 MOVQ R15, (R10)
3366 MOVQ BP, -8(R10)(AX*1)
3367 ADDQ AX, R14
3368 ADDQ AX, R10
3369
3370copy_5_end:
3371 ADDQ AX, R12 // output position += bytes taken from history
3372 SUBQ AX, R13 // the rest of the match is copied from the output below
3373
3374 // Copy match from the current buffer
3375copy_match:
3376 MOVQ R10, AX
3377 SUBQ CX, AX // AX = source = write pointer - match offset
3378
3379 // ml <= mo
3380 CMPQ R13, CX
3381 JA copy_overlapping_match
3382
3383 // Copy non-overlapping match
3384 ADDQ R13, R12
3385 MOVQ R13, CX
3386 SUBQ $0x10, CX
3387 JB copy_2_small
3388
3389copy_2_loop:
3390 MOVUPS (AX), X0
3391 MOVUPS X0, (R10)
3392 ADDQ $0x10, AX
3393 ADDQ $0x10, R10
3394 SUBQ $0x10, CX
3395 JAE copy_2_loop
3396 LEAQ 16(AX)(CX*1), AX
3397 LEAQ 16(R10)(CX*1), R10
3398 MOVUPS -16(AX), X0
3399 MOVUPS X0, -16(R10)
3400 JMP copy_2_end
3401
3402copy_2_small:
3403 CMPQ R13, $0x03
3404 JE copy_2_move_3
3405 JB copy_2_move_1or2
3406 CMPQ R13, $0x08
3407 JB copy_2_move_4through7
3408 JMP copy_2_move_8through16
3409
3410copy_2_move_1or2:
3411 MOVB (AX), CL
3412 MOVB -1(AX)(R13*1), R14
3413 MOVB CL, (R10)
3414 MOVB R14, -1(R10)(R13*1)
3415 ADDQ R13, AX
3416 ADDQ R13, R10
3417 JMP copy_2_end
3418
3419copy_2_move_3:
3420 MOVW (AX), CX
3421 MOVB 2(AX), R14
3422 MOVW CX, (R10)
3423 MOVB R14, 2(R10)
3424 ADDQ R13, AX
3425 ADDQ R13, R10
3426 JMP copy_2_end
3427
3428copy_2_move_4through7:
3429 MOVL (AX), CX
3430 MOVL -4(AX)(R13*1), R14
3431 MOVL CX, (R10)
3432 MOVL R14, -4(R10)(R13*1)
3433 ADDQ R13, AX
3434 ADDQ R13, R10
3435 JMP copy_2_end
3436
3437copy_2_move_8through16:
3438 MOVQ (AX), CX
3439 MOVQ -8(AX)(R13*1), R14
3440 MOVQ CX, (R10)
3441 MOVQ R14, -8(R10)(R13*1)
3442 ADDQ R13, AX
3443 ADDQ R13, R10
3444
3445copy_2_end:
3446 JMP handle_loop
3447
3448 // Copy overlapping match
3449copy_overlapping_match:
3450 ADDQ R13, R12
3451
3452copy_slow_3:
3453 MOVB (AX), CL // byte at a time so bytes written earlier in this loop can be re-read
3454 MOVB CL, (R10)
3455 INCQ AX
3456 INCQ R10
3457 DECQ R13
3458 JNZ copy_slow_3
3459
3460handle_loop:
3461 MOVQ ctx+16(FP), AX
3462 DECQ 96(AX) // one sequence done; keep looping until the counter goes negative
3463 JNS sequenceDecs_decodeSync_safe_amd64_main_loop
3464
3465loop_finished:
3466 MOVQ br+8(FP), AX // write the bit reader state back
3467 MOVQ DX, 24(AX)
3468 MOVB BL, 32(AX)
3469 MOVQ SI, 8(AX)
3470
3471 // Update the context
3472 MOVQ ctx+16(FP), AX
3473 MOVQ R12, 136(AX) // output position
3474 MOVQ 144(AX), CX
3475 SUBQ CX, R11
3476 MOVQ R11, 168(AX) // literals consumed = literals pointer - literals base
3477
3478 // Return success
3479 MOVQ $0x00000000, ret+24(FP)
3480 RET
3481
3482 // Return with match length error
3483sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
3484 MOVQ 16(SP), AX
3485 MOVQ ctx+16(FP), CX
3486 MOVQ AX, 216(CX) // report the offending match length
3487 MOVQ $0x00000001, ret+24(FP)
3488 RET
3489
3490 // Return with match too long error
3491sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
3492 MOVQ ctx+16(FP), AX
3493 MOVQ 16(SP), CX
3494 MOVQ CX, 216(AX) // report the offending match length
3495 MOVQ $0x00000002, ret+24(FP)
3496 RET
3497
3498 // Return with match offset too long error
3499error_match_off_too_big:
3500 MOVQ ctx+16(FP), AX
3501 MOVQ 8(SP), CX
3502 MOVQ CX, 224(AX) // report the offending match offset
3503 MOVQ R12, 136(AX) // and the output position reached so far
3504 MOVQ $0x00000003, ret+24(FP)
3505 RET
3506
3507 // Return with not enough literals error
3508error_not_enough_literals:
3509 MOVQ ctx+16(FP), AX
3510 MOVQ 24(SP), CX
3511 MOVQ CX, 208(AX) // report the offending literal length
3512 MOVQ $0x00000004, ret+24(FP)
3513 RET
3514
3515 // Return with overread error
3516error_overread:
3517 MOVQ $0x00000006, ret+24(FP)
3518 RET
3519
3520 // Return with not enough output space error
3521error_not_enough_space:
3522 MOVQ ctx+16(FP), AX
3523 MOVQ 24(SP), CX
3524 MOVQ CX, 208(AX) // report literal length,
3525 MOVQ 16(SP), CX
3526 MOVQ CX, 216(AX) // match length,
3527 MOVQ R12, 136(AX) // and the output position reached so far
3528 MOVQ $0x00000005, ret+24(FP)
3529 RET
3530
3531// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
3532// Requires: BMI, BMI2, CMOV, SSE
//
// BMI/BMI2 flavour of the "safe" synchronous sequence decoder: it decodes each
// sequence from the bit stream and executes it right away (copy literals,
// then copy the match from the history and/or the output written so far).
// Bit fields are extracted with BEXTR/BZHI/SHRX instead of shift-and-mask
// sequences. The loop is driven by the counter at 96(ctx); the result code is
// stored in ret+24(FP):
//   0 success, 1 match length given with a zero offset, 2 match length too
//   big, 3 match offset too big, 4 not enough literals, 5 not enough output
//   space, 6 bit stream overread.
//
// Registers inside the loop:
//   AX  bit container            DX  number of bits already consumed
//   BX  input bytes still unread SI/DI/R8  literal-length/match-length/offset state
//   R9  output write pointer     R10 literals read pointer
//   R11 output position
// Stack slots:
//   (SP)   input read pointer    8(SP)  match offset
//   16(SP) match length          24(SP) literal length
//   32(SP) end of output buffer  40(SP) history length
//   48(SP) end of history        56(SP) window size
// NOTE(review): struct field names are not visible here; offsets into s, br
// and ctx are described by how this code uses them - confirm against the Go
// struct definitions.
3533TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
3534 MOVQ br+8(FP), BX
3535 MOVQ 24(BX), AX // AX = bit container (written back to 24(br) on exit)
3536 MOVBQZX 32(BX), DX // DX = bits consumed (written back to 32(br) on exit)
3537 MOVQ (BX), CX
3538 MOVQ 8(BX), BX // BX = unread input bytes (written back to 8(br) on exit)
3539 ADDQ BX, CX
3540 MOVQ CX, (SP) // (SP) = input base + unread bytes; input is consumed backwards from here
3541 MOVQ ctx+16(FP), CX
3542 MOVQ 72(CX), SI // literal length state
3543 MOVQ 80(CX), DI // match length state
3544 MOVQ 88(CX), R8 // offset state
3545 XORQ R9, R9
3546 MOVQ R9, 8(SP) // clear the mo/ml/ll scratch slots
3547 MOVQ R9, 16(SP)
3548 MOVQ R9, 24(SP)
3549 MOVQ 112(CX), R9 // output base pointer
3550 MOVQ 128(CX), R10
3551 MOVQ R10, 32(SP) // output capacity, turned into a pointer below
3552 MOVQ 144(CX), R10 // literals read pointer
3553 MOVQ 136(CX), R11 // output position
3554 MOVQ 200(CX), R12
3555 MOVQ R12, 56(SP) // window size (compared against the match offset)
3556 MOVQ 176(CX), R12
3557 MOVQ R12, 48(SP) // history base pointer
3558 MOVQ 184(CX), CX
3559 MOVQ CX, 40(SP) // history length
3560 MOVQ 40(SP), CX
3561 ADDQ CX, 48(SP) // 48(SP) = history base + history length = end of history
3562
3563 // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
3564 ADDQ R9, 32(SP)
3565
3566 // outBase += outPosition
3567 ADDQ R11, R9
3568
3569sequenceDecs_decodeSync_safe_bmi2_main_loop:
3570 MOVQ (SP), R12 // R12 = input read pointer for this iteration
3571
3572 // Fill bitreader to have enough for the offset and match length.
3573 CMPQ BX, $0x08
3574 JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
3575 MOVQ DX, CX // fast path: at least 8 bytes left, reload a whole quadword
3576 SHRQ $0x03, CX // whole bytes already consumed
3577 SUBQ CX, R12
3578 MOVQ (R12), AX
3579 SUBQ CX, BX
3580 ANDQ $0x07, DX // keep only the sub-byte part of the consumed-bit count
3581 JMP sequenceDecs_decodeSync_safe_bmi2_fill_end
3582
3583sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
3584 CMPQ BX, $0x00
3585 JLE sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
3586 CMPQ DX, $0x07
3587 JLE sequenceDecs_decodeSync_safe_bmi2_fill_end // fewer than 8 bits consumed: nothing to refill
3588 SHLQ $0x08, AX // make room for one more input byte
3589 SUBQ $0x01, R12
3590 SUBQ $0x01, BX
3591 SUBQ $0x08, DX
3592 MOVBQZX (R12), CX
3593 ORQ CX, AX
3594 JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
3595
3596sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
3597 CMPQ DX, $0x40 // input exhausted: more than 64 consumed bits means we read past the data
3598 JA error_overread
3599
3600sequenceDecs_decodeSync_safe_bmi2_fill_end:
3601 // Update offset
3602 MOVQ $0x00000808, CX // BEXTR control word: start at bit 8, length 8
3603 BEXTRQ CX, R8, R13 // R13 = byte 1 of the state = number of extra bits to read
3604 MOVQ AX, R14
3605 LEAQ (DX)(R13*1), CX
3606 ROLQ CL, R14 // rotate the freshly consumed bits into the low end
3607 BZHIQ R13, R14, R14 // keep only those R13 low bits
3608 MOVQ CX, DX // consumed bits += R13
3609 MOVQ R8, CX
3610 SHRQ $0x20, CX // upper 32 bits of the state = baseline value
3611 ADDQ R14, CX
3612 MOVQ CX, 8(SP) // raw match offset
3613
3614 // Update match length
3615 MOVQ $0x00000808, CX
3616 BEXTRQ CX, DI, R13
3617 MOVQ AX, R14
3618 LEAQ (DX)(R13*1), CX
3619 ROLQ CL, R14
3620 BZHIQ R13, R14, R14
3621 MOVQ CX, DX
3622 MOVQ DI, CX
3623 SHRQ $0x20, CX
3624 ADDQ R14, CX
3625 MOVQ CX, 16(SP) // match length
3626
3627 // Fill bitreader to have enough for the remaining
3628 CMPQ BX, $0x08
3629 JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
3630 MOVQ DX, CX
3631 SHRQ $0x03, CX
3632 SUBQ CX, R12
3633 MOVQ (R12), AX
3634 SUBQ CX, BX
3635 ANDQ $0x07, DX
3636 JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end
3637
3638sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
3639 CMPQ BX, $0x00
3640 JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
3641 CMPQ DX, $0x07
3642 JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
3643 SHLQ $0x08, AX
3644 SUBQ $0x01, R12
3645 SUBQ $0x01, BX
3646 SUBQ $0x08, DX
3647 MOVBQZX (R12), CX
3648 ORQ CX, AX
3649 JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
3650
3651sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
3652 CMPQ DX, $0x40
3653 JA error_overread
3654
3655sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
3656 // Update literal length
3657 MOVQ $0x00000808, CX
3658 BEXTRQ CX, SI, R13
3659 MOVQ AX, R14
3660 LEAQ (DX)(R13*1), CX
3661 ROLQ CL, R14
3662 BZHIQ R13, R14, R14
3663 MOVQ CX, DX
3664 MOVQ SI, CX
3665 SHRQ $0x20, CX
3666 ADDQ R14, CX
3667 MOVQ CX, 24(SP) // literal length
3668
3669 // Fill bitreader for state updates
3670 MOVQ R12, (SP) // persist the input read pointer; R12 is reused below
3671 MOVQ $0x00000808, CX
3672 BEXTRQ CX, R8, R12 // R12 = extra-bit count of the offset code, used by "Adjust offset"
3673 MOVQ ctx+16(FP), CX
3674 CMPQ 96(CX), $0x00
3675 JZ sequenceDecs_decodeSync_safe_bmi2_skip_update // counter is zero: last sequence, states are not advanced
3676 LEAQ (SI)(DI*1), R13
3677 ADDQ R8, R13
3678 MOVBQZX R13, R13 // R13 = low byte of the summed states = total bits needed by the three updates
3679 LEAQ (DX)(R13*1), CX
3680 MOVQ AX, R14
3681 MOVQ CX, DX // consume all of those bits at once
3682 ROLQ CL, R14
3683 BZHIQ R13, R14, R14 // R14 = the bits for all three state updates
3684
3685 // Update Offset State
3686 BZHIQ R8, R14, CX // low bits of R14; the count comes from the low byte of the state
3687 SHRXQ R8, R14, R14 // drop them so the next state's bits are at the bottom
3688 SHRL $0x10, R8 // bits 16..31 of the state = base index of the next state
3689 ADDQ CX, R8
3690
3691 // Load ctx.ofTable
3692 MOVQ ctx+16(FP), CX
3693 MOVQ 48(CX), CX
3694 MOVQ (CX)(R8*8), R8
3695
3696 // Update Match Length State
3697 BZHIQ DI, R14, CX
3698 SHRXQ DI, R14, R14
3699 SHRL $0x10, DI
3700 ADDQ CX, DI
3701
3702 // Load ctx.mlTable
3703 MOVQ ctx+16(FP), CX
3704 MOVQ 24(CX), CX
3705 MOVQ (CX)(DI*8), DI
3706
3707 // Update Literal Length State
3708 BZHIQ SI, R14, CX
3709 SHRL $0x10, SI
3710 ADDQ CX, SI
3711
3712 // Load ctx.llTable
3713 MOVQ ctx+16(FP), CX
3714 MOVQ (CX), CX
3715 MOVQ (CX)(SI*8), SI
3716
3717sequenceDecs_decodeSync_safe_bmi2_skip_update:
3718 // Adjust offset
3719 MOVQ s+0(FP), CX
3720 MOVQ 8(SP), R13
3721 CMPQ R12, $0x01
3722 JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
3723 MOVUPS 144(CX), X0 // more than one extra bit: a new offset; push it on the 3-entry list at 144/152/160(s)
3724 MOVQ R13, 144(CX)
3725 MOVUPS X0, 152(CX) // old entries 0 and 1 become entries 1 and 2
3726 JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
3727
3728sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
3729 CMPQ 24(SP), $0x00000000
3730 JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
3731 INCQ R13 // literal length is zero: the offset value is bumped by one
3732 JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
3733
3734sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
3735 TESTQ R13, R13
3736 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
3737 MOVQ 144(CX), R13 // value 0: reuse entry 0 of the list, list unchanged
3738 JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
3739
3740sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
3741 MOVQ R13, R12
3742 XORQ R14, R14
3743 MOVQ $-1, R15
3744 CMPQ R13, $0x03
3745 CMOVQEQ R14, R12 // value 3 selects entry 0 ...
3746 CMOVQEQ R15, R14 // ... minus one
3747 ADDQ 144(CX)(R12*8), R14 // R14 = selected list entry (plus the -1 adjustment)
3748 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
3749 MOVQ $0x00000001, R14 // a resulting offset of zero is replaced by 1
3750
3751sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
3752 CMPQ R13, $0x01
3753 JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
3754 MOVQ 152(CX), R12
3755 MOVQ R12, 160(CX) // value != 1: entry 1 moves to entry 2
3756
3757sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
3758 MOVQ 144(CX), R12
3759 MOVQ R12, 152(CX) // entry 0 moves to entry 1
3760 MOVQ R14, 144(CX) // the chosen offset becomes entry 0
3761 MOVQ R14, R13
3762
3763sequenceDecs_decodeSync_safe_bmi2_after_adjust:
3764 MOVQ R13, 8(SP) // final match offset
3765
3766 // Check values
3767 MOVQ 16(SP), CX
3768 MOVQ 24(SP), R12
3769 LEAQ (CX)(R12*1), R14
3770 MOVQ s+0(FP), R15
3771 ADDQ R14, 256(R15) // accumulate ml + ll into the counter at 256(s)
3772 MOVQ ctx+16(FP), R14
3773 SUBQ R12, 104(R14) // subtract ll from the remaining-literals counter at 104(ctx)
3774 JS error_not_enough_literals
3775 CMPQ CX, $0x00020002
3776 JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
3777 TESTQ R13, R13
3778 JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
3779 TESTQ CX, CX
3780 JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch // offset 0 with a non-zero match length
3781
3782sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
3783 MOVQ 24(SP), CX // CX = literal length
3784 MOVQ 8(SP), R12 // R12 = match offset
3785 MOVQ 16(SP), R13 // R13 = match length
3786
3787 // Check if we have enough space in s.out
3788 LEAQ (CX)(R13*1), R14
3789 ADDQ R9, R14
3790 CMPQ R14, 32(SP)
3791 JA error_not_enough_space
3792
3793 // Copy literals
3794 TESTQ CX, CX
3795 JZ check_offset
3796 MOVQ CX, R14
3797 SUBQ $0x10, R14
3798 JB copy_1_small // fewer than 16 bytes: size-specific copies below
3799
3800copy_1_loop:
3801 MOVUPS (R10), X0
3802 MOVUPS X0, (R9)
3803 ADDQ $0x10, R10
3804 ADDQ $0x10, R9
3805 SUBQ $0x10, R14
3806 JAE copy_1_loop
3807 LEAQ 16(R10)(R14*1), R10 // R14 is negative here: step the pointers to the exact end
3808 LEAQ 16(R9)(R14*1), R9
3809 MOVUPS -16(R10), X0 // copy the last 16 bytes, overlapping what the loop already wrote
3810 MOVUPS X0, -16(R9)
3811 JMP copy_1_end
3812
3813copy_1_small:
3814 CMPQ CX, $0x03
3815 JE copy_1_move_3
3816 JB copy_1_move_1or2
3817 CMPQ CX, $0x08
3818 JB copy_1_move_4through7
3819 JMP copy_1_move_8through16
3820
3821copy_1_move_1or2:
3822 MOVB (R10), R14 // first and last byte; they coincide when the length is 1
3823 MOVB -1(R10)(CX*1), R15
3824 MOVB R14, (R9)
3825 MOVB R15, -1(R9)(CX*1)
3826 ADDQ CX, R10
3827 ADDQ CX, R9
3828 JMP copy_1_end
3829
3830copy_1_move_3:
3831 MOVW (R10), R14
3832 MOVB 2(R10), R15
3833 MOVW R14, (R9)
3834 MOVB R15, 2(R9)
3835 ADDQ CX, R10
3836 ADDQ CX, R9
3837 JMP copy_1_end
3838
3839copy_1_move_4through7:
3840 MOVL (R10), R14 // first 4 and last 4 bytes, possibly overlapping
3841 MOVL -4(R10)(CX*1), R15
3842 MOVL R14, (R9)
3843 MOVL R15, -4(R9)(CX*1)
3844 ADDQ CX, R10
3845 ADDQ CX, R9
3846 JMP copy_1_end
3847
3848copy_1_move_8through16:
3849 MOVQ (R10), R14 // first 8 and last 8 bytes, possibly overlapping
3850 MOVQ -8(R10)(CX*1), R15
3851 MOVQ R14, (R9)
3852 MOVQ R15, -8(R9)(CX*1)
3853 ADDQ CX, R10
3854 ADDQ CX, R9
3855
3856copy_1_end:
3857 ADDQ CX, R11 // output position += literal length
3858
3859 // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
3860check_offset:
3861 MOVQ R11, CX
3862 ADDQ 40(SP), CX
3863 CMPQ R12, CX
3864 JG error_match_off_too_big
3865 CMPQ R12, 56(SP)
3866 JG error_match_off_too_big
3867
3868 // Copy match from history
3869 MOVQ R12, CX
3870 SUBQ R11, CX // CX = offset - output position = bytes that lie in the history
3871 JLS copy_match // offset <= output position: the match is entirely in the output
3872 MOVQ 48(SP), R14
3873 SUBQ CX, R14 // R14 = source pointer, CX bytes before the end of history
3874 CMPQ R13, CX
3875 JG copy_all_from_history // match is longer than the history part: finish it from the output
3876 MOVQ R13, CX
3877 SUBQ $0x10, CX
3878 JB copy_4_small
3879
3880copy_4_loop:
3881 MOVUPS (R14), X0
3882 MOVUPS X0, (R9)
3883 ADDQ $0x10, R14
3884 ADDQ $0x10, R9
3885 SUBQ $0x10, CX
3886 JAE copy_4_loop
3887 LEAQ 16(R14)(CX*1), R14
3888 LEAQ 16(R9)(CX*1), R9
3889 MOVUPS -16(R14), X0
3890 MOVUPS X0, -16(R9)
3891 JMP copy_4_end
3892
3893copy_4_small:
3894 CMPQ R13, $0x03 // no 1-or-2 byte case here, unlike the other small copies
3895 JE copy_4_move_3
3896 CMPQ R13, $0x08
3897 JB copy_4_move_4through7
3898 JMP copy_4_move_8through16
3899
3900copy_4_move_3:
3901 MOVW (R14), CX
3902 MOVB 2(R14), R12
3903 MOVW CX, (R9)
3904 MOVB R12, 2(R9)
3905 ADDQ R13, R14
3906 ADDQ R13, R9
3907 JMP copy_4_end
3908
3909copy_4_move_4through7:
3910 MOVL (R14), CX
3911 MOVL -4(R14)(R13*1), R12
3912 MOVL CX, (R9)
3913 MOVL R12, -4(R9)(R13*1)
3914 ADDQ R13, R14
3915 ADDQ R13, R9
3916 JMP copy_4_end
3917
3918copy_4_move_8through16:
3919 MOVQ (R14), CX
3920 MOVQ -8(R14)(R13*1), R12
3921 MOVQ CX, (R9)
3922 MOVQ R12, -8(R9)(R13*1)
3923 ADDQ R13, R14
3924 ADDQ R13, R9
3925
3926copy_4_end:
3927 ADDQ R13, R11 // output position += match length
3928 JMP handle_loop
3929 JMP loop_finished // not reachable: follows an unconditional jump
3930
3931copy_all_from_history:
3932 MOVQ CX, R15 // copy the CX bytes that come from the history first
3933 SUBQ $0x10, R15
3934 JB copy_5_small
3935
3936copy_5_loop:
3937 MOVUPS (R14), X0
3938 MOVUPS X0, (R9)
3939 ADDQ $0x10, R14
3940 ADDQ $0x10, R9
3941 SUBQ $0x10, R15
3942 JAE copy_5_loop
3943 LEAQ 16(R14)(R15*1), R14
3944 LEAQ 16(R9)(R15*1), R9
3945 MOVUPS -16(R14), X0
3946 MOVUPS X0, -16(R9)
3947 JMP copy_5_end
3948
3949copy_5_small:
3950 CMPQ CX, $0x03
3951 JE copy_5_move_3
3952 JB copy_5_move_1or2
3953 CMPQ CX, $0x08
3954 JB copy_5_move_4through7
3955 JMP copy_5_move_8through16
3956
3957copy_5_move_1or2:
3958 MOVB (R14), R15
3959 MOVB -1(R14)(CX*1), BP
3960 MOVB R15, (R9)
3961 MOVB BP, -1(R9)(CX*1)
3962 ADDQ CX, R14
3963 ADDQ CX, R9
3964 JMP copy_5_end
3965
3966copy_5_move_3:
3967 MOVW (R14), R15
3968 MOVB 2(R14), BP
3969 MOVW R15, (R9)
3970 MOVB BP, 2(R9)
3971 ADDQ CX, R14
3972 ADDQ CX, R9
3973 JMP copy_5_end
3974
3975copy_5_move_4through7:
3976 MOVL (R14), R15
3977 MOVL -4(R14)(CX*1), BP
3978 MOVL R15, (R9)
3979 MOVL BP, -4(R9)(CX*1)
3980 ADDQ CX, R14
3981 ADDQ CX, R9
3982 JMP copy_5_end
3983
3984copy_5_move_8through16:
3985 MOVQ (R14), R15
3986 MOVQ -8(R14)(CX*1), BP
3987 MOVQ R15, (R9)
3988 MOVQ BP, -8(R9)(CX*1)
3989 ADDQ CX, R14
3990 ADDQ CX, R9
3991
3992copy_5_end:
3993 ADDQ CX, R11 // output position += bytes taken from history
3994 SUBQ CX, R13 // the rest of the match is copied from the output below
3995
3996 // Copy match from the current buffer
3997copy_match:
3998 MOVQ R9, CX
3999 SUBQ R12, CX // CX = source = write pointer - match offset
4000
4001 // ml <= mo
4002 CMPQ R13, R12
4003 JA copy_overlapping_match
4004
4005 // Copy non-overlapping match
4006 ADDQ R13, R11
4007 MOVQ R13, R12
4008 SUBQ $0x10, R12
4009 JB copy_2_small
4010
4011copy_2_loop:
4012 MOVUPS (CX), X0
4013 MOVUPS X0, (R9)
4014 ADDQ $0x10, CX
4015 ADDQ $0x10, R9
4016 SUBQ $0x10, R12
4017 JAE copy_2_loop
4018 LEAQ 16(CX)(R12*1), CX
4019 LEAQ 16(R9)(R12*1), R9
4020 MOVUPS -16(CX), X0
4021 MOVUPS X0, -16(R9)
4022 JMP copy_2_end
4023
4024copy_2_small:
4025 CMPQ R13, $0x03
4026 JE copy_2_move_3
4027 JB copy_2_move_1or2
4028 CMPQ R13, $0x08
4029 JB copy_2_move_4through7
4030 JMP copy_2_move_8through16
4031
4032copy_2_move_1or2:
4033 MOVB (CX), R12
4034 MOVB -1(CX)(R13*1), R14
4035 MOVB R12, (R9)
4036 MOVB R14, -1(R9)(R13*1)
4037 ADDQ R13, CX
4038 ADDQ R13, R9
4039 JMP copy_2_end
4040
4041copy_2_move_3:
4042 MOVW (CX), R12
4043 MOVB 2(CX), R14
4044 MOVW R12, (R9)
4045 MOVB R14, 2(R9)
4046 ADDQ R13, CX
4047 ADDQ R13, R9
4048 JMP copy_2_end
4049
4050copy_2_move_4through7:
4051 MOVL (CX), R12
4052 MOVL -4(CX)(R13*1), R14
4053 MOVL R12, (R9)
4054 MOVL R14, -4(R9)(R13*1)
4055 ADDQ R13, CX
4056 ADDQ R13, R9
4057 JMP copy_2_end
4058
4059copy_2_move_8through16:
4060 MOVQ (CX), R12
4061 MOVQ -8(CX)(R13*1), R14
4062 MOVQ R12, (R9)
4063 MOVQ R14, -8(R9)(R13*1)
4064 ADDQ R13, CX
4065 ADDQ R13, R9
4066
4067copy_2_end:
4068 JMP handle_loop
4069
4070 // Copy overlapping match
4071copy_overlapping_match:
4072 ADDQ R13, R11
4073
4074copy_slow_3:
4075 MOVB (CX), R12 // byte at a time so bytes written earlier in this loop can be re-read
4076 MOVB R12, (R9)
4077 INCQ CX
4078 INCQ R9
4079 DECQ R13
4080 JNZ copy_slow_3
4081
4082handle_loop:
4083 MOVQ ctx+16(FP), CX
4084 DECQ 96(CX) // one sequence done; keep looping until the counter goes negative
4085 JNS sequenceDecs_decodeSync_safe_bmi2_main_loop
4086
4087loop_finished:
4088 MOVQ br+8(FP), CX // write the bit reader state back
4089 MOVQ AX, 24(CX)
4090 MOVB DL, 32(CX)
4091 MOVQ BX, 8(CX)
4092
4093 // Update the context
4094 MOVQ ctx+16(FP), AX
4095 MOVQ R11, 136(AX) // output position
4096 MOVQ 144(AX), CX
4097 SUBQ CX, R10
4098 MOVQ R10, 168(AX) // literals consumed = literals pointer - literals base
4099
4100 // Return success
4101 MOVQ $0x00000000, ret+24(FP)
4102 RET
4103
4104 // Return with match length error
4105sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
4106 MOVQ 16(SP), AX
4107 MOVQ ctx+16(FP), CX
4108 MOVQ AX, 216(CX) // report the offending match length
4109 MOVQ $0x00000001, ret+24(FP)
4110 RET
4111
4112 // Return with match too long error
4113sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
4114 MOVQ ctx+16(FP), AX
4115 MOVQ 16(SP), CX
4116 MOVQ CX, 216(AX) // report the offending match length
4117 MOVQ $0x00000002, ret+24(FP)
4118 RET
4119
4120 // Return with match offset too long error
4121error_match_off_too_big:
4122 MOVQ ctx+16(FP), AX
4123 MOVQ 8(SP), CX
4124 MOVQ CX, 224(AX) // report the offending match offset
4125 MOVQ R11, 136(AX) // and the output position reached so far
4126 MOVQ $0x00000003, ret+24(FP)
4127 RET
4128
4129 // Return with not enough literals error
4130error_not_enough_literals:
4131 MOVQ ctx+16(FP), AX
4132 MOVQ 24(SP), CX
4133 MOVQ CX, 208(AX) // report the offending literal length
4134 MOVQ $0x00000004, ret+24(FP)
4135 RET
4136
4137 // Return with overread error
4138error_overread:
4139 MOVQ $0x00000006, ret+24(FP)
4140 RET
4141
4142 // Return with not enough output space error
4143error_not_enough_space:
4144 MOVQ ctx+16(FP), AX
4145 MOVQ 24(SP), CX
4146 MOVQ CX, 208(AX) // report literal length,
4147 MOVQ 16(SP), CX
4148 MOVQ CX, 216(AX) // match length,
4149 MOVQ R11, 136(AX) // and the output position reached so far
4150 MOVQ $0x00000005, ret+24(FP)
4151 RET
View as plain text