1// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
2
3//go:build !appengine && !noasm && gc && !noasm
4
5#include "textflag.h"
6
// _dummy_ contains no instructions other than RET. Its body only carries
// preprocessor directives that normalise the GOAMD64 feature-level macros
// before the remaining code in this file is assembled.
// NOTE(review): presumably the directives sit inside a function body because
// the code generator only emits text within functions - confirm against gen.go.
7// func _dummy_()
8TEXT ·_dummy_(SB), $0
// If GOAMD64_v4 is defined, make sure GOAMD64_v3 is defined too, so that the
// `#ifdef GOAMD64_v3` checks later in this file (which select TZCNTQ instead
// of BSFQ) also take the v3 path.
9#ifdef GOAMD64_v4
// Only define the macro when it is not already defined.
10#ifndef GOAMD64_v3
11#define GOAMD64_v3
12#endif
13#endif
// Nothing to execute: return immediately.
14 RET
15
16// func encodeBlockAsm(dst []byte, src []byte) int
17// Requires: BMI, SSE2
18TEXT ·encodeBlockAsm(SB), $65560-56
19 MOVQ dst_base+0(FP), AX
20 MOVQ $0x00000200, CX
21 LEAQ 24(SP), DX
22 PXOR X0, X0
23
24zero_loop_encodeBlockAsm:
25 MOVOU X0, (DX)
26 MOVOU X0, 16(DX)
27 MOVOU X0, 32(DX)
28 MOVOU X0, 48(DX)
29 MOVOU X0, 64(DX)
30 MOVOU X0, 80(DX)
31 MOVOU X0, 96(DX)
32 MOVOU X0, 112(DX)
33 ADDQ $0x80, DX
34 DECQ CX
35 JNZ zero_loop_encodeBlockAsm
36 MOVL $0x00000000, 12(SP)
37 MOVQ src_len+32(FP), CX
38 LEAQ -9(CX), DX
39 LEAQ -8(CX), BX
40 MOVL BX, 8(SP)
41 SHRQ $0x05, CX
42 SUBL CX, DX
43 LEAQ (AX)(DX*1), DX
44 MOVQ DX, (SP)
45 MOVL $0x00000001, CX
46 MOVL CX, 16(SP)
47 MOVQ src_base+24(FP), DX
48
49search_loop_encodeBlockAsm:
50 MOVL CX, BX
51 SUBL 12(SP), BX
52 SHRL $0x06, BX
53 LEAL 4(CX)(BX*1), BX
54 CMPL BX, 8(SP)
55 JAE emit_remainder_encodeBlockAsm
56 MOVQ (DX)(CX*1), SI
57 MOVL BX, 20(SP)
58 MOVQ $0x0000cf1bbcdcbf9b, R8
59 MOVQ SI, R9
60 MOVQ SI, R10
61 SHRQ $0x08, R10
62 SHLQ $0x10, R9
63 IMULQ R8, R9
64 SHRQ $0x32, R9
65 SHLQ $0x10, R10
66 IMULQ R8, R10
67 SHRQ $0x32, R10
68 MOVL 24(SP)(R9*4), BX
69 MOVL 24(SP)(R10*4), DI
70 MOVL CX, 24(SP)(R9*4)
71 LEAL 1(CX), R9
72 MOVL R9, 24(SP)(R10*4)
73 MOVQ SI, R9
74 SHRQ $0x10, R9
75 SHLQ $0x10, R9
76 IMULQ R8, R9
77 SHRQ $0x32, R9
78 MOVL CX, R8
79 SUBL 16(SP), R8
80 MOVL 1(DX)(R8*1), R10
81 MOVQ SI, R8
82 SHRQ $0x08, R8
83 CMPL R8, R10
84 JNE no_repeat_found_encodeBlockAsm
85 LEAL 1(CX), SI
86 MOVL 12(SP), DI
87 MOVL SI, BX
88 SUBL 16(SP), BX
89 JZ repeat_extend_back_end_encodeBlockAsm
90
91repeat_extend_back_loop_encodeBlockAsm:
92 CMPL SI, DI
93 JBE repeat_extend_back_end_encodeBlockAsm
94 MOVB -1(DX)(BX*1), R8
95 MOVB -1(DX)(SI*1), R9
96 CMPB R8, R9
97 JNE repeat_extend_back_end_encodeBlockAsm
98 LEAL -1(SI), SI
99 DECL BX
100 JNZ repeat_extend_back_loop_encodeBlockAsm
101
102repeat_extend_back_end_encodeBlockAsm:
103 MOVL SI, BX
104 SUBL 12(SP), BX
105 LEAQ 5(AX)(BX*1), BX
106 CMPQ BX, (SP)
107 JB repeat_dst_size_check_encodeBlockAsm
108 MOVQ $0x00000000, ret+48(FP)
109 RET
110
111repeat_dst_size_check_encodeBlockAsm:
112 MOVL 12(SP), BX
113 CMPL BX, SI
114 JEQ emit_literal_done_repeat_emit_encodeBlockAsm
115 MOVL SI, R8
116 MOVL SI, 12(SP)
117 LEAQ (DX)(BX*1), R9
118 SUBL BX, R8
119 LEAL -1(R8), BX
120 CMPL BX, $0x3c
121 JB one_byte_repeat_emit_encodeBlockAsm
122 CMPL BX, $0x00000100
123 JB two_bytes_repeat_emit_encodeBlockAsm
124 CMPL BX, $0x00010000
125 JB three_bytes_repeat_emit_encodeBlockAsm
126 CMPL BX, $0x01000000
127 JB four_bytes_repeat_emit_encodeBlockAsm
128 MOVB $0xfc, (AX)
129 MOVL BX, 1(AX)
130 ADDQ $0x05, AX
131 JMP memmove_long_repeat_emit_encodeBlockAsm
132
133four_bytes_repeat_emit_encodeBlockAsm:
134 MOVL BX, R10
135 SHRL $0x10, R10
136 MOVB $0xf8, (AX)
137 MOVW BX, 1(AX)
138 MOVB R10, 3(AX)
139 ADDQ $0x04, AX
140 JMP memmove_long_repeat_emit_encodeBlockAsm
141
142three_bytes_repeat_emit_encodeBlockAsm:
143 MOVB $0xf4, (AX)
144 MOVW BX, 1(AX)
145 ADDQ $0x03, AX
146 JMP memmove_long_repeat_emit_encodeBlockAsm
147
148two_bytes_repeat_emit_encodeBlockAsm:
149 MOVB $0xf0, (AX)
150 MOVB BL, 1(AX)
151 ADDQ $0x02, AX
152 CMPL BX, $0x40
153 JB memmove_repeat_emit_encodeBlockAsm
154 JMP memmove_long_repeat_emit_encodeBlockAsm
155
156one_byte_repeat_emit_encodeBlockAsm:
157 SHLB $0x02, BL
158 MOVB BL, (AX)
159 ADDQ $0x01, AX
160
161memmove_repeat_emit_encodeBlockAsm:
162 LEAQ (AX)(R8*1), BX
163
164 // genMemMoveShort
165 CMPQ R8, $0x08
166 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
167 CMPQ R8, $0x10
168 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
169 CMPQ R8, $0x20
170 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
171 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
172
173emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
174 MOVQ (R9), R10
175 MOVQ R10, (AX)
176 JMP memmove_end_copy_repeat_emit_encodeBlockAsm
177
178emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
179 MOVQ (R9), R10
180 MOVQ -8(R9)(R8*1), R9
181 MOVQ R10, (AX)
182 MOVQ R9, -8(AX)(R8*1)
183 JMP memmove_end_copy_repeat_emit_encodeBlockAsm
184
185emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
186 MOVOU (R9), X0
187 MOVOU -16(R9)(R8*1), X1
188 MOVOU X0, (AX)
189 MOVOU X1, -16(AX)(R8*1)
190 JMP memmove_end_copy_repeat_emit_encodeBlockAsm
191
192emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
193 MOVOU (R9), X0
194 MOVOU 16(R9), X1
195 MOVOU -32(R9)(R8*1), X2
196 MOVOU -16(R9)(R8*1), X3
197 MOVOU X0, (AX)
198 MOVOU X1, 16(AX)
199 MOVOU X2, -32(AX)(R8*1)
200 MOVOU X3, -16(AX)(R8*1)
201
202memmove_end_copy_repeat_emit_encodeBlockAsm:
203 MOVQ BX, AX
204 JMP emit_literal_done_repeat_emit_encodeBlockAsm
205
206memmove_long_repeat_emit_encodeBlockAsm:
207 LEAQ (AX)(R8*1), BX
208
209 // genMemMoveLong
210 MOVOU (R9), X0
211 MOVOU 16(R9), X1
212 MOVOU -32(R9)(R8*1), X2
213 MOVOU -16(R9)(R8*1), X3
214 MOVQ R8, R11
215 SHRQ $0x05, R11
216 MOVQ AX, R10
217 ANDL $0x0000001f, R10
218 MOVQ $0x00000040, R12
219 SUBQ R10, R12
220 DECQ R11
221 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
222 LEAQ -32(R9)(R12*1), R10
223 LEAQ -32(AX)(R12*1), R13
224
225emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
226 MOVOU (R10), X4
227 MOVOU 16(R10), X5
228 MOVOA X4, (R13)
229 MOVOA X5, 16(R13)
230 ADDQ $0x20, R13
231 ADDQ $0x20, R10
232 ADDQ $0x20, R12
233 DECQ R11
234 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
235
236emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
237 MOVOU -32(R9)(R12*1), X4
238 MOVOU -16(R9)(R12*1), X5
239 MOVOA X4, -32(AX)(R12*1)
240 MOVOA X5, -16(AX)(R12*1)
241 ADDQ $0x20, R12
242 CMPQ R8, R12
243 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
244 MOVOU X0, (AX)
245 MOVOU X1, 16(AX)
246 MOVOU X2, -32(AX)(R8*1)
247 MOVOU X3, -16(AX)(R8*1)
248 MOVQ BX, AX
249
250emit_literal_done_repeat_emit_encodeBlockAsm:
251 ADDL $0x05, CX
252 MOVL CX, BX
253 SUBL 16(SP), BX
254 MOVQ src_len+32(FP), R8
255 SUBL CX, R8
256 LEAQ (DX)(CX*1), R9
257 LEAQ (DX)(BX*1), BX
258
259 // matchLen
260 XORL R11, R11
261
262matchlen_loopback_16_repeat_extend_encodeBlockAsm:
263 CMPL R8, $0x10
264 JB matchlen_match8_repeat_extend_encodeBlockAsm
265 MOVQ (R9)(R11*1), R10
266 MOVQ 8(R9)(R11*1), R12
267 XORQ (BX)(R11*1), R10
268 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm
269 XORQ 8(BX)(R11*1), R12
270 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm
271 LEAL -16(R8), R8
272 LEAL 16(R11), R11
273 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm
274
275matchlen_bsf_16repeat_extend_encodeBlockAsm:
276#ifdef GOAMD64_v3
277 TZCNTQ R12, R12
278
279#else
280 BSFQ R12, R12
281
282#endif
283 SARQ $0x03, R12
284 LEAL 8(R11)(R12*1), R11
285 JMP repeat_extend_forward_end_encodeBlockAsm
286
287matchlen_match8_repeat_extend_encodeBlockAsm:
288 CMPL R8, $0x08
289 JB matchlen_match4_repeat_extend_encodeBlockAsm
290 MOVQ (R9)(R11*1), R10
291 XORQ (BX)(R11*1), R10
292 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm
293 LEAL -8(R8), R8
294 LEAL 8(R11), R11
295 JMP matchlen_match4_repeat_extend_encodeBlockAsm
296
297matchlen_bsf_8_repeat_extend_encodeBlockAsm:
298#ifdef GOAMD64_v3
299 TZCNTQ R10, R10
300
301#else
302 BSFQ R10, R10
303
304#endif
305 SARQ $0x03, R10
306 LEAL (R11)(R10*1), R11
307 JMP repeat_extend_forward_end_encodeBlockAsm
308
309matchlen_match4_repeat_extend_encodeBlockAsm:
310 CMPL R8, $0x04
311 JB matchlen_match2_repeat_extend_encodeBlockAsm
312 MOVL (R9)(R11*1), R10
313 CMPL (BX)(R11*1), R10
314 JNE matchlen_match2_repeat_extend_encodeBlockAsm
315 LEAL -4(R8), R8
316 LEAL 4(R11), R11
317
318matchlen_match2_repeat_extend_encodeBlockAsm:
319 CMPL R8, $0x01
320 JE matchlen_match1_repeat_extend_encodeBlockAsm
321 JB repeat_extend_forward_end_encodeBlockAsm
322 MOVW (R9)(R11*1), R10
323 CMPW (BX)(R11*1), R10
324 JNE matchlen_match1_repeat_extend_encodeBlockAsm
325 LEAL 2(R11), R11
326 SUBL $0x02, R8
327 JZ repeat_extend_forward_end_encodeBlockAsm
328
329matchlen_match1_repeat_extend_encodeBlockAsm:
330 MOVB (R9)(R11*1), R10
331 CMPB (BX)(R11*1), R10
332 JNE repeat_extend_forward_end_encodeBlockAsm
333 LEAL 1(R11), R11
334
335repeat_extend_forward_end_encodeBlockAsm:
336 ADDL R11, CX
337 MOVL CX, BX
338 SUBL SI, BX
339 MOVL 16(SP), SI
340 TESTL DI, DI
341 JZ repeat_as_copy_encodeBlockAsm
342
343 // emitRepeat
344emit_repeat_again_match_repeat_encodeBlockAsm:
345 MOVL BX, DI
346 LEAL -4(BX), BX
347 CMPL DI, $0x08
348 JBE repeat_two_match_repeat_encodeBlockAsm
349 CMPL DI, $0x0c
350 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm
351 CMPL SI, $0x00000800
352 JB repeat_two_offset_match_repeat_encodeBlockAsm
353
354cant_repeat_two_offset_match_repeat_encodeBlockAsm:
355 CMPL BX, $0x00000104
356 JB repeat_three_match_repeat_encodeBlockAsm
357 CMPL BX, $0x00010100
358 JB repeat_four_match_repeat_encodeBlockAsm
359 CMPL BX, $0x0100ffff
360 JB repeat_five_match_repeat_encodeBlockAsm
361 LEAL -16842747(BX), BX
362 MOVL $0xfffb001d, (AX)
363 MOVB $0xff, 4(AX)
364 ADDQ $0x05, AX
365 JMP emit_repeat_again_match_repeat_encodeBlockAsm
366
367repeat_five_match_repeat_encodeBlockAsm:
368 LEAL -65536(BX), BX
369 MOVL BX, SI
370 MOVW $0x001d, (AX)
371 MOVW BX, 2(AX)
372 SARL $0x10, SI
373 MOVB SI, 4(AX)
374 ADDQ $0x05, AX
375 JMP repeat_end_emit_encodeBlockAsm
376
377repeat_four_match_repeat_encodeBlockAsm:
378 LEAL -256(BX), BX
379 MOVW $0x0019, (AX)
380 MOVW BX, 2(AX)
381 ADDQ $0x04, AX
382 JMP repeat_end_emit_encodeBlockAsm
383
384repeat_three_match_repeat_encodeBlockAsm:
385 LEAL -4(BX), BX
386 MOVW $0x0015, (AX)
387 MOVB BL, 2(AX)
388 ADDQ $0x03, AX
389 JMP repeat_end_emit_encodeBlockAsm
390
391repeat_two_match_repeat_encodeBlockAsm:
392 SHLL $0x02, BX
393 ORL $0x01, BX
394 MOVW BX, (AX)
395 ADDQ $0x02, AX
396 JMP repeat_end_emit_encodeBlockAsm
397
398repeat_two_offset_match_repeat_encodeBlockAsm:
399 XORQ DI, DI
400 LEAL 1(DI)(BX*4), BX
401 MOVB SI, 1(AX)
402 SARL $0x08, SI
403 SHLL $0x05, SI
404 ORL SI, BX
405 MOVB BL, (AX)
406 ADDQ $0x02, AX
407 JMP repeat_end_emit_encodeBlockAsm
408
409repeat_as_copy_encodeBlockAsm:
410 // emitCopy
411 CMPL SI, $0x00010000
412 JB two_byte_offset_repeat_as_copy_encodeBlockAsm
413 CMPL BX, $0x40
414 JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm
415 MOVB $0xff, (AX)
416 MOVL SI, 1(AX)
417 LEAL -64(BX), BX
418 ADDQ $0x05, AX
419 CMPL BX, $0x04
420 JB four_bytes_remain_repeat_as_copy_encodeBlockAsm
421
422 // emitRepeat
423emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
424 MOVL BX, DI
425 LEAL -4(BX), BX
426 CMPL DI, $0x08
427 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
428 CMPL DI, $0x0c
429 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
430 CMPL SI, $0x00000800
431 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
432
433cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
434 CMPL BX, $0x00000104
435 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
436 CMPL BX, $0x00010100
437 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
438 CMPL BX, $0x0100ffff
439 JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
440 LEAL -16842747(BX), BX
441 MOVL $0xfffb001d, (AX)
442 MOVB $0xff, 4(AX)
443 ADDQ $0x05, AX
444 JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
445
446repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
447 LEAL -65536(BX), BX
448 MOVL BX, SI
449 MOVW $0x001d, (AX)
450 MOVW BX, 2(AX)
451 SARL $0x10, SI
452 MOVB SI, 4(AX)
453 ADDQ $0x05, AX
454 JMP repeat_end_emit_encodeBlockAsm
455
456repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
457 LEAL -256(BX), BX
458 MOVW $0x0019, (AX)
459 MOVW BX, 2(AX)
460 ADDQ $0x04, AX
461 JMP repeat_end_emit_encodeBlockAsm
462
463repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
464 LEAL -4(BX), BX
465 MOVW $0x0015, (AX)
466 MOVB BL, 2(AX)
467 ADDQ $0x03, AX
468 JMP repeat_end_emit_encodeBlockAsm
469
470repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
471 SHLL $0x02, BX
472 ORL $0x01, BX
473 MOVW BX, (AX)
474 ADDQ $0x02, AX
475 JMP repeat_end_emit_encodeBlockAsm
476
477repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
478 XORQ DI, DI
479 LEAL 1(DI)(BX*4), BX
480 MOVB SI, 1(AX)
481 SARL $0x08, SI
482 SHLL $0x05, SI
483 ORL SI, BX
484 MOVB BL, (AX)
485 ADDQ $0x02, AX
486 JMP repeat_end_emit_encodeBlockAsm
487
488four_bytes_remain_repeat_as_copy_encodeBlockAsm:
489 TESTL BX, BX
490 JZ repeat_end_emit_encodeBlockAsm
491 XORL DI, DI
492 LEAL -1(DI)(BX*4), BX
493 MOVB BL, (AX)
494 MOVL SI, 1(AX)
495 ADDQ $0x05, AX
496 JMP repeat_end_emit_encodeBlockAsm
497
498two_byte_offset_repeat_as_copy_encodeBlockAsm:
499 CMPL BX, $0x40
500 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm
501 CMPL SI, $0x00000800
502 JAE long_offset_short_repeat_as_copy_encodeBlockAsm
503 MOVL $0x00000001, DI
504 LEAL 16(DI), DI
505 MOVB SI, 1(AX)
506 MOVL SI, R8
507 SHRL $0x08, R8
508 SHLL $0x05, R8
509 ORL R8, DI
510 MOVB DI, (AX)
511 ADDQ $0x02, AX
512 SUBL $0x08, BX
513
514 // emitRepeat
515 LEAL -4(BX), BX
516 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
517
518emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
519 MOVL BX, DI
520 LEAL -4(BX), BX
521 CMPL DI, $0x08
522 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
523 CMPL DI, $0x0c
524 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
525 CMPL SI, $0x00000800
526 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
527
528cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
529 CMPL BX, $0x00000104
530 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
531 CMPL BX, $0x00010100
532 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
533 CMPL BX, $0x0100ffff
534 JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
535 LEAL -16842747(BX), BX
536 MOVL $0xfffb001d, (AX)
537 MOVB $0xff, 4(AX)
538 ADDQ $0x05, AX
539 JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
540
541repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
542 LEAL -65536(BX), BX
543 MOVL BX, SI
544 MOVW $0x001d, (AX)
545 MOVW BX, 2(AX)
546 SARL $0x10, SI
547 MOVB SI, 4(AX)
548 ADDQ $0x05, AX
549 JMP repeat_end_emit_encodeBlockAsm
550
551repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
552 LEAL -256(BX), BX
553 MOVW $0x0019, (AX)
554 MOVW BX, 2(AX)
555 ADDQ $0x04, AX
556 JMP repeat_end_emit_encodeBlockAsm
557
558repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
559 LEAL -4(BX), BX
560 MOVW $0x0015, (AX)
561 MOVB BL, 2(AX)
562 ADDQ $0x03, AX
563 JMP repeat_end_emit_encodeBlockAsm
564
565repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
566 SHLL $0x02, BX
567 ORL $0x01, BX
568 MOVW BX, (AX)
569 ADDQ $0x02, AX
570 JMP repeat_end_emit_encodeBlockAsm
571
572repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
573 XORQ DI, DI
574 LEAL 1(DI)(BX*4), BX
575 MOVB SI, 1(AX)
576 SARL $0x08, SI
577 SHLL $0x05, SI
578 ORL SI, BX
579 MOVB BL, (AX)
580 ADDQ $0x02, AX
581 JMP repeat_end_emit_encodeBlockAsm
582
583long_offset_short_repeat_as_copy_encodeBlockAsm:
584 MOVB $0xee, (AX)
585 MOVW SI, 1(AX)
586 LEAL -60(BX), BX
587 ADDQ $0x03, AX
588
589 // emitRepeat
590emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
591 MOVL BX, DI
592 LEAL -4(BX), BX
593 CMPL DI, $0x08
594 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
595 CMPL DI, $0x0c
596 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
597 CMPL SI, $0x00000800
598 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
599
600cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
601 CMPL BX, $0x00000104
602 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
603 CMPL BX, $0x00010100
604 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
605 CMPL BX, $0x0100ffff
606 JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
607 LEAL -16842747(BX), BX
608 MOVL $0xfffb001d, (AX)
609 MOVB $0xff, 4(AX)
610 ADDQ $0x05, AX
611 JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
612
613repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
614 LEAL -65536(BX), BX
615 MOVL BX, SI
616 MOVW $0x001d, (AX)
617 MOVW BX, 2(AX)
618 SARL $0x10, SI
619 MOVB SI, 4(AX)
620 ADDQ $0x05, AX
621 JMP repeat_end_emit_encodeBlockAsm
622
623repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
624 LEAL -256(BX), BX
625 MOVW $0x0019, (AX)
626 MOVW BX, 2(AX)
627 ADDQ $0x04, AX
628 JMP repeat_end_emit_encodeBlockAsm
629
630repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
631 LEAL -4(BX), BX
632 MOVW $0x0015, (AX)
633 MOVB BL, 2(AX)
634 ADDQ $0x03, AX
635 JMP repeat_end_emit_encodeBlockAsm
636
637repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
638 SHLL $0x02, BX
639 ORL $0x01, BX
640 MOVW BX, (AX)
641 ADDQ $0x02, AX
642 JMP repeat_end_emit_encodeBlockAsm
643
644repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
645 XORQ DI, DI
646 LEAL 1(DI)(BX*4), BX
647 MOVB SI, 1(AX)
648 SARL $0x08, SI
649 SHLL $0x05, SI
650 ORL SI, BX
651 MOVB BL, (AX)
652 ADDQ $0x02, AX
653 JMP repeat_end_emit_encodeBlockAsm
654
655two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
656 MOVL BX, DI
657 SHLL $0x02, DI
658 CMPL BX, $0x0c
659 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm
660 CMPL SI, $0x00000800
661 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm
662 LEAL -15(DI), DI
663 MOVB SI, 1(AX)
664 SHRL $0x08, SI
665 SHLL $0x05, SI
666 ORL SI, DI
667 MOVB DI, (AX)
668 ADDQ $0x02, AX
669 JMP repeat_end_emit_encodeBlockAsm
670
671emit_copy_three_repeat_as_copy_encodeBlockAsm:
672 LEAL -2(DI), DI
673 MOVB DI, (AX)
674 MOVW SI, 1(AX)
675 ADDQ $0x03, AX
676
677repeat_end_emit_encodeBlockAsm:
678 MOVL CX, 12(SP)
679 JMP search_loop_encodeBlockAsm
680
681no_repeat_found_encodeBlockAsm:
682 CMPL (DX)(BX*1), SI
683 JEQ candidate_match_encodeBlockAsm
684 SHRQ $0x08, SI
685 MOVL 24(SP)(R9*4), BX
686 LEAL 2(CX), R8
687 CMPL (DX)(DI*1), SI
688 JEQ candidate2_match_encodeBlockAsm
689 MOVL R8, 24(SP)(R9*4)
690 SHRQ $0x08, SI
691 CMPL (DX)(BX*1), SI
692 JEQ candidate3_match_encodeBlockAsm
693 MOVL 20(SP), CX
694 JMP search_loop_encodeBlockAsm
695
696candidate3_match_encodeBlockAsm:
697 ADDL $0x02, CX
698 JMP candidate_match_encodeBlockAsm
699
700candidate2_match_encodeBlockAsm:
701 MOVL R8, 24(SP)(R9*4)
702 INCL CX
703 MOVL DI, BX
704
705candidate_match_encodeBlockAsm:
706 MOVL 12(SP), SI
707 TESTL BX, BX
708 JZ match_extend_back_end_encodeBlockAsm
709
710match_extend_back_loop_encodeBlockAsm:
711 CMPL CX, SI
712 JBE match_extend_back_end_encodeBlockAsm
713 MOVB -1(DX)(BX*1), DI
714 MOVB -1(DX)(CX*1), R8
715 CMPB DI, R8
716 JNE match_extend_back_end_encodeBlockAsm
717 LEAL -1(CX), CX
718 DECL BX
719 JZ match_extend_back_end_encodeBlockAsm
720 JMP match_extend_back_loop_encodeBlockAsm
721
722match_extend_back_end_encodeBlockAsm:
723 MOVL CX, SI
724 SUBL 12(SP), SI
725 LEAQ 5(AX)(SI*1), SI
726 CMPQ SI, (SP)
727 JB match_dst_size_check_encodeBlockAsm
728 MOVQ $0x00000000, ret+48(FP)
729 RET
730
731match_dst_size_check_encodeBlockAsm:
732 MOVL CX, SI
733 MOVL 12(SP), DI
734 CMPL DI, SI
735 JEQ emit_literal_done_match_emit_encodeBlockAsm
736 MOVL SI, R8
737 MOVL SI, 12(SP)
738 LEAQ (DX)(DI*1), SI
739 SUBL DI, R8
740 LEAL -1(R8), DI
741 CMPL DI, $0x3c
742 JB one_byte_match_emit_encodeBlockAsm
743 CMPL DI, $0x00000100
744 JB two_bytes_match_emit_encodeBlockAsm
745 CMPL DI, $0x00010000
746 JB three_bytes_match_emit_encodeBlockAsm
747 CMPL DI, $0x01000000
748 JB four_bytes_match_emit_encodeBlockAsm
749 MOVB $0xfc, (AX)
750 MOVL DI, 1(AX)
751 ADDQ $0x05, AX
752 JMP memmove_long_match_emit_encodeBlockAsm
753
754four_bytes_match_emit_encodeBlockAsm:
755 MOVL DI, R9
756 SHRL $0x10, R9
757 MOVB $0xf8, (AX)
758 MOVW DI, 1(AX)
759 MOVB R9, 3(AX)
760 ADDQ $0x04, AX
761 JMP memmove_long_match_emit_encodeBlockAsm
762
763three_bytes_match_emit_encodeBlockAsm:
764 MOVB $0xf4, (AX)
765 MOVW DI, 1(AX)
766 ADDQ $0x03, AX
767 JMP memmove_long_match_emit_encodeBlockAsm
768
769two_bytes_match_emit_encodeBlockAsm:
770 MOVB $0xf0, (AX)
771 MOVB DI, 1(AX)
772 ADDQ $0x02, AX
773 CMPL DI, $0x40
774 JB memmove_match_emit_encodeBlockAsm
775 JMP memmove_long_match_emit_encodeBlockAsm
776
777one_byte_match_emit_encodeBlockAsm:
778 SHLB $0x02, DI
779 MOVB DI, (AX)
780 ADDQ $0x01, AX
781
782memmove_match_emit_encodeBlockAsm:
783 LEAQ (AX)(R8*1), DI
784
785 // genMemMoveShort
786 CMPQ R8, $0x08
787 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
788 CMPQ R8, $0x10
789 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
790 CMPQ R8, $0x20
791 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
792 JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
793
794emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
795 MOVQ (SI), R9
796 MOVQ R9, (AX)
797 JMP memmove_end_copy_match_emit_encodeBlockAsm
798
799emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
800 MOVQ (SI), R9
801 MOVQ -8(SI)(R8*1), SI
802 MOVQ R9, (AX)
803 MOVQ SI, -8(AX)(R8*1)
804 JMP memmove_end_copy_match_emit_encodeBlockAsm
805
806emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
807 MOVOU (SI), X0
808 MOVOU -16(SI)(R8*1), X1
809 MOVOU X0, (AX)
810 MOVOU X1, -16(AX)(R8*1)
811 JMP memmove_end_copy_match_emit_encodeBlockAsm
812
813emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
814 MOVOU (SI), X0
815 MOVOU 16(SI), X1
816 MOVOU -32(SI)(R8*1), X2
817 MOVOU -16(SI)(R8*1), X3
818 MOVOU X0, (AX)
819 MOVOU X1, 16(AX)
820 MOVOU X2, -32(AX)(R8*1)
821 MOVOU X3, -16(AX)(R8*1)
822
823memmove_end_copy_match_emit_encodeBlockAsm:
824 MOVQ DI, AX
825 JMP emit_literal_done_match_emit_encodeBlockAsm
826
827memmove_long_match_emit_encodeBlockAsm:
828 LEAQ (AX)(R8*1), DI
829
830 // genMemMoveLong
831 MOVOU (SI), X0
832 MOVOU 16(SI), X1
833 MOVOU -32(SI)(R8*1), X2
834 MOVOU -16(SI)(R8*1), X3
835 MOVQ R8, R10
836 SHRQ $0x05, R10
837 MOVQ AX, R9
838 ANDL $0x0000001f, R9
839 MOVQ $0x00000040, R11
840 SUBQ R9, R11
841 DECQ R10
842 JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
843 LEAQ -32(SI)(R11*1), R9
844 LEAQ -32(AX)(R11*1), R12
845
846emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
847 MOVOU (R9), X4
848 MOVOU 16(R9), X5
849 MOVOA X4, (R12)
850 MOVOA X5, 16(R12)
851 ADDQ $0x20, R12
852 ADDQ $0x20, R9
853 ADDQ $0x20, R11
854 DECQ R10
855 JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
856
857emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
858 MOVOU -32(SI)(R11*1), X4
859 MOVOU -16(SI)(R11*1), X5
860 MOVOA X4, -32(AX)(R11*1)
861 MOVOA X5, -16(AX)(R11*1)
862 ADDQ $0x20, R11
863 CMPQ R8, R11
864 JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
865 MOVOU X0, (AX)
866 MOVOU X1, 16(AX)
867 MOVOU X2, -32(AX)(R8*1)
868 MOVOU X3, -16(AX)(R8*1)
869 MOVQ DI, AX
870
871emit_literal_done_match_emit_encodeBlockAsm:
872match_nolit_loop_encodeBlockAsm:
873 MOVL CX, SI
874 SUBL BX, SI
875 MOVL SI, 16(SP)
876 ADDL $0x04, CX
877 ADDL $0x04, BX
878 MOVQ src_len+32(FP), SI
879 SUBL CX, SI
880 LEAQ (DX)(CX*1), DI
881 LEAQ (DX)(BX*1), BX
882
883 // matchLen
884 XORL R9, R9
885
886matchlen_loopback_16_match_nolit_encodeBlockAsm:
887 CMPL SI, $0x10
888 JB matchlen_match8_match_nolit_encodeBlockAsm
889 MOVQ (DI)(R9*1), R8
890 MOVQ 8(DI)(R9*1), R10
891 XORQ (BX)(R9*1), R8
892 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm
893 XORQ 8(BX)(R9*1), R10
894 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm
895 LEAL -16(SI), SI
896 LEAL 16(R9), R9
897 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm
898
899matchlen_bsf_16match_nolit_encodeBlockAsm:
900#ifdef GOAMD64_v3
901 TZCNTQ R10, R10
902
903#else
904 BSFQ R10, R10
905
906#endif
907 SARQ $0x03, R10
908 LEAL 8(R9)(R10*1), R9
909 JMP match_nolit_end_encodeBlockAsm
910
911matchlen_match8_match_nolit_encodeBlockAsm:
912 CMPL SI, $0x08
913 JB matchlen_match4_match_nolit_encodeBlockAsm
914 MOVQ (DI)(R9*1), R8
915 XORQ (BX)(R9*1), R8
916 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm
917 LEAL -8(SI), SI
918 LEAL 8(R9), R9
919 JMP matchlen_match4_match_nolit_encodeBlockAsm
920
921matchlen_bsf_8_match_nolit_encodeBlockAsm:
922#ifdef GOAMD64_v3
923 TZCNTQ R8, R8
924
925#else
926 BSFQ R8, R8
927
928#endif
929 SARQ $0x03, R8
930 LEAL (R9)(R8*1), R9
931 JMP match_nolit_end_encodeBlockAsm
932
933matchlen_match4_match_nolit_encodeBlockAsm:
934 CMPL SI, $0x04
935 JB matchlen_match2_match_nolit_encodeBlockAsm
936 MOVL (DI)(R9*1), R8
937 CMPL (BX)(R9*1), R8
938 JNE matchlen_match2_match_nolit_encodeBlockAsm
939 LEAL -4(SI), SI
940 LEAL 4(R9), R9
941
942matchlen_match2_match_nolit_encodeBlockAsm:
943 CMPL SI, $0x01
944 JE matchlen_match1_match_nolit_encodeBlockAsm
945 JB match_nolit_end_encodeBlockAsm
946 MOVW (DI)(R9*1), R8
947 CMPW (BX)(R9*1), R8
948 JNE matchlen_match1_match_nolit_encodeBlockAsm
949 LEAL 2(R9), R9
950 SUBL $0x02, SI
951 JZ match_nolit_end_encodeBlockAsm
952
953matchlen_match1_match_nolit_encodeBlockAsm:
954 MOVB (DI)(R9*1), R8
955 CMPB (BX)(R9*1), R8
956 JNE match_nolit_end_encodeBlockAsm
957 LEAL 1(R9), R9
958
959match_nolit_end_encodeBlockAsm:
960 ADDL R9, CX
961 MOVL 16(SP), BX
962 ADDL $0x04, R9
963 MOVL CX, 12(SP)
964
965 // emitCopy
966 CMPL BX, $0x00010000
967 JB two_byte_offset_match_nolit_encodeBlockAsm
968 CMPL R9, $0x40
969 JBE four_bytes_remain_match_nolit_encodeBlockAsm
970 MOVB $0xff, (AX)
971 MOVL BX, 1(AX)
972 LEAL -64(R9), R9
973 ADDQ $0x05, AX
974 CMPL R9, $0x04
975 JB four_bytes_remain_match_nolit_encodeBlockAsm
976
977 // emitRepeat
978emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
979 MOVL R9, SI
980 LEAL -4(R9), R9
981 CMPL SI, $0x08
982 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy
983 CMPL SI, $0x0c
984 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
985 CMPL BX, $0x00000800
986 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
987
988cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
989 CMPL R9, $0x00000104
990 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy
991 CMPL R9, $0x00010100
992 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy
993 CMPL R9, $0x0100ffff
994 JB repeat_five_match_nolit_encodeBlockAsm_emit_copy
995 LEAL -16842747(R9), R9
996 MOVL $0xfffb001d, (AX)
997 MOVB $0xff, 4(AX)
998 ADDQ $0x05, AX
999 JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
1000
1001repeat_five_match_nolit_encodeBlockAsm_emit_copy:
1002 LEAL -65536(R9), R9
1003 MOVL R9, BX
1004 MOVW $0x001d, (AX)
1005 MOVW R9, 2(AX)
1006 SARL $0x10, BX
1007 MOVB BL, 4(AX)
1008 ADDQ $0x05, AX
1009 JMP match_nolit_emitcopy_end_encodeBlockAsm
1010
1011repeat_four_match_nolit_encodeBlockAsm_emit_copy:
1012 LEAL -256(R9), R9
1013 MOVW $0x0019, (AX)
1014 MOVW R9, 2(AX)
1015 ADDQ $0x04, AX
1016 JMP match_nolit_emitcopy_end_encodeBlockAsm
1017
1018repeat_three_match_nolit_encodeBlockAsm_emit_copy:
1019 LEAL -4(R9), R9
1020 MOVW $0x0015, (AX)
1021 MOVB R9, 2(AX)
1022 ADDQ $0x03, AX
1023 JMP match_nolit_emitcopy_end_encodeBlockAsm
1024
1025repeat_two_match_nolit_encodeBlockAsm_emit_copy:
1026 SHLL $0x02, R9
1027 ORL $0x01, R9
1028 MOVW R9, (AX)
1029 ADDQ $0x02, AX
1030 JMP match_nolit_emitcopy_end_encodeBlockAsm
1031
1032repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
1033 XORQ SI, SI
1034 LEAL 1(SI)(R9*4), R9
1035 MOVB BL, 1(AX)
1036 SARL $0x08, BX
1037 SHLL $0x05, BX
1038 ORL BX, R9
1039 MOVB R9, (AX)
1040 ADDQ $0x02, AX
1041 JMP match_nolit_emitcopy_end_encodeBlockAsm
1042
1043four_bytes_remain_match_nolit_encodeBlockAsm:
1044 TESTL R9, R9
1045 JZ match_nolit_emitcopy_end_encodeBlockAsm
1046 XORL SI, SI
1047 LEAL -1(SI)(R9*4), R9
1048 MOVB R9, (AX)
1049 MOVL BX, 1(AX)
1050 ADDQ $0x05, AX
1051 JMP match_nolit_emitcopy_end_encodeBlockAsm
1052
1053two_byte_offset_match_nolit_encodeBlockAsm:
1054 CMPL R9, $0x40
1055 JBE two_byte_offset_short_match_nolit_encodeBlockAsm
1056 CMPL BX, $0x00000800
1057 JAE long_offset_short_match_nolit_encodeBlockAsm
1058 MOVL $0x00000001, SI
1059 LEAL 16(SI), SI
1060 MOVB BL, 1(AX)
1061 MOVL BX, DI
1062 SHRL $0x08, DI
1063 SHLL $0x05, DI
1064 ORL DI, SI
1065 MOVB SI, (AX)
1066 ADDQ $0x02, AX
1067 SUBL $0x08, R9
1068
1069 // emitRepeat
1070 LEAL -4(R9), R9
1071 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
1072
1073emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1074 MOVL R9, SI
1075 LEAL -4(R9), R9
1076 CMPL SI, $0x08
1077 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b
1078 CMPL SI, $0x0c
1079 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
1080 CMPL BX, $0x00000800
1081 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
1082
1083cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1084 CMPL R9, $0x00000104
1085 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b
1086 CMPL R9, $0x00010100
1087 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b
1088 CMPL R9, $0x0100ffff
1089 JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b
1090 LEAL -16842747(R9), R9
1091 MOVL $0xfffb001d, (AX)
1092 MOVB $0xff, 4(AX)
1093 ADDQ $0x05, AX
1094 JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b
1095
1096repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1097 LEAL -65536(R9), R9
1098 MOVL R9, BX
1099 MOVW $0x001d, (AX)
1100 MOVW R9, 2(AX)
1101 SARL $0x10, BX
1102 MOVB BL, 4(AX)
1103 ADDQ $0x05, AX
1104 JMP match_nolit_emitcopy_end_encodeBlockAsm
1105
1106repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1107 LEAL -256(R9), R9
1108 MOVW $0x0019, (AX)
1109 MOVW R9, 2(AX)
1110 ADDQ $0x04, AX
1111 JMP match_nolit_emitcopy_end_encodeBlockAsm
1112
1113repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1114 LEAL -4(R9), R9
1115 MOVW $0x0015, (AX)
1116 MOVB R9, 2(AX)
1117 ADDQ $0x03, AX
1118 JMP match_nolit_emitcopy_end_encodeBlockAsm
1119
1120repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1121 SHLL $0x02, R9
1122 ORL $0x01, R9
1123 MOVW R9, (AX)
1124 ADDQ $0x02, AX
1125 JMP match_nolit_emitcopy_end_encodeBlockAsm
1126
1127repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1128 XORQ SI, SI
1129 LEAL 1(SI)(R9*4), R9
1130 MOVB BL, 1(AX)
1131 SARL $0x08, BX
1132 SHLL $0x05, BX
1133 ORL BX, R9
1134 MOVB R9, (AX)
1135 ADDQ $0x02, AX
1136 JMP match_nolit_emitcopy_end_encodeBlockAsm
1137
// Writes a 3-byte element: tag byte 0xee (= (59<<2)|2) followed by the 16-bit
// offset in BX. It accounts for 60 bytes of the match (R9 -= 60), then falls
// through into emitRepeat for whatever length is left.
// Registers here: AX = output pointer, BX = offset, R9 = remaining length.
1138long_offset_short_match_nolit_encodeBlockAsm:
1139 MOVB $0xee, (AX)
1140 MOVW BX, 1(AX)
1141 LEAL -60(R9), R9
1142 ADDQ $0x03, AX
1143
1144 // emitRepeat
 // SI keeps the full remaining length for the size tests; R9 becomes length-4.
 // length <= 8 -> 2-byte form; length < 12 and offset < 2048 -> 2-byte form
 // with offset; otherwise choose by the magnitude of R9 below.
1145emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
1146 MOVL R9, SI
1147 LEAL -4(R9), R9
1148 CMPL SI, $0x08
1149 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
1150 CMPL SI, $0x0c
1151 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
1152 CMPL BX, $0x00000800
1153 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
1154
 // R9 < 0x104 -> 3-byte form; < 0x10100 -> 4-byte form; < 0x100ffff -> 5-byte
 // form. Anything larger writes a 5-byte element with all length bytes set
 // (bytes 1d 00 fb ff ff), subtracts 16842747 (0x0100fffb) from R9 and loops
 // back to encode the rest.
1155cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
1156 CMPL R9, $0x00000104
1157 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
1158 CMPL R9, $0x00010100
1159 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
1160 CMPL R9, $0x0100ffff
1161 JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
1162 LEAL -16842747(R9), R9
1163 MOVL $0xfffb001d, (AX)
1164 MOVB $0xff, 4(AX)
1165 ADDQ $0x05, AX
1166 JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
1167
// 5-byte form: word 0x001d, then the 24-bit value R9-65536 (low 16 bits at
// offset 2, bits 16..23 at offset 4). BX is reused as scratch.
1168repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
1169 LEAL -65536(R9), R9
1170 MOVL R9, BX
1171 MOVW $0x001d, (AX)
1172 MOVW R9, 2(AX)
1173 SARL $0x10, BX
1174 MOVB BL, 4(AX)
1175 ADDQ $0x05, AX
1176 JMP match_nolit_emitcopy_end_encodeBlockAsm
1177
// 4-byte form: word 0x0019, then the low 16 bits of R9-256.
1178repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
1179 LEAL -256(R9), R9
1180 MOVW $0x0019, (AX)
1181 MOVW R9, 2(AX)
1182 ADDQ $0x04, AX
1183 JMP match_nolit_emitcopy_end_encodeBlockAsm
1184
// 3-byte form: word 0x0015, then one byte holding R9-4.
1185repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
1186 LEAL -4(R9), R9
1187 MOVW $0x0015, (AX)
1188 MOVB R9, 2(AX)
1189 ADDQ $0x03, AX
1190 JMP match_nolit_emitcopy_end_encodeBlockAsm
1191
// 2-byte form: word (R9<<2)|1.
1192repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
1193 SHLL $0x02, R9
1194 ORL $0x01, R9
1195 MOVW R9, (AX)
1196 ADDQ $0x02, AX
1197 JMP match_nolit_emitcopy_end_encodeBlockAsm
1198
// 2-byte form with offset: byte 0 = 1 + R9*4 + ((BX>>8)<<5), byte 1 = BL.
1199repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
1200 XORQ SI, SI
1201 LEAL 1(SI)(R9*4), R9
1202 MOVB BL, 1(AX)
1203 SARL $0x08, BX
1204 SHLL $0x05, BX
1205 ORL BX, R9
1206 MOVB R9, (AX)
1207 ADDQ $0x02, AX
1208 JMP match_nolit_emitcopy_end_encodeBlockAsm
1209
// Encodes a single copy element for length R9 and offset BX.
// If R9 < 12 and BX < 2048 a 2-byte element is written:
//   byte 0 = (R9<<2) - 15 (equals ((R9-4)<<2)|1) OR'ed with (BX>>8)<<5,
//   byte 1 = low byte of BX.
// Otherwise control goes to the 3-byte form below.
1210two_byte_offset_short_match_nolit_encodeBlockAsm:
1211 MOVL R9, SI
1212 SHLL $0x02, SI
1213 CMPL R9, $0x0c
1214 JAE emit_copy_three_match_nolit_encodeBlockAsm
1215 CMPL BX, $0x00000800
1216 JAE emit_copy_three_match_nolit_encodeBlockAsm
1217 LEAL -15(SI), SI
1218 MOVB BL, 1(AX)
1219 SHRL $0x08, BX
1220 SHLL $0x05, BX
1221 ORL BX, SI
1222 MOVB SI, (AX)
1223 ADDQ $0x02, AX
1224 JMP match_nolit_emitcopy_end_encodeBlockAsm
1225
// 3-byte form: byte 0 = (R9<<2) - 2 (equals ((R9-1)<<2)|2), then the 16-bit
// offset. Falls through to match_nolit_emitcopy_end_encodeBlockAsm.
1226emit_copy_three_match_nolit_encodeBlockAsm:
1227 LEAL -2(SI), SI
1228 MOVB SI, (AX)
1229 MOVW BX, 1(AX)
1230 ADDQ $0x03, AX
1231
// Runs after a copy/repeat element has been written.
// CX = current source index, DX = source base, AX = output pointer.
// Leaves for emit_remainder when CX has reached the limit stored at 8(SP),
// and returns 0 when AX has reached the output bound stored at (SP).
1232match_nolit_emitcopy_end_encodeBlockAsm:
1233 CMPL CX, 8(SP)
1234 JAE emit_remainder_encodeBlockAsm
 // SI = 8 source bytes starting at index CX-2.
1235 MOVQ -2(DX)(CX*1), SI
1236 CMPQ AX, (SP)
1237 JB match_nolit_dst_ok_encodeBlockAsm
1238 MOVQ $0x00000000, ret+48(FP)
1239 RET
1240
// Hashes the bytes at CX-2 (into DI) and at CX (into BX): shift left by 16 to
// keep the low 6 bytes, multiply by the constant in R8, keep the top 14 bits.
// The table of 4-byte entries starts at 24(SP). The old entry for hash(CX) is
// read into BX as a candidate, then both slots are updated (CX-2 and CX).
// If the 4 bytes at the candidate equal the 4 bytes at CX, matching continues
// at match_nolit_loop; otherwise CX is advanced by one and the search resumes.
1241match_nolit_dst_ok_encodeBlockAsm:
1242 MOVQ $0x0000cf1bbcdcbf9b, R8
1243 MOVQ SI, DI
1244 SHRQ $0x10, SI
1245 MOVQ SI, BX
1246 SHLQ $0x10, DI
1247 IMULQ R8, DI
1248 SHRQ $0x32, DI
1249 SHLQ $0x10, BX
1250 IMULQ R8, BX
1251 SHRQ $0x32, BX
1252 LEAL -2(CX), R8
1253 LEAQ 24(SP)(BX*4), R9
1254 MOVL (R9), BX
1255 MOVL R8, 24(SP)(DI*4)
1256 MOVL CX, (R9)
1257 CMPL (DX)(BX*1), SI
1258 JEQ match_nolit_loop_encodeBlockAsm
1259 INCL CX
1260 JMP search_loop_encodeBlockAsm
1261
// Emits the source bytes from index 12(SP) up to src_len as one final literal.
// First checks that AX + 5 + (src_len - 12(SP)) stays below the output bound
// at (SP); if not, the function returns 0.
1262emit_remainder_encodeBlockAsm:
1263 MOVQ src_len+32(FP), CX
1264 SUBL 12(SP), CX
1265 LEAQ 5(AX)(CX*1), CX
1266 CMPQ CX, (SP)
1267 JB emit_remainder_ok_encodeBlockAsm
1268 MOVQ $0x00000000, ret+48(FP)
1269 RET
1270
// Nothing to emit when 12(SP) already equals src_len. Otherwise:
//   CX = pointer to the first literal byte (DX + 12(SP)),
//   SI = literal length, DX = length-1 (the source base in DX is no longer
//   needed after this point and is overwritten).
// The header size is chosen from DX: <60 one byte, <2^8 two, <2^16 three,
// <2^24 four, else five bytes (tags 0xf0, 0xf4, 0xf8, 0xfc).
1271emit_remainder_ok_encodeBlockAsm:
1272 MOVQ src_len+32(FP), CX
1273 MOVL 12(SP), BX
1274 CMPL BX, CX
1275 JEQ emit_literal_done_emit_remainder_encodeBlockAsm
1276 MOVL CX, SI
1277 MOVL CX, 12(SP)
1278 LEAQ (DX)(BX*1), CX
1279 SUBL BX, SI
1280 LEAL -1(SI), DX
1281 CMPL DX, $0x3c
1282 JB one_byte_emit_remainder_encodeBlockAsm
1283 CMPL DX, $0x00000100
1284 JB two_bytes_emit_remainder_encodeBlockAsm
1285 CMPL DX, $0x00010000
1286 JB three_bytes_emit_remainder_encodeBlockAsm
1287 CMPL DX, $0x01000000
1288 JB four_bytes_emit_remainder_encodeBlockAsm
1289 MOVB $0xfc, (AX)
1290 MOVL DX, 1(AX)
1291 ADDQ $0x05, AX
1292 JMP memmove_long_emit_remainder_encodeBlockAsm
1293
// 4-byte header: 0xf8 followed by the low 24 bits of DX.
1294four_bytes_emit_remainder_encodeBlockAsm:
1295 MOVL DX, BX
1296 SHRL $0x10, BX
1297 MOVB $0xf8, (AX)
1298 MOVW DX, 1(AX)
1299 MOVB BL, 3(AX)
1300 ADDQ $0x04, AX
1301 JMP memmove_long_emit_remainder_encodeBlockAsm
1302
// 3-byte header: 0xf4 followed by the low 16 bits of DX.
1303three_bytes_emit_remainder_encodeBlockAsm:
1304 MOVB $0xf4, (AX)
1305 MOVW DX, 1(AX)
1306 ADDQ $0x03, AX
1307 JMP memmove_long_emit_remainder_encodeBlockAsm
1308
// 2-byte header: 0xf0 followed by DL. Lengths with DX < 64 use the short
// copy routine, longer ones the long copy routine.
1309two_bytes_emit_remainder_encodeBlockAsm:
1310 MOVB $0xf0, (AX)
1311 MOVB DL, 1(AX)
1312 ADDQ $0x02, AX
1313 CMPL DX, $0x40
1314 JB memmove_emit_remainder_encodeBlockAsm
1315 JMP memmove_long_emit_remainder_encodeBlockAsm
1316
// 1-byte header: (length-1)<<2. Falls through into the short copy routine.
1317one_byte_emit_remainder_encodeBlockAsm:
1318 SHLB $0x02, DL
1319 MOVB DL, (AX)
1320 ADDQ $0x01, AX
1321
// Short literal copy for the remainder: copies SI bytes from (CX) to (AX).
// DX = AX + SI is the output position after the copy; BX = length.
// Each size class below uses one load from the start and one from the end of
// the range (they may overlap), so no byte loop is needed.
1322memmove_emit_remainder_encodeBlockAsm:
1323 LEAQ (AX)(SI*1), DX
1324 MOVL SI, BX
1325
1326 // genMemMoveShort
 // Dispatch on length: 1-2, 3, 4-7, 8-16, 17-32, and everything above 32.
1327 CMPQ BX, $0x03
1328 JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2
1329 JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3
1330 CMPQ BX, $0x08
1331 JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7
1332 CMPQ BX, $0x10
1333 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
1334 CMPQ BX, $0x20
1335 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
1336 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
1337
// 1 or 2 bytes: first byte and last byte (the same byte when length is 1).
1338emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2:
1339 MOVB (CX), SI
1340 MOVB -1(CX)(BX*1), CL
1341 MOVB SI, (AX)
1342 MOVB CL, -1(AX)(BX*1)
1343 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1344
// Exactly 3 bytes: one 16-bit move plus one byte.
1345emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3:
1346 MOVW (CX), SI
1347 MOVB 2(CX), CL
1348 MOVW SI, (AX)
1349 MOVB CL, 2(AX)
1350 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1351
// 4..7 bytes: two 32-bit moves, the second anchored at the end.
1352emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7:
1353 MOVL (CX), SI
1354 MOVL -4(CX)(BX*1), CX
1355 MOVL SI, (AX)
1356 MOVL CX, -4(AX)(BX*1)
1357 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1358
// 8..16 bytes: two 64-bit moves, the second anchored at the end.
1359emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
1360 MOVQ (CX), SI
1361 MOVQ -8(CX)(BX*1), CX
1362 MOVQ SI, (AX)
1363 MOVQ CX, -8(AX)(BX*1)
1364 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1365
// 17..32 bytes: two unaligned 16-byte moves.
1366emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
1367 MOVOU (CX), X0
1368 MOVOU -16(CX)(BX*1), X1
1369 MOVOU X0, (AX)
1370 MOVOU X1, -16(AX)(BX*1)
1371 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1372
// More than 32 bytes (at most 64 by the label name): four unaligned 16-byte
// moves, two from the start and two anchored at the end.
1373emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
1374 MOVOU (CX), X0
1375 MOVOU 16(CX), X1
1376 MOVOU -32(CX)(BX*1), X2
1377 MOVOU -16(CX)(BX*1), X3
1378 MOVOU X0, (AX)
1379 MOVOU X1, 16(AX)
1380 MOVOU X2, -32(AX)(BX*1)
1381 MOVOU X3, -16(AX)(BX*1)
1382
// Advance the output pointer past the copied literal and finish.
1383memmove_end_copy_emit_remainder_encodeBlockAsm:
1384 MOVQ DX, AX
1385 JMP emit_literal_done_emit_remainder_encodeBlockAsm
1386
// Long literal copy for the remainder: copies SI bytes from (CX) to (AX).
// DX = AX + SI is the output position after the copy; BX = length.
1387memmove_long_emit_remainder_encodeBlockAsm:
1388 LEAQ (AX)(SI*1), DX
1389 MOVL SI, BX
1390
1391 // genMemMoveLong
 // The first 32 and last 32 source bytes are kept in X0..X3 and stored last
 // with unaligned moves; the middle is copied in 32-byte steps.
 // DI = length/32, SI = AX & 31, R8 = 64 - SI, so AX+R8-32 is 32-byte aligned
 // and the MOVOA stores below hit aligned addresses.
 // NOTE(review): DECQ does not write CF, so the JA/JNA after it depend on ZF
 // from DECQ plus CF left by the preceding SUBQ/ADDQ. Every visible entry to
 // this label has length >= 65, which makes DI >= 2 and the JA taken, so the
 // big_loop_back loop appears not to be entered — confirm against the generator.
1392 MOVOU (CX), X0
1393 MOVOU 16(CX), X1
1394 MOVOU -32(CX)(BX*1), X2
1395 MOVOU -16(CX)(BX*1), X3
1396 MOVQ BX, DI
1397 SHRQ $0x05, DI
1398 MOVQ AX, SI
1399 ANDL $0x0000001f, SI
1400 MOVQ $0x00000040, R8
1401 SUBQ SI, R8
1402 DECQ DI
1403 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
1404 LEAQ -32(CX)(R8*1), SI
1405 LEAQ -32(AX)(R8*1), R9
1406
1407emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
1408 MOVOU (SI), X4
1409 MOVOU 16(SI), X5
1410 MOVOA X4, (R9)
1411 MOVOA X5, 16(R9)
1412 ADDQ $0x20, R9
1413 ADDQ $0x20, SI
1414 ADDQ $0x20, R8
1415 DECQ DI
1416 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back
1417
// Copies 32 bytes per iteration at offset R8-32 while BX >= R8, then writes
// the saved head (X0, X1) and tail (X2, X3) to cover the unaligned ends.
1418emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
1419 MOVOU -32(CX)(R8*1), X4
1420 MOVOU -16(CX)(R8*1), X5
1421 MOVOA X4, -32(AX)(R8*1)
1422 MOVOA X5, -16(AX)(R8*1)
1423 ADDQ $0x20, R8
1424 CMPQ BX, R8
1425 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
1426 MOVOU X0, (AX)
1427 MOVOU X1, 16(AX)
1428 MOVOU X2, -32(AX)(BX*1)
1429 MOVOU X3, -16(AX)(BX*1)
1430 MOVQ DX, AX
1431
// Function exit: return the number of bytes written (AX - dst_base).
1432emit_literal_done_emit_remainder_encodeBlockAsm:
1433 MOVQ dst_base+0(FP), CX
1434 SUBQ CX, AX
1435 MOVQ AX, ret+48(FP)
1436 RET
1437
1438// func encodeBlockAsm4MB(dst []byte, src []byte) int
1439// Requires: BMI, SSE2
//
// Returns the number of bytes written to dst, or 0 when the output would
// reach the bound computed below.
// Frame layout, as set up in this prologue:
//   (SP)    output bound: dst_base + src_len - 9 - src_len/32
//   8(SP)   search limit: src_len - 8
//   12(SP)  source index up to which input has been emitted (starts at 0)
//   16(SP)  current repeat offset (starts at 1)
//   20(SP)  next search position (written in the search loop)
//   24(SP)  hash table, 16384 entries of 4 bytes (65536 bytes)
// Registers: AX = output pointer, DX = src base, CX = current source index.
1440TEXT ·encodeBlockAsm4MB(SB), $65560-56
1441 MOVQ dst_base+0(FP), AX
1442 MOVQ $0x00000200, CX
1443 LEAQ 24(SP), DX
1444 PXOR X0, X0
1445
// Clears the hash table: 0x200 iterations of 128 bytes each.
1446zero_loop_encodeBlockAsm4MB:
1447 MOVOU X0, (DX)
1448 MOVOU X0, 16(DX)
1449 MOVOU X0, 32(DX)
1450 MOVOU X0, 48(DX)
1451 MOVOU X0, 64(DX)
1452 MOVOU X0, 80(DX)
1453 MOVOU X0, 96(DX)
1454 MOVOU X0, 112(DX)
1455 ADDQ $0x80, DX
1456 DECQ CX
1457 JNZ zero_loop_encodeBlockAsm4MB
1458 MOVL $0x00000000, 12(SP)
1459 MOVQ src_len+32(FP), CX
1460 LEAQ -9(CX), DX
1461 LEAQ -8(CX), BX
1462 MOVL BX, 8(SP)
1463 SHRQ $0x05, CX
1464 SUBL CX, DX
1465 LEAQ (AX)(DX*1), DX
1466 MOVQ DX, (SP)
1467 MOVL $0x00000001, CX
1468 MOVL CX, 16(SP)
1469 MOVQ src_base+24(FP), DX
1470
// Main search loop. CX = current source index, DX = source base.
// The next position to try is CX + 4 + ((CX - 12(SP)) >> 6), so the step
// grows with the distance since the last emitted byte. It is saved in 20(SP);
// if it reaches the limit in 8(SP) the remainder is emitted instead.
1471search_loop_encodeBlockAsm4MB:
1472 MOVL CX, BX
1473 SUBL 12(SP), BX
1474 SHRL $0x06, BX
1475 LEAL 4(CX)(BX*1), BX
1476 CMPL BX, 8(SP)
1477 JAE emit_remainder_encodeBlockAsm4MB
 // SI = 8 source bytes at CX. R9 = hash of the bytes at CX, R10 = hash of the
 // bytes at CX+1 (6 bytes each: shift left 16, multiply, keep the top 14 bits).
1478 MOVQ (DX)(CX*1), SI
1479 MOVL BX, 20(SP)
1480 MOVQ $0x0000cf1bbcdcbf9b, R8
1481 MOVQ SI, R9
1482 MOVQ SI, R10
1483 SHRQ $0x08, R10
1484 SHLQ $0x10, R9
1485 IMULQ R8, R9
1486 SHRQ $0x32, R9
1487 SHLQ $0x10, R10
1488 IMULQ R8, R10
1489 SHRQ $0x32, R10
 // BX / DI = previous table entries (candidates) for CX and CX+1; the table
 // is then updated with CX and CX+1.
1490 MOVL 24(SP)(R9*4), BX
1491 MOVL 24(SP)(R10*4), DI
1492 MOVL CX, 24(SP)(R9*4)
1493 LEAL 1(CX), R9
1494 MOVL R9, 24(SP)(R10*4)
 // R9 = hash of the bytes at CX+2, used by no_repeat_found.
1495 MOVQ SI, R9
1496 SHRQ $0x10, R9
1497 SHLQ $0x10, R9
1498 IMULQ R8, R9
1499 SHRQ $0x32, R9
 // Repeat check: compare the 4 bytes at CX+1 with the 4 bytes at
 // CX+1-16(SP), i.e. one repeat offset back.
1500 MOVL CX, R8
1501 SUBL 16(SP), R8
1502 MOVL 1(DX)(R8*1), R10
1503 MOVQ SI, R8
1504 SHRQ $0x08, R8
1505 CMPL R8, R10
1506 JNE no_repeat_found_encodeBlockAsm4MB
 // Repeat found: SI = CX+1 (match start), DI = 12(SP), BX = SI - offset
 // (index of the earlier copy). Backward extension is skipped when BX is 0.
1507 LEAL 1(CX), SI
1508 MOVL 12(SP), DI
1509 MOVL SI, BX
1510 SUBL 16(SP), BX
1511 JZ repeat_extend_back_end_encodeBlockAsm4MB
1512
// Extends the repeat match backwards: while SI > DI (the already-emitted
// position), BX > 0 and the bytes before both positions are equal, move SI
// and BX back by one.
1513repeat_extend_back_loop_encodeBlockAsm4MB:
1514 CMPL SI, DI
1515 JBE repeat_extend_back_end_encodeBlockAsm4MB
1516 MOVB -1(DX)(BX*1), R8
1517 MOVB -1(DX)(SI*1), R9
1518 CMPB R8, R9
1519 JNE repeat_extend_back_end_encodeBlockAsm4MB
1520 LEAL -1(SI), SI
1521 DECL BX
1522 JNZ repeat_extend_back_loop_encodeBlockAsm4MB
1523
// Output space check for the pending literals: AX + 4 + (SI - 12(SP)) must be
// below the bound at (SP), otherwise the function returns 0.
1524repeat_extend_back_end_encodeBlockAsm4MB:
1525 MOVL SI, BX
1526 SUBL 12(SP), BX
1527 LEAQ 4(AX)(BX*1), BX
1528 CMPQ BX, (SP)
1529 JB repeat_dst_size_check_encodeBlockAsm4MB
1530 MOVQ $0x00000000, ret+48(FP)
1531 RET
1532
// Emits the literal header for the source bytes [12(SP), SI) that precede the
// repeat. Skipped entirely when there are none.
//   R9 = pointer to the first literal byte, R8 = literal length,
//   BX = length-1; 12(SP) is advanced to SI.
// Header size from BX: <60 one byte, <2^8 two, <2^16 three, else four bytes
// (tags 0xf0, 0xf4, 0xf8).
1533repeat_dst_size_check_encodeBlockAsm4MB:
1534 MOVL 12(SP), BX
1535 CMPL BX, SI
1536 JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB
1537 MOVL SI, R8
1538 MOVL SI, 12(SP)
1539 LEAQ (DX)(BX*1), R9
1540 SUBL BX, R8
1541 LEAL -1(R8), BX
1542 CMPL BX, $0x3c
1543 JB one_byte_repeat_emit_encodeBlockAsm4MB
1544 CMPL BX, $0x00000100
1545 JB two_bytes_repeat_emit_encodeBlockAsm4MB
1546 CMPL BX, $0x00010000
1547 JB three_bytes_repeat_emit_encodeBlockAsm4MB
1548 MOVL BX, R10
1549 SHRL $0x10, R10
1550 MOVB $0xf8, (AX)
1551 MOVW BX, 1(AX)
1552 MOVB R10, 3(AX)
1553 ADDQ $0x04, AX
1554 JMP memmove_long_repeat_emit_encodeBlockAsm4MB
1555
// 3-byte header: 0xf4 followed by the low 16 bits of BX.
1556three_bytes_repeat_emit_encodeBlockAsm4MB:
1557 MOVB $0xf4, (AX)
1558 MOVW BX, 1(AX)
1559 ADDQ $0x03, AX
1560 JMP memmove_long_repeat_emit_encodeBlockAsm4MB
1561
// 2-byte header: 0xf0 followed by BL; BX < 64 selects the short copy routine.
1562two_bytes_repeat_emit_encodeBlockAsm4MB:
1563 MOVB $0xf0, (AX)
1564 MOVB BL, 1(AX)
1565 ADDQ $0x02, AX
1566 CMPL BX, $0x40
1567 JB memmove_repeat_emit_encodeBlockAsm4MB
1568 JMP memmove_long_repeat_emit_encodeBlockAsm4MB
1569
// 1-byte header: (length-1)<<2. Falls through into the short copy routine.
1570one_byte_repeat_emit_encodeBlockAsm4MB:
1571 SHLB $0x02, BL
1572 MOVB BL, (AX)
1573 ADDQ $0x01, AX
1574
// Short literal copy: R8 bytes from (R9) to (AX). BX = AX + R8 is the output
// position after the copy.
1575memmove_repeat_emit_encodeBlockAsm4MB:
1576 LEAQ (AX)(R8*1), BX
1577
1578 // genMemMoveShort
 // Dispatch on length: <= 8, 9-16, 17-32, above 32.
1579 CMPQ R8, $0x08
1580 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
1581 CMPQ R8, $0x10
1582 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
1583 CMPQ R8, $0x20
1584 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
1585 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64
1586
// Lengths up to 8: always moves a full 8 bytes; only the first R8 count,
// because AX is set to BX afterwards.
// NOTE(review): this reads and writes up to 7 bytes past the literal;
// presumably covered by the margins in 8(SP) and (SP) — confirm.
1587emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
1588 MOVQ (R9), R10
1589 MOVQ R10, (AX)
1590 JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1591
// 9..16 bytes: two 64-bit moves, the second anchored at the end.
1592emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
1593 MOVQ (R9), R10
1594 MOVQ -8(R9)(R8*1), R9
1595 MOVQ R10, (AX)
1596 MOVQ R9, -8(AX)(R8*1)
1597 JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1598
// 17..32 bytes: two unaligned 16-byte moves.
1599emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
1600 MOVOU (R9), X0
1601 MOVOU -16(R9)(R8*1), X1
1602 MOVOU X0, (AX)
1603 MOVOU X1, -16(AX)(R8*1)
1604 JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1605
// More than 32 bytes: four unaligned 16-byte moves (two from each end).
1606emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
1607 MOVOU (R9), X0
1608 MOVOU 16(R9), X1
1609 MOVOU -32(R9)(R8*1), X2
1610 MOVOU -16(R9)(R8*1), X3
1611 MOVOU X0, (AX)
1612 MOVOU X1, 16(AX)
1613 MOVOU X2, -32(AX)(R8*1)
1614 MOVOU X3, -16(AX)(R8*1)
1615
// Advance the output pointer past the copied literal.
1616memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
1617 MOVQ BX, AX
1618 JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB
1619
// Long literal copy: R8 bytes from (R9) to (AX). BX = AX + R8 is the output
// position after the copy.
1620memmove_long_repeat_emit_encodeBlockAsm4MB:
1621 LEAQ (AX)(R8*1), BX
1622
1623 // genMemMoveLong
 // Head (X0, X1) and tail (X2, X3) are loaded up front and stored last with
 // unaligned moves. R11 = length/32, R10 = AX & 31, R12 = 64 - R10, so
 // AX+R12-32 is 32-byte aligned for the MOVOA stores in the loops.
 // NOTE(review): DECQ does not write CF, so the JA/JNA after it depend on ZF
 // from DECQ plus CF left by the preceding SUBQ/ADDQ. Visible entries to this
 // label have length >= 65, so the JA looks always taken — confirm.
1624 MOVOU (R9), X0
1625 MOVOU 16(R9), X1
1626 MOVOU -32(R9)(R8*1), X2
1627 MOVOU -16(R9)(R8*1), X3
1628 MOVQ R8, R11
1629 SHRQ $0x05, R11
1630 MOVQ AX, R10
1631 ANDL $0x0000001f, R10
1632 MOVQ $0x00000040, R12
1633 SUBQ R10, R12
1634 DECQ R11
1635 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
1636 LEAQ -32(R9)(R12*1), R10
1637 LEAQ -32(AX)(R12*1), R13
1638
1639emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
1640 MOVOU (R10), X4
1641 MOVOU 16(R10), X5
1642 MOVOA X4, (R13)
1643 MOVOA X5, 16(R13)
1644 ADDQ $0x20, R13
1645 ADDQ $0x20, R10
1646 ADDQ $0x20, R12
1647 DECQ R11
1648 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back
1649
// 32 bytes per iteration at offset R12-32 while R8 >= R12, then the saved
// head and tail are written and AX is advanced to BX.
1650emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
1651 MOVOU -32(R9)(R12*1), X4
1652 MOVOU -16(R9)(R12*1), X5
1653 MOVOA X4, -32(AX)(R12*1)
1654 MOVOA X5, -16(AX)(R12*1)
1655 ADDQ $0x20, R12
1656 CMPQ R8, R12
1657 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
1658 MOVOU X0, (AX)
1659 MOVOU X1, 16(AX)
1660 MOVOU X2, -32(AX)(R8*1)
1661 MOVOU X3, -16(AX)(R8*1)
1662 MOVQ BX, AX
1663
// Forward extension of the repeat match. The 4 bytes at CX+1 are already
// known to match, so comparison starts at CX+5.
//   R9 = pointer to src[CX], BX = pointer one repeat offset (16(SP)) earlier,
//   R8 = bytes left in the source, R11 = number of extra matching bytes.
1664emit_literal_done_repeat_emit_encodeBlockAsm4MB:
1665 ADDL $0x05, CX
1666 MOVL CX, BX
1667 SUBL 16(SP), BX
1668 MOVQ src_len+32(FP), R8
1669 SUBL CX, R8
1670 LEAQ (DX)(CX*1), R9
1671 LEAQ (DX)(BX*1), BX
1672
1673 // matchLen
1674 XORL R11, R11
1675
// Compares 16 bytes per iteration as two 8-byte XORs; a non-zero XOR result
// marks the word that contains the first difference.
1676matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB:
1677 CMPL R8, $0x10
1678 JB matchlen_match8_repeat_extend_encodeBlockAsm4MB
1679 MOVQ (R9)(R11*1), R10
1680 MOVQ 8(R9)(R11*1), R12
1681 XORQ (BX)(R11*1), R10
1682 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
1683 XORQ 8(BX)(R11*1), R12
1684 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4MB
1685 LEAL -16(R8), R8
1686 LEAL 16(R11), R11
1687 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB
1688
// Difference in the second word: index of the lowest set bit / 8 gives the
// number of equal bytes in that word; add 8 for the first word.
1689matchlen_bsf_16repeat_extend_encodeBlockAsm4MB:
1690#ifdef GOAMD64_v3
1691 TZCNTQ R12, R12
1692
1693#else
1694 BSFQ R12, R12
1695
1696#endif
1697 SARQ $0x03, R12
1698 LEAL 8(R11)(R12*1), R11
1699 JMP repeat_extend_forward_end_encodeBlockAsm4MB
1700
// Fewer than 16 bytes left: try one 8-byte comparison.
1701matchlen_match8_repeat_extend_encodeBlockAsm4MB:
1702 CMPL R8, $0x08
1703 JB matchlen_match4_repeat_extend_encodeBlockAsm4MB
1704 MOVQ (R9)(R11*1), R10
1705 XORQ (BX)(R11*1), R10
1706 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
1707 LEAL -8(R8), R8
1708 LEAL 8(R11), R11
1709 JMP matchlen_match4_repeat_extend_encodeBlockAsm4MB
1710
// Difference in the word held in R10: add the number of equal low bytes.
1711matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB:
1712#ifdef GOAMD64_v3
1713 TZCNTQ R10, R10
1714
1715#else
1716 BSFQ R10, R10
1717
1718#endif
1719 SARQ $0x03, R10
1720 LEAL (R11)(R10*1), R11
1721 JMP repeat_extend_forward_end_encodeBlockAsm4MB
1722
// Tail handling: 4 bytes, then 2 bytes, then 1 byte.
1723matchlen_match4_repeat_extend_encodeBlockAsm4MB:
1724 CMPL R8, $0x04
1725 JB matchlen_match2_repeat_extend_encodeBlockAsm4MB
1726 MOVL (R9)(R11*1), R10
1727 CMPL (BX)(R11*1), R10
1728 JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB
1729 LEAL -4(R8), R8
1730 LEAL 4(R11), R11
1731
1732matchlen_match2_repeat_extend_encodeBlockAsm4MB:
1733 CMPL R8, $0x01
1734 JE matchlen_match1_repeat_extend_encodeBlockAsm4MB
1735 JB repeat_extend_forward_end_encodeBlockAsm4MB
1736 MOVW (R9)(R11*1), R10
1737 CMPW (BX)(R11*1), R10
1738 JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB
1739 LEAL 2(R11), R11
1740 SUBL $0x02, R8
1741 JZ repeat_extend_forward_end_encodeBlockAsm4MB
1742
1743matchlen_match1_repeat_extend_encodeBlockAsm4MB:
1744 MOVB (R9)(R11*1), R10
1745 CMPB (BX)(R11*1), R10
1746 JNE repeat_extend_forward_end_encodeBlockAsm4MB
1747 LEAL 1(R11), R11
1748
// The repeat match is fully extended: CX = end of match, BX = match length
// (CX - SI, where SI was the start after backward extension), SI = repeat
// offset from 16(SP). DI still holds the 12(SP) value loaded when the repeat
// was detected; when it is zero the match is encoded as a copy instead.
1749repeat_extend_forward_end_encodeBlockAsm4MB:
1750 ADDL R11, CX
1751 MOVL CX, BX
1752 SUBL SI, BX
1753 MOVL 16(SP), SI
1754 TESTL DI, DI
1755 JZ repeat_as_copy_encodeBlockAsm4MB
1756
1757 // emitRepeat
 // DI keeps the full length for the size tests; BX becomes length-4.
 // length <= 8 -> 2-byte form; length < 12 and offset < 2048 -> 2-byte form
 // with offset; otherwise choose by the magnitude of BX below.
1758 MOVL BX, DI
1759 LEAL -4(BX), BX
1760 CMPL DI, $0x08
1761 JBE repeat_two_match_repeat_encodeBlockAsm4MB
1762 CMPL DI, $0x0c
1763 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
1764 CMPL SI, $0x00000800
1765 JB repeat_two_offset_match_repeat_encodeBlockAsm4MB
1766
// BX < 0x104 -> 3-byte form; < 0x10100 -> 4-byte form; otherwise the 5-byte
// form: word 0x001d followed by the 24-bit value BX-65536.
1767cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
1768 CMPL BX, $0x00000104
1769 JB repeat_three_match_repeat_encodeBlockAsm4MB
1770 CMPL BX, $0x00010100
1771 JB repeat_four_match_repeat_encodeBlockAsm4MB
1772 LEAL -65536(BX), BX
1773 MOVL BX, SI
1774 MOVW $0x001d, (AX)
1775 MOVW BX, 2(AX)
1776 SARL $0x10, SI
1777 MOVB SI, 4(AX)
1778 ADDQ $0x05, AX
1779 JMP repeat_end_emit_encodeBlockAsm4MB
1780
// 4-byte form: word 0x0019, then the low 16 bits of BX-256.
1781repeat_four_match_repeat_encodeBlockAsm4MB:
1782 LEAL -256(BX), BX
1783 MOVW $0x0019, (AX)
1784 MOVW BX, 2(AX)
1785 ADDQ $0x04, AX
1786 JMP repeat_end_emit_encodeBlockAsm4MB
1787
// 3-byte form: word 0x0015, then one byte holding BX-4.
1788repeat_three_match_repeat_encodeBlockAsm4MB:
1789 LEAL -4(BX), BX
1790 MOVW $0x0015, (AX)
1791 MOVB BL, 2(AX)
1792 ADDQ $0x03, AX
1793 JMP repeat_end_emit_encodeBlockAsm4MB
1794
// 2-byte form: word (BX<<2)|1.
1795repeat_two_match_repeat_encodeBlockAsm4MB:
1796 SHLL $0x02, BX
1797 ORL $0x01, BX
1798 MOVW BX, (AX)
1799 ADDQ $0x02, AX
1800 JMP repeat_end_emit_encodeBlockAsm4MB
1801
// 2-byte form with offset: byte 0 = 1 + BX*4 + ((SI>>8)<<5), byte 1 = low
// byte of SI. DI is only used as a zero base for the LEAL.
1802repeat_two_offset_match_repeat_encodeBlockAsm4MB:
1803 XORQ DI, DI
1804 LEAL 1(DI)(BX*4), BX
1805 MOVB SI, 1(AX)
1806 SARL $0x08, SI
1807 SHLL $0x05, SI
1808 ORL SI, BX
1809 MOVB BL, (AX)
1810 ADDQ $0x02, AX
1811 JMP repeat_end_emit_encodeBlockAsm4MB
1812
// Encodes the repeat match as a copy. BX = length, SI = offset.
// Offsets below 65536 branch to the 2-byte-offset encoders. For larger
// offsets and length > 64, a 5-byte element (tag 0xff = (63<<2)|3, then the
// 32-bit offset) accounts for 64 bytes; the rest is emitted with emitRepeat,
// or with another 5-byte copy when fewer than 4 bytes remain.
1813repeat_as_copy_encodeBlockAsm4MB:
1814 // emitCopy
1815 CMPL SI, $0x00010000
1816 JB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
1817 CMPL BX, $0x40
1818 JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
1819 MOVB $0xff, (AX)
1820 MOVL SI, 1(AX)
1821 LEAL -64(BX), BX
1822 ADDQ $0x05, AX
1823 CMPL BX, $0x04
1824 JB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
1825
1826 // emitRepeat
 // Same selection as the other emitRepeat copies: DI = length, BX = length-4.
1827 MOVL BX, DI
1828 LEAL -4(BX), BX
1829 CMPL DI, $0x08
1830 JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1831 CMPL DI, $0x0c
1832 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1833 CMPL SI, $0x00000800
1834 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1835
// BX < 0x104 -> 3-byte form; < 0x10100 -> 4-byte form; otherwise the 5-byte
// form: word 0x001d followed by the 24-bit value BX-65536.
1836cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1837 CMPL BX, $0x00000104
1838 JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1839 CMPL BX, $0x00010100
1840 JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1841 LEAL -65536(BX), BX
1842 MOVL BX, SI
1843 MOVW $0x001d, (AX)
1844 MOVW BX, 2(AX)
1845 SARL $0x10, SI
1846 MOVB SI, 4(AX)
1847 ADDQ $0x05, AX
1848 JMP repeat_end_emit_encodeBlockAsm4MB
1849
// 4-byte form: word 0x0019, then the low 16 bits of BX-256.
1850repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1851 LEAL -256(BX), BX
1852 MOVW $0x0019, (AX)
1853 MOVW BX, 2(AX)
1854 ADDQ $0x04, AX
1855 JMP repeat_end_emit_encodeBlockAsm4MB
1856
// 3-byte form: word 0x0015, then one byte holding BX-4.
1857repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1858 LEAL -4(BX), BX
1859 MOVW $0x0015, (AX)
1860 MOVB BL, 2(AX)
1861 ADDQ $0x03, AX
1862 JMP repeat_end_emit_encodeBlockAsm4MB
1863
// 2-byte form: word (BX<<2)|1.
1864repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1865 SHLL $0x02, BX
1866 ORL $0x01, BX
1867 MOVW BX, (AX)
1868 ADDQ $0x02, AX
1869 JMP repeat_end_emit_encodeBlockAsm4MB
1870
// 2-byte form with offset: byte 0 = 1 + BX*4 + ((SI>>8)<<5), byte 1 = low
// byte of SI.
1871repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1872 XORQ DI, DI
1873 LEAL 1(DI)(BX*4), BX
1874 MOVB SI, 1(AX)
1875 SARL $0x08, SI
1876 SHLL $0x05, SI
1877 ORL SI, BX
1878 MOVB BL, (AX)
1879 ADDQ $0x02, AX
1880 JMP repeat_end_emit_encodeBlockAsm4MB
1881
// Remaining length BX (possibly 0, then nothing is written) as one 5-byte
// copy: tag BX*4-1 (equals ((BX-1)<<2)|3) followed by the 32-bit offset.
1882four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
1883 TESTL BX, BX
1884 JZ repeat_end_emit_encodeBlockAsm4MB
1885 XORL DI, DI
1886 LEAL -1(DI)(BX*4), BX
1887 MOVB BL, (AX)
1888 MOVL SI, 1(AX)
1889 ADDQ $0x05, AX
1890 JMP repeat_end_emit_encodeBlockAsm4MB
1891
// Copy with an offset below 65536. BX = length, SI = offset.
// Length <= 64 is handled by the single-element encoder. For longer matches
// with offset < 2048, a 2-byte element is written (byte 0 = 0x11 |
// ((SI>>8)<<5), byte 1 = low byte of SI) that accounts for 8 bytes
// (BX -= 8); the remainder goes through emitRepeat.
1892two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
1893 CMPL BX, $0x40
1894 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
1895 CMPL SI, $0x00000800
1896 JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB
1897 MOVL $0x00000001, DI
1898 LEAL 16(DI), DI
1899 MOVB SI, 1(AX)
1900 SHRL $0x08, SI
1901 SHLL $0x05, SI
1902 ORL SI, DI
1903 MOVB DI, (AX)
1904 ADDQ $0x02, AX
1905 SUBL $0x08, BX
1906
1907 // emitRepeat
 // Here the remaining length is above 56, so the short forms cannot apply and
 // control jumps straight to the size selection with BX = length-4.
 // NOTE(review): the eight instructions after the JMP have no label and are
 // unreachable; they look like generator leftovers — confirm in gen.go.
1908 LEAL -4(BX), BX
1909 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1910 MOVL BX, DI
1911 LEAL -4(BX), BX
1912 CMPL DI, $0x08
1913 JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1914 CMPL DI, $0x0c
1915 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1916 CMPL SI, $0x00000800
1917 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1918
// BX < 0x104 -> 3-byte form; < 0x10100 -> 4-byte form; otherwise the 5-byte
// form: word 0x001d followed by the 24-bit value BX-65536.
1919cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1920 CMPL BX, $0x00000104
1921 JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1922 CMPL BX, $0x00010100
1923 JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1924 LEAL -65536(BX), BX
1925 MOVL BX, SI
1926 MOVW $0x001d, (AX)
1927 MOVW BX, 2(AX)
1928 SARL $0x10, SI
1929 MOVB SI, 4(AX)
1930 ADDQ $0x05, AX
1931 JMP repeat_end_emit_encodeBlockAsm4MB
1932
// 4-byte form: word 0x0019, then the low 16 bits of BX-256.
1933repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1934 LEAL -256(BX), BX
1935 MOVW $0x0019, (AX)
1936 MOVW BX, 2(AX)
1937 ADDQ $0x04, AX
1938 JMP repeat_end_emit_encodeBlockAsm4MB
1939
// 3-byte form: word 0x0015, then one byte holding BX-4.
1940repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1941 LEAL -4(BX), BX
1942 MOVW $0x0015, (AX)
1943 MOVB BL, 2(AX)
1944 ADDQ $0x03, AX
1945 JMP repeat_end_emit_encodeBlockAsm4MB
1946
// 2-byte form: word (BX<<2)|1. Only referenced from the unreachable
// instructions above within this chunk.
1947repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1948 SHLL $0x02, BX
1949 ORL $0x01, BX
1950 MOVW BX, (AX)
1951 ADDQ $0x02, AX
1952 JMP repeat_end_emit_encodeBlockAsm4MB
1953
// 2-byte form with offset. Only referenced from the unreachable instructions
// above within this chunk.
1954repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1955 XORQ DI, DI
1956 LEAL 1(DI)(BX*4), BX
1957 MOVB SI, 1(AX)
1958 SARL $0x08, SI
1959 SHLL $0x05, SI
1960 ORL SI, BX
1961 MOVB BL, (AX)
1962 ADDQ $0x02, AX
1963 JMP repeat_end_emit_encodeBlockAsm4MB
1964
// Length > 64 with 2048 <= offset < 65536: writes a 3-byte element (tag 0xee
// = (59<<2)|2, then the 16-bit offset in SI) that accounts for 60 bytes
// (BX -= 60), and encodes the remainder with emitRepeat.
1965long_offset_short_repeat_as_copy_encodeBlockAsm4MB:
1966 MOVB $0xee, (AX)
1967 MOVW SI, 1(AX)
1968 LEAL -60(BX), BX
1969 ADDQ $0x03, AX
1970
1971 // emitRepeat
 // DI = remaining length for the size tests, BX = length-4.
1972 MOVL BX, DI
1973 LEAL -4(BX), BX
1974 CMPL DI, $0x08
1975 JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1976 CMPL DI, $0x0c
1977 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1978 CMPL SI, $0x00000800
1979 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1980
// BX < 0x104 -> 3-byte form; < 0x10100 -> 4-byte form; otherwise the 5-byte
// form: word 0x001d followed by the 24-bit value BX-65536.
1981cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1982 CMPL BX, $0x00000104
1983 JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1984 CMPL BX, $0x00010100
1985 JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1986 LEAL -65536(BX), BX
1987 MOVL BX, SI
1988 MOVW $0x001d, (AX)
1989 MOVW BX, 2(AX)
1990 SARL $0x10, SI
1991 MOVB SI, 4(AX)
1992 ADDQ $0x05, AX
1993 JMP repeat_end_emit_encodeBlockAsm4MB
1994
// 4-byte form: word 0x0019, then the low 16 bits of BX-256.
1995repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1996 LEAL -256(BX), BX
1997 MOVW $0x0019, (AX)
1998 MOVW BX, 2(AX)
1999 ADDQ $0x04, AX
2000 JMP repeat_end_emit_encodeBlockAsm4MB
2001
// 3-byte form: word 0x0015, then one byte holding BX-4.
2002repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
2003 LEAL -4(BX), BX
2004 MOVW $0x0015, (AX)
2005 MOVB BL, 2(AX)
2006 ADDQ $0x03, AX
2007 JMP repeat_end_emit_encodeBlockAsm4MB
2008
// 2-byte form: word (BX<<2)|1.
2009repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
2010 SHLL $0x02, BX
2011 ORL $0x01, BX
2012 MOVW BX, (AX)
2013 ADDQ $0x02, AX
2014 JMP repeat_end_emit_encodeBlockAsm4MB
2015
// 2-byte form with offset: byte 0 = 1 + BX*4 + ((SI>>8)<<5), byte 1 = low
// byte of SI.
2016repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
2017 XORQ DI, DI
2018 LEAL 1(DI)(BX*4), BX
2019 MOVB SI, 1(AX)
2020 SARL $0x08, SI
2021 SHLL $0x05, SI
2022 ORL SI, BX
2023 MOVB BL, (AX)
2024 ADDQ $0x02, AX
2025 JMP repeat_end_emit_encodeBlockAsm4MB
2026
// Single copy element for length BX (<= 64) and offset SI (< 65536).
// If BX < 12 and SI < 2048 a 2-byte element is written:
//   byte 0 = (BX<<2) - 15 (equals ((BX-4)<<2)|1) OR'ed with (SI>>8)<<5,
//   byte 1 = low byte of SI.
// Otherwise the 3-byte form below is used.
2027two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
2028 MOVL BX, DI
2029 SHLL $0x02, DI
2030 CMPL BX, $0x0c
2031 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
2032 CMPL SI, $0x00000800
2033 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
2034 LEAL -15(DI), DI
2035 MOVB SI, 1(AX)
2036 SHRL $0x08, SI
2037 SHLL $0x05, SI
2038 ORL SI, DI
2039 MOVB DI, (AX)
2040 ADDQ $0x02, AX
2041 JMP repeat_end_emit_encodeBlockAsm4MB
2042
// 3-byte form: byte 0 = (BX<<2) - 2 (equals ((BX-1)<<2)|2), then the 16-bit
// offset.
2043emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
2044 LEAL -2(DI), DI
2045 MOVB DI, (AX)
2046 MOVW SI, 1(AX)
2047 ADDQ $0x03, AX
2048
// The repeat has been written: record CX as the emitted position in 12(SP)
// and resume searching.
2049repeat_end_emit_encodeBlockAsm4MB:
2050 MOVL CX, 12(SP)
2051 JMP search_loop_encodeBlockAsm4MB
2052
// No repeat at CX+1: test the hash-table candidates.
// On entry SI = 8 source bytes at CX, BX = candidate for CX, DI = candidate
// for CX+1, R9 = hash of the bytes at CX+2.
// 1) 4 bytes at BX equal the bytes at CX        -> candidate_match
// 2) 4 bytes at DI equal the bytes at CX+1      -> candidate2_match
// 3) 4 bytes at table[R9] equal the bytes at CX+2 -> candidate3_match
// The table slot for CX+2 is updated on the way. When nothing matches, CX
// takes the next position saved in 20(SP).
2053no_repeat_found_encodeBlockAsm4MB:
2054 CMPL (DX)(BX*1), SI
2055 JEQ candidate_match_encodeBlockAsm4MB
2056 SHRQ $0x08, SI
2057 MOVL 24(SP)(R9*4), BX
2058 LEAL 2(CX), R8
2059 CMPL (DX)(DI*1), SI
2060 JEQ candidate2_match_encodeBlockAsm4MB
2061 MOVL R8, 24(SP)(R9*4)
2062 SHRQ $0x08, SI
2063 CMPL (DX)(BX*1), SI
2064 JEQ candidate3_match_encodeBlockAsm4MB
2065 MOVL 20(SP), CX
2066 JMP search_loop_encodeBlockAsm4MB
2067
// Match at CX+2 (candidate already in BX).
2068candidate3_match_encodeBlockAsm4MB:
2069 ADDL $0x02, CX
2070 JMP candidate_match_encodeBlockAsm4MB
2071
// Match at CX+1: finish the pending table update, advance CX and take DI as
// the candidate. Falls through to candidate_match.
2072candidate2_match_encodeBlockAsm4MB:
2073 MOVL R8, 24(SP)(R9*4)
2074 INCL CX
2075 MOVL DI, BX
2076
// A 4-byte match exists between CX (current) and BX (candidate).
// SI = 12(SP), the position up to which input has been emitted. Backward
// extension is skipped when the candidate index is 0.
2077candidate_match_encodeBlockAsm4MB:
2078 MOVL 12(SP), SI
2079 TESTL BX, BX
2080 JZ match_extend_back_end_encodeBlockAsm4MB
2081
// While CX > SI and the bytes before both positions are equal, step CX and
// BX back by one; stop when BX reaches 0.
2082match_extend_back_loop_encodeBlockAsm4MB:
2083 CMPL CX, SI
2084 JBE match_extend_back_end_encodeBlockAsm4MB
2085 MOVB -1(DX)(BX*1), DI
2086 MOVB -1(DX)(CX*1), R8
2087 CMPB DI, R8
2088 JNE match_extend_back_end_encodeBlockAsm4MB
2089 LEAL -1(CX), CX
2090 DECL BX
2091 JZ match_extend_back_end_encodeBlockAsm4MB
2092 JMP match_extend_back_loop_encodeBlockAsm4MB
2093
// Output space check for the pending literals: AX + 4 + (CX - 12(SP)) must be
// below the bound at (SP), otherwise the function returns 0.
2094match_extend_back_end_encodeBlockAsm4MB:
2095 MOVL CX, SI
2096 SUBL 12(SP), SI
2097 LEAQ 4(AX)(SI*1), SI
2098 CMPQ SI, (SP)
2099 JB match_dst_size_check_encodeBlockAsm4MB
2100 MOVQ $0x00000000, ret+48(FP)
2101 RET
2102
// Emits the literal header for the source bytes [12(SP), CX) that precede the
// match. Skipped entirely when there are none.
//   SI = pointer to the first literal byte, R8 = literal length,
//   DI = length-1; 12(SP) is advanced to CX.
// Header size from DI: <60 one byte, <2^8 two, <2^16 three, else four bytes
// (tags 0xf0, 0xf4, 0xf8).
2103match_dst_size_check_encodeBlockAsm4MB:
2104 MOVL CX, SI
2105 MOVL 12(SP), DI
2106 CMPL DI, SI
2107 JEQ emit_literal_done_match_emit_encodeBlockAsm4MB
2108 MOVL SI, R8
2109 MOVL SI, 12(SP)
2110 LEAQ (DX)(DI*1), SI
2111 SUBL DI, R8
2112 LEAL -1(R8), DI
2113 CMPL DI, $0x3c
2114 JB one_byte_match_emit_encodeBlockAsm4MB
2115 CMPL DI, $0x00000100
2116 JB two_bytes_match_emit_encodeBlockAsm4MB
2117 CMPL DI, $0x00010000
2118 JB three_bytes_match_emit_encodeBlockAsm4MB
2119 MOVL DI, R9
2120 SHRL $0x10, R9
2121 MOVB $0xf8, (AX)
2122 MOVW DI, 1(AX)
2123 MOVB R9, 3(AX)
2124 ADDQ $0x04, AX
2125 JMP memmove_long_match_emit_encodeBlockAsm4MB
2126
// 3-byte header: 0xf4 followed by the low 16 bits of DI.
2127three_bytes_match_emit_encodeBlockAsm4MB:
2128 MOVB $0xf4, (AX)
2129 MOVW DI, 1(AX)
2130 ADDQ $0x03, AX
2131 JMP memmove_long_match_emit_encodeBlockAsm4MB
2132
// 2-byte header: 0xf0 followed by the low byte of DI; DI < 64 selects the
// short copy routine.
2133two_bytes_match_emit_encodeBlockAsm4MB:
2134 MOVB $0xf0, (AX)
2135 MOVB DI, 1(AX)
2136 ADDQ $0x02, AX
2137 CMPL DI, $0x40
2138 JB memmove_match_emit_encodeBlockAsm4MB
2139 JMP memmove_long_match_emit_encodeBlockAsm4MB
2140
// 1-byte header: (length-1)<<2. Falls through into the short copy routine.
2141one_byte_match_emit_encodeBlockAsm4MB:
2142 SHLB $0x02, DI
2143 MOVB DI, (AX)
2144 ADDQ $0x01, AX
2145
// Short literal copy: R8 bytes from (SI) to (AX). DI = AX + R8 is the output
// position after the copy.
2146memmove_match_emit_encodeBlockAsm4MB:
2147 LEAQ (AX)(R8*1), DI
2148
2149 // genMemMoveShort
 // Dispatch on length: <= 8, 9-16, 17-32, above 32.
2150 CMPQ R8, $0x08
2151 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
2152 CMPQ R8, $0x10
2153 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
2154 CMPQ R8, $0x20
2155 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
2156 JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64
2157
// Lengths up to 8: always moves a full 8 bytes; only the first R8 count,
// because AX is set to DI afterwards.
// NOTE(review): this reads and writes up to 7 bytes past the literal;
// presumably covered by the margins in 8(SP) and (SP) — confirm.
2158emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
2159 MOVQ (SI), R9
2160 MOVQ R9, (AX)
2161 JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
2162
// 9..16 bytes: two 64-bit moves, the second anchored at the end.
2163emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
2164 MOVQ (SI), R9
2165 MOVQ -8(SI)(R8*1), SI
2166 MOVQ R9, (AX)
2167 MOVQ SI, -8(AX)(R8*1)
2168 JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
2169
// 17..32 bytes: two unaligned 16-byte moves.
2170emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
2171 MOVOU (SI), X0
2172 MOVOU -16(SI)(R8*1), X1
2173 MOVOU X0, (AX)
2174 MOVOU X1, -16(AX)(R8*1)
2175 JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
2176
// More than 32 bytes: four unaligned 16-byte moves (two from each end).
2177emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
2178 MOVOU (SI), X0
2179 MOVOU 16(SI), X1
2180 MOVOU -32(SI)(R8*1), X2
2181 MOVOU -16(SI)(R8*1), X3
2182 MOVOU X0, (AX)
2183 MOVOU X1, 16(AX)
2184 MOVOU X2, -32(AX)(R8*1)
2185 MOVOU X3, -16(AX)(R8*1)
2186
// Advance the output pointer past the copied literal.
2187memmove_end_copy_match_emit_encodeBlockAsm4MB:
2188 MOVQ DI, AX
2189 JMP emit_literal_done_match_emit_encodeBlockAsm4MB
2190
// Long literal copy: R8 bytes from (SI) to (AX). DI = AX + R8 is the output
// position after the copy.
2191memmove_long_match_emit_encodeBlockAsm4MB:
2192 LEAQ (AX)(R8*1), DI
2193
2194 // genMemMoveLong
 // Head (X0, X1) and tail (X2, X3) are loaded up front and stored last with
 // unaligned moves. R10 = length/32, R9 = AX & 31, R11 = 64 - R9, so
 // AX+R11-32 is 32-byte aligned for the MOVOA stores in the loops.
 // NOTE(review): DECQ does not write CF, so the JA/JNA after it depend on ZF
 // from DECQ plus CF left by the preceding SUBQ/ADDQ. Visible entries to this
 // label have length >= 65, so the JA looks always taken — confirm.
2195 MOVOU (SI), X0
2196 MOVOU 16(SI), X1
2197 MOVOU -32(SI)(R8*1), X2
2198 MOVOU -16(SI)(R8*1), X3
2199 MOVQ R8, R10
2200 SHRQ $0x05, R10
2201 MOVQ AX, R9
2202 ANDL $0x0000001f, R9
2203 MOVQ $0x00000040, R11
2204 SUBQ R9, R11
2205 DECQ R10
2206 JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
2207 LEAQ -32(SI)(R11*1), R9
2208 LEAQ -32(AX)(R11*1), R12
2209
2210emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
2211 MOVOU (R9), X4
2212 MOVOU 16(R9), X5
2213 MOVOA X4, (R12)
2214 MOVOA X5, 16(R12)
2215 ADDQ $0x20, R12
2216 ADDQ $0x20, R9
2217 ADDQ $0x20, R11
2218 DECQ R10
2219 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back
2220
// 32 bytes per iteration at offset R11-32 while R8 >= R11, then the saved
// head and tail are written and AX is advanced to DI.
2221emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
2222 MOVOU -32(SI)(R11*1), X4
2223 MOVOU -16(SI)(R11*1), X5
2224 MOVOA X4, -32(AX)(R11*1)
2225 MOVOA X5, -16(AX)(R11*1)
2226 ADDQ $0x20, R11
2227 CMPQ R8, R11
2228 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
2229 MOVOU X0, (AX)
2230 MOVOU X1, 16(AX)
2231 MOVOU X2, -32(AX)(R8*1)
2232 MOVOU X3, -16(AX)(R8*1)
2233 MOVQ DI, AX
2234
// Start of match encoding with no pending literals. CX = match position,
// BX = candidate position. The offset CX - BX is stored in 16(SP) as the new
// repeat offset. The first 4 bytes are already known to match, so both
// positions advance by 4 before the length is measured.
//   DI = pointer to src[CX], BX = pointer to the candidate bytes,
//   SI = bytes left in the source, R9 = number of extra matching bytes.
2235emit_literal_done_match_emit_encodeBlockAsm4MB:
2236match_nolit_loop_encodeBlockAsm4MB:
2237 MOVL CX, SI
2238 SUBL BX, SI
2239 MOVL SI, 16(SP)
2240 ADDL $0x04, CX
2241 ADDL $0x04, BX
2242 MOVQ src_len+32(FP), SI
2243 SUBL CX, SI
2244 LEAQ (DX)(CX*1), DI
2245 LEAQ (DX)(BX*1), BX
2246
2247 // matchLen
2248 XORL R9, R9
2249
// Compares 16 bytes per iteration as two 8-byte XORs; a non-zero XOR result
// marks the word that contains the first difference.
2250matchlen_loopback_16_match_nolit_encodeBlockAsm4MB:
2251 CMPL SI, $0x10
2252 JB matchlen_match8_match_nolit_encodeBlockAsm4MB
2253 MOVQ (DI)(R9*1), R8
2254 MOVQ 8(DI)(R9*1), R10
2255 XORQ (BX)(R9*1), R8
2256 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
2257 XORQ 8(BX)(R9*1), R10
2258 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4MB
2259 LEAL -16(SI), SI
2260 LEAL 16(R9), R9
2261 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm4MB
2262
// Difference in the second word: index of the lowest set bit / 8 gives the
// number of equal bytes in that word; add 8 for the first word.
2263matchlen_bsf_16match_nolit_encodeBlockAsm4MB:
2264#ifdef GOAMD64_v3
2265 TZCNTQ R10, R10
2266
2267#else
2268 BSFQ R10, R10
2269
2270#endif
2271 SARQ $0x03, R10
2272 LEAL 8(R9)(R10*1), R9
2273 JMP match_nolit_end_encodeBlockAsm4MB
2274
// Fewer than 16 bytes left: try one 8-byte comparison.
2275matchlen_match8_match_nolit_encodeBlockAsm4MB:
2276 CMPL SI, $0x08
2277 JB matchlen_match4_match_nolit_encodeBlockAsm4MB
2278 MOVQ (DI)(R9*1), R8
2279 XORQ (BX)(R9*1), R8
2280 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
2281 LEAL -8(SI), SI
2282 LEAL 8(R9), R9
2283 JMP matchlen_match4_match_nolit_encodeBlockAsm4MB
2284
// Difference in the word held in R8: add the number of equal low bytes.
2285matchlen_bsf_8_match_nolit_encodeBlockAsm4MB:
2286#ifdef GOAMD64_v3
2287 TZCNTQ R8, R8
2288
2289#else
2290 BSFQ R8, R8
2291
2292#endif
2293 SARQ $0x03, R8
2294 LEAL (R9)(R8*1), R9
2295 JMP match_nolit_end_encodeBlockAsm4MB
2296
// Tail handling: 4 bytes, then 2 bytes, then 1 byte.
2297matchlen_match4_match_nolit_encodeBlockAsm4MB:
2298 CMPL SI, $0x04
2299 JB matchlen_match2_match_nolit_encodeBlockAsm4MB
2300 MOVL (DI)(R9*1), R8
2301 CMPL (BX)(R9*1), R8
2302 JNE matchlen_match2_match_nolit_encodeBlockAsm4MB
2303 LEAL -4(SI), SI
2304 LEAL 4(R9), R9
2305
2306matchlen_match2_match_nolit_encodeBlockAsm4MB:
2307 CMPL SI, $0x01
2308 JE matchlen_match1_match_nolit_encodeBlockAsm4MB
2309 JB match_nolit_end_encodeBlockAsm4MB
2310 MOVW (DI)(R9*1), R8
2311 CMPW (BX)(R9*1), R8
2312 JNE matchlen_match1_match_nolit_encodeBlockAsm4MB
2313 LEAL 2(R9), R9
2314 SUBL $0x02, SI
2315 JZ match_nolit_end_encodeBlockAsm4MB
2316
2317matchlen_match1_match_nolit_encodeBlockAsm4MB:
2318 MOVB (DI)(R9*1), R8
2319 CMPB (BX)(R9*1), R8
2320 JNE match_nolit_end_encodeBlockAsm4MB
2321 LEAL 1(R9), R9
2322
2323match_nolit_end_encodeBlockAsm4MB:
2324 ADDL R9, CX
2325 MOVL 16(SP), BX
2326 ADDL $0x04, R9
2327 MOVL CX, 12(SP)
2328
2329 // emitCopy
2330 CMPL BX, $0x00010000
2331 JB two_byte_offset_match_nolit_encodeBlockAsm4MB
2332 CMPL R9, $0x40
2333 JBE four_bytes_remain_match_nolit_encodeBlockAsm4MB
2334 MOVB $0xff, (AX)
2335 MOVL BX, 1(AX)
2336 LEAL -64(R9), R9
2337 ADDQ $0x05, AX
2338 CMPL R9, $0x04
2339 JB four_bytes_remain_match_nolit_encodeBlockAsm4MB
2340
2341 // emitRepeat
2342 MOVL R9, SI
2343 LEAL -4(R9), R9
2344 CMPL SI, $0x08
2345 JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
2346 CMPL SI, $0x0c
2347 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
2348 CMPL BX, $0x00000800
2349 JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
2350
2351cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
2352 CMPL R9, $0x00000104
2353 JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
2354 CMPL R9, $0x00010100
2355 JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
2356 LEAL -65536(R9), R9
2357 MOVL R9, BX
2358 MOVW $0x001d, (AX)
2359 MOVW R9, 2(AX)
2360 SARL $0x10, BX
2361 MOVB BL, 4(AX)
2362 ADDQ $0x05, AX
2363 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2364
2365repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
2366 LEAL -256(R9), R9
2367 MOVW $0x0019, (AX)
2368 MOVW R9, 2(AX)
2369 ADDQ $0x04, AX
2370 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2371
2372repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
2373 LEAL -4(R9), R9
2374 MOVW $0x0015, (AX)
2375 MOVB R9, 2(AX)
2376 ADDQ $0x03, AX
2377 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2378
2379repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
2380 SHLL $0x02, R9
2381 ORL $0x01, R9
2382 MOVW R9, (AX)
2383 ADDQ $0x02, AX
2384 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2385
2386repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
2387 XORQ SI, SI
2388 LEAL 1(SI)(R9*4), R9
2389 MOVB BL, 1(AX)
2390 SARL $0x08, BX
2391 SHLL $0x05, BX
2392 ORL BX, R9
2393 MOVB R9, (AX)
2394 ADDQ $0x02, AX
2395 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2396
2397four_bytes_remain_match_nolit_encodeBlockAsm4MB:
2398 TESTL R9, R9
2399 JZ match_nolit_emitcopy_end_encodeBlockAsm4MB
2400 XORL SI, SI
2401 LEAL -1(SI)(R9*4), R9
2402 MOVB R9, (AX)
2403 MOVL BX, 1(AX)
2404 ADDQ $0x05, AX
2405 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2406
2407two_byte_offset_match_nolit_encodeBlockAsm4MB:
2408 CMPL R9, $0x40
2409 JBE two_byte_offset_short_match_nolit_encodeBlockAsm4MB
2410 CMPL BX, $0x00000800
2411 JAE long_offset_short_match_nolit_encodeBlockAsm4MB
2412 MOVL $0x00000001, SI
2413 LEAL 16(SI), SI
2414 MOVB BL, 1(AX)
2415 SHRL $0x08, BX
2416 SHLL $0x05, BX
2417 ORL BX, SI
2418 MOVB SI, (AX)
2419 ADDQ $0x02, AX
2420 SUBL $0x08, R9
2421
2422 // emitRepeat
2423 LEAL -4(R9), R9
2424 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2425 MOVL R9, SI
2426 LEAL -4(R9), R9
2427 CMPL SI, $0x08
2428 JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2429 CMPL SI, $0x0c
2430 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2431 CMPL BX, $0x00000800
2432 JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2433
2434cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2435 CMPL R9, $0x00000104
2436 JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2437 CMPL R9, $0x00010100
2438 JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2439 LEAL -65536(R9), R9
2440 MOVL R9, BX
2441 MOVW $0x001d, (AX)
2442 MOVW R9, 2(AX)
2443 SARL $0x10, BX
2444 MOVB BL, 4(AX)
2445 ADDQ $0x05, AX
2446 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2447
2448repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2449 LEAL -256(R9), R9
2450 MOVW $0x0019, (AX)
2451 MOVW R9, 2(AX)
2452 ADDQ $0x04, AX
2453 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2454
2455repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2456 LEAL -4(R9), R9
2457 MOVW $0x0015, (AX)
2458 MOVB R9, 2(AX)
2459 ADDQ $0x03, AX
2460 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2461
2462repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2463 SHLL $0x02, R9
2464 ORL $0x01, R9
2465 MOVW R9, (AX)
2466 ADDQ $0x02, AX
2467 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2468
2469repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2470 XORQ SI, SI
2471 LEAL 1(SI)(R9*4), R9
2472 MOVB BL, 1(AX)
2473 SARL $0x08, BX
2474 SHLL $0x05, BX
2475 ORL BX, R9
2476 MOVB R9, (AX)
2477 ADDQ $0x02, AX
2478 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2479
2480long_offset_short_match_nolit_encodeBlockAsm4MB:
2481 MOVB $0xee, (AX)
2482 MOVW BX, 1(AX)
2483 LEAL -60(R9), R9
2484 ADDQ $0x03, AX
2485
2486 // emitRepeat
2487 MOVL R9, SI
2488 LEAL -4(R9), R9
2489 CMPL SI, $0x08
2490 JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
2491 CMPL SI, $0x0c
2492 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
2493 CMPL BX, $0x00000800
2494 JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
2495
2496cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2497 CMPL R9, $0x00000104
2498 JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
2499 CMPL R9, $0x00010100
2500 JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
2501 LEAL -65536(R9), R9
2502 MOVL R9, BX
2503 MOVW $0x001d, (AX)
2504 MOVW R9, 2(AX)
2505 SARL $0x10, BX
2506 MOVB BL, 4(AX)
2507 ADDQ $0x05, AX
2508 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2509
2510repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2511 LEAL -256(R9), R9
2512 MOVW $0x0019, (AX)
2513 MOVW R9, 2(AX)
2514 ADDQ $0x04, AX
2515 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2516
2517repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2518 LEAL -4(R9), R9
2519 MOVW $0x0015, (AX)
2520 MOVB R9, 2(AX)
2521 ADDQ $0x03, AX
2522 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2523
2524repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2525 SHLL $0x02, R9
2526 ORL $0x01, R9
2527 MOVW R9, (AX)
2528 ADDQ $0x02, AX
2529 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2530
2531repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2532 XORQ SI, SI
2533 LEAL 1(SI)(R9*4), R9
2534 MOVB BL, 1(AX)
2535 SARL $0x08, BX
2536 SHLL $0x05, BX
2537 ORL BX, R9
2538 MOVB R9, (AX)
2539 ADDQ $0x02, AX
2540 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2541
2542two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
2543 MOVL R9, SI
2544 SHLL $0x02, SI
2545 CMPL R9, $0x0c
2546 JAE emit_copy_three_match_nolit_encodeBlockAsm4MB
2547 CMPL BX, $0x00000800
2548 JAE emit_copy_three_match_nolit_encodeBlockAsm4MB
2549 LEAL -15(SI), SI
2550 MOVB BL, 1(AX)
2551 SHRL $0x08, BX
2552 SHLL $0x05, BX
2553 ORL BX, SI
2554 MOVB SI, (AX)
2555 ADDQ $0x02, AX
2556 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2557
2558emit_copy_three_match_nolit_encodeBlockAsm4MB:
2559 LEAL -2(SI), SI
2560 MOVB SI, (AX)
2561 MOVW BX, 1(AX)
2562 ADDQ $0x03, AX
2563
2564match_nolit_emitcopy_end_encodeBlockAsm4MB:
2565 CMPL CX, 8(SP)
2566 JAE emit_remainder_encodeBlockAsm4MB
2567 MOVQ -2(DX)(CX*1), SI
2568 CMPQ AX, (SP)
2569 JB match_nolit_dst_ok_encodeBlockAsm4MB
2570 MOVQ $0x00000000, ret+48(FP)
2571 RET
2572
2573match_nolit_dst_ok_encodeBlockAsm4MB:
2574 MOVQ $0x0000cf1bbcdcbf9b, R8
2575 MOVQ SI, DI
2576 SHRQ $0x10, SI
2577 MOVQ SI, BX
2578 SHLQ $0x10, DI
2579 IMULQ R8, DI
2580 SHRQ $0x32, DI
2581 SHLQ $0x10, BX
2582 IMULQ R8, BX
2583 SHRQ $0x32, BX
2584 LEAL -2(CX), R8
2585 LEAQ 24(SP)(BX*4), R9
2586 MOVL (R9), BX
2587 MOVL R8, 24(SP)(DI*4)
2588 MOVL CX, (R9)
2589 CMPL (DX)(BX*1), SI
2590 JEQ match_nolit_loop_encodeBlockAsm4MB
2591 INCL CX
2592 JMP search_loop_encodeBlockAsm4MB
2593
2594emit_remainder_encodeBlockAsm4MB:
2595 MOVQ src_len+32(FP), CX
2596 SUBL 12(SP), CX
2597 LEAQ 4(AX)(CX*1), CX
2598 CMPQ CX, (SP)
2599 JB emit_remainder_ok_encodeBlockAsm4MB
2600 MOVQ $0x00000000, ret+48(FP)
2601 RET
2602
2603emit_remainder_ok_encodeBlockAsm4MB:
2604 MOVQ src_len+32(FP), CX
2605 MOVL 12(SP), BX
2606 CMPL BX, CX
2607 JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB
2608 MOVL CX, SI
2609 MOVL CX, 12(SP)
2610 LEAQ (DX)(BX*1), CX
2611 SUBL BX, SI
2612 LEAL -1(SI), DX
2613 CMPL DX, $0x3c
2614 JB one_byte_emit_remainder_encodeBlockAsm4MB
2615 CMPL DX, $0x00000100
2616 JB two_bytes_emit_remainder_encodeBlockAsm4MB
2617 CMPL DX, $0x00010000
2618 JB three_bytes_emit_remainder_encodeBlockAsm4MB
2619 MOVL DX, BX
2620 SHRL $0x10, BX
2621 MOVB $0xf8, (AX)
2622 MOVW DX, 1(AX)
2623 MOVB BL, 3(AX)
2624 ADDQ $0x04, AX
2625 JMP memmove_long_emit_remainder_encodeBlockAsm4MB
2626
2627three_bytes_emit_remainder_encodeBlockAsm4MB:
2628 MOVB $0xf4, (AX)
2629 MOVW DX, 1(AX)
2630 ADDQ $0x03, AX
2631 JMP memmove_long_emit_remainder_encodeBlockAsm4MB
2632
2633two_bytes_emit_remainder_encodeBlockAsm4MB:
2634 MOVB $0xf0, (AX)
2635 MOVB DL, 1(AX)
2636 ADDQ $0x02, AX
2637 CMPL DX, $0x40
2638 JB memmove_emit_remainder_encodeBlockAsm4MB
2639 JMP memmove_long_emit_remainder_encodeBlockAsm4MB
2640
2641one_byte_emit_remainder_encodeBlockAsm4MB:
2642 SHLB $0x02, DL
2643 MOVB DL, (AX)
2644 ADDQ $0x01, AX
2645
2646memmove_emit_remainder_encodeBlockAsm4MB:
2647 LEAQ (AX)(SI*1), DX
2648 MOVL SI, BX
2649
2650 // genMemMoveShort
2651 CMPQ BX, $0x03
2652 JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
2653 JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
2654 CMPQ BX, $0x08
2655 JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
2656 CMPQ BX, $0x10
2657 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
2658 CMPQ BX, $0x20
2659 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
2660 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64
2661
2662emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
2663 MOVB (CX), SI
2664 MOVB -1(CX)(BX*1), CL
2665 MOVB SI, (AX)
2666 MOVB CL, -1(AX)(BX*1)
2667 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2668
2669emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
2670 MOVW (CX), SI
2671 MOVB 2(CX), CL
2672 MOVW SI, (AX)
2673 MOVB CL, 2(AX)
2674 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2675
2676emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
2677 MOVL (CX), SI
2678 MOVL -4(CX)(BX*1), CX
2679 MOVL SI, (AX)
2680 MOVL CX, -4(AX)(BX*1)
2681 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2682
2683emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
2684 MOVQ (CX), SI
2685 MOVQ -8(CX)(BX*1), CX
2686 MOVQ SI, (AX)
2687 MOVQ CX, -8(AX)(BX*1)
2688 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2689
2690emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
2691 MOVOU (CX), X0
2692 MOVOU -16(CX)(BX*1), X1
2693 MOVOU X0, (AX)
2694 MOVOU X1, -16(AX)(BX*1)
2695 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2696
2697emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
2698 MOVOU (CX), X0
2699 MOVOU 16(CX), X1
2700 MOVOU -32(CX)(BX*1), X2
2701 MOVOU -16(CX)(BX*1), X3
2702 MOVOU X0, (AX)
2703 MOVOU X1, 16(AX)
2704 MOVOU X2, -32(AX)(BX*1)
2705 MOVOU X3, -16(AX)(BX*1)
2706
2707memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
2708 MOVQ DX, AX
2709 JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB
2710
2711memmove_long_emit_remainder_encodeBlockAsm4MB:
2712 LEAQ (AX)(SI*1), DX
2713 MOVL SI, BX
2714
2715 // genMemMoveLong
2716 MOVOU (CX), X0
2717 MOVOU 16(CX), X1
2718 MOVOU -32(CX)(BX*1), X2
2719 MOVOU -16(CX)(BX*1), X3
2720 MOVQ BX, DI
2721 SHRQ $0x05, DI
2722 MOVQ AX, SI
2723 ANDL $0x0000001f, SI
2724 MOVQ $0x00000040, R8
2725 SUBQ SI, R8
2726 DECQ DI
2727 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
2728 LEAQ -32(CX)(R8*1), SI
2729 LEAQ -32(AX)(R8*1), R9
2730
2731emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
2732 MOVOU (SI), X4
2733 MOVOU 16(SI), X5
2734 MOVOA X4, (R9)
2735 MOVOA X5, 16(R9)
2736 ADDQ $0x20, R9
2737 ADDQ $0x20, SI
2738 ADDQ $0x20, R8
2739 DECQ DI
2740 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back
2741
2742emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
2743 MOVOU -32(CX)(R8*1), X4
2744 MOVOU -16(CX)(R8*1), X5
2745 MOVOA X4, -32(AX)(R8*1)
2746 MOVOA X5, -16(AX)(R8*1)
2747 ADDQ $0x20, R8
2748 CMPQ BX, R8
2749 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
2750 MOVOU X0, (AX)
2751 MOVOU X1, 16(AX)
2752 MOVOU X2, -32(AX)(BX*1)
2753 MOVOU X3, -16(AX)(BX*1)
2754 MOVQ DX, AX
2755
2756emit_literal_done_emit_remainder_encodeBlockAsm4MB:
2757 MOVQ dst_base+0(FP), CX
2758 SUBQ CX, AX
2759 MOVQ AX, ret+48(FP)
2760 RET
2761
2762// func encodeBlockAsm12B(dst []byte, src []byte) int
2763// Requires: BMI, SSE2
2764TEXT ·encodeBlockAsm12B(SB), $16408-56
2765 MOVQ dst_base+0(FP), AX
2766 MOVQ $0x00000080, CX
2767 LEAQ 24(SP), DX
2768 PXOR X0, X0
2769
2770zero_loop_encodeBlockAsm12B:
2771 MOVOU X0, (DX)
2772 MOVOU X0, 16(DX)
2773 MOVOU X0, 32(DX)
2774 MOVOU X0, 48(DX)
2775 MOVOU X0, 64(DX)
2776 MOVOU X0, 80(DX)
2777 MOVOU X0, 96(DX)
2778 MOVOU X0, 112(DX)
2779 ADDQ $0x80, DX
2780 DECQ CX
2781 JNZ zero_loop_encodeBlockAsm12B
2782 MOVL $0x00000000, 12(SP)
2783 MOVQ src_len+32(FP), CX
2784 LEAQ -9(CX), DX
2785 LEAQ -8(CX), BX
2786 MOVL BX, 8(SP)
2787 SHRQ $0x05, CX
2788 SUBL CX, DX
2789 LEAQ (AX)(DX*1), DX
2790 MOVQ DX, (SP)
2791 MOVL $0x00000001, CX
2792 MOVL CX, 16(SP)
2793 MOVQ src_base+24(FP), DX
2794
2795search_loop_encodeBlockAsm12B:
2796 MOVL CX, BX
2797 SUBL 12(SP), BX
2798 SHRL $0x05, BX
2799 LEAL 4(CX)(BX*1), BX
2800 CMPL BX, 8(SP)
2801 JAE emit_remainder_encodeBlockAsm12B
2802 MOVQ (DX)(CX*1), SI
2803 MOVL BX, 20(SP)
2804 MOVQ $0x000000cf1bbcdcbb, R8
2805 MOVQ SI, R9
2806 MOVQ SI, R10
2807 SHRQ $0x08, R10
2808 SHLQ $0x18, R9
2809 IMULQ R8, R9
2810 SHRQ $0x34, R9
2811 SHLQ $0x18, R10
2812 IMULQ R8, R10
2813 SHRQ $0x34, R10
2814 MOVL 24(SP)(R9*4), BX
2815 MOVL 24(SP)(R10*4), DI
2816 MOVL CX, 24(SP)(R9*4)
2817 LEAL 1(CX), R9
2818 MOVL R9, 24(SP)(R10*4)
2819 MOVQ SI, R9
2820 SHRQ $0x10, R9
2821 SHLQ $0x18, R9
2822 IMULQ R8, R9
2823 SHRQ $0x34, R9
2824 MOVL CX, R8
2825 SUBL 16(SP), R8
2826 MOVL 1(DX)(R8*1), R10
2827 MOVQ SI, R8
2828 SHRQ $0x08, R8
2829 CMPL R8, R10
2830 JNE no_repeat_found_encodeBlockAsm12B
2831 LEAL 1(CX), SI
2832 MOVL 12(SP), DI
2833 MOVL SI, BX
2834 SUBL 16(SP), BX
2835 JZ repeat_extend_back_end_encodeBlockAsm12B
2836
2837repeat_extend_back_loop_encodeBlockAsm12B:
2838 CMPL SI, DI
2839 JBE repeat_extend_back_end_encodeBlockAsm12B
2840 MOVB -1(DX)(BX*1), R8
2841 MOVB -1(DX)(SI*1), R9
2842 CMPB R8, R9
2843 JNE repeat_extend_back_end_encodeBlockAsm12B
2844 LEAL -1(SI), SI
2845 DECL BX
2846 JNZ repeat_extend_back_loop_encodeBlockAsm12B
2847
2848repeat_extend_back_end_encodeBlockAsm12B:
2849 MOVL SI, BX
2850 SUBL 12(SP), BX
2851 LEAQ 3(AX)(BX*1), BX
2852 CMPQ BX, (SP)
2853 JB repeat_dst_size_check_encodeBlockAsm12B
2854 MOVQ $0x00000000, ret+48(FP)
2855 RET
2856
2857repeat_dst_size_check_encodeBlockAsm12B:
2858 MOVL 12(SP), BX
2859 CMPL BX, SI
2860 JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B
2861 MOVL SI, R8
2862 MOVL SI, 12(SP)
2863 LEAQ (DX)(BX*1), R9
2864 SUBL BX, R8
2865 LEAL -1(R8), BX
2866 CMPL BX, $0x3c
2867 JB one_byte_repeat_emit_encodeBlockAsm12B
2868 CMPL BX, $0x00000100
2869 JB two_bytes_repeat_emit_encodeBlockAsm12B
2870 JB three_bytes_repeat_emit_encodeBlockAsm12B
2871
2872three_bytes_repeat_emit_encodeBlockAsm12B:
2873 MOVB $0xf4, (AX)
2874 MOVW BX, 1(AX)
2875 ADDQ $0x03, AX
2876 JMP memmove_long_repeat_emit_encodeBlockAsm12B
2877
2878two_bytes_repeat_emit_encodeBlockAsm12B:
2879 MOVB $0xf0, (AX)
2880 MOVB BL, 1(AX)
2881 ADDQ $0x02, AX
2882 CMPL BX, $0x40
2883 JB memmove_repeat_emit_encodeBlockAsm12B
2884 JMP memmove_long_repeat_emit_encodeBlockAsm12B
2885
2886one_byte_repeat_emit_encodeBlockAsm12B:
2887 SHLB $0x02, BL
2888 MOVB BL, (AX)
2889 ADDQ $0x01, AX
2890
2891memmove_repeat_emit_encodeBlockAsm12B:
2892 LEAQ (AX)(R8*1), BX
2893
2894 // genMemMoveShort
2895 CMPQ R8, $0x08
2896 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
2897 CMPQ R8, $0x10
2898 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
2899 CMPQ R8, $0x20
2900 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
2901 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64
2902
2903emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
2904 MOVQ (R9), R10
2905 MOVQ R10, (AX)
2906 JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
2907
2908emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
2909 MOVQ (R9), R10
2910 MOVQ -8(R9)(R8*1), R9
2911 MOVQ R10, (AX)
2912 MOVQ R9, -8(AX)(R8*1)
2913 JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
2914
2915emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
2916 MOVOU (R9), X0
2917 MOVOU -16(R9)(R8*1), X1
2918 MOVOU X0, (AX)
2919 MOVOU X1, -16(AX)(R8*1)
2920 JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
2921
2922emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
2923 MOVOU (R9), X0
2924 MOVOU 16(R9), X1
2925 MOVOU -32(R9)(R8*1), X2
2926 MOVOU -16(R9)(R8*1), X3
2927 MOVOU X0, (AX)
2928 MOVOU X1, 16(AX)
2929 MOVOU X2, -32(AX)(R8*1)
2930 MOVOU X3, -16(AX)(R8*1)
2931
2932memmove_end_copy_repeat_emit_encodeBlockAsm12B:
2933 MOVQ BX, AX
2934 JMP emit_literal_done_repeat_emit_encodeBlockAsm12B
2935
2936memmove_long_repeat_emit_encodeBlockAsm12B:
2937 LEAQ (AX)(R8*1), BX
2938
2939 // genMemMoveLong
2940 MOVOU (R9), X0
2941 MOVOU 16(R9), X1
2942 MOVOU -32(R9)(R8*1), X2
2943 MOVOU -16(R9)(R8*1), X3
2944 MOVQ R8, R11
2945 SHRQ $0x05, R11
2946 MOVQ AX, R10
2947 ANDL $0x0000001f, R10
2948 MOVQ $0x00000040, R12
2949 SUBQ R10, R12
2950 DECQ R11
2951 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
2952 LEAQ -32(R9)(R12*1), R10
2953 LEAQ -32(AX)(R12*1), R13
2954
2955emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
2956 MOVOU (R10), X4
2957 MOVOU 16(R10), X5
2958 MOVOA X4, (R13)
2959 MOVOA X5, 16(R13)
2960 ADDQ $0x20, R13
2961 ADDQ $0x20, R10
2962 ADDQ $0x20, R12
2963 DECQ R11
2964 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back
2965
2966emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
2967 MOVOU -32(R9)(R12*1), X4
2968 MOVOU -16(R9)(R12*1), X5
2969 MOVOA X4, -32(AX)(R12*1)
2970 MOVOA X5, -16(AX)(R12*1)
2971 ADDQ $0x20, R12
2972 CMPQ R8, R12
2973 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
2974 MOVOU X0, (AX)
2975 MOVOU X1, 16(AX)
2976 MOVOU X2, -32(AX)(R8*1)
2977 MOVOU X3, -16(AX)(R8*1)
2978 MOVQ BX, AX
2979
2980emit_literal_done_repeat_emit_encodeBlockAsm12B:
2981 ADDL $0x05, CX
2982 MOVL CX, BX
2983 SUBL 16(SP), BX
2984 MOVQ src_len+32(FP), R8
2985 SUBL CX, R8
2986 LEAQ (DX)(CX*1), R9
2987 LEAQ (DX)(BX*1), BX
2988
2989 // matchLen
2990 XORL R11, R11
2991
2992matchlen_loopback_16_repeat_extend_encodeBlockAsm12B:
2993 CMPL R8, $0x10
2994 JB matchlen_match8_repeat_extend_encodeBlockAsm12B
2995 MOVQ (R9)(R11*1), R10
2996 MOVQ 8(R9)(R11*1), R12
2997 XORQ (BX)(R11*1), R10
2998 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B
2999 XORQ 8(BX)(R11*1), R12
3000 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm12B
3001 LEAL -16(R8), R8
3002 LEAL 16(R11), R11
3003 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm12B
3004
3005matchlen_bsf_16repeat_extend_encodeBlockAsm12B:
3006#ifdef GOAMD64_v3
3007 TZCNTQ R12, R12
3008
3009#else
3010 BSFQ R12, R12
3011
3012#endif
3013 SARQ $0x03, R12
3014 LEAL 8(R11)(R12*1), R11
3015 JMP repeat_extend_forward_end_encodeBlockAsm12B
3016
3017matchlen_match8_repeat_extend_encodeBlockAsm12B:
3018 CMPL R8, $0x08
3019 JB matchlen_match4_repeat_extend_encodeBlockAsm12B
3020 MOVQ (R9)(R11*1), R10
3021 XORQ (BX)(R11*1), R10
3022 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B
3023 LEAL -8(R8), R8
3024 LEAL 8(R11), R11
3025 JMP matchlen_match4_repeat_extend_encodeBlockAsm12B
3026
3027matchlen_bsf_8_repeat_extend_encodeBlockAsm12B:
3028#ifdef GOAMD64_v3
3029 TZCNTQ R10, R10
3030
3031#else
3032 BSFQ R10, R10
3033
3034#endif
3035 SARQ $0x03, R10
3036 LEAL (R11)(R10*1), R11
3037 JMP repeat_extend_forward_end_encodeBlockAsm12B
3038
3039matchlen_match4_repeat_extend_encodeBlockAsm12B:
3040 CMPL R8, $0x04
3041 JB matchlen_match2_repeat_extend_encodeBlockAsm12B
3042 MOVL (R9)(R11*1), R10
3043 CMPL (BX)(R11*1), R10
3044 JNE matchlen_match2_repeat_extend_encodeBlockAsm12B
3045 LEAL -4(R8), R8
3046 LEAL 4(R11), R11
3047
3048matchlen_match2_repeat_extend_encodeBlockAsm12B:
3049 CMPL R8, $0x01
3050 JE matchlen_match1_repeat_extend_encodeBlockAsm12B
3051 JB repeat_extend_forward_end_encodeBlockAsm12B
3052 MOVW (R9)(R11*1), R10
3053 CMPW (BX)(R11*1), R10
3054 JNE matchlen_match1_repeat_extend_encodeBlockAsm12B
3055 LEAL 2(R11), R11
3056 SUBL $0x02, R8
3057 JZ repeat_extend_forward_end_encodeBlockAsm12B
3058
3059matchlen_match1_repeat_extend_encodeBlockAsm12B:
3060 MOVB (R9)(R11*1), R10
3061 CMPB (BX)(R11*1), R10
3062 JNE repeat_extend_forward_end_encodeBlockAsm12B
3063 LEAL 1(R11), R11
3064
3065repeat_extend_forward_end_encodeBlockAsm12B:
3066 ADDL R11, CX
3067 MOVL CX, BX
3068 SUBL SI, BX
3069 MOVL 16(SP), SI
3070 TESTL DI, DI
3071 JZ repeat_as_copy_encodeBlockAsm12B
3072
3073 // emitRepeat
3074 MOVL BX, DI
3075 LEAL -4(BX), BX
3076 CMPL DI, $0x08
3077 JBE repeat_two_match_repeat_encodeBlockAsm12B
3078 CMPL DI, $0x0c
3079 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
3080 CMPL SI, $0x00000800
3081 JB repeat_two_offset_match_repeat_encodeBlockAsm12B
3082
3083cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
3084 CMPL BX, $0x00000104
3085 JB repeat_three_match_repeat_encodeBlockAsm12B
3086 LEAL -256(BX), BX
3087 MOVW $0x0019, (AX)
3088 MOVW BX, 2(AX)
3089 ADDQ $0x04, AX
3090 JMP repeat_end_emit_encodeBlockAsm12B
3091
3092repeat_three_match_repeat_encodeBlockAsm12B:
3093 LEAL -4(BX), BX
3094 MOVW $0x0015, (AX)
3095 MOVB BL, 2(AX)
3096 ADDQ $0x03, AX
3097 JMP repeat_end_emit_encodeBlockAsm12B
3098
3099repeat_two_match_repeat_encodeBlockAsm12B:
3100 SHLL $0x02, BX
3101 ORL $0x01, BX
3102 MOVW BX, (AX)
3103 ADDQ $0x02, AX
3104 JMP repeat_end_emit_encodeBlockAsm12B
3105
3106repeat_two_offset_match_repeat_encodeBlockAsm12B:
3107 XORQ DI, DI
3108 LEAL 1(DI)(BX*4), BX
3109 MOVB SI, 1(AX)
3110 SARL $0x08, SI
3111 SHLL $0x05, SI
3112 ORL SI, BX
3113 MOVB BL, (AX)
3114 ADDQ $0x02, AX
3115 JMP repeat_end_emit_encodeBlockAsm12B
3116
3117repeat_as_copy_encodeBlockAsm12B:
3118 // emitCopy
3119 CMPL BX, $0x40
3120 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
3121 CMPL SI, $0x00000800
3122 JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B
3123 MOVL $0x00000001, DI
3124 LEAL 16(DI), DI
3125 MOVB SI, 1(AX)
3126 SHRL $0x08, SI
3127 SHLL $0x05, SI
3128 ORL SI, DI
3129 MOVB DI, (AX)
3130 ADDQ $0x02, AX
3131 SUBL $0x08, BX
3132
3133 // emitRepeat
3134 LEAL -4(BX), BX
3135 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3136 MOVL BX, DI
3137 LEAL -4(BX), BX
3138 CMPL DI, $0x08
3139 JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3140 CMPL DI, $0x0c
3141 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3142 CMPL SI, $0x00000800
3143 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3144
3145cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
3146 CMPL BX, $0x00000104
3147 JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3148 LEAL -256(BX), BX
3149 MOVW $0x0019, (AX)
3150 MOVW BX, 2(AX)
3151 ADDQ $0x04, AX
3152 JMP repeat_end_emit_encodeBlockAsm12B
3153
3154repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
3155 LEAL -4(BX), BX
3156 MOVW $0x0015, (AX)
3157 MOVB BL, 2(AX)
3158 ADDQ $0x03, AX
3159 JMP repeat_end_emit_encodeBlockAsm12B
3160
3161repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
3162 SHLL $0x02, BX
3163 ORL $0x01, BX
3164 MOVW BX, (AX)
3165 ADDQ $0x02, AX
3166 JMP repeat_end_emit_encodeBlockAsm12B
3167
3168repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
3169 XORQ DI, DI
3170 LEAL 1(DI)(BX*4), BX
3171 MOVB SI, 1(AX)
3172 SARL $0x08, SI
3173 SHLL $0x05, SI
3174 ORL SI, BX
3175 MOVB BL, (AX)
3176 ADDQ $0x02, AX
3177 JMP repeat_end_emit_encodeBlockAsm12B
3178
3179long_offset_short_repeat_as_copy_encodeBlockAsm12B:
3180 MOVB $0xee, (AX)
3181 MOVW SI, 1(AX)
3182 LEAL -60(BX), BX
3183 ADDQ $0x03, AX
3184
3185 // emitRepeat
3186 MOVL BX, DI
3187 LEAL -4(BX), BX
3188 CMPL DI, $0x08
3189 JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
3190 CMPL DI, $0x0c
3191 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
3192 CMPL SI, $0x00000800
3193 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
3194
3195cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
3196 CMPL BX, $0x00000104
3197 JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
3198 LEAL -256(BX), BX
3199 MOVW $0x0019, (AX)
3200 MOVW BX, 2(AX)
3201 ADDQ $0x04, AX
3202 JMP repeat_end_emit_encodeBlockAsm12B
3203
3204repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
3205 LEAL -4(BX), BX
3206 MOVW $0x0015, (AX)
3207 MOVB BL, 2(AX)
3208 ADDQ $0x03, AX
3209 JMP repeat_end_emit_encodeBlockAsm12B
3210
3211repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
3212 SHLL $0x02, BX
3213 ORL $0x01, BX
3214 MOVW BX, (AX)
3215 ADDQ $0x02, AX
3216 JMP repeat_end_emit_encodeBlockAsm12B
3217
3218repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
3219 XORQ DI, DI
3220 LEAL 1(DI)(BX*4), BX
3221 MOVB SI, 1(AX)
3222 SARL $0x08, SI
3223 SHLL $0x05, SI
3224 ORL SI, BX
3225 MOVB BL, (AX)
3226 ADDQ $0x02, AX
3227 JMP repeat_end_emit_encodeBlockAsm12B
3228
3229two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
3230 MOVL BX, DI
3231 SHLL $0x02, DI
3232 CMPL BX, $0x0c
3233 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
3234 CMPL SI, $0x00000800
3235 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
3236 LEAL -15(DI), DI
3237 MOVB SI, 1(AX)
3238 SHRL $0x08, SI
3239 SHLL $0x05, SI
3240 ORL SI, DI
3241 MOVB DI, (AX)
3242 ADDQ $0x02, AX
3243 JMP repeat_end_emit_encodeBlockAsm12B
3244
3245emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
3246 LEAL -2(DI), DI
3247 MOVB DI, (AX)
3248 MOVW SI, 1(AX)
3249 ADDQ $0x03, AX
3250
3251repeat_end_emit_encodeBlockAsm12B:
3252 MOVL CX, 12(SP)
3253 JMP search_loop_encodeBlockAsm12B
3254
3255no_repeat_found_encodeBlockAsm12B:
3256 CMPL (DX)(BX*1), SI
3257 JEQ candidate_match_encodeBlockAsm12B
3258 SHRQ $0x08, SI
3259 MOVL 24(SP)(R9*4), BX
3260 LEAL 2(CX), R8
3261 CMPL (DX)(DI*1), SI
3262 JEQ candidate2_match_encodeBlockAsm12B
3263 MOVL R8, 24(SP)(R9*4)
3264 SHRQ $0x08, SI
3265 CMPL (DX)(BX*1), SI
3266 JEQ candidate3_match_encodeBlockAsm12B
3267 MOVL 20(SP), CX
3268 JMP search_loop_encodeBlockAsm12B
3269
3270candidate3_match_encodeBlockAsm12B:
3271 ADDL $0x02, CX
3272 JMP candidate_match_encodeBlockAsm12B
3273
3274candidate2_match_encodeBlockAsm12B:
3275 MOVL R8, 24(SP)(R9*4)
3276 INCL CX
3277 MOVL DI, BX
3278
3279candidate_match_encodeBlockAsm12B:
3280 MOVL 12(SP), SI
3281 TESTL BX, BX
3282 JZ match_extend_back_end_encodeBlockAsm12B
3283
3284match_extend_back_loop_encodeBlockAsm12B:
3285 CMPL CX, SI
3286 JBE match_extend_back_end_encodeBlockAsm12B
3287 MOVB -1(DX)(BX*1), DI
3288 MOVB -1(DX)(CX*1), R8
3289 CMPB DI, R8
3290 JNE match_extend_back_end_encodeBlockAsm12B
3291 LEAL -1(CX), CX
3292 DECL BX
3293 JZ match_extend_back_end_encodeBlockAsm12B
3294 JMP match_extend_back_loop_encodeBlockAsm12B
3295
3296match_extend_back_end_encodeBlockAsm12B:
3297 MOVL CX, SI
3298 SUBL 12(SP), SI
3299 LEAQ 3(AX)(SI*1), SI
3300 CMPQ SI, (SP)
3301 JB match_dst_size_check_encodeBlockAsm12B
3302 MOVQ $0x00000000, ret+48(FP)
3303 RET
3304
3305match_dst_size_check_encodeBlockAsm12B:
3306 MOVL CX, SI
3307 MOVL 12(SP), DI
3308 CMPL DI, SI
3309 JEQ emit_literal_done_match_emit_encodeBlockAsm12B
3310 MOVL SI, R8
3311 MOVL SI, 12(SP)
3312 LEAQ (DX)(DI*1), SI
3313 SUBL DI, R8
3314 LEAL -1(R8), DI
3315 CMPL DI, $0x3c
3316 JB one_byte_match_emit_encodeBlockAsm12B
3317 CMPL DI, $0x00000100
3318 JB two_bytes_match_emit_encodeBlockAsm12B
3319 JB three_bytes_match_emit_encodeBlockAsm12B
3320
3321three_bytes_match_emit_encodeBlockAsm12B:
3322 MOVB $0xf4, (AX)
3323 MOVW DI, 1(AX)
3324 ADDQ $0x03, AX
3325 JMP memmove_long_match_emit_encodeBlockAsm12B
3326
3327two_bytes_match_emit_encodeBlockAsm12B:
3328 MOVB $0xf0, (AX)
3329 MOVB DI, 1(AX)
3330 ADDQ $0x02, AX
3331 CMPL DI, $0x40
3332 JB memmove_match_emit_encodeBlockAsm12B
3333 JMP memmove_long_match_emit_encodeBlockAsm12B
3334
3335one_byte_match_emit_encodeBlockAsm12B:
3336 SHLB $0x02, DI
3337 MOVB DI, (AX)
3338 ADDQ $0x01, AX
3339
3340memmove_match_emit_encodeBlockAsm12B:
3341 LEAQ (AX)(R8*1), DI
3342
3343 // genMemMoveShort
3344 CMPQ R8, $0x08
3345 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
3346 CMPQ R8, $0x10
3347 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
3348 CMPQ R8, $0x20
3349 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
3350 JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64
3351
3352emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
3353 MOVQ (SI), R9
3354 MOVQ R9, (AX)
3355 JMP memmove_end_copy_match_emit_encodeBlockAsm12B
3356
3357emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
3358 MOVQ (SI), R9
3359 MOVQ -8(SI)(R8*1), SI
3360 MOVQ R9, (AX)
3361 MOVQ SI, -8(AX)(R8*1)
3362 JMP memmove_end_copy_match_emit_encodeBlockAsm12B
3363
3364emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
3365 MOVOU (SI), X0
3366 MOVOU -16(SI)(R8*1), X1
3367 MOVOU X0, (AX)
3368 MOVOU X1, -16(AX)(R8*1)
3369 JMP memmove_end_copy_match_emit_encodeBlockAsm12B
3370
3371emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
3372 MOVOU (SI), X0
3373 MOVOU 16(SI), X1
3374 MOVOU -32(SI)(R8*1), X2
3375 MOVOU -16(SI)(R8*1), X3
3376 MOVOU X0, (AX)
3377 MOVOU X1, 16(AX)
3378 MOVOU X2, -32(AX)(R8*1)
3379 MOVOU X3, -16(AX)(R8*1)
3380
3381memmove_end_copy_match_emit_encodeBlockAsm12B:
3382 MOVQ DI, AX
3383 JMP emit_literal_done_match_emit_encodeBlockAsm12B
3384
3385memmove_long_match_emit_encodeBlockAsm12B:
3386 LEAQ (AX)(R8*1), DI
3387
3388 // genMemMoveLong
3389 MOVOU (SI), X0
3390 MOVOU 16(SI), X1
3391 MOVOU -32(SI)(R8*1), X2
3392 MOVOU -16(SI)(R8*1), X3
3393 MOVQ R8, R10
3394 SHRQ $0x05, R10
3395 MOVQ AX, R9
3396 ANDL $0x0000001f, R9
3397 MOVQ $0x00000040, R11
3398 SUBQ R9, R11
3399 DECQ R10
3400 JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
3401 LEAQ -32(SI)(R11*1), R9
3402 LEAQ -32(AX)(R11*1), R12
3403
3404emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
3405 MOVOU (R9), X4
3406 MOVOU 16(R9), X5
3407 MOVOA X4, (R12)
3408 MOVOA X5, 16(R12)
3409 ADDQ $0x20, R12
3410 ADDQ $0x20, R9
3411 ADDQ $0x20, R11
3412 DECQ R10
3413 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back
3414
3415emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
3416 MOVOU -32(SI)(R11*1), X4
3417 MOVOU -16(SI)(R11*1), X5
3418 MOVOA X4, -32(AX)(R11*1)
3419 MOVOA X5, -16(AX)(R11*1)
3420 ADDQ $0x20, R11
3421 CMPQ R8, R11
3422 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
3423 MOVOU X0, (AX)
3424 MOVOU X1, 16(AX)
3425 MOVOU X2, -32(AX)(R8*1)
3426 MOVOU X3, -16(AX)(R8*1)
3427 MOVQ DI, AX
3428
3429emit_literal_done_match_emit_encodeBlockAsm12B:
3430match_nolit_loop_encodeBlockAsm12B:
3431 MOVL CX, SI
3432 SUBL BX, SI
3433 MOVL SI, 16(SP)
3434 ADDL $0x04, CX
3435 ADDL $0x04, BX
3436 MOVQ src_len+32(FP), SI
3437 SUBL CX, SI
3438 LEAQ (DX)(CX*1), DI
3439 LEAQ (DX)(BX*1), BX
3440
3441 // matchLen
3442 XORL R9, R9
3443
3444matchlen_loopback_16_match_nolit_encodeBlockAsm12B:
3445 CMPL SI, $0x10
3446 JB matchlen_match8_match_nolit_encodeBlockAsm12B
3447 MOVQ (DI)(R9*1), R8
3448 MOVQ 8(DI)(R9*1), R10
3449 XORQ (BX)(R9*1), R8
3450 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B
3451 XORQ 8(BX)(R9*1), R10
3452 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm12B
3453 LEAL -16(SI), SI
3454 LEAL 16(R9), R9
3455 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm12B
3456
3457matchlen_bsf_16match_nolit_encodeBlockAsm12B:
3458#ifdef GOAMD64_v3
3459 TZCNTQ R10, R10
3460
3461#else
3462 BSFQ R10, R10
3463
3464#endif
3465 SARQ $0x03, R10
3466 LEAL 8(R9)(R10*1), R9
3467 JMP match_nolit_end_encodeBlockAsm12B
3468
3469matchlen_match8_match_nolit_encodeBlockAsm12B:
3470 CMPL SI, $0x08
3471 JB matchlen_match4_match_nolit_encodeBlockAsm12B
3472 MOVQ (DI)(R9*1), R8
3473 XORQ (BX)(R9*1), R8
3474 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B
3475 LEAL -8(SI), SI
3476 LEAL 8(R9), R9
3477 JMP matchlen_match4_match_nolit_encodeBlockAsm12B
3478
3479matchlen_bsf_8_match_nolit_encodeBlockAsm12B:
3480#ifdef GOAMD64_v3
3481 TZCNTQ R8, R8
3482
3483#else
3484 BSFQ R8, R8
3485
3486#endif
3487 SARQ $0x03, R8
3488 LEAL (R9)(R8*1), R9
3489 JMP match_nolit_end_encodeBlockAsm12B
3490
3491matchlen_match4_match_nolit_encodeBlockAsm12B:
3492 CMPL SI, $0x04
3493 JB matchlen_match2_match_nolit_encodeBlockAsm12B
3494 MOVL (DI)(R9*1), R8
3495 CMPL (BX)(R9*1), R8
3496 JNE matchlen_match2_match_nolit_encodeBlockAsm12B
3497 LEAL -4(SI), SI
3498 LEAL 4(R9), R9
3499
3500matchlen_match2_match_nolit_encodeBlockAsm12B:
3501 CMPL SI, $0x01
3502 JE matchlen_match1_match_nolit_encodeBlockAsm12B
3503 JB match_nolit_end_encodeBlockAsm12B
3504 MOVW (DI)(R9*1), R8
3505 CMPW (BX)(R9*1), R8
3506 JNE matchlen_match1_match_nolit_encodeBlockAsm12B
3507 LEAL 2(R9), R9
3508 SUBL $0x02, SI
3509 JZ match_nolit_end_encodeBlockAsm12B
3510
3511matchlen_match1_match_nolit_encodeBlockAsm12B:
3512 MOVB (DI)(R9*1), R8
3513 CMPB (BX)(R9*1), R8
3514 JNE match_nolit_end_encodeBlockAsm12B
3515 LEAL 1(R9), R9
3516
3517match_nolit_end_encodeBlockAsm12B:
3518 ADDL R9, CX
3519 MOVL 16(SP), BX
3520 ADDL $0x04, R9
3521 MOVL CX, 12(SP)
3522
3523 // emitCopy
3524 CMPL R9, $0x40
3525 JBE two_byte_offset_short_match_nolit_encodeBlockAsm12B
3526 CMPL BX, $0x00000800
3527 JAE long_offset_short_match_nolit_encodeBlockAsm12B
3528 MOVL $0x00000001, SI
3529 LEAL 16(SI), SI
3530 MOVB BL, 1(AX)
3531 SHRL $0x08, BX
3532 SHLL $0x05, BX
3533 ORL BX, SI
3534 MOVB SI, (AX)
3535 ADDQ $0x02, AX
3536 SUBL $0x08, R9
3537
3538 // emitRepeat
3539 LEAL -4(R9), R9
3540 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3541 MOVL R9, SI
3542 LEAL -4(R9), R9
3543 CMPL SI, $0x08
3544 JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3545 CMPL SI, $0x0c
3546 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3547 CMPL BX, $0x00000800
3548 JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3549
3550cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
3551 CMPL R9, $0x00000104
3552 JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3553 LEAL -256(R9), R9
3554 MOVW $0x0019, (AX)
3555 MOVW R9, 2(AX)
3556 ADDQ $0x04, AX
3557 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3558
3559repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
3560 LEAL -4(R9), R9
3561 MOVW $0x0015, (AX)
3562 MOVB R9, 2(AX)
3563 ADDQ $0x03, AX
3564 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3565
3566repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
3567 SHLL $0x02, R9
3568 ORL $0x01, R9
3569 MOVW R9, (AX)
3570 ADDQ $0x02, AX
3571 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3572
3573repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
3574 XORQ SI, SI
3575 LEAL 1(SI)(R9*4), R9
3576 MOVB BL, 1(AX)
3577 SARL $0x08, BX
3578 SHLL $0x05, BX
3579 ORL BX, R9
3580 MOVB R9, (AX)
3581 ADDQ $0x02, AX
3582 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3583
3584long_offset_short_match_nolit_encodeBlockAsm12B:
3585 MOVB $0xee, (AX)
3586 MOVW BX, 1(AX)
3587 LEAL -60(R9), R9
3588 ADDQ $0x03, AX
3589
3590 // emitRepeat
3591 MOVL R9, SI
3592 LEAL -4(R9), R9
3593 CMPL SI, $0x08
3594 JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
3595 CMPL SI, $0x0c
3596 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
3597 CMPL BX, $0x00000800
3598 JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
3599
3600cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
3601 CMPL R9, $0x00000104
3602 JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
3603 LEAL -256(R9), R9
3604 MOVW $0x0019, (AX)
3605 MOVW R9, 2(AX)
3606 ADDQ $0x04, AX
3607 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3608
3609repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
3610 LEAL -4(R9), R9
3611 MOVW $0x0015, (AX)
3612 MOVB R9, 2(AX)
3613 ADDQ $0x03, AX
3614 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3615
3616repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
3617 SHLL $0x02, R9
3618 ORL $0x01, R9
3619 MOVW R9, (AX)
3620 ADDQ $0x02, AX
3621 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3622
3623repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
3624 XORQ SI, SI
3625 LEAL 1(SI)(R9*4), R9
3626 MOVB BL, 1(AX)
3627 SARL $0x08, BX
3628 SHLL $0x05, BX
3629 ORL BX, R9
3630 MOVB R9, (AX)
3631 ADDQ $0x02, AX
3632 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3633
3634two_byte_offset_short_match_nolit_encodeBlockAsm12B:
3635 MOVL R9, SI
3636 SHLL $0x02, SI
3637 CMPL R9, $0x0c
3638 JAE emit_copy_three_match_nolit_encodeBlockAsm12B
3639 CMPL BX, $0x00000800
3640 JAE emit_copy_three_match_nolit_encodeBlockAsm12B
3641 LEAL -15(SI), SI
3642 MOVB BL, 1(AX)
3643 SHRL $0x08, BX
3644 SHLL $0x05, BX
3645 ORL BX, SI
3646 MOVB SI, (AX)
3647 ADDQ $0x02, AX
3648 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3649
3650emit_copy_three_match_nolit_encodeBlockAsm12B:
3651 LEAL -2(SI), SI
3652 MOVB SI, (AX)
3653 MOVW BX, 1(AX)
3654 ADDQ $0x03, AX
3655
3656match_nolit_emitcopy_end_encodeBlockAsm12B:
3657 CMPL CX, 8(SP)
3658 JAE emit_remainder_encodeBlockAsm12B
3659 MOVQ -2(DX)(CX*1), SI
3660 CMPQ AX, (SP)
3661 JB match_nolit_dst_ok_encodeBlockAsm12B
3662 MOVQ $0x00000000, ret+48(FP)
3663 RET
3664
3665match_nolit_dst_ok_encodeBlockAsm12B:
3666 MOVQ $0x000000cf1bbcdcbb, R8
3667 MOVQ SI, DI
3668 SHRQ $0x10, SI
3669 MOVQ SI, BX
3670 SHLQ $0x18, DI
3671 IMULQ R8, DI
3672 SHRQ $0x34, DI
3673 SHLQ $0x18, BX
3674 IMULQ R8, BX
3675 SHRQ $0x34, BX
3676 LEAL -2(CX), R8
3677 LEAQ 24(SP)(BX*4), R9
3678 MOVL (R9), BX
3679 MOVL R8, 24(SP)(DI*4)
3680 MOVL CX, (R9)
3681 CMPL (DX)(BX*1), SI
3682 JEQ match_nolit_loop_encodeBlockAsm12B
3683 INCL CX
3684 JMP search_loop_encodeBlockAsm12B
3685
3686emit_remainder_encodeBlockAsm12B:
3687 MOVQ src_len+32(FP), CX
3688 SUBL 12(SP), CX
3689 LEAQ 3(AX)(CX*1), CX
3690 CMPQ CX, (SP)
3691 JB emit_remainder_ok_encodeBlockAsm12B
3692 MOVQ $0x00000000, ret+48(FP)
3693 RET
3694
3695emit_remainder_ok_encodeBlockAsm12B:
3696 MOVQ src_len+32(FP), CX
3697 MOVL 12(SP), BX
3698 CMPL BX, CX
3699 JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B
3700 MOVL CX, SI
3701 MOVL CX, 12(SP)
3702 LEAQ (DX)(BX*1), CX
3703 SUBL BX, SI
3704 LEAL -1(SI), DX
3705 CMPL DX, $0x3c
3706 JB one_byte_emit_remainder_encodeBlockAsm12B
3707 CMPL DX, $0x00000100
3708 JB two_bytes_emit_remainder_encodeBlockAsm12B
3709 JB three_bytes_emit_remainder_encodeBlockAsm12B
3710
3711three_bytes_emit_remainder_encodeBlockAsm12B:
3712 MOVB $0xf4, (AX)
3713 MOVW DX, 1(AX)
3714 ADDQ $0x03, AX
3715 JMP memmove_long_emit_remainder_encodeBlockAsm12B
3716
3717two_bytes_emit_remainder_encodeBlockAsm12B:
3718 MOVB $0xf0, (AX)
3719 MOVB DL, 1(AX)
3720 ADDQ $0x02, AX
3721 CMPL DX, $0x40
3722 JB memmove_emit_remainder_encodeBlockAsm12B
3723 JMP memmove_long_emit_remainder_encodeBlockAsm12B
3724
3725one_byte_emit_remainder_encodeBlockAsm12B:
3726 SHLB $0x02, DL
3727 MOVB DL, (AX)
3728 ADDQ $0x01, AX
3729
3730memmove_emit_remainder_encodeBlockAsm12B:
3731 LEAQ (AX)(SI*1), DX
3732 MOVL SI, BX
3733
3734 // genMemMoveShort
3735 CMPQ BX, $0x03
3736 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2
3737 JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3
3738 CMPQ BX, $0x08
3739 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7
3740 CMPQ BX, $0x10
3741 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
3742 CMPQ BX, $0x20
3743 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
3744 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64
3745
3746emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2:
3747 MOVB (CX), SI
3748 MOVB -1(CX)(BX*1), CL
3749 MOVB SI, (AX)
3750 MOVB CL, -1(AX)(BX*1)
3751 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3752
3753emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3:
3754 MOVW (CX), SI
3755 MOVB 2(CX), CL
3756 MOVW SI, (AX)
3757 MOVB CL, 2(AX)
3758 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3759
3760emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7:
3761 MOVL (CX), SI
3762 MOVL -4(CX)(BX*1), CX
3763 MOVL SI, (AX)
3764 MOVL CX, -4(AX)(BX*1)
3765 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3766
3767emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
3768 MOVQ (CX), SI
3769 MOVQ -8(CX)(BX*1), CX
3770 MOVQ SI, (AX)
3771 MOVQ CX, -8(AX)(BX*1)
3772 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3773
3774emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
3775 MOVOU (CX), X0
3776 MOVOU -16(CX)(BX*1), X1
3777 MOVOU X0, (AX)
3778 MOVOU X1, -16(AX)(BX*1)
3779 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3780
3781emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
3782 MOVOU (CX), X0
3783 MOVOU 16(CX), X1
3784 MOVOU -32(CX)(BX*1), X2
3785 MOVOU -16(CX)(BX*1), X3
3786 MOVOU X0, (AX)
3787 MOVOU X1, 16(AX)
3788 MOVOU X2, -32(AX)(BX*1)
3789 MOVOU X3, -16(AX)(BX*1)
3790
3791memmove_end_copy_emit_remainder_encodeBlockAsm12B:
3792 MOVQ DX, AX
3793 JMP emit_literal_done_emit_remainder_encodeBlockAsm12B
3794
3795memmove_long_emit_remainder_encodeBlockAsm12B:
3796 LEAQ (AX)(SI*1), DX
3797 MOVL SI, BX
3798
3799 // genMemMoveLong
3800 MOVOU (CX), X0
3801 MOVOU 16(CX), X1
3802 MOVOU -32(CX)(BX*1), X2
3803 MOVOU -16(CX)(BX*1), X3
3804 MOVQ BX, DI
3805 SHRQ $0x05, DI
3806 MOVQ AX, SI
3807 ANDL $0x0000001f, SI
3808 MOVQ $0x00000040, R8
3809 SUBQ SI, R8
3810 DECQ DI
3811 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
3812 LEAQ -32(CX)(R8*1), SI
3813 LEAQ -32(AX)(R8*1), R9
3814
3815emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
3816 MOVOU (SI), X4
3817 MOVOU 16(SI), X5
3818 MOVOA X4, (R9)
3819 MOVOA X5, 16(R9)
3820 ADDQ $0x20, R9
3821 ADDQ $0x20, SI
3822 ADDQ $0x20, R8
3823 DECQ DI
3824 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back
3825
3826emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
3827 MOVOU -32(CX)(R8*1), X4
3828 MOVOU -16(CX)(R8*1), X5
3829 MOVOA X4, -32(AX)(R8*1)
3830 MOVOA X5, -16(AX)(R8*1)
3831 ADDQ $0x20, R8
3832 CMPQ BX, R8
3833 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
3834 MOVOU X0, (AX)
3835 MOVOU X1, 16(AX)
3836 MOVOU X2, -32(AX)(BX*1)
3837 MOVOU X3, -16(AX)(BX*1)
3838 MOVQ DX, AX
3839
3840emit_literal_done_emit_remainder_encodeBlockAsm12B:
3841 MOVQ dst_base+0(FP), CX
3842 SUBQ CX, AX
3843 MOVQ AX, ret+48(FP)
3844 RET
3845
3846// func encodeBlockAsm10B(dst []byte, src []byte) int
3847// Requires: BMI, SSE2
3848TEXT ·encodeBlockAsm10B(SB), $4120-56
3849 MOVQ dst_base+0(FP), AX
3850 MOVQ $0x00000020, CX
3851 LEAQ 24(SP), DX
3852 PXOR X0, X0
3853
3854zero_loop_encodeBlockAsm10B:
3855 MOVOU X0, (DX)
3856 MOVOU X0, 16(DX)
3857 MOVOU X0, 32(DX)
3858 MOVOU X0, 48(DX)
3859 MOVOU X0, 64(DX)
3860 MOVOU X0, 80(DX)
3861 MOVOU X0, 96(DX)
3862 MOVOU X0, 112(DX)
3863 ADDQ $0x80, DX
3864 DECQ CX
3865 JNZ zero_loop_encodeBlockAsm10B
3866 MOVL $0x00000000, 12(SP)
3867 MOVQ src_len+32(FP), CX
3868 LEAQ -9(CX), DX
3869 LEAQ -8(CX), BX
3870 MOVL BX, 8(SP)
3871 SHRQ $0x05, CX
3872 SUBL CX, DX
3873 LEAQ (AX)(DX*1), DX
3874 MOVQ DX, (SP)
3875 MOVL $0x00000001, CX
3876 MOVL CX, 16(SP)
3877 MOVQ src_base+24(FP), DX
3878
3879search_loop_encodeBlockAsm10B:
3880 MOVL CX, BX
3881 SUBL 12(SP), BX
3882 SHRL $0x05, BX
3883 LEAL 4(CX)(BX*1), BX
3884 CMPL BX, 8(SP)
3885 JAE emit_remainder_encodeBlockAsm10B
3886 MOVQ (DX)(CX*1), SI
3887 MOVL BX, 20(SP)
3888 MOVQ $0x9e3779b1, R8
3889 MOVQ SI, R9
3890 MOVQ SI, R10
3891 SHRQ $0x08, R10
3892 SHLQ $0x20, R9
3893 IMULQ R8, R9
3894 SHRQ $0x36, R9
3895 SHLQ $0x20, R10
3896 IMULQ R8, R10
3897 SHRQ $0x36, R10
3898 MOVL 24(SP)(R9*4), BX
3899 MOVL 24(SP)(R10*4), DI
3900 MOVL CX, 24(SP)(R9*4)
3901 LEAL 1(CX), R9
3902 MOVL R9, 24(SP)(R10*4)
3903 MOVQ SI, R9
3904 SHRQ $0x10, R9
3905 SHLQ $0x20, R9
3906 IMULQ R8, R9
3907 SHRQ $0x36, R9
3908 MOVL CX, R8
3909 SUBL 16(SP), R8
3910 MOVL 1(DX)(R8*1), R10
3911 MOVQ SI, R8
3912 SHRQ $0x08, R8
3913 CMPL R8, R10
3914 JNE no_repeat_found_encodeBlockAsm10B
3915 LEAL 1(CX), SI
3916 MOVL 12(SP), DI
3917 MOVL SI, BX
3918 SUBL 16(SP), BX
3919 JZ repeat_extend_back_end_encodeBlockAsm10B
3920
3921repeat_extend_back_loop_encodeBlockAsm10B:
3922 CMPL SI, DI
3923 JBE repeat_extend_back_end_encodeBlockAsm10B
3924 MOVB -1(DX)(BX*1), R8
3925 MOVB -1(DX)(SI*1), R9
3926 CMPB R8, R9
3927 JNE repeat_extend_back_end_encodeBlockAsm10B
3928 LEAL -1(SI), SI
3929 DECL BX
3930 JNZ repeat_extend_back_loop_encodeBlockAsm10B
3931
3932repeat_extend_back_end_encodeBlockAsm10B:
3933 MOVL SI, BX
3934 SUBL 12(SP), BX
3935 LEAQ 3(AX)(BX*1), BX
3936 CMPQ BX, (SP)
3937 JB repeat_dst_size_check_encodeBlockAsm10B
3938 MOVQ $0x00000000, ret+48(FP)
3939 RET
3940
3941repeat_dst_size_check_encodeBlockAsm10B:
3942 MOVL 12(SP), BX
3943 CMPL BX, SI
3944 JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B
3945 MOVL SI, R8
3946 MOVL SI, 12(SP)
3947 LEAQ (DX)(BX*1), R9
3948 SUBL BX, R8
3949 LEAL -1(R8), BX
3950 CMPL BX, $0x3c
3951 JB one_byte_repeat_emit_encodeBlockAsm10B
3952 CMPL BX, $0x00000100
3953 JB two_bytes_repeat_emit_encodeBlockAsm10B
3954 JB three_bytes_repeat_emit_encodeBlockAsm10B
3955
3956three_bytes_repeat_emit_encodeBlockAsm10B:
3957 MOVB $0xf4, (AX)
3958 MOVW BX, 1(AX)
3959 ADDQ $0x03, AX
3960 JMP memmove_long_repeat_emit_encodeBlockAsm10B
3961
3962two_bytes_repeat_emit_encodeBlockAsm10B:
3963 MOVB $0xf0, (AX)
3964 MOVB BL, 1(AX)
3965 ADDQ $0x02, AX
3966 CMPL BX, $0x40
3967 JB memmove_repeat_emit_encodeBlockAsm10B
3968 JMP memmove_long_repeat_emit_encodeBlockAsm10B
3969
3970one_byte_repeat_emit_encodeBlockAsm10B:
3971 SHLB $0x02, BL
3972 MOVB BL, (AX)
3973 ADDQ $0x01, AX
3974
3975memmove_repeat_emit_encodeBlockAsm10B:
3976 LEAQ (AX)(R8*1), BX
3977
3978 // genMemMoveShort
3979 CMPQ R8, $0x08
3980 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
3981 CMPQ R8, $0x10
3982 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
3983 CMPQ R8, $0x20
3984 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
3985 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
3986
3987emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
3988 MOVQ (R9), R10
3989 MOVQ R10, (AX)
3990 JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
3991
3992emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
3993 MOVQ (R9), R10
3994 MOVQ -8(R9)(R8*1), R9
3995 MOVQ R10, (AX)
3996 MOVQ R9, -8(AX)(R8*1)
3997 JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
3998
3999emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
4000 MOVOU (R9), X0
4001 MOVOU -16(R9)(R8*1), X1
4002 MOVOU X0, (AX)
4003 MOVOU X1, -16(AX)(R8*1)
4004 JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
4005
4006emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
4007 MOVOU (R9), X0
4008 MOVOU 16(R9), X1
4009 MOVOU -32(R9)(R8*1), X2
4010 MOVOU -16(R9)(R8*1), X3
4011 MOVOU X0, (AX)
4012 MOVOU X1, 16(AX)
4013 MOVOU X2, -32(AX)(R8*1)
4014 MOVOU X3, -16(AX)(R8*1)
4015
4016memmove_end_copy_repeat_emit_encodeBlockAsm10B:
4017 MOVQ BX, AX
4018 JMP emit_literal_done_repeat_emit_encodeBlockAsm10B
4019
4020memmove_long_repeat_emit_encodeBlockAsm10B:
4021 LEAQ (AX)(R8*1), BX
4022
4023 // genMemMoveLong
4024 MOVOU (R9), X0
4025 MOVOU 16(R9), X1
4026 MOVOU -32(R9)(R8*1), X2
4027 MOVOU -16(R9)(R8*1), X3
4028 MOVQ R8, R11
4029 SHRQ $0x05, R11
4030 MOVQ AX, R10
4031 ANDL $0x0000001f, R10
4032 MOVQ $0x00000040, R12
4033 SUBQ R10, R12
4034 DECQ R11
4035 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
4036 LEAQ -32(R9)(R12*1), R10
4037 LEAQ -32(AX)(R12*1), R13
4038
4039emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
4040 MOVOU (R10), X4
4041 MOVOU 16(R10), X5
4042 MOVOA X4, (R13)
4043 MOVOA X5, 16(R13)
4044 ADDQ $0x20, R13
4045 ADDQ $0x20, R10
4046 ADDQ $0x20, R12
4047 DECQ R11
4048 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
4049
4050emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
4051 MOVOU -32(R9)(R12*1), X4
4052 MOVOU -16(R9)(R12*1), X5
4053 MOVOA X4, -32(AX)(R12*1)
4054 MOVOA X5, -16(AX)(R12*1)
4055 ADDQ $0x20, R12
4056 CMPQ R8, R12
4057 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
4058 MOVOU X0, (AX)
4059 MOVOU X1, 16(AX)
4060 MOVOU X2, -32(AX)(R8*1)
4061 MOVOU X3, -16(AX)(R8*1)
4062 MOVQ BX, AX
4063
4064emit_literal_done_repeat_emit_encodeBlockAsm10B:
4065 ADDL $0x05, CX
4066 MOVL CX, BX
4067 SUBL 16(SP), BX
4068 MOVQ src_len+32(FP), R8
4069 SUBL CX, R8
4070 LEAQ (DX)(CX*1), R9
4071 LEAQ (DX)(BX*1), BX
4072
4073 // matchLen
4074 XORL R11, R11
4075
4076matchlen_loopback_16_repeat_extend_encodeBlockAsm10B:
4077 CMPL R8, $0x10
4078 JB matchlen_match8_repeat_extend_encodeBlockAsm10B
4079 MOVQ (R9)(R11*1), R10
4080 MOVQ 8(R9)(R11*1), R12
4081 XORQ (BX)(R11*1), R10
4082 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B
4083 XORQ 8(BX)(R11*1), R12
4084 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm10B
4085 LEAL -16(R8), R8
4086 LEAL 16(R11), R11
4087 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm10B
4088
4089matchlen_bsf_16repeat_extend_encodeBlockAsm10B:
4090#ifdef GOAMD64_v3
4091 TZCNTQ R12, R12
4092
4093#else
4094 BSFQ R12, R12
4095
4096#endif
4097 SARQ $0x03, R12
4098 LEAL 8(R11)(R12*1), R11
4099 JMP repeat_extend_forward_end_encodeBlockAsm10B
4100
4101matchlen_match8_repeat_extend_encodeBlockAsm10B:
4102 CMPL R8, $0x08
4103 JB matchlen_match4_repeat_extend_encodeBlockAsm10B
4104 MOVQ (R9)(R11*1), R10
4105 XORQ (BX)(R11*1), R10
4106 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B
4107 LEAL -8(R8), R8
4108 LEAL 8(R11), R11
4109 JMP matchlen_match4_repeat_extend_encodeBlockAsm10B
4110
4111matchlen_bsf_8_repeat_extend_encodeBlockAsm10B:
4112#ifdef GOAMD64_v3
4113 TZCNTQ R10, R10
4114
4115#else
4116 BSFQ R10, R10
4117
4118#endif
4119 SARQ $0x03, R10
4120 LEAL (R11)(R10*1), R11
4121 JMP repeat_extend_forward_end_encodeBlockAsm10B
4122
4123matchlen_match4_repeat_extend_encodeBlockAsm10B:
4124 CMPL R8, $0x04
4125 JB matchlen_match2_repeat_extend_encodeBlockAsm10B
4126 MOVL (R9)(R11*1), R10
4127 CMPL (BX)(R11*1), R10
4128 JNE matchlen_match2_repeat_extend_encodeBlockAsm10B
4129 LEAL -4(R8), R8
4130 LEAL 4(R11), R11
4131
4132matchlen_match2_repeat_extend_encodeBlockAsm10B:
4133 CMPL R8, $0x01
4134 JE matchlen_match1_repeat_extend_encodeBlockAsm10B
4135 JB repeat_extend_forward_end_encodeBlockAsm10B
4136 MOVW (R9)(R11*1), R10
4137 CMPW (BX)(R11*1), R10
4138 JNE matchlen_match1_repeat_extend_encodeBlockAsm10B
4139 LEAL 2(R11), R11
4140 SUBL $0x02, R8
4141 JZ repeat_extend_forward_end_encodeBlockAsm10B
4142
4143matchlen_match1_repeat_extend_encodeBlockAsm10B:
4144 MOVB (R9)(R11*1), R10
4145 CMPB (BX)(R11*1), R10
4146 JNE repeat_extend_forward_end_encodeBlockAsm10B
4147 LEAL 1(R11), R11
4148
4149repeat_extend_forward_end_encodeBlockAsm10B:
4150 ADDL R11, CX
4151 MOVL CX, BX
4152 SUBL SI, BX
4153 MOVL 16(SP), SI
4154 TESTL DI, DI
4155 JZ repeat_as_copy_encodeBlockAsm10B
4156
4157 // emitRepeat
4158 MOVL BX, DI
4159 LEAL -4(BX), BX
4160 CMPL DI, $0x08
4161 JBE repeat_two_match_repeat_encodeBlockAsm10B
4162 CMPL DI, $0x0c
4163 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
4164 CMPL SI, $0x00000800
4165 JB repeat_two_offset_match_repeat_encodeBlockAsm10B
4166
4167cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
4168 CMPL BX, $0x00000104
4169 JB repeat_three_match_repeat_encodeBlockAsm10B
4170 LEAL -256(BX), BX
4171 MOVW $0x0019, (AX)
4172 MOVW BX, 2(AX)
4173 ADDQ $0x04, AX
4174 JMP repeat_end_emit_encodeBlockAsm10B
4175
4176repeat_three_match_repeat_encodeBlockAsm10B:
4177 LEAL -4(BX), BX
4178 MOVW $0x0015, (AX)
4179 MOVB BL, 2(AX)
4180 ADDQ $0x03, AX
4181 JMP repeat_end_emit_encodeBlockAsm10B
4182
4183repeat_two_match_repeat_encodeBlockAsm10B:
4184 SHLL $0x02, BX
4185 ORL $0x01, BX
4186 MOVW BX, (AX)
4187 ADDQ $0x02, AX
4188 JMP repeat_end_emit_encodeBlockAsm10B
4189
4190repeat_two_offset_match_repeat_encodeBlockAsm10B:
4191 XORQ DI, DI
4192 LEAL 1(DI)(BX*4), BX
4193 MOVB SI, 1(AX)
4194 SARL $0x08, SI
4195 SHLL $0x05, SI
4196 ORL SI, BX
4197 MOVB BL, (AX)
4198 ADDQ $0x02, AX
4199 JMP repeat_end_emit_encodeBlockAsm10B
4200
4201repeat_as_copy_encodeBlockAsm10B:
4202 // emitCopy
4203 CMPL BX, $0x40
4204 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
4205 CMPL SI, $0x00000800
4206 JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B
4207 MOVL $0x00000001, DI
4208 LEAL 16(DI), DI
4209 MOVB SI, 1(AX)
4210 SHRL $0x08, SI
4211 SHLL $0x05, SI
4212 ORL SI, DI
4213 MOVB DI, (AX)
4214 ADDQ $0x02, AX
4215 SUBL $0x08, BX
4216
4217 // emitRepeat
4218 LEAL -4(BX), BX
4219 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4220 MOVL BX, DI
4221 LEAL -4(BX), BX
4222 CMPL DI, $0x08
4223 JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4224 CMPL DI, $0x0c
4225 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4226 CMPL SI, $0x00000800
4227 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4228
4229cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
4230 CMPL BX, $0x00000104
4231 JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4232 LEAL -256(BX), BX
4233 MOVW $0x0019, (AX)
4234 MOVW BX, 2(AX)
4235 ADDQ $0x04, AX
4236 JMP repeat_end_emit_encodeBlockAsm10B
4237
4238repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
4239 LEAL -4(BX), BX
4240 MOVW $0x0015, (AX)
4241 MOVB BL, 2(AX)
4242 ADDQ $0x03, AX
4243 JMP repeat_end_emit_encodeBlockAsm10B
4244
4245repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
4246 SHLL $0x02, BX
4247 ORL $0x01, BX
4248 MOVW BX, (AX)
4249 ADDQ $0x02, AX
4250 JMP repeat_end_emit_encodeBlockAsm10B
4251
4252repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
4253 XORQ DI, DI
4254 LEAL 1(DI)(BX*4), BX
4255 MOVB SI, 1(AX)
4256 SARL $0x08, SI
4257 SHLL $0x05, SI
4258 ORL SI, BX
4259 MOVB BL, (AX)
4260 ADDQ $0x02, AX
4261 JMP repeat_end_emit_encodeBlockAsm10B
4262
4263long_offset_short_repeat_as_copy_encodeBlockAsm10B:
4264 MOVB $0xee, (AX)
4265 MOVW SI, 1(AX)
4266 LEAL -60(BX), BX
4267 ADDQ $0x03, AX
4268
4269 // emitRepeat
4270 MOVL BX, DI
4271 LEAL -4(BX), BX
4272 CMPL DI, $0x08
4273 JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
4274 CMPL DI, $0x0c
4275 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
4276 CMPL SI, $0x00000800
4277 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
4278
4279cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
4280 CMPL BX, $0x00000104
4281 JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
4282 LEAL -256(BX), BX
4283 MOVW $0x0019, (AX)
4284 MOVW BX, 2(AX)
4285 ADDQ $0x04, AX
4286 JMP repeat_end_emit_encodeBlockAsm10B
4287
4288repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
4289 LEAL -4(BX), BX
4290 MOVW $0x0015, (AX)
4291 MOVB BL, 2(AX)
4292 ADDQ $0x03, AX
4293 JMP repeat_end_emit_encodeBlockAsm10B
4294
4295repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
4296 SHLL $0x02, BX
4297 ORL $0x01, BX
4298 MOVW BX, (AX)
4299 ADDQ $0x02, AX
4300 JMP repeat_end_emit_encodeBlockAsm10B
4301
4302repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
4303 XORQ DI, DI
4304 LEAL 1(DI)(BX*4), BX
4305 MOVB SI, 1(AX)
4306 SARL $0x08, SI
4307 SHLL $0x05, SI
4308 ORL SI, BX
4309 MOVB BL, (AX)
4310 ADDQ $0x02, AX
4311 JMP repeat_end_emit_encodeBlockAsm10B
4312
4313two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
4314 MOVL BX, DI
4315 SHLL $0x02, DI
4316 CMPL BX, $0x0c
4317 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
4318 CMPL SI, $0x00000800
4319 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
4320 LEAL -15(DI), DI
4321 MOVB SI, 1(AX)
4322 SHRL $0x08, SI
4323 SHLL $0x05, SI
4324 ORL SI, DI
4325 MOVB DI, (AX)
4326 ADDQ $0x02, AX
4327 JMP repeat_end_emit_encodeBlockAsm10B
4328
4329emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
4330 LEAL -2(DI), DI
4331 MOVB DI, (AX)
4332 MOVW SI, 1(AX)
4333 ADDQ $0x03, AX
4334
4335repeat_end_emit_encodeBlockAsm10B:
4336 MOVL CX, 12(SP)
4337 JMP search_loop_encodeBlockAsm10B
4338
4339no_repeat_found_encodeBlockAsm10B:
4340 CMPL (DX)(BX*1), SI
4341 JEQ candidate_match_encodeBlockAsm10B
4342 SHRQ $0x08, SI
4343 MOVL 24(SP)(R9*4), BX
4344 LEAL 2(CX), R8
4345 CMPL (DX)(DI*1), SI
4346 JEQ candidate2_match_encodeBlockAsm10B
4347 MOVL R8, 24(SP)(R9*4)
4348 SHRQ $0x08, SI
4349 CMPL (DX)(BX*1), SI
4350 JEQ candidate3_match_encodeBlockAsm10B
4351 MOVL 20(SP), CX
4352 JMP search_loop_encodeBlockAsm10B
4353
4354candidate3_match_encodeBlockAsm10B:
4355 ADDL $0x02, CX
4356 JMP candidate_match_encodeBlockAsm10B
4357
4358candidate2_match_encodeBlockAsm10B:
4359 MOVL R8, 24(SP)(R9*4)
4360 INCL CX
4361 MOVL DI, BX
4362
4363candidate_match_encodeBlockAsm10B:
4364 MOVL 12(SP), SI
4365 TESTL BX, BX
4366 JZ match_extend_back_end_encodeBlockAsm10B
4367
4368match_extend_back_loop_encodeBlockAsm10B:
4369 CMPL CX, SI
4370 JBE match_extend_back_end_encodeBlockAsm10B
4371 MOVB -1(DX)(BX*1), DI
4372 MOVB -1(DX)(CX*1), R8
4373 CMPB DI, R8
4374 JNE match_extend_back_end_encodeBlockAsm10B
4375 LEAL -1(CX), CX
4376 DECL BX
4377 JZ match_extend_back_end_encodeBlockAsm10B
4378 JMP match_extend_back_loop_encodeBlockAsm10B
4379
4380match_extend_back_end_encodeBlockAsm10B:
4381 MOVL CX, SI
4382 SUBL 12(SP), SI
4383 LEAQ 3(AX)(SI*1), SI
4384 CMPQ SI, (SP)
4385 JB match_dst_size_check_encodeBlockAsm10B
4386 MOVQ $0x00000000, ret+48(FP)
4387 RET
4388
4389match_dst_size_check_encodeBlockAsm10B:
4390 MOVL CX, SI
4391 MOVL 12(SP), DI
4392 CMPL DI, SI
4393 JEQ emit_literal_done_match_emit_encodeBlockAsm10B
4394 MOVL SI, R8
4395 MOVL SI, 12(SP)
4396 LEAQ (DX)(DI*1), SI
4397 SUBL DI, R8
4398 LEAL -1(R8), DI
4399 CMPL DI, $0x3c
4400 JB one_byte_match_emit_encodeBlockAsm10B
4401 CMPL DI, $0x00000100
4402 JB two_bytes_match_emit_encodeBlockAsm10B
4403 JB three_bytes_match_emit_encodeBlockAsm10B
4404
4405three_bytes_match_emit_encodeBlockAsm10B:
4406 MOVB $0xf4, (AX)
4407 MOVW DI, 1(AX)
4408 ADDQ $0x03, AX
4409 JMP memmove_long_match_emit_encodeBlockAsm10B
4410
4411two_bytes_match_emit_encodeBlockAsm10B:
4412 MOVB $0xf0, (AX)
4413 MOVB DI, 1(AX)
4414 ADDQ $0x02, AX
4415 CMPL DI, $0x40
4416 JB memmove_match_emit_encodeBlockAsm10B
4417 JMP memmove_long_match_emit_encodeBlockAsm10B
4418
4419one_byte_match_emit_encodeBlockAsm10B:
4420 SHLB $0x02, DI
4421 MOVB DI, (AX)
4422 ADDQ $0x01, AX
4423
4424memmove_match_emit_encodeBlockAsm10B:
4425 LEAQ (AX)(R8*1), DI
4426
4427 // genMemMoveShort
4428 CMPQ R8, $0x08
4429 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
4430 CMPQ R8, $0x10
4431 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
4432 CMPQ R8, $0x20
4433 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
4434 JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
4435
4436emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
4437 MOVQ (SI), R9
4438 MOVQ R9, (AX)
4439 JMP memmove_end_copy_match_emit_encodeBlockAsm10B
4440
4441emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
4442 MOVQ (SI), R9
4443 MOVQ -8(SI)(R8*1), SI
4444 MOVQ R9, (AX)
4445 MOVQ SI, -8(AX)(R8*1)
4446 JMP memmove_end_copy_match_emit_encodeBlockAsm10B
4447
4448emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
4449 MOVOU (SI), X0
4450 MOVOU -16(SI)(R8*1), X1
4451 MOVOU X0, (AX)
4452 MOVOU X1, -16(AX)(R8*1)
4453 JMP memmove_end_copy_match_emit_encodeBlockAsm10B
4454
4455emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
4456 MOVOU (SI), X0
4457 MOVOU 16(SI), X1
4458 MOVOU -32(SI)(R8*1), X2
4459 MOVOU -16(SI)(R8*1), X3
4460 MOVOU X0, (AX)
4461 MOVOU X1, 16(AX)
4462 MOVOU X2, -32(AX)(R8*1)
4463 MOVOU X3, -16(AX)(R8*1)
4464
4465memmove_end_copy_match_emit_encodeBlockAsm10B:
4466 MOVQ DI, AX
4467 JMP emit_literal_done_match_emit_encodeBlockAsm10B
4468
4469memmove_long_match_emit_encodeBlockAsm10B:
4470 LEAQ (AX)(R8*1), DI
4471
4472 // genMemMoveLong
4473 MOVOU (SI), X0
4474 MOVOU 16(SI), X1
4475 MOVOU -32(SI)(R8*1), X2
4476 MOVOU -16(SI)(R8*1), X3
4477 MOVQ R8, R10
4478 SHRQ $0x05, R10
4479 MOVQ AX, R9
4480 ANDL $0x0000001f, R9
4481 MOVQ $0x00000040, R11
4482 SUBQ R9, R11
4483 DECQ R10
4484 JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
4485 LEAQ -32(SI)(R11*1), R9
4486 LEAQ -32(AX)(R11*1), R12
4487
4488emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
4489 MOVOU (R9), X4
4490 MOVOU 16(R9), X5
4491 MOVOA X4, (R12)
4492 MOVOA X5, 16(R12)
4493 ADDQ $0x20, R12
4494 ADDQ $0x20, R9
4495 ADDQ $0x20, R11
4496 DECQ R10
4497 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
4498
4499emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
4500 MOVOU -32(SI)(R11*1), X4
4501 MOVOU -16(SI)(R11*1), X5
4502 MOVOA X4, -32(AX)(R11*1)
4503 MOVOA X5, -16(AX)(R11*1)
4504 ADDQ $0x20, R11
4505 CMPQ R8, R11
4506 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
4507 MOVOU X0, (AX)
4508 MOVOU X1, 16(AX)
4509 MOVOU X2, -32(AX)(R8*1)
4510 MOVOU X3, -16(AX)(R8*1)
4511 MOVQ DI, AX
4512
4513emit_literal_done_match_emit_encodeBlockAsm10B:
4514match_nolit_loop_encodeBlockAsm10B:
4515 MOVL CX, SI
4516 SUBL BX, SI
4517 MOVL SI, 16(SP)
4518 ADDL $0x04, CX
4519 ADDL $0x04, BX
4520 MOVQ src_len+32(FP), SI
4521 SUBL CX, SI
4522 LEAQ (DX)(CX*1), DI
4523 LEAQ (DX)(BX*1), BX
4524
4525 // matchLen
4526 XORL R9, R9
4527
4528matchlen_loopback_16_match_nolit_encodeBlockAsm10B:
4529 CMPL SI, $0x10
4530 JB matchlen_match8_match_nolit_encodeBlockAsm10B
4531 MOVQ (DI)(R9*1), R8
4532 MOVQ 8(DI)(R9*1), R10
4533 XORQ (BX)(R9*1), R8
4534 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B
4535 XORQ 8(BX)(R9*1), R10
4536 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm10B
4537 LEAL -16(SI), SI
4538 LEAL 16(R9), R9
4539 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm10B
4540
4541matchlen_bsf_16match_nolit_encodeBlockAsm10B:
4542#ifdef GOAMD64_v3
4543 TZCNTQ R10, R10
4544
4545#else
4546 BSFQ R10, R10
4547
4548#endif
4549 SARQ $0x03, R10
4550 LEAL 8(R9)(R10*1), R9
4551 JMP match_nolit_end_encodeBlockAsm10B
4552
4553matchlen_match8_match_nolit_encodeBlockAsm10B:
4554 CMPL SI, $0x08
4555 JB matchlen_match4_match_nolit_encodeBlockAsm10B
4556 MOVQ (DI)(R9*1), R8
4557 XORQ (BX)(R9*1), R8
4558 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B
4559 LEAL -8(SI), SI
4560 LEAL 8(R9), R9
4561 JMP matchlen_match4_match_nolit_encodeBlockAsm10B
4562
4563matchlen_bsf_8_match_nolit_encodeBlockAsm10B:
4564#ifdef GOAMD64_v3
4565 TZCNTQ R8, R8
4566
4567#else
4568 BSFQ R8, R8
4569
4570#endif
4571 SARQ $0x03, R8
4572 LEAL (R9)(R8*1), R9
4573 JMP match_nolit_end_encodeBlockAsm10B
4574
4575matchlen_match4_match_nolit_encodeBlockAsm10B:
4576 CMPL SI, $0x04
4577 JB matchlen_match2_match_nolit_encodeBlockAsm10B
4578 MOVL (DI)(R9*1), R8
4579 CMPL (BX)(R9*1), R8
4580 JNE matchlen_match2_match_nolit_encodeBlockAsm10B
4581 LEAL -4(SI), SI
4582 LEAL 4(R9), R9
4583
4584matchlen_match2_match_nolit_encodeBlockAsm10B:
4585 CMPL SI, $0x01
4586 JE matchlen_match1_match_nolit_encodeBlockAsm10B
4587 JB match_nolit_end_encodeBlockAsm10B
4588 MOVW (DI)(R9*1), R8
4589 CMPW (BX)(R9*1), R8
4590 JNE matchlen_match1_match_nolit_encodeBlockAsm10B
4591 LEAL 2(R9), R9
4592 SUBL $0x02, SI
4593 JZ match_nolit_end_encodeBlockAsm10B
4594
4595matchlen_match1_match_nolit_encodeBlockAsm10B:
4596 MOVB (DI)(R9*1), R8
4597 CMPB (BX)(R9*1), R8
4598 JNE match_nolit_end_encodeBlockAsm10B
4599 LEAL 1(R9), R9
4600
4601match_nolit_end_encodeBlockAsm10B:
4602 ADDL R9, CX
4603 MOVL 16(SP), BX
4604 ADDL $0x04, R9
4605 MOVL CX, 12(SP)
4606
4607 // emitCopy
4608 CMPL R9, $0x40
4609 JBE two_byte_offset_short_match_nolit_encodeBlockAsm10B
4610 CMPL BX, $0x00000800
4611 JAE long_offset_short_match_nolit_encodeBlockAsm10B
4612 MOVL $0x00000001, SI
4613 LEAL 16(SI), SI
4614 MOVB BL, 1(AX)
4615 SHRL $0x08, BX
4616 SHLL $0x05, BX
4617 ORL BX, SI
4618 MOVB SI, (AX)
4619 ADDQ $0x02, AX
4620 SUBL $0x08, R9
4621
4622 // emitRepeat
4623 LEAL -4(R9), R9
4624 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4625 MOVL R9, SI
4626 LEAL -4(R9), R9
4627 CMPL SI, $0x08
4628 JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4629 CMPL SI, $0x0c
4630 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4631 CMPL BX, $0x00000800
4632 JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4633
4634cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
4635 CMPL R9, $0x00000104
4636 JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4637 LEAL -256(R9), R9
4638 MOVW $0x0019, (AX)
4639 MOVW R9, 2(AX)
4640 ADDQ $0x04, AX
4641 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4642
4643repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
4644 LEAL -4(R9), R9
4645 MOVW $0x0015, (AX)
4646 MOVB R9, 2(AX)
4647 ADDQ $0x03, AX
4648 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4649
4650repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
4651 SHLL $0x02, R9
4652 ORL $0x01, R9
4653 MOVW R9, (AX)
4654 ADDQ $0x02, AX
4655 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4656
4657repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
4658 XORQ SI, SI
4659 LEAL 1(SI)(R9*4), R9
4660 MOVB BL, 1(AX)
4661 SARL $0x08, BX
4662 SHLL $0x05, BX
4663 ORL BX, R9
4664 MOVB R9, (AX)
4665 ADDQ $0x02, AX
4666 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4667
4668long_offset_short_match_nolit_encodeBlockAsm10B:
4669 MOVB $0xee, (AX)
4670 MOVW BX, 1(AX)
4671 LEAL -60(R9), R9
4672 ADDQ $0x03, AX
4673
4674 // emitRepeat
4675 MOVL R9, SI
4676 LEAL -4(R9), R9
4677 CMPL SI, $0x08
4678 JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
4679 CMPL SI, $0x0c
4680 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
4681 CMPL BX, $0x00000800
4682 JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
4683
4684cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
4685 CMPL R9, $0x00000104
4686 JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
4687 LEAL -256(R9), R9
4688 MOVW $0x0019, (AX)
4689 MOVW R9, 2(AX)
4690 ADDQ $0x04, AX
4691 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4692
4693repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
4694 LEAL -4(R9), R9
4695 MOVW $0x0015, (AX)
4696 MOVB R9, 2(AX)
4697 ADDQ $0x03, AX
4698 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4699
4700repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
4701 SHLL $0x02, R9
4702 ORL $0x01, R9
4703 MOVW R9, (AX)
4704 ADDQ $0x02, AX
4705 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4706
4707repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
4708 XORQ SI, SI
4709 LEAL 1(SI)(R9*4), R9
4710 MOVB BL, 1(AX)
4711 SARL $0x08, BX
4712 SHLL $0x05, BX
4713 ORL BX, R9
4714 MOVB R9, (AX)
4715 ADDQ $0x02, AX
4716 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4717
4718two_byte_offset_short_match_nolit_encodeBlockAsm10B:
4719 MOVL R9, SI
4720 SHLL $0x02, SI
4721 CMPL R9, $0x0c
4722 JAE emit_copy_three_match_nolit_encodeBlockAsm10B
4723 CMPL BX, $0x00000800
4724 JAE emit_copy_three_match_nolit_encodeBlockAsm10B
4725 LEAL -15(SI), SI
4726 MOVB BL, 1(AX)
4727 SHRL $0x08, BX
4728 SHLL $0x05, BX
4729 ORL BX, SI
4730 MOVB SI, (AX)
4731 ADDQ $0x02, AX
4732 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4733
4734emit_copy_three_match_nolit_encodeBlockAsm10B:
4735 LEAL -2(SI), SI
4736 MOVB SI, (AX)
4737 MOVW BX, 1(AX)
4738 ADDQ $0x03, AX
4739
4740match_nolit_emitcopy_end_encodeBlockAsm10B:
4741 CMPL CX, 8(SP)
4742 JAE emit_remainder_encodeBlockAsm10B
4743 MOVQ -2(DX)(CX*1), SI
4744 CMPQ AX, (SP)
4745 JB match_nolit_dst_ok_encodeBlockAsm10B
4746 MOVQ $0x00000000, ret+48(FP)
4747 RET
4748
4749match_nolit_dst_ok_encodeBlockAsm10B:
4750 MOVQ $0x9e3779b1, R8
4751 MOVQ SI, DI
4752 SHRQ $0x10, SI
4753 MOVQ SI, BX
4754 SHLQ $0x20, DI
4755 IMULQ R8, DI
4756 SHRQ $0x36, DI
4757 SHLQ $0x20, BX
4758 IMULQ R8, BX
4759 SHRQ $0x36, BX
4760 LEAL -2(CX), R8
4761 LEAQ 24(SP)(BX*4), R9
4762 MOVL (R9), BX
4763 MOVL R8, 24(SP)(DI*4)
4764 MOVL CX, (R9)
4765 CMPL (DX)(BX*1), SI
4766 JEQ match_nolit_loop_encodeBlockAsm10B
4767 INCL CX
4768 JMP search_loop_encodeBlockAsm10B
4769
4770emit_remainder_encodeBlockAsm10B:
4771 MOVQ src_len+32(FP), CX
4772 SUBL 12(SP), CX
4773 LEAQ 3(AX)(CX*1), CX
4774 CMPQ CX, (SP)
4775 JB emit_remainder_ok_encodeBlockAsm10B
4776 MOVQ $0x00000000, ret+48(FP)
4777 RET
4778
4779emit_remainder_ok_encodeBlockAsm10B:
4780 MOVQ src_len+32(FP), CX
4781 MOVL 12(SP), BX
4782 CMPL BX, CX
4783 JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B
4784 MOVL CX, SI
4785 MOVL CX, 12(SP)
4786 LEAQ (DX)(BX*1), CX
4787 SUBL BX, SI
4788 LEAL -1(SI), DX
4789 CMPL DX, $0x3c
4790 JB one_byte_emit_remainder_encodeBlockAsm10B
4791 CMPL DX, $0x00000100
4792 JB two_bytes_emit_remainder_encodeBlockAsm10B
4793 JB three_bytes_emit_remainder_encodeBlockAsm10B
4794
4795three_bytes_emit_remainder_encodeBlockAsm10B:
4796 MOVB $0xf4, (AX)
4797 MOVW DX, 1(AX)
4798 ADDQ $0x03, AX
4799 JMP memmove_long_emit_remainder_encodeBlockAsm10B
4800
4801two_bytes_emit_remainder_encodeBlockAsm10B:
4802 MOVB $0xf0, (AX)
4803 MOVB DL, 1(AX)
4804 ADDQ $0x02, AX
4805 CMPL DX, $0x40
4806 JB memmove_emit_remainder_encodeBlockAsm10B
4807 JMP memmove_long_emit_remainder_encodeBlockAsm10B
4808
4809one_byte_emit_remainder_encodeBlockAsm10B:
4810 SHLB $0x02, DL
4811 MOVB DL, (AX)
4812 ADDQ $0x01, AX
4813
4814memmove_emit_remainder_encodeBlockAsm10B:
4815 LEAQ (AX)(SI*1), DX
4816 MOVL SI, BX
4817
4818 // genMemMoveShort
4819 CMPQ BX, $0x03
4820 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2
4821 JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3
4822 CMPQ BX, $0x08
4823 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7
4824 CMPQ BX, $0x10
4825 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
4826 CMPQ BX, $0x20
4827 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
4828 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64
4829
4830emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2:
4831 MOVB (CX), SI
4832 MOVB -1(CX)(BX*1), CL
4833 MOVB SI, (AX)
4834 MOVB CL, -1(AX)(BX*1)
4835 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4836
4837emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3:
4838 MOVW (CX), SI
4839 MOVB 2(CX), CL
4840 MOVW SI, (AX)
4841 MOVB CL, 2(AX)
4842 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4843
4844emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7:
4845 MOVL (CX), SI
4846 MOVL -4(CX)(BX*1), CX
4847 MOVL SI, (AX)
4848 MOVL CX, -4(AX)(BX*1)
4849 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4850
4851emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
4852 MOVQ (CX), SI
4853 MOVQ -8(CX)(BX*1), CX
4854 MOVQ SI, (AX)
4855 MOVQ CX, -8(AX)(BX*1)
4856 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4857
4858emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
4859 MOVOU (CX), X0
4860 MOVOU -16(CX)(BX*1), X1
4861 MOVOU X0, (AX)
4862 MOVOU X1, -16(AX)(BX*1)
4863 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4864
4865emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
4866 MOVOU (CX), X0
4867 MOVOU 16(CX), X1
4868 MOVOU -32(CX)(BX*1), X2
4869 MOVOU -16(CX)(BX*1), X3
4870 MOVOU X0, (AX)
4871 MOVOU X1, 16(AX)
4872 MOVOU X2, -32(AX)(BX*1)
4873 MOVOU X3, -16(AX)(BX*1)
4874
4875memmove_end_copy_emit_remainder_encodeBlockAsm10B:
4876 MOVQ DX, AX
4877 JMP emit_literal_done_emit_remainder_encodeBlockAsm10B
4878
4879memmove_long_emit_remainder_encodeBlockAsm10B:
4880 LEAQ (AX)(SI*1), DX
4881 MOVL SI, BX
4882
4883 // genMemMoveLong
4884 MOVOU (CX), X0
4885 MOVOU 16(CX), X1
4886 MOVOU -32(CX)(BX*1), X2
4887 MOVOU -16(CX)(BX*1), X3
4888 MOVQ BX, DI
4889 SHRQ $0x05, DI
4890 MOVQ AX, SI
4891 ANDL $0x0000001f, SI
4892 MOVQ $0x00000040, R8
4893 SUBQ SI, R8
4894 DECQ DI
4895 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
4896 LEAQ -32(CX)(R8*1), SI
4897 LEAQ -32(AX)(R8*1), R9
4898
4899emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
4900 MOVOU (SI), X4
4901 MOVOU 16(SI), X5
4902 MOVOA X4, (R9)
4903 MOVOA X5, 16(R9)
4904 ADDQ $0x20, R9
4905 ADDQ $0x20, SI
4906 ADDQ $0x20, R8
4907 DECQ DI
4908 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back
4909
4910emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
4911 MOVOU -32(CX)(R8*1), X4
4912 MOVOU -16(CX)(R8*1), X5
4913 MOVOA X4, -32(AX)(R8*1)
4914 MOVOA X5, -16(AX)(R8*1)
4915 ADDQ $0x20, R8
4916 CMPQ BX, R8
4917 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
4918 MOVOU X0, (AX)
4919 MOVOU X1, 16(AX)
4920 MOVOU X2, -32(AX)(BX*1)
4921 MOVOU X3, -16(AX)(BX*1)
4922 MOVQ DX, AX
4923
4924emit_literal_done_emit_remainder_encodeBlockAsm10B:
4925 MOVQ dst_base+0(FP), CX
4926 SUBQ CX, AX
4927 MOVQ AX, ret+48(FP)
4928 RET
4929
4930// func encodeBlockAsm8B(dst []byte, src []byte) int
4931// Requires: BMI, SSE2
4932TEXT ·encodeBlockAsm8B(SB), $1048-56
4933 MOVQ dst_base+0(FP), AX
4934 MOVQ $0x00000008, CX
4935 LEAQ 24(SP), DX
4936 PXOR X0, X0
4937
4938zero_loop_encodeBlockAsm8B:
4939 MOVOU X0, (DX)
4940 MOVOU X0, 16(DX)
4941 MOVOU X0, 32(DX)
4942 MOVOU X0, 48(DX)
4943 MOVOU X0, 64(DX)
4944 MOVOU X0, 80(DX)
4945 MOVOU X0, 96(DX)
4946 MOVOU X0, 112(DX)
4947 ADDQ $0x80, DX
4948 DECQ CX
4949 JNZ zero_loop_encodeBlockAsm8B
4950 MOVL $0x00000000, 12(SP)
4951 MOVQ src_len+32(FP), CX
4952 LEAQ -9(CX), DX
4953 LEAQ -8(CX), BX
4954 MOVL BX, 8(SP)
4955 SHRQ $0x05, CX
4956 SUBL CX, DX
4957 LEAQ (AX)(DX*1), DX
4958 MOVQ DX, (SP)
4959 MOVL $0x00000001, CX
4960 MOVL CX, 16(SP)
4961 MOVQ src_base+24(FP), DX
4962
4963search_loop_encodeBlockAsm8B:
4964 MOVL CX, BX
4965 SUBL 12(SP), BX
4966 SHRL $0x04, BX
4967 LEAL 4(CX)(BX*1), BX
4968 CMPL BX, 8(SP)
4969 JAE emit_remainder_encodeBlockAsm8B
4970 MOVQ (DX)(CX*1), SI
4971 MOVL BX, 20(SP)
4972 MOVQ $0x9e3779b1, R8
4973 MOVQ SI, R9
4974 MOVQ SI, R10
4975 SHRQ $0x08, R10
4976 SHLQ $0x20, R9
4977 IMULQ R8, R9
4978 SHRQ $0x38, R9
4979 SHLQ $0x20, R10
4980 IMULQ R8, R10
4981 SHRQ $0x38, R10
4982 MOVL 24(SP)(R9*4), BX
4983 MOVL 24(SP)(R10*4), DI
4984 MOVL CX, 24(SP)(R9*4)
4985 LEAL 1(CX), R9
4986 MOVL R9, 24(SP)(R10*4)
4987 MOVQ SI, R9
4988 SHRQ $0x10, R9
4989 SHLQ $0x20, R9
4990 IMULQ R8, R9
4991 SHRQ $0x38, R9
4992 MOVL CX, R8
4993 SUBL 16(SP), R8
4994 MOVL 1(DX)(R8*1), R10
4995 MOVQ SI, R8
4996 SHRQ $0x08, R8
4997 CMPL R8, R10
4998 JNE no_repeat_found_encodeBlockAsm8B
4999 LEAL 1(CX), SI
5000 MOVL 12(SP), DI
5001 MOVL SI, BX
5002 SUBL 16(SP), BX
5003 JZ repeat_extend_back_end_encodeBlockAsm8B
5004
5005repeat_extend_back_loop_encodeBlockAsm8B:
5006 CMPL SI, DI
5007 JBE repeat_extend_back_end_encodeBlockAsm8B
5008 MOVB -1(DX)(BX*1), R8
5009 MOVB -1(DX)(SI*1), R9
5010 CMPB R8, R9
5011 JNE repeat_extend_back_end_encodeBlockAsm8B
5012 LEAL -1(SI), SI
5013 DECL BX
5014 JNZ repeat_extend_back_loop_encodeBlockAsm8B
5015
5016repeat_extend_back_end_encodeBlockAsm8B:
5017 MOVL SI, BX
5018 SUBL 12(SP), BX
5019 LEAQ 3(AX)(BX*1), BX
5020 CMPQ BX, (SP)
5021 JB repeat_dst_size_check_encodeBlockAsm8B
5022 MOVQ $0x00000000, ret+48(FP)
5023 RET
5024
5025repeat_dst_size_check_encodeBlockAsm8B:
5026 MOVL 12(SP), BX
5027 CMPL BX, SI
5028 JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B
5029 MOVL SI, R8
5030 MOVL SI, 12(SP)
5031 LEAQ (DX)(BX*1), R9
5032 SUBL BX, R8
5033 LEAL -1(R8), BX
5034 CMPL BX, $0x3c
5035 JB one_byte_repeat_emit_encodeBlockAsm8B
5036 CMPL BX, $0x00000100
5037 JB two_bytes_repeat_emit_encodeBlockAsm8B
5038 JB three_bytes_repeat_emit_encodeBlockAsm8B
5039
5040three_bytes_repeat_emit_encodeBlockAsm8B:
5041 MOVB $0xf4, (AX)
5042 MOVW BX, 1(AX)
5043 ADDQ $0x03, AX
5044 JMP memmove_long_repeat_emit_encodeBlockAsm8B
5045
5046two_bytes_repeat_emit_encodeBlockAsm8B:
5047 MOVB $0xf0, (AX)
5048 MOVB BL, 1(AX)
5049 ADDQ $0x02, AX
5050 CMPL BX, $0x40
5051 JB memmove_repeat_emit_encodeBlockAsm8B
5052 JMP memmove_long_repeat_emit_encodeBlockAsm8B
5053
5054one_byte_repeat_emit_encodeBlockAsm8B:
5055 SHLB $0x02, BL
5056 MOVB BL, (AX)
5057 ADDQ $0x01, AX
5058
5059memmove_repeat_emit_encodeBlockAsm8B:
5060 LEAQ (AX)(R8*1), BX
5061
5062 // genMemMoveShort
5063 CMPQ R8, $0x08
5064 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
5065 CMPQ R8, $0x10
5066 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
5067 CMPQ R8, $0x20
5068 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
5069 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
5070
5071emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
5072 MOVQ (R9), R10
5073 MOVQ R10, (AX)
5074 JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
5075
5076emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
5077 MOVQ (R9), R10
5078 MOVQ -8(R9)(R8*1), R9
5079 MOVQ R10, (AX)
5080 MOVQ R9, -8(AX)(R8*1)
5081 JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
5082
5083emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
5084 MOVOU (R9), X0
5085 MOVOU -16(R9)(R8*1), X1
5086 MOVOU X0, (AX)
5087 MOVOU X1, -16(AX)(R8*1)
5088 JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
5089
5090emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
5091 MOVOU (R9), X0
5092 MOVOU 16(R9), X1
5093 MOVOU -32(R9)(R8*1), X2
5094 MOVOU -16(R9)(R8*1), X3
5095 MOVOU X0, (AX)
5096 MOVOU X1, 16(AX)
5097 MOVOU X2, -32(AX)(R8*1)
5098 MOVOU X3, -16(AX)(R8*1)
5099
5100memmove_end_copy_repeat_emit_encodeBlockAsm8B:
5101 MOVQ BX, AX
5102 JMP emit_literal_done_repeat_emit_encodeBlockAsm8B
5103
5104memmove_long_repeat_emit_encodeBlockAsm8B:
5105 LEAQ (AX)(R8*1), BX
5106
5107 // genMemMoveLong
5108 MOVOU (R9), X0
5109 MOVOU 16(R9), X1
5110 MOVOU -32(R9)(R8*1), X2
5111 MOVOU -16(R9)(R8*1), X3
5112 MOVQ R8, R11
5113 SHRQ $0x05, R11
5114 MOVQ AX, R10
5115 ANDL $0x0000001f, R10
5116 MOVQ $0x00000040, R12
5117 SUBQ R10, R12
5118 DECQ R11
5119 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
5120 LEAQ -32(R9)(R12*1), R10
5121 LEAQ -32(AX)(R12*1), R13
5122
5123emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
5124 MOVOU (R10), X4
5125 MOVOU 16(R10), X5
5126 MOVOA X4, (R13)
5127 MOVOA X5, 16(R13)
5128 ADDQ $0x20, R13
5129 ADDQ $0x20, R10
5130 ADDQ $0x20, R12
5131 DECQ R11
5132 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
5133
5134emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
5135 MOVOU -32(R9)(R12*1), X4
5136 MOVOU -16(R9)(R12*1), X5
5137 MOVOA X4, -32(AX)(R12*1)
5138 MOVOA X5, -16(AX)(R12*1)
5139 ADDQ $0x20, R12
5140 CMPQ R8, R12
5141 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
5142 MOVOU X0, (AX)
5143 MOVOU X1, 16(AX)
5144 MOVOU X2, -32(AX)(R8*1)
5145 MOVOU X3, -16(AX)(R8*1)
5146 MOVQ BX, AX
5147
5148emit_literal_done_repeat_emit_encodeBlockAsm8B:
5149 ADDL $0x05, CX
5150 MOVL CX, BX
5151 SUBL 16(SP), BX
5152 MOVQ src_len+32(FP), R8
5153 SUBL CX, R8
5154 LEAQ (DX)(CX*1), R9
5155 LEAQ (DX)(BX*1), BX
5156
5157 // matchLen
5158 XORL R11, R11
5159
5160matchlen_loopback_16_repeat_extend_encodeBlockAsm8B:
5161 CMPL R8, $0x10
5162 JB matchlen_match8_repeat_extend_encodeBlockAsm8B
5163 MOVQ (R9)(R11*1), R10
5164 MOVQ 8(R9)(R11*1), R12
5165 XORQ (BX)(R11*1), R10
5166 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B
5167 XORQ 8(BX)(R11*1), R12
5168 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm8B
5169 LEAL -16(R8), R8
5170 LEAL 16(R11), R11
5171 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm8B
5172
5173matchlen_bsf_16repeat_extend_encodeBlockAsm8B:
5174#ifdef GOAMD64_v3
5175 TZCNTQ R12, R12
5176
5177#else
5178 BSFQ R12, R12
5179
5180#endif
5181 SARQ $0x03, R12
5182 LEAL 8(R11)(R12*1), R11
5183 JMP repeat_extend_forward_end_encodeBlockAsm8B
5184
5185matchlen_match8_repeat_extend_encodeBlockAsm8B:
5186 CMPL R8, $0x08
5187 JB matchlen_match4_repeat_extend_encodeBlockAsm8B
5188 MOVQ (R9)(R11*1), R10
5189 XORQ (BX)(R11*1), R10
5190 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B
5191 LEAL -8(R8), R8
5192 LEAL 8(R11), R11
5193 JMP matchlen_match4_repeat_extend_encodeBlockAsm8B
5194
5195matchlen_bsf_8_repeat_extend_encodeBlockAsm8B:
5196#ifdef GOAMD64_v3
5197 TZCNTQ R10, R10
5198
5199#else
5200 BSFQ R10, R10
5201
5202#endif
5203 SARQ $0x03, R10
5204 LEAL (R11)(R10*1), R11
5205 JMP repeat_extend_forward_end_encodeBlockAsm8B
5206
5207matchlen_match4_repeat_extend_encodeBlockAsm8B:
5208 CMPL R8, $0x04
5209 JB matchlen_match2_repeat_extend_encodeBlockAsm8B
5210 MOVL (R9)(R11*1), R10
5211 CMPL (BX)(R11*1), R10
5212 JNE matchlen_match2_repeat_extend_encodeBlockAsm8B
5213 LEAL -4(R8), R8
5214 LEAL 4(R11), R11
5215
5216matchlen_match2_repeat_extend_encodeBlockAsm8B:
5217 CMPL R8, $0x01
5218 JE matchlen_match1_repeat_extend_encodeBlockAsm8B
5219 JB repeat_extend_forward_end_encodeBlockAsm8B
5220 MOVW (R9)(R11*1), R10
5221 CMPW (BX)(R11*1), R10
5222 JNE matchlen_match1_repeat_extend_encodeBlockAsm8B
5223 LEAL 2(R11), R11
5224 SUBL $0x02, R8
5225 JZ repeat_extend_forward_end_encodeBlockAsm8B
5226
5227matchlen_match1_repeat_extend_encodeBlockAsm8B:
5228 MOVB (R9)(R11*1), R10
5229 CMPB (BX)(R11*1), R10
5230 JNE repeat_extend_forward_end_encodeBlockAsm8B
5231 LEAL 1(R11), R11
5232
5233repeat_extend_forward_end_encodeBlockAsm8B:
5234 ADDL R11, CX
5235 MOVL CX, BX
5236 SUBL SI, BX
5237 MOVL 16(SP), SI
5238 TESTL DI, DI
5239 JZ repeat_as_copy_encodeBlockAsm8B
5240
5241 // emitRepeat
5242 MOVL BX, SI
5243 LEAL -4(BX), BX
5244 CMPL SI, $0x08
5245 JBE repeat_two_match_repeat_encodeBlockAsm8B
5246 CMPL SI, $0x0c
5247 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B
5248
5249cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
5250 CMPL BX, $0x00000104
5251 JB repeat_three_match_repeat_encodeBlockAsm8B
5252 LEAL -256(BX), BX
5253 MOVW $0x0019, (AX)
5254 MOVW BX, 2(AX)
5255 ADDQ $0x04, AX
5256 JMP repeat_end_emit_encodeBlockAsm8B
5257
5258repeat_three_match_repeat_encodeBlockAsm8B:
5259 LEAL -4(BX), BX
5260 MOVW $0x0015, (AX)
5261 MOVB BL, 2(AX)
5262 ADDQ $0x03, AX
5263 JMP repeat_end_emit_encodeBlockAsm8B
5264
5265repeat_two_match_repeat_encodeBlockAsm8B:
5266 SHLL $0x02, BX
5267 ORL $0x01, BX
5268 MOVW BX, (AX)
5269 ADDQ $0x02, AX
5270 JMP repeat_end_emit_encodeBlockAsm8B
5271 XORQ DI, DI
5272 LEAL 1(DI)(BX*4), BX
5273 MOVB SI, 1(AX)
5274 SARL $0x08, SI
5275 SHLL $0x05, SI
5276 ORL SI, BX
5277 MOVB BL, (AX)
5278 ADDQ $0x02, AX
5279 JMP repeat_end_emit_encodeBlockAsm8B
5280
5281repeat_as_copy_encodeBlockAsm8B:
5282 // emitCopy
5283 CMPL BX, $0x40
5284 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
5285 CMPL SI, $0x00000800
5286 JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B
5287 MOVL $0x00000001, DI
5288 LEAL 16(DI), DI
5289 MOVB SI, 1(AX)
5290 SHRL $0x08, SI
5291 SHLL $0x05, SI
5292 ORL SI, DI
5293 MOVB DI, (AX)
5294 ADDQ $0x02, AX
5295 SUBL $0x08, BX
5296
5297 // emitRepeat
5298 LEAL -4(BX), BX
5299 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
5300 MOVL BX, SI
5301 LEAL -4(BX), BX
5302 CMPL SI, $0x08
5303 JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
5304 CMPL SI, $0x0c
5305 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
5306
5307cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
5308 CMPL BX, $0x00000104
5309 JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
5310 LEAL -256(BX), BX
5311 MOVW $0x0019, (AX)
5312 MOVW BX, 2(AX)
5313 ADDQ $0x04, AX
5314 JMP repeat_end_emit_encodeBlockAsm8B
5315
5316repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
5317 LEAL -4(BX), BX
5318 MOVW $0x0015, (AX)
5319 MOVB BL, 2(AX)
5320 ADDQ $0x03, AX
5321 JMP repeat_end_emit_encodeBlockAsm8B
5322
5323repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
5324 SHLL $0x02, BX
5325 ORL $0x01, BX
5326 MOVW BX, (AX)
5327 ADDQ $0x02, AX
5328 JMP repeat_end_emit_encodeBlockAsm8B
5329 XORQ DI, DI
5330 LEAL 1(DI)(BX*4), BX
5331 MOVB SI, 1(AX)
5332 SARL $0x08, SI
5333 SHLL $0x05, SI
5334 ORL SI, BX
5335 MOVB BL, (AX)
5336 ADDQ $0x02, AX
5337 JMP repeat_end_emit_encodeBlockAsm8B
5338
5339long_offset_short_repeat_as_copy_encodeBlockAsm8B:
5340 MOVB $0xee, (AX)
5341 MOVW SI, 1(AX)
5342 LEAL -60(BX), BX
5343 ADDQ $0x03, AX
5344
5345 // emitRepeat
5346 MOVL BX, SI
5347 LEAL -4(BX), BX
5348 CMPL SI, $0x08
5349 JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
5350 CMPL SI, $0x0c
5351 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
5352
5353cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
5354 CMPL BX, $0x00000104
5355 JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
5356 LEAL -256(BX), BX
5357 MOVW $0x0019, (AX)
5358 MOVW BX, 2(AX)
5359 ADDQ $0x04, AX
5360 JMP repeat_end_emit_encodeBlockAsm8B
5361
5362repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
5363 LEAL -4(BX), BX
5364 MOVW $0x0015, (AX)
5365 MOVB BL, 2(AX)
5366 ADDQ $0x03, AX
5367 JMP repeat_end_emit_encodeBlockAsm8B
5368
5369repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
5370 SHLL $0x02, BX
5371 ORL $0x01, BX
5372 MOVW BX, (AX)
5373 ADDQ $0x02, AX
5374 JMP repeat_end_emit_encodeBlockAsm8B
5375 XORQ DI, DI
5376 LEAL 1(DI)(BX*4), BX
5377 MOVB SI, 1(AX)
5378 SARL $0x08, SI
5379 SHLL $0x05, SI
5380 ORL SI, BX
5381 MOVB BL, (AX)
5382 ADDQ $0x02, AX
5383 JMP repeat_end_emit_encodeBlockAsm8B
5384
5385two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
5386 MOVL BX, DI
5387 SHLL $0x02, DI
5388 CMPL BX, $0x0c
5389 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm8B
5390 LEAL -15(DI), DI
5391 MOVB SI, 1(AX)
5392 SHRL $0x08, SI
5393 SHLL $0x05, SI
5394 ORL SI, DI
5395 MOVB DI, (AX)
5396 ADDQ $0x02, AX
5397 JMP repeat_end_emit_encodeBlockAsm8B
5398
5399emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
5400 LEAL -2(DI), DI
5401 MOVB DI, (AX)
5402 MOVW SI, 1(AX)
5403 ADDQ $0x03, AX
5404
5405repeat_end_emit_encodeBlockAsm8B:
5406 MOVL CX, 12(SP)
5407 JMP search_loop_encodeBlockAsm8B
5408
5409no_repeat_found_encodeBlockAsm8B:
5410 CMPL (DX)(BX*1), SI
5411 JEQ candidate_match_encodeBlockAsm8B
5412 SHRQ $0x08, SI
5413 MOVL 24(SP)(R9*4), BX
5414 LEAL 2(CX), R8
5415 CMPL (DX)(DI*1), SI
5416 JEQ candidate2_match_encodeBlockAsm8B
5417 MOVL R8, 24(SP)(R9*4)
5418 SHRQ $0x08, SI
5419 CMPL (DX)(BX*1), SI
5420 JEQ candidate3_match_encodeBlockAsm8B
5421 MOVL 20(SP), CX
5422 JMP search_loop_encodeBlockAsm8B
5423
5424candidate3_match_encodeBlockAsm8B:
5425 ADDL $0x02, CX
5426 JMP candidate_match_encodeBlockAsm8B
5427
5428candidate2_match_encodeBlockAsm8B:
5429 MOVL R8, 24(SP)(R9*4)
5430 INCL CX
5431 MOVL DI, BX
5432
5433candidate_match_encodeBlockAsm8B:
5434 MOVL 12(SP), SI
5435 TESTL BX, BX
5436 JZ match_extend_back_end_encodeBlockAsm8B
5437
5438match_extend_back_loop_encodeBlockAsm8B:
5439 CMPL CX, SI
5440 JBE match_extend_back_end_encodeBlockAsm8B
5441 MOVB -1(DX)(BX*1), DI
5442 MOVB -1(DX)(CX*1), R8
5443 CMPB DI, R8
5444 JNE match_extend_back_end_encodeBlockAsm8B
5445 LEAL -1(CX), CX
5446 DECL BX
5447 JZ match_extend_back_end_encodeBlockAsm8B
5448 JMP match_extend_back_loop_encodeBlockAsm8B
5449
5450match_extend_back_end_encodeBlockAsm8B:
5451 MOVL CX, SI
5452 SUBL 12(SP), SI
5453 LEAQ 3(AX)(SI*1), SI
5454 CMPQ SI, (SP)
5455 JB match_dst_size_check_encodeBlockAsm8B
5456 MOVQ $0x00000000, ret+48(FP)
5457 RET
5458
5459match_dst_size_check_encodeBlockAsm8B:
5460 MOVL CX, SI
5461 MOVL 12(SP), DI
5462 CMPL DI, SI
5463 JEQ emit_literal_done_match_emit_encodeBlockAsm8B
5464 MOVL SI, R8
5465 MOVL SI, 12(SP)
5466 LEAQ (DX)(DI*1), SI
5467 SUBL DI, R8
5468 LEAL -1(R8), DI
5469 CMPL DI, $0x3c
5470 JB one_byte_match_emit_encodeBlockAsm8B
5471 CMPL DI, $0x00000100
5472 JB two_bytes_match_emit_encodeBlockAsm8B
5473 JB three_bytes_match_emit_encodeBlockAsm8B
5474
5475three_bytes_match_emit_encodeBlockAsm8B:
5476 MOVB $0xf4, (AX)
5477 MOVW DI, 1(AX)
5478 ADDQ $0x03, AX
5479 JMP memmove_long_match_emit_encodeBlockAsm8B
5480
5481two_bytes_match_emit_encodeBlockAsm8B:
5482 MOVB $0xf0, (AX)
5483 MOVB DI, 1(AX)
5484 ADDQ $0x02, AX
5485 CMPL DI, $0x40
5486 JB memmove_match_emit_encodeBlockAsm8B
5487 JMP memmove_long_match_emit_encodeBlockAsm8B
5488
5489one_byte_match_emit_encodeBlockAsm8B:
5490 SHLB $0x02, DI
5491 MOVB DI, (AX)
5492 ADDQ $0x01, AX
5493
5494memmove_match_emit_encodeBlockAsm8B:
5495 LEAQ (AX)(R8*1), DI
5496
5497 // genMemMoveShort
5498 CMPQ R8, $0x08
5499 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
5500 CMPQ R8, $0x10
5501 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
5502 CMPQ R8, $0x20
5503 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
5504 JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
5505
5506emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
5507 MOVQ (SI), R9
5508 MOVQ R9, (AX)
5509 JMP memmove_end_copy_match_emit_encodeBlockAsm8B
5510
5511emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
5512 MOVQ (SI), R9
5513 MOVQ -8(SI)(R8*1), SI
5514 MOVQ R9, (AX)
5515 MOVQ SI, -8(AX)(R8*1)
5516 JMP memmove_end_copy_match_emit_encodeBlockAsm8B
5517
5518emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
5519 MOVOU (SI), X0
5520 MOVOU -16(SI)(R8*1), X1
5521 MOVOU X0, (AX)
5522 MOVOU X1, -16(AX)(R8*1)
5523 JMP memmove_end_copy_match_emit_encodeBlockAsm8B
5524
5525emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
5526 MOVOU (SI), X0
5527 MOVOU 16(SI), X1
5528 MOVOU -32(SI)(R8*1), X2
5529 MOVOU -16(SI)(R8*1), X3
5530 MOVOU X0, (AX)
5531 MOVOU X1, 16(AX)
5532 MOVOU X2, -32(AX)(R8*1)
5533 MOVOU X3, -16(AX)(R8*1)
5534
5535memmove_end_copy_match_emit_encodeBlockAsm8B:
5536 MOVQ DI, AX
5537 JMP emit_literal_done_match_emit_encodeBlockAsm8B
5538
5539memmove_long_match_emit_encodeBlockAsm8B:
5540 LEAQ (AX)(R8*1), DI
5541
5542 // genMemMoveLong
5543 MOVOU (SI), X0
5544 MOVOU 16(SI), X1
5545 MOVOU -32(SI)(R8*1), X2
5546 MOVOU -16(SI)(R8*1), X3
5547 MOVQ R8, R10
5548 SHRQ $0x05, R10
5549 MOVQ AX, R9
5550 ANDL $0x0000001f, R9
5551 MOVQ $0x00000040, R11
5552 SUBQ R9, R11
5553 DECQ R10
5554 JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
5555 LEAQ -32(SI)(R11*1), R9
5556 LEAQ -32(AX)(R11*1), R12
5557
5558emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
5559 MOVOU (R9), X4
5560 MOVOU 16(R9), X5
5561 MOVOA X4, (R12)
5562 MOVOA X5, 16(R12)
5563 ADDQ $0x20, R12
5564 ADDQ $0x20, R9
5565 ADDQ $0x20, R11
5566 DECQ R10
5567 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
5568
5569emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
5570 MOVOU -32(SI)(R11*1), X4
5571 MOVOU -16(SI)(R11*1), X5
5572 MOVOA X4, -32(AX)(R11*1)
5573 MOVOA X5, -16(AX)(R11*1)
5574 ADDQ $0x20, R11
5575 CMPQ R8, R11
5576 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
5577 MOVOU X0, (AX)
5578 MOVOU X1, 16(AX)
5579 MOVOU X2, -32(AX)(R8*1)
5580 MOVOU X3, -16(AX)(R8*1)
5581 MOVQ DI, AX
5582
5583emit_literal_done_match_emit_encodeBlockAsm8B:
5584match_nolit_loop_encodeBlockAsm8B:
5585 MOVL CX, SI
5586 SUBL BX, SI
5587 MOVL SI, 16(SP)
5588 ADDL $0x04, CX
5589 ADDL $0x04, BX
5590 MOVQ src_len+32(FP), SI
5591 SUBL CX, SI
5592 LEAQ (DX)(CX*1), DI
5593 LEAQ (DX)(BX*1), BX
5594
5595 // matchLen
5596 XORL R9, R9
5597
5598matchlen_loopback_16_match_nolit_encodeBlockAsm8B:
5599 CMPL SI, $0x10
5600 JB matchlen_match8_match_nolit_encodeBlockAsm8B
5601 MOVQ (DI)(R9*1), R8
5602 MOVQ 8(DI)(R9*1), R10
5603 XORQ (BX)(R9*1), R8
5604 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B
5605 XORQ 8(BX)(R9*1), R10
5606 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm8B
5607 LEAL -16(SI), SI
5608 LEAL 16(R9), R9
5609 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm8B
5610
5611matchlen_bsf_16match_nolit_encodeBlockAsm8B:
5612#ifdef GOAMD64_v3
5613 TZCNTQ R10, R10
5614
5615#else
5616 BSFQ R10, R10
5617
5618#endif
5619 SARQ $0x03, R10
5620 LEAL 8(R9)(R10*1), R9
5621 JMP match_nolit_end_encodeBlockAsm8B
5622
5623matchlen_match8_match_nolit_encodeBlockAsm8B:
5624 CMPL SI, $0x08
5625 JB matchlen_match4_match_nolit_encodeBlockAsm8B
5626 MOVQ (DI)(R9*1), R8
5627 XORQ (BX)(R9*1), R8
5628 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B
5629 LEAL -8(SI), SI
5630 LEAL 8(R9), R9
5631 JMP matchlen_match4_match_nolit_encodeBlockAsm8B
5632
5633matchlen_bsf_8_match_nolit_encodeBlockAsm8B:
5634#ifdef GOAMD64_v3
5635 TZCNTQ R8, R8
5636
5637#else
5638 BSFQ R8, R8
5639
5640#endif
5641 SARQ $0x03, R8
5642 LEAL (R9)(R8*1), R9
5643 JMP match_nolit_end_encodeBlockAsm8B
5644
5645matchlen_match4_match_nolit_encodeBlockAsm8B:
5646 CMPL SI, $0x04
5647 JB matchlen_match2_match_nolit_encodeBlockAsm8B
5648 MOVL (DI)(R9*1), R8
5649 CMPL (BX)(R9*1), R8
5650 JNE matchlen_match2_match_nolit_encodeBlockAsm8B
5651 LEAL -4(SI), SI
5652 LEAL 4(R9), R9
5653
5654matchlen_match2_match_nolit_encodeBlockAsm8B:
5655 CMPL SI, $0x01
5656 JE matchlen_match1_match_nolit_encodeBlockAsm8B
5657 JB match_nolit_end_encodeBlockAsm8B
5658 MOVW (DI)(R9*1), R8
5659 CMPW (BX)(R9*1), R8
5660 JNE matchlen_match1_match_nolit_encodeBlockAsm8B
5661 LEAL 2(R9), R9
5662 SUBL $0x02, SI
5663 JZ match_nolit_end_encodeBlockAsm8B
5664
5665matchlen_match1_match_nolit_encodeBlockAsm8B:
5666 MOVB (DI)(R9*1), R8
5667 CMPB (BX)(R9*1), R8
5668 JNE match_nolit_end_encodeBlockAsm8B
5669 LEAL 1(R9), R9
5670
5671match_nolit_end_encodeBlockAsm8B:
5672 ADDL R9, CX
5673 MOVL 16(SP), BX
5674 ADDL $0x04, R9
5675 MOVL CX, 12(SP)
5676
5677 // emitCopy
5678 CMPL R9, $0x40
5679 JBE two_byte_offset_short_match_nolit_encodeBlockAsm8B
5680 CMPL BX, $0x00000800
5681 JAE long_offset_short_match_nolit_encodeBlockAsm8B
5682 MOVL $0x00000001, SI
5683 LEAL 16(SI), SI
5684 MOVB BL, 1(AX)
5685 SHRL $0x08, BX
5686 SHLL $0x05, BX
5687 ORL BX, SI
5688 MOVB SI, (AX)
5689 ADDQ $0x02, AX
5690 SUBL $0x08, R9
5691
5692 // emitRepeat
5693 LEAL -4(R9), R9
5694 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
5695 MOVL R9, BX
5696 LEAL -4(R9), R9
5697 CMPL BX, $0x08
5698 JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
5699 CMPL BX, $0x0c
5700 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
5701
5702cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
5703 CMPL R9, $0x00000104
5704 JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
5705 LEAL -256(R9), R9
5706 MOVW $0x0019, (AX)
5707 MOVW R9, 2(AX)
5708 ADDQ $0x04, AX
5709 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5710
5711repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
5712 LEAL -4(R9), R9
5713 MOVW $0x0015, (AX)
5714 MOVB R9, 2(AX)
5715 ADDQ $0x03, AX
5716 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5717
5718repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
5719 SHLL $0x02, R9
5720 ORL $0x01, R9
5721 MOVW R9, (AX)
5722 ADDQ $0x02, AX
5723 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5724 XORQ SI, SI
5725 LEAL 1(SI)(R9*4), R9
5726 MOVB BL, 1(AX)
5727 SARL $0x08, BX
5728 SHLL $0x05, BX
5729 ORL BX, R9
5730 MOVB R9, (AX)
5731 ADDQ $0x02, AX
5732 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5733
5734long_offset_short_match_nolit_encodeBlockAsm8B:
5735 MOVB $0xee, (AX)
5736 MOVW BX, 1(AX)
5737 LEAL -60(R9), R9
5738 ADDQ $0x03, AX
5739
5740 // emitRepeat
5741 MOVL R9, BX
5742 LEAL -4(R9), R9
5743 CMPL BX, $0x08
5744 JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
5745 CMPL BX, $0x0c
5746 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short
5747
5748cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
5749 CMPL R9, $0x00000104
5750 JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
5751 LEAL -256(R9), R9
5752 MOVW $0x0019, (AX)
5753 MOVW R9, 2(AX)
5754 ADDQ $0x04, AX
5755 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5756
5757repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
5758 LEAL -4(R9), R9
5759 MOVW $0x0015, (AX)
5760 MOVB R9, 2(AX)
5761 ADDQ $0x03, AX
5762 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5763
5764repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
5765 SHLL $0x02, R9
5766 ORL $0x01, R9
5767 MOVW R9, (AX)
5768 ADDQ $0x02, AX
5769 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5770 XORQ SI, SI
5771 LEAL 1(SI)(R9*4), R9
5772 MOVB BL, 1(AX)
5773 SARL $0x08, BX
5774 SHLL $0x05, BX
5775 ORL BX, R9
5776 MOVB R9, (AX)
5777 ADDQ $0x02, AX
5778 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5779
5780two_byte_offset_short_match_nolit_encodeBlockAsm8B:
5781 MOVL R9, SI
5782 SHLL $0x02, SI
5783 CMPL R9, $0x0c
5784 JAE emit_copy_three_match_nolit_encodeBlockAsm8B
5785 LEAL -15(SI), SI
5786 MOVB BL, 1(AX)
5787 SHRL $0x08, BX
5788 SHLL $0x05, BX
5789 ORL BX, SI
5790 MOVB SI, (AX)
5791 ADDQ $0x02, AX
5792 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5793
5794emit_copy_three_match_nolit_encodeBlockAsm8B:
5795 LEAL -2(SI), SI
5796 MOVB SI, (AX)
5797 MOVW BX, 1(AX)
5798 ADDQ $0x03, AX
5799
5800match_nolit_emitcopy_end_encodeBlockAsm8B:
5801 CMPL CX, 8(SP)
5802 JAE emit_remainder_encodeBlockAsm8B
5803 MOVQ -2(DX)(CX*1), SI
5804 CMPQ AX, (SP)
5805 JB match_nolit_dst_ok_encodeBlockAsm8B
5806 MOVQ $0x00000000, ret+48(FP)
5807 RET
5808
5809match_nolit_dst_ok_encodeBlockAsm8B:
5810 MOVQ $0x9e3779b1, R8
5811 MOVQ SI, DI
5812 SHRQ $0x10, SI
5813 MOVQ SI, BX
5814 SHLQ $0x20, DI
5815 IMULQ R8, DI
5816 SHRQ $0x38, DI
5817 SHLQ $0x20, BX
5818 IMULQ R8, BX
5819 SHRQ $0x38, BX
5820 LEAL -2(CX), R8
5821 LEAQ 24(SP)(BX*4), R9
5822 MOVL (R9), BX
5823 MOVL R8, 24(SP)(DI*4)
5824 MOVL CX, (R9)
5825 CMPL (DX)(BX*1), SI
5826 JEQ match_nolit_loop_encodeBlockAsm8B
5827 INCL CX
5828 JMP search_loop_encodeBlockAsm8B
5829
5830emit_remainder_encodeBlockAsm8B:
5831 MOVQ src_len+32(FP), CX
5832 SUBL 12(SP), CX
5833 LEAQ 3(AX)(CX*1), CX
5834 CMPQ CX, (SP)
5835 JB emit_remainder_ok_encodeBlockAsm8B
5836 MOVQ $0x00000000, ret+48(FP)
5837 RET
5838
5839emit_remainder_ok_encodeBlockAsm8B:
5840 MOVQ src_len+32(FP), CX
5841 MOVL 12(SP), BX
5842 CMPL BX, CX
5843 JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B
5844 MOVL CX, SI
5845 MOVL CX, 12(SP)
5846 LEAQ (DX)(BX*1), CX
5847 SUBL BX, SI
5848 LEAL -1(SI), DX
5849 CMPL DX, $0x3c
5850 JB one_byte_emit_remainder_encodeBlockAsm8B
5851 CMPL DX, $0x00000100
5852 JB two_bytes_emit_remainder_encodeBlockAsm8B
5853 JB three_bytes_emit_remainder_encodeBlockAsm8B
5854
5855three_bytes_emit_remainder_encodeBlockAsm8B:
5856 MOVB $0xf4, (AX)
5857 MOVW DX, 1(AX)
5858 ADDQ $0x03, AX
5859 JMP memmove_long_emit_remainder_encodeBlockAsm8B
5860
5861two_bytes_emit_remainder_encodeBlockAsm8B:
5862 MOVB $0xf0, (AX)
5863 MOVB DL, 1(AX)
5864 ADDQ $0x02, AX
5865 CMPL DX, $0x40
5866 JB memmove_emit_remainder_encodeBlockAsm8B
5867 JMP memmove_long_emit_remainder_encodeBlockAsm8B
5868
5869one_byte_emit_remainder_encodeBlockAsm8B:
5870 SHLB $0x02, DL
5871 MOVB DL, (AX)
5872 ADDQ $0x01, AX
5873
5874memmove_emit_remainder_encodeBlockAsm8B:
5875 LEAQ (AX)(SI*1), DX
5876 MOVL SI, BX
5877
5878 // genMemMoveShort
5879 CMPQ BX, $0x03
5880 JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
5881 JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
5882 CMPQ BX, $0x08
5883 JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
5884 CMPQ BX, $0x10
5885 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
5886 CMPQ BX, $0x20
5887 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
5888 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64
5889
5890emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
5891 MOVB (CX), SI
5892 MOVB -1(CX)(BX*1), CL
5893 MOVB SI, (AX)
5894 MOVB CL, -1(AX)(BX*1)
5895 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5896
5897emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
5898 MOVW (CX), SI
5899 MOVB 2(CX), CL
5900 MOVW SI, (AX)
5901 MOVB CL, 2(AX)
5902 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5903
5904emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
5905 MOVL (CX), SI
5906 MOVL -4(CX)(BX*1), CX
5907 MOVL SI, (AX)
5908 MOVL CX, -4(AX)(BX*1)
5909 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5910
5911emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
5912 MOVQ (CX), SI
5913 MOVQ -8(CX)(BX*1), CX
5914 MOVQ SI, (AX)
5915 MOVQ CX, -8(AX)(BX*1)
5916 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5917
5918emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
5919 MOVOU (CX), X0
5920 MOVOU -16(CX)(BX*1), X1
5921 MOVOU X0, (AX)
5922 MOVOU X1, -16(AX)(BX*1)
5923 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5924
5925emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
5926 MOVOU (CX), X0
5927 MOVOU 16(CX), X1
5928 MOVOU -32(CX)(BX*1), X2
5929 MOVOU -16(CX)(BX*1), X3
5930 MOVOU X0, (AX)
5931 MOVOU X1, 16(AX)
5932 MOVOU X2, -32(AX)(BX*1)
5933 MOVOU X3, -16(AX)(BX*1)
5934
5935memmove_end_copy_emit_remainder_encodeBlockAsm8B:
5936 MOVQ DX, AX
5937 JMP emit_literal_done_emit_remainder_encodeBlockAsm8B
5938
5939memmove_long_emit_remainder_encodeBlockAsm8B:
5940 LEAQ (AX)(SI*1), DX
5941 MOVL SI, BX
5942
5943 // genMemMoveLong
5944 MOVOU (CX), X0
5945 MOVOU 16(CX), X1
5946 MOVOU -32(CX)(BX*1), X2
5947 MOVOU -16(CX)(BX*1), X3
5948 MOVQ BX, DI
5949 SHRQ $0x05, DI
5950 MOVQ AX, SI
5951 ANDL $0x0000001f, SI
5952 MOVQ $0x00000040, R8
5953 SUBQ SI, R8
5954 DECQ DI
5955 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
5956 LEAQ -32(CX)(R8*1), SI
5957 LEAQ -32(AX)(R8*1), R9
5958
5959emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
5960 MOVOU (SI), X4
5961 MOVOU 16(SI), X5
5962 MOVOA X4, (R9)
5963 MOVOA X5, 16(R9)
5964 ADDQ $0x20, R9
5965 ADDQ $0x20, SI
5966 ADDQ $0x20, R8
5967 DECQ DI
5968 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back
5969
5970emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
5971 MOVOU -32(CX)(R8*1), X4
5972 MOVOU -16(CX)(R8*1), X5
5973 MOVOA X4, -32(AX)(R8*1)
5974 MOVOA X5, -16(AX)(R8*1)
5975 ADDQ $0x20, R8
5976 CMPQ BX, R8
5977 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
5978 MOVOU X0, (AX)
5979 MOVOU X1, 16(AX)
5980 MOVOU X2, -32(AX)(BX*1)
5981 MOVOU X3, -16(AX)(BX*1)
5982 MOVQ DX, AX
5983
5984emit_literal_done_emit_remainder_encodeBlockAsm8B:
5985 MOVQ dst_base+0(FP), CX
5986 SUBQ CX, AX
5987 MOVQ AX, ret+48(FP)
5988 RET
5989
5990// func encodeBetterBlockAsm(dst []byte, src []byte) int
5991// Requires: BMI, SSE2
5992TEXT ·encodeBetterBlockAsm(SB), $589848-56
5993 MOVQ dst_base+0(FP), AX
5994 MOVQ $0x00001200, CX
5995 LEAQ 24(SP), DX
5996 PXOR X0, X0
5997
5998zero_loop_encodeBetterBlockAsm:
5999 MOVOU X0, (DX)
6000 MOVOU X0, 16(DX)
6001 MOVOU X0, 32(DX)
6002 MOVOU X0, 48(DX)
6003 MOVOU X0, 64(DX)
6004 MOVOU X0, 80(DX)
6005 MOVOU X0, 96(DX)
6006 MOVOU X0, 112(DX)
6007 ADDQ $0x80, DX
6008 DECQ CX
6009 JNZ zero_loop_encodeBetterBlockAsm
6010 MOVL $0x00000000, 12(SP)
6011 MOVQ src_len+32(FP), CX
6012 LEAQ -6(CX), DX
6013 LEAQ -8(CX), BX
6014 MOVL BX, 8(SP)
6015 SHRQ $0x05, CX
6016 SUBL CX, DX
6017 LEAQ (AX)(DX*1), DX
6018 MOVQ DX, (SP)
6019 MOVL $0x00000001, CX
6020 MOVL $0x00000000, 16(SP)
6021 MOVQ src_base+24(FP), DX
6022
6023search_loop_encodeBetterBlockAsm:
6024 MOVL CX, BX
6025 SUBL 12(SP), BX
6026 SHRL $0x07, BX
6027 CMPL BX, $0x63
6028 JBE check_maxskip_ok_encodeBetterBlockAsm
6029 LEAL 100(CX), BX
6030 JMP check_maxskip_cont_encodeBetterBlockAsm
6031
6032check_maxskip_ok_encodeBetterBlockAsm:
6033 LEAL 1(CX)(BX*1), BX
6034
6035check_maxskip_cont_encodeBetterBlockAsm:
6036 CMPL BX, 8(SP)
6037 JAE emit_remainder_encodeBetterBlockAsm
6038 MOVQ (DX)(CX*1), SI
6039 MOVL BX, 20(SP)
6040 MOVQ $0x00cf1bbcdcbfa563, R8
6041 MOVQ $0x9e3779b1, BX
6042 MOVQ SI, R9
6043 MOVQ SI, R10
6044 SHLQ $0x08, R9
6045 IMULQ R8, R9
6046 SHRQ $0x2f, R9
6047 SHLQ $0x20, R10
6048 IMULQ BX, R10
6049 SHRQ $0x32, R10
6050 MOVL 24(SP)(R9*4), BX
6051 MOVL 524312(SP)(R10*4), DI
6052 MOVL CX, 24(SP)(R9*4)
6053 MOVL CX, 524312(SP)(R10*4)
6054 MOVQ (DX)(BX*1), R9
6055 MOVQ (DX)(DI*1), R10
6056 CMPQ R9, SI
6057 JEQ candidate_match_encodeBetterBlockAsm
6058 CMPQ R10, SI
6059 JNE no_short_found_encodeBetterBlockAsm
6060 MOVL DI, BX
6061 JMP candidate_match_encodeBetterBlockAsm
6062
6063no_short_found_encodeBetterBlockAsm:
6064 CMPL R9, SI
6065 JEQ candidate_match_encodeBetterBlockAsm
6066 CMPL R10, SI
6067 JEQ candidateS_match_encodeBetterBlockAsm
6068 MOVL 20(SP), CX
6069 JMP search_loop_encodeBetterBlockAsm
6070
6071candidateS_match_encodeBetterBlockAsm:
6072 SHRQ $0x08, SI
6073 MOVQ SI, R9
6074 SHLQ $0x08, R9
6075 IMULQ R8, R9
6076 SHRQ $0x2f, R9
6077 MOVL 24(SP)(R9*4), BX
6078 INCL CX
6079 MOVL CX, 24(SP)(R9*4)
6080 CMPL (DX)(BX*1), SI
6081 JEQ candidate_match_encodeBetterBlockAsm
6082 DECL CX
6083 MOVL DI, BX
6084
6085candidate_match_encodeBetterBlockAsm:
6086 MOVL 12(SP), SI
6087 TESTL BX, BX
6088 JZ match_extend_back_end_encodeBetterBlockAsm
6089
6090match_extend_back_loop_encodeBetterBlockAsm:
6091 CMPL CX, SI
6092 JBE match_extend_back_end_encodeBetterBlockAsm
6093 MOVB -1(DX)(BX*1), DI
6094 MOVB -1(DX)(CX*1), R8
6095 CMPB DI, R8
6096 JNE match_extend_back_end_encodeBetterBlockAsm
6097 LEAL -1(CX), CX
6098 DECL BX
6099 JZ match_extend_back_end_encodeBetterBlockAsm
6100 JMP match_extend_back_loop_encodeBetterBlockAsm
6101
6102match_extend_back_end_encodeBetterBlockAsm:
6103 MOVL CX, SI
6104 SUBL 12(SP), SI
6105 LEAQ 5(AX)(SI*1), SI
6106 CMPQ SI, (SP)
6107 JB match_dst_size_check_encodeBetterBlockAsm
6108 MOVQ $0x00000000, ret+48(FP)
6109 RET
6110
6111match_dst_size_check_encodeBetterBlockAsm:
6112 MOVL CX, SI
6113 ADDL $0x04, CX
6114 ADDL $0x04, BX
6115 MOVQ src_len+32(FP), DI
6116 SUBL CX, DI
6117 LEAQ (DX)(CX*1), R8
6118 LEAQ (DX)(BX*1), R9
6119
6120 // matchLen
6121 XORL R11, R11
6122
6123matchlen_loopback_16_match_nolit_encodeBetterBlockAsm:
6124 CMPL DI, $0x10
6125 JB matchlen_match8_match_nolit_encodeBetterBlockAsm
6126 MOVQ (R8)(R11*1), R10
6127 MOVQ 8(R8)(R11*1), R12
6128 XORQ (R9)(R11*1), R10
6129 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
6130 XORQ 8(R9)(R11*1), R12
6131 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm
6132 LEAL -16(DI), DI
6133 LEAL 16(R11), R11
6134 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm
6135
6136matchlen_bsf_16match_nolit_encodeBetterBlockAsm:
6137#ifdef GOAMD64_v3
6138 TZCNTQ R12, R12
6139
6140#else
6141 BSFQ R12, R12
6142
6143#endif
6144 SARQ $0x03, R12
6145 LEAL 8(R11)(R12*1), R11
6146 JMP match_nolit_end_encodeBetterBlockAsm
6147
6148matchlen_match8_match_nolit_encodeBetterBlockAsm:
6149 CMPL DI, $0x08
6150 JB matchlen_match4_match_nolit_encodeBetterBlockAsm
6151 MOVQ (R8)(R11*1), R10
6152 XORQ (R9)(R11*1), R10
6153 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
6154 LEAL -8(DI), DI
6155 LEAL 8(R11), R11
6156 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm
6157
6158matchlen_bsf_8_match_nolit_encodeBetterBlockAsm:
6159#ifdef GOAMD64_v3
6160 TZCNTQ R10, R10
6161
6162#else
6163 BSFQ R10, R10
6164
6165#endif
6166 SARQ $0x03, R10
6167 LEAL (R11)(R10*1), R11
6168 JMP match_nolit_end_encodeBetterBlockAsm
6169
6170matchlen_match4_match_nolit_encodeBetterBlockAsm:
6171 CMPL DI, $0x04
6172 JB matchlen_match2_match_nolit_encodeBetterBlockAsm
6173 MOVL (R8)(R11*1), R10
6174 CMPL (R9)(R11*1), R10
6175 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm
6176 LEAL -4(DI), DI
6177 LEAL 4(R11), R11
6178
6179matchlen_match2_match_nolit_encodeBetterBlockAsm:
6180 CMPL DI, $0x01
6181 JE matchlen_match1_match_nolit_encodeBetterBlockAsm
6182 JB match_nolit_end_encodeBetterBlockAsm
6183 MOVW (R8)(R11*1), R10
6184 CMPW (R9)(R11*1), R10
6185 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm
6186 LEAL 2(R11), R11
6187 SUBL $0x02, DI
6188 JZ match_nolit_end_encodeBetterBlockAsm
6189
6190matchlen_match1_match_nolit_encodeBetterBlockAsm:
6191 MOVB (R8)(R11*1), R10
6192 CMPB (R9)(R11*1), R10
6193 JNE match_nolit_end_encodeBetterBlockAsm
6194 LEAL 1(R11), R11
6195
6196match_nolit_end_encodeBetterBlockAsm:
6197 MOVL CX, DI
6198 SUBL BX, DI
6199
6200 // Check if repeat
6201 CMPL 16(SP), DI
6202 JEQ match_is_repeat_encodeBetterBlockAsm
6203 CMPL R11, $0x01
6204 JA match_length_ok_encodeBetterBlockAsm
6205 CMPL DI, $0x0000ffff
6206 JBE match_length_ok_encodeBetterBlockAsm
6207 MOVL 20(SP), CX
6208 INCL CX
6209 JMP search_loop_encodeBetterBlockAsm
6210
6211match_length_ok_encodeBetterBlockAsm:
6212 MOVL DI, 16(SP)
6213 MOVL 12(SP), BX
6214 CMPL BX, SI
6215 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm
6216 MOVL SI, R8
6217 MOVL SI, 12(SP)
6218 LEAQ (DX)(BX*1), R9
6219 SUBL BX, R8
6220 LEAL -1(R8), BX
6221 CMPL BX, $0x3c
6222 JB one_byte_match_emit_encodeBetterBlockAsm
6223 CMPL BX, $0x00000100
6224 JB two_bytes_match_emit_encodeBetterBlockAsm
6225 CMPL BX, $0x00010000
6226 JB three_bytes_match_emit_encodeBetterBlockAsm
6227 CMPL BX, $0x01000000
6228 JB four_bytes_match_emit_encodeBetterBlockAsm
6229 MOVB $0xfc, (AX)
6230 MOVL BX, 1(AX)
6231 ADDQ $0x05, AX
6232 JMP memmove_long_match_emit_encodeBetterBlockAsm
6233
6234four_bytes_match_emit_encodeBetterBlockAsm:
6235 MOVL BX, R10
6236 SHRL $0x10, R10
6237 MOVB $0xf8, (AX)
6238 MOVW BX, 1(AX)
6239 MOVB R10, 3(AX)
6240 ADDQ $0x04, AX
6241 JMP memmove_long_match_emit_encodeBetterBlockAsm
6242
6243three_bytes_match_emit_encodeBetterBlockAsm:
6244 MOVB $0xf4, (AX)
6245 MOVW BX, 1(AX)
6246 ADDQ $0x03, AX
6247 JMP memmove_long_match_emit_encodeBetterBlockAsm
6248
6249two_bytes_match_emit_encodeBetterBlockAsm:
6250 MOVB $0xf0, (AX)
6251 MOVB BL, 1(AX)
6252 ADDQ $0x02, AX
6253 CMPL BX, $0x40
6254 JB memmove_match_emit_encodeBetterBlockAsm
6255 JMP memmove_long_match_emit_encodeBetterBlockAsm
6256
6257one_byte_match_emit_encodeBetterBlockAsm:
6258 SHLB $0x02, BL
6259 MOVB BL, (AX)
6260 ADDQ $0x01, AX
6261
6262memmove_match_emit_encodeBetterBlockAsm:
6263 LEAQ (AX)(R8*1), BX
6264
6265 // genMemMoveShort
6266 CMPQ R8, $0x04
6267 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
6268 CMPQ R8, $0x08
6269 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
6270 CMPQ R8, $0x10
6271 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
6272 CMPQ R8, $0x20
6273 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
6274 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
6275
6276emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
6277 MOVL (R9), R10
6278 MOVL R10, (AX)
6279 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
6280
6281emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
6282 MOVL (R9), R10
6283 MOVL -4(R9)(R8*1), R9
6284 MOVL R10, (AX)
6285 MOVL R9, -4(AX)(R8*1)
6286 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
6287
6288emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
6289 MOVQ (R9), R10
6290 MOVQ -8(R9)(R8*1), R9
6291 MOVQ R10, (AX)
6292 MOVQ R9, -8(AX)(R8*1)
6293 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
6294
6295emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
6296 MOVOU (R9), X0
6297 MOVOU -16(R9)(R8*1), X1
6298 MOVOU X0, (AX)
6299 MOVOU X1, -16(AX)(R8*1)
6300 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
6301
6302emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
6303 MOVOU (R9), X0
6304 MOVOU 16(R9), X1
6305 MOVOU -32(R9)(R8*1), X2
6306 MOVOU -16(R9)(R8*1), X3
6307 MOVOU X0, (AX)
6308 MOVOU X1, 16(AX)
6309 MOVOU X2, -32(AX)(R8*1)
6310 MOVOU X3, -16(AX)(R8*1)
6311
6312memmove_end_copy_match_emit_encodeBetterBlockAsm:
6313 MOVQ BX, AX
6314 JMP emit_literal_done_match_emit_encodeBetterBlockAsm
6315
6316memmove_long_match_emit_encodeBetterBlockAsm:
6317 LEAQ (AX)(R8*1), BX
6318
6319 // genMemMoveLong
6320 MOVOU (R9), X0
6321 MOVOU 16(R9), X1
6322 MOVOU -32(R9)(R8*1), X2
6323 MOVOU -16(R9)(R8*1), X3
6324 MOVQ R8, R12
6325 SHRQ $0x05, R12
6326 MOVQ AX, R10
6327 ANDL $0x0000001f, R10
6328 MOVQ $0x00000040, R13
6329 SUBQ R10, R13
6330 DECQ R12
6331 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
6332 LEAQ -32(R9)(R13*1), R10
6333 LEAQ -32(AX)(R13*1), R14
6334
6335emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
6336 MOVOU (R10), X4
6337 MOVOU 16(R10), X5
6338 MOVOA X4, (R14)
6339 MOVOA X5, 16(R14)
6340 ADDQ $0x20, R14
6341 ADDQ $0x20, R10
6342 ADDQ $0x20, R13
6343 DECQ R12
6344 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
6345
6346emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
6347 MOVOU -32(R9)(R13*1), X4
6348 MOVOU -16(R9)(R13*1), X5
6349 MOVOA X4, -32(AX)(R13*1)
6350 MOVOA X5, -16(AX)(R13*1)
6351 ADDQ $0x20, R13
6352 CMPQ R8, R13
6353 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
6354 MOVOU X0, (AX)
6355 MOVOU X1, 16(AX)
6356 MOVOU X2, -32(AX)(R8*1)
6357 MOVOU X3, -16(AX)(R8*1)
6358 MOVQ BX, AX
6359
6360emit_literal_done_match_emit_encodeBetterBlockAsm:
6361 ADDL R11, CX
6362 ADDL $0x04, R11
6363 MOVL CX, 12(SP)
6364
6365 // emitCopy
6366 CMPL DI, $0x00010000
6367 JB two_byte_offset_match_nolit_encodeBetterBlockAsm
6368 CMPL R11, $0x40
6369 JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm
6370 MOVB $0xff, (AX)
6371 MOVL DI, 1(AX)
6372 LEAL -64(R11), R11
6373 ADDQ $0x05, AX
6374 CMPL R11, $0x04
6375 JB four_bytes_remain_match_nolit_encodeBetterBlockAsm
6376
6377 // emitRepeat
6378emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
6379 MOVL R11, BX
6380 LEAL -4(R11), R11
6381 CMPL BX, $0x08
6382 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
6383 CMPL BX, $0x0c
6384 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
6385 CMPL DI, $0x00000800
6386 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
6387
6388cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
6389 CMPL R11, $0x00000104
6390 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
6391 CMPL R11, $0x00010100
6392 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
6393 CMPL R11, $0x0100ffff
6394 JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
6395 LEAL -16842747(R11), R11
6396 MOVL $0xfffb001d, (AX)
6397 MOVB $0xff, 4(AX)
6398 ADDQ $0x05, AX
6399 JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
6400
6401repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
6402 LEAL -65536(R11), R11
6403 MOVL R11, DI
6404 MOVW $0x001d, (AX)
6405 MOVW R11, 2(AX)
6406 SARL $0x10, DI
6407 MOVB DI, 4(AX)
6408 ADDQ $0x05, AX
6409 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6410
6411repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
6412 LEAL -256(R11), R11
6413 MOVW $0x0019, (AX)
6414 MOVW R11, 2(AX)
6415 ADDQ $0x04, AX
6416 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6417
6418repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
6419 LEAL -4(R11), R11
6420 MOVW $0x0015, (AX)
6421 MOVB R11, 2(AX)
6422 ADDQ $0x03, AX
6423 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6424
6425repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
6426 SHLL $0x02, R11
6427 ORL $0x01, R11
6428 MOVW R11, (AX)
6429 ADDQ $0x02, AX
6430 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6431
6432repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
6433 XORQ BX, BX
6434 LEAL 1(BX)(R11*4), R11
6435 MOVB DI, 1(AX)
6436 SARL $0x08, DI
6437 SHLL $0x05, DI
6438 ORL DI, R11
6439 MOVB R11, (AX)
6440 ADDQ $0x02, AX
6441 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6442
6443four_bytes_remain_match_nolit_encodeBetterBlockAsm:
6444 TESTL R11, R11
6445 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm
6446 XORL BX, BX
6447 LEAL -1(BX)(R11*4), R11
6448 MOVB R11, (AX)
6449 MOVL DI, 1(AX)
6450 ADDQ $0x05, AX
6451 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6452
6453two_byte_offset_match_nolit_encodeBetterBlockAsm:
6454 CMPL R11, $0x40
6455 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm
6456 CMPL DI, $0x00000800
6457 JAE long_offset_short_match_nolit_encodeBetterBlockAsm
6458 MOVL $0x00000001, BX
6459 LEAL 16(BX), BX
6460 MOVB DI, 1(AX)
6461 MOVL DI, R8
6462 SHRL $0x08, R8
6463 SHLL $0x05, R8
6464 ORL R8, BX
6465 MOVB BL, (AX)
6466 ADDQ $0x02, AX
6467 SUBL $0x08, R11
6468
6469 // emitRepeat
6470 LEAL -4(R11), R11
6471 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6472
6473emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6474 MOVL R11, BX
6475 LEAL -4(R11), R11
6476 CMPL BX, $0x08
6477 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6478 CMPL BX, $0x0c
6479 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6480 CMPL DI, $0x00000800
6481 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6482
6483cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6484 CMPL R11, $0x00000104
6485 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6486 CMPL R11, $0x00010100
6487 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6488 CMPL R11, $0x0100ffff
6489 JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6490 LEAL -16842747(R11), R11
6491 MOVL $0xfffb001d, (AX)
6492 MOVB $0xff, 4(AX)
6493 ADDQ $0x05, AX
6494 JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6495
6496repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6497 LEAL -65536(R11), R11
6498 MOVL R11, DI
6499 MOVW $0x001d, (AX)
6500 MOVW R11, 2(AX)
6501 SARL $0x10, DI
6502 MOVB DI, 4(AX)
6503 ADDQ $0x05, AX
6504 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6505
6506repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6507 LEAL -256(R11), R11
6508 MOVW $0x0019, (AX)
6509 MOVW R11, 2(AX)
6510 ADDQ $0x04, AX
6511 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6512
6513repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6514 LEAL -4(R11), R11
6515 MOVW $0x0015, (AX)
6516 MOVB R11, 2(AX)
6517 ADDQ $0x03, AX
6518 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6519
6520repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6521 SHLL $0x02, R11
6522 ORL $0x01, R11
6523 MOVW R11, (AX)
6524 ADDQ $0x02, AX
6525 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6526
6527repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6528 XORQ BX, BX
6529 LEAL 1(BX)(R11*4), R11
6530 MOVB DI, 1(AX)
6531 SARL $0x08, DI
6532 SHLL $0x05, DI
6533 ORL DI, R11
6534 MOVB R11, (AX)
6535 ADDQ $0x02, AX
6536 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6537
6538long_offset_short_match_nolit_encodeBetterBlockAsm:
6539 MOVB $0xee, (AX)
6540 MOVW DI, 1(AX)
6541 LEAL -60(R11), R11
6542 ADDQ $0x03, AX
6543
6544 // emitRepeat
6545emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6546 MOVL R11, BX
6547 LEAL -4(R11), R11
6548 CMPL BX, $0x08
6549 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
6550 CMPL BX, $0x0c
6551 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
6552 CMPL DI, $0x00000800
6553 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
6554
6555cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6556 CMPL R11, $0x00000104
6557 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
6558 CMPL R11, $0x00010100
6559 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
6560 CMPL R11, $0x0100ffff
6561 JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
6562 LEAL -16842747(R11), R11
6563 MOVL $0xfffb001d, (AX)
6564 MOVB $0xff, 4(AX)
6565 ADDQ $0x05, AX
6566 JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
6567
6568repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6569 LEAL -65536(R11), R11
6570 MOVL R11, DI
6571 MOVW $0x001d, (AX)
6572 MOVW R11, 2(AX)
6573 SARL $0x10, DI
6574 MOVB DI, 4(AX)
6575 ADDQ $0x05, AX
6576 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6577
6578repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6579 LEAL -256(R11), R11
6580 MOVW $0x0019, (AX)
6581 MOVW R11, 2(AX)
6582 ADDQ $0x04, AX
6583 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6584
6585repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6586 LEAL -4(R11), R11
6587 MOVW $0x0015, (AX)
6588 MOVB R11, 2(AX)
6589 ADDQ $0x03, AX
6590 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6591
6592repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6593 SHLL $0x02, R11
6594 ORL $0x01, R11
6595 MOVW R11, (AX)
6596 ADDQ $0x02, AX
6597 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6598
6599repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6600 XORQ BX, BX
6601 LEAL 1(BX)(R11*4), R11
6602 MOVB DI, 1(AX)
6603 SARL $0x08, DI
6604 SHLL $0x05, DI
6605 ORL DI, R11
6606 MOVB R11, (AX)
6607 ADDQ $0x02, AX
6608 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6609
6610two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
6611 MOVL R11, BX
6612 SHLL $0x02, BX
6613 CMPL R11, $0x0c
6614 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm
6615 CMPL DI, $0x00000800
6616 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm
6617 LEAL -15(BX), BX
6618 MOVB DI, 1(AX)
6619 SHRL $0x08, DI
6620 SHLL $0x05, DI
6621 ORL DI, BX
6622 MOVB BL, (AX)
6623 ADDQ $0x02, AX
6624 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6625
6626emit_copy_three_match_nolit_encodeBetterBlockAsm:
6627 LEAL -2(BX), BX
6628 MOVB BL, (AX)
6629 MOVW DI, 1(AX)
6630 ADDQ $0x03, AX
6631 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6632
6633match_is_repeat_encodeBetterBlockAsm:
6634 MOVL 12(SP), BX
6635 CMPL BX, SI
6636 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
6637 MOVL SI, R8
6638 MOVL SI, 12(SP)
6639 LEAQ (DX)(BX*1), R9
6640 SUBL BX, R8
6641 LEAL -1(R8), BX
6642 CMPL BX, $0x3c
6643 JB one_byte_match_emit_repeat_encodeBetterBlockAsm
6644 CMPL BX, $0x00000100
6645 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm
6646 CMPL BX, $0x00010000
6647 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm
6648 CMPL BX, $0x01000000
6649 JB four_bytes_match_emit_repeat_encodeBetterBlockAsm
6650 MOVB $0xfc, (AX)
6651 MOVL BX, 1(AX)
6652 ADDQ $0x05, AX
6653 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
6654
6655four_bytes_match_emit_repeat_encodeBetterBlockAsm:
6656 MOVL BX, R10
6657 SHRL $0x10, R10
6658 MOVB $0xf8, (AX)
6659 MOVW BX, 1(AX)
6660 MOVB R10, 3(AX)
6661 ADDQ $0x04, AX
6662 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
6663
6664three_bytes_match_emit_repeat_encodeBetterBlockAsm:
6665 MOVB $0xf4, (AX)
6666 MOVW BX, 1(AX)
6667 ADDQ $0x03, AX
6668 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
6669
6670two_bytes_match_emit_repeat_encodeBetterBlockAsm:
6671 MOVB $0xf0, (AX)
6672 MOVB BL, 1(AX)
6673 ADDQ $0x02, AX
6674 CMPL BX, $0x40
6675 JB memmove_match_emit_repeat_encodeBetterBlockAsm
6676 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
6677
6678one_byte_match_emit_repeat_encodeBetterBlockAsm:
6679 SHLB $0x02, BL
6680 MOVB BL, (AX)
6681 ADDQ $0x01, AX
6682
6683memmove_match_emit_repeat_encodeBetterBlockAsm:
6684 LEAQ (AX)(R8*1), BX
6685
6686 // genMemMoveShort
6687 CMPQ R8, $0x04
6688 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
6689 CMPQ R8, $0x08
6690 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
6691 CMPQ R8, $0x10
6692 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
6693 CMPQ R8, $0x20
6694 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
6695 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
6696
6697emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
6698 MOVL (R9), R10
6699 MOVL R10, (AX)
6700 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
6701
6702emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
6703 MOVL (R9), R10
6704 MOVL -4(R9)(R8*1), R9
6705 MOVL R10, (AX)
6706 MOVL R9, -4(AX)(R8*1)
6707 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
6708
6709emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
6710 MOVQ (R9), R10
6711 MOVQ -8(R9)(R8*1), R9
6712 MOVQ R10, (AX)
6713 MOVQ R9, -8(AX)(R8*1)
6714 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
6715
6716emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
6717 MOVOU (R9), X0
6718 MOVOU -16(R9)(R8*1), X1
6719 MOVOU X0, (AX)
6720 MOVOU X1, -16(AX)(R8*1)
6721 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
6722
6723emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
6724 MOVOU (R9), X0
6725 MOVOU 16(R9), X1
6726 MOVOU -32(R9)(R8*1), X2
6727 MOVOU -16(R9)(R8*1), X3
6728 MOVOU X0, (AX)
6729 MOVOU X1, 16(AX)
6730 MOVOU X2, -32(AX)(R8*1)
6731 MOVOU X3, -16(AX)(R8*1)
6732
6733memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
6734 MOVQ BX, AX
6735 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
6736
6737memmove_long_match_emit_repeat_encodeBetterBlockAsm:
6738 LEAQ (AX)(R8*1), BX
6739
6740 // genMemMoveLong
6741 MOVOU (R9), X0
6742 MOVOU 16(R9), X1
6743 MOVOU -32(R9)(R8*1), X2
6744 MOVOU -16(R9)(R8*1), X3
6745 MOVQ R8, R12
6746 SHRQ $0x05, R12
6747 MOVQ AX, R10
6748 ANDL $0x0000001f, R10
6749 MOVQ $0x00000040, R13
6750 SUBQ R10, R13
6751 DECQ R12
6752 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
6753 LEAQ -32(R9)(R13*1), R10
6754 LEAQ -32(AX)(R13*1), R14
6755
6756emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
6757 MOVOU (R10), X4
6758 MOVOU 16(R10), X5
6759 MOVOA X4, (R14)
6760 MOVOA X5, 16(R14)
6761 ADDQ $0x20, R14
6762 ADDQ $0x20, R10
6763 ADDQ $0x20, R13
6764 DECQ R12
6765 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
6766
6767emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
6768 MOVOU -32(R9)(R13*1), X4
6769 MOVOU -16(R9)(R13*1), X5
6770 MOVOA X4, -32(AX)(R13*1)
6771 MOVOA X5, -16(AX)(R13*1)
6772 ADDQ $0x20, R13
6773 CMPQ R8, R13
6774 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
6775 MOVOU X0, (AX)
6776 MOVOU X1, 16(AX)
6777 MOVOU X2, -32(AX)(R8*1)
6778 MOVOU X3, -16(AX)(R8*1)
6779 MOVQ BX, AX
6780
6781emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
6782 ADDL R11, CX
6783 ADDL $0x04, R11
6784 MOVL CX, 12(SP)
6785
6786 // emitRepeat
6787emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
6788 MOVL R11, BX
6789 LEAL -4(R11), R11
6790 CMPL BX, $0x08
6791 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm
6792 CMPL BX, $0x0c
6793 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
6794 CMPL DI, $0x00000800
6795 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
6796
6797cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
6798 CMPL R11, $0x00000104
6799 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm
6800 CMPL R11, $0x00010100
6801 JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm
6802 CMPL R11, $0x0100ffff
6803 JB repeat_five_match_nolit_repeat_encodeBetterBlockAsm
6804 LEAL -16842747(R11), R11
6805 MOVL $0xfffb001d, (AX)
6806 MOVB $0xff, 4(AX)
6807 ADDQ $0x05, AX
6808 JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
6809
6810repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
6811 LEAL -65536(R11), R11
6812 MOVL R11, DI
6813 MOVW $0x001d, (AX)
6814 MOVW R11, 2(AX)
6815 SARL $0x10, DI
6816 MOVB DI, 4(AX)
6817 ADDQ $0x05, AX
6818 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6819
6820repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
6821 LEAL -256(R11), R11
6822 MOVW $0x0019, (AX)
6823 MOVW R11, 2(AX)
6824 ADDQ $0x04, AX
6825 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6826
6827repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
6828 LEAL -4(R11), R11
6829 MOVW $0x0015, (AX)
6830 MOVB R11, 2(AX)
6831 ADDQ $0x03, AX
6832 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6833
6834repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
6835 SHLL $0x02, R11
6836 ORL $0x01, R11
6837 MOVW R11, (AX)
6838 ADDQ $0x02, AX
6839 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6840
6841repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
6842 XORQ BX, BX
6843 LEAL 1(BX)(R11*4), R11
6844 MOVB DI, 1(AX)
6845 SARL $0x08, DI
6846 SHLL $0x05, DI
6847 ORL DI, R11
6848 MOVB R11, (AX)
6849 ADDQ $0x02, AX
6850
6851match_nolit_emitcopy_end_encodeBetterBlockAsm:
6852 CMPL CX, 8(SP)
6853 JAE emit_remainder_encodeBetterBlockAsm
6854 CMPQ AX, (SP)
6855 JB match_nolit_dst_ok_encodeBetterBlockAsm
6856 MOVQ $0x00000000, ret+48(FP)
6857 RET
6858
6859match_nolit_dst_ok_encodeBetterBlockAsm:
6860 MOVQ $0x00cf1bbcdcbfa563, BX
6861 MOVQ $0x9e3779b1, DI
6862 LEAQ 1(SI), SI
6863 LEAQ -2(CX), R8
6864 MOVQ (DX)(SI*1), R9
6865 MOVQ 1(DX)(SI*1), R10
6866 MOVQ (DX)(R8*1), R11
6867 MOVQ 1(DX)(R8*1), R12
6868 SHLQ $0x08, R9
6869 IMULQ BX, R9
6870 SHRQ $0x2f, R9
6871 SHLQ $0x20, R10
6872 IMULQ DI, R10
6873 SHRQ $0x32, R10
6874 SHLQ $0x08, R11
6875 IMULQ BX, R11
6876 SHRQ $0x2f, R11
6877 SHLQ $0x20, R12
6878 IMULQ DI, R12
6879 SHRQ $0x32, R12
6880 LEAQ 1(SI), DI
6881 LEAQ 1(R8), R13
6882 MOVL SI, 24(SP)(R9*4)
6883 MOVL R8, 24(SP)(R11*4)
6884 MOVL DI, 524312(SP)(R10*4)
6885 MOVL R13, 524312(SP)(R12*4)
6886 LEAQ 1(R8)(SI*1), DI
6887 SHRQ $0x01, DI
6888 ADDQ $0x01, SI
6889 SUBQ $0x01, R8
6890
6891index_loop_encodeBetterBlockAsm:
6892 CMPQ DI, R8
6893 JAE search_loop_encodeBetterBlockAsm
6894 MOVQ (DX)(SI*1), R9
6895 MOVQ (DX)(DI*1), R10
6896 SHLQ $0x08, R9
6897 IMULQ BX, R9
6898 SHRQ $0x2f, R9
6899 SHLQ $0x08, R10
6900 IMULQ BX, R10
6901 SHRQ $0x2f, R10
6902 MOVL SI, 24(SP)(R9*4)
6903 MOVL DI, 24(SP)(R10*4)
6904 ADDQ $0x02, SI
6905 ADDQ $0x02, DI
6906 JMP index_loop_encodeBetterBlockAsm
6907
6908emit_remainder_encodeBetterBlockAsm:
6909 MOVQ src_len+32(FP), CX
6910 SUBL 12(SP), CX
6911 LEAQ 5(AX)(CX*1), CX
6912 CMPQ CX, (SP)
6913 JB emit_remainder_ok_encodeBetterBlockAsm
6914 MOVQ $0x00000000, ret+48(FP)
6915 RET
6916
6917emit_remainder_ok_encodeBetterBlockAsm:
6918 MOVQ src_len+32(FP), CX
6919 MOVL 12(SP), BX
6920 CMPL BX, CX
6921 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm
6922 MOVL CX, SI
6923 MOVL CX, 12(SP)
6924 LEAQ (DX)(BX*1), CX
6925 SUBL BX, SI
6926 LEAL -1(SI), DX
6927 CMPL DX, $0x3c
6928 JB one_byte_emit_remainder_encodeBetterBlockAsm
6929 CMPL DX, $0x00000100
6930 JB two_bytes_emit_remainder_encodeBetterBlockAsm
6931 CMPL DX, $0x00010000
6932 JB three_bytes_emit_remainder_encodeBetterBlockAsm
6933 CMPL DX, $0x01000000
6934 JB four_bytes_emit_remainder_encodeBetterBlockAsm
6935 MOVB $0xfc, (AX)
6936 MOVL DX, 1(AX)
6937 ADDQ $0x05, AX
6938 JMP memmove_long_emit_remainder_encodeBetterBlockAsm
6939
6940four_bytes_emit_remainder_encodeBetterBlockAsm:
6941 MOVL DX, BX
6942 SHRL $0x10, BX
6943 MOVB $0xf8, (AX)
6944 MOVW DX, 1(AX)
6945 MOVB BL, 3(AX)
6946 ADDQ $0x04, AX
6947 JMP memmove_long_emit_remainder_encodeBetterBlockAsm
6948
6949three_bytes_emit_remainder_encodeBetterBlockAsm:
6950 MOVB $0xf4, (AX)
6951 MOVW DX, 1(AX)
6952 ADDQ $0x03, AX
6953 JMP memmove_long_emit_remainder_encodeBetterBlockAsm
6954
6955two_bytes_emit_remainder_encodeBetterBlockAsm:
6956 MOVB $0xf0, (AX)
6957 MOVB DL, 1(AX)
6958 ADDQ $0x02, AX
6959 CMPL DX, $0x40
6960 JB memmove_emit_remainder_encodeBetterBlockAsm
6961 JMP memmove_long_emit_remainder_encodeBetterBlockAsm
6962
6963one_byte_emit_remainder_encodeBetterBlockAsm:
6964 SHLB $0x02, DL
6965 MOVB DL, (AX)
6966 ADDQ $0x01, AX
6967
6968memmove_emit_remainder_encodeBetterBlockAsm:
6969 LEAQ (AX)(SI*1), DX
6970 MOVL SI, BX
6971
6972 // genMemMoveShort
6973 CMPQ BX, $0x03
6974 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2
6975 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3
6976 CMPQ BX, $0x08
6977 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
6978 CMPQ BX, $0x10
6979 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
6980 CMPQ BX, $0x20
6981 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
6982 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
6983
6984emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2:
6985 MOVB (CX), SI
6986 MOVB -1(CX)(BX*1), CL
6987 MOVB SI, (AX)
6988 MOVB CL, -1(AX)(BX*1)
6989 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
6990
6991emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3:
6992 MOVW (CX), SI
6993 MOVB 2(CX), CL
6994 MOVW SI, (AX)
6995 MOVB CL, 2(AX)
6996 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
6997
6998emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
6999 MOVL (CX), SI
7000 MOVL -4(CX)(BX*1), CX
7001 MOVL SI, (AX)
7002 MOVL CX, -4(AX)(BX*1)
7003 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
7004
7005emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
7006 MOVQ (CX), SI
7007 MOVQ -8(CX)(BX*1), CX
7008 MOVQ SI, (AX)
7009 MOVQ CX, -8(AX)(BX*1)
7010 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
7011
7012emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
7013 MOVOU (CX), X0
7014 MOVOU -16(CX)(BX*1), X1
7015 MOVOU X0, (AX)
7016 MOVOU X1, -16(AX)(BX*1)
7017 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
7018
7019emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
7020 MOVOU (CX), X0
7021 MOVOU 16(CX), X1
7022 MOVOU -32(CX)(BX*1), X2
7023 MOVOU -16(CX)(BX*1), X3
7024 MOVOU X0, (AX)
7025 MOVOU X1, 16(AX)
7026 MOVOU X2, -32(AX)(BX*1)
7027 MOVOU X3, -16(AX)(BX*1)
7028
7029memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
7030 MOVQ DX, AX
7031 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm
7032
7033memmove_long_emit_remainder_encodeBetterBlockAsm:
7034 LEAQ (AX)(SI*1), DX
7035 MOVL SI, BX
7036
7037 // genMemMoveLong
7038 MOVOU (CX), X0
7039 MOVOU 16(CX), X1
7040 MOVOU -32(CX)(BX*1), X2
7041 MOVOU -16(CX)(BX*1), X3
7042 MOVQ BX, DI
7043 SHRQ $0x05, DI
7044 MOVQ AX, SI
7045 ANDL $0x0000001f, SI
7046 MOVQ $0x00000040, R8
7047 SUBQ SI, R8
7048 DECQ DI
7049 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
7050 LEAQ -32(CX)(R8*1), SI
7051 LEAQ -32(AX)(R8*1), R9
7052
7053emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
7054 MOVOU (SI), X4
7055 MOVOU 16(SI), X5
7056 MOVOA X4, (R9)
7057 MOVOA X5, 16(R9)
7058 ADDQ $0x20, R9
7059 ADDQ $0x20, SI
7060 ADDQ $0x20, R8
7061 DECQ DI
7062 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
7063
7064emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
7065 MOVOU -32(CX)(R8*1), X4
7066 MOVOU -16(CX)(R8*1), X5
7067 MOVOA X4, -32(AX)(R8*1)
7068 MOVOA X5, -16(AX)(R8*1)
7069 ADDQ $0x20, R8
7070 CMPQ BX, R8
7071 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
7072 MOVOU X0, (AX)
7073 MOVOU X1, 16(AX)
7074 MOVOU X2, -32(AX)(BX*1)
7075 MOVOU X3, -16(AX)(BX*1)
7076 MOVQ DX, AX
7077
7078emit_literal_done_emit_remainder_encodeBetterBlockAsm:
7079 MOVQ dst_base+0(FP), CX
7080 SUBQ CX, AX
7081 MOVQ AX, ret+48(FP)
7082 RET
7083
7084// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
7085// Requires: BMI, SSE2
7086TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56
7087 MOVQ dst_base+0(FP), AX
7088 MOVQ $0x00001200, CX
7089 LEAQ 24(SP), DX
7090 PXOR X0, X0
7091
7092zero_loop_encodeBetterBlockAsm4MB:
7093 MOVOU X0, (DX)
7094 MOVOU X0, 16(DX)
7095 MOVOU X0, 32(DX)
7096 MOVOU X0, 48(DX)
7097 MOVOU X0, 64(DX)
7098 MOVOU X0, 80(DX)
7099 MOVOU X0, 96(DX)
7100 MOVOU X0, 112(DX)
7101 ADDQ $0x80, DX
7102 DECQ CX
7103 JNZ zero_loop_encodeBetterBlockAsm4MB
7104 MOVL $0x00000000, 12(SP)
7105 MOVQ src_len+32(FP), CX
7106 LEAQ -6(CX), DX
7107 LEAQ -8(CX), BX
7108 MOVL BX, 8(SP)
7109 SHRQ $0x05, CX
7110 SUBL CX, DX
7111 LEAQ (AX)(DX*1), DX
7112 MOVQ DX, (SP)
7113 MOVL $0x00000001, CX
7114 MOVL $0x00000000, 16(SP)
7115 MOVQ src_base+24(FP), DX
7116
7117search_loop_encodeBetterBlockAsm4MB:
7118 MOVL CX, BX
7119 SUBL 12(SP), BX
7120 SHRL $0x07, BX
7121 CMPL BX, $0x63
7122 JBE check_maxskip_ok_encodeBetterBlockAsm4MB
7123 LEAL 100(CX), BX
7124 JMP check_maxskip_cont_encodeBetterBlockAsm4MB
7125
7126check_maxskip_ok_encodeBetterBlockAsm4MB:
7127 LEAL 1(CX)(BX*1), BX
7128
7129check_maxskip_cont_encodeBetterBlockAsm4MB:
7130 CMPL BX, 8(SP)
7131 JAE emit_remainder_encodeBetterBlockAsm4MB
7132 MOVQ (DX)(CX*1), SI
7133 MOVL BX, 20(SP)
7134 MOVQ $0x00cf1bbcdcbfa563, R8
7135 MOVQ $0x9e3779b1, BX
7136 MOVQ SI, R9
7137 MOVQ SI, R10
7138 SHLQ $0x08, R9
7139 IMULQ R8, R9
7140 SHRQ $0x2f, R9
7141 SHLQ $0x20, R10
7142 IMULQ BX, R10
7143 SHRQ $0x32, R10
7144 MOVL 24(SP)(R9*4), BX
7145 MOVL 524312(SP)(R10*4), DI
7146 MOVL CX, 24(SP)(R9*4)
7147 MOVL CX, 524312(SP)(R10*4)
7148 MOVQ (DX)(BX*1), R9
7149 MOVQ (DX)(DI*1), R10
7150 CMPQ R9, SI
7151 JEQ candidate_match_encodeBetterBlockAsm4MB
7152 CMPQ R10, SI
7153 JNE no_short_found_encodeBetterBlockAsm4MB
7154 MOVL DI, BX
7155 JMP candidate_match_encodeBetterBlockAsm4MB
7156
7157no_short_found_encodeBetterBlockAsm4MB:
7158 CMPL R9, SI
7159 JEQ candidate_match_encodeBetterBlockAsm4MB
7160 CMPL R10, SI
7161 JEQ candidateS_match_encodeBetterBlockAsm4MB
7162 MOVL 20(SP), CX
7163 JMP search_loop_encodeBetterBlockAsm4MB
7164
7165candidateS_match_encodeBetterBlockAsm4MB:
7166 SHRQ $0x08, SI
7167 MOVQ SI, R9
7168 SHLQ $0x08, R9
7169 IMULQ R8, R9
7170 SHRQ $0x2f, R9
7171 MOVL 24(SP)(R9*4), BX
7172 INCL CX
7173 MOVL CX, 24(SP)(R9*4)
7174 CMPL (DX)(BX*1), SI
7175 JEQ candidate_match_encodeBetterBlockAsm4MB
7176 DECL CX
7177 MOVL DI, BX
7178
7179candidate_match_encodeBetterBlockAsm4MB:
7180 MOVL 12(SP), SI
7181 TESTL BX, BX
7182 JZ match_extend_back_end_encodeBetterBlockAsm4MB
7183
7184match_extend_back_loop_encodeBetterBlockAsm4MB:
7185 CMPL CX, SI
7186 JBE match_extend_back_end_encodeBetterBlockAsm4MB
7187 MOVB -1(DX)(BX*1), DI
7188 MOVB -1(DX)(CX*1), R8
7189 CMPB DI, R8
7190 JNE match_extend_back_end_encodeBetterBlockAsm4MB
7191 LEAL -1(CX), CX
7192 DECL BX
7193 JZ match_extend_back_end_encodeBetterBlockAsm4MB
7194 JMP match_extend_back_loop_encodeBetterBlockAsm4MB
7195
7196match_extend_back_end_encodeBetterBlockAsm4MB:
7197 MOVL CX, SI
7198 SUBL 12(SP), SI
7199 LEAQ 4(AX)(SI*1), SI
7200 CMPQ SI, (SP)
7201 JB match_dst_size_check_encodeBetterBlockAsm4MB
7202 MOVQ $0x00000000, ret+48(FP)
7203 RET
7204
7205match_dst_size_check_encodeBetterBlockAsm4MB:
7206 MOVL CX, SI
7207 ADDL $0x04, CX
7208 ADDL $0x04, BX
7209 MOVQ src_len+32(FP), DI
7210 SUBL CX, DI
7211 LEAQ (DX)(CX*1), R8
7212 LEAQ (DX)(BX*1), R9
7213
7214 // matchLen
7215 XORL R11, R11
7216
7217matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB:
7218 CMPL DI, $0x10
7219 JB matchlen_match8_match_nolit_encodeBetterBlockAsm4MB
7220 MOVQ (R8)(R11*1), R10
7221 MOVQ 8(R8)(R11*1), R12
7222 XORQ (R9)(R11*1), R10
7223 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
7224 XORQ 8(R9)(R11*1), R12
7225 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB
7226 LEAL -16(DI), DI
7227 LEAL 16(R11), R11
7228 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB
7229
7230matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB:
7231#ifdef GOAMD64_v3
7232 TZCNTQ R12, R12
7233
7234#else
7235 BSFQ R12, R12
7236
7237#endif
7238 SARQ $0x03, R12
7239 LEAL 8(R11)(R12*1), R11
7240 JMP match_nolit_end_encodeBetterBlockAsm4MB
7241
7242matchlen_match8_match_nolit_encodeBetterBlockAsm4MB:
7243 CMPL DI, $0x08
7244 JB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
7245 MOVQ (R8)(R11*1), R10
7246 XORQ (R9)(R11*1), R10
7247 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
7248 LEAL -8(DI), DI
7249 LEAL 8(R11), R11
7250 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
7251
7252matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB:
7253#ifdef GOAMD64_v3
7254 TZCNTQ R10, R10
7255
7256#else
7257 BSFQ R10, R10
7258
7259#endif
7260 SARQ $0x03, R10
7261 LEAL (R11)(R10*1), R11
7262 JMP match_nolit_end_encodeBetterBlockAsm4MB
7263
7264matchlen_match4_match_nolit_encodeBetterBlockAsm4MB:
7265 CMPL DI, $0x04
7266 JB matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
7267 MOVL (R8)(R11*1), R10
7268 CMPL (R9)(R11*1), R10
7269 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
7270 LEAL -4(DI), DI
7271 LEAL 4(R11), R11
7272
7273matchlen_match2_match_nolit_encodeBetterBlockAsm4MB:
7274 CMPL DI, $0x01
7275 JE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
7276 JB match_nolit_end_encodeBetterBlockAsm4MB
7277 MOVW (R8)(R11*1), R10
7278 CMPW (R9)(R11*1), R10
7279 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
7280 LEAL 2(R11), R11
7281 SUBL $0x02, DI
7282 JZ match_nolit_end_encodeBetterBlockAsm4MB
7283
7284matchlen_match1_match_nolit_encodeBetterBlockAsm4MB:
7285 MOVB (R8)(R11*1), R10
7286 CMPB (R9)(R11*1), R10
7287 JNE match_nolit_end_encodeBetterBlockAsm4MB
7288 LEAL 1(R11), R11
7289
7290match_nolit_end_encodeBetterBlockAsm4MB:
7291 MOVL CX, DI
7292 SUBL BX, DI
7293
7294 // Check if repeat
7295 CMPL 16(SP), DI
7296 JEQ match_is_repeat_encodeBetterBlockAsm4MB
7297 CMPL R11, $0x01
7298 JA match_length_ok_encodeBetterBlockAsm4MB
7299 CMPL DI, $0x0000ffff
7300 JBE match_length_ok_encodeBetterBlockAsm4MB
7301 MOVL 20(SP), CX
7302 INCL CX
7303 JMP search_loop_encodeBetterBlockAsm4MB
7304
7305match_length_ok_encodeBetterBlockAsm4MB:
7306 MOVL DI, 16(SP)
7307 MOVL 12(SP), BX
7308 CMPL BX, SI
7309 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB
7310 MOVL SI, R8
7311 MOVL SI, 12(SP)
7312 LEAQ (DX)(BX*1), R9
7313 SUBL BX, R8
7314 LEAL -1(R8), BX
7315 CMPL BX, $0x3c
7316 JB one_byte_match_emit_encodeBetterBlockAsm4MB
7317 CMPL BX, $0x00000100
7318 JB two_bytes_match_emit_encodeBetterBlockAsm4MB
7319 CMPL BX, $0x00010000
7320 JB three_bytes_match_emit_encodeBetterBlockAsm4MB
7321 MOVL BX, R10
7322 SHRL $0x10, R10
7323 MOVB $0xf8, (AX)
7324 MOVW BX, 1(AX)
7325 MOVB R10, 3(AX)
7326 ADDQ $0x04, AX
7327 JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
7328
7329three_bytes_match_emit_encodeBetterBlockAsm4MB:
7330 MOVB $0xf4, (AX)
7331 MOVW BX, 1(AX)
7332 ADDQ $0x03, AX
7333 JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
7334
7335two_bytes_match_emit_encodeBetterBlockAsm4MB:
7336 MOVB $0xf0, (AX)
7337 MOVB BL, 1(AX)
7338 ADDQ $0x02, AX
7339 CMPL BX, $0x40
7340 JB memmove_match_emit_encodeBetterBlockAsm4MB
7341 JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
7342
7343one_byte_match_emit_encodeBetterBlockAsm4MB:
7344 SHLB $0x02, BL
7345 MOVB BL, (AX)
7346 ADDQ $0x01, AX
7347
7348memmove_match_emit_encodeBetterBlockAsm4MB:
7349 LEAQ (AX)(R8*1), BX
7350
7351 // genMemMoveShort
7352 CMPQ R8, $0x04
7353 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
7354 CMPQ R8, $0x08
7355 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
7356 CMPQ R8, $0x10
7357 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
7358 CMPQ R8, $0x20
7359 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
7360 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
7361
7362emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
7363 MOVL (R9), R10
7364 MOVL R10, (AX)
7365 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
7366
7367emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
7368 MOVL (R9), R10
7369 MOVL -4(R9)(R8*1), R9
7370 MOVL R10, (AX)
7371 MOVL R9, -4(AX)(R8*1)
7372 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
7373
7374emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
7375 MOVQ (R9), R10
7376 MOVQ -8(R9)(R8*1), R9
7377 MOVQ R10, (AX)
7378 MOVQ R9, -8(AX)(R8*1)
7379 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
7380
7381emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
7382 MOVOU (R9), X0
7383 MOVOU -16(R9)(R8*1), X1
7384 MOVOU X0, (AX)
7385 MOVOU X1, -16(AX)(R8*1)
7386 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
7387
7388emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
7389 MOVOU (R9), X0
7390 MOVOU 16(R9), X1
7391 MOVOU -32(R9)(R8*1), X2
7392 MOVOU -16(R9)(R8*1), X3
7393 MOVOU X0, (AX)
7394 MOVOU X1, 16(AX)
7395 MOVOU X2, -32(AX)(R8*1)
7396 MOVOU X3, -16(AX)(R8*1)
7397
7398memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
7399 MOVQ BX, AX
7400 JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB
7401
7402memmove_long_match_emit_encodeBetterBlockAsm4MB:
7403 LEAQ (AX)(R8*1), BX
7404
7405 // genMemMoveLong
7406 MOVOU (R9), X0
7407 MOVOU 16(R9), X1
7408 MOVOU -32(R9)(R8*1), X2
7409 MOVOU -16(R9)(R8*1), X3
7410 MOVQ R8, R12
7411 SHRQ $0x05, R12
7412 MOVQ AX, R10
7413 ANDL $0x0000001f, R10
7414 MOVQ $0x00000040, R13
7415 SUBQ R10, R13
7416 DECQ R12
7417 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
7418 LEAQ -32(R9)(R13*1), R10
7419 LEAQ -32(AX)(R13*1), R14
7420
7421emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
7422 MOVOU (R10), X4
7423 MOVOU 16(R10), X5
7424 MOVOA X4, (R14)
7425 MOVOA X5, 16(R14)
7426 ADDQ $0x20, R14
7427 ADDQ $0x20, R10
7428 ADDQ $0x20, R13
7429 DECQ R12
7430 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
7431
7432emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
7433 MOVOU -32(R9)(R13*1), X4
7434 MOVOU -16(R9)(R13*1), X5
7435 MOVOA X4, -32(AX)(R13*1)
7436 MOVOA X5, -16(AX)(R13*1)
7437 ADDQ $0x20, R13
7438 CMPQ R8, R13
7439 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
7440 MOVOU X0, (AX)
7441 MOVOU X1, 16(AX)
7442 MOVOU X2, -32(AX)(R8*1)
7443 MOVOU X3, -16(AX)(R8*1)
7444 MOVQ BX, AX
7445
7446emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
7447 ADDL R11, CX
7448 ADDL $0x04, R11
7449 MOVL CX, 12(SP)
7450
7451 // emitCopy
7452 CMPL DI, $0x00010000
7453 JB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
7454 CMPL R11, $0x40
7455 JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
7456 MOVB $0xff, (AX)
7457 MOVL DI, 1(AX)
7458 LEAL -64(R11), R11
7459 ADDQ $0x05, AX
7460 CMPL R11, $0x04
7461 JB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
7462
7463 // emitRepeat
7464 MOVL R11, BX
7465 LEAL -4(R11), R11
7466 CMPL BX, $0x08
7467 JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7468 CMPL BX, $0x0c
7469 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7470 CMPL DI, $0x00000800
7471 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7472
7473cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7474 CMPL R11, $0x00000104
7475 JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7476 CMPL R11, $0x00010100
7477 JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7478 LEAL -65536(R11), R11
7479 MOVL R11, DI
7480 MOVW $0x001d, (AX)
7481 MOVW R11, 2(AX)
7482 SARL $0x10, DI
7483 MOVB DI, 4(AX)
7484 ADDQ $0x05, AX
7485 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7486
7487repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7488 LEAL -256(R11), R11
7489 MOVW $0x0019, (AX)
7490 MOVW R11, 2(AX)
7491 ADDQ $0x04, AX
7492 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7493
7494repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7495 LEAL -4(R11), R11
7496 MOVW $0x0015, (AX)
7497 MOVB R11, 2(AX)
7498 ADDQ $0x03, AX
7499 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7500
7501repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7502 SHLL $0x02, R11
7503 ORL $0x01, R11
7504 MOVW R11, (AX)
7505 ADDQ $0x02, AX
7506 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7507
7508repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7509 XORQ BX, BX
7510 LEAL 1(BX)(R11*4), R11
7511 MOVB DI, 1(AX)
7512 SARL $0x08, DI
7513 SHLL $0x05, DI
7514 ORL DI, R11
7515 MOVB R11, (AX)
7516 ADDQ $0x02, AX
7517 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7518
7519four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
7520 TESTL R11, R11
7521 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7522 XORL BX, BX
7523 LEAL -1(BX)(R11*4), R11
7524 MOVB R11, (AX)
7525 MOVL DI, 1(AX)
7526 ADDQ $0x05, AX
7527 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7528
7529two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
7530 CMPL R11, $0x40
7531 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
7532 CMPL DI, $0x00000800
7533 JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB
7534 MOVL $0x00000001, BX
7535 LEAL 16(BX), BX
7536 MOVB DI, 1(AX)
7537 SHRL $0x08, DI
7538 SHLL $0x05, DI
7539 ORL DI, BX
7540 MOVB BL, (AX)
7541 ADDQ $0x02, AX
7542 SUBL $0x08, R11
7543
7544 // emitRepeat
7545 LEAL -4(R11), R11
7546 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7547 MOVL R11, BX
7548 LEAL -4(R11), R11
7549 CMPL BX, $0x08
7550 JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7551 CMPL BX, $0x0c
7552 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7553 CMPL DI, $0x00000800
7554 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7555
7556cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7557 CMPL R11, $0x00000104
7558 JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7559 CMPL R11, $0x00010100
7560 JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7561 LEAL -65536(R11), R11
7562 MOVL R11, DI
7563 MOVW $0x001d, (AX)
7564 MOVW R11, 2(AX)
7565 SARL $0x10, DI
7566 MOVB DI, 4(AX)
7567 ADDQ $0x05, AX
7568 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7569
7570repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7571 LEAL -256(R11), R11
7572 MOVW $0x0019, (AX)
7573 MOVW R11, 2(AX)
7574 ADDQ $0x04, AX
7575 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7576
7577repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7578 LEAL -4(R11), R11
7579 MOVW $0x0015, (AX)
7580 MOVB R11, 2(AX)
7581 ADDQ $0x03, AX
7582 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7583
7584repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7585 SHLL $0x02, R11
7586 ORL $0x01, R11
7587 MOVW R11, (AX)
7588 ADDQ $0x02, AX
7589 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7590
7591repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7592 XORQ BX, BX
7593 LEAL 1(BX)(R11*4), R11
7594 MOVB DI, 1(AX)
7595 SARL $0x08, DI
7596 SHLL $0x05, DI
7597 ORL DI, R11
7598 MOVB R11, (AX)
7599 ADDQ $0x02, AX
7600 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7601
7602long_offset_short_match_nolit_encodeBetterBlockAsm4MB:
7603 MOVB $0xee, (AX)
7604 MOVW DI, 1(AX)
7605 LEAL -60(R11), R11
7606 ADDQ $0x03, AX
7607
7608 // emitRepeat
7609 MOVL R11, BX
7610 LEAL -4(R11), R11
7611 CMPL BX, $0x08
7612 JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7613 CMPL BX, $0x0c
7614 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7615 CMPL DI, $0x00000800
7616 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7617
7618cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7619 CMPL R11, $0x00000104
7620 JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7621 CMPL R11, $0x00010100
7622 JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7623 LEAL -65536(R11), R11
7624 MOVL R11, DI
7625 MOVW $0x001d, (AX)
7626 MOVW R11, 2(AX)
7627 SARL $0x10, DI
7628 MOVB DI, 4(AX)
7629 ADDQ $0x05, AX
7630 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7631
7632repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7633 LEAL -256(R11), R11
7634 MOVW $0x0019, (AX)
7635 MOVW R11, 2(AX)
7636 ADDQ $0x04, AX
7637 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7638
7639repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7640 LEAL -4(R11), R11
7641 MOVW $0x0015, (AX)
7642 MOVB R11, 2(AX)
7643 ADDQ $0x03, AX
7644 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7645
7646repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7647 SHLL $0x02, R11
7648 ORL $0x01, R11
7649 MOVW R11, (AX)
7650 ADDQ $0x02, AX
7651 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7652
7653repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7654 XORQ BX, BX
7655 LEAL 1(BX)(R11*4), R11
7656 MOVB DI, 1(AX)
7657 SARL $0x08, DI
7658 SHLL $0x05, DI
7659 ORL DI, R11
7660 MOVB R11, (AX)
7661 ADDQ $0x02, AX
7662 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7663
7664two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
7665 MOVL R11, BX
7666 SHLL $0x02, BX
7667 CMPL R11, $0x0c
7668 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
7669 CMPL DI, $0x00000800
7670 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
7671 LEAL -15(BX), BX
7672 MOVB DI, 1(AX)
7673 SHRL $0x08, DI
7674 SHLL $0x05, DI
7675 ORL DI, BX
7676 MOVB BL, (AX)
7677 ADDQ $0x02, AX
7678 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7679
7680emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
7681 LEAL -2(BX), BX
7682 MOVB BL, (AX)
7683 MOVW DI, 1(AX)
7684 ADDQ $0x03, AX
7685 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7686
7687match_is_repeat_encodeBetterBlockAsm4MB:
7688 MOVL 12(SP), BX
7689 CMPL BX, SI
7690 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
7691 MOVL SI, R8
7692 MOVL SI, 12(SP)
7693 LEAQ (DX)(BX*1), R9
7694 SUBL BX, R8
7695 LEAL -1(R8), BX
7696 CMPL BX, $0x3c
7697 JB one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
7698 CMPL BX, $0x00000100
7699 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
7700 CMPL BX, $0x00010000
7701 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
7702 MOVL BX, R10
7703 SHRL $0x10, R10
7704 MOVB $0xf8, (AX)
7705 MOVW BX, 1(AX)
7706 MOVB R10, 3(AX)
7707 ADDQ $0x04, AX
7708 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
7709
7710three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
7711 MOVB $0xf4, (AX)
7712 MOVW BX, 1(AX)
7713 ADDQ $0x03, AX
7714 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
7715
7716two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
7717 MOVB $0xf0, (AX)
7718 MOVB BL, 1(AX)
7719 ADDQ $0x02, AX
7720 CMPL BX, $0x40
7721 JB memmove_match_emit_repeat_encodeBetterBlockAsm4MB
7722 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
7723
7724one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
7725 SHLB $0x02, BL
7726 MOVB BL, (AX)
7727 ADDQ $0x01, AX
7728
7729memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
7730 LEAQ (AX)(R8*1), BX
7731
7732 // genMemMoveShort
7733 CMPQ R8, $0x04
7734 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
7735 CMPQ R8, $0x08
7736 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
7737 CMPQ R8, $0x10
7738 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
7739 CMPQ R8, $0x20
7740 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
7741 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
7742
7743emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
7744 MOVL (R9), R10
7745 MOVL R10, (AX)
7746 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
7747
7748emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
7749 MOVL (R9), R10
7750 MOVL -4(R9)(R8*1), R9
7751 MOVL R10, (AX)
7752 MOVL R9, -4(AX)(R8*1)
7753 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
7754
7755emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
7756 MOVQ (R9), R10
7757 MOVQ -8(R9)(R8*1), R9
7758 MOVQ R10, (AX)
7759 MOVQ R9, -8(AX)(R8*1)
7760 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
7761
7762emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
7763 MOVOU (R9), X0
7764 MOVOU -16(R9)(R8*1), X1
7765 MOVOU X0, (AX)
7766 MOVOU X1, -16(AX)(R8*1)
7767 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
7768
7769emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
7770 MOVOU (R9), X0
7771 MOVOU 16(R9), X1
7772 MOVOU -32(R9)(R8*1), X2
7773 MOVOU -16(R9)(R8*1), X3
7774 MOVOU X0, (AX)
7775 MOVOU X1, 16(AX)
7776 MOVOU X2, -32(AX)(R8*1)
7777 MOVOU X3, -16(AX)(R8*1)
7778
7779memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
7780 MOVQ BX, AX
7781 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
7782
7783memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
7784 LEAQ (AX)(R8*1), BX
7785
7786 // genMemMoveLong
7787 MOVOU (R9), X0
7788 MOVOU 16(R9), X1
7789 MOVOU -32(R9)(R8*1), X2
7790 MOVOU -16(R9)(R8*1), X3
7791 MOVQ R8, R12
7792 SHRQ $0x05, R12
7793 MOVQ AX, R10
7794 ANDL $0x0000001f, R10
7795 MOVQ $0x00000040, R13
7796 SUBQ R10, R13
7797 DECQ R12
7798 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
7799 LEAQ -32(R9)(R13*1), R10
7800 LEAQ -32(AX)(R13*1), R14
7801
7802emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
7803 MOVOU (R10), X4
7804 MOVOU 16(R10), X5
7805 MOVOA X4, (R14)
7806 MOVOA X5, 16(R14)
7807 ADDQ $0x20, R14
7808 ADDQ $0x20, R10
7809 ADDQ $0x20, R13
7810 DECQ R12
7811 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
7812
7813emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
7814 MOVOU -32(R9)(R13*1), X4
7815 MOVOU -16(R9)(R13*1), X5
7816 MOVOA X4, -32(AX)(R13*1)
7817 MOVOA X5, -16(AX)(R13*1)
7818 ADDQ $0x20, R13
7819 CMPQ R8, R13
7820 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
7821 MOVOU X0, (AX)
7822 MOVOU X1, 16(AX)
7823 MOVOU X2, -32(AX)(R8*1)
7824 MOVOU X3, -16(AX)(R8*1)
7825 MOVQ BX, AX
7826
7827emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
7828 ADDL R11, CX
7829 ADDL $0x04, R11
7830 MOVL CX, 12(SP)
7831
7832 // emitRepeat
7833 MOVL R11, BX
7834 LEAL -4(R11), R11
7835 CMPL BX, $0x08
7836 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
7837 CMPL BX, $0x0c
7838 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
7839 CMPL DI, $0x00000800
7840 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
7841
7842cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
7843 CMPL R11, $0x00000104
7844 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
7845 CMPL R11, $0x00010100
7846 JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
7847 LEAL -65536(R11), R11
7848 MOVL R11, DI
7849 MOVW $0x001d, (AX)
7850 MOVW R11, 2(AX)
7851 SARL $0x10, DI
7852 MOVB DI, 4(AX)
7853 ADDQ $0x05, AX
7854 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7855
7856repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
7857 LEAL -256(R11), R11
7858 MOVW $0x0019, (AX)
7859 MOVW R11, 2(AX)
7860 ADDQ $0x04, AX
7861 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7862
7863repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
7864 LEAL -4(R11), R11
7865 MOVW $0x0015, (AX)
7866 MOVB R11, 2(AX)
7867 ADDQ $0x03, AX
7868 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7869
7870repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
7871 SHLL $0x02, R11
7872 ORL $0x01, R11
7873 MOVW R11, (AX)
7874 ADDQ $0x02, AX
7875 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7876
7877repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
7878 XORQ BX, BX
7879 LEAL 1(BX)(R11*4), R11
7880 MOVB DI, 1(AX)
7881 SARL $0x08, DI
7882 SHLL $0x05, DI
7883 ORL DI, R11
7884 MOVB R11, (AX)
7885 ADDQ $0x02, AX
7886
7887match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
7888 CMPL CX, 8(SP)
7889 JAE emit_remainder_encodeBetterBlockAsm4MB
7890 CMPQ AX, (SP)
7891 JB match_nolit_dst_ok_encodeBetterBlockAsm4MB
7892 MOVQ $0x00000000, ret+48(FP)
7893 RET
7894
7895match_nolit_dst_ok_encodeBetterBlockAsm4MB:
7896 MOVQ $0x00cf1bbcdcbfa563, BX
7897 MOVQ $0x9e3779b1, DI
7898 LEAQ 1(SI), SI
7899 LEAQ -2(CX), R8
7900 MOVQ (DX)(SI*1), R9
7901 MOVQ 1(DX)(SI*1), R10
7902 MOVQ (DX)(R8*1), R11
7903 MOVQ 1(DX)(R8*1), R12
7904 SHLQ $0x08, R9
7905 IMULQ BX, R9
7906 SHRQ $0x2f, R9
7907 SHLQ $0x20, R10
7908 IMULQ DI, R10
7909 SHRQ $0x32, R10
7910 SHLQ $0x08, R11
7911 IMULQ BX, R11
7912 SHRQ $0x2f, R11
7913 SHLQ $0x20, R12
7914 IMULQ DI, R12
7915 SHRQ $0x32, R12
7916 LEAQ 1(SI), DI
7917 LEAQ 1(R8), R13
7918 MOVL SI, 24(SP)(R9*4)
7919 MOVL R8, 24(SP)(R11*4)
7920 MOVL DI, 524312(SP)(R10*4)
7921 MOVL R13, 524312(SP)(R12*4)
7922 LEAQ 1(R8)(SI*1), DI
7923 SHRQ $0x01, DI
7924 ADDQ $0x01, SI
7925 SUBQ $0x01, R8
7926
7927index_loop_encodeBetterBlockAsm4MB:
7928 CMPQ DI, R8
7929 JAE search_loop_encodeBetterBlockAsm4MB
7930 MOVQ (DX)(SI*1), R9
7931 MOVQ (DX)(DI*1), R10
7932 SHLQ $0x08, R9
7933 IMULQ BX, R9
7934 SHRQ $0x2f, R9
7935 SHLQ $0x08, R10
7936 IMULQ BX, R10
7937 SHRQ $0x2f, R10
7938 MOVL SI, 24(SP)(R9*4)
7939 MOVL DI, 24(SP)(R10*4)
7940 ADDQ $0x02, SI
7941 ADDQ $0x02, DI
7942 JMP index_loop_encodeBetterBlockAsm4MB
7943
7944emit_remainder_encodeBetterBlockAsm4MB:
7945 MOVQ src_len+32(FP), CX
7946 SUBL 12(SP), CX
7947 LEAQ 4(AX)(CX*1), CX
7948 CMPQ CX, (SP)
7949 JB emit_remainder_ok_encodeBetterBlockAsm4MB
7950 MOVQ $0x00000000, ret+48(FP)
7951 RET
7952
7953emit_remainder_ok_encodeBetterBlockAsm4MB:
7954 MOVQ src_len+32(FP), CX
7955 MOVL 12(SP), BX
7956 CMPL BX, CX
7957 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
7958 MOVL CX, SI
7959 MOVL CX, 12(SP)
7960 LEAQ (DX)(BX*1), CX
7961 SUBL BX, SI
7962 LEAL -1(SI), DX
7963 CMPL DX, $0x3c
7964 JB one_byte_emit_remainder_encodeBetterBlockAsm4MB
7965 CMPL DX, $0x00000100
7966 JB two_bytes_emit_remainder_encodeBetterBlockAsm4MB
7967 CMPL DX, $0x00010000
7968 JB three_bytes_emit_remainder_encodeBetterBlockAsm4MB
7969 MOVL DX, BX
7970 SHRL $0x10, BX
7971 MOVB $0xf8, (AX)
7972 MOVW DX, 1(AX)
7973 MOVB BL, 3(AX)
7974 ADDQ $0x04, AX
7975 JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
7976
7977three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
7978 MOVB $0xf4, (AX)
7979 MOVW DX, 1(AX)
7980 ADDQ $0x03, AX
7981 JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
7982
7983two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
7984 MOVB $0xf0, (AX)
7985 MOVB DL, 1(AX)
7986 ADDQ $0x02, AX
7987 CMPL DX, $0x40
7988 JB memmove_emit_remainder_encodeBetterBlockAsm4MB
7989 JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
7990
7991one_byte_emit_remainder_encodeBetterBlockAsm4MB:
7992 SHLB $0x02, DL
7993 MOVB DL, (AX)
7994 ADDQ $0x01, AX
7995
7996memmove_emit_remainder_encodeBetterBlockAsm4MB:
7997 LEAQ (AX)(SI*1), DX
7998 MOVL SI, BX
7999
8000 // genMemMoveShort
8001 CMPQ BX, $0x03
8002 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2
8003 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3
8004 CMPQ BX, $0x08
8005 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
8006 CMPQ BX, $0x10
8007 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
8008 CMPQ BX, $0x20
8009 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
8010 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64
8011
8012emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2:
8013 MOVB (CX), SI
8014 MOVB -1(CX)(BX*1), CL
8015 MOVB SI, (AX)
8016 MOVB CL, -1(AX)(BX*1)
8017 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
8018
8019emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3:
8020 MOVW (CX), SI
8021 MOVB 2(CX), CL
8022 MOVW SI, (AX)
8023 MOVB CL, 2(AX)
8024 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
8025
8026emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
8027 MOVL (CX), SI
8028 MOVL -4(CX)(BX*1), CX
8029 MOVL SI, (AX)
8030 MOVL CX, -4(AX)(BX*1)
8031 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
8032
8033emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
8034 MOVQ (CX), SI
8035 MOVQ -8(CX)(BX*1), CX
8036 MOVQ SI, (AX)
8037 MOVQ CX, -8(AX)(BX*1)
8038 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
8039
8040emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
8041 MOVOU (CX), X0
8042 MOVOU -16(CX)(BX*1), X1
8043 MOVOU X0, (AX)
8044 MOVOU X1, -16(AX)(BX*1)
8045 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
8046
8047emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
8048 MOVOU (CX), X0
8049 MOVOU 16(CX), X1
8050 MOVOU -32(CX)(BX*1), X2
8051 MOVOU -16(CX)(BX*1), X3
8052 MOVOU X0, (AX)
8053 MOVOU X1, 16(AX)
8054 MOVOU X2, -32(AX)(BX*1)
8055 MOVOU X3, -16(AX)(BX*1)
8056
8057memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
8058 MOVQ DX, AX
8059 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
8060
8061memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
8062 LEAQ (AX)(SI*1), DX
8063 MOVL SI, BX
8064
8065 // genMemMoveLong
8066 MOVOU (CX), X0
8067 MOVOU 16(CX), X1
8068 MOVOU -32(CX)(BX*1), X2
8069 MOVOU -16(CX)(BX*1), X3
8070 MOVQ BX, DI
8071 SHRQ $0x05, DI
8072 MOVQ AX, SI
8073 ANDL $0x0000001f, SI
8074 MOVQ $0x00000040, R8
8075 SUBQ SI, R8
8076 DECQ DI
8077 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
8078 LEAQ -32(CX)(R8*1), SI
8079 LEAQ -32(AX)(R8*1), R9
8080
8081emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
8082 MOVOU (SI), X4
8083 MOVOU 16(SI), X5
8084 MOVOA X4, (R9)
8085 MOVOA X5, 16(R9)
8086 ADDQ $0x20, R9
8087 ADDQ $0x20, SI
8088 ADDQ $0x20, R8
8089 DECQ DI
8090 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back
8091
8092emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
8093 MOVOU -32(CX)(R8*1), X4
8094 MOVOU -16(CX)(R8*1), X5
8095 MOVOA X4, -32(AX)(R8*1)
8096 MOVOA X5, -16(AX)(R8*1)
8097 ADDQ $0x20, R8
8098 CMPQ BX, R8
8099 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
8100 MOVOU X0, (AX)
8101 MOVOU X1, 16(AX)
8102 MOVOU X2, -32(AX)(BX*1)
8103 MOVOU X3, -16(AX)(BX*1)
8104 MOVQ DX, AX
8105
8106emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
8107 MOVQ dst_base+0(FP), CX
8108 SUBQ CX, AX
8109 MOVQ AX, ret+48(FP)
8110 RET
8111
8112// func encodeBetterBlockAsm12B(dst []byte, src []byte) int
8113// Requires: BMI, SSE2
8114TEXT ·encodeBetterBlockAsm12B(SB), $81944-56
8115 MOVQ dst_base+0(FP), AX
8116 MOVQ $0x00000280, CX
8117 LEAQ 24(SP), DX
8118 PXOR X0, X0
8119
8120zero_loop_encodeBetterBlockAsm12B:
8121 MOVOU X0, (DX)
8122 MOVOU X0, 16(DX)
8123 MOVOU X0, 32(DX)
8124 MOVOU X0, 48(DX)
8125 MOVOU X0, 64(DX)
8126 MOVOU X0, 80(DX)
8127 MOVOU X0, 96(DX)
8128 MOVOU X0, 112(DX)
8129 ADDQ $0x80, DX
8130 DECQ CX
8131 JNZ zero_loop_encodeBetterBlockAsm12B
8132 MOVL $0x00000000, 12(SP)
8133 MOVQ src_len+32(FP), CX
8134 LEAQ -6(CX), DX
8135 LEAQ -8(CX), BX
8136 MOVL BX, 8(SP)
8137 SHRQ $0x05, CX
8138 SUBL CX, DX
8139 LEAQ (AX)(DX*1), DX
8140 MOVQ DX, (SP)
8141 MOVL $0x00000001, CX
8142 MOVL $0x00000000, 16(SP)
8143 MOVQ src_base+24(FP), DX
8144
8145search_loop_encodeBetterBlockAsm12B:
8146 MOVL CX, BX
8147 SUBL 12(SP), BX
8148 SHRL $0x06, BX
8149 LEAL 1(CX)(BX*1), BX
8150 CMPL BX, 8(SP)
8151 JAE emit_remainder_encodeBetterBlockAsm12B
8152 MOVQ (DX)(CX*1), SI
8153 MOVL BX, 20(SP)
8154 MOVQ $0x0000cf1bbcdcbf9b, R8
8155 MOVQ $0x9e3779b1, BX
8156 MOVQ SI, R9
8157 MOVQ SI, R10
8158 SHLQ $0x10, R9
8159 IMULQ R8, R9
8160 SHRQ $0x32, R9
8161 SHLQ $0x20, R10
8162 IMULQ BX, R10
8163 SHRQ $0x34, R10
8164 MOVL 24(SP)(R9*4), BX
8165 MOVL 65560(SP)(R10*4), DI
8166 MOVL CX, 24(SP)(R9*4)
8167 MOVL CX, 65560(SP)(R10*4)
8168 MOVQ (DX)(BX*1), R9
8169 MOVQ (DX)(DI*1), R10
8170 CMPQ R9, SI
8171 JEQ candidate_match_encodeBetterBlockAsm12B
8172 CMPQ R10, SI
8173 JNE no_short_found_encodeBetterBlockAsm12B
8174 MOVL DI, BX
8175 JMP candidate_match_encodeBetterBlockAsm12B
8176
8177no_short_found_encodeBetterBlockAsm12B:
8178 CMPL R9, SI
8179 JEQ candidate_match_encodeBetterBlockAsm12B
8180 CMPL R10, SI
8181 JEQ candidateS_match_encodeBetterBlockAsm12B
8182 MOVL 20(SP), CX
8183 JMP search_loop_encodeBetterBlockAsm12B
8184
8185candidateS_match_encodeBetterBlockAsm12B:
8186 SHRQ $0x08, SI
8187 MOVQ SI, R9
8188 SHLQ $0x10, R9
8189 IMULQ R8, R9
8190 SHRQ $0x32, R9
8191 MOVL 24(SP)(R9*4), BX
8192 INCL CX
8193 MOVL CX, 24(SP)(R9*4)
8194 CMPL (DX)(BX*1), SI
8195 JEQ candidate_match_encodeBetterBlockAsm12B
8196 DECL CX
8197 MOVL DI, BX
8198
8199candidate_match_encodeBetterBlockAsm12B:
8200 MOVL 12(SP), SI
8201 TESTL BX, BX
8202 JZ match_extend_back_end_encodeBetterBlockAsm12B
8203
8204match_extend_back_loop_encodeBetterBlockAsm12B:
8205 CMPL CX, SI
8206 JBE match_extend_back_end_encodeBetterBlockAsm12B
8207 MOVB -1(DX)(BX*1), DI
8208 MOVB -1(DX)(CX*1), R8
8209 CMPB DI, R8
8210 JNE match_extend_back_end_encodeBetterBlockAsm12B
8211 LEAL -1(CX), CX
8212 DECL BX
8213 JZ match_extend_back_end_encodeBetterBlockAsm12B
8214 JMP match_extend_back_loop_encodeBetterBlockAsm12B
8215
8216match_extend_back_end_encodeBetterBlockAsm12B:
8217 MOVL CX, SI
8218 SUBL 12(SP), SI
8219 LEAQ 3(AX)(SI*1), SI
8220 CMPQ SI, (SP)
8221 JB match_dst_size_check_encodeBetterBlockAsm12B
8222 MOVQ $0x00000000, ret+48(FP)
8223 RET
8224
8225match_dst_size_check_encodeBetterBlockAsm12B:
8226 MOVL CX, SI
8227 ADDL $0x04, CX
8228 ADDL $0x04, BX
8229 MOVQ src_len+32(FP), DI
8230 SUBL CX, DI
8231 LEAQ (DX)(CX*1), R8
8232 LEAQ (DX)(BX*1), R9
8233
8234 // matchLen
8235 XORL R11, R11
8236
8237matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B:
8238 CMPL DI, $0x10
8239 JB matchlen_match8_match_nolit_encodeBetterBlockAsm12B
8240 MOVQ (R8)(R11*1), R10
8241 MOVQ 8(R8)(R11*1), R12
8242 XORQ (R9)(R11*1), R10
8243 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B
8244 XORQ 8(R9)(R11*1), R12
8245 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B
8246 LEAL -16(DI), DI
8247 LEAL 16(R11), R11
8248 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B
8249
8250matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B:
8251#ifdef GOAMD64_v3
8252 TZCNTQ R12, R12
8253
8254#else
8255 BSFQ R12, R12
8256
8257#endif
8258 SARQ $0x03, R12
8259 LEAL 8(R11)(R12*1), R11
8260 JMP match_nolit_end_encodeBetterBlockAsm12B
8261
8262matchlen_match8_match_nolit_encodeBetterBlockAsm12B:
8263 CMPL DI, $0x08
8264 JB matchlen_match4_match_nolit_encodeBetterBlockAsm12B
8265 MOVQ (R8)(R11*1), R10
8266 XORQ (R9)(R11*1), R10
8267 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B
8268 LEAL -8(DI), DI
8269 LEAL 8(R11), R11
8270 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm12B
8271
8272matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B:
8273#ifdef GOAMD64_v3
8274 TZCNTQ R10, R10
8275
8276#else
8277 BSFQ R10, R10
8278
8279#endif
8280 SARQ $0x03, R10
8281 LEAL (R11)(R10*1), R11
8282 JMP match_nolit_end_encodeBetterBlockAsm12B
8283
8284matchlen_match4_match_nolit_encodeBetterBlockAsm12B:
8285 CMPL DI, $0x04
8286 JB matchlen_match2_match_nolit_encodeBetterBlockAsm12B
8287 MOVL (R8)(R11*1), R10
8288 CMPL (R9)(R11*1), R10
8289 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B
8290 LEAL -4(DI), DI
8291 LEAL 4(R11), R11
8292
8293matchlen_match2_match_nolit_encodeBetterBlockAsm12B:
8294 CMPL DI, $0x01
8295 JE matchlen_match1_match_nolit_encodeBetterBlockAsm12B
8296 JB match_nolit_end_encodeBetterBlockAsm12B
8297 MOVW (R8)(R11*1), R10
8298 CMPW (R9)(R11*1), R10
8299 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B
8300 LEAL 2(R11), R11
8301 SUBL $0x02, DI
8302 JZ match_nolit_end_encodeBetterBlockAsm12B
8303
8304matchlen_match1_match_nolit_encodeBetterBlockAsm12B:
8305 MOVB (R8)(R11*1), R10
8306 CMPB (R9)(R11*1), R10
8307 JNE match_nolit_end_encodeBetterBlockAsm12B
8308 LEAL 1(R11), R11
8309
8310match_nolit_end_encodeBetterBlockAsm12B:
8311 MOVL CX, DI
8312 SUBL BX, DI
8313
8314 // Check if repeat
8315 CMPL 16(SP), DI
8316 JEQ match_is_repeat_encodeBetterBlockAsm12B
8317 MOVL DI, 16(SP)
8318 MOVL 12(SP), BX
8319 CMPL BX, SI
8320 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B
8321 MOVL SI, R8
8322 MOVL SI, 12(SP)
8323 LEAQ (DX)(BX*1), R9
8324 SUBL BX, R8
8325 LEAL -1(R8), BX
8326 CMPL BX, $0x3c
8327 JB one_byte_match_emit_encodeBetterBlockAsm12B
8328 CMPL BX, $0x00000100
8329 JB two_bytes_match_emit_encodeBetterBlockAsm12B
8330 JB three_bytes_match_emit_encodeBetterBlockAsm12B
8331
8332three_bytes_match_emit_encodeBetterBlockAsm12B:
8333 MOVB $0xf4, (AX)
8334 MOVW BX, 1(AX)
8335 ADDQ $0x03, AX
8336 JMP memmove_long_match_emit_encodeBetterBlockAsm12B
8337
8338two_bytes_match_emit_encodeBetterBlockAsm12B:
8339 MOVB $0xf0, (AX)
8340 MOVB BL, 1(AX)
8341 ADDQ $0x02, AX
8342 CMPL BX, $0x40
8343 JB memmove_match_emit_encodeBetterBlockAsm12B
8344 JMP memmove_long_match_emit_encodeBetterBlockAsm12B
8345
8346one_byte_match_emit_encodeBetterBlockAsm12B:
8347 SHLB $0x02, BL
8348 MOVB BL, (AX)
8349 ADDQ $0x01, AX
8350
8351memmove_match_emit_encodeBetterBlockAsm12B:
8352 LEAQ (AX)(R8*1), BX
8353
8354 // genMemMoveShort
8355 CMPQ R8, $0x04
8356 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
8357 CMPQ R8, $0x08
8358 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
8359 CMPQ R8, $0x10
8360 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
8361 CMPQ R8, $0x20
8362 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
8363 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64
8364
8365emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
8366 MOVL (R9), R10
8367 MOVL R10, (AX)
8368 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
8369
8370emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
8371 MOVL (R9), R10
8372 MOVL -4(R9)(R8*1), R9
8373 MOVL R10, (AX)
8374 MOVL R9, -4(AX)(R8*1)
8375 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
8376
8377emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
8378 MOVQ (R9), R10
8379 MOVQ -8(R9)(R8*1), R9
8380 MOVQ R10, (AX)
8381 MOVQ R9, -8(AX)(R8*1)
8382 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
8383
8384emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
8385 MOVOU (R9), X0
8386 MOVOU -16(R9)(R8*1), X1
8387 MOVOU X0, (AX)
8388 MOVOU X1, -16(AX)(R8*1)
8389 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
8390
8391emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
8392 MOVOU (R9), X0
8393 MOVOU 16(R9), X1
8394 MOVOU -32(R9)(R8*1), X2
8395 MOVOU -16(R9)(R8*1), X3
8396 MOVOU X0, (AX)
8397 MOVOU X1, 16(AX)
8398 MOVOU X2, -32(AX)(R8*1)
8399 MOVOU X3, -16(AX)(R8*1)
8400
8401memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
8402 MOVQ BX, AX
8403 JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B
8404
8405memmove_long_match_emit_encodeBetterBlockAsm12B:
8406 LEAQ (AX)(R8*1), BX
8407
8408 // genMemMoveLong
8409 MOVOU (R9), X0
8410 MOVOU 16(R9), X1
8411 MOVOU -32(R9)(R8*1), X2
8412 MOVOU -16(R9)(R8*1), X3
8413 MOVQ R8, R12
8414 SHRQ $0x05, R12
8415 MOVQ AX, R10
8416 ANDL $0x0000001f, R10
8417 MOVQ $0x00000040, R13
8418 SUBQ R10, R13
8419 DECQ R12
8420 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8421 LEAQ -32(R9)(R13*1), R10
8422 LEAQ -32(AX)(R13*1), R14
8423
8424emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
8425 MOVOU (R10), X4
8426 MOVOU 16(R10), X5
8427 MOVOA X4, (R14)
8428 MOVOA X5, 16(R14)
8429 ADDQ $0x20, R14
8430 ADDQ $0x20, R10
8431 ADDQ $0x20, R13
8432 DECQ R12
8433 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back
8434
8435emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
8436 MOVOU -32(R9)(R13*1), X4
8437 MOVOU -16(R9)(R13*1), X5
8438 MOVOA X4, -32(AX)(R13*1)
8439 MOVOA X5, -16(AX)(R13*1)
8440 ADDQ $0x20, R13
8441 CMPQ R8, R13
8442 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8443 MOVOU X0, (AX)
8444 MOVOU X1, 16(AX)
8445 MOVOU X2, -32(AX)(R8*1)
8446 MOVOU X3, -16(AX)(R8*1)
8447 MOVQ BX, AX
8448
8449emit_literal_done_match_emit_encodeBetterBlockAsm12B:
8450 ADDL R11, CX
8451 ADDL $0x04, R11
8452 MOVL CX, 12(SP)
8453
8454 // emitCopy
8455 CMPL R11, $0x40
8456 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
8457 CMPL DI, $0x00000800
8458 JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B
8459 MOVL $0x00000001, BX
8460 LEAL 16(BX), BX
8461 MOVB DI, 1(AX)
8462 SHRL $0x08, DI
8463 SHLL $0x05, DI
8464 ORL DI, BX
8465 MOVB BL, (AX)
8466 ADDQ $0x02, AX
8467 SUBL $0x08, R11
8468
8469 // emitRepeat
8470 LEAL -4(R11), R11
8471 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8472 MOVL R11, BX
8473 LEAL -4(R11), R11
8474 CMPL BX, $0x08
8475 JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8476 CMPL BX, $0x0c
8477 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8478 CMPL DI, $0x00000800
8479 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8480
8481cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
8482 CMPL R11, $0x00000104
8483 JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8484 LEAL -256(R11), R11
8485 MOVW $0x0019, (AX)
8486 MOVW R11, 2(AX)
8487 ADDQ $0x04, AX
8488 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8489
8490repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
8491 LEAL -4(R11), R11
8492 MOVW $0x0015, (AX)
8493 MOVB R11, 2(AX)
8494 ADDQ $0x03, AX
8495 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8496
8497repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
8498 SHLL $0x02, R11
8499 ORL $0x01, R11
8500 MOVW R11, (AX)
8501 ADDQ $0x02, AX
8502 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8503
8504repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
8505 XORQ BX, BX
8506 LEAL 1(BX)(R11*4), R11
8507 MOVB DI, 1(AX)
8508 SARL $0x08, DI
8509 SHLL $0x05, DI
8510 ORL DI, R11
8511 MOVB R11, (AX)
8512 ADDQ $0x02, AX
8513 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8514
8515long_offset_short_match_nolit_encodeBetterBlockAsm12B:
8516 MOVB $0xee, (AX)
8517 MOVW DI, 1(AX)
8518 LEAL -60(R11), R11
8519 ADDQ $0x03, AX
8520
8521 // emitRepeat
8522 MOVL R11, BX
8523 LEAL -4(R11), R11
8524 CMPL BX, $0x08
8525 JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
8526 CMPL BX, $0x0c
8527 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
8528 CMPL DI, $0x00000800
8529 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
8530
8531cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
8532 CMPL R11, $0x00000104
8533 JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
8534 LEAL -256(R11), R11
8535 MOVW $0x0019, (AX)
8536 MOVW R11, 2(AX)
8537 ADDQ $0x04, AX
8538 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8539
8540repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
8541 LEAL -4(R11), R11
8542 MOVW $0x0015, (AX)
8543 MOVB R11, 2(AX)
8544 ADDQ $0x03, AX
8545 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8546
8547repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
8548 SHLL $0x02, R11
8549 ORL $0x01, R11
8550 MOVW R11, (AX)
8551 ADDQ $0x02, AX
8552 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8553
8554repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
8555 XORQ BX, BX
8556 LEAL 1(BX)(R11*4), R11
8557 MOVB DI, 1(AX)
8558 SARL $0x08, DI
8559 SHLL $0x05, DI
8560 ORL DI, R11
8561 MOVB R11, (AX)
8562 ADDQ $0x02, AX
8563 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8564
8565two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
8566 MOVL R11, BX
8567 SHLL $0x02, BX
8568 CMPL R11, $0x0c
8569 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
8570 CMPL DI, $0x00000800
8571 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
8572 LEAL -15(BX), BX
8573 MOVB DI, 1(AX)
8574 SHRL $0x08, DI
8575 SHLL $0x05, DI
8576 ORL DI, BX
8577 MOVB BL, (AX)
8578 ADDQ $0x02, AX
8579 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8580
8581emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
8582 LEAL -2(BX), BX
8583 MOVB BL, (AX)
8584 MOVW DI, 1(AX)
8585 ADDQ $0x03, AX
8586 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8587
8588match_is_repeat_encodeBetterBlockAsm12B:
8589 MOVL 12(SP), BX
8590 CMPL BX, SI
8591 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
8592 MOVL SI, R8
8593 MOVL SI, 12(SP)
8594 LEAQ (DX)(BX*1), R9
8595 SUBL BX, R8
8596 LEAL -1(R8), BX
8597 CMPL BX, $0x3c
8598 JB one_byte_match_emit_repeat_encodeBetterBlockAsm12B
8599 CMPL BX, $0x00000100
8600 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
8601 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm12B
8602
8603three_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
8604 MOVB $0xf4, (AX)
8605 MOVW BX, 1(AX)
8606 ADDQ $0x03, AX
8607 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
8608
8609two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
8610 MOVB $0xf0, (AX)
8611 MOVB BL, 1(AX)
8612 ADDQ $0x02, AX
8613 CMPL BX, $0x40
8614 JB memmove_match_emit_repeat_encodeBetterBlockAsm12B
8615 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
8616
8617one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
8618 SHLB $0x02, BL
8619 MOVB BL, (AX)
8620 ADDQ $0x01, AX
8621
8622memmove_match_emit_repeat_encodeBetterBlockAsm12B:
8623 LEAQ (AX)(R8*1), BX
8624
8625 // genMemMoveShort
8626 CMPQ R8, $0x04
8627 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
8628 CMPQ R8, $0x08
8629 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
8630 CMPQ R8, $0x10
8631 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
8632 CMPQ R8, $0x20
8633 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
8634 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64
8635
8636emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
8637 MOVL (R9), R10
8638 MOVL R10, (AX)
8639 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
8640
8641emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
8642 MOVL (R9), R10
8643 MOVL -4(R9)(R8*1), R9
8644 MOVL R10, (AX)
8645 MOVL R9, -4(AX)(R8*1)
8646 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
8647
8648emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
8649 MOVQ (R9), R10
8650 MOVQ -8(R9)(R8*1), R9
8651 MOVQ R10, (AX)
8652 MOVQ R9, -8(AX)(R8*1)
8653 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
8654
8655emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
8656 MOVOU (R9), X0
8657 MOVOU -16(R9)(R8*1), X1
8658 MOVOU X0, (AX)
8659 MOVOU X1, -16(AX)(R8*1)
8660 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
8661
8662emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
8663 MOVOU (R9), X0
8664 MOVOU 16(R9), X1
8665 MOVOU -32(R9)(R8*1), X2
8666 MOVOU -16(R9)(R8*1), X3
8667 MOVOU X0, (AX)
8668 MOVOU X1, 16(AX)
8669 MOVOU X2, -32(AX)(R8*1)
8670 MOVOU X3, -16(AX)(R8*1)
8671
8672memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
8673 MOVQ BX, AX
8674 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
8675
8676memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
8677 LEAQ (AX)(R8*1), BX
8678
8679 // genMemMoveLong
8680 MOVOU (R9), X0
8681 MOVOU 16(R9), X1
8682 MOVOU -32(R9)(R8*1), X2
8683 MOVOU -16(R9)(R8*1), X3
8684 MOVQ R8, R12
8685 SHRQ $0x05, R12
8686 MOVQ AX, R10
8687 ANDL $0x0000001f, R10
8688 MOVQ $0x00000040, R13
8689 SUBQ R10, R13
8690 DECQ R12
8691 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8692 LEAQ -32(R9)(R13*1), R10
8693 LEAQ -32(AX)(R13*1), R14
8694
8695emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
8696 MOVOU (R10), X4
8697 MOVOU 16(R10), X5
8698 MOVOA X4, (R14)
8699 MOVOA X5, 16(R14)
8700 ADDQ $0x20, R14
8701 ADDQ $0x20, R10
8702 ADDQ $0x20, R13
8703 DECQ R12
8704 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back
8705
8706emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
8707 MOVOU -32(R9)(R13*1), X4
8708 MOVOU -16(R9)(R13*1), X5
8709 MOVOA X4, -32(AX)(R13*1)
8710 MOVOA X5, -16(AX)(R13*1)
8711 ADDQ $0x20, R13
8712 CMPQ R8, R13
8713 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8714 MOVOU X0, (AX)
8715 MOVOU X1, 16(AX)
8716 MOVOU X2, -32(AX)(R8*1)
8717 MOVOU X3, -16(AX)(R8*1)
8718 MOVQ BX, AX
8719
8720emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
8721 ADDL R11, CX
8722 ADDL $0x04, R11
8723 MOVL CX, 12(SP)
8724
8725 // emitRepeat
8726 MOVL R11, BX
8727 LEAL -4(R11), R11
8728 CMPL BX, $0x08
8729 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
8730 CMPL BX, $0x0c
8731 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
8732 CMPL DI, $0x00000800
8733 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
8734
8735cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
8736 CMPL R11, $0x00000104
8737 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
8738 LEAL -256(R11), R11
8739 MOVW $0x0019, (AX)
8740 MOVW R11, 2(AX)
8741 ADDQ $0x04, AX
8742 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8743
8744repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
8745 LEAL -4(R11), R11
8746 MOVW $0x0015, (AX)
8747 MOVB R11, 2(AX)
8748 ADDQ $0x03, AX
8749 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8750
8751repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
8752 SHLL $0x02, R11
8753 ORL $0x01, R11
8754 MOVW R11, (AX)
8755 ADDQ $0x02, AX
8756 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8757
8758repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
8759 XORQ BX, BX
8760 LEAL 1(BX)(R11*4), R11
8761 MOVB DI, 1(AX)
8762 SARL $0x08, DI
8763 SHLL $0x05, DI
8764 ORL DI, R11
8765 MOVB R11, (AX)
8766 ADDQ $0x02, AX
8767
8768match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
8769 CMPL CX, 8(SP)
8770 JAE emit_remainder_encodeBetterBlockAsm12B
8771 CMPQ AX, (SP)
8772 JB match_nolit_dst_ok_encodeBetterBlockAsm12B
8773 MOVQ $0x00000000, ret+48(FP)
8774 RET
8775
8776match_nolit_dst_ok_encodeBetterBlockAsm12B:
8777 MOVQ $0x0000cf1bbcdcbf9b, BX
8778 MOVQ $0x9e3779b1, DI
8779 LEAQ 1(SI), SI
8780 LEAQ -2(CX), R8
8781 MOVQ (DX)(SI*1), R9
8782 MOVQ 1(DX)(SI*1), R10
8783 MOVQ (DX)(R8*1), R11
8784 MOVQ 1(DX)(R8*1), R12
8785 SHLQ $0x10, R9
8786 IMULQ BX, R9
8787 SHRQ $0x32, R9
8788 SHLQ $0x20, R10
8789 IMULQ DI, R10
8790 SHRQ $0x34, R10
8791 SHLQ $0x10, R11
8792 IMULQ BX, R11
8793 SHRQ $0x32, R11
8794 SHLQ $0x20, R12
8795 IMULQ DI, R12
8796 SHRQ $0x34, R12
8797 LEAQ 1(SI), DI
8798 LEAQ 1(R8), R13
8799 MOVL SI, 24(SP)(R9*4)
8800 MOVL R8, 24(SP)(R11*4)
8801 MOVL DI, 65560(SP)(R10*4)
8802 MOVL R13, 65560(SP)(R12*4)
8803 LEAQ 1(R8)(SI*1), DI
8804 SHRQ $0x01, DI
8805 ADDQ $0x01, SI
8806 SUBQ $0x01, R8
8807
8808index_loop_encodeBetterBlockAsm12B:
8809 CMPQ DI, R8
8810 JAE search_loop_encodeBetterBlockAsm12B
8811 MOVQ (DX)(SI*1), R9
8812 MOVQ (DX)(DI*1), R10
8813 SHLQ $0x10, R9
8814 IMULQ BX, R9
8815 SHRQ $0x32, R9
8816 SHLQ $0x10, R10
8817 IMULQ BX, R10
8818 SHRQ $0x32, R10
8819 MOVL SI, 24(SP)(R9*4)
8820 MOVL DI, 24(SP)(R10*4)
8821 ADDQ $0x02, SI
8822 ADDQ $0x02, DI
8823 JMP index_loop_encodeBetterBlockAsm12B
8824
8825emit_remainder_encodeBetterBlockAsm12B:
8826 MOVQ src_len+32(FP), CX
8827 SUBL 12(SP), CX
8828 LEAQ 3(AX)(CX*1), CX
8829 CMPQ CX, (SP)
8830 JB emit_remainder_ok_encodeBetterBlockAsm12B
8831 MOVQ $0x00000000, ret+48(FP)
8832 RET
8833
8834emit_remainder_ok_encodeBetterBlockAsm12B:
8835 MOVQ src_len+32(FP), CX
8836 MOVL 12(SP), BX
8837 CMPL BX, CX
8838 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
8839 MOVL CX, SI
8840 MOVL CX, 12(SP)
8841 LEAQ (DX)(BX*1), CX
8842 SUBL BX, SI
8843 LEAL -1(SI), DX
8844 CMPL DX, $0x3c
8845 JB one_byte_emit_remainder_encodeBetterBlockAsm12B
8846 CMPL DX, $0x00000100
8847 JB two_bytes_emit_remainder_encodeBetterBlockAsm12B
8848 JB three_bytes_emit_remainder_encodeBetterBlockAsm12B
8849
8850three_bytes_emit_remainder_encodeBetterBlockAsm12B:
8851 MOVB $0xf4, (AX)
8852 MOVW DX, 1(AX)
8853 ADDQ $0x03, AX
8854 JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
8855
8856two_bytes_emit_remainder_encodeBetterBlockAsm12B:
8857 MOVB $0xf0, (AX)
8858 MOVB DL, 1(AX)
8859 ADDQ $0x02, AX
8860 CMPL DX, $0x40
8861 JB memmove_emit_remainder_encodeBetterBlockAsm12B
8862 JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
8863
8864one_byte_emit_remainder_encodeBetterBlockAsm12B:
8865 SHLB $0x02, DL
8866 MOVB DL, (AX)
8867 ADDQ $0x01, AX
8868
8869memmove_emit_remainder_encodeBetterBlockAsm12B:
8870 LEAQ (AX)(SI*1), DX
8871 MOVL SI, BX
8872
8873 // genMemMoveShort
8874 CMPQ BX, $0x03
8875 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2
8876 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3
8877 CMPQ BX, $0x08
8878 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
8879 CMPQ BX, $0x10
8880 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
8881 CMPQ BX, $0x20
8882 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
8883 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64
8884
8885emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2:
8886 MOVB (CX), SI
8887 MOVB -1(CX)(BX*1), CL
8888 MOVB SI, (AX)
8889 MOVB CL, -1(AX)(BX*1)
8890 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8891
8892emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3:
8893 MOVW (CX), SI
8894 MOVB 2(CX), CL
8895 MOVW SI, (AX)
8896 MOVB CL, 2(AX)
8897 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8898
8899emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
8900 MOVL (CX), SI
8901 MOVL -4(CX)(BX*1), CX
8902 MOVL SI, (AX)
8903 MOVL CX, -4(AX)(BX*1)
8904 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8905
8906emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
8907 MOVQ (CX), SI
8908 MOVQ -8(CX)(BX*1), CX
8909 MOVQ SI, (AX)
8910 MOVQ CX, -8(AX)(BX*1)
8911 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8912
8913emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
8914 MOVOU (CX), X0
8915 MOVOU -16(CX)(BX*1), X1
8916 MOVOU X0, (AX)
8917 MOVOU X1, -16(AX)(BX*1)
8918 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8919
8920emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
8921 MOVOU (CX), X0
8922 MOVOU 16(CX), X1
8923 MOVOU -32(CX)(BX*1), X2
8924 MOVOU -16(CX)(BX*1), X3
8925 MOVOU X0, (AX)
8926 MOVOU X1, 16(AX)
8927 MOVOU X2, -32(AX)(BX*1)
8928 MOVOU X3, -16(AX)(BX*1)
8929
8930memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
8931 MOVQ DX, AX
8932 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
8933
8934memmove_long_emit_remainder_encodeBetterBlockAsm12B:
8935 LEAQ (AX)(SI*1), DX
8936 MOVL SI, BX
8937
8938 // genMemMoveLong
8939 MOVOU (CX), X0
8940 MOVOU 16(CX), X1
8941 MOVOU -32(CX)(BX*1), X2
8942 MOVOU -16(CX)(BX*1), X3
8943 MOVQ BX, DI
8944 SHRQ $0x05, DI
8945 MOVQ AX, SI
8946 ANDL $0x0000001f, SI
8947 MOVQ $0x00000040, R8
8948 SUBQ SI, R8
8949 DECQ DI
8950 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8951 LEAQ -32(CX)(R8*1), SI
8952 LEAQ -32(AX)(R8*1), R9
8953
8954emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
8955 MOVOU (SI), X4
8956 MOVOU 16(SI), X5
8957 MOVOA X4, (R9)
8958 MOVOA X5, 16(R9)
8959 ADDQ $0x20, R9
8960 ADDQ $0x20, SI
8961 ADDQ $0x20, R8
8962 DECQ DI
8963 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back
8964
8965emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
8966 MOVOU -32(CX)(R8*1), X4
8967 MOVOU -16(CX)(R8*1), X5
8968 MOVOA X4, -32(AX)(R8*1)
8969 MOVOA X5, -16(AX)(R8*1)
8970 ADDQ $0x20, R8
8971 CMPQ BX, R8
8972 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8973 MOVOU X0, (AX)
8974 MOVOU X1, 16(AX)
8975 MOVOU X2, -32(AX)(BX*1)
8976 MOVOU X3, -16(AX)(BX*1)
8977 MOVQ DX, AX
8978
8979emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
8980 MOVQ dst_base+0(FP), CX
8981 SUBQ CX, AX
8982 MOVQ AX, ret+48(FP)
8983 RET
8984
8985// func encodeBetterBlockAsm10B(dst []byte, src []byte) int
8986// Requires: BMI, SSE2
//
// The result is written to ret+48(FP): either the number of bytes stored in
// dst, or 0 on the paths that find the dst limit at (SP) would be reached.
//
// Stack frame, as used by the code in this function:
//   0(SP)      dst limit pointer
//   8(SP)      src_len - 8; the search stops once a position is >= this
//   12(SP)     src offset of the first byte that has not been emitted yet
//   16(SP)     offset of the previous match (compared for the repeat case)
//   20(SP)     next src position for the search loop
//   24(SP)     16384-byte table, indexed with a 12-bit hash
//   16408(SP)  4096-byte table, indexed with a 10-bit hash
// AX is the dst write pointer for the whole function. DX holds src_base and
// CX the current src position until emit_remainder reuses both registers.
8987TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
8988 MOVQ dst_base+0(FP), AX
	// 0xa0 iterations of 128 bytes clear 20480 bytes starting at 24(SP),
	// which is exactly the two tables (16384 + 4096 bytes).
8989 MOVQ $0x000000a0, CX
8990 LEAQ 24(SP), DX
8991 PXOR X0, X0
8992
8993zero_loop_encodeBetterBlockAsm10B:
8994 MOVOU X0, (DX)
8995 MOVOU X0, 16(DX)
8996 MOVOU X0, 32(DX)
8997 MOVOU X0, 48(DX)
8998 MOVOU X0, 64(DX)
8999 MOVOU X0, 80(DX)
9000 MOVOU X0, 96(DX)
9001 MOVOU X0, 112(DX)
9002 ADDQ $0x80, DX
9003 DECQ CX
9004 JNZ zero_loop_encodeBetterBlockAsm10B
	// Nothing has been emitted yet: pending literals start at src offset 0.
9005 MOVL $0x00000000, 12(SP)
9006 MOVQ src_len+32(FP), CX
	// DX = src_len - 6, BX = src_len - 8 (the search limit kept at 8(SP)).
9007 LEAQ -6(CX), DX
9008 LEAQ -8(CX), BX
9009 MOVL BX, 8(SP)
	// dst limit = dst_base + (src_len - 6 - (src_len >> 5)), kept at (SP).
	// The subtraction is done on the low 32 bits (SUBL).
9010 SHRQ $0x05, CX
9011 SUBL CX, DX
9012 LEAQ (AX)(DX*1), DX
9013 MOVQ DX, (SP)
	// Searching starts at src position 1 with a previous-match offset of 0.
9014 MOVL $0x00000001, CX
9015 MOVL $0x00000000, 16(SP)
9016 MOVQ src_base+24(FP), DX
9017
// Main search loop. On entry CX is the current src position s.
9018search_loop_encodeBetterBlockAsm10B:
	// Next position = s + 1 + ((s - 12(SP)) >> 5), so the step grows with the
	// number of bytes scanned since the last emit. Stop searching once that
	// next position reaches the limit at 8(SP).
9019 MOVL CX, BX
9020 SUBL 12(SP), BX
9021 SHRL $0x05, BX
9022 LEAL 1(CX)(BX*1), BX
9023 CMPL BX, 8(SP)
9024 JAE emit_remainder_encodeBetterBlockAsm10B
	// SI = the 8 src bytes at s; the next position is parked in 20(SP).
9025 MOVQ (DX)(CX*1), SI
9026 MOVL BX, 20(SP)
	// R9  = ((SI << 16) * 0xcf1bbcdcbf9b) >> 52: 12-bit hash of the low 6 bytes.
	// R10 = ((SI << 32) * 0x9e3779b1) >> 54:     10-bit hash of the low 4 bytes.
9027 MOVQ $0x0000cf1bbcdcbf9b, R8
9028 MOVQ $0x9e3779b1, BX
9029 MOVQ SI, R9
9030 MOVQ SI, R10
9031 SHLQ $0x10, R9
9032 IMULQ R8, R9
9033 SHRQ $0x34, R9
9034 SHLQ $0x20, R10
9035 IMULQ BX, R10
9036 SHRQ $0x36, R10
	// Fetch the stored position from each table (BX from the 12-bit table,
	// DI from the 10-bit table), then record s in both slots.
9037 MOVL 24(SP)(R9*4), BX
9038 MOVL 16408(SP)(R10*4), DI
9039 MOVL CX, 24(SP)(R9*4)
9040 MOVL CX, 16408(SP)(R10*4)
	// A candidate is accepted directly when all 8 bytes at it equal SI.
	// The BX candidate is tried first, then the DI candidate.
9041 MOVQ (DX)(BX*1), R9
9042 MOVQ (DX)(DI*1), R10
9043 CMPQ R9, SI
9044 JEQ candidate_match_encodeBetterBlockAsm10B
9045 CMPQ R10, SI
9046 JNE no_short_found_encodeBetterBlockAsm10B
9047 MOVL DI, BX
9048 JMP candidate_match_encodeBetterBlockAsm10B
9049
9050no_short_found_encodeBetterBlockAsm10B:
	// No 8-byte match: compare only the low 4 bytes of each candidate.
9051 CMPL R9, SI
9052 JEQ candidate_match_encodeBetterBlockAsm10B
9053 CMPL R10, SI
9054 JEQ candidateS_match_encodeBetterBlockAsm10B
	// Neither candidate matched: continue from the position saved in 20(SP).
9055 MOVL 20(SP), CX
9056 JMP search_loop_encodeBetterBlockAsm10B
9057
9058candidateS_match_encodeBetterBlockAsm10B:
	// Only the DI candidate matched 4 bytes. Before settling for it, look at
	// position s+1: SI >> 8 holds the bytes at s+1, R8 still holds the 12-bit
	// hash multiplier. The table slot is read into BX and updated with s+1.
9059 SHRQ $0x08, SI
9060 MOVQ SI, R9
9061 SHLQ $0x10, R9
9062 IMULQ R8, R9
9063 SHRQ $0x34, R9
9064 MOVL 24(SP)(R9*4), BX
9065 INCL CX
9066 MOVL CX, 24(SP)(R9*4)
	// If 4 bytes at that entry equal the bytes at s+1, match at s+1 with BX.
9067 CMPL (DX)(BX*1), SI
9068 JEQ candidate_match_encodeBetterBlockAsm10B
	// Otherwise go back to s and use the DI candidate.
9069 DECL CX
9070 MOVL DI, BX
9071
9072candidate_match_encodeBetterBlockAsm10B:
	// CX = match position s, BX = candidate position. The loop below extends
	// the match backwards while the preceding bytes are equal. It stops when
	// s reaches 12(SP) (loaded into SI) or the candidate reaches 0.
9073 MOVL 12(SP), SI
9074 TESTL BX, BX
9075 JZ match_extend_back_end_encodeBetterBlockAsm10B
9076
9077match_extend_back_loop_encodeBetterBlockAsm10B:
9078 CMPL CX, SI
9079 JBE match_extend_back_end_encodeBetterBlockAsm10B
9080 MOVB -1(DX)(BX*1), DI
9081 MOVB -1(DX)(CX*1), R8
9082 CMPB DI, R8
9083 JNE match_extend_back_end_encodeBetterBlockAsm10B
9084 LEAL -1(CX), CX
9085 DECL BX
9086 JZ match_extend_back_end_encodeBetterBlockAsm10B
9087 JMP match_extend_back_loop_encodeBetterBlockAsm10B
9088
9089match_extend_back_end_encodeBetterBlockAsm10B:
	// Return 0 when AX + (s - 12(SP)) + 3 would reach the dst limit at (SP);
	// s - 12(SP) is the number of literal bytes still waiting to be emitted.
9090 MOVL CX, SI
9091 SUBL 12(SP), SI
9092 LEAQ 3(AX)(SI*1), SI
9093 CMPQ SI, (SP)
9094 JB match_dst_size_check_encodeBetterBlockAsm10B
9095 MOVQ $0x00000000, ret+48(FP)
9096 RET
9097
9098match_dst_size_check_encodeBetterBlockAsm10B:
	// SI keeps the match start. Both positions are advanced by 4 (these
	// 4 bytes are added back to the length after matchLen). Then the matchLen
	// inputs are set up: DI = src bytes left after s, R8 = &src[s],
	// R9 = &src[candidate].
9099 MOVL CX, SI
9100 ADDL $0x04, CX
9101 ADDL $0x04, BX
9102 MOVQ src_len+32(FP), DI
9103 SUBL CX, DI
9104 LEAQ (DX)(CX*1), R8
9105 LEAQ (DX)(BX*1), R9
9106
9107 // matchLen
	// Counts how many leading bytes at R8 and R9 are equal, looking at no
	// more than DI bytes. R11 is the running count and also the byte index
	// used for both buffers; DI is decremented as bytes are consumed.
9108 XORL R11, R11
9109
9110matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B:
	// 16 bytes per iteration while at least 16 bytes remain. XOR of the two
	// quadwords is non-zero exactly when they differ.
9111 CMPL DI, $0x10
9112 JB matchlen_match8_match_nolit_encodeBetterBlockAsm10B
9113 MOVQ (R8)(R11*1), R10
9114 MOVQ 8(R8)(R11*1), R12
9115 XORQ (R9)(R11*1), R10
9116 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B
9117 XORQ 8(R9)(R11*1), R12
9118 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B
9119 LEAL -16(DI), DI
9120 LEAL 16(R11), R11
9121 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B
9122
9123matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B:
	// Difference in the second quadword: the index of the lowest set bit of
	// the XOR, divided by 8, is the number of equal bytes in that quadword.
	// 8 is added for the first quadword, which matched completely.
9124#ifdef GOAMD64_v3
9125 TZCNTQ R12, R12
9126
9127#else
9128 BSFQ R12, R12
9129
9130#endif
9131 SARQ $0x03, R12
9132 LEAL 8(R11)(R12*1), R11
9133 JMP match_nolit_end_encodeBetterBlockAsm10B
9134
9135matchlen_match8_match_nolit_encodeBetterBlockAsm10B:
	// Fewer than 16 bytes remain: one 8-byte comparison if at least 8 remain.
9136 CMPL DI, $0x08
9137 JB matchlen_match4_match_nolit_encodeBetterBlockAsm10B
9138 MOVQ (R8)(R11*1), R10
9139 XORQ (R9)(R11*1), R10
9140 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B
9141 LEAL -8(DI), DI
9142 LEAL 8(R11), R11
9143 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm10B
9144
9145matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B:
	// Difference in the quadword at index R11: add the number of equal bytes
	// (lowest set XOR bit index / 8) to the count.
9146#ifdef GOAMD64_v3
9147 TZCNTQ R10, R10
9148
9149#else
9150 BSFQ R10, R10
9151
9152#endif
9153 SARQ $0x03, R10
9154 LEAL (R11)(R10*1), R11
9155 JMP match_nolit_end_encodeBetterBlockAsm10B
9156
9157matchlen_match4_match_nolit_encodeBetterBlockAsm10B:
	// One 4-byte comparison if at least 4 bytes remain. On a mismatch the
	// 2-byte and 1-byte steps below locate the exact length.
9158 CMPL DI, $0x04
9159 JB matchlen_match2_match_nolit_encodeBetterBlockAsm10B
9160 MOVL (R8)(R11*1), R10
9161 CMPL (R9)(R11*1), R10
9162 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B
9163 LEAL -4(DI), DI
9164 LEAL 4(R11), R11
9165
9166matchlen_match2_match_nolit_encodeBetterBlockAsm10B:
	// DI == 1: only a single byte can be compared. DI == 0: done.
	// Otherwise compare 2 bytes, and finish if that used up the last bytes.
9167 CMPL DI, $0x01
9168 JE matchlen_match1_match_nolit_encodeBetterBlockAsm10B
9169 JB match_nolit_end_encodeBetterBlockAsm10B
9170 MOVW (R8)(R11*1), R10
9171 CMPW (R9)(R11*1), R10
9172 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B
9173 LEAL 2(R11), R11
9174 SUBL $0x02, DI
9175 JZ match_nolit_end_encodeBetterBlockAsm10B
9176
9177matchlen_match1_match_nolit_encodeBetterBlockAsm10B:
	// Final single-byte comparison.
9178 MOVB (R8)(R11*1), R10
9179 CMPB (R9)(R11*1), R10
9180 JNE match_nolit_end_encodeBetterBlockAsm10B
9181 LEAL 1(R11), R11
9182
9183match_nolit_end_encodeBetterBlockAsm10B:
	// DI = s - candidate = match offset. R11 holds the matchLen result, i.e.
	// the match length beyond the first 4 bytes.
9184 MOVL CX, DI
9185 SUBL BX, DI
9186
9187 // Check if repeat
	// Same offset as the previous match (16(SP)) takes the repeat path;
	// otherwise the new offset is remembered.
9188 CMPL 16(SP), DI
9189 JEQ match_is_repeat_encodeBetterBlockAsm10B
9190 MOVL DI, 16(SP)
	// Emit the pending literals src[12(SP):SI], where SI is the match start.
	// Nothing is emitted when the two are equal.
9191 MOVL 12(SP), BX
9192 CMPL BX, SI
9193 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B
	// R8 = literal length, R9 = pointer to the literals, BX = length - 1.
	// 12(SP) is moved up to the match start.
9194 MOVL SI, R8
9195 MOVL SI, 12(SP)
9196 LEAQ (DX)(BX*1), R9
9197 SUBL BX, R8
9198 LEAL -1(R8), BX
	// Header size is chosen from length-1: below 0x3c one byte, below 0x100
	// two bytes, otherwise three bytes.
	// NOTE(review): the tag values look like Snappy/S2 literal tags; confirm
	// against the format specification.
9199 CMPL BX, $0x3c
9200 JB one_byte_match_emit_encodeBetterBlockAsm10B
9201 CMPL BX, $0x00000100
9202 JB two_bytes_match_emit_encodeBetterBlockAsm10B
	// This JB tests the same flags as the JB above, so it is never taken;
	// lengths of 0x100 and more simply fall through into the next label.
9203 JB three_bytes_match_emit_encodeBetterBlockAsm10B
9204
9205three_bytes_match_emit_encodeBetterBlockAsm10B:
	// Tag byte 0xf4 followed by length-1 as a 16-bit value.
9206 MOVB $0xf4, (AX)
9207 MOVW BX, 1(AX)
9208 ADDQ $0x03, AX
9209 JMP memmove_long_match_emit_encodeBetterBlockAsm10B
9210
9211two_bytes_match_emit_encodeBetterBlockAsm10B:
	// Tag byte 0xf0 followed by length-1 as one byte. Lengths up to 64
	// (length-1 < 0x40) use the short copy, longer ones the long copy.
9212 MOVB $0xf0, (AX)
9213 MOVB BL, 1(AX)
9214 ADDQ $0x02, AX
9215 CMPL BX, $0x40
9216 JB memmove_match_emit_encodeBetterBlockAsm10B
9217 JMP memmove_long_match_emit_encodeBetterBlockAsm10B
9218
9219one_byte_match_emit_encodeBetterBlockAsm10B:
	// Single tag byte: (length-1) << 2.
9220 SHLB $0x02, BL
9221 MOVB BL, (AX)
9222 ADDQ $0x01, AX
9223
9224memmove_match_emit_encodeBetterBlockAsm10B:
	// BX = dst pointer after the literals; it becomes AX once the copy is done.
9225 LEAQ (AX)(R8*1), BX
9226
9227 // genMemMoveShort
	// Copies R8 (at most 64) bytes from R9 to AX. Each size class loads the
	// first and the last chunk of the range (they may overlap) before storing.
9228 CMPQ R8, $0x04
9229 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
9230 CMPQ R8, $0x08
9231 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
9232 CMPQ R8, $0x10
9233 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
9234 CMPQ R8, $0x20
9235 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
9236 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64
9237
9238emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
	// Lengths 1..4: always moves 4 bytes; AX still only advances by R8.
	// NOTE(review): for R8 < 4 this reads and writes up to 3 bytes beyond the
	// literal run; presumably covered by the src/dst margins set up at
	// function entry. Confirm.
9239 MOVL (R9), R10
9240 MOVL R10, (AX)
9241 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
9242
9243emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
9244 MOVL (R9), R10
9245 MOVL -4(R9)(R8*1), R9
9246 MOVL R10, (AX)
9247 MOVL R9, -4(AX)(R8*1)
9248 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
9249
9250emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
9251 MOVQ (R9), R10
9252 MOVQ -8(R9)(R8*1), R9
9253 MOVQ R10, (AX)
9254 MOVQ R9, -8(AX)(R8*1)
9255 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
9256
9257emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
9258 MOVOU (R9), X0
9259 MOVOU -16(R9)(R8*1), X1
9260 MOVOU X0, (AX)
9261 MOVOU X1, -16(AX)(R8*1)
9262 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
9263
9264emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
9265 MOVOU (R9), X0
9266 MOVOU 16(R9), X1
9267 MOVOU -32(R9)(R8*1), X2
9268 MOVOU -16(R9)(R8*1), X3
9269 MOVOU X0, (AX)
9270 MOVOU X1, 16(AX)
9271 MOVOU X2, -32(AX)(R8*1)
9272 MOVOU X3, -16(AX)(R8*1)
9273
9274memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
9275 MOVQ BX, AX
9276 JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B
9277
9278memmove_long_match_emit_encodeBetterBlockAsm10B:
	// BX = dst pointer after the literals; it becomes AX once the copy is done.
9279 LEAQ (AX)(R8*1), BX
9280
9281 // genMemMoveLong
	// The first 32 and last 32 bytes are loaded into X0..X3 up front and
	// stored last with unaligned stores. The middle is moved 32 bytes at a
	// time with MOVOA stores: R13 starts at 64 - (AX & 31), which makes
	// -32(AX)(R13*1) a multiple of 32. R12 = length / 32 is the block count.
9282 MOVOU (R9), X0
9283 MOVOU 16(R9), X1
9284 MOVOU -32(R9)(R8*1), X2
9285 MOVOU -16(R9)(R8*1), X3
9286 MOVQ R8, R12
9287 SHRQ $0x05, R12
9288 MOVQ AX, R10
9289 ANDL $0x0000001f, R10
9290 MOVQ $0x00000040, R13
9291 SUBQ R10, R13
	// DECQ leaves CF as set by the SUBQ above, so JA/JNA below effectively
	// depend on whether the decremented R12 is zero.
9292 DECQ R12
9293 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
9294 LEAQ -32(R9)(R13*1), R10
9295 LEAQ -32(AX)(R13*1), R14
9296
9297emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
9298 MOVOU (R10), X4
9299 MOVOU 16(R10), X5
9300 MOVOA X4, (R14)
9301 MOVOA X5, 16(R14)
9302 ADDQ $0x20, R14
9303 ADDQ $0x20, R10
9304 ADDQ $0x20, R13
9305 DECQ R12
9306 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back
9307
9308emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	// Move 32-byte blocks while the length R8 is >= the running offset R13.
9309 MOVOU -32(R9)(R13*1), X4
9310 MOVOU -16(R9)(R13*1), X5
9311 MOVOA X4, -32(AX)(R13*1)
9312 MOVOA X5, -16(AX)(R13*1)
9313 ADDQ $0x20, R13
9314 CMPQ R8, R13
9315 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	// Finally store the saved head and tail, then advance AX past the literals.
9316 MOVOU X0, (AX)
9317 MOVOU X1, 16(AX)
9318 MOVOU X2, -32(AX)(R8*1)
9319 MOVOU X3, -16(AX)(R8*1)
9320 MOVQ BX, AX
9321
9322emit_literal_done_match_emit_encodeBetterBlockAsm10B:
	// CX moves to the end of the match, R11 becomes the full match length
	// (matchLen result + 4), and 12(SP) is set to the end of the match.
	// DI still holds the match offset.
9323 ADDL R11, CX
9324 ADDL $0x04, R11
9325 MOVL CX, 12(SP)
9326
9327 // emitCopy
	// NOTE(review): the byte layouts below look like Snappy/S2 copy and S2
	// repeat encodings; confirm the tag meanings against the format spec.
	// Lengths up to 64 are handled at two_byte_offset_short. Longer matches
	// with offset >= 2048 go to long_offset_short.
9328 CMPL R11, $0x40
9329 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
9330 CMPL DI, $0x00000800
9331 JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B
	// Length > 64 and offset < 2048: write a 2-byte element whose first byte
	// is 17 | ((offset >> 8) << 5) and whose second byte is the low offset
	// byte, and account for 8 bytes of the match (R11 -= 8).
9332 MOVL $0x00000001, BX
9333 LEAL 16(BX), BX
9334 MOVB DI, 1(AX)
9335 SHRL $0x08, DI
9336 SHLL $0x05, DI
9337 ORL DI, BX
9338 MOVB BL, (AX)
9339 ADDQ $0x02, AX
9340 SUBL $0x08, R11
9341
9342 // emitRepeat
	// The remaining length is above 56 on this path, so the code jumps
	// straight to the long repeat forms with R11 = remaining length - 4.
9343 LEAL -4(R11), R11
9344 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
	// The instructions from here to the next label follow an unconditional
	// JMP and carry no label, so they are never executed.
9345 MOVL R11, BX
9346 LEAL -4(R11), R11
9347 CMPL BX, $0x08
9348 JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9349 CMPL BX, $0x0c
9350 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9351 CMPL DI, $0x00000800
9352 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9353
9354cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// R11 = length - 4. Values below 0x104 use the 3-byte form; otherwise a
	// 4-byte form is written: the 16-bit value 0x0019, then R11 - 256 as
	// 16 bits.
9355 CMPL R11, $0x00000104
9356 JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9357 LEAL -256(R11), R11
9358 MOVW $0x0019, (AX)
9359 MOVW R11, 2(AX)
9360 ADDQ $0x04, AX
9361 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9362
9363repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// 3-byte form: the 16-bit value 0x0015, then (R11 - 4) as one byte.
9364 LEAL -4(R11), R11
9365 MOVW $0x0015, (AX)
9366 MOVB R11, 2(AX)
9367 ADDQ $0x03, AX
9368 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9369
9370repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// 2-byte form: the 16-bit value (R11 << 2) | 1.
9371 SHLL $0x02, R11
9372 ORL $0x01, R11
9373 MOVW R11, (AX)
9374 ADDQ $0x02, AX
9375 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9376
9377repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// 2-byte form carrying the offset: first byte is
	// 1 | (R11 << 2) | ((offset >> 8) << 5), second byte the low offset byte.
9378 XORQ BX, BX
9379 LEAL 1(BX)(R11*4), R11
9380 MOVB DI, 1(AX)
9381 SARL $0x08, DI
9382 SHLL $0x05, DI
9383 ORL DI, R11
9384 MOVB R11, (AX)
9385 ADDQ $0x02, AX
9386 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9387
9388long_offset_short_match_nolit_encodeBetterBlockAsm10B:
	// Length > 64 and offset >= 2048: write tag byte 0xee followed by the
	// 16-bit offset, and account for 60 bytes of the match (R11 -= 60).
9389 MOVB $0xee, (AX)
9390 MOVW DI, 1(AX)
9391 LEAL -60(R11), R11
9392 ADDQ $0x03, AX
9393
9394 // emitRepeat
	// BX = remaining length, R11 = remaining length - 4. Lengths up to 8 use
	// the plain 2-byte form; lengths below 12 with offset < 2048 use the
	// 2-byte form with offset; everything else uses the 3- or 4-byte form.
9395 MOVL R11, BX
9396 LEAL -4(R11), R11
9397 CMPL BX, $0x08
9398 JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
9399 CMPL BX, $0x0c
9400 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
9401 CMPL DI, $0x00000800
9402 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
9403
9404cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// R11 below 0x104 uses the 3-byte form; otherwise write the 16-bit value
	// 0x0019 followed by R11 - 256 as 16 bits.
9405 CMPL R11, $0x00000104
9406 JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
9407 LEAL -256(R11), R11
9408 MOVW $0x0019, (AX)
9409 MOVW R11, 2(AX)
9410 ADDQ $0x04, AX
9411 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9412
9413repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// 3-byte form: the 16-bit value 0x0015, then (R11 - 4) as one byte.
9414 LEAL -4(R11), R11
9415 MOVW $0x0015, (AX)
9416 MOVB R11, 2(AX)
9417 ADDQ $0x03, AX
9418 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9419
9420repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// 2-byte form: the 16-bit value (R11 << 2) | 1.
9421 SHLL $0x02, R11
9422 ORL $0x01, R11
9423 MOVW R11, (AX)
9424 ADDQ $0x02, AX
9425 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9426
9427repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// 2-byte form carrying the offset: first byte is
	// 1 | (R11 << 2) | ((offset >> 8) << 5), second byte the low offset byte.
9428 XORQ BX, BX
9429 LEAL 1(BX)(R11*4), R11
9430 MOVB DI, 1(AX)
9431 SARL $0x08, DI
9432 SHLL $0x05, DI
9433 ORL DI, R11
9434 MOVB R11, (AX)
9435 ADDQ $0x02, AX
9436 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9437
9438two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
	// Length <= 64. BX = length << 2. The 2-byte element is used only when
	// length < 12 and offset < 2048; otherwise the 3-byte element is used.
9439 MOVL R11, BX
9440 SHLL $0x02, BX
9441 CMPL R11, $0x0c
9442 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
9443 CMPL DI, $0x00000800
9444 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
	// (length << 2) - 15 equals ((length - 4) << 2) | 1. The offset bits
	// above the low byte are placed in bits 5 and up of the first byte.
9445 LEAL -15(BX), BX
9446 MOVB DI, 1(AX)
9447 SHRL $0x08, DI
9448 SHLL $0x05, DI
9449 ORL DI, BX
9450 MOVB BL, (AX)
9451 ADDQ $0x02, AX
9452 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9453
9454emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
	// (length << 2) - 2 equals ((length - 1) << 2) | 2; it is followed by the
	// 16-bit offset.
9455 LEAL -2(BX), BX
9456 MOVB BL, (AX)
9457 MOVW DI, 1(AX)
9458 ADDQ $0x03, AX
9459 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9460
9461match_is_repeat_encodeBetterBlockAsm10B:
	// Reached when the match offset equals the previous one in 16(SP).
	// First emit the pending literals src[12(SP):SI], where SI is the match
	// start; nothing is emitted when the two are equal.
9462 MOVL 12(SP), BX
9463 CMPL BX, SI
9464 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
	// R8 = literal length, R9 = pointer to the literals, BX = length - 1.
	// 12(SP) is moved up to the match start.
9465 MOVL SI, R8
9466 MOVL SI, 12(SP)
9467 LEAQ (DX)(BX*1), R9
9468 SUBL BX, R8
9469 LEAL -1(R8), BX
	// Header size is chosen from length-1: below 0x3c one byte, below 0x100
	// two bytes, otherwise three bytes.
9470 CMPL BX, $0x3c
9471 JB one_byte_match_emit_repeat_encodeBetterBlockAsm10B
9472 CMPL BX, $0x00000100
9473 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
	// This JB tests the same flags as the JB above, so it is never taken;
	// lengths of 0x100 and more simply fall through into the next label.
9474 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm10B
9475
9476three_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
	// Tag byte 0xf4 followed by length-1 as a 16-bit value.
9477 MOVB $0xf4, (AX)
9478 MOVW BX, 1(AX)
9479 ADDQ $0x03, AX
9480 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
9481
9482two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
	// Tag byte 0xf0 followed by length-1 as one byte. Lengths up to 64
	// (length-1 < 0x40) use the short copy, longer ones the long copy.
9483 MOVB $0xf0, (AX)
9484 MOVB BL, 1(AX)
9485 ADDQ $0x02, AX
9486 CMPL BX, $0x40
9487 JB memmove_match_emit_repeat_encodeBetterBlockAsm10B
9488 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
9489
9490one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
	// Single tag byte: (length-1) << 2.
9491 SHLB $0x02, BL
9492 MOVB BL, (AX)
9493 ADDQ $0x01, AX
9494
9495memmove_match_emit_repeat_encodeBetterBlockAsm10B:
	// BX = dst pointer after the literals; it becomes AX once the copy is done.
9496 LEAQ (AX)(R8*1), BX
9497
9498 // genMemMoveShort
	// Copies R8 (at most 64) bytes from R9 to AX. Each size class loads the
	// first and the last chunk of the range (they may overlap) before storing.
9499 CMPQ R8, $0x04
9500 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
9501 CMPQ R8, $0x08
9502 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
9503 CMPQ R8, $0x10
9504 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
9505 CMPQ R8, $0x20
9506 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
9507 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64
9508
9509emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
	// Lengths 1..4: always moves 4 bytes; AX still only advances by R8.
	// NOTE(review): for R8 < 4 this reads and writes up to 3 bytes beyond the
	// literal run; presumably covered by the src/dst margins set up at
	// function entry. Confirm.
9510 MOVL (R9), R10
9511 MOVL R10, (AX)
9512 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
9513
9514emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
9515 MOVL (R9), R10
9516 MOVL -4(R9)(R8*1), R9
9517 MOVL R10, (AX)
9518 MOVL R9, -4(AX)(R8*1)
9519 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
9520
9521emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
9522 MOVQ (R9), R10
9523 MOVQ -8(R9)(R8*1), R9
9524 MOVQ R10, (AX)
9525 MOVQ R9, -8(AX)(R8*1)
9526 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
9527
9528emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
9529 MOVOU (R9), X0
9530 MOVOU -16(R9)(R8*1), X1
9531 MOVOU X0, (AX)
9532 MOVOU X1, -16(AX)(R8*1)
9533 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
9534
9535emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
9536 MOVOU (R9), X0
9537 MOVOU 16(R9), X1
9538 MOVOU -32(R9)(R8*1), X2
9539 MOVOU -16(R9)(R8*1), X3
9540 MOVOU X0, (AX)
9541 MOVOU X1, 16(AX)
9542 MOVOU X2, -32(AX)(R8*1)
9543 MOVOU X3, -16(AX)(R8*1)
9544
9545memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
9546 MOVQ BX, AX
9547 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
9548
9549memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
	// BX = dst pointer after the literals; it becomes AX once the copy is done.
9550 LEAQ (AX)(R8*1), BX
9551
9552 // genMemMoveLong
	// The first 32 and last 32 bytes are loaded into X0..X3 up front and
	// stored last with unaligned stores. The middle is moved 32 bytes at a
	// time with MOVOA stores: R13 starts at 64 - (AX & 31), which makes
	// -32(AX)(R13*1) a multiple of 32. R12 = length / 32 is the block count.
9553 MOVOU (R9), X0
9554 MOVOU 16(R9), X1
9555 MOVOU -32(R9)(R8*1), X2
9556 MOVOU -16(R9)(R8*1), X3
9557 MOVQ R8, R12
9558 SHRQ $0x05, R12
9559 MOVQ AX, R10
9560 ANDL $0x0000001f, R10
9561 MOVQ $0x00000040, R13
9562 SUBQ R10, R13
	// DECQ leaves CF as set by the SUBQ above, so JA/JNA below effectively
	// depend on whether the decremented R12 is zero.
9563 DECQ R12
9564 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
9565 LEAQ -32(R9)(R13*1), R10
9566 LEAQ -32(AX)(R13*1), R14
9567
9568emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
9569 MOVOU (R10), X4
9570 MOVOU 16(R10), X5
9571 MOVOA X4, (R14)
9572 MOVOA X5, 16(R14)
9573 ADDQ $0x20, R14
9574 ADDQ $0x20, R10
9575 ADDQ $0x20, R13
9576 DECQ R12
9577 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back
9578
9579emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	// Move 32-byte blocks while the length R8 is >= the running offset R13.
9580 MOVOU -32(R9)(R13*1), X4
9581 MOVOU -16(R9)(R13*1), X5
9582 MOVOA X4, -32(AX)(R13*1)
9583 MOVOA X5, -16(AX)(R13*1)
9584 ADDQ $0x20, R13
9585 CMPQ R8, R13
9586 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	// Finally store the saved head and tail, then advance AX past the literals.
9587 MOVOU X0, (AX)
9588 MOVOU X1, 16(AX)
9589 MOVOU X2, -32(AX)(R8*1)
9590 MOVOU X3, -16(AX)(R8*1)
9591 MOVQ BX, AX
9592
9593emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
	// CX moves to the end of the match, R11 becomes the full match length
	// (matchLen result + 4), and 12(SP) is set to the end of the match.
	// DI still holds the match offset, which equals the previous one here.
9594 ADDL R11, CX
9595 ADDL $0x04, R11
9596 MOVL CX, 12(SP)
9597
9598 // emitRepeat
	// NOTE(review): the byte layouts below look like S2 repeat encodings;
	// confirm the tag meanings against the format specification.
	// BX = length, R11 = length - 4. Lengths up to 8 use the plain 2-byte
	// form; lengths below 12 with offset < 2048 use the 2-byte form with
	// offset; everything else uses the 3- or 4-byte form.
9599 MOVL R11, BX
9600 LEAL -4(R11), R11
9601 CMPL BX, $0x08
9602 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
9603 CMPL BX, $0x0c
9604 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
9605 CMPL DI, $0x00000800
9606 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
9607
9608cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
	// R11 below 0x104 uses the 3-byte form; otherwise write the 16-bit value
	// 0x0019 followed by R11 - 256 as 16 bits.
9609 CMPL R11, $0x00000104
9610 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
9611 LEAL -256(R11), R11
9612 MOVW $0x0019, (AX)
9613 MOVW R11, 2(AX)
9614 ADDQ $0x04, AX
9615 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9616
9617repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
	// 3-byte form: the 16-bit value 0x0015, then (R11 - 4) as one byte.
9618 LEAL -4(R11), R11
9619 MOVW $0x0015, (AX)
9620 MOVB R11, 2(AX)
9621 ADDQ $0x03, AX
9622 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9623
9624repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
	// 2-byte form: the 16-bit value (R11 << 2) | 1.
9625 SHLL $0x02, R11
9626 ORL $0x01, R11
9627 MOVW R11, (AX)
9628 ADDQ $0x02, AX
9629 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9630
9631repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
	// 2-byte form carrying the offset: first byte is
	// 1 | (R11 << 2) | ((offset >> 8) << 5), second byte the low offset byte.
	// Execution then falls through into the label that follows this block.
9632 XORQ BX, BX
9633 LEAL 1(BX)(R11*4), R11
9634 MOVB DI, 1(AX)
9635 SARL $0x08, DI
9636 SHLL $0x05, DI
9637 ORL DI, R11
9638 MOVB R11, (AX)
9639 ADDQ $0x02, AX
9640
9641match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
	// After a match has been emitted: finish with emit_remainder when s (CX)
	// has reached the search limit at 8(SP), and return 0 when the dst
	// pointer has reached the limit at (SP).
9642 CMPL CX, 8(SP)
9643 JAE emit_remainder_encodeBetterBlockAsm10B
9644 CMPQ AX, (SP)
9645 JB match_nolit_dst_ok_encodeBetterBlockAsm10B
9646 MOVQ $0x00000000, ret+48(FP)
9647 RET
9648
9649match_nolit_dst_ok_encodeBetterBlockAsm10B:
	// Add positions inside the match to the tables. SI was the match start
	// and becomes start + 1; R8 = s - 2, where s is the end of the match.
9650 MOVQ $0x0000cf1bbcdcbf9b, BX
9651 MOVQ $0x9e3779b1, DI
9652 LEAQ 1(SI), SI
9653 LEAQ -2(CX), R8
9654 MOVQ (DX)(SI*1), R9
9655 MOVQ 1(DX)(SI*1), R10
9656 MOVQ (DX)(R8*1), R11
9657 MOVQ 1(DX)(R8*1), R12
	// 12-bit hashes (multiplier BX) of the bytes at SI and R8, and 10-bit
	// hashes (multiplier DI) of the bytes at SI+1 and R8+1.
9658 SHLQ $0x10, R9
9659 IMULQ BX, R9
9660 SHRQ $0x34, R9
9661 SHLQ $0x20, R10
9662 IMULQ DI, R10
9663 SHRQ $0x36, R10
9664 SHLQ $0x10, R11
9665 IMULQ BX, R11
9666 SHRQ $0x34, R11
9667 SHLQ $0x20, R12
9668 IMULQ DI, R12
9669 SHRQ $0x36, R12
	// DI is reused from here on: first as SI + 1, then as the midpoint below.
9670 LEAQ 1(SI), DI
9671 LEAQ 1(R8), R13
	// SI and R8 go into the 12-bit table; SI+1 and R8+1 into the 10-bit table.
9672 MOVL SI, 24(SP)(R9*4)
9673 MOVL R8, 24(SP)(R11*4)
9674 MOVL DI, 16408(SP)(R10*4)
9675 MOVL R13, 16408(SP)(R12*4)
	// DI = (SI + R8 + 1) >> 1, the midpoint of the range. SI then moves
	// forward by one and R8 back by one.
9676 LEAQ 1(R8)(SI*1), DI
9677 SHRQ $0x01, DI
9678 ADDQ $0x01, SI
9679 SUBQ $0x01, R8
9680
9681index_loop_encodeBetterBlockAsm10B:
	// While DI < R8: store SI and DI in the 12-bit table under the hashes of
	// the bytes at those positions, stepping both by 2. Then resume searching.
9682 CMPQ DI, R8
9683 JAE search_loop_encodeBetterBlockAsm10B
9684 MOVQ (DX)(SI*1), R9
9685 MOVQ (DX)(DI*1), R10
9686 SHLQ $0x10, R9
9687 IMULQ BX, R9
9688 SHRQ $0x34, R9
9689 SHLQ $0x10, R10
9690 IMULQ BX, R10
9691 SHRQ $0x34, R10
9692 MOVL SI, 24(SP)(R9*4)
9693 MOVL DI, 24(SP)(R10*4)
9694 ADDQ $0x02, SI
9695 ADDQ $0x02, DI
9696 JMP index_loop_encodeBetterBlockAsm10B
9697
9698emit_remainder_encodeBetterBlockAsm10B:
	// Emits everything from 12(SP) to the end of src as literals.
	// Return 0 when AX + (src_len - 12(SP)) + 3 would reach the dst limit
	// at (SP).
9699 MOVQ src_len+32(FP), CX
9700 SUBL 12(SP), CX
9701 LEAQ 3(AX)(CX*1), CX
9702 CMPQ CX, (SP)
9703 JB emit_remainder_ok_encodeBetterBlockAsm10B
9704 MOVQ $0x00000000, ret+48(FP)
9705 RET
9706
9707emit_remainder_ok_encodeBetterBlockAsm10B:
	// Nothing is left to emit when 12(SP) already equals src_len.
9708 MOVQ src_len+32(FP), CX
9709 MOVL 12(SP), BX
9710 CMPL BX, CX
9711 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
	// SI = literal length, CX = pointer to the literals, DX = length - 1.
	// From here on DX no longer holds src_base.
9712 MOVL CX, SI
9713 MOVL CX, 12(SP)
9714 LEAQ (DX)(BX*1), CX
9715 SUBL BX, SI
9716 LEAL -1(SI), DX
	// Header size is chosen from length-1: below 0x3c one byte, below 0x100
	// two bytes, otherwise three bytes.
	// NOTE(review): the tag values look like Snappy/S2 literal tags; confirm
	// against the format specification.
9717 CMPL DX, $0x3c
9718 JB one_byte_emit_remainder_encodeBetterBlockAsm10B
9719 CMPL DX, $0x00000100
9720 JB two_bytes_emit_remainder_encodeBetterBlockAsm10B
	// This JB tests the same flags as the JB above, so it is never taken;
	// lengths of 0x100 and more simply fall through into the next label.
9721 JB three_bytes_emit_remainder_encodeBetterBlockAsm10B
9722
9723three_bytes_emit_remainder_encodeBetterBlockAsm10B:
	// Tag byte 0xf4 followed by length-1 as a 16-bit value.
9724 MOVB $0xf4, (AX)
9725 MOVW DX, 1(AX)
9726 ADDQ $0x03, AX
9727 JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
9728
9729two_bytes_emit_remainder_encodeBetterBlockAsm10B:
	// Tag byte 0xf0 followed by length-1 as one byte. Lengths up to 64
	// (length-1 < 0x40) use the short copy, longer ones the long copy.
9730 MOVB $0xf0, (AX)
9731 MOVB DL, 1(AX)
9732 ADDQ $0x02, AX
9733 CMPL DX, $0x40
9734 JB memmove_emit_remainder_encodeBetterBlockAsm10B
9735 JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
9736
9737one_byte_emit_remainder_encodeBetterBlockAsm10B:
	// Single tag byte: (length-1) << 2.
9738 SHLB $0x02, DL
9739 MOVB DL, (AX)
9740 ADDQ $0x01, AX
9741
9742memmove_emit_remainder_encodeBetterBlockAsm10B:
	// DX = dst pointer after the literals, BX = literal length.
9743 LEAQ (AX)(SI*1), DX
9744 MOVL SI, BX
9745
9746 // genMemMoveShort
	// Copies exactly BX (at most 64) bytes from CX to AX. Sizes 1, 2 and 3
	// get dedicated paths here. Each path loads everything it needs before
	// storing, and the first/last chunks may overlap.
9747 CMPQ BX, $0x03
9748 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2
9749 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3
9750 CMPQ BX, $0x08
9751 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
9752 CMPQ BX, $0x10
9753 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
9754 CMPQ BX, $0x20
9755 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
9756 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64
9757
9758emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2:
	// First byte and last byte; for length 1 they are the same byte.
9759 MOVB (CX), SI
9760 MOVB -1(CX)(BX*1), CL
9761 MOVB SI, (AX)
9762 MOVB CL, -1(AX)(BX*1)
9763 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9764
9765emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3:
9766 MOVW (CX), SI
9767 MOVB 2(CX), CL
9768 MOVW SI, (AX)
9769 MOVB CL, 2(AX)
9770 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9771
9772emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
9773 MOVL (CX), SI
9774 MOVL -4(CX)(BX*1), CX
9775 MOVL SI, (AX)
9776 MOVL CX, -4(AX)(BX*1)
9777 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9778
9779emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
9780 MOVQ (CX), SI
9781 MOVQ -8(CX)(BX*1), CX
9782 MOVQ SI, (AX)
9783 MOVQ CX, -8(AX)(BX*1)
9784 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9785
9786emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
9787 MOVOU (CX), X0
9788 MOVOU -16(CX)(BX*1), X1
9789 MOVOU X0, (AX)
9790 MOVOU X1, -16(AX)(BX*1)
9791 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9792
9793emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
9794 MOVOU (CX), X0
9795 MOVOU 16(CX), X1
9796 MOVOU -32(CX)(BX*1), X2
9797 MOVOU -16(CX)(BX*1), X3
9798 MOVOU X0, (AX)
9799 MOVOU X1, 16(AX)
9800 MOVOU X2, -32(AX)(BX*1)
9801 MOVOU X3, -16(AX)(BX*1)
9802
9803memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
9804 MOVQ DX, AX
9805 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
9806
9807memmove_long_emit_remainder_encodeBetterBlockAsm10B:
	// DX = dst pointer after the literals, BX = literal length.
9808 LEAQ (AX)(SI*1), DX
9809 MOVL SI, BX
9810
9811 // genMemMoveLong
	// The first 32 and last 32 bytes are loaded into X0..X3 up front and
	// stored last with unaligned stores. The middle is moved 32 bytes at a
	// time with MOVOA stores: R8 starts at 64 - (AX & 31), which makes
	// -32(AX)(R8*1) a multiple of 32. DI = length / 32 is the block count.
9812 MOVOU (CX), X0
9813 MOVOU 16(CX), X1
9814 MOVOU -32(CX)(BX*1), X2
9815 MOVOU -16(CX)(BX*1), X3
9816 MOVQ BX, DI
9817 SHRQ $0x05, DI
9818 MOVQ AX, SI
9819 ANDL $0x0000001f, SI
9820 MOVQ $0x00000040, R8
9821 SUBQ SI, R8
	// DECQ leaves CF as set by the SUBQ above, so JA/JNA below effectively
	// depend on whether the decremented DI is zero.
9822 DECQ DI
9823 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
9824 LEAQ -32(CX)(R8*1), SI
9825 LEAQ -32(AX)(R8*1), R9
9826
9827emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
9828 MOVOU (SI), X4
9829 MOVOU 16(SI), X5
9830 MOVOA X4, (R9)
9831 MOVOA X5, 16(R9)
9832 ADDQ $0x20, R9
9833 ADDQ $0x20, SI
9834 ADDQ $0x20, R8
9835 DECQ DI
9836 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back
9837
9838emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	// Move 32-byte blocks while the length BX is >= the running offset R8.
9839 MOVOU -32(CX)(R8*1), X4
9840 MOVOU -16(CX)(R8*1), X5
9841 MOVOA X4, -32(AX)(R8*1)
9842 MOVOA X5, -16(AX)(R8*1)
9843 ADDQ $0x20, R8
9844 CMPQ BX, R8
9845 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	// Finally store the saved head and tail, then advance AX past the literals.
9846 MOVOU X0, (AX)
9847 MOVOU X1, 16(AX)
9848 MOVOU X2, -32(AX)(BX*1)
9849 MOVOU X3, -16(AX)(BX*1)
9850 MOVQ DX, AX
9851
9852emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
	// Return value = AX - dst_base, the number of bytes written to dst.
9853 MOVQ dst_base+0(FP), CX
9854 SUBQ CX, AX
9855 MOVQ AX, ret+48(FP)
9856 RET
9857
9858// func encodeBetterBlockAsm8B(dst []byte, src []byte) int
9859// Requires: BMI, SSE2
//
// Encodes src into dst and returns the number of bytes written (final dst
// pointer minus dst_base). Returns 0 instead whenever the output pointer would
// reach the dst limit stored at 0(SP).
//
// Frame $5144-56: 56 bytes of arguments/result and 5144 bytes of locals:
//   0(SP)    8 bytes     dst limit: dst_base + len(src) - 6 - len(src)>>5
//   8(SP)    4 bytes     search limit: len(src) - 8
//   12(SP)   4 bytes     src position of the first byte not yet written to dst
//   16(SP)   4 bytes     offset of the previous match (starts at 0)
//   20(SP)   4 bytes     src position to continue at when the current one fails
//   24(SP)   4096 bytes  table of positions, 10-bit hash of 6 source bytes
//   4120(SP) 1024 bytes  table of positions, 8-bit hash of 4 source bytes
//
// Main-loop registers: AX = dst write pointer, DX = src base pointer,
// CX = current src position, BX = candidate (earlier) src position.
// NOTE(review): the function name suggests inputs are limited to a small size
// class; nothing in this body enforces that — confirm against the Go caller.
9860TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
9861 MOVQ dst_base+0(FP), AX
9862 MOVQ $0x00000028, CX
9863 LEAQ 24(SP), DX
9864 PXOR X0, X0
9865
 // Clear both tables: 0x28 (40) iterations of 128 bytes = 5120 bytes
 // starting at 24(SP).
9866zero_loop_encodeBetterBlockAsm8B:
9867 MOVOU X0, (DX)
9868 MOVOU X0, 16(DX)
9869 MOVOU X0, 32(DX)
9870 MOVOU X0, 48(DX)
9871 MOVOU X0, 64(DX)
9872 MOVOU X0, 80(DX)
9873 MOVOU X0, 96(DX)
9874 MOVOU X0, 112(DX)
9875 ADDQ $0x80, DX
9876 DECQ CX
9877 JNZ zero_loop_encodeBetterBlockAsm8B
 // Initialise the scalar state described in the header: nothing emitted yet,
 // search limit, dst limit, start searching at position 1, previous offset 0.
9878 MOVL $0x00000000, 12(SP)
9879 MOVQ src_len+32(FP), CX
9880 LEAQ -6(CX), DX
9881 LEAQ -8(CX), BX
9882 MOVL BX, 8(SP)
9883 SHRQ $0x05, CX
9884 SUBL CX, DX
9885 LEAQ (AX)(DX*1), DX
9886 MOVQ DX, (SP)
9887 MOVL $0x00000001, CX
9888 MOVL $0x00000000, 16(SP)
9889 MOVQ src_base+24(FP), DX
9890
 // Search for a match at position CX.
 // Next position to try on failure is CX + 1 + (CX - 12(SP))>>4, so the step
 // grows with the number of bytes not yet emitted. When that next position
 // reaches the limit at 8(SP), stop searching and emit the remainder.
9891search_loop_encodeBetterBlockAsm8B:
9892 MOVL CX, BX
9893 SUBL 12(SP), BX
9894 SHRL $0x04, BX
9895 LEAL 1(CX)(BX*1), BX
9896 CMPL BX, 8(SP)
9897 JAE emit_remainder_encodeBetterBlockAsm8B
 // SI = 8 source bytes at CX. R9 = 10-bit hash of the low 6 bytes (SHLQ 16
 // drops the top two), R10 = 8-bit hash of the low 4 bytes. Fetch the stored
 // candidates (BX from the 6-byte table, DI from the 4-byte table) and
 // replace both entries with the current position.
9898 MOVQ (DX)(CX*1), SI
9899 MOVL BX, 20(SP)
9900 MOVQ $0x0000cf1bbcdcbf9b, R8
9901 MOVQ $0x9e3779b1, BX
9902 MOVQ SI, R9
9903 MOVQ SI, R10
9904 SHLQ $0x10, R9
9905 IMULQ R8, R9
9906 SHRQ $0x36, R9
9907 SHLQ $0x20, R10
9908 IMULQ BX, R10
9909 SHRQ $0x38, R10
9910 MOVL 24(SP)(R9*4), BX
9911 MOVL 4120(SP)(R10*4), DI
9912 MOVL CX, 24(SP)(R9*4)
9913 MOVL CX, 4120(SP)(R10*4)
 // Compare all 8 bytes at each candidate with SI; the 6-byte-table candidate
 // is checked first, the 4-byte-table candidate (moved into BX) second.
9914 MOVQ (DX)(BX*1), R9
9915 MOVQ (DX)(DI*1), R10
9916 CMPQ R9, SI
9917 JEQ candidate_match_encodeBetterBlockAsm8B
9918 CMPQ R10, SI
9919 JNE no_short_found_encodeBetterBlockAsm8B
9920 MOVL DI, BX
9921 JMP candidate_match_encodeBetterBlockAsm8B
9922
 // No 8-byte match: accept a match on the low 4 bytes only. If neither
 // candidate matches, continue at the position saved in 20(SP).
9923no_short_found_encodeBetterBlockAsm8B:
9924 CMPL R9, SI
9925 JEQ candidate_match_encodeBetterBlockAsm8B
9926 CMPL R10, SI
9927 JEQ candidateS_match_encodeBetterBlockAsm8B
9928 MOVL 20(SP), CX
9929 JMP search_loop_encodeBetterBlockAsm8B
9930
 // The 4-byte-table candidate (DI) matched 4 bytes. Before using it, hash the
 // bytes starting one position later (SI >> 8) into the 6-byte table, store
 // CX+1 there, and test the entry that was found: if its 4 bytes equal the
 // source at CX+1, use it with CX advanced by one. Otherwise restore CX and
 // fall back to DI.
9931candidateS_match_encodeBetterBlockAsm8B:
9932 SHRQ $0x08, SI
9933 MOVQ SI, R9
9934 SHLQ $0x10, R9
9935 IMULQ R8, R9
9936 SHRQ $0x36, R9
9937 MOVL 24(SP)(R9*4), BX
9938 INCL CX
9939 MOVL CX, 24(SP)(R9*4)
9940 CMPL (DX)(BX*1), SI
9941 JEQ candidate_match_encodeBetterBlockAsm8B
9942 DECL CX
9943 MOVL DI, BX
9944
 // A match of at least 4 bytes exists between position CX and candidate BX.
 // Extend it backwards one byte at a time while the preceding bytes are
 // equal, stopping at 12(SP) (held in SI) or when the candidate reaches 0.
9945candidate_match_encodeBetterBlockAsm8B:
9946 MOVL 12(SP), SI
9947 TESTL BX, BX
9948 JZ match_extend_back_end_encodeBetterBlockAsm8B
9949
9950match_extend_back_loop_encodeBetterBlockAsm8B:
9951 CMPL CX, SI
9952 JBE match_extend_back_end_encodeBetterBlockAsm8B
9953 MOVB -1(DX)(BX*1), DI
9954 MOVB -1(DX)(CX*1), R8
9955 CMPB DI, R8
9956 JNE match_extend_back_end_encodeBetterBlockAsm8B
9957 LEAL -1(CX), CX
9958 DECL BX
9959 JZ match_extend_back_end_encodeBetterBlockAsm8B
9960 JMP match_extend_back_loop_encodeBetterBlockAsm8B
9961
 // Space check: return 0 unless AX + (CX - 12(SP)) + 3 is below the dst limit.
 // NOTE(review): the constant 3 presumably covers the largest literal header
 // written below (tag + 2 length bytes) — confirm in gen.go.
9962match_extend_back_end_encodeBetterBlockAsm8B:
9963 MOVL CX, SI
9964 SUBL 12(SP), SI
9965 LEAQ 3(AX)(SI*1), SI
9966 CMPQ SI, (SP)
9967 JB match_dst_size_check_encodeBetterBlockAsm8B
9968 MOVQ $0x00000000, ret+48(FP)
9969 RET
9970
 // SI keeps the match start. Both positions skip the 4 bytes already compared;
 // DI = source bytes remaining after CX, R8/R9 = pointers to the current and
 // candidate data from which the match is extended forwards.
9971match_dst_size_check_encodeBetterBlockAsm8B:
9972 MOVL CX, SI
9973 ADDL $0x04, CX
9974 ADDL $0x04, BX
9975 MOVQ src_len+32(FP), DI
9976 SUBL CX, DI
9977 LEAQ (DX)(CX*1), R8
9978 LEAQ (DX)(BX*1), R9
9979
9980 // matchLen
 // R11 counts the additional matching bytes. Compare 16 bytes per iteration
 // while at least 16 remain; on a difference the XOR result is non-zero and
 // its lowest set bit index divided by 8 is the number of equal bytes.
9981 XORL R11, R11
9982
9983matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B:
9984 CMPL DI, $0x10
9985 JB matchlen_match8_match_nolit_encodeBetterBlockAsm8B
9986 MOVQ (R8)(R11*1), R10
9987 MOVQ 8(R8)(R11*1), R12
9988 XORQ (R9)(R11*1), R10
9989 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
9990 XORQ 8(R9)(R11*1), R12
9991 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B
9992 LEAL -16(DI), DI
9993 LEAL 16(R11), R11
9994 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B
9995
 // Difference in the second 8 bytes: R11 += 8 + (bit index >> 3).
 // TZCNTQ is used when GOAMD64_v3 is defined, BSFQ otherwise.
9996matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B:
9997#ifdef GOAMD64_v3
9998 TZCNTQ R12, R12
9999
10000#else
10001 BSFQ R12, R12
10002
10003#endif
10004 SARQ $0x03, R12
10005 LEAL 8(R11)(R12*1), R11
10006 JMP match_nolit_end_encodeBetterBlockAsm8B
10007
 // Fewer than 16 bytes remain: try one 8-byte comparison.
10008matchlen_match8_match_nolit_encodeBetterBlockAsm8B:
10009 CMPL DI, $0x08
10010 JB matchlen_match4_match_nolit_encodeBetterBlockAsm8B
10011 MOVQ (R8)(R11*1), R10
10012 XORQ (R9)(R11*1), R10
10013 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
10014 LEAL -8(DI), DI
10015 LEAL 8(R11), R11
10016 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm8B
10017
 // Difference in the first 8 bytes: R11 += bit index >> 3.
10018matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B:
10019#ifdef GOAMD64_v3
10020 TZCNTQ R10, R10
10021
10022#else
10023 BSFQ R10, R10
10024
10025#endif
10026 SARQ $0x03, R10
10027 LEAL (R11)(R10*1), R11
10028 JMP match_nolit_end_encodeBetterBlockAsm8B
10029
 // Tail: compare 4, then 2, then 1 byte as far as the remaining count allows.
10030matchlen_match4_match_nolit_encodeBetterBlockAsm8B:
10031 CMPL DI, $0x04
10032 JB matchlen_match2_match_nolit_encodeBetterBlockAsm8B
10033 MOVL (R8)(R11*1), R10
10034 CMPL (R9)(R11*1), R10
10035 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B
10036 LEAL -4(DI), DI
10037 LEAL 4(R11), R11
10038
10039matchlen_match2_match_nolit_encodeBetterBlockAsm8B:
10040 CMPL DI, $0x01
10041 JE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
10042 JB match_nolit_end_encodeBetterBlockAsm8B
10043 MOVW (R8)(R11*1), R10
10044 CMPW (R9)(R11*1), R10
10045 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
10046 LEAL 2(R11), R11
10047 SUBL $0x02, DI
10048 JZ match_nolit_end_encodeBetterBlockAsm8B
10049
10050matchlen_match1_match_nolit_encodeBetterBlockAsm8B:
10051 MOVB (R8)(R11*1), R10
10052 CMPB (R9)(R11*1), R10
10053 JNE match_nolit_end_encodeBetterBlockAsm8B
10054 LEAL 1(R11), R11
10055
 // DI = match offset (current position minus candidate position).
10056match_nolit_end_encodeBetterBlockAsm8B:
10057 MOVL CX, DI
10058 SUBL BX, DI
10059
10060 // Check if repeat
 // Same offset as the previous match: take the repeat path. Otherwise store
 // the new offset and emit the pending bytes src[12(SP):SI] as a literal run
 // (skipped when empty). R8 = run length, R9 = run source, BX = length - 1.
10061 CMPL 16(SP), DI
10062 JEQ match_is_repeat_encodeBetterBlockAsm8B
10063 MOVL DI, 16(SP)
10064 MOVL 12(SP), BX
10065 CMPL BX, SI
10066 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B
10067 MOVL SI, R8
10068 MOVL SI, 12(SP)
10069 LEAQ (DX)(BX*1), R9
10070 SUBL BX, R8
10071 LEAL -1(R8), BX
 // Header size by length - 1: below 0x3c one byte, below 0x100 two bytes,
 // otherwise three bytes.
 // NOTE(review): the second JB below tests the same flags as the first and
 // can never be taken; execution falls through to the three-byte case anyway.
10072 CMPL BX, $0x3c
10073 JB one_byte_match_emit_encodeBetterBlockAsm8B
10074 CMPL BX, $0x00000100
10075 JB two_bytes_match_emit_encodeBetterBlockAsm8B
10076 JB three_bytes_match_emit_encodeBetterBlockAsm8B
10077
 // Header: byte 0xf4 followed by the 16-bit value length - 1.
10078three_bytes_match_emit_encodeBetterBlockAsm8B:
10079 MOVB $0xf4, (AX)
10080 MOVW BX, 1(AX)
10081 ADDQ $0x03, AX
10082 JMP memmove_long_match_emit_encodeBetterBlockAsm8B
10083
 // Header: byte 0xf0 followed by the 8-bit value length - 1. Runs with
 // length - 1 below 0x40 use the short copy, longer ones the long copy.
10084two_bytes_match_emit_encodeBetterBlockAsm8B:
10085 MOVB $0xf0, (AX)
10086 MOVB BL, 1(AX)
10087 ADDQ $0x02, AX
10088 CMPL BX, $0x40
10089 JB memmove_match_emit_encodeBetterBlockAsm8B
10090 JMP memmove_long_match_emit_encodeBetterBlockAsm8B
10091
 // Header: single byte (length - 1) << 2.
10092one_byte_match_emit_encodeBetterBlockAsm8B:
10093 SHLB $0x02, BL
10094 MOVB BL, (AX)
10095 ADDQ $0x01, AX
10096
 // Short copy of R8 bytes from R9 to AX; BX = dst pointer after the run.
10097memmove_match_emit_encodeBetterBlockAsm8B:
10098 LEAQ (AX)(R8*1), BX
10099
10100 // genMemMoveShort
 // Dispatch on length. Each case uses a move from the start and a move
 // ending at the last byte; the two may overlap.
10101 CMPQ R8, $0x04
10102 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
10103 CMPQ R8, $0x08
10104 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
10105 CMPQ R8, $0x10
10106 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
10107 CMPQ R8, $0x20
10108 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
10109 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64
10110
 // Lengths up to 4 are copied with one fixed 4-byte move, which reads and
 // writes up to 3 bytes past the run; AX still advances by exactly R8.
10111emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
10112 MOVL (R9), R10
10113 MOVL R10, (AX)
10114 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
10115
10116emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
10117 MOVL (R9), R10
10118 MOVL -4(R9)(R8*1), R9
10119 MOVL R10, (AX)
10120 MOVL R9, -4(AX)(R8*1)
10121 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
10122
10123emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
10124 MOVQ (R9), R10
10125 MOVQ -8(R9)(R8*1), R9
10126 MOVQ R10, (AX)
10127 MOVQ R9, -8(AX)(R8*1)
10128 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
10129
10130emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
10131 MOVOU (R9), X0
10132 MOVOU -16(R9)(R8*1), X1
10133 MOVOU X0, (AX)
10134 MOVOU X1, -16(AX)(R8*1)
10135 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
10136
10137emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
10138 MOVOU (R9), X0
10139 MOVOU 16(R9), X1
10140 MOVOU -32(R9)(R8*1), X2
10141 MOVOU -16(R9)(R8*1), X3
10142 MOVOU X0, (AX)
10143 MOVOU X1, 16(AX)
10144 MOVOU X2, -32(AX)(R8*1)
10145 MOVOU X3, -16(AX)(R8*1)
10146
10147memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
10148 MOVQ BX, AX
10149 JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B
10150
 // Long copy of R8 bytes from R9 to AX; BX = dst pointer after the run.
10151memmove_long_match_emit_encodeBetterBlockAsm8B:
10152 LEAQ (AX)(R8*1), BX
10153
10154 // genMemMoveLong
 // X0-X3 hold the first and last 32 bytes of the run and are stored last.
 // The middle is copied in 32-byte chunks with aligned stores: R13 starts at
 // 64 - (AX & 31), so the store address AX + R13 - 32 is 32-byte aligned.
 // R12 = number of whole 32-byte chunks in the run.
 // NOTE(review): DECQ leaves CF unchanged, so the JA/JNA after it depend on
 // ZF plus the CF left by the preceding SUBQ/ADDQ — verify before touching.
10155 MOVOU (R9), X0
10156 MOVOU 16(R9), X1
10157 MOVOU -32(R9)(R8*1), X2
10158 MOVOU -16(R9)(R8*1), X3
10159 MOVQ R8, R12
10160 SHRQ $0x05, R12
10161 MOVQ AX, R10
10162 ANDL $0x0000001f, R10
10163 MOVQ $0x00000040, R13
10164 SUBQ R10, R13
10165 DECQ R12
10166 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
10167 LEAQ -32(R9)(R13*1), R10
10168 LEAQ -32(AX)(R13*1), R14
10169
10170emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
10171 MOVOU (R10), X4
10172 MOVOU 16(R10), X5
10173 MOVOA X4, (R14)
10174 MOVOA X5, 16(R14)
10175 ADDQ $0x20, R14
10176 ADDQ $0x20, R10
10177 ADDQ $0x20, R13
10178 DECQ R12
10179 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back
10180
 // Copy 32 bytes ending at offset R13 while R8 >= R13, then store the saved
 // head and tail (unaligned) and advance AX past the run.
10181emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
10182 MOVOU -32(R9)(R13*1), X4
10183 MOVOU -16(R9)(R13*1), X5
10184 MOVOA X4, -32(AX)(R13*1)
10185 MOVOA X5, -16(AX)(R13*1)
10186 ADDQ $0x20, R13
10187 CMPQ R8, R13
10188 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
10189 MOVOU X0, (AX)
10190 MOVOU X1, 16(AX)
10191 MOVOU X2, -32(AX)(R8*1)
10192 MOVOU X3, -16(AX)(R8*1)
10193 MOVQ BX, AX
10194
 // Literals are written. Advance CX past the whole match, make R11 the full
 // match length (extension + the 4 bytes skipped earlier) and record CX in
 // 12(SP) as the new "already emitted" position.
10195emit_literal_done_match_emit_encodeBetterBlockAsm8B:
10196 ADDL R11, CX
10197 ADDL $0x04, R11
10198 MOVL CX, 12(SP)
10199
10200 // emitCopy
 // R11 = length, DI = offset. Lengths up to 0x40 use the short forms below.
 // Longer matches with offset >= 0x800 start with a 3-byte element; the rest
 // start with the 2-byte element built here: byte0 = 17 | (offset>>8)<<5,
 // byte1 = low offset byte. 8 is subtracted from the length and what is left
 // is written by the repeat code that follows.
10201 CMPL R11, $0x40
10202 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
10203 CMPL DI, $0x00000800
10204 JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B
10205 MOVL $0x00000001, BX
10206 LEAL 16(BX), BX
10207 MOVB DI, 1(AX)
10208 SHRL $0x08, DI
10209 SHLL $0x05, DI
10210 ORL DI, BX
10211 MOVB BL, (AX)
10212 ADDQ $0x02, AX
10213 SUBL $0x08, R11
10214
10215 // emitRepeat
10216 LEAL -4(R11), R11
10217 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
 // NOTE(review): the six instructions below follow an unconditional JMP and
 // carry no label, so they cannot be reached (generator residue).
10218 MOVL R11, BX
10219 LEAL -4(R11), R11
10220 CMPL BX, $0x08
10221 JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
10222 CMPL BX, $0x0c
10223 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
10224
 // R11 = remaining length - 4. Below 0x104: 3-byte element (word 0x0015, then
 // byte R11 - 4). Otherwise: 4-byte element (word 0x0019, then word R11 - 256).
10225cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
10226 CMPL R11, $0x00000104
10227 JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
10228 LEAL -256(R11), R11
10229 MOVW $0x0019, (AX)
10230 MOVW R11, 2(AX)
10231 ADDQ $0x04, AX
10232 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10233
10234repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
10235 LEAL -4(R11), R11
10236 MOVW $0x0015, (AX)
10237 MOVB R11, 2(AX)
10238 ADDQ $0x03, AX
10239 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10240
 // NOTE(review): within the visible code this label is referenced only from
 // the unreachable instructions above.
10241repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
10242 SHLL $0x02, R11
10243 ORL $0x01, R11
10244 MOVW R11, (AX)
10245 ADDQ $0x02, AX
10246 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
 // NOTE(review): unreachable — follows an unconditional JMP with no label.
10247 XORQ BX, BX
10248 LEAL 1(BX)(R11*4), R11
10249 MOVB DI, 1(AX)
10250 SARL $0x08, DI
10251 SHLL $0x05, DI
10252 ORL DI, R11
10253 MOVB R11, (AX)
10254 ADDQ $0x02, AX
10255 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10256
 // Length > 0x40 and offset >= 0x800: 3-byte element (byte 0xee, then the
 // 16-bit offset). 60 is subtracted from the length and the rest is written
 // by the repeat code that follows.
10257long_offset_short_match_nolit_encodeBetterBlockAsm8B:
10258 MOVB $0xee, (AX)
10259 MOVW DI, 1(AX)
10260 LEAL -60(R11), R11
10261 ADDQ $0x03, AX
10262
10263 // emitRepeat
 // BX = remaining length, R11 = remaining length - 4. Lengths up to 8 use the
 // 2-byte form. The JAE below targets the label it would fall through to, so
 // every other length continues there.
10264 MOVL R11, BX
10265 LEAL -4(R11), R11
10266 CMPL BX, $0x08
10267 JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
10268 CMPL BX, $0x0c
10269 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
10270
10271cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
10272 CMPL R11, $0x00000104
10273 JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
10274 LEAL -256(R11), R11
10275 MOVW $0x0019, (AX)
10276 MOVW R11, 2(AX)
10277 ADDQ $0x04, AX
10278 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10279
10280repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
10281 LEAL -4(R11), R11
10282 MOVW $0x0015, (AX)
10283 MOVB R11, 2(AX)
10284 ADDQ $0x03, AX
10285 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10286
 // 2-byte form: the 16-bit word (R11 << 2) | 1.
10287repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
10288 SHLL $0x02, R11
10289 ORL $0x01, R11
10290 MOVW R11, (AX)
10291 ADDQ $0x02, AX
10292 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
 // NOTE(review): unreachable — follows an unconditional JMP with no label.
10293 XORQ BX, BX
10294 LEAL 1(BX)(R11*4), R11
10295 MOVB DI, 1(AX)
10296 SARL $0x08, DI
10297 SHLL $0x05, DI
10298 ORL DI, R11
10299 MOVB R11, (AX)
10300 ADDQ $0x02, AX
10301 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10302
 // Length <= 0x40. BX = length << 2. Lengths of 12 or more use the 3-byte
 // element below; shorter ones use 2 bytes:
 // byte0 = (BX - 15) | (offset>>8)<<5, byte1 = low offset byte.
 // NOTE(review): unlike the long path there is no offset >= 0x800 test here;
 // presumably offsets are bounded by the input size for this variant — confirm.
10303two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
10304 MOVL R11, BX
10305 SHLL $0x02, BX
10306 CMPL R11, $0x0c
10307 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm8B
10308 LEAL -15(BX), BX
10309 MOVB DI, 1(AX)
10310 SHRL $0x08, DI
10311 SHLL $0x05, DI
10312 ORL DI, BX
10313 MOVB BL, (AX)
10314 ADDQ $0x02, AX
10315 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10316
 // 3-byte element: byte (length << 2) - 2, then the 16-bit offset.
10317emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
10318 LEAL -2(BX), BX
10319 MOVB BL, (AX)
10320 MOVW DI, 1(AX)
10321 ADDQ $0x03, AX
10322 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10323
 // Repeat path: the offset equals 16(SP), which is left unchanged. Emit the
 // pending bytes src[12(SP):SI] as a literal run exactly as on the other path,
 // here with DI = run length, R8 = run source, BX = length - 1.
10324match_is_repeat_encodeBetterBlockAsm8B:
10325 MOVL 12(SP), BX
10326 CMPL BX, SI
10327 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
10328 MOVL SI, DI
10329 MOVL SI, 12(SP)
10330 LEAQ (DX)(BX*1), R8
10331 SUBL BX, DI
10332 LEAL -1(DI), BX
 // NOTE(review): as above, the second JB can never be taken.
10333 CMPL BX, $0x3c
10334 JB one_byte_match_emit_repeat_encodeBetterBlockAsm8B
10335 CMPL BX, $0x00000100
10336 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
10337 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm8B
10338
10339three_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
10340 MOVB $0xf4, (AX)
10341 MOVW BX, 1(AX)
10342 ADDQ $0x03, AX
10343 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
10344
10345two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
10346 MOVB $0xf0, (AX)
10347 MOVB BL, 1(AX)
10348 ADDQ $0x02, AX
10349 CMPL BX, $0x40
10350 JB memmove_match_emit_repeat_encodeBetterBlockAsm8B
10351 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
10352
10353one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
10354 SHLB $0x02, BL
10355 MOVB BL, (AX)
10356 ADDQ $0x01, AX
10357
 // Short copy of DI bytes from R8 to AX; BX = dst pointer after the run.
10358memmove_match_emit_repeat_encodeBetterBlockAsm8B:
10359 LEAQ (AX)(DI*1), BX
10360
10361 // genMemMoveShort
10362 CMPQ DI, $0x04
10363 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
10364 CMPQ DI, $0x08
10365 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
10366 CMPQ DI, $0x10
10367 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
10368 CMPQ DI, $0x20
10369 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
10370 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64
10371
 // Fixed 4-byte move for lengths up to 4 (may touch up to 3 bytes past the run).
10372emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
10373 MOVL (R8), R9
10374 MOVL R9, (AX)
10375 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
10376
10377emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
10378 MOVL (R8), R9
10379 MOVL -4(R8)(DI*1), R8
10380 MOVL R9, (AX)
10381 MOVL R8, -4(AX)(DI*1)
10382 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
10383
10384emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
10385 MOVQ (R8), R9
10386 MOVQ -8(R8)(DI*1), R8
10387 MOVQ R9, (AX)
10388 MOVQ R8, -8(AX)(DI*1)
10389 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
10390
10391emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
10392 MOVOU (R8), X0
10393 MOVOU -16(R8)(DI*1), X1
10394 MOVOU X0, (AX)
10395 MOVOU X1, -16(AX)(DI*1)
10396 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
10397
10398emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
10399 MOVOU (R8), X0
10400 MOVOU 16(R8), X1
10401 MOVOU -32(R8)(DI*1), X2
10402 MOVOU -16(R8)(DI*1), X3
10403 MOVOU X0, (AX)
10404 MOVOU X1, 16(AX)
10405 MOVOU X2, -32(AX)(DI*1)
10406 MOVOU X3, -16(AX)(DI*1)
10407
10408memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
10409 MOVQ BX, AX
10410 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
10411
 // Long copy of DI bytes from R8 to AX, same scheme as the long copy on the
 // non-repeat path: head and tail saved in X0-X3, aligned 32-byte stores for
 // the middle, R12 = running offset, R10 = chunk counter.
10412memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
10413 LEAQ (AX)(DI*1), BX
10414
10415 // genMemMoveLong
10416 MOVOU (R8), X0
10417 MOVOU 16(R8), X1
10418 MOVOU -32(R8)(DI*1), X2
10419 MOVOU -16(R8)(DI*1), X3
10420 MOVQ DI, R10
10421 SHRQ $0x05, R10
10422 MOVQ AX, R9
10423 ANDL $0x0000001f, R9
10424 MOVQ $0x00000040, R12
10425 SUBQ R9, R12
10426 DECQ R10
10427 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
10428 LEAQ -32(R8)(R12*1), R9
10429 LEAQ -32(AX)(R12*1), R13
10430
10431emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
10432 MOVOU (R9), X4
10433 MOVOU 16(R9), X5
10434 MOVOA X4, (R13)
10435 MOVOA X5, 16(R13)
10436 ADDQ $0x20, R13
10437 ADDQ $0x20, R9
10438 ADDQ $0x20, R12
10439 DECQ R10
10440 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back
10441
10442emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
10443 MOVOU -32(R8)(R12*1), X4
10444 MOVOU -16(R8)(R12*1), X5
10445 MOVOA X4, -32(AX)(R12*1)
10446 MOVOA X5, -16(AX)(R12*1)
10447 ADDQ $0x20, R12
10448 CMPQ DI, R12
10449 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
10450 MOVOU X0, (AX)
10451 MOVOU X1, 16(AX)
10452 MOVOU X2, -32(AX)(DI*1)
10453 MOVOU X3, -16(AX)(DI*1)
10454 MOVQ BX, AX
10455
 // Advance CX past the match, make R11 the full match length and record CX
 // in 12(SP); then write the match using the repeat forms only.
10456emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
10457 ADDL R11, CX
10458 ADDL $0x04, R11
10459 MOVL CX, 12(SP)
10460
10461 // emitRepeat
 // BX = length, R11 = length - 4. Lengths up to 8 use the 2-byte form; the
 // JAE below targets the label it would fall through to.
10462 MOVL R11, BX
10463 LEAL -4(R11), R11
10464 CMPL BX, $0x08
10465 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
10466 CMPL BX, $0x0c
10467 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B
10468
10469cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
10470 CMPL R11, $0x00000104
10471 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
10472 LEAL -256(R11), R11
10473 MOVW $0x0019, (AX)
10474 MOVW R11, 2(AX)
10475 ADDQ $0x04, AX
10476 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10477
10478repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
10479 LEAL -4(R11), R11
10480 MOVW $0x0015, (AX)
10481 MOVB R11, 2(AX)
10482 ADDQ $0x03, AX
10483 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10484
10485repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
10486 SHLL $0x02, R11
10487 ORL $0x01, R11
10488 MOVW R11, (AX)
10489 ADDQ $0x02, AX
10490 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
 // NOTE(review): unreachable — follows an unconditional JMP with no label.
10491 XORQ BX, BX
10492 LEAL 1(BX)(R11*4), R11
10493 MOVB DI, 1(AX)
10494 SARL $0x08, DI
10495 SHLL $0x05, DI
10496 ORL DI, R11
10497 MOVB R11, (AX)
10498 ADDQ $0x02, AX
10499
 // Match written. Finish with the remainder when CX has reached the search
 // limit; return 0 when AX has reached the dst limit.
10500match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
10501 CMPL CX, 8(SP)
10502 JAE emit_remainder_encodeBetterBlockAsm8B
10503 CMPQ AX, (SP)
10504 JB match_nolit_dst_ok_encodeBetterBlockAsm8B
10505 MOVQ $0x00000000, ret+48(FP)
10506 RET
10507
 // Add positions inside the match to the tables before searching again.
 // With SI = match start + 1 and R8 = CX - 2: positions SI and R8 go into the
 // 6-byte table, positions SI + 1 and R8 + 1 into the 4-byte table.
10508match_nolit_dst_ok_encodeBetterBlockAsm8B:
10509 MOVQ $0x0000cf1bbcdcbf9b, BX
10510 MOVQ $0x9e3779b1, DI
10511 LEAQ 1(SI), SI
10512 LEAQ -2(CX), R8
10513 MOVQ (DX)(SI*1), R9
10514 MOVQ 1(DX)(SI*1), R10
10515 MOVQ (DX)(R8*1), R11
10516 MOVQ 1(DX)(R8*1), R12
10517 SHLQ $0x10, R9
10518 IMULQ BX, R9
10519 SHRQ $0x36, R9
10520 SHLQ $0x20, R10
10521 IMULQ DI, R10
10522 SHRQ $0x38, R10
10523 SHLQ $0x10, R11
10524 IMULQ BX, R11
10525 SHRQ $0x36, R11
10526 SHLQ $0x20, R12
10527 IMULQ DI, R12
10528 SHRQ $0x38, R12
10529 LEAQ 1(SI), DI
10530 LEAQ 1(R8), R13
10531 MOVL SI, 24(SP)(R9*4)
10532 MOVL R8, 24(SP)(R11*4)
10533 MOVL DI, 4120(SP)(R10*4)
10534 MOVL R13, 4120(SP)(R12*4)
 // DI = midpoint (R8 + SI + 1) / 2; then step SI forward and R8 back by one.
10535 LEAQ 1(R8)(SI*1), DI
10536 SHRQ $0x01, DI
10537 ADDQ $0x01, SI
10538 SUBQ $0x01, R8
10539
 // While DI < R8: store positions SI and DI in the 6-byte table and advance
 // both by 2. Then resume the search at CX.
10540index_loop_encodeBetterBlockAsm8B:
10541 CMPQ DI, R8
10542 JAE search_loop_encodeBetterBlockAsm8B
10543 MOVQ (DX)(SI*1), R9
10544 MOVQ (DX)(DI*1), R10
10545 SHLQ $0x10, R9
10546 IMULQ BX, R9
10547 SHRQ $0x36, R9
10548 SHLQ $0x10, R10
10549 IMULQ BX, R10
10550 SHRQ $0x36, R10
10551 MOVL SI, 24(SP)(R9*4)
10552 MOVL DI, 24(SP)(R10*4)
10553 ADDQ $0x02, SI
10554 ADDQ $0x02, DI
10555 JMP index_loop_encodeBetterBlockAsm8B
10556
 // Emit everything from 12(SP) to the end of src as one literal run.
 // Return 0 unless AX + (len(src) - 12(SP)) + 3 is below the dst limit.
10557emit_remainder_encodeBetterBlockAsm8B:
10558 MOVQ src_len+32(FP), CX
10559 SUBL 12(SP), CX
10560 LEAQ 3(AX)(CX*1), CX
10561 CMPQ CX, (SP)
10562 JB emit_remainder_ok_encodeBetterBlockAsm8B
10563 MOVQ $0x00000000, ret+48(FP)
10564 RET
10565
 // Nothing to do when 12(SP) already equals len(src). Otherwise registers are
 // reassigned: CX = run source pointer, SI = run length, DX = length - 1
 // (DX no longer holds the src base from here on).
10566emit_remainder_ok_encodeBetterBlockAsm8B:
10567 MOVQ src_len+32(FP), CX
10568 MOVL 12(SP), BX
10569 CMPL BX, CX
10570 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
10571 MOVL CX, SI
10572 MOVL CX, 12(SP)
10573 LEAQ (DX)(BX*1), CX
10574 SUBL BX, SI
10575 LEAL -1(SI), DX
 // Same header selection as for the other literal runs.
 // NOTE(review): the second JB can never be taken.
10576 CMPL DX, $0x3c
10577 JB one_byte_emit_remainder_encodeBetterBlockAsm8B
10578 CMPL DX, $0x00000100
10579 JB two_bytes_emit_remainder_encodeBetterBlockAsm8B
10580 JB three_bytes_emit_remainder_encodeBetterBlockAsm8B
10581
10582three_bytes_emit_remainder_encodeBetterBlockAsm8B:
10583 MOVB $0xf4, (AX)
10584 MOVW DX, 1(AX)
10585 ADDQ $0x03, AX
10586 JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
10587
10588two_bytes_emit_remainder_encodeBetterBlockAsm8B:
10589 MOVB $0xf0, (AX)
10590 MOVB DL, 1(AX)
10591 ADDQ $0x02, AX
10592 CMPL DX, $0x40
10593 JB memmove_emit_remainder_encodeBetterBlockAsm8B
10594 JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
10595
10596one_byte_emit_remainder_encodeBetterBlockAsm8B:
10597 SHLB $0x02, DL
10598 MOVB DL, (AX)
10599 ADDQ $0x01, AX
10600
 // Short copy of BX bytes from CX to AX; DX = dst pointer after the run.
10601memmove_emit_remainder_encodeBetterBlockAsm8B:
10602 LEAQ (AX)(SI*1), DX
10603 MOVL SI, BX
10604
10605 // genMemMoveShort
 // Unlike the in-loop copies, lengths 1-2 and 3 get their own cases here, so
 // no case reads or writes past the run.
10606 CMPQ BX, $0x03
10607 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2
10608 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3
10609 CMPQ BX, $0x08
10610 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
10611 CMPQ BX, $0x10
10612 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
10613 CMPQ BX, $0x20
10614 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
10615 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64
10616
10617emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2:
10618 MOVB (CX), SI
10619 MOVB -1(CX)(BX*1), CL
10620 MOVB SI, (AX)
10621 MOVB CL, -1(AX)(BX*1)
10622 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10623
10624emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3:
10625 MOVW (CX), SI
10626 MOVB 2(CX), CL
10627 MOVW SI, (AX)
10628 MOVB CL, 2(AX)
10629 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10630
10631emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
10632 MOVL (CX), SI
10633 MOVL -4(CX)(BX*1), CX
10634 MOVL SI, (AX)
10635 MOVL CX, -4(AX)(BX*1)
10636 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10637
10638emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
10639 MOVQ (CX), SI
10640 MOVQ -8(CX)(BX*1), CX
10641 MOVQ SI, (AX)
10642 MOVQ CX, -8(AX)(BX*1)
10643 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10644
10645emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
10646 MOVOU (CX), X0
10647 MOVOU -16(CX)(BX*1), X1
10648 MOVOU X0, (AX)
10649 MOVOU X1, -16(AX)(BX*1)
10650 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10651
10652emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
10653 MOVOU (CX), X0
10654 MOVOU 16(CX), X1
10655 MOVOU -32(CX)(BX*1), X2
10656 MOVOU -16(CX)(BX*1), X3
10657 MOVOU X0, (AX)
10658 MOVOU X1, 16(AX)
10659 MOVOU X2, -32(AX)(BX*1)
10660 MOVOU X3, -16(AX)(BX*1)
10661
10662memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
10663 MOVQ DX, AX
10664 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
10665
 // Long copy of BX bytes from CX to AX, same scheme as the other long copies:
 // head and tail saved in X0-X3, aligned 32-byte stores for the middle,
 // R8 = running offset, DI = chunk counter. DX = dst pointer after the run.
10666memmove_long_emit_remainder_encodeBetterBlockAsm8B:
10667 LEAQ (AX)(SI*1), DX
10668 MOVL SI, BX
10669
10670 // genMemMoveLong
10671 MOVOU (CX), X0
10672 MOVOU 16(CX), X1
10673 MOVOU -32(CX)(BX*1), X2
10674 MOVOU -16(CX)(BX*1), X3
10675 MOVQ BX, DI
10676 SHRQ $0x05, DI
10677 MOVQ AX, SI
10678 ANDL $0x0000001f, SI
10679 MOVQ $0x00000040, R8
10680 SUBQ SI, R8
10681 DECQ DI
10682 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
10683 LEAQ -32(CX)(R8*1), SI
10684 LEAQ -32(AX)(R8*1), R9
10685
10686emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
10687 MOVOU (SI), X4
10688 MOVOU 16(SI), X5
10689 MOVOA X4, (R9)
10690 MOVOA X5, 16(R9)
10691 ADDQ $0x20, R9
10692 ADDQ $0x20, SI
10693 ADDQ $0x20, R8
10694 DECQ DI
10695 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back
10696
10697emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
10698 MOVOU -32(CX)(R8*1), X4
10699 MOVOU -16(CX)(R8*1), X5
10700 MOVOA X4, -32(AX)(R8*1)
10701 MOVOA X5, -16(AX)(R8*1)
10702 ADDQ $0x20, R8
10703 CMPQ BX, R8
10704 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
10705 MOVOU X0, (AX)
10706 MOVOU X1, 16(AX)
10707 MOVOU X2, -32(AX)(BX*1)
10708 MOVOU X3, -16(AX)(BX*1)
10709 MOVQ DX, AX
10710
 // Result = number of bytes written = AX - dst_base.
10711emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
10712 MOVQ dst_base+0(FP), CX
10713 SUBQ CX, AX
10714 MOVQ AX, ret+48(FP)
10715 RET
10716
10717// func encodeSnappyBlockAsm(dst []byte, src []byte) int
10718// Requires: BMI, SSE2
10719TEXT ·encodeSnappyBlockAsm(SB), $65560-56
10720 MOVQ dst_base+0(FP), AX
10721 MOVQ $0x00000200, CX
10722 LEAQ 24(SP), DX
10723 PXOR X0, X0
10724
10725zero_loop_encodeSnappyBlockAsm:
10726 MOVOU X0, (DX)
10727 MOVOU X0, 16(DX)
10728 MOVOU X0, 32(DX)
10729 MOVOU X0, 48(DX)
10730 MOVOU X0, 64(DX)
10731 MOVOU X0, 80(DX)
10732 MOVOU X0, 96(DX)
10733 MOVOU X0, 112(DX)
10734 ADDQ $0x80, DX
10735 DECQ CX
10736 JNZ zero_loop_encodeSnappyBlockAsm
10737 MOVL $0x00000000, 12(SP)
10738 MOVQ src_len+32(FP), CX
10739 LEAQ -9(CX), DX
10740 LEAQ -8(CX), BX
10741 MOVL BX, 8(SP)
10742 SHRQ $0x05, CX
10743 SUBL CX, DX
10744 LEAQ (AX)(DX*1), DX
10745 MOVQ DX, (SP)
10746 MOVL $0x00000001, CX
10747 MOVL CX, 16(SP)
10748 MOVQ src_base+24(FP), DX
10749
10750search_loop_encodeSnappyBlockAsm:
10751 MOVL CX, BX
10752 SUBL 12(SP), BX
10753 SHRL $0x06, BX
10754 LEAL 4(CX)(BX*1), BX
10755 CMPL BX, 8(SP)
10756 JAE emit_remainder_encodeSnappyBlockAsm
10757 MOVQ (DX)(CX*1), SI
10758 MOVL BX, 20(SP)
10759 MOVQ $0x0000cf1bbcdcbf9b, R8
10760 MOVQ SI, R9
10761 MOVQ SI, R10
10762 SHRQ $0x08, R10
10763 SHLQ $0x10, R9
10764 IMULQ R8, R9
10765 SHRQ $0x32, R9
10766 SHLQ $0x10, R10
10767 IMULQ R8, R10
10768 SHRQ $0x32, R10
10769 MOVL 24(SP)(R9*4), BX
10770 MOVL 24(SP)(R10*4), DI
10771 MOVL CX, 24(SP)(R9*4)
10772 LEAL 1(CX), R9
10773 MOVL R9, 24(SP)(R10*4)
10774 MOVQ SI, R9
10775 SHRQ $0x10, R9
10776 SHLQ $0x10, R9
10777 IMULQ R8, R9
10778 SHRQ $0x32, R9
10779 MOVL CX, R8
10780 SUBL 16(SP), R8
10781 MOVL 1(DX)(R8*1), R10
10782 MOVQ SI, R8
10783 SHRQ $0x08, R8
10784 CMPL R8, R10
10785 JNE no_repeat_found_encodeSnappyBlockAsm
10786 LEAL 1(CX), SI
10787 MOVL 12(SP), BX
10788 MOVL SI, DI
10789 SUBL 16(SP), DI
10790 JZ repeat_extend_back_end_encodeSnappyBlockAsm
10791
10792repeat_extend_back_loop_encodeSnappyBlockAsm:
10793 CMPL SI, BX
10794 JBE repeat_extend_back_end_encodeSnappyBlockAsm
10795 MOVB -1(DX)(DI*1), R8
10796 MOVB -1(DX)(SI*1), R9
10797 CMPB R8, R9
10798 JNE repeat_extend_back_end_encodeSnappyBlockAsm
10799 LEAL -1(SI), SI
10800 DECL DI
10801 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm
10802
10803repeat_extend_back_end_encodeSnappyBlockAsm:
10804 MOVL SI, BX
10805 SUBL 12(SP), BX
10806 LEAQ 5(AX)(BX*1), BX
10807 CMPQ BX, (SP)
10808 JB repeat_dst_size_check_encodeSnappyBlockAsm
10809 MOVQ $0x00000000, ret+48(FP)
10810 RET
10811
10812repeat_dst_size_check_encodeSnappyBlockAsm:
10813 MOVL 12(SP), BX
10814 CMPL BX, SI
10815 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm
10816 MOVL SI, DI
10817 MOVL SI, 12(SP)
10818 LEAQ (DX)(BX*1), R8
10819 SUBL BX, DI
10820 LEAL -1(DI), BX
10821 CMPL BX, $0x3c
10822 JB one_byte_repeat_emit_encodeSnappyBlockAsm
10823 CMPL BX, $0x00000100
10824 JB two_bytes_repeat_emit_encodeSnappyBlockAsm
10825 CMPL BX, $0x00010000
10826 JB three_bytes_repeat_emit_encodeSnappyBlockAsm
10827 CMPL BX, $0x01000000
10828 JB four_bytes_repeat_emit_encodeSnappyBlockAsm
10829 MOVB $0xfc, (AX)
10830 MOVL BX, 1(AX)
10831 ADDQ $0x05, AX
10832 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
10833
10834four_bytes_repeat_emit_encodeSnappyBlockAsm:
10835 MOVL BX, R9
10836 SHRL $0x10, R9
10837 MOVB $0xf8, (AX)
10838 MOVW BX, 1(AX)
10839 MOVB R9, 3(AX)
10840 ADDQ $0x04, AX
10841 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
10842
10843three_bytes_repeat_emit_encodeSnappyBlockAsm:
10844 MOVB $0xf4, (AX)
10845 MOVW BX, 1(AX)
10846 ADDQ $0x03, AX
10847 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
10848
10849two_bytes_repeat_emit_encodeSnappyBlockAsm:
10850 MOVB $0xf0, (AX)
10851 MOVB BL, 1(AX)
10852 ADDQ $0x02, AX
10853 CMPL BX, $0x40
10854 JB memmove_repeat_emit_encodeSnappyBlockAsm
10855 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
10856
10857one_byte_repeat_emit_encodeSnappyBlockAsm:
10858 SHLB $0x02, BL
10859 MOVB BL, (AX)
10860 ADDQ $0x01, AX
10861
10862memmove_repeat_emit_encodeSnappyBlockAsm:
10863 LEAQ (AX)(DI*1), BX
10864
10865 // genMemMoveShort
10866 CMPQ DI, $0x08
10867 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
10868 CMPQ DI, $0x10
10869 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
10870 CMPQ DI, $0x20
10871 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
10872 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64
10873
10874emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
10875 MOVQ (R8), R9
10876 MOVQ R9, (AX)
10877 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
10878
10879emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
10880 MOVQ (R8), R9
10881 MOVQ -8(R8)(DI*1), R8
10882 MOVQ R9, (AX)
10883 MOVQ R8, -8(AX)(DI*1)
10884 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
10885
10886emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
10887 MOVOU (R8), X0
10888 MOVOU -16(R8)(DI*1), X1
10889 MOVOU X0, (AX)
10890 MOVOU X1, -16(AX)(DI*1)
10891 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
10892
10893emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
10894 MOVOU (R8), X0
10895 MOVOU 16(R8), X1
10896 MOVOU -32(R8)(DI*1), X2
10897 MOVOU -16(R8)(DI*1), X3
10898 MOVOU X0, (AX)
10899 MOVOU X1, 16(AX)
10900 MOVOU X2, -32(AX)(DI*1)
10901 MOVOU X3, -16(AX)(DI*1)
10902
10903memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
10904 MOVQ BX, AX
10905 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm
10906
10907memmove_long_repeat_emit_encodeSnappyBlockAsm:
10908 LEAQ (AX)(DI*1), BX
10909
10910 // genMemMoveLong
10911 MOVOU (R8), X0
10912 MOVOU 16(R8), X1
10913 MOVOU -32(R8)(DI*1), X2
10914 MOVOU -16(R8)(DI*1), X3
10915 MOVQ DI, R10
10916 SHRQ $0x05, R10
10917 MOVQ AX, R9
10918 ANDL $0x0000001f, R9
10919 MOVQ $0x00000040, R11
10920 SUBQ R9, R11
10921 DECQ R10
10922 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
10923 LEAQ -32(R8)(R11*1), R9
10924 LEAQ -32(AX)(R11*1), R12
10925
10926emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
10927 MOVOU (R9), X4
10928 MOVOU 16(R9), X5
10929 MOVOA X4, (R12)
10930 MOVOA X5, 16(R12)
10931 ADDQ $0x20, R12
10932 ADDQ $0x20, R9
10933 ADDQ $0x20, R11
10934 DECQ R10
10935 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back
10936
10937emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
10938 MOVOU -32(R8)(R11*1), X4
10939 MOVOU -16(R8)(R11*1), X5
10940 MOVOA X4, -32(AX)(R11*1)
10941 MOVOA X5, -16(AX)(R11*1)
10942 ADDQ $0x20, R11
10943 CMPQ DI, R11
10944 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
10945 MOVOU X0, (AX)
10946 MOVOU X1, 16(AX)
10947 MOVOU X2, -32(AX)(DI*1)
10948 MOVOU X3, -16(AX)(DI*1)
10949 MOVQ BX, AX
10950
10951emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
10952 ADDL $0x05, CX
10953 MOVL CX, BX
10954 SUBL 16(SP), BX
10955 MOVQ src_len+32(FP), DI
10956 SUBL CX, DI
10957 LEAQ (DX)(CX*1), R8
10958 LEAQ (DX)(BX*1), BX
10959
10960 // matchLen
10961 XORL R10, R10
10962
10963matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm:
10964 CMPL DI, $0x10
10965 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm
10966 MOVQ (R8)(R10*1), R9
10967 MOVQ 8(R8)(R10*1), R11
10968 XORQ (BX)(R10*1), R9
10969 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm
10970 XORQ 8(BX)(R10*1), R11
10971 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm
10972 LEAL -16(DI), DI
10973 LEAL 16(R10), R10
10974 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm
10975
10976matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm:
10977#ifdef GOAMD64_v3
10978 TZCNTQ R11, R11
10979
10980#else
10981 BSFQ R11, R11
10982
10983#endif
10984 SARQ $0x03, R11
10985 LEAL 8(R10)(R11*1), R10
10986 JMP repeat_extend_forward_end_encodeSnappyBlockAsm
10987
10988matchlen_match8_repeat_extend_encodeSnappyBlockAsm:
10989 CMPL DI, $0x08
10990 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm
10991 MOVQ (R8)(R10*1), R9
10992 XORQ (BX)(R10*1), R9
10993 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm
10994 LEAL -8(DI), DI
10995 LEAL 8(R10), R10
10996 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm
10997
10998matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm:
10999#ifdef GOAMD64_v3
11000 TZCNTQ R9, R9
11001
11002#else
11003 BSFQ R9, R9
11004
11005#endif
11006 SARQ $0x03, R9
11007 LEAL (R10)(R9*1), R10
11008 JMP repeat_extend_forward_end_encodeSnappyBlockAsm
11009
11010matchlen_match4_repeat_extend_encodeSnappyBlockAsm:
11011 CMPL DI, $0x04
11012 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm
11013 MOVL (R8)(R10*1), R9
11014 CMPL (BX)(R10*1), R9
11015 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm
11016 LEAL -4(DI), DI
11017 LEAL 4(R10), R10
11018
11019matchlen_match2_repeat_extend_encodeSnappyBlockAsm:
11020 CMPL DI, $0x01
11021 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm
11022 JB repeat_extend_forward_end_encodeSnappyBlockAsm
11023 MOVW (R8)(R10*1), R9
11024 CMPW (BX)(R10*1), R9
11025 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm
11026 LEAL 2(R10), R10
11027 SUBL $0x02, DI
11028 JZ repeat_extend_forward_end_encodeSnappyBlockAsm
11029
11030matchlen_match1_repeat_extend_encodeSnappyBlockAsm:
11031 MOVB (R8)(R10*1), R9
11032 CMPB (BX)(R10*1), R9
11033 JNE repeat_extend_forward_end_encodeSnappyBlockAsm
11034 LEAL 1(R10), R10
11035
11036repeat_extend_forward_end_encodeSnappyBlockAsm:
11037 ADDL R10, CX
11038 MOVL CX, BX
11039 SUBL SI, BX
11040 MOVL 16(SP), SI
11041
11042 // emitCopy
11043 CMPL SI, $0x00010000
11044 JB two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
11045
11046four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
11047 CMPL BX, $0x40
11048 JBE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
11049 MOVB $0xff, (AX)
11050 MOVL SI, 1(AX)
11051 LEAL -64(BX), BX
11052 ADDQ $0x05, AX
11053 CMPL BX, $0x04
11054 JB four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
11055 JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm
11056
11057four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
11058 TESTL BX, BX
11059 JZ repeat_end_emit_encodeSnappyBlockAsm
11060 XORL DI, DI
11061 LEAL -1(DI)(BX*4), BX
11062 MOVB BL, (AX)
11063 MOVL SI, 1(AX)
11064 ADDQ $0x05, AX
11065 JMP repeat_end_emit_encodeSnappyBlockAsm
11066
11067two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
11068 CMPL BX, $0x40
11069 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
11070 MOVB $0xee, (AX)
11071 MOVW SI, 1(AX)
11072 LEAL -60(BX), BX
11073 ADDQ $0x03, AX
11074 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
11075
11076two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
11077 MOVL BX, DI
11078 SHLL $0x02, DI
11079 CMPL BX, $0x0c
11080 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
11081 CMPL SI, $0x00000800
11082 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
11083 LEAL -15(DI), DI
11084 MOVB SI, 1(AX)
11085 SHRL $0x08, SI
11086 SHLL $0x05, SI
11087 ORL SI, DI
11088 MOVB DI, (AX)
11089 ADDQ $0x02, AX
11090 JMP repeat_end_emit_encodeSnappyBlockAsm
11091
11092emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
11093 LEAL -2(DI), DI
11094 MOVB DI, (AX)
11095 MOVW SI, 1(AX)
11096 ADDQ $0x03, AX
11097
11098repeat_end_emit_encodeSnappyBlockAsm:
11099 MOVL CX, 12(SP)
11100 JMP search_loop_encodeSnappyBlockAsm
11101
11102no_repeat_found_encodeSnappyBlockAsm:
11103 CMPL (DX)(BX*1), SI
11104 JEQ candidate_match_encodeSnappyBlockAsm
11105 SHRQ $0x08, SI
11106 MOVL 24(SP)(R9*4), BX
11107 LEAL 2(CX), R8
11108 CMPL (DX)(DI*1), SI
11109 JEQ candidate2_match_encodeSnappyBlockAsm
11110 MOVL R8, 24(SP)(R9*4)
11111 SHRQ $0x08, SI
11112 CMPL (DX)(BX*1), SI
11113 JEQ candidate3_match_encodeSnappyBlockAsm
11114 MOVL 20(SP), CX
11115 JMP search_loop_encodeSnappyBlockAsm
11116
11117candidate3_match_encodeSnappyBlockAsm:
11118 ADDL $0x02, CX
11119 JMP candidate_match_encodeSnappyBlockAsm
11120
11121candidate2_match_encodeSnappyBlockAsm:
11122 MOVL R8, 24(SP)(R9*4)
11123 INCL CX
11124 MOVL DI, BX
11125
11126candidate_match_encodeSnappyBlockAsm:
11127 MOVL 12(SP), SI
11128 TESTL BX, BX
11129 JZ match_extend_back_end_encodeSnappyBlockAsm
11130
11131match_extend_back_loop_encodeSnappyBlockAsm:
11132 CMPL CX, SI
11133 JBE match_extend_back_end_encodeSnappyBlockAsm
11134 MOVB -1(DX)(BX*1), DI
11135 MOVB -1(DX)(CX*1), R8
11136 CMPB DI, R8
11137 JNE match_extend_back_end_encodeSnappyBlockAsm
11138 LEAL -1(CX), CX
11139 DECL BX
11140 JZ match_extend_back_end_encodeSnappyBlockAsm
11141 JMP match_extend_back_loop_encodeSnappyBlockAsm
11142
11143match_extend_back_end_encodeSnappyBlockAsm:
11144 MOVL CX, SI
11145 SUBL 12(SP), SI
11146 LEAQ 5(AX)(SI*1), SI
11147 CMPQ SI, (SP)
11148 JB match_dst_size_check_encodeSnappyBlockAsm
11149 MOVQ $0x00000000, ret+48(FP)
11150 RET
11151
11152match_dst_size_check_encodeSnappyBlockAsm:
11153 MOVL CX, SI
11154 MOVL 12(SP), DI
11155 CMPL DI, SI
11156 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm
11157 MOVL SI, R8
11158 MOVL SI, 12(SP)
11159 LEAQ (DX)(DI*1), SI
11160 SUBL DI, R8
11161 LEAL -1(R8), DI
11162 CMPL DI, $0x3c
11163 JB one_byte_match_emit_encodeSnappyBlockAsm
11164 CMPL DI, $0x00000100
11165 JB two_bytes_match_emit_encodeSnappyBlockAsm
11166 CMPL DI, $0x00010000
11167 JB three_bytes_match_emit_encodeSnappyBlockAsm
11168 CMPL DI, $0x01000000
11169 JB four_bytes_match_emit_encodeSnappyBlockAsm
11170 MOVB $0xfc, (AX)
11171 MOVL DI, 1(AX)
11172 ADDQ $0x05, AX
11173 JMP memmove_long_match_emit_encodeSnappyBlockAsm
11174
11175four_bytes_match_emit_encodeSnappyBlockAsm:
11176 MOVL DI, R9
11177 SHRL $0x10, R9
11178 MOVB $0xf8, (AX)
11179 MOVW DI, 1(AX)
11180 MOVB R9, 3(AX)
11181 ADDQ $0x04, AX
11182 JMP memmove_long_match_emit_encodeSnappyBlockAsm
11183
11184three_bytes_match_emit_encodeSnappyBlockAsm:
11185 MOVB $0xf4, (AX)
11186 MOVW DI, 1(AX)
11187 ADDQ $0x03, AX
11188 JMP memmove_long_match_emit_encodeSnappyBlockAsm
11189
11190two_bytes_match_emit_encodeSnappyBlockAsm:
11191 MOVB $0xf0, (AX)
11192 MOVB DI, 1(AX)
11193 ADDQ $0x02, AX
11194 CMPL DI, $0x40
11195 JB memmove_match_emit_encodeSnappyBlockAsm
11196 JMP memmove_long_match_emit_encodeSnappyBlockAsm
11197
11198one_byte_match_emit_encodeSnappyBlockAsm:
11199 SHLB $0x02, DI
11200 MOVB DI, (AX)
11201 ADDQ $0x01, AX
11202
11203memmove_match_emit_encodeSnappyBlockAsm:
11204 LEAQ (AX)(R8*1), DI
11205
11206 // genMemMoveShort
11207 CMPQ R8, $0x08
11208 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
11209 CMPQ R8, $0x10
11210 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
11211 CMPQ R8, $0x20
11212 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
11213 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64
11214
11215emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
11216 MOVQ (SI), R9
11217 MOVQ R9, (AX)
11218 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
11219
11220emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
11221 MOVQ (SI), R9
11222 MOVQ -8(SI)(R8*1), SI
11223 MOVQ R9, (AX)
11224 MOVQ SI, -8(AX)(R8*1)
11225 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
11226
11227emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
11228 MOVOU (SI), X0
11229 MOVOU -16(SI)(R8*1), X1
11230 MOVOU X0, (AX)
11231 MOVOU X1, -16(AX)(R8*1)
11232 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
11233
11234emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
11235 MOVOU (SI), X0
11236 MOVOU 16(SI), X1
11237 MOVOU -32(SI)(R8*1), X2
11238 MOVOU -16(SI)(R8*1), X3
11239 MOVOU X0, (AX)
11240 MOVOU X1, 16(AX)
11241 MOVOU X2, -32(AX)(R8*1)
11242 MOVOU X3, -16(AX)(R8*1)
11243
11244memmove_end_copy_match_emit_encodeSnappyBlockAsm:
11245 MOVQ DI, AX
11246 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm
11247
11248memmove_long_match_emit_encodeSnappyBlockAsm:
11249 LEAQ (AX)(R8*1), DI
11250
11251 // genMemMoveLong
11252 MOVOU (SI), X0
11253 MOVOU 16(SI), X1
11254 MOVOU -32(SI)(R8*1), X2
11255 MOVOU -16(SI)(R8*1), X3
11256 MOVQ R8, R10
11257 SHRQ $0x05, R10
11258 MOVQ AX, R9
11259 ANDL $0x0000001f, R9
11260 MOVQ $0x00000040, R11
11261 SUBQ R9, R11
11262 DECQ R10
11263 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
11264 LEAQ -32(SI)(R11*1), R9
11265 LEAQ -32(AX)(R11*1), R12
11266
11267emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
11268 MOVOU (R9), X4
11269 MOVOU 16(R9), X5
11270 MOVOA X4, (R12)
11271 MOVOA X5, 16(R12)
11272 ADDQ $0x20, R12
11273 ADDQ $0x20, R9
11274 ADDQ $0x20, R11
11275 DECQ R10
11276 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back
11277
11278emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
11279 MOVOU -32(SI)(R11*1), X4
11280 MOVOU -16(SI)(R11*1), X5
11281 MOVOA X4, -32(AX)(R11*1)
11282 MOVOA X5, -16(AX)(R11*1)
11283 ADDQ $0x20, R11
11284 CMPQ R8, R11
11285 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
11286 MOVOU X0, (AX)
11287 MOVOU X1, 16(AX)
11288 MOVOU X2, -32(AX)(R8*1)
11289 MOVOU X3, -16(AX)(R8*1)
11290 MOVQ DI, AX
11291
11292emit_literal_done_match_emit_encodeSnappyBlockAsm:
11293match_nolit_loop_encodeSnappyBlockAsm:
11294 MOVL CX, SI
11295 SUBL BX, SI
11296 MOVL SI, 16(SP)
11297 ADDL $0x04, CX
11298 ADDL $0x04, BX
11299 MOVQ src_len+32(FP), SI
11300 SUBL CX, SI
11301 LEAQ (DX)(CX*1), DI
11302 LEAQ (DX)(BX*1), BX
11303
11304 // matchLen
11305 XORL R9, R9
11306
11307matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm:
11308 CMPL SI, $0x10
11309 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm
11310 MOVQ (DI)(R9*1), R8
11311 MOVQ 8(DI)(R9*1), R10
11312 XORQ (BX)(R9*1), R8
11313 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm
11314 XORQ 8(BX)(R9*1), R10
11315 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm
11316 LEAL -16(SI), SI
11317 LEAL 16(R9), R9
11318 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm
11319
11320matchlen_bsf_16match_nolit_encodeSnappyBlockAsm:
11321#ifdef GOAMD64_v3
11322 TZCNTQ R10, R10
11323
11324#else
11325 BSFQ R10, R10
11326
11327#endif
11328 SARQ $0x03, R10
11329 LEAL 8(R9)(R10*1), R9
11330 JMP match_nolit_end_encodeSnappyBlockAsm
11331
11332matchlen_match8_match_nolit_encodeSnappyBlockAsm:
11333 CMPL SI, $0x08
11334 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm
11335 MOVQ (DI)(R9*1), R8
11336 XORQ (BX)(R9*1), R8
11337 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm
11338 LEAL -8(SI), SI
11339 LEAL 8(R9), R9
11340 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm
11341
11342matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm:
11343#ifdef GOAMD64_v3
11344 TZCNTQ R8, R8
11345
11346#else
11347 BSFQ R8, R8
11348
11349#endif
11350 SARQ $0x03, R8
11351 LEAL (R9)(R8*1), R9
11352 JMP match_nolit_end_encodeSnappyBlockAsm
11353
11354matchlen_match4_match_nolit_encodeSnappyBlockAsm:
11355 CMPL SI, $0x04
11356 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm
11357 MOVL (DI)(R9*1), R8
11358 CMPL (BX)(R9*1), R8
11359 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm
11360 LEAL -4(SI), SI
11361 LEAL 4(R9), R9
11362
11363matchlen_match2_match_nolit_encodeSnappyBlockAsm:
11364 CMPL SI, $0x01
11365 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm
11366 JB match_nolit_end_encodeSnappyBlockAsm
11367 MOVW (DI)(R9*1), R8
11368 CMPW (BX)(R9*1), R8
11369 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm
11370 LEAL 2(R9), R9
11371 SUBL $0x02, SI
11372 JZ match_nolit_end_encodeSnappyBlockAsm
11373
11374matchlen_match1_match_nolit_encodeSnappyBlockAsm:
11375 MOVB (DI)(R9*1), R8
11376 CMPB (BX)(R9*1), R8
11377 JNE match_nolit_end_encodeSnappyBlockAsm
11378 LEAL 1(R9), R9
11379
11380match_nolit_end_encodeSnappyBlockAsm:
11381 ADDL R9, CX
11382 MOVL 16(SP), BX
11383 ADDL $0x04, R9
11384 MOVL CX, 12(SP)
11385
11386 // emitCopy
11387 CMPL BX, $0x00010000
11388 JB two_byte_offset_match_nolit_encodeSnappyBlockAsm
11389
11390four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
11391 CMPL R9, $0x40
11392 JBE four_bytes_remain_match_nolit_encodeSnappyBlockAsm
11393 MOVB $0xff, (AX)
11394 MOVL BX, 1(AX)
11395 LEAL -64(R9), R9
11396 ADDQ $0x05, AX
11397 CMPL R9, $0x04
11398 JB four_bytes_remain_match_nolit_encodeSnappyBlockAsm
11399 JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm
11400
11401four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
11402 TESTL R9, R9
11403 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm
11404 XORL SI, SI
11405 LEAL -1(SI)(R9*4), R9
11406 MOVB R9, (AX)
11407 MOVL BX, 1(AX)
11408 ADDQ $0x05, AX
11409 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
11410
11411two_byte_offset_match_nolit_encodeSnappyBlockAsm:
11412 CMPL R9, $0x40
11413 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
11414 MOVB $0xee, (AX)
11415 MOVW BX, 1(AX)
11416 LEAL -60(R9), R9
11417 ADDQ $0x03, AX
11418 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm
11419
11420two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
11421 MOVL R9, SI
11422 SHLL $0x02, SI
11423 CMPL R9, $0x0c
11424 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm
11425 CMPL BX, $0x00000800
11426 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm
11427 LEAL -15(SI), SI
11428 MOVB BL, 1(AX)
11429 SHRL $0x08, BX
11430 SHLL $0x05, BX
11431 ORL BX, SI
11432 MOVB SI, (AX)
11433 ADDQ $0x02, AX
11434 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
11435
11436emit_copy_three_match_nolit_encodeSnappyBlockAsm:
11437 LEAL -2(SI), SI
11438 MOVB SI, (AX)
11439 MOVW BX, 1(AX)
11440 ADDQ $0x03, AX
11441
11442match_nolit_emitcopy_end_encodeSnappyBlockAsm:
11443 CMPL CX, 8(SP)
11444 JAE emit_remainder_encodeSnappyBlockAsm
11445 MOVQ -2(DX)(CX*1), SI
11446 CMPQ AX, (SP)
11447 JB match_nolit_dst_ok_encodeSnappyBlockAsm
11448 MOVQ $0x00000000, ret+48(FP)
11449 RET
11450
11451match_nolit_dst_ok_encodeSnappyBlockAsm:
11452 MOVQ $0x0000cf1bbcdcbf9b, R8
11453 MOVQ SI, DI
11454 SHRQ $0x10, SI
11455 MOVQ SI, BX
11456 SHLQ $0x10, DI
11457 IMULQ R8, DI
11458 SHRQ $0x32, DI
11459 SHLQ $0x10, BX
11460 IMULQ R8, BX
11461 SHRQ $0x32, BX
11462 LEAL -2(CX), R8
11463 LEAQ 24(SP)(BX*4), R9
11464 MOVL (R9), BX
11465 MOVL R8, 24(SP)(DI*4)
11466 MOVL CX, (R9)
11467 CMPL (DX)(BX*1), SI
11468 JEQ match_nolit_loop_encodeSnappyBlockAsm
11469 INCL CX
11470 JMP search_loop_encodeSnappyBlockAsm
11471
11472emit_remainder_encodeSnappyBlockAsm:
11473 MOVQ src_len+32(FP), CX
11474 SUBL 12(SP), CX
11475 LEAQ 5(AX)(CX*1), CX
11476 CMPQ CX, (SP)
11477 JB emit_remainder_ok_encodeSnappyBlockAsm
11478 MOVQ $0x00000000, ret+48(FP)
11479 RET
11480
11481emit_remainder_ok_encodeSnappyBlockAsm:
11482 MOVQ src_len+32(FP), CX
11483 MOVL 12(SP), BX
11484 CMPL BX, CX
11485 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm
11486 MOVL CX, SI
11487 MOVL CX, 12(SP)
11488 LEAQ (DX)(BX*1), CX
11489 SUBL BX, SI
11490 LEAL -1(SI), DX
11491 CMPL DX, $0x3c
11492 JB one_byte_emit_remainder_encodeSnappyBlockAsm
11493 CMPL DX, $0x00000100
11494 JB two_bytes_emit_remainder_encodeSnappyBlockAsm
11495 CMPL DX, $0x00010000
11496 JB three_bytes_emit_remainder_encodeSnappyBlockAsm
11497 CMPL DX, $0x01000000
11498 JB four_bytes_emit_remainder_encodeSnappyBlockAsm
11499 MOVB $0xfc, (AX)
11500 MOVL DX, 1(AX)
11501 ADDQ $0x05, AX
11502 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
11503
11504four_bytes_emit_remainder_encodeSnappyBlockAsm:
11505 MOVL DX, BX
11506 SHRL $0x10, BX
11507 MOVB $0xf8, (AX)
11508 MOVW DX, 1(AX)
11509 MOVB BL, 3(AX)
11510 ADDQ $0x04, AX
11511 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
11512
11513three_bytes_emit_remainder_encodeSnappyBlockAsm:
11514 MOVB $0xf4, (AX)
11515 MOVW DX, 1(AX)
11516 ADDQ $0x03, AX
11517 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
11518
11519two_bytes_emit_remainder_encodeSnappyBlockAsm:
11520 MOVB $0xf0, (AX)
11521 MOVB DL, 1(AX)
11522 ADDQ $0x02, AX
11523 CMPL DX, $0x40
11524 JB memmove_emit_remainder_encodeSnappyBlockAsm
11525 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
11526
11527one_byte_emit_remainder_encodeSnappyBlockAsm:
11528 SHLB $0x02, DL
11529 MOVB DL, (AX)
11530 ADDQ $0x01, AX
11531
11532memmove_emit_remainder_encodeSnappyBlockAsm:
11533 LEAQ (AX)(SI*1), DX
11534 MOVL SI, BX
11535
11536 // genMemMoveShort
11537 CMPQ BX, $0x03
11538 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2
11539 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3
11540 CMPQ BX, $0x08
11541 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7
11542 CMPQ BX, $0x10
11543 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
11544 CMPQ BX, $0x20
11545 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
11546 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64
11547
11548emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2:
11549 MOVB (CX), SI
11550 MOVB -1(CX)(BX*1), CL
11551 MOVB SI, (AX)
11552 MOVB CL, -1(AX)(BX*1)
11553 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11554
11555emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3:
11556 MOVW (CX), SI
11557 MOVB 2(CX), CL
11558 MOVW SI, (AX)
11559 MOVB CL, 2(AX)
11560 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11561
11562emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7:
11563 MOVL (CX), SI
11564 MOVL -4(CX)(BX*1), CX
11565 MOVL SI, (AX)
11566 MOVL CX, -4(AX)(BX*1)
11567 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11568
11569emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
11570 MOVQ (CX), SI
11571 MOVQ -8(CX)(BX*1), CX
11572 MOVQ SI, (AX)
11573 MOVQ CX, -8(AX)(BX*1)
11574 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11575
11576emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
11577 MOVOU (CX), X0
11578 MOVOU -16(CX)(BX*1), X1
11579 MOVOU X0, (AX)
11580 MOVOU X1, -16(AX)(BX*1)
11581 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11582
11583emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
11584 MOVOU (CX), X0
11585 MOVOU 16(CX), X1
11586 MOVOU -32(CX)(BX*1), X2
11587 MOVOU -16(CX)(BX*1), X3
11588 MOVOU X0, (AX)
11589 MOVOU X1, 16(AX)
11590 MOVOU X2, -32(AX)(BX*1)
11591 MOVOU X3, -16(AX)(BX*1)
11592
11593memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
11594 MOVQ DX, AX
11595 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm
11596
11597memmove_long_emit_remainder_encodeSnappyBlockAsm:
11598 LEAQ (AX)(SI*1), DX
11599 MOVL SI, BX
11600
11601 // genMemMoveLong
11602 MOVOU (CX), X0
11603 MOVOU 16(CX), X1
11604 MOVOU -32(CX)(BX*1), X2
11605 MOVOU -16(CX)(BX*1), X3
11606 MOVQ BX, DI
11607 SHRQ $0x05, DI
11608 MOVQ AX, SI
11609 ANDL $0x0000001f, SI
11610 MOVQ $0x00000040, R8
11611 SUBQ SI, R8
11612 DECQ DI
11613 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
11614 LEAQ -32(CX)(R8*1), SI
11615 LEAQ -32(AX)(R8*1), R9
11616
11617emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
11618 MOVOU (SI), X4
11619 MOVOU 16(SI), X5
11620 MOVOA X4, (R9)
11621 MOVOA X5, 16(R9)
11622 ADDQ $0x20, R9
11623 ADDQ $0x20, SI
11624 ADDQ $0x20, R8
11625 DECQ DI
11626 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back
11627
11628emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
11629 MOVOU -32(CX)(R8*1), X4
11630 MOVOU -16(CX)(R8*1), X5
11631 MOVOA X4, -32(AX)(R8*1)
11632 MOVOA X5, -16(AX)(R8*1)
11633 ADDQ $0x20, R8
11634 CMPQ BX, R8
11635 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
11636 MOVOU X0, (AX)
11637 MOVOU X1, 16(AX)
11638 MOVOU X2, -32(AX)(BX*1)
11639 MOVOU X3, -16(AX)(BX*1)
11640 MOVQ DX, AX
11641
11642emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
11643 MOVQ dst_base+0(FP), CX
11644 SUBQ CX, AX
11645 MOVQ AX, ret+48(FP)
11646 RET
11647
11648// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
11649// Requires: BMI, SSE2
11650TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
11651 MOVQ dst_base+0(FP), AX
11652 MOVQ $0x00000200, CX
11653 LEAQ 24(SP), DX
11654 PXOR X0, X0
11655
11656zero_loop_encodeSnappyBlockAsm64K:
11657 MOVOU X0, (DX)
11658 MOVOU X0, 16(DX)
11659 MOVOU X0, 32(DX)
11660 MOVOU X0, 48(DX)
11661 MOVOU X0, 64(DX)
11662 MOVOU X0, 80(DX)
11663 MOVOU X0, 96(DX)
11664 MOVOU X0, 112(DX)
11665 ADDQ $0x80, DX
11666 DECQ CX
11667 JNZ zero_loop_encodeSnappyBlockAsm64K
11668 MOVL $0x00000000, 12(SP)
11669 MOVQ src_len+32(FP), CX
11670 LEAQ -9(CX), DX
11671 LEAQ -8(CX), BX
11672 MOVL BX, 8(SP)
11673 SHRQ $0x05, CX
11674 SUBL CX, DX
11675 LEAQ (AX)(DX*1), DX
11676 MOVQ DX, (SP)
11677 MOVL $0x00000001, CX
11678 MOVL CX, 16(SP)
11679 MOVQ src_base+24(FP), DX
11680
11681search_loop_encodeSnappyBlockAsm64K:
11682 MOVL CX, BX
11683 SUBL 12(SP), BX
11684 SHRL $0x06, BX
11685 LEAL 4(CX)(BX*1), BX
11686 CMPL BX, 8(SP)
11687 JAE emit_remainder_encodeSnappyBlockAsm64K
11688 MOVQ (DX)(CX*1), SI
11689 MOVL BX, 20(SP)
11690 MOVQ $0x0000cf1bbcdcbf9b, R8
11691 MOVQ SI, R9
11692 MOVQ SI, R10
11693 SHRQ $0x08, R10
11694 SHLQ $0x10, R9
11695 IMULQ R8, R9
11696 SHRQ $0x32, R9
11697 SHLQ $0x10, R10
11698 IMULQ R8, R10
11699 SHRQ $0x32, R10
11700 MOVL 24(SP)(R9*4), BX
11701 MOVL 24(SP)(R10*4), DI
11702 MOVL CX, 24(SP)(R9*4)
11703 LEAL 1(CX), R9
11704 MOVL R9, 24(SP)(R10*4)
11705 MOVQ SI, R9
11706 SHRQ $0x10, R9
11707 SHLQ $0x10, R9
11708 IMULQ R8, R9
11709 SHRQ $0x32, R9
11710 MOVL CX, R8
11711 SUBL 16(SP), R8
11712 MOVL 1(DX)(R8*1), R10
11713 MOVQ SI, R8
11714 SHRQ $0x08, R8
11715 CMPL R8, R10
11716 JNE no_repeat_found_encodeSnappyBlockAsm64K
11717 LEAL 1(CX), SI
11718 MOVL 12(SP), BX
11719 MOVL SI, DI
11720 SUBL 16(SP), DI
11721 JZ repeat_extend_back_end_encodeSnappyBlockAsm64K
11722
11723repeat_extend_back_loop_encodeSnappyBlockAsm64K:
11724 CMPL SI, BX
11725 JBE repeat_extend_back_end_encodeSnappyBlockAsm64K
11726 MOVB -1(DX)(DI*1), R8
11727 MOVB -1(DX)(SI*1), R9
11728 CMPB R8, R9
11729 JNE repeat_extend_back_end_encodeSnappyBlockAsm64K
11730 LEAL -1(SI), SI
11731 DECL DI
11732 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K
11733
11734repeat_extend_back_end_encodeSnappyBlockAsm64K:
11735 MOVL SI, BX
11736 SUBL 12(SP), BX
11737 LEAQ 3(AX)(BX*1), BX
11738 CMPQ BX, (SP)
11739 JB repeat_dst_size_check_encodeSnappyBlockAsm64K
11740 MOVQ $0x00000000, ret+48(FP)
11741 RET
11742
11743repeat_dst_size_check_encodeSnappyBlockAsm64K:
11744 MOVL 12(SP), BX
11745 CMPL BX, SI
11746 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
11747 MOVL SI, DI
11748 MOVL SI, 12(SP)
11749 LEAQ (DX)(BX*1), R8
11750 SUBL BX, DI
11751 LEAL -1(DI), BX
11752 CMPL BX, $0x3c
11753 JB one_byte_repeat_emit_encodeSnappyBlockAsm64K
11754 CMPL BX, $0x00000100
11755 JB two_bytes_repeat_emit_encodeSnappyBlockAsm64K
11756 JB three_bytes_repeat_emit_encodeSnappyBlockAsm64K
11757
11758three_bytes_repeat_emit_encodeSnappyBlockAsm64K:
11759 MOVB $0xf4, (AX)
11760 MOVW BX, 1(AX)
11761 ADDQ $0x03, AX
11762 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
11763
11764two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
11765 MOVB $0xf0, (AX)
11766 MOVB BL, 1(AX)
11767 ADDQ $0x02, AX
11768 CMPL BX, $0x40
11769 JB memmove_repeat_emit_encodeSnappyBlockAsm64K
11770 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
11771
11772one_byte_repeat_emit_encodeSnappyBlockAsm64K:
11773 SHLB $0x02, BL
11774 MOVB BL, (AX)
11775 ADDQ $0x01, AX
11776
11777memmove_repeat_emit_encodeSnappyBlockAsm64K:
11778 LEAQ (AX)(DI*1), BX
11779
11780 // genMemMoveShort
11781 CMPQ DI, $0x08
11782 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
11783 CMPQ DI, $0x10
11784 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
11785 CMPQ DI, $0x20
11786 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
11787 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
11788
11789emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
11790 MOVQ (R8), R9
11791 MOVQ R9, (AX)
11792 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
11793
11794emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
11795 MOVQ (R8), R9
11796 MOVQ -8(R8)(DI*1), R8
11797 MOVQ R9, (AX)
11798 MOVQ R8, -8(AX)(DI*1)
11799 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
11800
11801emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
11802 MOVOU (R8), X0
11803 MOVOU -16(R8)(DI*1), X1
11804 MOVOU X0, (AX)
11805 MOVOU X1, -16(AX)(DI*1)
11806 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
11807
11808emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
11809 MOVOU (R8), X0
11810 MOVOU 16(R8), X1
11811 MOVOU -32(R8)(DI*1), X2
11812 MOVOU -16(R8)(DI*1), X3
11813 MOVOU X0, (AX)
11814 MOVOU X1, 16(AX)
11815 MOVOU X2, -32(AX)(DI*1)
11816 MOVOU X3, -16(AX)(DI*1)
11817
11818memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
11819 MOVQ BX, AX
11820 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
11821
11822memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
11823 LEAQ (AX)(DI*1), BX
11824
11825 // genMemMoveLong
11826 MOVOU (R8), X0
11827 MOVOU 16(R8), X1
11828 MOVOU -32(R8)(DI*1), X2
11829 MOVOU -16(R8)(DI*1), X3
11830 MOVQ DI, R10
11831 SHRQ $0x05, R10
11832 MOVQ AX, R9
11833 ANDL $0x0000001f, R9
11834 MOVQ $0x00000040, R11
11835 SUBQ R9, R11
11836 DECQ R10
11837 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
11838 LEAQ -32(R8)(R11*1), R9
11839 LEAQ -32(AX)(R11*1), R12
11840
11841emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
11842 MOVOU (R9), X4
11843 MOVOU 16(R9), X5
11844 MOVOA X4, (R12)
11845 MOVOA X5, 16(R12)
11846 ADDQ $0x20, R12
11847 ADDQ $0x20, R9
11848 ADDQ $0x20, R11
11849 DECQ R10
11850 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
11851
11852emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
11853 MOVOU -32(R8)(R11*1), X4
11854 MOVOU -16(R8)(R11*1), X5
11855 MOVOA X4, -32(AX)(R11*1)
11856 MOVOA X5, -16(AX)(R11*1)
11857 ADDQ $0x20, R11
11858 CMPQ DI, R11
11859 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
11860 MOVOU X0, (AX)
11861 MOVOU X1, 16(AX)
11862 MOVOU X2, -32(AX)(DI*1)
11863 MOVOU X3, -16(AX)(DI*1)
11864 MOVQ BX, AX
11865
11866emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
11867 ADDL $0x05, CX
11868 MOVL CX, BX
11869 SUBL 16(SP), BX
11870 MOVQ src_len+32(FP), DI
11871 SUBL CX, DI
11872 LEAQ (DX)(CX*1), R8
11873 LEAQ (DX)(BX*1), BX
11874
11875 // matchLen
11876 XORL R10, R10
11877
11878matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K:
11879 CMPL DI, $0x10
11880 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K
11881 MOVQ (R8)(R10*1), R9
11882 MOVQ 8(R8)(R10*1), R11
11883 XORQ (BX)(R10*1), R9
11884 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K
11885 XORQ 8(BX)(R10*1), R11
11886 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K
11887 LEAL -16(DI), DI
11888 LEAL 16(R10), R10
11889 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K
11890
11891matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K:
11892#ifdef GOAMD64_v3
11893 TZCNTQ R11, R11
11894
11895#else
11896 BSFQ R11, R11
11897
11898#endif
11899 SARQ $0x03, R11
11900 LEAL 8(R10)(R11*1), R10
11901 JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
11902
11903matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K:
11904 CMPL DI, $0x08
11905 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
11906 MOVQ (R8)(R10*1), R9
11907 XORQ (BX)(R10*1), R9
11908 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K
11909 LEAL -8(DI), DI
11910 LEAL 8(R10), R10
11911 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
11912
11913matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K:
11914#ifdef GOAMD64_v3
11915 TZCNTQ R9, R9
11916
11917#else
11918 BSFQ R9, R9
11919
11920#endif
11921 SARQ $0x03, R9
11922 LEAL (R10)(R9*1), R10
11923 JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
11924
11925matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K:
11926 CMPL DI, $0x04
11927 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
11928 MOVL (R8)(R10*1), R9
11929 CMPL (BX)(R10*1), R9
11930 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
11931 LEAL -4(DI), DI
11932 LEAL 4(R10), R10
11933
11934matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K:
11935 CMPL DI, $0x01
11936 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
11937 JB repeat_extend_forward_end_encodeSnappyBlockAsm64K
11938 MOVW (R8)(R10*1), R9
11939 CMPW (BX)(R10*1), R9
11940 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
11941 LEAL 2(R10), R10
11942 SUBL $0x02, DI
11943 JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K
11944
11945matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K:
11946 MOVB (R8)(R10*1), R9
11947 CMPB (BX)(R10*1), R9
11948 JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
11949 LEAL 1(R10), R10
11950
11951repeat_extend_forward_end_encodeSnappyBlockAsm64K:
11952 ADDL R10, CX
11953 MOVL CX, BX
11954 SUBL SI, BX
11955 MOVL 16(SP), SI
11956
11957 // emitCopy
11958two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
11959 CMPL BX, $0x40
11960 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
11961 MOVB $0xee, (AX)
11962 MOVW SI, 1(AX)
11963 LEAL -60(BX), BX
11964 ADDQ $0x03, AX
11965 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
11966
11967two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
11968 MOVL BX, DI
11969 SHLL $0x02, DI
11970 CMPL BX, $0x0c
11971 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
11972 CMPL SI, $0x00000800
11973 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
11974 LEAL -15(DI), DI
11975 MOVB SI, 1(AX)
11976 SHRL $0x08, SI
11977 SHLL $0x05, SI
11978 ORL SI, DI
11979 MOVB DI, (AX)
11980 ADDQ $0x02, AX
11981 JMP repeat_end_emit_encodeSnappyBlockAsm64K
11982
11983emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
11984 LEAL -2(DI), DI
11985 MOVB DI, (AX)
11986 MOVW SI, 1(AX)
11987 ADDQ $0x03, AX
11988
11989repeat_end_emit_encodeSnappyBlockAsm64K:
11990 MOVL CX, 12(SP)
11991 JMP search_loop_encodeSnappyBlockAsm64K
11992
// No repeat at the current position: test up to three table candidates.
// NOTE(review): BX, DI, R9, SI and 20(SP) are prepared by the search loop,
// which lies before this excerpt; presumably BX/DI are the two table
// candidates, R9 a third table index and SI the 8 bytes loaded at CX.
// Confirm against search_loop_encodeSnappyBlockAsm64K.
11993no_repeat_found_encodeSnappyBlockAsm64K:
11994 CMPL (DX)(BX*1), SI // 4 bytes at candidate BX vs. low 32 bits of SI
11995 JEQ candidate_match_encodeSnappyBlockAsm64K
11996 SHRQ $0x08, SI // drop the lowest byte of SI
11997 MOVL 24(SP)(R9*4), BX // BX = table entry selected by R9
11998 LEAL 2(CX), R8 // R8 = CX + 2, stored into that entry below
11999 CMPL (DX)(DI*1), SI // 4 bytes at candidate DI vs. shifted SI
12000 JEQ candidate2_match_encodeSnappyBlockAsm64K
12001 MOVL R8, 24(SP)(R9*4)
12002 SHRQ $0x08, SI
12003 CMPL (DX)(BX*1), SI // 4 bytes at the third candidate vs. SI shifted again
12004 JEQ candidate3_match_encodeSnappyBlockAsm64K
12005 MOVL 20(SP), CX // nothing matched: continue from the position saved at 20(SP)
12006 JMP search_loop_encodeSnappyBlockAsm64K
12007
 // Third candidate matched: the match position is CX + 2; BX already holds the candidate.
12008candidate3_match_encodeSnappyBlockAsm64K:
12009 ADDL $0x02, CX
12010 JMP candidate_match_encodeSnappyBlockAsm64K
12011
 // Second candidate matched: store the table entry, move to CX + 1 and use DI as candidate.
12012candidate2_match_encodeSnappyBlockAsm64K:
12013 MOVL R8, 24(SP)(R9*4)
12014 INCL CX
12015 MOVL DI, BX
12016
// Extend the match backwards. CX is the current position, BX the candidate
// position, SI = 12(SP) the lowest value CX may reach. The loop steps both
// positions back one byte at a time while the preceding bytes are equal,
// and stops when CX <= SI, the bytes differ, or BX reaches zero.
12017candidate_match_encodeSnappyBlockAsm64K:
12018 MOVL 12(SP), SI
12019 TESTL BX, BX // candidate at position 0: nothing precedes it
12020 JZ match_extend_back_end_encodeSnappyBlockAsm64K
12021
12022match_extend_back_loop_encodeSnappyBlockAsm64K:
12023 CMPL CX, SI
12024 JBE match_extend_back_end_encodeSnappyBlockAsm64K
12025 MOVB -1(DX)(BX*1), DI // byte before the candidate
12026 MOVB -1(DX)(CX*1), R8 // byte before the current position
12027 CMPB DI, R8
12028 JNE match_extend_back_end_encodeSnappyBlockAsm64K
12029 LEAL -1(CX), CX
12030 DECL BX
12031 JZ match_extend_back_end_encodeSnappyBlockAsm64K
12032 JMP match_extend_back_loop_encodeSnappyBlockAsm64K
12033
// Output-space check before emitting the literals that precede the match:
// if AX + 3 + (CX - 12(SP)) is not below the limit kept at (SP), store 0 in
// the return slot and return.
12034match_extend_back_end_encodeSnappyBlockAsm64K:
12035 MOVL CX, SI
12036 SUBL 12(SP), SI
12037 LEAQ 3(AX)(SI*1), SI
12038 CMPQ SI, (SP)
12039 JB match_dst_size_check_encodeSnappyBlockAsm64K
12040 MOVQ $0x00000000, ret+48(FP)
12041 RET
12042
 // Emit the literal header for the bytes between 12(SP) and CX.
 // After setup: SI = DX + old 12(SP) (source of the literals), R8 = length,
 // DI = length - 1, and 12(SP) is advanced to CX.
12043match_dst_size_check_encodeSnappyBlockAsm64K:
12044 MOVL CX, SI
12045 MOVL 12(SP), DI
12046 CMPL DI, SI
12047 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K // no pending literals
12048 MOVL SI, R8
12049 MOVL SI, 12(SP)
12050 LEAQ (DX)(DI*1), SI
12051 SUBL DI, R8
12052 LEAL -1(R8), DI
12053 CMPL DI, $0x3c
12054 JB one_byte_match_emit_encodeSnappyBlockAsm64K // length-1 < 60: 1-byte header
12055 CMPL DI, $0x00000100
12056 JB two_bytes_match_emit_encodeSnappyBlockAsm64K // length-1 < 256: 2-byte header
 // Flags are unchanged since the JB above, so this JB is never taken;
 // execution falls through to the same label.
12057 JB three_bytes_match_emit_encodeSnappyBlockAsm64K
12058
 // 3-byte header: tag 0xf4 (= 61<<2) followed by the 16-bit value length-1.
12059three_bytes_match_emit_encodeSnappyBlockAsm64K:
12060 MOVB $0xf4, (AX)
12061 MOVW DI, 1(AX)
12062 ADDQ $0x03, AX
12063 JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
12064
 // 2-byte header: tag 0xf0 (= 60<<2) followed by the byte length-1.
 // length-1 < 64 uses the short copy, anything longer the long copy.
12065two_bytes_match_emit_encodeSnappyBlockAsm64K:
12066 MOVB $0xf0, (AX)
12067 MOVB DI, 1(AX)
12068 ADDQ $0x02, AX
12069 CMPL DI, $0x40
12070 JB memmove_match_emit_encodeSnappyBlockAsm64K
12071 JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
12072
 // 1-byte header: (length-1)<<2; falls through into the short copy.
12073one_byte_match_emit_encodeSnappyBlockAsm64K:
12074 SHLB $0x02, DI
12075 MOVB DI, (AX)
12076 ADDQ $0x01, AX
12077
// Short literal copy: R8 bytes from (SI) to (AX). DI = AX + R8 is the output
// pointer after the copy. The header code above only branches here with
// R8 <= 64. Each size class copies a fixed-width head and tail that may
// overlap, instead of looping.
12078memmove_match_emit_encodeSnappyBlockAsm64K:
12079 LEAQ (AX)(R8*1), DI
12080
12081 // genMemMoveShort
12082 CMPQ R8, $0x08
12083 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
12084 CMPQ R8, $0x10
12085 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
12086 CMPQ R8, $0x20
12087 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
12088 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
12089
 // R8 <= 8: a single 8-byte load/store is issued regardless of the exact length.
12090emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
12091 MOVQ (SI), R9
12092 MOVQ R9, (AX)
12093 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
12094
 // 9..16 bytes: first 8 and last 8 bytes.
12095emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
12096 MOVQ (SI), R9
12097 MOVQ -8(SI)(R8*1), SI
12098 MOVQ R9, (AX)
12099 MOVQ SI, -8(AX)(R8*1)
12100 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
12101
 // 17..32 bytes: first 16 and last 16 bytes.
12102emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
12103 MOVOU (SI), X0
12104 MOVOU -16(SI)(R8*1), X1
12105 MOVOU X0, (AX)
12106 MOVOU X1, -16(AX)(R8*1)
12107 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
12108
 // 33..64 bytes: first 32 and last 32 bytes.
12109emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
12110 MOVOU (SI), X0
12111 MOVOU 16(SI), X1
12112 MOVOU -32(SI)(R8*1), X2
12113 MOVOU -16(SI)(R8*1), X3
12114 MOVOU X0, (AX)
12115 MOVOU X1, 16(AX)
12116 MOVOU X2, -32(AX)(R8*1)
12117 MOVOU X3, -16(AX)(R8*1)
12118
12119memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
12120 MOVQ DI, AX // advance the output pointer past the copied literals
12121 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K
12122
// Long literal copy: R8 bytes from (SI) to (AX); the header code only sends
// runs longer than 64 bytes here. DI = AX + R8 is the output pointer after
// the copy. The first and last 32 source bytes are loaded into X0-X3 up
// front and stored last with unaligned moves; the middle is copied in
// 32-byte steps with aligned stores (MOVOA), starting at offset
// R11 = 64 - (AX & 31) so that AX + R11 - 32 is 32-byte aligned.
12123memmove_long_match_emit_encodeSnappyBlockAsm64K:
12124 LEAQ (AX)(R8*1), DI
12125
12126 // genMemMoveLong
12127 MOVOU (SI), X0
12128 MOVOU 16(SI), X1
12129 MOVOU -32(SI)(R8*1), X2
12130 MOVOU -16(SI)(R8*1), X3
12131 MOVQ R8, R10
12132 SHRQ $0x05, R10 // R10 = length / 32
12133 MOVQ AX, R9
12134 ANDL $0x0000001f, R9 // R9 = AX & 31
12135 MOVQ $0x00000040, R11
12136 SUBQ R9, R11 // R11 = 64 - (AX & 31)
12137 DECQ R10
12138 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
12139 LEAQ -32(SI)(R11*1), R9
12140 LEAQ -32(AX)(R11*1), R12
12141
 // Path taken when the JA above falls through: 32 bytes per round from (R9) to (R12).
12142emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
12143 MOVOU (R9), X4
12144 MOVOU 16(R9), X5
12145 MOVOA X4, (R12)
12146 MOVOA X5, 16(R12)
12147 ADDQ $0x20, R12
12148 ADDQ $0x20, R9
12149 ADDQ $0x20, R11
12150 DECQ R10
12151 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
12152
 // Copy the 32 bytes ending at offset R11, then advance R11 by 32; repeat while R8 >= R11.
12153emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
12154 MOVOU -32(SI)(R11*1), X4
12155 MOVOU -16(SI)(R11*1), X5
12156 MOVOA X4, -32(AX)(R11*1)
12157 MOVOA X5, -16(AX)(R11*1)
12158 ADDQ $0x20, R11
12159 CMPQ R8, R11
12160 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
 // Store the head and tail saved at the top, then advance the output pointer.
12161 MOVOU X0, (AX)
12162 MOVOU X1, 16(AX)
12163 MOVOU X2, -32(AX)(R8*1)
12164 MOVOU X3, -16(AX)(R8*1)
12165 MOVQ DI, AX
12166
// Match with no pending literals. CX is the current position, BX the
// candidate position. Saves the offset CX - BX at 16(SP), steps both
// positions over 4 bytes, then counts in R9 how many further bytes agree.
12167emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
12168match_nolit_loop_encodeSnappyBlockAsm64K:
12169 MOVL CX, SI
12170 SUBL BX, SI
12171 MOVL SI, 16(SP) // 16(SP) = offset = CX - BX
12172 ADDL $0x04, CX
12173 ADDL $0x04, BX
12174 MOVQ src_len+32(FP), SI
12175 SUBL CX, SI // SI = bytes left after CX
12176 LEAQ (DX)(CX*1), DI // DI = address of the current position
12177 LEAQ (DX)(BX*1), BX // BX = address of the candidate
12178
12179 // matchLen
12180 XORL R9, R9 // R9 = matched byte count
12181
 // 16 bytes per round: XOR two 8-byte words; a non-zero result marks the first difference.
12182matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K:
12183 CMPL SI, $0x10
12184 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm64K
12185 MOVQ (DI)(R9*1), R8
12186 MOVQ 8(DI)(R9*1), R10
12187 XORQ (BX)(R9*1), R8
12188 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K
12189 XORQ 8(BX)(R9*1), R10
12190 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K
12191 LEAL -16(SI), SI
12192 LEAL 16(R9), R9
12193 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K
12194
 // Difference in the second word: index of the lowest set bit / 8 = equal bytes in it, plus 8.
12195matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K:
12196#ifdef GOAMD64_v3
12197 TZCNTQ R10, R10
12198
12199#else
12200 BSFQ R10, R10
12201
12202#endif
12203 SARQ $0x03, R10
12204 LEAL 8(R9)(R10*1), R9
12205 JMP match_nolit_end_encodeSnappyBlockAsm64K
12206
 // Fewer than 16 bytes left: try one 8-byte word.
12207matchlen_match8_match_nolit_encodeSnappyBlockAsm64K:
12208 CMPL SI, $0x08
12209 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
12210 MOVQ (DI)(R9*1), R8
12211 XORQ (BX)(R9*1), R8
12212 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K
12213 LEAL -8(SI), SI
12214 LEAL 8(R9), R9
12215 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
12216
 // Difference in the first word: index of the lowest set bit / 8 = equal bytes in it.
12217matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K:
12218#ifdef GOAMD64_v3
12219 TZCNTQ R8, R8
12220
12221#else
12222 BSFQ R8, R8
12223
12224#endif
12225 SARQ $0x03, R8
12226 LEAL (R9)(R8*1), R9
12227 JMP match_nolit_end_encodeSnappyBlockAsm64K
12228
 // Tail: compare 4, then 2, then 1 byte, as far as the remaining count in SI allows.
12229matchlen_match4_match_nolit_encodeSnappyBlockAsm64K:
12230 CMPL SI, $0x04
12231 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
12232 MOVL (DI)(R9*1), R8
12233 CMPL (BX)(R9*1), R8
12234 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
12235 LEAL -4(SI), SI
12236 LEAL 4(R9), R9
12237
12238matchlen_match2_match_nolit_encodeSnappyBlockAsm64K:
12239 CMPL SI, $0x01
12240 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K // exactly one byte left
12241 JB match_nolit_end_encodeSnappyBlockAsm64K // nothing left
12242 MOVW (DI)(R9*1), R8
12243 CMPW (BX)(R9*1), R8
12244 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
12245 LEAL 2(R9), R9
12246 SUBL $0x02, SI
12247 JZ match_nolit_end_encodeSnappyBlockAsm64K
12248
12249matchlen_match1_match_nolit_encodeSnappyBlockAsm64K:
12250 MOVB (DI)(R9*1), R8
12251 CMPB (BX)(R9*1), R8
12252 JNE match_nolit_end_encodeSnappyBlockAsm64K
12253 LEAL 1(R9), R9
12254
// Match length known: advance CX past the counted bytes, reload the offset
// from 16(SP) into BX, add back the 4 bytes skipped before counting so R9 is
// the full copy length, and record CX at 12(SP). Then encode the copy.
12255match_nolit_end_encodeSnappyBlockAsm64K:
12256 ADDL R9, CX
12257 MOVL 16(SP), BX
12258 ADDL $0x04, R9
12259 MOVL CX, 12(SP)
12260
12261 // emitCopy
 // Long form: while R9 > 64, write tag 0xee (= 59<<2 | 2) plus the 16-bit
 // offset; each round uses 3 output bytes and takes 60 off the length.
12262two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
12263 CMPL R9, $0x40
12264 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
12265 MOVB $0xee, (AX)
12266 MOVW BX, 1(AX)
12267 LEAL -60(R9), R9
12268 ADDQ $0x03, AX
12269 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
12270
 // Remaining length <= 64. SI = 4*R9 is the base of the tag byte.
 // Length >= 12 or offset >= 2048 takes the 3-byte form below; otherwise
 // the 2-byte form: tag = (4*R9 - 15) | ((BX>>8)<<5), then the low offset byte.
12271two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
12272 MOVL R9, SI
12273 SHLL $0x02, SI
12274 CMPL R9, $0x0c
12275 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
12276 CMPL BX, $0x00000800
12277 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
12278 LEAL -15(SI), SI
12279 MOVB BL, 1(AX)
12280 SHRL $0x08, BX
12281 SHLL $0x05, BX
12282 ORL BX, SI
12283 MOVB SI, (AX)
12284 ADDQ $0x02, AX
12285 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
12286
 // 3-byte form: tag = 4*R9 - 2 (= (R9-1)<<2 | 2) followed by the 16-bit offset.
12287emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
12288 LEAL -2(SI), SI
12289 MOVB SI, (AX)
12290 MOVW BX, 1(AX)
12291 ADDQ $0x03, AX
12292
// After a copy: stop searching once CX reaches the limit kept at 8(SP),
// load the 8 bytes starting at CX - 2, and return 0 if the output pointer
// has reached the limit kept at (SP).
12293match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
12294 CMPL CX, 8(SP)
12295 JAE emit_remainder_encodeSnappyBlockAsm64K
12296 MOVQ -2(DX)(CX*1), SI
12297 CMPQ AX, (SP)
12298 JB match_nolit_dst_ok_encodeSnappyBlockAsm64K
12299 MOVQ $0x00000000, ret+48(FP)
12300 RET
12301
 // Update the table at 24(SP) for positions CX - 2 and CX, then test for an
 // immediate match at CX. Each index is ((x << 16) * R8) >> 50, a 14-bit
 // value: DI is computed from the bytes at CX - 2, BX from the bytes at CX.
12302match_nolit_dst_ok_encodeSnappyBlockAsm64K:
12303 MOVQ $0x0000cf1bbcdcbf9b, R8
12304 MOVQ SI, DI
12305 SHRQ $0x10, SI // SI now starts with the bytes at CX
12306 MOVQ SI, BX
12307 SHLQ $0x10, DI
12308 IMULQ R8, DI
12309 SHRQ $0x32, DI
12310 SHLQ $0x10, BX
12311 IMULQ R8, BX
12312 SHRQ $0x32, BX
12313 LEAL -2(CX), R8
12314 LEAQ 24(SP)(BX*4), R9 // R9 = address of the table slot for CX
12315 MOVL (R9), BX // BX = previous entry in that slot (the candidate)
12316 MOVL R8, 24(SP)(DI*4) // table slot for CX - 2 = CX - 2
12317 MOVL CX, (R9) // table slot for CX = CX
12318 CMPL (DX)(BX*1), SI // 4 bytes at the candidate vs. 4 bytes at CX
12319 JEQ match_nolit_loop_encodeSnappyBlockAsm64K
12320 INCL CX
12321 JMP search_loop_encodeSnappyBlockAsm64K
12322
// Emit whatever input is left after 12(SP) as one final literal run.
// First check output space: if AX + 3 + (src_len - 12(SP)) is not below the
// limit kept at (SP), store 0 in the return slot and return.
12323emit_remainder_encodeSnappyBlockAsm64K:
12324 MOVQ src_len+32(FP), CX
12325 SUBL 12(SP), CX
12326 LEAQ 3(AX)(CX*1), CX
12327 CMPQ CX, (SP)
12328 JB emit_remainder_ok_encodeSnappyBlockAsm64K
12329 MOVQ $0x00000000, ret+48(FP)
12330 RET
12331
 // After setup: CX = DX + old 12(SP) (source of the literals), SI = length,
 // DX = length - 1. DX stops being the input base from here on.
12332emit_remainder_ok_encodeSnappyBlockAsm64K:
12333 MOVQ src_len+32(FP), CX
12334 MOVL 12(SP), BX
12335 CMPL BX, CX
12336 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K // nothing left to emit
12337 MOVL CX, SI
12338 MOVL CX, 12(SP)
12339 LEAQ (DX)(BX*1), CX
12340 SUBL BX, SI
12341 LEAL -1(SI), DX
12342 CMPL DX, $0x3c
12343 JB one_byte_emit_remainder_encodeSnappyBlockAsm64K // length-1 < 60: 1-byte header
12344 CMPL DX, $0x00000100
12345 JB two_bytes_emit_remainder_encodeSnappyBlockAsm64K // length-1 < 256: 2-byte header
 // Flags are unchanged since the JB above, so this JB is never taken;
 // execution falls through to the same label.
12346 JB three_bytes_emit_remainder_encodeSnappyBlockAsm64K
12347
 // 3-byte header: tag 0xf4 (= 61<<2) followed by the 16-bit value length-1.
12348three_bytes_emit_remainder_encodeSnappyBlockAsm64K:
12349 MOVB $0xf4, (AX)
12350 MOVW DX, 1(AX)
12351 ADDQ $0x03, AX
12352 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
12353
 // 2-byte header: tag 0xf0 (= 60<<2) followed by the byte length-1.
12354two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
12355 MOVB $0xf0, (AX)
12356 MOVB DL, 1(AX)
12357 ADDQ $0x02, AX
12358 CMPL DX, $0x40
12359 JB memmove_emit_remainder_encodeSnappyBlockAsm64K
12360 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
12361
 // 1-byte header: (length-1)<<2; falls through into the short copy.
12362one_byte_emit_remainder_encodeSnappyBlockAsm64K:
12363 SHLB $0x02, DL
12364 MOVB DL, (AX)
12365 ADDQ $0x01, AX
12366
// Short copy of the final literals: BX (= SI) bytes from (CX) to (AX), with
// DX = AX + length as the output pointer after the copy. Unlike the copies
// used mid-block, this one has exact size classes below 8 bytes, so every
// load and store stays inside the BX-byte range.
12367memmove_emit_remainder_encodeSnappyBlockAsm64K:
12368 LEAQ (AX)(SI*1), DX
12369 MOVL SI, BX
12370
12371 // genMemMoveShort
12372 CMPQ BX, $0x03
12373 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2
12374 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3
12375 CMPQ BX, $0x08
12376 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7
12377 CMPQ BX, $0x10
12378 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
12379 CMPQ BX, $0x20
12380 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
12381 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64
12382
 // 1 or 2 bytes: first byte and last byte (the same byte when the length is 1).
12383emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2:
12384 MOVB (CX), SI
12385 MOVB -1(CX)(BX*1), CL
12386 MOVB SI, (AX)
12387 MOVB CL, -1(AX)(BX*1)
12388 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12389
 // 3 bytes: one 2-byte move plus the third byte.
12390emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3:
12391 MOVW (CX), SI
12392 MOVB 2(CX), CL
12393 MOVW SI, (AX)
12394 MOVB CL, 2(AX)
12395 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12396
 // 4..7 bytes: first 4 and last 4 bytes.
12397emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7:
12398 MOVL (CX), SI
12399 MOVL -4(CX)(BX*1), CX
12400 MOVL SI, (AX)
12401 MOVL CX, -4(AX)(BX*1)
12402 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12403
 // 8..16 bytes: first 8 and last 8 bytes.
12404emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
12405 MOVQ (CX), SI
12406 MOVQ -8(CX)(BX*1), CX
12407 MOVQ SI, (AX)
12408 MOVQ CX, -8(AX)(BX*1)
12409 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12410
 // 17..32 bytes: first 16 and last 16 bytes.
12411emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
12412 MOVOU (CX), X0
12413 MOVOU -16(CX)(BX*1), X1
12414 MOVOU X0, (AX)
12415 MOVOU X1, -16(AX)(BX*1)
12416 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12417
 // 33..64 bytes: first 32 and last 32 bytes.
12418emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
12419 MOVOU (CX), X0
12420 MOVOU 16(CX), X1
12421 MOVOU -32(CX)(BX*1), X2
12422 MOVOU -16(CX)(BX*1), X3
12423 MOVOU X0, (AX)
12424 MOVOU X1, 16(AX)
12425 MOVOU X2, -32(AX)(BX*1)
12426 MOVOU X3, -16(AX)(BX*1)
12427
12428memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
12429 MOVQ DX, AX // advance the output pointer past the copied literals
12430 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
12431
// Long copy of the final literals: BX (= SI) bytes from (CX) to (AX); the
// header code only sends runs longer than 64 bytes here. DX = AX + length is
// the output pointer after the copy. The first and last 32 source bytes are
// loaded into X0-X3 up front and stored last with unaligned moves; the
// middle is copied in 32-byte steps with aligned stores (MOVOA), starting
// at offset R8 = 64 - (AX & 31).
12432memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
12433 LEAQ (AX)(SI*1), DX
12434 MOVL SI, BX
12435
12436 // genMemMoveLong
12437 MOVOU (CX), X0
12438 MOVOU 16(CX), X1
12439 MOVOU -32(CX)(BX*1), X2
12440 MOVOU -16(CX)(BX*1), X3
12441 MOVQ BX, DI
12442 SHRQ $0x05, DI // DI = length / 32
12443 MOVQ AX, SI
12444 ANDL $0x0000001f, SI // SI = AX & 31
12445 MOVQ $0x00000040, R8
12446 SUBQ SI, R8 // R8 = 64 - (AX & 31)
12447 DECQ DI
12448 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
12449 LEAQ -32(CX)(R8*1), SI
12450 LEAQ -32(AX)(R8*1), R9
12451
 // Path taken when the JA above falls through: 32 bytes per round from (SI) to (R9).
12452emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
12453 MOVOU (SI), X4
12454 MOVOU 16(SI), X5
12455 MOVOA X4, (R9)
12456 MOVOA X5, 16(R9)
12457 ADDQ $0x20, R9
12458 ADDQ $0x20, SI
12459 ADDQ $0x20, R8
12460 DECQ DI
12461 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back
12462
 // Copy the 32 bytes ending at offset R8, then advance R8 by 32; repeat while BX >= R8.
12463emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
12464 MOVOU -32(CX)(R8*1), X4
12465 MOVOU -16(CX)(R8*1), X5
12466 MOVOA X4, -32(AX)(R8*1)
12467 MOVOA X5, -16(AX)(R8*1)
12468 ADDQ $0x20, R8
12469 CMPQ BX, R8
12470 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
 // Store the head and tail saved at the top, then advance the output pointer.
12471 MOVOU X0, (AX)
12472 MOVOU X1, 16(AX)
12473 MOVOU X2, -32(AX)(BX*1)
12474 MOVOU X3, -16(AX)(BX*1)
12475 MOVQ DX, AX
12476
 // Function exit: return the number of bytes written, AX - dst_base.
12477emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
12478 MOVQ dst_base+0(FP), CX
12479 SUBQ CX, AX
12480 MOVQ AX, ret+48(FP)
12481 RET
12482
12483// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
12484// Requires: BMI, SSE2
//
// Returns the number of bytes written to dst, or 0 when the output would
// reach the limit computed below.
//
// Stack slots used by the code in this function:
//   (SP)    output limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)   src_len - 8, the position at which searching stops
//   12(SP)  position up to which input has been emitted (starts at 0)
//   16(SP)  offset of the most recent match (starts at 1)
//   20(SP)  position to continue from when no candidate matches
//   24(SP)  table of 4-byte positions, 16384 bytes
// Registers kept throughout: AX = output pointer, CX = current input
// position, DX = src_base.
12485TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
12486 MOVQ dst_base+0(FP), AX
12487 MOVQ $0x00000080, CX // 128 rounds of 128 bytes = 16384 bytes
12488 LEAQ 24(SP), DX
12489 PXOR X0, X0
12490
 // Zero the table, 128 bytes per round.
12491zero_loop_encodeSnappyBlockAsm12B:
12492 MOVOU X0, (DX)
12493 MOVOU X0, 16(DX)
12494 MOVOU X0, 32(DX)
12495 MOVOU X0, 48(DX)
12496 MOVOU X0, 64(DX)
12497 MOVOU X0, 80(DX)
12498 MOVOU X0, 96(DX)
12499 MOVOU X0, 112(DX)
12500 ADDQ $0x80, DX
12501 DECQ CX
12502 JNZ zero_loop_encodeSnappyBlockAsm12B
12503 MOVL $0x00000000, 12(SP)
12504 MOVQ src_len+32(FP), CX
12505 LEAQ -9(CX), DX
12506 LEAQ -8(CX), BX
12507 MOVL BX, 8(SP)
12508 SHRQ $0x05, CX
12509 SUBL CX, DX
12510 LEAQ (AX)(DX*1), DX
12511 MOVQ DX, (SP)
12512 MOVL $0x00000001, CX // searching starts at position 1
12513 MOVL CX, 16(SP)
12514 MOVQ src_base+24(FP), DX
12515
// One search step at position CX.
// The next position to try is CX + 4 + ((CX - 12(SP)) >> 5); it is saved at
// 20(SP), and searching ends once it reaches the limit at 8(SP).
// Table indexes are ((x << 24) * R8) >> 52, a 12-bit value. R9 and R10 index
// the bytes at CX and CX + 1; their old entries become candidates BX and DI
// and the slots are overwritten with CX and CX + 1. R9 is then recomputed
// for the bytes at CX + 2.
12516search_loop_encodeSnappyBlockAsm12B:
12517 MOVL CX, BX
12518 SUBL 12(SP), BX
12519 SHRL $0x05, BX
12520 LEAL 4(CX)(BX*1), BX
12521 CMPL BX, 8(SP)
12522 JAE emit_remainder_encodeSnappyBlockAsm12B
12523 MOVQ (DX)(CX*1), SI // SI = 8 input bytes at CX
12524 MOVL BX, 20(SP)
12525 MOVQ $0x000000cf1bbcdcbb, R8
12526 MOVQ SI, R9
12527 MOVQ SI, R10
12528 SHRQ $0x08, R10
12529 SHLQ $0x18, R9
12530 IMULQ R8, R9
12531 SHRQ $0x34, R9
12532 SHLQ $0x18, R10
12533 IMULQ R8, R10
12534 SHRQ $0x34, R10
12535 MOVL 24(SP)(R9*4), BX
12536 MOVL 24(SP)(R10*4), DI
12537 MOVL CX, 24(SP)(R9*4)
12538 LEAL 1(CX), R9
12539 MOVL R9, 24(SP)(R10*4)
12540 MOVQ SI, R9
12541 SHRQ $0x10, R9
12542 SHLQ $0x18, R9
12543 IMULQ R8, R9
12544 SHRQ $0x34, R9
 // Repeat check: compare the 4 bytes at CX + 1 with the 4 bytes located
 // 16(SP) (the last offset) before them.
12545 MOVL CX, R8
12546 SUBL 16(SP), R8
12547 MOVL 1(DX)(R8*1), R10
12548 MOVQ SI, R8
12549 SHRQ $0x08, R8
12550 CMPL R8, R10
12551 JNE no_repeat_found_encodeSnappyBlockAsm12B
 // Repeat found at CX + 1: SI = match start, BX = 12(SP) lower bound,
 // DI = SI - offset, the position the match refers back to.
12552 LEAL 1(CX), SI
12553 MOVL 12(SP), BX
12554 MOVL SI, DI
12555 SUBL 16(SP), DI
12556 JZ repeat_extend_back_end_encodeSnappyBlockAsm12B
12557
// Extend the repeat backwards: step SI (match start) and DI (referenced
// position) back one byte at a time while the preceding bytes are equal.
// Stops when SI <= BX, the bytes differ, or DI reaches zero.
12558repeat_extend_back_loop_encodeSnappyBlockAsm12B:
12559 CMPL SI, BX
12560 JBE repeat_extend_back_end_encodeSnappyBlockAsm12B
12561 MOVB -1(DX)(DI*1), R8
12562 MOVB -1(DX)(SI*1), R9
12563 CMPB R8, R9
12564 JNE repeat_extend_back_end_encodeSnappyBlockAsm12B
12565 LEAL -1(SI), SI
12566 DECL DI
12567 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B
12568
 // Output-space check for the literals before the repeat: if
 // AX + 3 + (SI - 12(SP)) is not below the limit at (SP), return 0.
12569repeat_extend_back_end_encodeSnappyBlockAsm12B:
12570 MOVL SI, BX
12571 SUBL 12(SP), BX
12572 LEAQ 3(AX)(BX*1), BX
12573 CMPQ BX, (SP)
12574 JB repeat_dst_size_check_encodeSnappyBlockAsm12B
12575 MOVQ $0x00000000, ret+48(FP)
12576 RET
12577
// Emit the literal header for the bytes between 12(SP) and SI (repeat start).
// After setup: R8 = DX + old 12(SP) (source of the literals), DI = length,
// BX = length - 1, and 12(SP) is advanced to SI.
12578repeat_dst_size_check_encodeSnappyBlockAsm12B:
12579 MOVL 12(SP), BX
12580 CMPL BX, SI
12581 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B // no pending literals
12582 MOVL SI, DI
12583 MOVL SI, 12(SP)
12584 LEAQ (DX)(BX*1), R8
12585 SUBL BX, DI
12586 LEAL -1(DI), BX
12587 CMPL BX, $0x3c
12588 JB one_byte_repeat_emit_encodeSnappyBlockAsm12B // length-1 < 60: 1-byte header
12589 CMPL BX, $0x00000100
12590 JB two_bytes_repeat_emit_encodeSnappyBlockAsm12B // length-1 < 256: 2-byte header
 // Flags are unchanged since the JB above, so this JB is never taken;
 // execution falls through to the same label.
12591 JB three_bytes_repeat_emit_encodeSnappyBlockAsm12B
12592
 // 3-byte header: tag 0xf4 (= 61<<2) followed by the 16-bit value length-1.
12593three_bytes_repeat_emit_encodeSnappyBlockAsm12B:
12594 MOVB $0xf4, (AX)
12595 MOVW BX, 1(AX)
12596 ADDQ $0x03, AX
12597 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
12598
 // 2-byte header: tag 0xf0 (= 60<<2) followed by the byte length-1.
12599two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
12600 MOVB $0xf0, (AX)
12601 MOVB BL, 1(AX)
12602 ADDQ $0x02, AX
12603 CMPL BX, $0x40
12604 JB memmove_repeat_emit_encodeSnappyBlockAsm12B
12605 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
12606
 // 1-byte header: (length-1)<<2; falls through into the short copy.
12607one_byte_repeat_emit_encodeSnappyBlockAsm12B:
12608 SHLB $0x02, BL
12609 MOVB BL, (AX)
12610 ADDQ $0x01, AX
12611
// Short literal copy: DI bytes from (R8) to (AX). BX = AX + DI is the output
// pointer after the copy. The header code above only branches here with
// DI <= 64. Each size class copies a fixed-width head and tail that may
// overlap, instead of looping.
12612memmove_repeat_emit_encodeSnappyBlockAsm12B:
12613 LEAQ (AX)(DI*1), BX
12614
12615 // genMemMoveShort
12616 CMPQ DI, $0x08
12617 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
12618 CMPQ DI, $0x10
12619 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
12620 CMPQ DI, $0x20
12621 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
12622 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
12623
 // DI <= 8: a single 8-byte load/store is issued regardless of the exact length.
12624emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
12625 MOVQ (R8), R9
12626 MOVQ R9, (AX)
12627 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
12628
 // 9..16 bytes: first 8 and last 8 bytes.
12629emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
12630 MOVQ (R8), R9
12631 MOVQ -8(R8)(DI*1), R8
12632 MOVQ R9, (AX)
12633 MOVQ R8, -8(AX)(DI*1)
12634 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
12635
 // 17..32 bytes: first 16 and last 16 bytes.
12636emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
12637 MOVOU (R8), X0
12638 MOVOU -16(R8)(DI*1), X1
12639 MOVOU X0, (AX)
12640 MOVOU X1, -16(AX)(DI*1)
12641 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
12642
 // 33..64 bytes: first 32 and last 32 bytes.
12643emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
12644 MOVOU (R8), X0
12645 MOVOU 16(R8), X1
12646 MOVOU -32(R8)(DI*1), X2
12647 MOVOU -16(R8)(DI*1), X3
12648 MOVOU X0, (AX)
12649 MOVOU X1, 16(AX)
12650 MOVOU X2, -32(AX)(DI*1)
12651 MOVOU X3, -16(AX)(DI*1)
12652
12653memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
12654 MOVQ BX, AX // advance the output pointer past the copied literals
12655 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
12656
// Long literal copy: DI bytes from (R8) to (AX); the header code only sends
// runs longer than 64 bytes here. BX = AX + DI is the output pointer after
// the copy. The first and last 32 source bytes are loaded into X0-X3 up
// front and stored last with unaligned moves; the middle is copied in
// 32-byte steps with aligned stores (MOVOA), starting at offset
// R11 = 64 - (AX & 31).
12657memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
12658 LEAQ (AX)(DI*1), BX
12659
12660 // genMemMoveLong
12661 MOVOU (R8), X0
12662 MOVOU 16(R8), X1
12663 MOVOU -32(R8)(DI*1), X2
12664 MOVOU -16(R8)(DI*1), X3
12665 MOVQ DI, R10
12666 SHRQ $0x05, R10 // R10 = length / 32
12667 MOVQ AX, R9
12668 ANDL $0x0000001f, R9 // R9 = AX & 31
12669 MOVQ $0x00000040, R11
12670 SUBQ R9, R11 // R11 = 64 - (AX & 31)
12671 DECQ R10
12672 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
12673 LEAQ -32(R8)(R11*1), R9
12674 LEAQ -32(AX)(R11*1), R12
12675
 // Path taken when the JA above falls through: 32 bytes per round from (R9) to (R12).
12676emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
12677 MOVOU (R9), X4
12678 MOVOU 16(R9), X5
12679 MOVOA X4, (R12)
12680 MOVOA X5, 16(R12)
12681 ADDQ $0x20, R12
12682 ADDQ $0x20, R9
12683 ADDQ $0x20, R11
12684 DECQ R10
12685 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
12686
 // Copy the 32 bytes ending at offset R11, then advance R11 by 32; repeat while DI >= R11.
12687emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
12688 MOVOU -32(R8)(R11*1), X4
12689 MOVOU -16(R8)(R11*1), X5
12690 MOVOA X4, -32(AX)(R11*1)
12691 MOVOA X5, -16(AX)(R11*1)
12692 ADDQ $0x20, R11
12693 CMPQ DI, R11
12694 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
 // Store the head and tail saved at the top, then advance the output pointer.
12695 MOVOU X0, (AX)
12696 MOVOU X1, 16(AX)
12697 MOVOU X2, -32(AX)(DI*1)
12698 MOVOU X3, -16(AX)(DI*1)
12699 MOVQ BX, AX
12700
// Extend the repeat forwards. The repeat check matched 4 bytes starting at
// CX + 1, so comparison resumes at CX + 5. R8 and BX point at that position
// and at the position 16(SP) (the offset) before it; DI is the number of
// input bytes left and R10 counts the additional matching bytes.
12701emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
12702 ADDL $0x05, CX
12703 MOVL CX, BX
12704 SUBL 16(SP), BX
12705 MOVQ src_len+32(FP), DI
12706 SUBL CX, DI
12707 LEAQ (DX)(CX*1), R8
12708 LEAQ (DX)(BX*1), BX
12709
12710 // matchLen
12711 XORL R10, R10
12712
 // 16 bytes per round: XOR two 8-byte words; a non-zero result marks the first difference.
12713matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B:
12714 CMPL DI, $0x10
12715 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B
12716 MOVQ (R8)(R10*1), R9
12717 MOVQ 8(R8)(R10*1), R11
12718 XORQ (BX)(R10*1), R9
12719 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
12720 XORQ 8(BX)(R10*1), R11
12721 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B
12722 LEAL -16(DI), DI
12723 LEAL 16(R10), R10
12724 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B
12725
 // Difference in the second word: index of the lowest set bit / 8 = equal bytes in it, plus 8.
12726matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B:
12727#ifdef GOAMD64_v3
12728 TZCNTQ R11, R11
12729
12730#else
12731 BSFQ R11, R11
12732
12733#endif
12734 SARQ $0x03, R11
12735 LEAL 8(R10)(R11*1), R10
12736 JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
12737
 // Fewer than 16 bytes left: try one 8-byte word.
12738matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B:
12739 CMPL DI, $0x08
12740 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
12741 MOVQ (R8)(R10*1), R9
12742 XORQ (BX)(R10*1), R9
12743 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
12744 LEAL -8(DI), DI
12745 LEAL 8(R10), R10
12746 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
12747
 // Difference in the first word: index of the lowest set bit / 8 = equal bytes in it.
12748matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B:
12749#ifdef GOAMD64_v3
12750 TZCNTQ R9, R9
12751
12752#else
12753 BSFQ R9, R9
12754
12755#endif
12756 SARQ $0x03, R9
12757 LEAL (R10)(R9*1), R10
12758 JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
12759
 // Tail: compare 4, then 2, then 1 byte, as far as the remaining count in DI allows.
12760matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B:
12761 CMPL DI, $0x04
12762 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
12763 MOVL (R8)(R10*1), R9
12764 CMPL (BX)(R10*1), R9
12765 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
12766 LEAL -4(DI), DI
12767 LEAL 4(R10), R10
12768
12769matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B:
12770 CMPL DI, $0x01
12771 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B // exactly one byte left
12772 JB repeat_extend_forward_end_encodeSnappyBlockAsm12B // nothing left
12773 MOVW (R8)(R10*1), R9
12774 CMPW (BX)(R10*1), R9
12775 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
12776 LEAL 2(R10), R10
12777 SUBL $0x02, DI
12778 JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B
12779
12780matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B:
12781 MOVB (R8)(R10*1), R9
12782 CMPB (BX)(R10*1), R9
12783 JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B
12784 LEAL 1(R10), R10
12785
// Repeat fully extended: CX moves to the end of the match, BX = CX - SI is
// its length (SI holds the back-extended start) and SI is reloaded with the
// offset from 16(SP). The repeat is then encoded as Snappy copy elements.
12786repeat_extend_forward_end_encodeSnappyBlockAsm12B:
12787 ADDL R10, CX
12788 MOVL CX, BX
12789 SUBL SI, BX
12790 MOVL 16(SP), SI
12791
12792 // emitCopy
 // Long form: while BX > 64, write tag 0xee (= 59<<2 | 2) plus the 16-bit
 // offset; each round uses 3 output bytes and takes 60 off the length.
12793two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
12794 CMPL BX, $0x40
12795 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
12796 MOVB $0xee, (AX)
12797 MOVW SI, 1(AX)
12798 LEAL -60(BX), BX
12799 ADDQ $0x03, AX
12800 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
12801
 // Remaining length <= 64. DI = 4*BX is the base of the tag byte.
 // Length >= 12 or offset >= 2048 takes the 3-byte form below; otherwise
 // the 2-byte form: tag = (4*BX - 15) | ((SI>>8)<<5), then the low offset byte.
12802two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
12803 MOVL BX, DI
12804 SHLL $0x02, DI
12805 CMPL BX, $0x0c
12806 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
12807 CMPL SI, $0x00000800
12808 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
12809 LEAL -15(DI), DI
12810 MOVB SI, 1(AX)
12811 SHRL $0x08, SI
12812 SHLL $0x05, SI
12813 ORL SI, DI
12814 MOVB DI, (AX)
12815 ADDQ $0x02, AX
12816 JMP repeat_end_emit_encodeSnappyBlockAsm12B
12817
 // 3-byte form: tag = 4*BX - 2 (= (BX-1)<<2 | 2) followed by the 16-bit offset.
12818emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
12819 LEAL -2(DI), DI
12820 MOVB DI, (AX)
12821 MOVW SI, 1(AX)
12822 ADDQ $0x03, AX
12823
 // Record CX at 12(SP) (start of the not-yet-emitted input) and resume searching.
12824repeat_end_emit_encodeSnappyBlockAsm12B:
12825 MOVL CX, 12(SP)
12826 JMP search_loop_encodeSnappyBlockAsm12B
12827
// No repeat at CX: test up to three table candidates. On entry (from the
// search loop) BX and DI are the old table entries for the bytes at CX and
// CX + 1, R9 is the table index for the bytes at CX + 2, SI holds the 8
// input bytes at CX and 20(SP) the position to continue from.
12828no_repeat_found_encodeSnappyBlockAsm12B:
12829 CMPL (DX)(BX*1), SI // 4 bytes at candidate BX vs. 4 bytes at CX
12830 JEQ candidate_match_encodeSnappyBlockAsm12B
12831 SHRQ $0x08, SI // SI now starts with the bytes at CX + 1
12832 MOVL 24(SP)(R9*4), BX // BX = old table entry for the bytes at CX + 2
12833 LEAL 2(CX), R8
12834 CMPL (DX)(DI*1), SI // 4 bytes at candidate DI vs. 4 bytes at CX + 1
12835 JEQ candidate2_match_encodeSnappyBlockAsm12B
12836 MOVL R8, 24(SP)(R9*4) // table entry for CX + 2 = CX + 2
12837 SHRQ $0x08, SI // SI now starts with the bytes at CX + 2
12838 CMPL (DX)(BX*1), SI // 4 bytes at the third candidate vs. 4 bytes at CX + 2
12839 JEQ candidate3_match_encodeSnappyBlockAsm12B
12840 MOVL 20(SP), CX // nothing matched: continue from the saved position
12841 JMP search_loop_encodeSnappyBlockAsm12B
12842
 // Third candidate matched: the match position is CX + 2; BX already holds the candidate.
12843candidate3_match_encodeSnappyBlockAsm12B:
12844 ADDL $0x02, CX
12845 JMP candidate_match_encodeSnappyBlockAsm12B
12846
 // Second candidate matched: store the table entry, move to CX + 1 and use DI as candidate.
12847candidate2_match_encodeSnappyBlockAsm12B:
12848 MOVL R8, 24(SP)(R9*4)
12849 INCL CX
12850 MOVL DI, BX
12851
// Extend the match backwards. CX is the current position, BX the candidate
// position, SI = 12(SP) the lowest value CX may reach. The loop steps both
// positions back one byte at a time while the preceding bytes are equal,
// and stops when CX <= SI, the bytes differ, or BX reaches zero.
12852candidate_match_encodeSnappyBlockAsm12B:
12853 MOVL 12(SP), SI
12854 TESTL BX, BX // candidate at position 0: nothing precedes it
12855 JZ match_extend_back_end_encodeSnappyBlockAsm12B
12856
12857match_extend_back_loop_encodeSnappyBlockAsm12B:
12858 CMPL CX, SI
12859 JBE match_extend_back_end_encodeSnappyBlockAsm12B
12860 MOVB -1(DX)(BX*1), DI // byte before the candidate
12861 MOVB -1(DX)(CX*1), R8 // byte before the current position
12862 CMPB DI, R8
12863 JNE match_extend_back_end_encodeSnappyBlockAsm12B
12864 LEAL -1(CX), CX
12865 DECL BX
12866 JZ match_extend_back_end_encodeSnappyBlockAsm12B
12867 JMP match_extend_back_loop_encodeSnappyBlockAsm12B
12868
// Output-space check before emitting the literals that precede the match:
// if AX + 3 + (CX - 12(SP)) is not below the limit at (SP), store 0 in the
// return slot and return.
12869match_extend_back_end_encodeSnappyBlockAsm12B:
12870 MOVL CX, SI
12871 SUBL 12(SP), SI
12872 LEAQ 3(AX)(SI*1), SI
12873 CMPQ SI, (SP)
12874 JB match_dst_size_check_encodeSnappyBlockAsm12B
12875 MOVQ $0x00000000, ret+48(FP)
12876 RET
12877
 // Emit the literal header for the bytes between 12(SP) and CX.
 // After setup: SI = DX + old 12(SP) (source of the literals), R8 = length,
 // DI = length - 1, and 12(SP) is advanced to CX.
12878match_dst_size_check_encodeSnappyBlockAsm12B:
12879 MOVL CX, SI
12880 MOVL 12(SP), DI
12881 CMPL DI, SI
12882 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B // no pending literals
12883 MOVL SI, R8
12884 MOVL SI, 12(SP)
12885 LEAQ (DX)(DI*1), SI
12886 SUBL DI, R8
12887 LEAL -1(R8), DI
12888 CMPL DI, $0x3c
12889 JB one_byte_match_emit_encodeSnappyBlockAsm12B // length-1 < 60: 1-byte header
12890 CMPL DI, $0x00000100
12891 JB two_bytes_match_emit_encodeSnappyBlockAsm12B // length-1 < 256: 2-byte header
 // Flags are unchanged since the JB above, so this JB is never taken;
 // execution falls through to the same label.
12892 JB three_bytes_match_emit_encodeSnappyBlockAsm12B
12893
 // 3-byte header: tag 0xf4 (= 61<<2) followed by the 16-bit value length-1.
12894three_bytes_match_emit_encodeSnappyBlockAsm12B:
12895 MOVB $0xf4, (AX)
12896 MOVW DI, 1(AX)
12897 ADDQ $0x03, AX
12898 JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
12899
 // 2-byte header: tag 0xf0 (= 60<<2) followed by the byte length-1.
12900two_bytes_match_emit_encodeSnappyBlockAsm12B:
12901 MOVB $0xf0, (AX)
12902 MOVB DI, 1(AX)
12903 ADDQ $0x02, AX
12904 CMPL DI, $0x40
12905 JB memmove_match_emit_encodeSnappyBlockAsm12B
12906 JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
12907
 // 1-byte header: (length-1)<<2; falls through into the short copy.
12908one_byte_match_emit_encodeSnappyBlockAsm12B:
12909 SHLB $0x02, DI
12910 MOVB DI, (AX)
12911 ADDQ $0x01, AX
12912
// Short literal copy: R8 bytes from (SI) to (AX). DI = AX + R8 is the output
// pointer after the copy. The header code above only branches here with
// R8 <= 64. Each size class copies a fixed-width head and tail that may
// overlap, instead of looping.
12913memmove_match_emit_encodeSnappyBlockAsm12B:
12914 LEAQ (AX)(R8*1), DI
12915
12916 // genMemMoveShort
12917 CMPQ R8, $0x08
12918 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
12919 CMPQ R8, $0x10
12920 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
12921 CMPQ R8, $0x20
12922 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
12923 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
12924
 // R8 <= 8: a single 8-byte load/store is issued regardless of the exact length.
12925emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
12926 MOVQ (SI), R9
12927 MOVQ R9, (AX)
12928 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
12929
 // 9..16 bytes: first 8 and last 8 bytes.
12930emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
12931 MOVQ (SI), R9
12932 MOVQ -8(SI)(R8*1), SI
12933 MOVQ R9, (AX)
12934 MOVQ SI, -8(AX)(R8*1)
12935 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
12936
 // 17..32 bytes: first 16 and last 16 bytes.
12937emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
12938 MOVOU (SI), X0
12939 MOVOU -16(SI)(R8*1), X1
12940 MOVOU X0, (AX)
12941 MOVOU X1, -16(AX)(R8*1)
12942 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
12943
 // 33..64 bytes: first 32 and last 32 bytes.
12944emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
12945 MOVOU (SI), X0
12946 MOVOU 16(SI), X1
12947 MOVOU -32(SI)(R8*1), X2
12948 MOVOU -16(SI)(R8*1), X3
12949 MOVOU X0, (AX)
12950 MOVOU X1, 16(AX)
12951 MOVOU X2, -32(AX)(R8*1)
12952 MOVOU X3, -16(AX)(R8*1)
12953
12954memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
12955 MOVQ DI, AX // advance the output pointer past the copied literals
12956 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B
12957
// Long literal copy: R8 bytes from (SI) to (AX); the header code only sends
// runs longer than 64 bytes here. DI = AX + R8 is the output pointer after
// the copy. The first and last 32 source bytes are loaded into X0-X3 up
// front and stored last with unaligned moves; the middle is copied in
// 32-byte steps with aligned stores (MOVOA), starting at offset
// R11 = 64 - (AX & 31).
12958memmove_long_match_emit_encodeSnappyBlockAsm12B:
12959 LEAQ (AX)(R8*1), DI
12960
12961 // genMemMoveLong
12962 MOVOU (SI), X0
12963 MOVOU 16(SI), X1
12964 MOVOU -32(SI)(R8*1), X2
12965 MOVOU -16(SI)(R8*1), X3
12966 MOVQ R8, R10
12967 SHRQ $0x05, R10 // R10 = length / 32
12968 MOVQ AX, R9
12969 ANDL $0x0000001f, R9 // R9 = AX & 31
12970 MOVQ $0x00000040, R11
12971 SUBQ R9, R11 // R11 = 64 - (AX & 31)
12972 DECQ R10
12973 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
12974 LEAQ -32(SI)(R11*1), R9
12975 LEAQ -32(AX)(R11*1), R12
12976
 // Path taken when the JA above falls through: 32 bytes per round from (R9) to (R12).
12977emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
12978 MOVOU (R9), X4
12979 MOVOU 16(R9), X5
12980 MOVOA X4, (R12)
12981 MOVOA X5, 16(R12)
12982 ADDQ $0x20, R12
12983 ADDQ $0x20, R9
12984 ADDQ $0x20, R11
12985 DECQ R10
12986 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
12987
 // Copy the 32 bytes ending at offset R11, then advance R11 by 32; repeat while R8 >= R11.
12988emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
12989 MOVOU -32(SI)(R11*1), X4
12990 MOVOU -16(SI)(R11*1), X5
12991 MOVOA X4, -32(AX)(R11*1)
12992 MOVOA X5, -16(AX)(R11*1)
12993 ADDQ $0x20, R11
12994 CMPQ R8, R11
12995 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
 // Store the head and tail saved at the top, then advance the output pointer.
12996 MOVOU X0, (AX)
12997 MOVOU X1, 16(AX)
12998 MOVOU X2, -32(AX)(R8*1)
12999 MOVOU X3, -16(AX)(R8*1)
13000 MOVQ DI, AX
13001
// Match with no pending literals. CX is the current position, BX the
// candidate position. Saves the offset CX - BX at 16(SP), steps both
// positions over the 4 bytes the candidate check already compared, then
// counts in R9 how many further bytes agree.
13002emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
13003match_nolit_loop_encodeSnappyBlockAsm12B:
13004 MOVL CX, SI
13005 SUBL BX, SI
13006 MOVL SI, 16(SP) // 16(SP) = offset = CX - BX
13007 ADDL $0x04, CX
13008 ADDL $0x04, BX
13009 MOVQ src_len+32(FP), SI
13010 SUBL CX, SI // SI = bytes left after CX
13011 LEAQ (DX)(CX*1), DI // DI = address of the current position
13012 LEAQ (DX)(BX*1), BX // BX = address of the candidate
13013
13014 // matchLen
13015 XORL R9, R9 // R9 = matched byte count
13016
 // 16 bytes per round: XOR two 8-byte words; a non-zero result marks the first difference.
13017matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B:
13018 CMPL SI, $0x10
13019 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm12B
13020 MOVQ (DI)(R9*1), R8
13021 MOVQ 8(DI)(R9*1), R10
13022 XORQ (BX)(R9*1), R8
13023 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
13024 XORQ 8(BX)(R9*1), R10
13025 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B
13026 LEAL -16(SI), SI
13027 LEAL 16(R9), R9
13028 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B
13029
 // Difference in the second word: index of the lowest set bit / 8 = equal bytes in it, plus 8.
13030matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B:
13031#ifdef GOAMD64_v3
13032 TZCNTQ R10, R10
13033
13034#else
13035 BSFQ R10, R10
13036
13037#endif
13038 SARQ $0x03, R10
13039 LEAL 8(R9)(R10*1), R9
13040 JMP match_nolit_end_encodeSnappyBlockAsm12B
13041
 // Fewer than 16 bytes left: try one 8-byte word.
13042matchlen_match8_match_nolit_encodeSnappyBlockAsm12B:
13043 CMPL SI, $0x08
13044 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
13045 MOVQ (DI)(R9*1), R8
13046 XORQ (BX)(R9*1), R8
13047 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
13048 LEAL -8(SI), SI
13049 LEAL 8(R9), R9
13050 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
13051
 // Difference in the first word: index of the lowest set bit / 8 = equal bytes in it.
13052matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B:
13053#ifdef GOAMD64_v3
13054 TZCNTQ R8, R8
13055
13056#else
13057 BSFQ R8, R8
13058
13059#endif
13060 SARQ $0x03, R8
13061 LEAL (R9)(R8*1), R9
13062 JMP match_nolit_end_encodeSnappyBlockAsm12B
13063
 // Tail: compare 4, then 2, then 1 byte, as far as the remaining count in SI allows.
13064matchlen_match4_match_nolit_encodeSnappyBlockAsm12B:
13065 CMPL SI, $0x04
13066 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
13067 MOVL (DI)(R9*1), R8
13068 CMPL (BX)(R9*1), R8
13069 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
13070 LEAL -4(SI), SI
13071 LEAL 4(R9), R9
13072
13073matchlen_match2_match_nolit_encodeSnappyBlockAsm12B:
13074 CMPL SI, $0x01
13075 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B // exactly one byte left
13076 JB match_nolit_end_encodeSnappyBlockAsm12B // nothing left
13077 MOVW (DI)(R9*1), R8
13078 CMPW (BX)(R9*1), R8
13079 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
13080 LEAL 2(R9), R9
13081 SUBL $0x02, SI
13082 JZ match_nolit_end_encodeSnappyBlockAsm12B
13083
13084matchlen_match1_match_nolit_encodeSnappyBlockAsm12B:
13085 MOVB (DI)(R9*1), R8
13086 CMPB (BX)(R9*1), R8
13087 JNE match_nolit_end_encodeSnappyBlockAsm12B
13088 LEAL 1(R9), R9
13089
13090match_nolit_end_encodeSnappyBlockAsm12B:
13091 ADDL R9, CX
13092 MOVL 16(SP), BX
13093 ADDL $0x04, R9
13094 MOVL CX, 12(SP)
13095
13096 // emitCopy
13097two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
13098 CMPL R9, $0x40
13099 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
13100 MOVB $0xee, (AX)
13101 MOVW BX, 1(AX)
13102 LEAL -60(R9), R9
13103 ADDQ $0x03, AX
13104 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
13105
13106two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
13107 MOVL R9, SI
13108 SHLL $0x02, SI
13109 CMPL R9, $0x0c
13110 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
13111 CMPL BX, $0x00000800
13112 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
13113 LEAL -15(SI), SI
13114 MOVB BL, 1(AX)
13115 SHRL $0x08, BX
13116 SHLL $0x05, BX
13117 ORL BX, SI
13118 MOVB SI, (AX)
13119 ADDQ $0x02, AX
13120 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
13121
13122emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
13123 LEAL -2(SI), SI
13124 MOVB SI, (AX)
13125 MOVW BX, 1(AX)
13126 ADDQ $0x03, AX
13127
13128match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
13129 CMPL CX, 8(SP)
13130 JAE emit_remainder_encodeSnappyBlockAsm12B
13131 MOVQ -2(DX)(CX*1), SI
13132 CMPQ AX, (SP)
13133 JB match_nolit_dst_ok_encodeSnappyBlockAsm12B
13134 MOVQ $0x00000000, ret+48(FP)
13135 RET
13136
13137match_nolit_dst_ok_encodeSnappyBlockAsm12B:
13138 MOVQ $0x000000cf1bbcdcbb, R8
13139 MOVQ SI, DI
13140 SHRQ $0x10, SI
13141 MOVQ SI, BX
13142 SHLQ $0x18, DI
13143 IMULQ R8, DI
13144 SHRQ $0x34, DI
13145 SHLQ $0x18, BX
13146 IMULQ R8, BX
13147 SHRQ $0x34, BX
13148 LEAL -2(CX), R8
13149 LEAQ 24(SP)(BX*4), R9
13150 MOVL (R9), BX
13151 MOVL R8, 24(SP)(DI*4)
13152 MOVL CX, (R9)
13153 CMPL (DX)(BX*1), SI
13154 JEQ match_nolit_loop_encodeSnappyBlockAsm12B
13155 INCL CX
13156 JMP search_loop_encodeSnappyBlockAsm12B
13157
13158emit_remainder_encodeSnappyBlockAsm12B:
13159 MOVQ src_len+32(FP), CX
13160 SUBL 12(SP), CX
13161 LEAQ 3(AX)(CX*1), CX
13162 CMPQ CX, (SP)
13163 JB emit_remainder_ok_encodeSnappyBlockAsm12B
13164 MOVQ $0x00000000, ret+48(FP)
13165 RET
13166
13167emit_remainder_ok_encodeSnappyBlockAsm12B:
13168 MOVQ src_len+32(FP), CX
13169 MOVL 12(SP), BX
13170 CMPL BX, CX
13171 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
13172 MOVL CX, SI
13173 MOVL CX, 12(SP)
13174 LEAQ (DX)(BX*1), CX
13175 SUBL BX, SI
13176 LEAL -1(SI), DX
13177 CMPL DX, $0x3c
13178 JB one_byte_emit_remainder_encodeSnappyBlockAsm12B
13179 CMPL DX, $0x00000100
13180 JB two_bytes_emit_remainder_encodeSnappyBlockAsm12B
13181 JB three_bytes_emit_remainder_encodeSnappyBlockAsm12B
13182
13183three_bytes_emit_remainder_encodeSnappyBlockAsm12B:
13184 MOVB $0xf4, (AX)
13185 MOVW DX, 1(AX)
13186 ADDQ $0x03, AX
13187 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
13188
13189two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
13190 MOVB $0xf0, (AX)
13191 MOVB DL, 1(AX)
13192 ADDQ $0x02, AX
13193 CMPL DX, $0x40
13194 JB memmove_emit_remainder_encodeSnappyBlockAsm12B
13195 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
13196
13197one_byte_emit_remainder_encodeSnappyBlockAsm12B:
13198 SHLB $0x02, DL
13199 MOVB DL, (AX)
13200 ADDQ $0x01, AX
13201
13202memmove_emit_remainder_encodeSnappyBlockAsm12B:
13203 LEAQ (AX)(SI*1), DX
13204 MOVL SI, BX
13205
13206 // genMemMoveShort
13207 CMPQ BX, $0x03
13208 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2
13209 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3
13210 CMPQ BX, $0x08
13211 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7
13212 CMPQ BX, $0x10
13213 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
13214 CMPQ BX, $0x20
13215 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
13216 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64
13217
13218emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2:
13219 MOVB (CX), SI
13220 MOVB -1(CX)(BX*1), CL
13221 MOVB SI, (AX)
13222 MOVB CL, -1(AX)(BX*1)
13223 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13224
13225emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3:
13226 MOVW (CX), SI
13227 MOVB 2(CX), CL
13228 MOVW SI, (AX)
13229 MOVB CL, 2(AX)
13230 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13231
13232emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7:
13233 MOVL (CX), SI
13234 MOVL -4(CX)(BX*1), CX
13235 MOVL SI, (AX)
13236 MOVL CX, -4(AX)(BX*1)
13237 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13238
13239emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
13240 MOVQ (CX), SI
13241 MOVQ -8(CX)(BX*1), CX
13242 MOVQ SI, (AX)
13243 MOVQ CX, -8(AX)(BX*1)
13244 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13245
13246emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
13247 MOVOU (CX), X0
13248 MOVOU -16(CX)(BX*1), X1
13249 MOVOU X0, (AX)
13250 MOVOU X1, -16(AX)(BX*1)
13251 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13252
13253emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
13254 MOVOU (CX), X0
13255 MOVOU 16(CX), X1
13256 MOVOU -32(CX)(BX*1), X2
13257 MOVOU -16(CX)(BX*1), X3
13258 MOVOU X0, (AX)
13259 MOVOU X1, 16(AX)
13260 MOVOU X2, -32(AX)(BX*1)
13261 MOVOU X3, -16(AX)(BX*1)
13262
13263memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
13264 MOVQ DX, AX
13265 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
13266
13267memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
13268 LEAQ (AX)(SI*1), DX
13269 MOVL SI, BX
13270
13271 // genMemMoveLong
13272 MOVOU (CX), X0
13273 MOVOU 16(CX), X1
13274 MOVOU -32(CX)(BX*1), X2
13275 MOVOU -16(CX)(BX*1), X3
13276 MOVQ BX, DI
13277 SHRQ $0x05, DI
13278 MOVQ AX, SI
13279 ANDL $0x0000001f, SI
13280 MOVQ $0x00000040, R8
13281 SUBQ SI, R8
13282 DECQ DI
13283 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
13284 LEAQ -32(CX)(R8*1), SI
13285 LEAQ -32(AX)(R8*1), R9
13286
13287emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
13288 MOVOU (SI), X4
13289 MOVOU 16(SI), X5
13290 MOVOA X4, (R9)
13291 MOVOA X5, 16(R9)
13292 ADDQ $0x20, R9
13293 ADDQ $0x20, SI
13294 ADDQ $0x20, R8
13295 DECQ DI
13296 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back
13297
13298emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
13299 MOVOU -32(CX)(R8*1), X4
13300 MOVOU -16(CX)(R8*1), X5
13301 MOVOA X4, -32(AX)(R8*1)
13302 MOVOA X5, -16(AX)(R8*1)
13303 ADDQ $0x20, R8
13304 CMPQ BX, R8
13305 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
13306 MOVOU X0, (AX)
13307 MOVOU X1, 16(AX)
13308 MOVOU X2, -32(AX)(BX*1)
13309 MOVOU X3, -16(AX)(BX*1)
13310 MOVQ DX, AX
13311
13312emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
13313 MOVQ dst_base+0(FP), CX
13314 SUBQ CX, AX
13315 MOVQ AX, ret+48(FP)
13316 RET
13317
13318// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
13319// Requires: BMI, SSE2
//
// Encodes src into dst and returns the number of bytes written to dst, or 0
// when the output would reach the internally computed dst limit (see (SP)).
//
// Frame layout as used by the code below:
//   0(SP)   8 bytes  dst limit pointer: dst_base + (src_len - 9 - src_len>>5)
//   8(SP)   4 bytes  src_len - 8; the search loop stops once the next probe
//                    position reaches this value
//   12(SP)  4 bytes  start of the not-yet-emitted literal run in src
//   16(SP)  4 bytes  offset of the most recent match (initialised to 1)
//   20(SP)  4 bytes  next position to probe when nothing matches
//   24(SP)  4096 B   hash table: 1024 uint32 source positions, indexed by a
//                    10-bit hash of 4 source bytes
//
// Long-lived registers: AX = dst write pointer, DX = src base pointer,
// CX = current source position. Everything else is scratch.
// NOTE(review): no bounds checks on dst/src are visible here; presumably the
// Go caller guarantees the required slice sizes - confirm against caller.
13320TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56
13321 MOVQ dst_base+0(FP), AX
13322 MOVQ $0x00000020, CX
13323 LEAQ 24(SP), DX
13324 PXOR X0, X0
13325
// Clear the hash table: 32 iterations x 128 bytes = 4096 bytes at 24(SP).
13326zero_loop_encodeSnappyBlockAsm10B:
13327 MOVOU X0, (DX)
13328 MOVOU X0, 16(DX)
13329 MOVOU X0, 32(DX)
13330 MOVOU X0, 48(DX)
13331 MOVOU X0, 64(DX)
13332 MOVOU X0, 80(DX)
13333 MOVOU X0, 96(DX)
13334 MOVOU X0, 112(DX)
13335 ADDQ $0x80, DX
13336 DECQ CX
13337 JNZ zero_loop_encodeSnappyBlockAsm10B
 // Initialise the stack slots described in the header: pending-literal
 // start = 0, search limit = src_len-8, dst limit, match offset = 1.
13338 MOVL $0x00000000, 12(SP)
13339 MOVQ src_len+32(FP), CX
13340 LEAQ -9(CX), DX
13341 LEAQ -8(CX), BX
13342 MOVL BX, 8(SP)
13343 SHRQ $0x05, CX
13344 SUBL CX, DX
13345 LEAQ (AX)(DX*1), DX
13346 MOVQ DX, (SP)
 // The search starts at source position 1.
13347 MOVL $0x00000001, CX
13348 MOVL CX, 16(SP)
13349 MOVQ src_base+24(FP), DX
13350
// Main loop. Each iteration hashes the 4-byte windows at CX, CX+1 and CX+2,
// records CX and CX+1 in the table, and tests (in order) a repeat of the
// previous offset at CX+1, then the table candidates for CX, CX+1 and CX+2.
13351search_loop_encodeSnappyBlockAsm10B:
 // Next probe position = CX + 4 + (CX - literalStart)>>5, so the stride grows
 // as the unmatched run gets longer. Saved in 20(SP).
13352 MOVL CX, BX
13353 SUBL 12(SP), BX
13354 SHRL $0x05, BX
13355 LEAL 4(CX)(BX*1), BX
13356 CMPL BX, 8(SP)
13357 JAE emit_remainder_encodeSnappyBlockAsm10B
13358 MOVQ (DX)(CX*1), SI
13359 MOVL BX, 20(SP)
 // Hash: (value << 32) * 0x9e3779b1 >> 54. The left shift discards the upper
 // four bytes, so only 4 bytes are hashed; the result is a 10-bit index.
 // R9 = hash of bytes at CX, R10 = hash of bytes at CX+1.
13360 MOVQ $0x9e3779b1, R8
13361 MOVQ SI, R9
13362 MOVQ SI, R10
13363 SHRQ $0x08, R10
13364 SHLQ $0x20, R9
13365 IMULQ R8, R9
13366 SHRQ $0x36, R9
13367 SHLQ $0x20, R10
13368 IMULQ R8, R10
13369 SHRQ $0x36, R10
 // BX / DI = previous table entries (candidates) for CX / CX+1; the table is
 // then updated with CX and CX+1.
13370 MOVL 24(SP)(R9*4), BX
13371 MOVL 24(SP)(R10*4), DI
13372 MOVL CX, 24(SP)(R9*4)
13373 LEAL 1(CX), R9
13374 MOVL R9, 24(SP)(R10*4)
 // R9 = hash of the bytes at CX+2 (looked up later, only if needed).
13375 MOVQ SI, R9
13376 SHRQ $0x10, R9
13377 SHLQ $0x20, R9
13378 IMULQ R8, R9
13379 SHRQ $0x36, R9
 // Repeat check: compare the 4 bytes at CX+1 with the 4 bytes at
 // CX+1-offset, where offset is the last match offset in 16(SP).
13380 MOVL CX, R8
13381 SUBL 16(SP), R8
13382 MOVL 1(DX)(R8*1), R10
13383 MOVQ SI, R8
13384 SHRQ $0x08, R8
13385 CMPL R8, R10
13386 JNE no_repeat_found_encodeSnappyBlockAsm10B
 // Repeat found at CX+1. SI = match start, DI = SI - offset (the earlier
 // copy), BX = start of pending literals.
13387 LEAL 1(CX), SI
13388 MOVL 12(SP), BX
13389 MOVL SI, DI
13390 SUBL 16(SP), DI
13391 JZ repeat_extend_back_end_encodeSnappyBlockAsm10B
13392
// Extend the repeat backwards one byte at a time while the preceding bytes
// agree, SI stays above the pending-literal start and DI stays above 0.
13393repeat_extend_back_loop_encodeSnappyBlockAsm10B:
13394 CMPL SI, BX
13395 JBE repeat_extend_back_end_encodeSnappyBlockAsm10B
13396 MOVB -1(DX)(DI*1), R8
13397 MOVB -1(DX)(SI*1), R9
13398 CMPB R8, R9
13399 JNE repeat_extend_back_end_encodeSnappyBlockAsm10B
13400 LEAL -1(SI), SI
13401 DECL DI
13402 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B
13403
13404repeat_extend_back_end_encodeSnappyBlockAsm10B:
 // Output budget check: dst + 3 + pending literal length must stay below the
 // dst limit in (SP); otherwise give up and return 0.
13405 MOVL SI, BX
13406 SUBL 12(SP), BX
13407 LEAQ 3(AX)(BX*1), BX
13408 CMPQ BX, (SP)
13409 JB repeat_dst_size_check_encodeSnappyBlockAsm10B
13410 MOVQ $0x00000000, ret+48(FP)
13411 RET
13412
// Emit the pending literals src[12(SP):SI] (skipped when empty).
// DI = literal length, BX = length-1, R8 = pointer to the first literal byte.
13413repeat_dst_size_check_encodeSnappyBlockAsm10B:
13414 MOVL 12(SP), BX
13415 CMPL BX, SI
13416 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
13417 MOVL SI, DI
13418 MOVL SI, 12(SP)
13419 LEAQ (DX)(BX*1), R8
13420 SUBL BX, DI
13421 LEAL -1(DI), BX
 // Header size depends on length-1: < 60 -> 1 byte, < 256 -> 2 bytes,
 // otherwise 3 bytes. The second JB can never be taken after the first and
 // control simply falls through to the three-byte case.
13422 CMPL BX, $0x3c
13423 JB one_byte_repeat_emit_encodeSnappyBlockAsm10B
13424 CMPL BX, $0x00000100
13425 JB two_bytes_repeat_emit_encodeSnappyBlockAsm10B
13426 JB three_bytes_repeat_emit_encodeSnappyBlockAsm10B
13427
13428three_bytes_repeat_emit_encodeSnappyBlockAsm10B:
 // Tag byte 0xf4 followed by the low 16 bits of length-1.
13429 MOVB $0xf4, (AX)
13430 MOVW BX, 1(AX)
13431 ADDQ $0x03, AX
13432 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
13433
13434two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
 // Tag byte 0xf0 followed by length-1 in one byte. Lengths up to 64 use the
 // short copy, longer ones the long copy.
13435 MOVB $0xf0, (AX)
13436 MOVB BL, 1(AX)
13437 ADDQ $0x02, AX
13438 CMPL BX, $0x40
13439 JB memmove_repeat_emit_encodeSnappyBlockAsm10B
13440 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
13441
13442one_byte_repeat_emit_encodeSnappyBlockAsm10B:
 // Single tag byte: (length-1) << 2.
13443 SHLB $0x02, BL
13444 MOVB BL, (AX)
13445 ADDQ $0x01, AX
13446
13447memmove_repeat_emit_encodeSnappyBlockAsm10B:
 // BX = dst pointer after the literal bytes; becomes AX once copied.
13448 LEAQ (AX)(DI*1), BX
13449
13450 // genMemMoveShort
 // Copy of at most 64 bytes. Each size class loads the head and the tail of
 // the range (possibly overlapping) and then stores both. The <= 8 case
 // always moves a full 8 bytes regardless of the exact length.
13451 CMPQ DI, $0x08
13452 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
13453 CMPQ DI, $0x10
13454 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
13455 CMPQ DI, $0x20
13456 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
13457 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
13458
13459emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
13460 MOVQ (R8), R9
13461 MOVQ R9, (AX)
13462 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
13463
13464emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
13465 MOVQ (R8), R9
13466 MOVQ -8(R8)(DI*1), R8
13467 MOVQ R9, (AX)
13468 MOVQ R8, -8(AX)(DI*1)
13469 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
13470
13471emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
13472 MOVOU (R8), X0
13473 MOVOU -16(R8)(DI*1), X1
13474 MOVOU X0, (AX)
13475 MOVOU X1, -16(AX)(DI*1)
13476 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
13477
13478emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
13479 MOVOU (R8), X0
13480 MOVOU 16(R8), X1
13481 MOVOU -32(R8)(DI*1), X2
13482 MOVOU -16(R8)(DI*1), X3
13483 MOVOU X0, (AX)
13484 MOVOU X1, 16(AX)
13485 MOVOU X2, -32(AX)(DI*1)
13486 MOVOU X3, -16(AX)(DI*1)
13487
13488memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
13489 MOVQ BX, AX
13490 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
13491
13492memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
13493 LEAQ (AX)(DI*1), BX
13494
13495 // genMemMoveLong
 // Copy of more than 64 bytes. X0..X3 keep the first and last 32 source
 // bytes and are stored last with unaligned moves. The middle is copied in
 // 32-byte steps starting at R11 = 64 - (AX & 31), so the MOVOA stores at
 // -32(AX)(R11*1) land on a 32-byte-aligned destination address.
13496 MOVOU (R8), X0
13497 MOVOU 16(R8), X1
13498 MOVOU -32(R8)(DI*1), X2
13499 MOVOU -16(R8)(DI*1), X3
13500 MOVQ DI, R10
13501 SHRQ $0x05, R10
13502 MOVQ AX, R9
13503 ANDL $0x0000001f, R9
13504 MOVQ $0x00000040, R11
13505 SUBQ R9, R11
 // NOTE(review): DECQ leaves CF untouched, so this JA (and the JNA below)
 // also depends on CF from the preceding SUBQ/ADDQ. With length >= 65 the JA
 // looks always taken, which would make the big_loop_back block unreachable
 // on this path - confirm against the generator.
13506 DECQ R10
13507 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
13508 LEAQ -32(R8)(R11*1), R9
13509 LEAQ -32(AX)(R11*1), R12
13510
13511emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
13512 MOVOU (R9), X4
13513 MOVOU 16(R9), X5
13514 MOVOA X4, (R12)
13515 MOVOA X5, 16(R12)
13516 ADDQ $0x20, R12
13517 ADDQ $0x20, R9
13518 ADDQ $0x20, R11
13519 DECQ R10
13520 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
13521
// Copy 32 bytes per iteration while length (DI) >= R11.
13522emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
13523 MOVOU -32(R8)(R11*1), X4
13524 MOVOU -16(R8)(R11*1), X5
13525 MOVOA X4, -32(AX)(R11*1)
13526 MOVOA X5, -16(AX)(R11*1)
13527 ADDQ $0x20, R11
13528 CMPQ DI, R11
13529 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
 // Finish with the saved head and tail, then advance the dst pointer.
13530 MOVOU X0, (AX)
13531 MOVOU X1, 16(AX)
13532 MOVOU X2, -32(AX)(DI*1)
13533 MOVOU X3, -16(AX)(DI*1)
13534 MOVQ BX, AX
13535
// Literals are out. Skip the 4 bytes already known to match at CX+1
// (CX += 5) and measure how much further the repeat extends.
// R8 = pointer at the current position, BX = pointer at position - offset,
// DI = bytes left in src, R10 = number of additional matching bytes.
13536emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
13537 ADDL $0x05, CX
13538 MOVL CX, BX
13539 SUBL 16(SP), BX
13540 MOVQ src_len+32(FP), DI
13541 SUBL CX, DI
13542 LEAQ (DX)(CX*1), R8
13543 LEAQ (DX)(BX*1), BX
13544
13545 // matchLen
13546 XORL R10, R10
13547
// Compare 16 bytes per iteration while at least 16 bytes remain. On the first
// differing 8-byte word, the XOR result is non-zero and the bit-scan blocks
// turn the index of its lowest set bit into a byte count.
13548matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B:
13549 CMPL DI, $0x10
13550 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B
13551 MOVQ (R8)(R10*1), R9
13552 MOVQ 8(R8)(R10*1), R11
13553 XORQ (BX)(R10*1), R9
13554 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B
13555 XORQ 8(BX)(R10*1), R11
13556 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B
13557 LEAL -16(DI), DI
13558 LEAL 16(R10), R10
13559 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B
13560
// Mismatch in the second word: length += 8 + (lowest set bit index >> 3).
13561matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B:
13562#ifdef GOAMD64_v3
13563 TZCNTQ R11, R11
13564
13565#else
13566 BSFQ R11, R11
13567
13568#endif
13569 SARQ $0x03, R11
13570 LEAL 8(R10)(R11*1), R10
13571 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
13572
13573matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B:
13574 CMPL DI, $0x08
13575 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
13576 MOVQ (R8)(R10*1), R9
13577 XORQ (BX)(R10*1), R9
13578 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B
13579 LEAL -8(DI), DI
13580 LEAL 8(R10), R10
13581 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
13582
// Mismatch in the first word: length += lowest set bit index >> 3.
13583matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B:
13584#ifdef GOAMD64_v3
13585 TZCNTQ R9, R9
13586
13587#else
13588 BSFQ R9, R9
13589
13590#endif
13591 SARQ $0x03, R9
13592 LEAL (R10)(R9*1), R10
13593 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
13594
// Tail handling for fewer than 8 remaining bytes: try 4, then 2, then 1.
13595matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B:
13596 CMPL DI, $0x04
13597 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
13598 MOVL (R8)(R10*1), R9
13599 CMPL (BX)(R10*1), R9
13600 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
13601 LEAL -4(DI), DI
13602 LEAL 4(R10), R10
13603
13604matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B:
13605 CMPL DI, $0x01
13606 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
13607 JB repeat_extend_forward_end_encodeSnappyBlockAsm10B
13608 MOVW (R8)(R10*1), R9
13609 CMPW (BX)(R10*1), R9
13610 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
13611 LEAL 2(R10), R10
13612 SUBL $0x02, DI
13613 JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B
13614
13615matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B:
13616 MOVB (R8)(R10*1), R9
13617 CMPB (BX)(R10*1), R9
13618 JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B
13619 LEAL 1(R10), R10
13620
13621repeat_extend_forward_end_encodeSnappyBlockAsm10B:
 // CX = end of match; BX = total match length (end - start in SI);
 // SI = offset reloaded from 16(SP).
13622 ADDL R10, CX
13623 MOVL CX, BX
13624 SUBL SI, BX
13625 MOVL 16(SP), SI
13626
13627 // emitCopy
// While the length exceeds 64, emit a 3-byte element (tag 0xee + 16-bit
// offset) and subtract 60 from the remaining length.
13628two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
13629 CMPL BX, $0x40
13630 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
13631 MOVB $0xee, (AX)
13632 MOVW SI, 1(AX)
13633 LEAL -60(BX), BX
13634 ADDQ $0x03, AX
13635 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B
13636
13637two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
 // DI = length << 2. Lengths >= 12 or offsets >= 2048 take the 3-byte form.
 // Otherwise a 2-byte form is written: tag = (length<<2 - 15) | (offset>>8)<<5
 // followed by the low offset byte.
13638 MOVL BX, DI
13639 SHLL $0x02, DI
13640 CMPL BX, $0x0c
13641 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
13642 CMPL SI, $0x00000800
13643 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
13644 LEAL -15(DI), DI
13645 MOVB SI, 1(AX)
13646 SHRL $0x08, SI
13647 SHLL $0x05, SI
13648 ORL SI, DI
13649 MOVB DI, (AX)
13650 ADDQ $0x02, AX
13651 JMP repeat_end_emit_encodeSnappyBlockAsm10B
13652
13653emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
 // 3-byte form: tag = length<<2 - 2, then the 16-bit offset.
13654 LEAL -2(DI), DI
13655 MOVB DI, (AX)
13656 MOVW SI, 1(AX)
13657 ADDQ $0x03, AX
13658
13659repeat_end_emit_encodeSnappyBlockAsm10B:
 // Everything up to CX has been emitted; resume searching from CX.
13660 MOVL CX, 12(SP)
13661 JMP search_loop_encodeSnappyBlockAsm10B
13662
// No repeat. Check the table candidates by comparing 4 source bytes:
// BX (candidate for CX), then DI (candidate for CX+1), then the table entry
// for the CX+2 hash in R9. The table entry for R9 is set to CX+2 on the
// second and third probes.
13663no_repeat_found_encodeSnappyBlockAsm10B:
13664 CMPL (DX)(BX*1), SI
13665 JEQ candidate_match_encodeSnappyBlockAsm10B
13666 SHRQ $0x08, SI
13667 MOVL 24(SP)(R9*4), BX
13668 LEAL 2(CX), R8
13669 CMPL (DX)(DI*1), SI
13670 JEQ candidate2_match_encodeSnappyBlockAsm10B
13671 MOVL R8, 24(SP)(R9*4)
13672 SHRQ $0x08, SI
13673 CMPL (DX)(BX*1), SI
13674 JEQ candidate3_match_encodeSnappyBlockAsm10B
 // Nothing matched: jump ahead to the precomputed next probe position.
13675 MOVL 20(SP), CX
13676 JMP search_loop_encodeSnappyBlockAsm10B
13677
13678candidate3_match_encodeSnappyBlockAsm10B:
 // Match found at CX+2.
13679 ADDL $0x02, CX
13680 JMP candidate_match_encodeSnappyBlockAsm10B
13681
13682candidate2_match_encodeSnappyBlockAsm10B:
 // Match found at CX+1; the candidate position moves from DI into BX.
13683 MOVL R8, 24(SP)(R9*4)
13684 INCL CX
13685 MOVL DI, BX
13686
// CX = match position, BX = candidate position. Extend both backwards while
// the preceding bytes agree, CX stays above the pending-literal start (SI)
// and BX stays above 0.
13687candidate_match_encodeSnappyBlockAsm10B:
13688 MOVL 12(SP), SI
13689 TESTL BX, BX
13690 JZ match_extend_back_end_encodeSnappyBlockAsm10B
13691
13692match_extend_back_loop_encodeSnappyBlockAsm10B:
13693 CMPL CX, SI
13694 JBE match_extend_back_end_encodeSnappyBlockAsm10B
13695 MOVB -1(DX)(BX*1), DI
13696 MOVB -1(DX)(CX*1), R8
13697 CMPB DI, R8
13698 JNE match_extend_back_end_encodeSnappyBlockAsm10B
13699 LEAL -1(CX), CX
13700 DECL BX
13701 JZ match_extend_back_end_encodeSnappyBlockAsm10B
13702 JMP match_extend_back_loop_encodeSnappyBlockAsm10B
13703
13704match_extend_back_end_encodeSnappyBlockAsm10B:
 // Output budget check: dst + 3 + pending literal length must stay below the
 // dst limit in (SP); otherwise return 0.
13705 MOVL CX, SI
13706 SUBL 12(SP), SI
13707 LEAQ 3(AX)(SI*1), SI
13708 CMPQ SI, (SP)
13709 JB match_dst_size_check_encodeSnappyBlockAsm10B
13710 MOVQ $0x00000000, ret+48(FP)
13711 RET
13712
// Emit the pending literals src[12(SP):CX] (skipped when empty).
// R8 = literal length, DI = length-1, SI = pointer to the first literal byte.
// Header selection mirrors the repeat path: 1, 2 or 3 header bytes.
13713match_dst_size_check_encodeSnappyBlockAsm10B:
13714 MOVL CX, SI
13715 MOVL 12(SP), DI
13716 CMPL DI, SI
13717 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B
13718 MOVL SI, R8
13719 MOVL SI, 12(SP)
13720 LEAQ (DX)(DI*1), SI
13721 SUBL DI, R8
13722 LEAL -1(R8), DI
13723 CMPL DI, $0x3c
13724 JB one_byte_match_emit_encodeSnappyBlockAsm10B
13725 CMPL DI, $0x00000100
13726 JB two_bytes_match_emit_encodeSnappyBlockAsm10B
13727 JB three_bytes_match_emit_encodeSnappyBlockAsm10B
13728
13729three_bytes_match_emit_encodeSnappyBlockAsm10B:
13730 MOVB $0xf4, (AX)
13731 MOVW DI, 1(AX)
13732 ADDQ $0x03, AX
13733 JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
13734
13735two_bytes_match_emit_encodeSnappyBlockAsm10B:
13736 MOVB $0xf0, (AX)
13737 MOVB DI, 1(AX)
13738 ADDQ $0x02, AX
13739 CMPL DI, $0x40
13740 JB memmove_match_emit_encodeSnappyBlockAsm10B
13741 JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
13742
13743one_byte_match_emit_encodeSnappyBlockAsm10B:
13744 SHLB $0x02, DI
13745 MOVB DI, (AX)
13746 ADDQ $0x01, AX
13747
13748memmove_match_emit_encodeSnappyBlockAsm10B:
 // DI = dst pointer after the literal bytes; becomes AX once copied.
13749 LEAQ (AX)(R8*1), DI
13750
13751 // genMemMoveShort
 // Copy of at most 64 bytes from SI to AX, dispatched by size class; the
 // <= 8 case always moves a full 8 bytes.
13752 CMPQ R8, $0x08
13753 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
13754 CMPQ R8, $0x10
13755 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
13756 CMPQ R8, $0x20
13757 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
13758 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
13759
13760emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
13761 MOVQ (SI), R9
13762 MOVQ R9, (AX)
13763 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
13764
13765emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
13766 MOVQ (SI), R9
13767 MOVQ -8(SI)(R8*1), SI
13768 MOVQ R9, (AX)
13769 MOVQ SI, -8(AX)(R8*1)
13770 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
13771
13772emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
13773 MOVOU (SI), X0
13774 MOVOU -16(SI)(R8*1), X1
13775 MOVOU X0, (AX)
13776 MOVOU X1, -16(AX)(R8*1)
13777 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
13778
13779emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
13780 MOVOU (SI), X0
13781 MOVOU 16(SI), X1
13782 MOVOU -32(SI)(R8*1), X2
13783 MOVOU -16(SI)(R8*1), X3
13784 MOVOU X0, (AX)
13785 MOVOU X1, 16(AX)
13786 MOVOU X2, -32(AX)(R8*1)
13787 MOVOU X3, -16(AX)(R8*1)
13788
13789memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
13790 MOVQ DI, AX
13791 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B
13792
13793memmove_long_match_emit_encodeSnappyBlockAsm10B:
13794 LEAQ (AX)(R8*1), DI
13795
13796 // genMemMoveLong
 // Copy of more than 64 bytes from SI to AX. X0..X3 keep the first and last
 // 32 source bytes (stored last); the middle is copied 32 bytes at a time
 // with aligned stores, starting at R11 = 64 - (AX & 31).
13797 MOVOU (SI), X0
13798 MOVOU 16(SI), X1
13799 MOVOU -32(SI)(R8*1), X2
13800 MOVOU -16(SI)(R8*1), X3
13801 MOVQ R8, R10
13802 SHRQ $0x05, R10
13803 MOVQ AX, R9
13804 ANDL $0x0000001f, R9
13805 MOVQ $0x00000040, R11
13806 SUBQ R9, R11
13807 DECQ R10
13808 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
13809 LEAQ -32(SI)(R11*1), R9
13810 LEAQ -32(AX)(R11*1), R12
13811
13812emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
13813 MOVOU (R9), X4
13814 MOVOU 16(R9), X5
13815 MOVOA X4, (R12)
13816 MOVOA X5, 16(R12)
13817 ADDQ $0x20, R12
13818 ADDQ $0x20, R9
13819 ADDQ $0x20, R11
13820 DECQ R10
13821 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
13822
13823emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
13824 MOVOU -32(SI)(R11*1), X4
13825 MOVOU -16(SI)(R11*1), X5
13826 MOVOA X4, -32(AX)(R11*1)
13827 MOVOA X5, -16(AX)(R11*1)
13828 ADDQ $0x20, R11
13829 CMPQ R8, R11
13830 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
13831 MOVOU X0, (AX)
13832 MOVOU X1, 16(AX)
13833 MOVOU X2, -32(AX)(R8*1)
13834 MOVOU X3, -16(AX)(R8*1)
13835 MOVQ DI, AX
13836
// A match with no pending literals. On entry CX = match position and
// BX = candidate position. The offset CX - BX is saved in 16(SP), the first
// 4 bytes are skipped on both sides, and the match is extended forwards.
// DI = pointer at the current position, BX = pointer into the candidate,
// SI = bytes left in src, R9 = number of additional matching bytes.
13837emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
13838match_nolit_loop_encodeSnappyBlockAsm10B:
13839 MOVL CX, SI
13840 SUBL BX, SI
13841 MOVL SI, 16(SP)
13842 ADDL $0x04, CX
13843 ADDL $0x04, BX
13844 MOVQ src_len+32(FP), SI
13845 SUBL CX, SI
13846 LEAQ (DX)(CX*1), DI
13847 LEAQ (DX)(BX*1), BX
13848
13849 // matchLen
13850 XORL R9, R9
13851
// Compare 16 bytes per iteration while at least 16 bytes remain.
13852matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B:
13853 CMPL SI, $0x10
13854 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm10B
13855 MOVQ (DI)(R9*1), R8
13856 MOVQ 8(DI)(R9*1), R10
13857 XORQ (BX)(R9*1), R8
13858 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B
13859 XORQ 8(BX)(R9*1), R10
13860 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B
13861 LEAL -16(SI), SI
13862 LEAL 16(R9), R9
13863 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B
13864
// Mismatch in the second word: length += 8 + (lowest set bit index >> 3).
13865matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B:
13866#ifdef GOAMD64_v3
13867 TZCNTQ R10, R10
13868
13869#else
13870 BSFQ R10, R10
13871
13872#endif
13873 SARQ $0x03, R10
13874 LEAL 8(R9)(R10*1), R9
13875 JMP match_nolit_end_encodeSnappyBlockAsm10B
13876
13877matchlen_match8_match_nolit_encodeSnappyBlockAsm10B:
13878 CMPL SI, $0x08
13879 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
13880 MOVQ (DI)(R9*1), R8
13881 XORQ (BX)(R9*1), R8
13882 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B
13883 LEAL -8(SI), SI
13884 LEAL 8(R9), R9
13885 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
13886
// Mismatch in the first word: length += lowest set bit index >> 3.
13887matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B:
13888#ifdef GOAMD64_v3
13889 TZCNTQ R8, R8
13890
13891#else
13892 BSFQ R8, R8
13893
13894#endif
13895 SARQ $0x03, R8
13896 LEAL (R9)(R8*1), R9
13897 JMP match_nolit_end_encodeSnappyBlockAsm10B
13898
// Tail handling for fewer than 8 remaining bytes: try 4, then 2, then 1.
13899matchlen_match4_match_nolit_encodeSnappyBlockAsm10B:
13900 CMPL SI, $0x04
13901 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
13902 MOVL (DI)(R9*1), R8
13903 CMPL (BX)(R9*1), R8
13904 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
13905 LEAL -4(SI), SI
13906 LEAL 4(R9), R9
13907
13908matchlen_match2_match_nolit_encodeSnappyBlockAsm10B:
13909 CMPL SI, $0x01
13910 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
13911 JB match_nolit_end_encodeSnappyBlockAsm10B
13912 MOVW (DI)(R9*1), R8
13913 CMPW (BX)(R9*1), R8
13914 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
13915 LEAL 2(R9), R9
13916 SUBL $0x02, SI
13917 JZ match_nolit_end_encodeSnappyBlockAsm10B
13918
13919matchlen_match1_match_nolit_encodeSnappyBlockAsm10B:
13920 MOVB (DI)(R9*1), R8
13921 CMPB (BX)(R9*1), R8
13922 JNE match_nolit_end_encodeSnappyBlockAsm10B
13923 LEAL 1(R9), R9
13924
13925match_nolit_end_encodeSnappyBlockAsm10B:
 // CX = end of match; BX = offset; R9 = total match length (the 4 skipped
 // bytes are added back); 12(SP) = CX since nothing before it is pending.
13926 ADDL R9, CX
13927 MOVL 16(SP), BX
13928 ADDL $0x04, R9
13929 MOVL CX, 12(SP)
13930
13931 // emitCopy
// While the length exceeds 64, emit a 3-byte element (tag 0xee + 16-bit
// offset) and subtract 60 from the remaining length.
13932two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
13933 CMPL R9, $0x40
13934 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
13935 MOVB $0xee, (AX)
13936 MOVW BX, 1(AX)
13937 LEAL -60(R9), R9
13938 ADDQ $0x03, AX
13939 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B
13940
13941two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
 // SI = length << 2. Lengths >= 12 or offsets >= 2048 take the 3-byte form;
 // otherwise tag = (length<<2 - 15) | (offset>>8)<<5 plus the low offset byte.
13942 MOVL R9, SI
13943 SHLL $0x02, SI
13944 CMPL R9, $0x0c
13945 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
13946 CMPL BX, $0x00000800
13947 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
13948 LEAL -15(SI), SI
13949 MOVB BL, 1(AX)
13950 SHRL $0x08, BX
13951 SHLL $0x05, BX
13952 ORL BX, SI
13953 MOVB SI, (AX)
13954 ADDQ $0x02, AX
13955 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B
13956
13957emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
 // 3-byte form: tag = length<<2 - 2, then the 16-bit offset.
13958 LEAL -2(SI), SI
13959 MOVB SI, (AX)
13960 MOVW BX, 1(AX)
13961 ADDQ $0x03, AX
13962
13963match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
 // Stop searching near the end of src; bail out with 0 if dst reached its
 // limit. SI = 8 bytes starting at CX-2.
13964 CMPL CX, 8(SP)
13965 JAE emit_remainder_encodeSnappyBlockAsm10B
13966 MOVQ -2(DX)(CX*1), SI
13967 CMPQ AX, (SP)
13968 JB match_nolit_dst_ok_encodeSnappyBlockAsm10B
13969 MOVQ $0x00000000, ret+48(FP)
13970 RET
13971
// Hash the windows at CX-2 (DI) and CX (BX), record both positions in the
// table, and test the previous entry for CX's hash: if its 4 bytes equal the
// bytes at CX, another match starts immediately with no literals in between.
13972match_nolit_dst_ok_encodeSnappyBlockAsm10B:
13973 MOVQ $0x9e3779b1, R8
13974 MOVQ SI, DI
13975 SHRQ $0x10, SI
13976 MOVQ SI, BX
13977 SHLQ $0x20, DI
13978 IMULQ R8, DI
13979 SHRQ $0x36, DI
13980 SHLQ $0x20, BX
13981 IMULQ R8, BX
13982 SHRQ $0x36, BX
13983 LEAL -2(CX), R8
13984 LEAQ 24(SP)(BX*4), R9
13985 MOVL (R9), BX
13986 MOVL R8, 24(SP)(DI*4)
13987 MOVL CX, (R9)
13988 CMPL (DX)(BX*1), SI
13989 JEQ match_nolit_loop_encodeSnappyBlockAsm10B
13990 INCL CX
13991 JMP search_loop_encodeSnappyBlockAsm10B
13992
// End of input: emit src[12(SP):src_len] as a final literal run.
13993emit_remainder_encodeSnappyBlockAsm10B:
 // Output budget check: dst + 3 + remaining length must stay below the dst
 // limit in (SP); otherwise return 0.
13994 MOVQ src_len+32(FP), CX
13995 SUBL 12(SP), CX
13996 LEAQ 3(AX)(CX*1), CX
13997 CMPQ CX, (SP)
13998 JB emit_remainder_ok_encodeSnappyBlockAsm10B
13999 MOVQ $0x00000000, ret+48(FP)
14000 RET
14001
14002emit_remainder_ok_encodeSnappyBlockAsm10B:
 // From here CX and DX are reused: CX = pointer to the first literal byte,
 // SI = literal length, DX = length-1 (later the dst end pointer).
14003 MOVQ src_len+32(FP), CX
14004 MOVL 12(SP), BX
14005 CMPL BX, CX
14006 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
14007 MOVL CX, SI
14008 MOVL CX, 12(SP)
14009 LEAQ (DX)(BX*1), CX
14010 SUBL BX, SI
14011 LEAL -1(SI), DX
14012 CMPL DX, $0x3c
14013 JB one_byte_emit_remainder_encodeSnappyBlockAsm10B
14014 CMPL DX, $0x00000100
14015 JB two_bytes_emit_remainder_encodeSnappyBlockAsm10B
14016 JB three_bytes_emit_remainder_encodeSnappyBlockAsm10B
14017
14018three_bytes_emit_remainder_encodeSnappyBlockAsm10B:
14019 MOVB $0xf4, (AX)
14020 MOVW DX, 1(AX)
14021 ADDQ $0x03, AX
14022 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
14023
14024two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
14025 MOVB $0xf0, (AX)
14026 MOVB DL, 1(AX)
14027 ADDQ $0x02, AX
14028 CMPL DX, $0x40
14029 JB memmove_emit_remainder_encodeSnappyBlockAsm10B
14030 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
14031
14032one_byte_emit_remainder_encodeSnappyBlockAsm10B:
14033 SHLB $0x02, DL
14034 MOVB DL, (AX)
14035 ADDQ $0x01, AX
14036
14037memmove_emit_remainder_encodeSnappyBlockAsm10B:
14038 LEAQ (AX)(SI*1), DX
14039 MOVL SI, BX
14040
14041 // genMemMoveShort
 // Unlike the copies inside the main loop, this variant has exact size
 // classes for 1-2, 3 and 4-7 bytes, so it never accesses memory outside
 // the BX bytes being copied.
14042 CMPQ BX, $0x03
14043 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2
14044 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3
14045 CMPQ BX, $0x08
14046 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7
14047 CMPQ BX, $0x10
14048 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
14049 CMPQ BX, $0x20
14050 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
14051 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64
14052
14053emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2:
14054 MOVB (CX), SI
14055 MOVB -1(CX)(BX*1), CL
14056 MOVB SI, (AX)
14057 MOVB CL, -1(AX)(BX*1)
14058 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
14059
14060emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3:
14061 MOVW (CX), SI
14062 MOVB 2(CX), CL
14063 MOVW SI, (AX)
14064 MOVB CL, 2(AX)
14065 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
14066
14067emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7:
14068 MOVL (CX), SI
14069 MOVL -4(CX)(BX*1), CX
14070 MOVL SI, (AX)
14071 MOVL CX, -4(AX)(BX*1)
14072 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
14073
14074emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
14075 MOVQ (CX), SI
14076 MOVQ -8(CX)(BX*1), CX
14077 MOVQ SI, (AX)
14078 MOVQ CX, -8(AX)(BX*1)
14079 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
14080
14081emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
14082 MOVOU (CX), X0
14083 MOVOU -16(CX)(BX*1), X1
14084 MOVOU X0, (AX)
14085 MOVOU X1, -16(AX)(BX*1)
14086 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
14087
14088emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
14089 MOVOU (CX), X0
14090 MOVOU 16(CX), X1
14091 MOVOU -32(CX)(BX*1), X2
14092 MOVOU -16(CX)(BX*1), X3
14093 MOVOU X0, (AX)
14094 MOVOU X1, 16(AX)
14095 MOVOU X2, -32(AX)(BX*1)
14096 MOVOU X3, -16(AX)(BX*1)
14097
14098memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
14099 MOVQ DX, AX
14100 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
14101
14102memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
14103 LEAQ (AX)(SI*1), DX
14104 MOVL SI, BX
14105
14106 // genMemMoveLong
 // Copy of more than 64 bytes from CX to AX. X0..X3 keep the first and last
 // 32 source bytes (stored last); the middle is copied 32 bytes at a time
 // with aligned stores, starting at R8 = 64 - (AX & 31).
14107 MOVOU (CX), X0
14108 MOVOU 16(CX), X1
14109 MOVOU -32(CX)(BX*1), X2
14110 MOVOU -16(CX)(BX*1), X3
14111 MOVQ BX, DI
14112 SHRQ $0x05, DI
14113 MOVQ AX, SI
14114 ANDL $0x0000001f, SI
14115 MOVQ $0x00000040, R8
14116 SUBQ SI, R8
14117 DECQ DI
14118 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
14119 LEAQ -32(CX)(R8*1), SI
14120 LEAQ -32(AX)(R8*1), R9
14121
14122emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
14123 MOVOU (SI), X4
14124 MOVOU 16(SI), X5
14125 MOVOA X4, (R9)
14126 MOVOA X5, 16(R9)
14127 ADDQ $0x20, R9
14128 ADDQ $0x20, SI
14129 ADDQ $0x20, R8
14130 DECQ DI
14131 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back
14132
14133emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
14134 MOVOU -32(CX)(R8*1), X4
14135 MOVOU -16(CX)(R8*1), X5
14136 MOVOA X4, -32(AX)(R8*1)
14137 MOVOA X5, -16(AX)(R8*1)
14138 ADDQ $0x20, R8
14139 CMPQ BX, R8
14140 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
14141 MOVOU X0, (AX)
14142 MOVOU X1, 16(AX)
14143 MOVOU X2, -32(AX)(BX*1)
14144 MOVOU X3, -16(AX)(BX*1)
14145 MOVQ DX, AX
14146
// Return the number of bytes written: dst write pointer minus dst_base.
14147emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
14148 MOVQ dst_base+0(FP), CX
14149 SUBQ CX, AX
14150 MOVQ AX, ret+48(FP)
14151 RET
14152
14153// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
14154// Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal and copy elements and returns
// the number of bytes written (final dst pointer minus dst_base), or 0 when
// one of the destination-size checks against the limit stored in (SP) fails.
//
// Register roles in the main loop:
//   AX = current write pointer into dst
//   CX = current position in src (32-bit offset relative to DX)
//   DX = src base pointer
//
// Stack frame ($1048 bytes of locals):
//   0(SP)  (8 bytes)    dst limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)  (4 bytes)    search limit: src_len - 8
//   12(SP) (4 bytes)    src offset of the first byte not yet emitted
//   16(SP) (4 bytes)    offset of the most recent copy (starts at 1)
//   20(SP) (4 bytes)    src position to resume at when no match is found
//   24(SP) (1024 bytes) hash table of 256 32-bit src positions
//
// NOTE(review): src_len - 8 is computed without an underflow check and is
// compared unsigned, so presumably callers never pass very short inputs -
// confirm against the Go caller.
14155TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
14156 MOVQ dst_base+0(FP), AX
 // Zero the hash table at 24(SP): 8 iterations of 128 bytes = 1024 bytes.
14157 MOVQ $0x00000008, CX
14158 LEAQ 24(SP), DX
14159 PXOR X0, X0
14160
14161zero_loop_encodeSnappyBlockAsm8B:
14162 MOVOU X0, (DX)
14163 MOVOU X0, 16(DX)
14164 MOVOU X0, 32(DX)
14165 MOVOU X0, 48(DX)
14166 MOVOU X0, 64(DX)
14167 MOVOU X0, 80(DX)
14168 MOVOU X0, 96(DX)
14169 MOVOU X0, 112(DX)
14170 ADDQ $0x80, DX
14171 DECQ CX
14172 JNZ zero_loop_encodeSnappyBlockAsm8B
 // Initialise the stack slots described in the header: nothing emitted yet,
 // search limit, dst limit, start position 1 and initial copy offset 1.
14173 MOVL $0x00000000, 12(SP)
14174 MOVQ src_len+32(FP), CX
14175 LEAQ -9(CX), DX
14176 LEAQ -8(CX), BX
14177 MOVL BX, 8(SP)
14178 SHRQ $0x05, CX
14179 SUBL CX, DX
14180 LEAQ (AX)(DX*1), DX
14181 MOVQ DX, (SP)
14182 MOVL $0x00000001, CX
14183 MOVL CX, 16(SP)
14184 MOVQ src_base+24(FP), DX
14185
14186search_loop_encodeSnappyBlockAsm8B:
 // Next position to try = CX + 4 + ((CX - 12(SP)) >> 4); the step grows with
 // the number of bytes scanned since the last emit. Leave the loop once that
 // position reaches the search limit in 8(SP).
14187 MOVL CX, BX
14188 SUBL 12(SP), BX
14189 SHRL $0x04, BX
14190 LEAL 4(CX)(BX*1), BX
14191 CMPL BX, 8(SP)
14192 JAE emit_remainder_encodeSnappyBlockAsm8B
 // SI = 8 source bytes at CX; remember the next position in 20(SP).
14193 MOVQ (DX)(CX*1), SI
14194 MOVL BX, 20(SP)
 // Hash the 4 bytes at CX (R9) and at CX+1 (R10): shift the low 4 bytes to
 // the top, multiply by 0x9e3779b1 and keep the top 8 bits as table index.
14195 MOVQ $0x9e3779b1, R8
14196 MOVQ SI, R9
14197 MOVQ SI, R10
14198 SHRQ $0x08, R10
14199 SHLQ $0x20, R9
14200 IMULQ R8, R9
14201 SHRQ $0x38, R9
14202 SHLQ $0x20, R10
14203 IMULQ R8, R10
14204 SHRQ $0x38, R10
 // BX, DI = previous table entries for both hashes; then record CX and CX+1.
14205 MOVL 24(SP)(R9*4), BX
14206 MOVL 24(SP)(R10*4), DI
14207 MOVL CX, 24(SP)(R9*4)
14208 LEAL 1(CX), R9
14209 MOVL R9, 24(SP)(R10*4)
 // R9 = hash of the 4 bytes at CX+2 (used on the no-repeat path).
14210 MOVQ SI, R9
14211 SHRQ $0x10, R9
14212 SHLQ $0x20, R9
14213 IMULQ R8, R9
14214 SHRQ $0x38, R9
 // Repeat check: compare the 4 bytes at CX+1 with the 4 bytes located
 // 16(SP) (the last copy offset) earlier in src.
14215 MOVL CX, R8
14216 SUBL 16(SP), R8
14217 MOVL 1(DX)(R8*1), R10
14218 MOVQ SI, R8
14219 SHRQ $0x08, R8
14220 CMPL R8, R10
14221 JNE no_repeat_found_encodeSnappyBlockAsm8B
 // Repeat found at CX+1. SI = match start, DI = corresponding earlier
 // position, BX = first unemitted position.
14222 LEAL 1(CX), SI
14223 MOVL 12(SP), BX
14224 MOVL SI, DI
14225 SUBL 16(SP), DI
14226 JZ repeat_extend_back_end_encodeSnappyBlockAsm8B
14227
14228repeat_extend_back_loop_encodeSnappyBlockAsm8B:
 // Extend the match backwards while the preceding bytes are equal, stopping
 // at the first unemitted position (BX) or when DI reaches 0.
14229 CMPL SI, BX
14230 JBE repeat_extend_back_end_encodeSnappyBlockAsm8B
14231 MOVB -1(DX)(DI*1), R8
14232 MOVB -1(DX)(SI*1), R9
14233 CMPB R8, R9
14234 JNE repeat_extend_back_end_encodeSnappyBlockAsm8B
14235 LEAL -1(SI), SI
14236 DECL DI
14237 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B
14238
14239repeat_extend_back_end_encodeSnappyBlockAsm8B:
 // Destination check: AX + 3 + pending literal length must stay below the
 // dst limit in (SP); otherwise return 0.
14240 MOVL SI, BX
14241 SUBL 12(SP), BX
14242 LEAQ 3(AX)(BX*1), BX
14243 CMPQ BX, (SP)
14244 JB repeat_dst_size_check_encodeSnappyBlockAsm8B
14245 MOVQ $0x00000000, ret+48(FP)
14246 RET
14247
14248repeat_dst_size_check_encodeSnappyBlockAsm8B:
 // Emit the pending literal src[12(SP):SI] (skipped when empty).
 // R8 = literal source pointer, DI = literal length, BX = length - 1.
 // Header written: one byte (length-1)<<2 when length-1 < 60; 0xf0 plus one
 // length byte when length-1 < 256; otherwise 0xf4 plus a 16-bit length.
14249 MOVL 12(SP), BX
14250 CMPL BX, SI
14251 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
14252 MOVL SI, DI
14253 MOVL SI, 12(SP)
14254 LEAQ (DX)(BX*1), R8
14255 SUBL BX, DI
14256 LEAL -1(DI), BX
14257 CMPL BX, $0x3c
14258 JB one_byte_repeat_emit_encodeSnappyBlockAsm8B
14259 CMPL BX, $0x00000100
14260 JB two_bytes_repeat_emit_encodeSnappyBlockAsm8B
 // NOTE: the next JB tests the same flags as the JB above and is therefore
 // never taken; control falls through to the three-byte label below.
14261 JB three_bytes_repeat_emit_encodeSnappyBlockAsm8B
14262
14263three_bytes_repeat_emit_encodeSnappyBlockAsm8B:
14264 MOVB $0xf4, (AX)
14265 MOVW BX, 1(AX)
14266 ADDQ $0x03, AX
14267 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
14268
14269two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
14270 MOVB $0xf0, (AX)
14271 MOVB BL, 1(AX)
14272 ADDQ $0x02, AX
 // length-1 < 64 uses the short copy, anything longer the long copy.
14273 CMPL BX, $0x40
14274 JB memmove_repeat_emit_encodeSnappyBlockAsm8B
14275 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
14276
14277one_byte_repeat_emit_encodeSnappyBlockAsm8B:
14278 SHLB $0x02, BL
14279 MOVB BL, (AX)
14280 ADDQ $0x01, AX
14281
14282memmove_repeat_emit_encodeSnappyBlockAsm8B:
 // Short literal copy of DI bytes from R8 to AX; BX = AX + DI is the dst
 // pointer after the literal. Lengths up to 8 are copied with one 8-byte
 // move, i.e. a full 8 bytes are read and written even for shorter literals.
14283 LEAQ (AX)(DI*1), BX
14284
14285 // genMemMoveShort
14286 CMPQ DI, $0x08
14287 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
14288 CMPQ DI, $0x10
14289 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
14290 CMPQ DI, $0x20
14291 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
14292 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
14293
14294emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
14295 MOVQ (R8), R9
14296 MOVQ R9, (AX)
14297 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
14298
14299emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
 // Overlapping head and tail moves cover any length in the range.
14300 MOVQ (R8), R9
14301 MOVQ -8(R8)(DI*1), R8
14302 MOVQ R9, (AX)
14303 MOVQ R8, -8(AX)(DI*1)
14304 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
14305
14306emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
14307 MOVOU (R8), X0
14308 MOVOU -16(R8)(DI*1), X1
14309 MOVOU X0, (AX)
14310 MOVOU X1, -16(AX)(DI*1)
14311 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
14312
14313emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
14314 MOVOU (R8), X0
14315 MOVOU 16(R8), X1
14316 MOVOU -32(R8)(DI*1), X2
14317 MOVOU -16(R8)(DI*1), X3
14318 MOVOU X0, (AX)
14319 MOVOU X1, 16(AX)
14320 MOVOU X2, -32(AX)(DI*1)
14321 MOVOU X3, -16(AX)(DI*1)
14322
14323memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
14324 MOVQ BX, AX
14325 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
14326
14327memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
 // Long literal copy of DI bytes from R8 to AX; BX = dst pointer afterwards.
 // The first and last 32 bytes are loaded into X0-X3 up front and stored
 // last; the middle is copied in 32-byte steps using aligned stores (MOVOA),
 // with R11 = 64 - (AX & 31) as the running offset.
14328 LEAQ (AX)(DI*1), BX
14329
14330 // genMemMoveLong
14331 MOVOU (R8), X0
14332 MOVOU 16(R8), X1
14333 MOVOU -32(R8)(DI*1), X2
14334 MOVOU -16(R8)(DI*1), X3
14335 MOVQ DI, R10
14336 SHRQ $0x05, R10
14337 MOVQ AX, R9
14338 ANDL $0x0000001f, R9
14339 MOVQ $0x00000040, R11
14340 SUBQ R9, R11
14341 DECQ R10
14342 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14343 LEAQ -32(R8)(R11*1), R9
14344 LEAQ -32(AX)(R11*1), R12
14345
14346emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
14347 MOVOU (R9), X4
14348 MOVOU 16(R9), X5
14349 MOVOA X4, (R12)
14350 MOVOA X5, 16(R12)
14351 ADDQ $0x20, R12
14352 ADDQ $0x20, R9
14353 ADDQ $0x20, R11
14354 DECQ R10
14355 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
14356
14357emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
14358 MOVOU -32(R8)(R11*1), X4
14359 MOVOU -16(R8)(R11*1), X5
14360 MOVOA X4, -32(AX)(R11*1)
14361 MOVOA X5, -16(AX)(R11*1)
14362 ADDQ $0x20, R11
14363 CMPQ DI, R11
14364 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
 // Store the saved head and tail (unaligned) and advance AX past the literal.
14365 MOVOU X0, (AX)
14366 MOVOU X1, 16(AX)
14367 MOVOU X2, -32(AX)(DI*1)
14368 MOVOU X3, -16(AX)(DI*1)
14369 MOVQ BX, AX
14370
14371emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
 // Skip the 4 bytes already known to match (CX+1 .. CX+4) and extend the
 // repeat forwards. R8 = src + CX, BX = src + CX - copy offset,
 // DI = bytes remaining in src, R10 = number of extra matching bytes.
14372 ADDL $0x05, CX
14373 MOVL CX, BX
14374 SUBL 16(SP), BX
14375 MOVQ src_len+32(FP), DI
14376 SUBL CX, DI
14377 LEAQ (DX)(CX*1), R8
14378 LEAQ (DX)(BX*1), BX
14379
14380 // matchLen
14381 XORL R10, R10
14382
14383matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B:
 // Compare 16 bytes per iteration as two 8-byte XORs; a non-zero XOR marks
 // the first difference.
14384 CMPL DI, $0x10
14385 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B
14386 MOVQ (R8)(R10*1), R9
14387 MOVQ 8(R8)(R10*1), R11
14388 XORQ (BX)(R10*1), R9
14389 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
14390 XORQ 8(BX)(R10*1), R11
14391 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B
14392 LEAL -16(DI), DI
14393 LEAL 16(R10), R10
14394 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B
14395
14396matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B:
 // Index of the lowest set bit, divided by 8, gives the number of equal
 // bytes in the second 8-byte word; add 8 for the first word.
14397#ifdef GOAMD64_v3
14398 TZCNTQ R11, R11
14399
14400#else
14401 BSFQ R11, R11
14402
14403#endif
14404 SARQ $0x03, R11
14405 LEAL 8(R10)(R11*1), R10
14406 JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
14407
14408matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B:
14409 CMPL DI, $0x08
14410 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
14411 MOVQ (R8)(R10*1), R9
14412 XORQ (BX)(R10*1), R9
14413 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
14414 LEAL -8(DI), DI
14415 LEAL 8(R10), R10
14416 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
14417
14418matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B:
14419#ifdef GOAMD64_v3
14420 TZCNTQ R9, R9
14421
14422#else
14423 BSFQ R9, R9
14424
14425#endif
14426 SARQ $0x03, R9
14427 LEAL (R10)(R9*1), R10
14428 JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
14429
14430matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B:
 // Tail handling: fewer than 8 bytes remain, compare 4, then 2, then 1.
14431 CMPL DI, $0x04
14432 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
14433 MOVL (R8)(R10*1), R9
14434 CMPL (BX)(R10*1), R9
14435 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
14436 LEAL -4(DI), DI
14437 LEAL 4(R10), R10
14438
14439matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B:
14440 CMPL DI, $0x01
14441 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
14442 JB repeat_extend_forward_end_encodeSnappyBlockAsm8B
14443 MOVW (R8)(R10*1), R9
14444 CMPW (BX)(R10*1), R9
14445 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
14446 LEAL 2(R10), R10
14447 SUBL $0x02, DI
14448 JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B
14449
14450matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B:
14451 MOVB (R8)(R10*1), R9
14452 CMPB (BX)(R10*1), R9
14453 JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
14454 LEAL 1(R10), R10
14455
14456repeat_extend_forward_end_encodeSnappyBlockAsm8B:
 // CX = end of match; BX = match length (end - start in SI);
 // SI = copy offset from 16(SP).
14457 ADDL R10, CX
14458 MOVL CX, BX
14459 SUBL SI, BX
14460 MOVL 16(SP), SI
14461
14462 // emitCopy
14463two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
 // While the length exceeds 64, emit tag 0xee followed by the 16-bit offset
 // and subtract 60 from the remaining length.
14464 CMPL BX, $0x40
14465 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
14466 MOVB $0xee, (AX)
14467 MOVW SI, 1(AX)
14468 LEAL -60(BX), BX
14469 ADDQ $0x03, AX
14470 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B
14471
14472two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
 // Length >= 12 uses the 3-byte form below. Otherwise a 2-byte form is
 // written: tag = (length<<2) - 15 combined with (offset>>8)<<5, followed by
 // the low offset byte.
 // NOTE(review): (offset >> 8) << 5 is not masked, so this form presumably
 // relies on offsets below 2048 - confirm.
14473 MOVL BX, DI
14474 SHLL $0x02, DI
14475 CMPL BX, $0x0c
14476 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
14477 LEAL -15(DI), DI
14478 MOVB SI, 1(AX)
14479 SHRL $0x08, SI
14480 SHLL $0x05, SI
14481 ORL SI, DI
14482 MOVB DI, (AX)
14483 ADDQ $0x02, AX
14484 JMP repeat_end_emit_encodeSnappyBlockAsm8B
14485
14486emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
 // 3-byte form: tag = (length<<2) - 2, followed by the 16-bit offset.
14487 LEAL -2(DI), DI
14488 MOVB DI, (AX)
14489 MOVW SI, 1(AX)
14490 ADDQ $0x03, AX
14491
14492repeat_end_emit_encodeSnappyBlockAsm8B:
 // Everything up to CX has been emitted; continue searching from CX.
14493 MOVL CX, 12(SP)
14494 JMP search_loop_encodeSnappyBlockAsm8B
14495
14496no_repeat_found_encodeSnappyBlockAsm8B:
 // Try the table candidates in turn: BX for position CX, DI for CX+1, then
 // the entry for the hash of CX+2 (R9). SI is shifted right by one byte
 // before each later comparison so its low 4 bytes match the position.
14497 CMPL (DX)(BX*1), SI
14498 JEQ candidate_match_encodeSnappyBlockAsm8B
14499 SHRQ $0x08, SI
14500 MOVL 24(SP)(R9*4), BX
14501 LEAL 2(CX), R8
14502 CMPL (DX)(DI*1), SI
14503 JEQ candidate2_match_encodeSnappyBlockAsm8B
14504 MOVL R8, 24(SP)(R9*4)
14505 SHRQ $0x08, SI
14506 CMPL (DX)(BX*1), SI
14507 JEQ candidate3_match_encodeSnappyBlockAsm8B
 // No candidate matched: resume at the position saved in 20(SP).
14508 MOVL 20(SP), CX
14509 JMP search_loop_encodeSnappyBlockAsm8B
14510
14511candidate3_match_encodeSnappyBlockAsm8B:
 // Match found at CX+2 (candidate already in BX).
14512 ADDL $0x02, CX
14513 JMP candidate_match_encodeSnappyBlockAsm8B
14514
14515candidate2_match_encodeSnappyBlockAsm8B:
 // Match found at CX+1: record CX+2 in the table, make DI the candidate.
14516 MOVL R8, 24(SP)(R9*4)
14517 INCL CX
14518 MOVL DI, BX
14519
14520candidate_match_encodeSnappyBlockAsm8B:
 // CX = match position, BX = candidate position, SI = first unemitted byte.
14521 MOVL 12(SP), SI
14522 TESTL BX, BX
14523 JZ match_extend_back_end_encodeSnappyBlockAsm8B
14524
14525match_extend_back_loop_encodeSnappyBlockAsm8B:
 // Extend the match backwards while the preceding bytes are equal, stopping
 // at the first unemitted position or when the candidate reaches 0.
14526 CMPL CX, SI
14527 JBE match_extend_back_end_encodeSnappyBlockAsm8B
14528 MOVB -1(DX)(BX*1), DI
14529 MOVB -1(DX)(CX*1), R8
14530 CMPB DI, R8
14531 JNE match_extend_back_end_encodeSnappyBlockAsm8B
14532 LEAL -1(CX), CX
14533 DECL BX
14534 JZ match_extend_back_end_encodeSnappyBlockAsm8B
14535 JMP match_extend_back_loop_encodeSnappyBlockAsm8B
14536
14537match_extend_back_end_encodeSnappyBlockAsm8B:
 // Destination check: AX + 3 + pending literal length must stay below the
 // dst limit in (SP); otherwise return 0.
14538 MOVL CX, SI
14539 SUBL 12(SP), SI
14540 LEAQ 3(AX)(SI*1), SI
14541 CMPQ SI, (SP)
14542 JB match_dst_size_check_encodeSnappyBlockAsm8B
14543 MOVQ $0x00000000, ret+48(FP)
14544 RET
14545
14546match_dst_size_check_encodeSnappyBlockAsm8B:
 // Emit the pending literal src[12(SP):CX] (skipped when empty).
 // SI = literal source pointer, R8 = literal length, DI = length - 1.
 // Header selection is the same as in the repeat path above.
14547 MOVL CX, SI
14548 MOVL 12(SP), DI
14549 CMPL DI, SI
14550 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B
14551 MOVL SI, R8
14552 MOVL SI, 12(SP)
14553 LEAQ (DX)(DI*1), SI
14554 SUBL DI, R8
14555 LEAL -1(R8), DI
14556 CMPL DI, $0x3c
14557 JB one_byte_match_emit_encodeSnappyBlockAsm8B
14558 CMPL DI, $0x00000100
14559 JB two_bytes_match_emit_encodeSnappyBlockAsm8B
 // NOTE: the next JB tests the same flags as the JB above and is therefore
 // never taken; control falls through to the three-byte label below.
14560 JB three_bytes_match_emit_encodeSnappyBlockAsm8B
14561
14562three_bytes_match_emit_encodeSnappyBlockAsm8B:
14563 MOVB $0xf4, (AX)
14564 MOVW DI, 1(AX)
14565 ADDQ $0x03, AX
14566 JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
14567
14568two_bytes_match_emit_encodeSnappyBlockAsm8B:
14569 MOVB $0xf0, (AX)
14570 MOVB DI, 1(AX)
14571 ADDQ $0x02, AX
14572 CMPL DI, $0x40
14573 JB memmove_match_emit_encodeSnappyBlockAsm8B
14574 JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
14575
14576one_byte_match_emit_encodeSnappyBlockAsm8B:
14577 SHLB $0x02, DI
14578 MOVB DI, (AX)
14579 ADDQ $0x01, AX
14580
14581memmove_match_emit_encodeSnappyBlockAsm8B:
 // Short literal copy of R8 bytes from SI to AX; DI = AX + R8 is the dst
 // pointer after the literal. Lengths up to 8 use one full 8-byte move.
14582 LEAQ (AX)(R8*1), DI
14583
14584 // genMemMoveShort
14585 CMPQ R8, $0x08
14586 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
14587 CMPQ R8, $0x10
14588 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
14589 CMPQ R8, $0x20
14590 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
14591 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
14592
14593emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
14594 MOVQ (SI), R9
14595 MOVQ R9, (AX)
14596 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
14597
14598emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
14599 MOVQ (SI), R9
14600 MOVQ -8(SI)(R8*1), SI
14601 MOVQ R9, (AX)
14602 MOVQ SI, -8(AX)(R8*1)
14603 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
14604
14605emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
14606 MOVOU (SI), X0
14607 MOVOU -16(SI)(R8*1), X1
14608 MOVOU X0, (AX)
14609 MOVOU X1, -16(AX)(R8*1)
14610 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
14611
14612emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
14613 MOVOU (SI), X0
14614 MOVOU 16(SI), X1
14615 MOVOU -32(SI)(R8*1), X2
14616 MOVOU -16(SI)(R8*1), X3
14617 MOVOU X0, (AX)
14618 MOVOU X1, 16(AX)
14619 MOVOU X2, -32(AX)(R8*1)
14620 MOVOU X3, -16(AX)(R8*1)
14621
14622memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
14623 MOVQ DI, AX
14624 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B
14625
14626memmove_long_match_emit_encodeSnappyBlockAsm8B:
 // Long literal copy of R8 bytes from SI to AX; DI = dst pointer afterwards.
 // Same scheme as the long copy in the repeat path: head and tail saved in
 // X0-X3, middle copied in 32-byte steps with aligned stores.
14627 LEAQ (AX)(R8*1), DI
14628
14629 // genMemMoveLong
14630 MOVOU (SI), X0
14631 MOVOU 16(SI), X1
14632 MOVOU -32(SI)(R8*1), X2
14633 MOVOU -16(SI)(R8*1), X3
14634 MOVQ R8, R10
14635 SHRQ $0x05, R10
14636 MOVQ AX, R9
14637 ANDL $0x0000001f, R9
14638 MOVQ $0x00000040, R11
14639 SUBQ R9, R11
14640 DECQ R10
14641 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14642 LEAQ -32(SI)(R11*1), R9
14643 LEAQ -32(AX)(R11*1), R12
14644
14645emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
14646 MOVOU (R9), X4
14647 MOVOU 16(R9), X5
14648 MOVOA X4, (R12)
14649 MOVOA X5, 16(R12)
14650 ADDQ $0x20, R12
14651 ADDQ $0x20, R9
14652 ADDQ $0x20, R11
14653 DECQ R10
14654 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
14655
14656emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
14657 MOVOU -32(SI)(R11*1), X4
14658 MOVOU -16(SI)(R11*1), X5
14659 MOVOA X4, -32(AX)(R11*1)
14660 MOVOA X5, -16(AX)(R11*1)
14661 ADDQ $0x20, R11
14662 CMPQ R8, R11
14663 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14664 MOVOU X0, (AX)
14665 MOVOU X1, 16(AX)
14666 MOVOU X2, -32(AX)(R8*1)
14667 MOVOU X3, -16(AX)(R8*1)
14668 MOVQ DI, AX
14669
14670emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
14671match_nolit_loop_encodeSnappyBlockAsm8B:
 // Record the copy offset (CX - BX) in 16(SP), skip the 4 bytes already
 // known to match and measure how far the match extends.
 // DI = src + CX, BX = src + candidate, SI = bytes remaining in src,
 // R9 = number of extra matching bytes.
14672 MOVL CX, SI
14673 SUBL BX, SI
14674 MOVL SI, 16(SP)
14675 ADDL $0x04, CX
14676 ADDL $0x04, BX
14677 MOVQ src_len+32(FP), SI
14678 SUBL CX, SI
14679 LEAQ (DX)(CX*1), DI
14680 LEAQ (DX)(BX*1), BX
14681
14682 // matchLen
14683 XORL R9, R9
14684
14685matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B:
 // Same match-length scheme as in the repeat path: 16 bytes per iteration,
 // then 8/4/2/1-byte tails, using the lowest set bit of the XOR to locate
 // the first differing byte.
14686 CMPL SI, $0x10
14687 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm8B
14688 MOVQ (DI)(R9*1), R8
14689 MOVQ 8(DI)(R9*1), R10
14690 XORQ (BX)(R9*1), R8
14691 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
14692 XORQ 8(BX)(R9*1), R10
14693 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B
14694 LEAL -16(SI), SI
14695 LEAL 16(R9), R9
14696 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B
14697
14698matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B:
14699#ifdef GOAMD64_v3
14700 TZCNTQ R10, R10
14701
14702#else
14703 BSFQ R10, R10
14704
14705#endif
14706 SARQ $0x03, R10
14707 LEAL 8(R9)(R10*1), R9
14708 JMP match_nolit_end_encodeSnappyBlockAsm8B
14709
14710matchlen_match8_match_nolit_encodeSnappyBlockAsm8B:
14711 CMPL SI, $0x08
14712 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
14713 MOVQ (DI)(R9*1), R8
14714 XORQ (BX)(R9*1), R8
14715 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
14716 LEAL -8(SI), SI
14717 LEAL 8(R9), R9
14718 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
14719
14720matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B:
14721#ifdef GOAMD64_v3
14722 TZCNTQ R8, R8
14723
14724#else
14725 BSFQ R8, R8
14726
14727#endif
14728 SARQ $0x03, R8
14729 LEAL (R9)(R8*1), R9
14730 JMP match_nolit_end_encodeSnappyBlockAsm8B
14731
14732matchlen_match4_match_nolit_encodeSnappyBlockAsm8B:
14733 CMPL SI, $0x04
14734 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
14735 MOVL (DI)(R9*1), R8
14736 CMPL (BX)(R9*1), R8
14737 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
14738 LEAL -4(SI), SI
14739 LEAL 4(R9), R9
14740
14741matchlen_match2_match_nolit_encodeSnappyBlockAsm8B:
14742 CMPL SI, $0x01
14743 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
14744 JB match_nolit_end_encodeSnappyBlockAsm8B
14745 MOVW (DI)(R9*1), R8
14746 CMPW (BX)(R9*1), R8
14747 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
14748 LEAL 2(R9), R9
14749 SUBL $0x02, SI
14750 JZ match_nolit_end_encodeSnappyBlockAsm8B
14751
14752matchlen_match1_match_nolit_encodeSnappyBlockAsm8B:
14753 MOVB (DI)(R9*1), R8
14754 CMPB (BX)(R9*1), R8
14755 JNE match_nolit_end_encodeSnappyBlockAsm8B
14756 LEAL 1(R9), R9
14757
14758match_nolit_end_encodeSnappyBlockAsm8B:
 // CX = end of match; BX = copy offset; R9 = total match length (extra
 // bytes + the initial 4). Mark everything up to CX as emitted.
14759 ADDL R9, CX
14760 MOVL 16(SP), BX
14761 ADDL $0x04, R9
14762 MOVL CX, 12(SP)
14763
14764 // emitCopy
14765two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
 // While the length exceeds 64, emit tag 0xee followed by the 16-bit offset
 // and subtract 60 from the remaining length.
14766 CMPL R9, $0x40
14767 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
14768 MOVB $0xee, (AX)
14769 MOVW BX, 1(AX)
14770 LEAL -60(R9), R9
14771 ADDQ $0x03, AX
14772 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B
14773
14774two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
 // Length >= 12 uses the 3-byte form below. Otherwise a 2-byte form is
 // written: tag = (length<<2) - 15 combined with (offset>>8)<<5, followed by
 // the low offset byte.
 // NOTE(review): (offset >> 8) << 5 is not masked, so this form presumably
 // relies on offsets below 2048 - confirm.
14775 MOVL R9, SI
14776 SHLL $0x02, SI
14777 CMPL R9, $0x0c
14778 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
14779 LEAL -15(SI), SI
14780 MOVB BL, 1(AX)
14781 SHRL $0x08, BX
14782 SHLL $0x05, BX
14783 ORL BX, SI
14784 MOVB SI, (AX)
14785 ADDQ $0x02, AX
14786 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B
14787
14788emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
 // 3-byte form: tag = (length<<2) - 2, followed by the 16-bit offset.
14789 LEAL -2(SI), SI
14790 MOVB SI, (AX)
14791 MOVW BX, 1(AX)
14792 ADDQ $0x03, AX
14793
14794match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
 // Stop searching once CX reaches the search limit. Otherwise load the 8
 // bytes starting at CX-2 and verify that AX is still below the dst limit
 // (return 0 if not).
14795 CMPL CX, 8(SP)
14796 JAE emit_remainder_encodeSnappyBlockAsm8B
14797 MOVQ -2(DX)(CX*1), SI
14798 CMPQ AX, (SP)
14799 JB match_nolit_dst_ok_encodeSnappyBlockAsm8B
14800 MOVQ $0x00000000, ret+48(FP)
14801 RET
14802
14803match_nolit_dst_ok_encodeSnappyBlockAsm8B:
 // Hash the bytes at CX-2 (DI) and at CX (BX). Store CX-2 in the first
 // slot; read the previous candidate for CX into BX and store CX there.
 // If that candidate's 4 bytes equal the bytes at CX, emit another copy
 // immediately with no literal in between; otherwise search from CX+1.
14804 MOVQ $0x9e3779b1, R8
14805 MOVQ SI, DI
14806 SHRQ $0x10, SI
14807 MOVQ SI, BX
14808 SHLQ $0x20, DI
14809 IMULQ R8, DI
14810 SHRQ $0x38, DI
14811 SHLQ $0x20, BX
14812 IMULQ R8, BX
14813 SHRQ $0x38, BX
14814 LEAL -2(CX), R8
14815 LEAQ 24(SP)(BX*4), R9
14816 MOVL (R9), BX
14817 MOVL R8, 24(SP)(DI*4)
14818 MOVL CX, (R9)
14819 CMPL (DX)(BX*1), SI
14820 JEQ match_nolit_loop_encodeSnappyBlockAsm8B
14821 INCL CX
14822 JMP search_loop_encodeSnappyBlockAsm8B
14823
14824emit_remainder_encodeSnappyBlockAsm8B:
 // Emit everything from 12(SP) to the end of src as one final literal.
 // First check that AX + 3 + remaining length stays below the dst limit;
 // otherwise return 0.
14825 MOVQ src_len+32(FP), CX
14826 SUBL 12(SP), CX
14827 LEAQ 3(AX)(CX*1), CX
14828 CMPQ CX, (SP)
14829 JB emit_remainder_ok_encodeSnappyBlockAsm8B
14830 MOVQ $0x00000000, ret+48(FP)
14831 RET
14832
14833emit_remainder_ok_encodeSnappyBlockAsm8B:
 // CX = literal source pointer, SI = literal length, DX = length - 1
 // (the src base in DX is no longer needed and is overwritten here).
 // Nothing is written when no bytes remain.
14834 MOVQ src_len+32(FP), CX
14835 MOVL 12(SP), BX
14836 CMPL BX, CX
14837 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
14838 MOVL CX, SI
14839 MOVL CX, 12(SP)
14840 LEAQ (DX)(BX*1), CX
14841 SUBL BX, SI
14842 LEAL -1(SI), DX
14843 CMPL DX, $0x3c
14844 JB one_byte_emit_remainder_encodeSnappyBlockAsm8B
14845 CMPL DX, $0x00000100
14846 JB two_bytes_emit_remainder_encodeSnappyBlockAsm8B
 // NOTE: the next JB tests the same flags as the JB above and is therefore
 // never taken; control falls through to the three-byte label below.
14847 JB three_bytes_emit_remainder_encodeSnappyBlockAsm8B
14848
14849three_bytes_emit_remainder_encodeSnappyBlockAsm8B:
14850 MOVB $0xf4, (AX)
14851 MOVW DX, 1(AX)
14852 ADDQ $0x03, AX
14853 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
14854
14855two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
14856 MOVB $0xf0, (AX)
14857 MOVB DL, 1(AX)
14858 ADDQ $0x02, AX
14859 CMPL DX, $0x40
14860 JB memmove_emit_remainder_encodeSnappyBlockAsm8B
14861 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
14862
14863one_byte_emit_remainder_encodeSnappyBlockAsm8B:
14864 SHLB $0x02, DL
14865 MOVB DL, (AX)
14866 ADDQ $0x01, AX
14867
14868memmove_emit_remainder_encodeSnappyBlockAsm8B:
 // Short copy of BX bytes from CX to AX; DX = dst pointer afterwards.
 // Unlike the short copies above, this variant has separate cases for
 // 1-2, 3 and 4-7 bytes, so it reads and writes exactly BX bytes.
14869 LEAQ (AX)(SI*1), DX
14870 MOVL SI, BX
14871
14872 // genMemMoveShort
14873 CMPQ BX, $0x03
14874 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2
14875 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3
14876 CMPQ BX, $0x08
14877 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7
14878 CMPQ BX, $0x10
14879 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
14880 CMPQ BX, $0x20
14881 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
14882 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64
14883
14884emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2:
 // Copy the first and the last byte (the same byte when BX is 1).
14885 MOVB (CX), SI
14886 MOVB -1(CX)(BX*1), CL
14887 MOVB SI, (AX)
14888 MOVB CL, -1(AX)(BX*1)
14889 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14890
14891emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3:
14892 MOVW (CX), SI
14893 MOVB 2(CX), CL
14894 MOVW SI, (AX)
14895 MOVB CL, 2(AX)
14896 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14897
14898emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7:
14899 MOVL (CX), SI
14900 MOVL -4(CX)(BX*1), CX
14901 MOVL SI, (AX)
14902 MOVL CX, -4(AX)(BX*1)
14903 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14904
14905emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
14906 MOVQ (CX), SI
14907 MOVQ -8(CX)(BX*1), CX
14908 MOVQ SI, (AX)
14909 MOVQ CX, -8(AX)(BX*1)
14910 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14911
14912emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
14913 MOVOU (CX), X0
14914 MOVOU -16(CX)(BX*1), X1
14915 MOVOU X0, (AX)
14916 MOVOU X1, -16(AX)(BX*1)
14917 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14918
14919emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
14920 MOVOU (CX), X0
14921 MOVOU 16(CX), X1
14922 MOVOU -32(CX)(BX*1), X2
14923 MOVOU -16(CX)(BX*1), X3
14924 MOVOU X0, (AX)
14925 MOVOU X1, 16(AX)
14926 MOVOU X2, -32(AX)(BX*1)
14927 MOVOU X3, -16(AX)(BX*1)
14928
14929memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
14930 MOVQ DX, AX
14931 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
14932
14933memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
 // Long copy of BX bytes from CX to AX; DX = dst pointer afterwards.
 // Same scheme as the other long copies: head and tail saved in X0-X3,
 // middle copied in 32-byte steps with aligned stores.
14934 LEAQ (AX)(SI*1), DX
14935 MOVL SI, BX
14936
14937 // genMemMoveLong
14938 MOVOU (CX), X0
14939 MOVOU 16(CX), X1
14940 MOVOU -32(CX)(BX*1), X2
14941 MOVOU -16(CX)(BX*1), X3
14942 MOVQ BX, DI
14943 SHRQ $0x05, DI
14944 MOVQ AX, SI
14945 ANDL $0x0000001f, SI
14946 MOVQ $0x00000040, R8
14947 SUBQ SI, R8
14948 DECQ DI
14949 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14950 LEAQ -32(CX)(R8*1), SI
14951 LEAQ -32(AX)(R8*1), R9
14952
14953emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
14954 MOVOU (SI), X4
14955 MOVOU 16(SI), X5
14956 MOVOA X4, (R9)
14957 MOVOA X5, 16(R9)
14958 ADDQ $0x20, R9
14959 ADDQ $0x20, SI
14960 ADDQ $0x20, R8
14961 DECQ DI
14962 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back
14963
14964emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
14965 MOVOU -32(CX)(R8*1), X4
14966 MOVOU -16(CX)(R8*1), X5
14967 MOVOA X4, -32(AX)(R8*1)
14968 MOVOA X5, -16(AX)(R8*1)
14969 ADDQ $0x20, R8
14970 CMPQ BX, R8
14971 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14972 MOVOU X0, (AX)
14973 MOVOU X1, 16(AX)
14974 MOVOU X2, -32(AX)(BX*1)
14975 MOVOU X3, -16(AX)(BX*1)
14976 MOVQ DX, AX
14977
14978emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
 // Return the number of bytes written: final dst pointer minus dst_base.
14979 MOVQ dst_base+0(FP), CX
14980 SUBQ CX, AX
14981 MOVQ AX, ret+48(FP)
14982 RET
14983
14984// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
14985// Requires: BMI, SSE2
14986TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56
14987 MOVQ dst_base+0(FP), AX
14988 MOVQ $0x00001200, CX
14989 LEAQ 24(SP), DX
14990 PXOR X0, X0
14991
14992zero_loop_encodeSnappyBetterBlockAsm:
14993 MOVOU X0, (DX)
14994 MOVOU X0, 16(DX)
14995 MOVOU X0, 32(DX)
14996 MOVOU X0, 48(DX)
14997 MOVOU X0, 64(DX)
14998 MOVOU X0, 80(DX)
14999 MOVOU X0, 96(DX)
15000 MOVOU X0, 112(DX)
15001 ADDQ $0x80, DX
15002 DECQ CX
15003 JNZ zero_loop_encodeSnappyBetterBlockAsm
15004 MOVL $0x00000000, 12(SP)
15005 MOVQ src_len+32(FP), CX
15006 LEAQ -9(CX), DX
15007 LEAQ -8(CX), BX
15008 MOVL BX, 8(SP)
15009 SHRQ $0x05, CX
15010 SUBL CX, DX
15011 LEAQ (AX)(DX*1), DX
15012 MOVQ DX, (SP)
15013 MOVL $0x00000001, CX
15014 MOVL $0x00000000, 16(SP)
15015 MOVQ src_base+24(FP), DX
15016
15017search_loop_encodeSnappyBetterBlockAsm:
15018 MOVL CX, BX
15019 SUBL 12(SP), BX
15020 SHRL $0x07, BX
15021 CMPL BX, $0x63
15022 JBE check_maxskip_ok_encodeSnappyBetterBlockAsm
15023 LEAL 100(CX), BX
15024 JMP check_maxskip_cont_encodeSnappyBetterBlockAsm
15025
15026check_maxskip_ok_encodeSnappyBetterBlockAsm:
15027 LEAL 1(CX)(BX*1), BX
15028
15029check_maxskip_cont_encodeSnappyBetterBlockAsm:
15030 CMPL BX, 8(SP)
15031 JAE emit_remainder_encodeSnappyBetterBlockAsm
15032 MOVQ (DX)(CX*1), SI
15033 MOVL BX, 20(SP)
15034 MOVQ $0x00cf1bbcdcbfa563, R8
15035 MOVQ $0x9e3779b1, BX
15036 MOVQ SI, R9
15037 MOVQ SI, R10
15038 SHLQ $0x08, R9
15039 IMULQ R8, R9
15040 SHRQ $0x2f, R9
15041 SHLQ $0x20, R10
15042 IMULQ BX, R10
15043 SHRQ $0x32, R10
15044 MOVL 24(SP)(R9*4), BX
15045 MOVL 524312(SP)(R10*4), DI
15046 MOVL CX, 24(SP)(R9*4)
15047 MOVL CX, 524312(SP)(R10*4)
15048 MOVQ (DX)(BX*1), R9
15049 MOVQ (DX)(DI*1), R10
15050 CMPQ R9, SI
15051 JEQ candidate_match_encodeSnappyBetterBlockAsm
15052 CMPQ R10, SI
15053 JNE no_short_found_encodeSnappyBetterBlockAsm
15054 MOVL DI, BX
15055 JMP candidate_match_encodeSnappyBetterBlockAsm
15056
15057no_short_found_encodeSnappyBetterBlockAsm:
15058 CMPL R9, SI
15059 JEQ candidate_match_encodeSnappyBetterBlockAsm
15060 CMPL R10, SI
15061 JEQ candidateS_match_encodeSnappyBetterBlockAsm
15062 MOVL 20(SP), CX
15063 JMP search_loop_encodeSnappyBetterBlockAsm
15064
15065candidateS_match_encodeSnappyBetterBlockAsm:
15066 SHRQ $0x08, SI
15067 MOVQ SI, R9
15068 SHLQ $0x08, R9
15069 IMULQ R8, R9
15070 SHRQ $0x2f, R9
15071 MOVL 24(SP)(R9*4), BX
15072 INCL CX
15073 MOVL CX, 24(SP)(R9*4)
15074 CMPL (DX)(BX*1), SI
15075 JEQ candidate_match_encodeSnappyBetterBlockAsm
15076 DECL CX
15077 MOVL DI, BX
15078
15079candidate_match_encodeSnappyBetterBlockAsm:
15080 MOVL 12(SP), SI
15081 TESTL BX, BX
15082 JZ match_extend_back_end_encodeSnappyBetterBlockAsm
15083
15084match_extend_back_loop_encodeSnappyBetterBlockAsm:
15085 CMPL CX, SI
15086 JBE match_extend_back_end_encodeSnappyBetterBlockAsm
15087 MOVB -1(DX)(BX*1), DI
15088 MOVB -1(DX)(CX*1), R8
15089 CMPB DI, R8
15090 JNE match_extend_back_end_encodeSnappyBetterBlockAsm
15091 LEAL -1(CX), CX
15092 DECL BX
15093 JZ match_extend_back_end_encodeSnappyBetterBlockAsm
15094 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm
15095
15096match_extend_back_end_encodeSnappyBetterBlockAsm:
15097 MOVL CX, SI
15098 SUBL 12(SP), SI
15099 LEAQ 5(AX)(SI*1), SI
15100 CMPQ SI, (SP)
15101 JB match_dst_size_check_encodeSnappyBetterBlockAsm
15102 MOVQ $0x00000000, ret+48(FP)
15103 RET
15104
15105match_dst_size_check_encodeSnappyBetterBlockAsm:
15106 MOVL CX, SI
15107 ADDL $0x04, CX
15108 ADDL $0x04, BX
15109 MOVQ src_len+32(FP), DI
15110 SUBL CX, DI
15111 LEAQ (DX)(CX*1), R8
15112 LEAQ (DX)(BX*1), R9
15113
15114 // matchLen
15115 XORL R11, R11
15116
15117matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm:
15118 CMPL DI, $0x10
15119 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm
15120 MOVQ (R8)(R11*1), R10
15121 MOVQ 8(R8)(R11*1), R12
15122 XORQ (R9)(R11*1), R10
15123 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm
15124 XORQ 8(R9)(R11*1), R12
15125 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm
15126 LEAL -16(DI), DI
15127 LEAL 16(R11), R11
15128 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm
15129
15130matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm:
15131#ifdef GOAMD64_v3
15132 TZCNTQ R12, R12
15133
15134#else
15135 BSFQ R12, R12
15136
15137#endif
15138 SARQ $0x03, R12
15139 LEAL 8(R11)(R12*1), R11
15140 JMP match_nolit_end_encodeSnappyBetterBlockAsm
15141
15142matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm:
15143 CMPL DI, $0x08
15144 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
15145 MOVQ (R8)(R11*1), R10
15146 XORQ (R9)(R11*1), R10
15147 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm
15148 LEAL -8(DI), DI
15149 LEAL 8(R11), R11
15150 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
15151
15152matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm:
15153#ifdef GOAMD64_v3
15154 TZCNTQ R10, R10
15155
15156#else
15157 BSFQ R10, R10
15158
15159#endif
15160 SARQ $0x03, R10
15161 LEAL (R11)(R10*1), R11
15162 JMP match_nolit_end_encodeSnappyBetterBlockAsm
15163
15164matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm:
15165 CMPL DI, $0x04
15166 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
15167 MOVL (R8)(R11*1), R10
15168 CMPL (R9)(R11*1), R10
15169 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
15170 LEAL -4(DI), DI
15171 LEAL 4(R11), R11
15172
15173matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm:
15174 CMPL DI, $0x01
15175 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
15176 JB match_nolit_end_encodeSnappyBetterBlockAsm
15177 MOVW (R8)(R11*1), R10
15178 CMPW (R9)(R11*1), R10
15179 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
15180 LEAL 2(R11), R11
15181 SUBL $0x02, DI
15182 JZ match_nolit_end_encodeSnappyBetterBlockAsm
15183
15184matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm:
15185 MOVB (R8)(R11*1), R10
15186 CMPB (R9)(R11*1), R10
15187 JNE match_nolit_end_encodeSnappyBetterBlockAsm
15188 LEAL 1(R11), R11
15189
15190match_nolit_end_encodeSnappyBetterBlockAsm:
15191 MOVL CX, DI
15192 SUBL BX, DI
15193
15194 // Check if repeat
15195 CMPL R11, $0x01
15196 JA match_length_ok_encodeSnappyBetterBlockAsm
15197 CMPL DI, $0x0000ffff
15198 JBE match_length_ok_encodeSnappyBetterBlockAsm
15199 MOVL 20(SP), CX
15200 INCL CX
15201 JMP search_loop_encodeSnappyBetterBlockAsm
15202
15203match_length_ok_encodeSnappyBetterBlockAsm:
15204 MOVL DI, 16(SP)
15205 MOVL 12(SP), BX
15206 CMPL BX, SI
15207 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
15208 MOVL SI, R8
15209 MOVL SI, 12(SP)
15210 LEAQ (DX)(BX*1), R9
15211 SUBL BX, R8
15212 LEAL -1(R8), BX
15213 CMPL BX, $0x3c
15214 JB one_byte_match_emit_encodeSnappyBetterBlockAsm
15215 CMPL BX, $0x00000100
15216 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm
15217 CMPL BX, $0x00010000
15218 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm
15219 CMPL BX, $0x01000000
15220 JB four_bytes_match_emit_encodeSnappyBetterBlockAsm
15221 MOVB $0xfc, (AX)
15222 MOVL BX, 1(AX)
15223 ADDQ $0x05, AX
15224 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
15225
15226four_bytes_match_emit_encodeSnappyBetterBlockAsm:
15227 MOVL BX, R10
15228 SHRL $0x10, R10
15229 MOVB $0xf8, (AX)
15230 MOVW BX, 1(AX)
15231 MOVB R10, 3(AX)
15232 ADDQ $0x04, AX
15233 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
15234
15235three_bytes_match_emit_encodeSnappyBetterBlockAsm:
15236 MOVB $0xf4, (AX)
15237 MOVW BX, 1(AX)
15238 ADDQ $0x03, AX
15239 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
15240
15241two_bytes_match_emit_encodeSnappyBetterBlockAsm:
15242 MOVB $0xf0, (AX)
15243 MOVB BL, 1(AX)
15244 ADDQ $0x02, AX
15245 CMPL BX, $0x40
15246 JB memmove_match_emit_encodeSnappyBetterBlockAsm
15247 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
15248
15249one_byte_match_emit_encodeSnappyBetterBlockAsm:
15250 SHLB $0x02, BL
15251 MOVB BL, (AX)
15252 ADDQ $0x01, AX
15253
15254memmove_match_emit_encodeSnappyBetterBlockAsm:
15255 LEAQ (AX)(R8*1), BX
15256
15257 // genMemMoveShort
15258 CMPQ R8, $0x08
15259 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
15260 CMPQ R8, $0x10
15261 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
15262 CMPQ R8, $0x20
15263 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
15264 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
15265
15266emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
15267 MOVQ (R9), R10
15268 MOVQ R10, (AX)
15269 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
15270
15271emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
15272 MOVQ (R9), R10
15273 MOVQ -8(R9)(R8*1), R9
15274 MOVQ R10, (AX)
15275 MOVQ R9, -8(AX)(R8*1)
15276 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
15277
15278emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
15279 MOVOU (R9), X0
15280 MOVOU -16(R9)(R8*1), X1
15281 MOVOU X0, (AX)
15282 MOVOU X1, -16(AX)(R8*1)
15283 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
15284
15285emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
15286 MOVOU (R9), X0
15287 MOVOU 16(R9), X1
15288 MOVOU -32(R9)(R8*1), X2
15289 MOVOU -16(R9)(R8*1), X3
15290 MOVOU X0, (AX)
15291 MOVOU X1, 16(AX)
15292 MOVOU X2, -32(AX)(R8*1)
15293 MOVOU X3, -16(AX)(R8*1)
15294
15295memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
15296 MOVQ BX, AX
15297 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
15298
15299memmove_long_match_emit_encodeSnappyBetterBlockAsm:
15300 LEAQ (AX)(R8*1), BX
15301
15302 // genMemMoveLong
15303 MOVOU (R9), X0
15304 MOVOU 16(R9), X1
15305 MOVOU -32(R9)(R8*1), X2
15306 MOVOU -16(R9)(R8*1), X3
15307 MOVQ R8, R12
15308 SHRQ $0x05, R12
15309 MOVQ AX, R10
15310 ANDL $0x0000001f, R10
15311 MOVQ $0x00000040, R13
15312 SUBQ R10, R13
15313 DECQ R12
15314 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
15315 LEAQ -32(R9)(R13*1), R10
15316 LEAQ -32(AX)(R13*1), R14
15317
15318emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
15319 MOVOU (R10), X4
15320 MOVOU 16(R10), X5
15321 MOVOA X4, (R14)
15322 MOVOA X5, 16(R14)
15323 ADDQ $0x20, R14
15324 ADDQ $0x20, R10
15325 ADDQ $0x20, R13
15326 DECQ R12
15327 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
15328
15329emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
15330 MOVOU -32(R9)(R13*1), X4
15331 MOVOU -16(R9)(R13*1), X5
15332 MOVOA X4, -32(AX)(R13*1)
15333 MOVOA X5, -16(AX)(R13*1)
15334 ADDQ $0x20, R13
15335 CMPQ R8, R13
15336 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
15337 MOVOU X0, (AX)
15338 MOVOU X1, 16(AX)
15339 MOVOU X2, -32(AX)(R8*1)
15340 MOVOU X3, -16(AX)(R8*1)
15341 MOVQ BX, AX
15342
15343emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
15344 ADDL R11, CX
15345 ADDL $0x04, R11
15346 MOVL CX, 12(SP)
15347
15348 // emitCopy
15349 CMPL DI, $0x00010000
15350 JB two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
15351
15352four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
15353 CMPL R11, $0x40
15354 JBE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
15355 MOVB $0xff, (AX)
15356 MOVL DI, 1(AX)
15357 LEAL -64(R11), R11
15358 ADDQ $0x05, AX
15359 CMPL R11, $0x04
15360 JB four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
15361 JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
15362
15363four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
15364 TESTL R11, R11
15365 JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
15366 XORL BX, BX
15367 LEAL -1(BX)(R11*4), R11
15368 MOVB R11, (AX)
15369 MOVL DI, 1(AX)
15370 ADDQ $0x05, AX
15371 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
15372
15373two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
15374 CMPL R11, $0x40
15375 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
15376 MOVB $0xee, (AX)
15377 MOVW DI, 1(AX)
15378 LEAL -60(R11), R11
15379 ADDQ $0x03, AX
15380 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
15381
15382two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
15383 MOVL R11, BX
15384 SHLL $0x02, BX
15385 CMPL R11, $0x0c
15386 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
15387 CMPL DI, $0x00000800
15388 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
15389 LEAL -15(BX), BX
15390 MOVB DI, 1(AX)
15391 SHRL $0x08, DI
15392 SHLL $0x05, DI
15393 ORL DI, BX
15394 MOVB BL, (AX)
15395 ADDQ $0x02, AX
15396 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
15397
15398emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
15399 LEAL -2(BX), BX
15400 MOVB BL, (AX)
15401 MOVW DI, 1(AX)
15402 ADDQ $0x03, AX
15403
15404match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
15405 CMPL CX, 8(SP)
15406 JAE emit_remainder_encodeSnappyBetterBlockAsm
15407 CMPQ AX, (SP)
15408 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm
15409 MOVQ $0x00000000, ret+48(FP)
15410 RET
15411
15412match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
15413 MOVQ $0x00cf1bbcdcbfa563, BX
15414 MOVQ $0x9e3779b1, DI
15415 LEAQ 1(SI), SI
15416 LEAQ -2(CX), R8
15417 MOVQ (DX)(SI*1), R9
15418 MOVQ 1(DX)(SI*1), R10
15419 MOVQ (DX)(R8*1), R11
15420 MOVQ 1(DX)(R8*1), R12
15421 SHLQ $0x08, R9
15422 IMULQ BX, R9
15423 SHRQ $0x2f, R9
15424 SHLQ $0x20, R10
15425 IMULQ DI, R10
15426 SHRQ $0x32, R10
15427 SHLQ $0x08, R11
15428 IMULQ BX, R11
15429 SHRQ $0x2f, R11
15430 SHLQ $0x20, R12
15431 IMULQ DI, R12
15432 SHRQ $0x32, R12
15433 LEAQ 1(SI), DI
15434 LEAQ 1(R8), R13
15435 MOVL SI, 24(SP)(R9*4)
15436 MOVL R8, 24(SP)(R11*4)
15437 MOVL DI, 524312(SP)(R10*4)
15438 MOVL R13, 524312(SP)(R12*4)
15439 LEAQ 1(R8)(SI*1), DI
15440 SHRQ $0x01, DI
15441 ADDQ $0x01, SI
15442 SUBQ $0x01, R8
15443
15444index_loop_encodeSnappyBetterBlockAsm:
15445 CMPQ DI, R8
15446 JAE search_loop_encodeSnappyBetterBlockAsm
15447 MOVQ (DX)(SI*1), R9
15448 MOVQ (DX)(DI*1), R10
15449 SHLQ $0x08, R9
15450 IMULQ BX, R9
15451 SHRQ $0x2f, R9
15452 SHLQ $0x08, R10
15453 IMULQ BX, R10
15454 SHRQ $0x2f, R10
15455 MOVL SI, 24(SP)(R9*4)
15456 MOVL DI, 24(SP)(R10*4)
15457 ADDQ $0x02, SI
15458 ADDQ $0x02, DI
15459 JMP index_loop_encodeSnappyBetterBlockAsm
15460
15461emit_remainder_encodeSnappyBetterBlockAsm:
15462 MOVQ src_len+32(FP), CX
15463 SUBL 12(SP), CX
15464 LEAQ 5(AX)(CX*1), CX
15465 CMPQ CX, (SP)
15466 JB emit_remainder_ok_encodeSnappyBetterBlockAsm
15467 MOVQ $0x00000000, ret+48(FP)
15468 RET
15469
15470emit_remainder_ok_encodeSnappyBetterBlockAsm:
15471 MOVQ src_len+32(FP), CX
15472 MOVL 12(SP), BX
15473 CMPL BX, CX
15474 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
15475 MOVL CX, SI
15476 MOVL CX, 12(SP)
15477 LEAQ (DX)(BX*1), CX
15478 SUBL BX, SI
15479 LEAL -1(SI), DX
15480 CMPL DX, $0x3c
15481 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm
15482 CMPL DX, $0x00000100
15483 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
15484 CMPL DX, $0x00010000
15485 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
15486 CMPL DX, $0x01000000
15487 JB four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
15488 MOVB $0xfc, (AX)
15489 MOVL DX, 1(AX)
15490 ADDQ $0x05, AX
15491 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
15492
15493four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
15494 MOVL DX, BX
15495 SHRL $0x10, BX
15496 MOVB $0xf8, (AX)
15497 MOVW DX, 1(AX)
15498 MOVB BL, 3(AX)
15499 ADDQ $0x04, AX
15500 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
15501
15502three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
15503 MOVB $0xf4, (AX)
15504 MOVW DX, 1(AX)
15505 ADDQ $0x03, AX
15506 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
15507
15508two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
15509 MOVB $0xf0, (AX)
15510 MOVB DL, 1(AX)
15511 ADDQ $0x02, AX
15512 CMPL DX, $0x40
15513 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm
15514 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
15515
15516one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
15517 SHLB $0x02, DL
15518 MOVB DL, (AX)
15519 ADDQ $0x01, AX
15520
15521memmove_emit_remainder_encodeSnappyBetterBlockAsm:
15522 LEAQ (AX)(SI*1), DX
15523 MOVL SI, BX
15524
15525 // genMemMoveShort
15526 CMPQ BX, $0x03
15527 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2
15528 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3
15529 CMPQ BX, $0x08
15530 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7
15531 CMPQ BX, $0x10
15532 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
15533 CMPQ BX, $0x20
15534 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
15535 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64
15536
15537emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2:
15538 MOVB (CX), SI
15539 MOVB -1(CX)(BX*1), CL
15540 MOVB SI, (AX)
15541 MOVB CL, -1(AX)(BX*1)
15542 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15543
15544emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3:
15545 MOVW (CX), SI
15546 MOVB 2(CX), CL
15547 MOVW SI, (AX)
15548 MOVB CL, 2(AX)
15549 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15550
15551emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7:
15552 MOVL (CX), SI
15553 MOVL -4(CX)(BX*1), CX
15554 MOVL SI, (AX)
15555 MOVL CX, -4(AX)(BX*1)
15556 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15557
15558emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
15559 MOVQ (CX), SI
15560 MOVQ -8(CX)(BX*1), CX
15561 MOVQ SI, (AX)
15562 MOVQ CX, -8(AX)(BX*1)
15563 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15564
15565emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
15566 MOVOU (CX), X0
15567 MOVOU -16(CX)(BX*1), X1
15568 MOVOU X0, (AX)
15569 MOVOU X1, -16(AX)(BX*1)
15570 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15571
15572emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
15573 MOVOU (CX), X0
15574 MOVOU 16(CX), X1
15575 MOVOU -32(CX)(BX*1), X2
15576 MOVOU -16(CX)(BX*1), X3
15577 MOVOU X0, (AX)
15578 MOVOU X1, 16(AX)
15579 MOVOU X2, -32(AX)(BX*1)
15580 MOVOU X3, -16(AX)(BX*1)
15581
15582memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
15583 MOVQ DX, AX
15584 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
15585
15586memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
15587 LEAQ (AX)(SI*1), DX
15588 MOVL SI, BX
15589
15590 // genMemMoveLong
15591 MOVOU (CX), X0
15592 MOVOU 16(CX), X1
15593 MOVOU -32(CX)(BX*1), X2
15594 MOVOU -16(CX)(BX*1), X3
15595 MOVQ BX, DI
15596 SHRQ $0x05, DI
15597 MOVQ AX, SI
15598 ANDL $0x0000001f, SI
15599 MOVQ $0x00000040, R8
15600 SUBQ SI, R8
15601 DECQ DI
15602 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
15603 LEAQ -32(CX)(R8*1), SI
15604 LEAQ -32(AX)(R8*1), R9
15605
15606emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
15607 MOVOU (SI), X4
15608 MOVOU 16(SI), X5
15609 MOVOA X4, (R9)
15610 MOVOA X5, 16(R9)
15611 ADDQ $0x20, R9
15612 ADDQ $0x20, SI
15613 ADDQ $0x20, R8
15614 DECQ DI
15615 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back
15616
15617emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
15618 MOVOU -32(CX)(R8*1), X4
15619 MOVOU -16(CX)(R8*1), X5
15620 MOVOA X4, -32(AX)(R8*1)
15621 MOVOA X5, -16(AX)(R8*1)
15622 ADDQ $0x20, R8
15623 CMPQ BX, R8
15624 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
15625 MOVOU X0, (AX)
15626 MOVOU X1, 16(AX)
15627 MOVOU X2, -32(AX)(BX*1)
15628 MOVOU X3, -16(AX)(BX*1)
15629 MOVQ DX, AX
15630
15631emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
15632 MOVQ dst_base+0(FP), CX
15633 SUBQ CX, AX
15634 MOVQ AX, ret+48(FP)
15635 RET
15636
15637// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
15638// Requires: BMI, SSE2
//
// Frame use, as established by the instructions of this function:
//   0(SP)       8-byte pointer; dst positions are checked to stay below it,
//               otherwise 0 is stored to ret and the function returns
//   8(SP)       uint32 src_len - 8; searching stops once a position reaches it
//   12(SP)      uint32 start of the input not yet written to dst (starts at 0)
//   16(SP)      uint32 zeroed here; later receives the offset of each match
//   20(SP)      uint32 next search position, saved while candidates are examined
//   24(SP)      65536 x uint32 table, indexed by a 16-bit hash
//   262168(SP)  16384 x uint32 table, indexed by a 14-bit hash
// After this prologue: AX = dst write pointer, CX = current src position
// (starts at 1), DX = src base.
15639TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
15640 MOVQ dst_base+0(FP), AX
 // Clear both tables: 0xa00 (2560) iterations x 128 bytes = 327680 bytes,
 // i.e. everything from 24(SP) to the end of the frame.
15641 MOVQ $0x00000a00, CX
15642 LEAQ 24(SP), DX
15643 PXOR X0, X0
15644
15645zero_loop_encodeSnappyBetterBlockAsm64K:
15646 MOVOU X0, (DX)
15647 MOVOU X0, 16(DX)
15648 MOVOU X0, 32(DX)
15649 MOVOU X0, 48(DX)
15650 MOVOU X0, 64(DX)
15651 MOVOU X0, 80(DX)
15652 MOVOU X0, 96(DX)
15653 MOVOU X0, 112(DX)
15654 ADDQ $0x80, DX
15655 DECQ CX
15656 JNZ zero_loop_encodeSnappyBetterBlockAsm64K
 // No input has been emitted yet.
15657 MOVL $0x00000000, 12(SP)
 // 8(SP) = src_len - 8.
 // (SP)  = dst + (src_len - 9 - (src_len >> 5)); the subtraction is done in
 //         32 bits (SUBL) before being added to the dst pointer.
15658 MOVQ src_len+32(FP), CX
15659 LEAQ -9(CX), DX
15660 LEAQ -8(CX), BX
15661 MOVL BX, 8(SP)
15662 SHRQ $0x05, CX
15663 SUBL CX, DX
15664 LEAQ (AX)(DX*1), DX
15665 MOVQ DX, (SP)
 // Start searching at position 1 with the 16(SP) slot cleared.
15666 MOVL $0x00000001, CX
15667 MOVL $0x00000000, 16(SP)
15668 MOVQ src_base+24(FP), DX
15669
15670search_loop_encodeSnappyBetterBlockAsm64K:
 // BX = next position to try = CX + 1 + ((CX - 12(SP)) >> 7): the stride grows
 // by one for every 128 bytes scanned since the position stored in 12(SP).
 // Leave the loop once that next position reaches 8(SP) (unsigned compare).
15671 MOVL CX, BX
15672 SUBL 12(SP), BX
15673 SHRL $0x07, BX
15674 LEAL 1(CX)(BX*1), BX
15675 CMPL BX, 8(SP)
15676 JAE emit_remainder_encodeSnappyBetterBlockAsm64K
 // SI = the 8 bytes at src[CX]; park the next position in 20(SP).
15677 MOVQ (DX)(CX*1), SI
15678 MOVL BX, 20(SP)
 // R9  = ((SI << 8)  * 0x00cf1bbcdcbfa563) >> 48 : 16-bit hash of the low 7 bytes.
 // R10 = ((SI << 32) * 0x9e3779b1)         >> 50 : 14-bit hash of the low 4 bytes.
15679 MOVQ $0x00cf1bbcdcbfa563, R8
15680 MOVQ $0x9e3779b1, BX
15681 MOVQ SI, R9
15682 MOVQ SI, R10
15683 SHLQ $0x08, R9
15684 IMULQ R8, R9
15685 SHRQ $0x30, R9
15686 SHLQ $0x20, R10
15687 IMULQ BX, R10
15688 SHRQ $0x32, R10
 // Fetch the positions previously stored under each hash (BX from the table at
 // 24(SP), DI from the table at 262168(SP)), then record CX in both slots.
15689 MOVL 24(SP)(R9*4), BX
15690 MOVL 262168(SP)(R10*4), DI
15691 MOVL CX, 24(SP)(R9*4)
15692 MOVL CX, 262168(SP)(R10*4)
 // Accept a candidate whose 8 bytes equal SI; BX is tried before DI.
15693 MOVQ (DX)(BX*1), R9
15694 MOVQ (DX)(DI*1), R10
15695 CMPQ R9, SI
15696 JEQ candidate_match_encodeSnappyBetterBlockAsm64K
15697 CMPQ R10, SI
15698 JNE no_short_found_encodeSnappyBetterBlockAsm64K
15699 MOVL DI, BX
15700 JMP candidate_match_encodeSnappyBetterBlockAsm64K
15701
15702no_short_found_encodeSnappyBetterBlockAsm64K:
 // No 8-byte match: compare only the low 4 bytes of each candidate.
15703 CMPL R9, SI
15704 JEQ candidate_match_encodeSnappyBetterBlockAsm64K
15705 CMPL R10, SI
15706 JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
 // Neither candidate matches: resume at the position saved in 20(SP).
15707 MOVL 20(SP), CX
15708 JMP search_loop_encodeSnappyBetterBlockAsm64K
15709
15710candidateS_match_encodeSnappyBetterBlockAsm64K:
 // Only the DI candidate matched 4 bytes. First probe the table at 24(SP) for
 // position CX+1: SI >> 8 drops the byte at CX, and the same 16-bit hash is
 // applied to the result. CX+1 is recorded in that slot either way.
15711 SHRQ $0x08, SI
15712 MOVQ SI, R9
15713 SHLQ $0x08, R9
15714 IMULQ R8, R9
15715 SHRQ $0x30, R9
15716 MOVL 24(SP)(R9*4), BX
15717 INCL CX
15718 MOVL CX, 24(SP)(R9*4)
 // If the probed entry matches 4 bytes, use it with CX already advanced;
 // otherwise step CX back and use DI as the candidate.
15719 CMPL (DX)(BX*1), SI
15720 JEQ candidate_match_encodeSnappyBetterBlockAsm64K
15721 DECL CX
15722 MOVL DI, BX
15723
15724candidate_match_encodeSnappyBetterBlockAsm64K:
 // On entry CX = current position, BX = candidate position.
 // Extend the match backwards, bounded below by 12(SP) for CX and by 0 for BX.
15725 MOVL 12(SP), SI
15726 TESTL BX, BX
15727 JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
15728
15729match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
 // Stop when CX has reached 12(SP), when the preceding bytes differ, or when
 // the candidate reaches position 0.
15730 CMPL CX, SI
15731 JBE match_extend_back_end_encodeSnappyBetterBlockAsm64K
15732 MOVB -1(DX)(BX*1), DI
15733 MOVB -1(DX)(CX*1), R8
15734 CMPB DI, R8
15735 JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
15736 LEAL -1(CX), CX
15737 DECL BX
15738 JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
15739 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K
15740
15741match_extend_back_end_encodeSnappyBetterBlockAsm64K:
 // SI = AX + (CX - 12(SP)) + 3: the dst pointer after the pending bytes plus a
 // 3-byte allowance. If that is not below the limit in (SP), give up and
 // return 0.
15742 MOVL CX, SI
15743 SUBL 12(SP), SI
15744 LEAQ 3(AX)(SI*1), SI
15745 CMPQ SI, (SP)
15746 JB match_dst_size_check_encodeSnappyBetterBlockAsm64K
15747 MOVQ $0x00000000, ret+48(FP)
15748 RET
15749
15750match_dst_size_check_encodeSnappyBetterBlockAsm64K:
 // SI keeps the position where the match starts. Both cursors then skip 4
 // bytes (every path into candidate_match compared at least 4 equal bytes).
 // DI = src_len - CX (32-bit) = bytes left to compare,
 // R8 = &src[CX], R9 = &src[BX].
15751 MOVL CX, SI
15752 ADDL $0x04, CX
15753 ADDL $0x04, BX
15754 MOVQ src_len+32(FP), DI
15755 SUBL CX, DI
15756 LEAQ (DX)(CX*1), R8
15757 LEAQ (DX)(BX*1), R9
15758
15759 // matchLen
 // R11 accumulates the number of equal bytes found at R8/R9.
15760 XORL R11, R11
15761
15762matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K:
 // Compare 16 bytes per iteration while at least 16 remain; XOR leaves a
 // non-zero value in R10 (first 8 bytes) or R12 (second 8) on a mismatch.
15763 CMPL DI, $0x10
15764 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K
15765 MOVQ (R8)(R11*1), R10
15766 MOVQ 8(R8)(R11*1), R12
15767 XORQ (R9)(R11*1), R10
15768 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
15769 XORQ 8(R9)(R11*1), R12
15770 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K
15771 LEAL -16(DI), DI
15772 LEAL 16(R11), R11
15773 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K
15774
15775matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K:
 // Mismatch in the second 8 bytes: index of the lowest set bit / 8 is the
 // number of equal bytes in that word; add it plus the 8 already equal.
15776#ifdef GOAMD64_v3
15777 TZCNTQ R12, R12
15778
15779#else
15780 BSFQ R12, R12
15781
15782#endif
15783 SARQ $0x03, R12
15784 LEAL 8(R11)(R12*1), R11
15785 JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
15786
15787matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K:
 // Fewer than 16 bytes left: try one 8-byte comparison.
15788 CMPL DI, $0x08
15789 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
15790 MOVQ (R8)(R11*1), R10
15791 XORQ (R9)(R11*1), R10
15792 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
15793 LEAL -8(DI), DI
15794 LEAL 8(R11), R11
15795 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
15796
15797matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K:
 // Mismatch within R10: add (index of lowest set bit) / 8 equal bytes.
15798#ifdef GOAMD64_v3
15799 TZCNTQ R10, R10
15800
15801#else
15802 BSFQ R10, R10
15803
15804#endif
15805 SARQ $0x03, R10
15806 LEAL (R11)(R10*1), R11
15807 JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
15808
15809matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
 // Fewer than 8 bytes left: try 4 bytes, falling through to the 2-byte step
 // on a mismatch or when fewer than 4 remain.
15810 CMPL DI, $0x04
15811 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
15812 MOVL (R8)(R11*1), R10
15813 CMPL (R9)(R11*1), R10
15814 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
15815 LEAL -4(DI), DI
15816 LEAL 4(R11), R11
15817
15818matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
 // DI == 1: only a single byte left to test; DI == 0: done.
 // Otherwise try 2 bytes, and finish if that consumes the last of them.
15819 CMPL DI, $0x01
15820 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
15821 JB match_nolit_end_encodeSnappyBetterBlockAsm64K
15822 MOVW (R8)(R11*1), R10
15823 CMPW (R9)(R11*1), R10
15824 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
15825 LEAL 2(R11), R11
15826 SUBL $0x02, DI
15827 JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
15828
15829matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
 // Final single-byte comparison.
15830 MOVB (R8)(R11*1), R10
15831 CMPB (R9)(R11*1), R10
15832 JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
15833 LEAL 1(R11), R11
15834
15835match_nolit_end_encodeSnappyBetterBlockAsm64K:
 // DI = CX - BX = distance between the current position and the candidate.
15836 MOVL CX, DI
15837 SUBL BX, DI
15838
15839 // Check if repeat
 // NOTE: no check follows in this function; the offset is stored to 16(SP)
 // unconditionally.
15840 MOVL DI, 16(SP)
 // Emit the input between 12(SP) and the match start (SI) as a literal run;
 // nothing to do when they are equal.
15841 MOVL 12(SP), BX
15842 CMPL BX, SI
15843 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
 // R9 = literal source pointer, R8 = literal length, BX = length - 1.
15844 MOVL SI, R8
15845 MOVL SI, 12(SP)
15846 LEAQ (DX)(BX*1), R9
15847 SUBL BX, R8
15848 LEAL -1(R8), BX
 // Pick the header size from length-1: below 60 -> one byte, below 256 -> two
 // bytes, otherwise three bytes.
15849 CMPL BX, $0x3c
15850 JB one_byte_match_emit_encodeSnappyBetterBlockAsm64K
15851 CMPL BX, $0x00000100
15852 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
 // The JB below can never be taken (flags are unchanged and the JB above was
 // not taken); execution falls through to the same label.
15853 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm64K
15854
15855three_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
 // Header: 0xf4 followed by length-1 as 16 bits.
15856 MOVB $0xf4, (AX)
15857 MOVW BX, 1(AX)
15858 ADDQ $0x03, AX
15859 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
15860
15861two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
 // Header: 0xf0 followed by length-1 as one byte. Lengths up to 64 use the
 // short copy, longer ones the long copy.
15862 MOVB $0xf0, (AX)
15863 MOVB BL, 1(AX)
15864 ADDQ $0x02, AX
15865 CMPL BX, $0x40
15866 JB memmove_match_emit_encodeSnappyBetterBlockAsm64K
15867 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
15868
15869one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
 // Header: a single byte holding (length-1) << 2.
15870 SHLB $0x02, BL
15871 MOVB BL, (AX)
15872 ADDQ $0x01, AX
15873
15874memmove_match_emit_encodeSnappyBetterBlockAsm64K:
 // BX = dst pointer after the literal bytes (R8 is 1..64 on this path).
15875 LEAQ (AX)(R8*1), BX
15876
15877 // genMemMoveShort
 // Lengths up to 8 are not split further: that case always moves 8 bytes.
 // NOTE(review): this reads and writes up to 7 bytes past the literal;
 // presumably covered by the 8(SP) and (SP) margins - confirm.
15878 CMPQ R8, $0x08
15879 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
15880 CMPQ R8, $0x10
15881 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
15882 CMPQ R8, $0x20
15883 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
15884 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
15885
15886emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
15887 MOVQ (R9), R10
15888 MOVQ R10, (AX)
15889 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
15890
15891emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
 // First 8 and last 8 bytes (the two moves may overlap).
15892 MOVQ (R9), R10
15893 MOVQ -8(R9)(R8*1), R9
15894 MOVQ R10, (AX)
15895 MOVQ R9, -8(AX)(R8*1)
15896 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
15897
15898emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
 // First 16 and last 16 bytes (the two moves may overlap).
15899 MOVOU (R9), X0
15900 MOVOU -16(R9)(R8*1), X1
15901 MOVOU X0, (AX)
15902 MOVOU X1, -16(AX)(R8*1)
15903 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
15904
15905emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
 // First 32 and last 32 bytes (the two halves may overlap).
15906 MOVOU (R9), X0
15907 MOVOU 16(R9), X1
15908 MOVOU -32(R9)(R8*1), X2
15909 MOVOU -16(R9)(R8*1), X3
15910 MOVOU X0, (AX)
15911 MOVOU X1, 16(AX)
15912 MOVOU X2, -32(AX)(R8*1)
15913 MOVOU X3, -16(AX)(R8*1)
15914
15915memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
 // Advance the dst pointer past the copied literal.
15916 MOVQ BX, AX
15917 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
15918
15919memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
 // BX = dst pointer after the literal bytes (R8 is at least 65 on this path).
15920 LEAQ (AX)(R8*1), BX
15921
15922 // genMemMoveLong
 // X0..X3 hold the first and last 32 source bytes; they are written last,
 // with unaligned stores, to cover both edges of the destination.
15923 MOVOU (R9), X0
15924 MOVOU 16(R9), X1
15925 MOVOU -32(R9)(R8*1), X2
15926 MOVOU -16(R9)(R8*1), X3
 // R12 = R8 / 32. R13 = 64 - (AX & 31), so AX + R13 - 32 is 32-byte aligned
 // and the MOVOA stores below target aligned addresses.
15927 MOVQ R8, R12
15928 SHRQ $0x05, R12
15929 MOVQ AX, R10
15930 ANDL $0x0000001f, R10
15931 MOVQ $0x00000040, R13
15932 SUBQ R10, R13
 // NOTE(review): DECQ does not write CF, so JA here (and JNA in the loop
 // below) see CF as left by the preceding SUBQ/ADDQ; kept exactly as generated.
15933 DECQ R12
15934 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
15935 LEAQ -32(R9)(R13*1), R10
15936 LEAQ -32(AX)(R13*1), R14
15937
15938emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
15939 MOVOU (R10), X4
15940 MOVOU 16(R10), X5
15941 MOVOA X4, (R14)
15942 MOVOA X5, 16(R14)
15943 ADDQ $0x20, R14
15944 ADDQ $0x20, R10
15945 ADDQ $0x20, R13
15946 DECQ R12
15947 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
15948
15949emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
 // Copy 32 bytes per iteration at offset R13-32, repeating while R8 >= R13.
15950 MOVOU -32(R9)(R13*1), X4
15951 MOVOU -16(R9)(R13*1), X5
15952 MOVOA X4, -32(AX)(R13*1)
15953 MOVOA X5, -16(AX)(R13*1)
15954 ADDQ $0x20, R13
15955 CMPQ R8, R13
15956 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
 // Write the saved head and tail, then advance the dst pointer.
15957 MOVOU X0, (AX)
15958 MOVOU X1, 16(AX)
15959 MOVOU X2, -32(AX)(R8*1)
15960 MOVOU X3, -16(AX)(R8*1)
15961 MOVQ BX, AX
15962
15963emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
 // CX moves past the matched bytes; R11 becomes the full match length by adding
 // back the 4 bytes skipped before matchLen; 12(SP) is set to the new CX.
15964 ADDL R11, CX
15965 ADDL $0x04, R11
15966 MOVL CX, 12(SP)
15967
15968 // emitCopy
 // R11 = remaining length, DI = offset, AX = dst write pointer.
15969two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
 // While more than 64 bytes remain, emit a 3-byte element (0xee + 16-bit
 // offset) that accounts for 60 bytes of the match.
15970 CMPL R11, $0x40
15971 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
15972 MOVB $0xee, (AX)
15973 MOVW DI, 1(AX)
15974 LEAL -60(R11), R11
15975 ADDQ $0x03, AX
15976 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K
15977
15978two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
 // BX = length * 4. The 2-byte form is used only when length < 12 and
 // offset < 2048; everything else takes the 3-byte form.
15979 MOVL R11, BX
15980 SHLL $0x02, BX
15981 CMPL R11, $0x0c
15982 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
15983 CMPL DI, $0x00000800
15984 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
 // 2-byte form: first byte = (length*4 - 15) | ((offset >> 8) << 5),
 // second byte = low 8 bits of the offset.
15985 LEAL -15(BX), BX
15986 MOVB DI, 1(AX)
15987 SHRL $0x08, DI
15988 SHLL $0x05, DI
15989 ORL DI, BX
15990 MOVB BL, (AX)
15991 ADDQ $0x02, AX
15992 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K
15993
15994emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
 // 3-byte form: first byte = length*4 - 2, followed by the 16-bit offset.
15995 LEAL -2(BX), BX
15996 MOVB BL, (AX)
15997 MOVW DI, 1(AX)
15998 ADDQ $0x03, AX
15999
16000match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
 // Stop searching once CX has reached 8(SP); return 0 if the dst pointer is
 // no longer below the limit in (SP).
16001 CMPL CX, 8(SP)
16002 JAE emit_remainder_encodeSnappyBetterBlockAsm64K
16003 CMPQ AX, (SP)
16004 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
16005 MOVQ $0x00000000, ret+48(FP)
16006 RET
16007
16008match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
 // Re-index positions around the match. SI still holds the position where the
 // match started, CX the position just past it.
 // After the two LEAQs: SI = start + 1, R8 = CX - 2.
16009 MOVQ $0x00cf1bbcdcbfa563, BX
16010 MOVQ $0x9e3779b1, DI
16011 LEAQ 1(SI), SI
16012 LEAQ -2(CX), R8
16013 MOVQ (DX)(SI*1), R9
16014 MOVQ 1(DX)(SI*1), R10
16015 MOVQ (DX)(R8*1), R11
16016 MOVQ 1(DX)(R8*1), R12
 // R9/R11: 16-bit hashes of the bytes at SI and R8.
 // R10/R12: 14-bit hashes of the bytes at SI+1 and R8+1.
16017 SHLQ $0x08, R9
16018 IMULQ BX, R9
16019 SHRQ $0x30, R9
16020 SHLQ $0x20, R10
16021 IMULQ DI, R10
16022 SHRQ $0x32, R10
16023 SHLQ $0x08, R11
16024 IMULQ BX, R11
16025 SHRQ $0x30, R11
16026 SHLQ $0x20, R12
16027 IMULQ DI, R12
16028 SHRQ $0x32, R12
 // Store SI and R8 in the table at 24(SP), and SI+1 and R8+1 in the table at
 // 262168(SP).
16029 LEAQ 1(SI), DI
16030 LEAQ 1(R8), R13
16031 MOVL SI, 24(SP)(R9*4)
16032 MOVL R8, 24(SP)(R11*4)
16033 MOVL DI, 262168(SP)(R10*4)
16034 MOVL R13, 262168(SP)(R12*4)
 // Prepare two cursors for the loop below: DI = (SI + R8 + 1) / 2, then
 // SI += 1 and R8 -= 1.
16035 LEAQ 1(R8)(SI*1), DI
16036 SHRQ $0x01, DI
16037 ADDQ $0x01, SI
16038 SUBQ $0x01, R8
16039
16040index_loop_encodeSnappyBetterBlockAsm64K:
 // While DI < R8: hash the bytes at SI and at DI with the 16-bit hash, store
 // both positions in the table at 24(SP), and advance each cursor by 2.
 // When DI reaches R8, go back to searching.
16041 CMPQ DI, R8
16042 JAE search_loop_encodeSnappyBetterBlockAsm64K
16043 MOVQ (DX)(SI*1), R9
16044 MOVQ (DX)(DI*1), R10
16045 SHLQ $0x08, R9
16046 IMULQ BX, R9
16047 SHRQ $0x30, R9
16048 SHLQ $0x08, R10
16049 IMULQ BX, R10
16050 SHRQ $0x30, R10
16051 MOVL SI, 24(SP)(R9*4)
16052 MOVL DI, 24(SP)(R10*4)
16053 ADDQ $0x02, SI
16054 ADDQ $0x02, DI
16055 JMP index_loop_encodeSnappyBetterBlockAsm64K
16056
16057emit_remainder_encodeSnappyBetterBlockAsm64K:
 // CX = AX + (src_len - 12(SP)) + 3: the dst pointer after all remaining input
 // plus a 3-byte allowance. Return 0 if that is not below the limit in (SP).
16058 MOVQ src_len+32(FP), CX
16059 SUBL 12(SP), CX
16060 LEAQ 3(AX)(CX*1), CX
16061 CMPQ CX, (SP)
16062 JB emit_remainder_ok_encodeSnappyBetterBlockAsm64K
16063 MOVQ $0x00000000, ret+48(FP)
16064 RET
16065
16066emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
 // Emit src[12(SP):src_len] as a final literal run; skip if it is empty.
16067 MOVQ src_len+32(FP), CX
16068 MOVL 12(SP), BX
16069 CMPL BX, CX
16070 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
 // CX = literal source pointer, SI = literal length, DX = length - 1.
 // DX stops being the src base from here on.
16071 MOVL CX, SI
16072 MOVL CX, 12(SP)
16073 LEAQ (DX)(BX*1), CX
16074 SUBL BX, SI
16075 LEAL -1(SI), DX
 // Pick the header size from length-1: below 60 -> one byte, below 256 -> two
 // bytes, otherwise three bytes.
16076 CMPL DX, $0x3c
16077 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
16078 CMPL DX, $0x00000100
16079 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
 // The JB below can never be taken (flags are unchanged and the JB above was
 // not taken); execution falls through to the same label.
16080 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
16081
16082three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
 // Header: 0xf4 followed by length-1 as 16 bits.
16083 MOVB $0xf4, (AX)
16084 MOVW DX, 1(AX)
16085 ADDQ $0x03, AX
16086 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
16087
16088two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
 // Header: 0xf0 followed by length-1 as one byte. Lengths up to 64 use the
 // short copy, longer ones the long copy.
16089 MOVB $0xf0, (AX)
16090 MOVB DL, 1(AX)
16091 ADDQ $0x02, AX
16092 CMPL DX, $0x40
16093 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
16094 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
16095
16096one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
 // Header: a single byte holding (length-1) << 2.
16097 SHLB $0x02, DL
16098 MOVB DL, (AX)
16099 ADDQ $0x01, AX
16100
16101memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
 // DX = dst pointer after the literal bytes, BX = length (1..64 on this path).
16102 LEAQ (AX)(SI*1), DX
16103 MOVL SI, BX
16104
16105 // genMemMoveShort
 // Every size class below moves exactly BX bytes (using overlapping head/tail
 // moves), so nothing past the end of the literal is read or written here.
16106 CMPQ BX, $0x03
16107 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2
16108 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3
16109 CMPQ BX, $0x08
16110 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7
16111 CMPQ BX, $0x10
16112 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
16113 CMPQ BX, $0x20
16114 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
16115 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
16116
16117emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2:
 // First byte and last byte (the same byte when BX == 1).
16118 MOVB (CX), SI
16119 MOVB -1(CX)(BX*1), CL
16120 MOVB SI, (AX)
16121 MOVB CL, -1(AX)(BX*1)
16122 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16123
16124emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3:
 // Two bytes plus the third byte.
16125 MOVW (CX), SI
16126 MOVB 2(CX), CL
16127 MOVW SI, (AX)
16128 MOVB CL, 2(AX)
16129 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16130
16131emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7:
 // First 4 and last 4 bytes (the two moves may overlap).
16132 MOVL (CX), SI
16133 MOVL -4(CX)(BX*1), CX
16134 MOVL SI, (AX)
16135 MOVL CX, -4(AX)(BX*1)
16136 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16137
16138emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
 // First 8 and last 8 bytes (the two moves may overlap).
16139 MOVQ (CX), SI
16140 MOVQ -8(CX)(BX*1), CX
16141 MOVQ SI, (AX)
16142 MOVQ CX, -8(AX)(BX*1)
16143 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16144
16145emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
 // First 16 and last 16 bytes (the two moves may overlap).
16146 MOVOU (CX), X0
16147 MOVOU -16(CX)(BX*1), X1
16148 MOVOU X0, (AX)
16149 MOVOU X1, -16(AX)(BX*1)
16150 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16151
16152emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
 // First 32 and last 32 bytes (the two halves may overlap).
16153 MOVOU (CX), X0
16154 MOVOU 16(CX), X1
16155 MOVOU -32(CX)(BX*1), X2
16156 MOVOU -16(CX)(BX*1), X3
16157 MOVOU X0, (AX)
16158 MOVOU X1, 16(AX)
16159 MOVOU X2, -32(AX)(BX*1)
16160 MOVOU X3, -16(AX)(BX*1)
16161
16162memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
 // Advance the dst pointer past the copied literal.
16163 MOVQ DX, AX
16164 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
16165
16166memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
 // DX = dst pointer after the literal bytes, BX = length (at least 65 here).
16167 LEAQ (AX)(SI*1), DX
16168 MOVL SI, BX
16169
16170 // genMemMoveLong
 // X0..X3 hold the first and last 32 source bytes; they are written last,
 // with unaligned stores, to cover both edges of the destination.
16171 MOVOU (CX), X0
16172 MOVOU 16(CX), X1
16173 MOVOU -32(CX)(BX*1), X2
16174 MOVOU -16(CX)(BX*1), X3
 // DI = BX / 32. R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte aligned and
 // the MOVOA stores below target aligned addresses.
16175 MOVQ BX, DI
16176 SHRQ $0x05, DI
16177 MOVQ AX, SI
16178 ANDL $0x0000001f, SI
16179 MOVQ $0x00000040, R8
16180 SUBQ SI, R8
 // NOTE(review): DECQ does not write CF, so JA here (and JNA in the loop
 // below) see CF as left by the preceding SUBQ/ADDQ; kept exactly as generated.
16181 DECQ DI
16182 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
16183 LEAQ -32(CX)(R8*1), SI
16184 LEAQ -32(AX)(R8*1), R9
16185
16186emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
16187 MOVOU (SI), X4
16188 MOVOU 16(SI), X5
16189 MOVOA X4, (R9)
16190 MOVOA X5, 16(R9)
16191 ADDQ $0x20, R9
16192 ADDQ $0x20, SI
16193 ADDQ $0x20, R8
16194 DECQ DI
16195 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
16196
16197emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
 // Copy 32 bytes per iteration at offset R8-32, repeating while BX >= R8.
16198 MOVOU -32(CX)(R8*1), X4
16199 MOVOU -16(CX)(R8*1), X5
16200 MOVOA X4, -32(AX)(R8*1)
16201 MOVOA X5, -16(AX)(R8*1)
16202 ADDQ $0x20, R8
16203 CMPQ BX, R8
16204 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
 // Write the saved head and tail, then advance the dst pointer.
16205 MOVOU X0, (AX)
16206 MOVOU X1, 16(AX)
16207 MOVOU X2, -32(AX)(BX*1)
16208 MOVOU X3, -16(AX)(BX*1)
16209 MOVQ DX, AX
16210
16211emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
16212 MOVQ dst_base+0(FP), CX
16213 SUBQ CX, AX
16214 MOVQ AX, ret+48(FP)
16215 RET
16216
16217// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
16218// Requires: BMI, SSE2
//
// Compresses src into dst using two hash tables kept in the stack frame and
// returns the number of bytes written (final dst cursor minus dst_base).
// Returns 0 whenever one of the output-bound checks against 0(SP) fails.
//
// Arguments are read through the FP pseudo-register:
//   dst_base+0(FP), src_base+24(FP), src_len+32(FP); result in ret+48(FP).
//
// Frame layout ($81944 bytes):
//   0(SP)      8 bytes  dst limit pointer: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)      4 bytes  src_len - 8, the position at which searching stops
//   12(SP)     4 bytes  start of the literal bytes that have not been emitted yet
//   16(SP)     4 bytes  offset of the most recent match (written here, never read in this function)
//   20(SP)     4 bytes  next search position, used when no candidate matches
//   24(SP)     65536 bytes  16384 x 4-byte positions, indexed by a 14-bit hash of the low 6 bytes
//   65560(SP)  16384 bytes  4096 x 4-byte positions, indexed by a 12-bit hash of the low 4 bytes
//
// Long-lived registers: AX = dst write cursor, DX = src base, CX = current src position.
// NOTE(review): every copy is emitted with a 16-bit offset (MOVW DI), so callers
// presumably keep src short enough for that -- confirm against the Go wrapper.
16219TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56
16220 MOVQ dst_base+0(FP), AX // AX = dst write cursor
16221 MOVQ $0x00000280, CX // 0x280 iterations x 128 bytes = 81920 bytes, i.e. both tables
16222 LEAQ 24(SP), DX // DX = first byte of the tables
16223 PXOR X0, X0 // X0 = 0, the fill value
16224
16225zero_loop_encodeSnappyBetterBlockAsm12B:
16226 MOVOU X0, (DX) // clear 128 bytes per iteration, 16 at a time
16227 MOVOU X0, 16(DX)
16228 MOVOU X0, 32(DX)
16229 MOVOU X0, 48(DX)
16230 MOVOU X0, 64(DX)
16231 MOVOU X0, 80(DX)
16232 MOVOU X0, 96(DX)
16233 MOVOU X0, 112(DX)
16234 ADDQ $0x80, DX
16235 DECQ CX
16236 JNZ zero_loop_encodeSnappyBetterBlockAsm12B
16237 MOVL $0x00000000, 12(SP) // no literals pending: unemitted data starts at position 0
16238 MOVQ src_len+32(FP), CX
16239 LEAQ -9(CX), DX // DX = src_len - 9
16240 LEAQ -8(CX), BX // BX = src_len - 8
16241 MOVL BX, 8(SP) // 8(SP) = search limit
16242 SHRQ $0x05, CX // CX = src_len / 32
16243 SUBL CX, DX // DX = src_len - 9 - src_len/32
16244 LEAQ (AX)(DX*1), DX // turn that length into a pointer inside dst
16245 MOVQ DX, (SP) // 0(SP) = dst limit pointer used by all bound checks
16246 MOVL $0x00000001, CX // searching starts at position 1
16247 MOVL $0x00000000, 16(SP) // last match offset = 0
16248 MOVQ src_base+24(FP), DX // DX = src base for the rest of the function
16249
16250search_loop_encodeSnappyBetterBlockAsm12B:
16251 MOVL CX, BX
16252 SUBL 12(SP), BX // BX = bytes scanned since the last emit
16253 SHRL $0x06, BX // skip grows by 1 for every 64 bytes without a match
16254 LEAL 1(CX)(BX*1), BX // BX = next position to try = CX + 1 + skip
16255 CMPL BX, 8(SP)
16256 JAE emit_remainder_encodeSnappyBetterBlockAsm12B // next position would pass the search limit
16257 MOVQ (DX)(CX*1), SI // SI = 8 bytes of src at the current position
16258 MOVL BX, 20(SP) // remember the next position; BX is reused below
16259 MOVQ $0x0000cf1bbcdcbf9b, R8 // multiplier for the 6-byte hash
16260 MOVQ $0x9e3779b1, BX // multiplier for the 4-byte hash
16261 MOVQ SI, R9
16262 MOVQ SI, R10
16263 SHLQ $0x10, R9 // drop the top 2 bytes so only the low 6 bytes are hashed
16264 IMULQ R8, R9
16265 SHRQ $0x32, R9 // keep the top 14 bits: index into the table at 24(SP)
16266 SHLQ $0x20, R10 // drop the top 4 bytes so only the low 4 bytes are hashed
16267 IMULQ BX, R10
16268 SHRQ $0x34, R10 // keep the top 12 bits: index into the table at 65560(SP)
16269 MOVL 24(SP)(R9*4), BX // BX = candidate position from the 6-byte-hash table
16270 MOVL 65560(SP)(R10*4), DI // DI = candidate position from the 4-byte-hash table
16271 MOVL CX, 24(SP)(R9*4) // both table slots now point at the current position
16272 MOVL CX, 65560(SP)(R10*4)
16273 MOVQ (DX)(BX*1), R9 // R9 = 8 bytes at the first candidate
16274 MOVQ (DX)(DI*1), R10 // R10 = 8 bytes at the second candidate
16275 CMPQ R9, SI
16276 JEQ candidate_match_encodeSnappyBetterBlockAsm12B // first candidate matches all 8 bytes
16277 CMPQ R10, SI
16278 JNE no_short_found_encodeSnappyBetterBlockAsm12B
16279 MOVL DI, BX // second candidate matches all 8 bytes: use it
16280 JMP candidate_match_encodeSnappyBetterBlockAsm12B
16281
16282no_short_found_encodeSnappyBetterBlockAsm12B:
16283 CMPL R9, SI // 32-bit compare: do the first 4 bytes match?
16284 JEQ candidate_match_encodeSnappyBetterBlockAsm12B
16285 CMPL R10, SI
16286 JEQ candidateS_match_encodeSnappyBetterBlockAsm12B
16287 MOVL 20(SP), CX // nothing matched: advance to the saved next position
16288 JMP search_loop_encodeSnappyBetterBlockAsm12B
16289
16290candidateS_match_encodeSnappyBetterBlockAsm12B:
16291 SHRQ $0x08, SI // SI = the loaded bytes starting one position later
16292 MOVQ SI, R9
16293 SHLQ $0x10, R9 // 6-byte hash of position CX+1 (R8 still holds the multiplier)
16294 IMULQ R8, R9
16295 SHRQ $0x32, R9
16296 MOVL 24(SP)(R9*4), BX // BX = 6-byte-hash candidate for position CX+1
16297 INCL CX // tentatively move to CX+1
16298 MOVL CX, 24(SP)(R9*4) // record CX+1 in the table
16299 CMPL (DX)(BX*1), SI // does that candidate match 4 bytes at CX+1?
16300 JEQ candidate_match_encodeSnappyBetterBlockAsm12B
16301 DECL CX // no: go back to the original position
16302 MOVL DI, BX // and use the 4-byte-hash candidate
16303
16304candidate_match_encodeSnappyBetterBlockAsm12B:
16305 MOVL 12(SP), SI // SI = start of pending literals, the lower bound for CX
16306 TESTL BX, BX
16307 JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B // candidate at position 0 cannot move back
16308
16309match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
16310 CMPL CX, SI
16311 JBE match_extend_back_end_encodeSnappyBetterBlockAsm12B // reached the pending-literal start
16312 MOVB -1(DX)(BX*1), DI // byte just before the candidate
16313 MOVB -1(DX)(CX*1), R8 // byte just before the current position
16314 CMPB DI, R8
16315 JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B
16316 LEAL -1(CX), CX // bytes agree: grow the match one byte backwards
16317 DECL BX
16318 JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B // candidate reached position 0
16319 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B
16320
16321match_extend_back_end_encodeSnappyBetterBlockAsm12B:
16322 MOVL CX, SI
16323 SUBL 12(SP), SI // SI = number of pending literal bytes
16324 LEAQ 3(AX)(SI*1), SI // dst cursor after the literals plus 3 more bytes
16325 CMPQ SI, (SP)
16326 JB match_dst_size_check_encodeSnappyBetterBlockAsm12B
16327 MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
16328 RET
16329
16330match_dst_size_check_encodeSnappyBetterBlockAsm12B:
16331 MOVL CX, SI // SI = match start, kept for literal emission and re-indexing
16332 ADDL $0x04, CX // the first 4 bytes are already known to match
16333 ADDL $0x04, BX
16334 MOVQ src_len+32(FP), DI
16335 SUBL CX, DI // DI = bytes left in src after CX
16336 LEAQ (DX)(CX*1), R8 // R8 = pointer to the current data
16337 LEAQ (DX)(BX*1), R9 // R9 = pointer to the candidate data
16338
16339 // matchLen: R11 = count of equal bytes at R8 and R9, at most DI
16340 XORL R11, R11
16341
16342matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B:
16343 CMPL DI, $0x10
16344 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B // fewer than 16 bytes left
16345 MOVQ (R8)(R11*1), R10
16346 MOVQ 8(R8)(R11*1), R12
16347 XORQ (R9)(R11*1), R10 // non-zero if the first 8 bytes differ
16348 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B
16349 XORQ 8(R9)(R11*1), R12 // non-zero if the second 8 bytes differ
16350 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B
16351 LEAL -16(DI), DI // all 16 bytes equal: consume them and repeat
16352 LEAL 16(R11), R11
16353 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B
16354
16355matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B:
16356#ifdef GOAMD64_v3
16357 TZCNTQ R12, R12 // index of the lowest differing bit
16358
16359#else
16360 BSFQ R12, R12 // index of the lowest differing bit
16361
16362#endif
16363 SARQ $0x03, R12 // bit index / 8 = number of equal bytes in this word
16364 LEAL 8(R11)(R12*1), R11 // add the 8 bytes of the first word, which matched
16365 JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
16366
16367matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B:
16368 CMPL DI, $0x08
16369 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
16370 MOVQ (R8)(R11*1), R10
16371 XORQ (R9)(R11*1), R10
16372 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B
16373 LEAL -8(DI), DI // 8 more bytes equal
16374 LEAL 8(R11), R11
16375 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
16376
16377matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B:
16378#ifdef GOAMD64_v3
16379 TZCNTQ R10, R10 // index of the lowest differing bit
16380
16381#else
16382 BSFQ R10, R10 // index of the lowest differing bit
16383
16384#endif
16385 SARQ $0x03, R10 // bit index / 8 = number of equal bytes in this word
16386 LEAL (R11)(R10*1), R11
16387 JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
16388
16389matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B:
16390 CMPL DI, $0x04
16391 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
16392 MOVL (R8)(R11*1), R10
16393 CMPL (R9)(R11*1), R10
16394 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B // mismatch within these 4 bytes: refine below
16395 LEAL -4(DI), DI
16396 LEAL 4(R11), R11
16397
16398matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B:
16399 CMPL DI, $0x01
16400 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B // exactly one byte left
16401 JB match_nolit_end_encodeSnappyBetterBlockAsm12B // nothing left
16402 MOVW (R8)(R11*1), R10
16403 CMPW (R9)(R11*1), R10
16404 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
16405 LEAL 2(R11), R11
16406 SUBL $0x02, DI
16407 JZ match_nolit_end_encodeSnappyBetterBlockAsm12B
16408
16409matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B:
16410 MOVB (R8)(R11*1), R10
16411 CMPB (R9)(R11*1), R10
16412 JNE match_nolit_end_encodeSnappyBetterBlockAsm12B
16413 LEAL 1(R11), R11
16414
16415match_nolit_end_encodeSnappyBetterBlockAsm12B:
16416 MOVL CX, DI
16417 SUBL BX, DI // DI = match offset = current position - candidate position
16418
16419 // Check if repeat (no repeat test is made in this function; the offset is only recorded)
16420 MOVL DI, 16(SP)
16421 MOVL 12(SP), BX // BX = start of pending literals
16422 CMPL BX, SI
16423 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B // no literals before the match
16424 MOVL SI, R8
16425 MOVL SI, 12(SP) // literals are consumed up to the match start
16426 LEAQ (DX)(BX*1), R9 // R9 = pointer to the first literal byte
16427 SUBL BX, R8 // R8 = literal length
16428 LEAL -1(R8), BX // BX = length - 1, the value stored in the literal header
16429 CMPL BX, $0x3c
16430 JB one_byte_match_emit_encodeSnappyBetterBlockAsm12B // length-1 < 60 fits in the tag byte
16431 CMPL BX, $0x00000100
16432 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm12B // length-1 < 256 fits in one extra byte
16433 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm12B // never taken (same flags as the JB above); execution falls through to the label below
16434
16435three_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
16436 MOVB $0xf4, (AX) // tag 0xf4 (61<<2), followed by a 16-bit length-1
16437 MOVW BX, 1(AX)
16438 ADDQ $0x03, AX
16439 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
16440
16441two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
16442 MOVB $0xf0, (AX) // tag 0xf0 (60<<2), followed by an 8-bit length-1
16443 MOVB BL, 1(AX)
16444 ADDQ $0x02, AX
16445 CMPL BX, $0x40
16446 JB memmove_match_emit_encodeSnappyBetterBlockAsm12B // length <= 64: short copy
16447 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
16448
16449one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
16450 SHLB $0x02, BL // tag = (length-1) << 2
16451 MOVB BL, (AX)
16452 ADDQ $0x01, AX
16453
16454memmove_match_emit_encodeSnappyBetterBlockAsm12B:
16455 LEAQ (AX)(R8*1), BX // BX = dst cursor after the literal bytes
16456
16457 // genMemMoveShort: copy R8 (<= 64) bytes from R9 to AX using overlapping head/tail moves
16458 CMPQ R8, $0x08
16459 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
16460 CMPQ R8, $0x10
16461 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
16462 CMPQ R8, $0x20
16463 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
16464 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
16465
16466emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
16467 MOVQ (R9), R10 // always moves 8 bytes, even when R8 < 8; the cursor still advances by R8 only
16468 MOVQ R10, (AX) // NOTE(review): assumes 8 readable bytes at R9 and 8 writable bytes at AX -- confirm
16469 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
16470
16471emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
16472 MOVQ (R9), R10 // first 8 bytes
16473 MOVQ -8(R9)(R8*1), R9 // last 8 bytes (may overlap the first 8)
16474 MOVQ R10, (AX)
16475 MOVQ R9, -8(AX)(R8*1)
16476 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
16477
16478emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
16479 MOVOU (R9), X0 // first 16 bytes
16480 MOVOU -16(R9)(R8*1), X1 // last 16 bytes (may overlap)
16481 MOVOU X0, (AX)
16482 MOVOU X1, -16(AX)(R8*1)
16483 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
16484
16485emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
16486 MOVOU (R9), X0 // first 32 bytes
16487 MOVOU 16(R9), X1
16488 MOVOU -32(R9)(R8*1), X2 // last 32 bytes (may overlap)
16489 MOVOU -16(R9)(R8*1), X3
16490 MOVOU X0, (AX)
16491 MOVOU X1, 16(AX)
16492 MOVOU X2, -32(AX)(R8*1)
16493 MOVOU X3, -16(AX)(R8*1)
16494
16495memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
16496 MOVQ BX, AX // advance the dst cursor past the literals
16497 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
16498
16499memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
16500 LEAQ (AX)(R8*1), BX // BX = dst cursor after the literal bytes
16501
16502 // genMemMoveLong: copy R8 bytes from R9 to AX; head and tail are held in X0-X3, the middle is copied 32 bytes at a time
16503 MOVOU (R9), X0 // first 32 bytes, stored last
16504 MOVOU 16(R9), X1
16505 MOVOU -32(R9)(R8*1), X2 // last 32 bytes, stored last
16506 MOVOU -16(R9)(R8*1), X3
16507 MOVQ R8, R12
16508 SHRQ $0x05, R12 // R12 = number of whole 32-byte chunks
16509 MOVQ AX, R10
16510 ANDL $0x0000001f, R10 // R10 = AX modulo 32
16511 MOVQ $0x00000040, R13
16512 SUBQ R10, R13 // R13 = 64 - (AX mod 32), so AX + R13 - 32 is 32-byte aligned
16513 DECQ R12
16514 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 // NOTE(review): DECQ does not write CF, so this tests ZF from DECQ with CF left by the SUBQ above
16515 LEAQ -32(R9)(R13*1), R10 // R10 = source pointer for the aligned copy
16516 LEAQ -32(AX)(R13*1), R14 // R14 = 32-byte-aligned destination pointer
16517
16518emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
16519 MOVOU (R10), X4 // unaligned loads from src
16520 MOVOU 16(R10), X5
16521 MOVOA X4, (R14) // aligned stores to dst
16522 MOVOA X5, 16(R14)
16523 ADDQ $0x20, R14
16524 ADDQ $0x20, R10
16525 ADDQ $0x20, R13
16526 DECQ R12
16527 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
16528
16529emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
16530 MOVOU -32(R9)(R13*1), X4 // R13 is the byte offset just past the 32-byte chunk being copied
16531 MOVOU -16(R9)(R13*1), X5
16532 MOVOA X4, -32(AX)(R13*1)
16533 MOVOA X5, -16(AX)(R13*1)
16534 ADDQ $0x20, R13
16535 CMPQ R8, R13
16536 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 // loop while the next chunk ends within R8 bytes
16537 MOVOU X0, (AX) // finally write the saved head and tail with unaligned stores
16538 MOVOU X1, 16(AX)
16539 MOVOU X2, -32(AX)(R8*1)
16540 MOVOU X3, -16(AX)(R8*1)
16541 MOVQ BX, AX // advance the dst cursor past the literals
16542
16543emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
16544 ADDL R11, CX // CX = end of the match (the +4 was added earlier)
16545 ADDL $0x04, R11 // R11 = full match length including the first 4 bytes
16546 MOVL CX, 12(SP) // pending literals now start after the match
16547
16548 // emitCopy: encode a copy of R11 bytes at offset DI
16549two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
16550 CMPL R11, $0x40
16551 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B // 64 bytes or fewer remain
16552 MOVB $0xee, (AX) // tag 0xee = (59<<2)|2: a 60-byte copy with a 16-bit offset
16553 MOVW DI, 1(AX)
16554 LEAL -60(R11), R11 // 60 bytes of the match are now encoded
16555 ADDQ $0x03, AX
16556 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B
16557
16558two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
16559 MOVL R11, BX
16560 SHLL $0x02, BX // BX = length << 2
16561 CMPL R11, $0x0c
16562 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B // length >= 12 needs the 3-byte form
16563 CMPL DI, $0x00000800
16564 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B // offset >= 2048 needs the 3-byte form
16565 LEAL -15(BX), BX // BX = ((length-4) << 2) | 1
16566 MOVB DI, 1(AX) // second byte = low 8 bits of the offset
16567 SHRL $0x08, DI
16568 SHLL $0x05, DI // offset bits 8 and up move to bit 5 and up of the tag
16569 ORL DI, BX
16570 MOVB BL, (AX)
16571 ADDQ $0x02, AX
16572 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B
16573
16574emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
16575 LEAL -2(BX), BX // BX = ((length-1) << 2) | 2
16576 MOVB BL, (AX)
16577 MOVW DI, 1(AX) // 16-bit offset
16578 ADDQ $0x03, AX
16579
16580match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
16581 CMPL CX, 8(SP)
16582 JAE emit_remainder_encodeSnappyBetterBlockAsm12B // past the search limit: flush the tail
16583 CMPQ AX, (SP)
16584 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B
16585 MOVQ $0x00000000, ret+48(FP) // dst limit reached: return 0
16586 RET
16587
16588match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
16589 MOVQ $0x0000cf1bbcdcbf9b, BX // multiplier for the 6-byte hash
16590 MOVQ $0x9e3779b1, DI // multiplier for the 4-byte hash
16591 LEAQ 1(SI), SI // SI = match start + 1
16592 LEAQ -2(CX), R8 // R8 = match end - 2
16593 MOVQ (DX)(SI*1), R9 // load the data at SI, SI+1, R8 and R8+1
16594 MOVQ 1(DX)(SI*1), R10
16595 MOVQ (DX)(R8*1), R11
16596 MOVQ 1(DX)(R8*1), R12
16597 SHLQ $0x10, R9 // 6-byte hash of the data at SI
16598 IMULQ BX, R9
16599 SHRQ $0x32, R9
16600 SHLQ $0x20, R10 // 4-byte hash of the data at SI+1
16601 IMULQ DI, R10
16602 SHRQ $0x34, R10
16603 SHLQ $0x10, R11 // 6-byte hash of the data at R8
16604 IMULQ BX, R11
16605 SHRQ $0x32, R11
16606 SHLQ $0x20, R12 // 4-byte hash of the data at R8+1
16607 IMULQ DI, R12
16608 SHRQ $0x34, R12
16609 LEAQ 1(SI), DI // DI = SI + 1 (the 4-byte multiplier is no longer needed)
16610 LEAQ 1(R8), R13 // R13 = R8 + 1
16611 MOVL SI, 24(SP)(R9*4) // record the four positions in their tables
16612 MOVL R8, 24(SP)(R11*4)
16613 MOVL DI, 65560(SP)(R10*4)
16614 MOVL R13, 65560(SP)(R12*4)
16615 LEAQ 1(R8)(SI*1), DI
16616 SHRQ $0x01, DI // DI = (SI + R8 + 1) / 2, the midpoint of the matched range
16617 ADDQ $0x01, SI
16618 SUBQ $0x01, R8
16619
16620index_loop_encodeSnappyBetterBlockAsm12B:
16621 CMPQ DI, R8
16622 JAE search_loop_encodeSnappyBetterBlockAsm12B // indexing done: resume searching at CX
16623 MOVQ (DX)(SI*1), R9 // index two positions per iteration: SI and DI
16624 MOVQ (DX)(DI*1), R10
16625 SHLQ $0x10, R9
16626 IMULQ BX, R9
16627 SHRQ $0x32, R9
16628 SHLQ $0x10, R10
16629 IMULQ BX, R10
16630 SHRQ $0x32, R10
16631 MOVL SI, 24(SP)(R9*4) // only the 6-byte-hash table is updated here
16632 MOVL DI, 24(SP)(R10*4)
16633 ADDQ $0x02, SI // both cursors step by 2
16634 ADDQ $0x02, DI
16635 JMP index_loop_encodeSnappyBetterBlockAsm12B
16636
16637emit_remainder_encodeSnappyBetterBlockAsm12B:
16638 MOVQ src_len+32(FP), CX
16639 SUBL 12(SP), CX // CX = bytes of src not yet emitted
16640 LEAQ 3(AX)(CX*1), CX // dst cursor after those bytes plus 3 more
16641 CMPQ CX, (SP)
16642 JB emit_remainder_ok_encodeSnappyBetterBlockAsm12B
16643 MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
16644 RET
16645
16646emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
16647 MOVQ src_len+32(FP), CX
16648 MOVL 12(SP), BX // BX = start of pending literals
16649 CMPL BX, CX
16650 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B // nothing left to emit
16651 MOVL CX, SI
16652 MOVL CX, 12(SP) // everything up to src_len is now consumed
16653 LEAQ (DX)(BX*1), CX // CX = pointer to the first literal byte
16654 SUBL BX, SI // SI = literal length
16655 LEAL -1(SI), DX // DX = length - 1 (src base is no longer needed)
16656 CMPL DX, $0x3c
16657 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B // length-1 < 60 fits in the tag byte
16658 CMPL DX, $0x00000100
16659 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B // length-1 < 256 fits in one extra byte
16660 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B // never taken (same flags as the JB above); execution falls through to the label below
16661
16662three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
16663 MOVB $0xf4, (AX) // tag 0xf4 (61<<2), followed by a 16-bit length-1
16664 MOVW DX, 1(AX)
16665 ADDQ $0x03, AX
16666 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
16667
16668two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
16669 MOVB $0xf0, (AX) // tag 0xf0 (60<<2), followed by an 8-bit length-1
16670 MOVB DL, 1(AX)
16671 ADDQ $0x02, AX
16672 CMPL DX, $0x40
16673 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm12B // length <= 64: short copy
16674 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
16675
16676one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
16677 SHLB $0x02, DL // tag = (length-1) << 2
16678 MOVB DL, (AX)
16679 ADDQ $0x01, AX
16680
16681memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
16682 LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
16683 MOVL SI, BX // BX = literal length
16684
16685 // genMemMoveShort: copy exactly BX (1..64) bytes from CX to AX, with a case for each size range
16686 CMPQ BX, $0x03
16687 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2
16688 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3
16689 CMPQ BX, $0x08
16690 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7
16691 CMPQ BX, $0x10
16692 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
16693 CMPQ BX, $0x20
16694 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
16695 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
16696
16697emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2:
16698 MOVB (CX), SI // first byte
16699 MOVB -1(CX)(BX*1), CL // last byte (the same byte when BX is 1)
16700 MOVB SI, (AX)
16701 MOVB CL, -1(AX)(BX*1)
16702 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16703
16704emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3:
16705 MOVW (CX), SI // bytes 0-1
16706 MOVB 2(CX), CL // byte 2
16707 MOVW SI, (AX)
16708 MOVB CL, 2(AX)
16709 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16710
16711emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7:
16712 MOVL (CX), SI // first 4 bytes
16713 MOVL -4(CX)(BX*1), CX // last 4 bytes (may overlap the first 4)
16714 MOVL SI, (AX)
16715 MOVL CX, -4(AX)(BX*1)
16716 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16717
16718emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
16719 MOVQ (CX), SI // first 8 bytes
16720 MOVQ -8(CX)(BX*1), CX // last 8 bytes (may overlap the first 8)
16721 MOVQ SI, (AX)
16722 MOVQ CX, -8(AX)(BX*1)
16723 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16724
16725emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
16726 MOVOU (CX), X0 // first 16 bytes
16727 MOVOU -16(CX)(BX*1), X1 // last 16 bytes (may overlap)
16728 MOVOU X0, (AX)
16729 MOVOU X1, -16(AX)(BX*1)
16730 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16731
16732emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
16733 MOVOU (CX), X0 // first 32 bytes
16734 MOVOU 16(CX), X1
16735 MOVOU -32(CX)(BX*1), X2 // last 32 bytes (may overlap)
16736 MOVOU -16(CX)(BX*1), X3
16737 MOVOU X0, (AX)
16738 MOVOU X1, 16(AX)
16739 MOVOU X2, -32(AX)(BX*1)
16740 MOVOU X3, -16(AX)(BX*1)
16741
16742memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
16743 MOVQ DX, AX // advance the dst cursor past the literals
16744 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
16745
16746memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
16747 LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
16748 MOVL SI, BX // BX = literal length
16749
16750 // genMemMoveLong: copy BX bytes from CX to AX; head and tail are held in X0-X3, the middle is copied 32 bytes at a time
16751 MOVOU (CX), X0 // first 32 bytes, stored last
16752 MOVOU 16(CX), X1
16753 MOVOU -32(CX)(BX*1), X2 // last 32 bytes, stored last
16754 MOVOU -16(CX)(BX*1), X3
16755 MOVQ BX, DI
16756 SHRQ $0x05, DI // DI = number of whole 32-byte chunks
16757 MOVQ AX, SI
16758 ANDL $0x0000001f, SI // SI = AX modulo 32
16759 MOVQ $0x00000040, R8
16760 SUBQ SI, R8 // R8 = 64 - (AX mod 32), so AX + R8 - 32 is 32-byte aligned
16761 DECQ DI
16762 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 // NOTE(review): DECQ does not write CF, so this tests ZF from DECQ with CF left by the SUBQ above
16763 LEAQ -32(CX)(R8*1), SI // SI = source pointer for the aligned copy
16764 LEAQ -32(AX)(R8*1), R9 // R9 = 32-byte-aligned destination pointer
16765
16766emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
16767 MOVOU (SI), X4 // unaligned loads from src
16768 MOVOU 16(SI), X5
16769 MOVOA X4, (R9) // aligned stores to dst
16770 MOVOA X5, 16(R9)
16771 ADDQ $0x20, R9
16772 ADDQ $0x20, SI
16773 ADDQ $0x20, R8
16774 DECQ DI
16775 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
16776
16777emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
16778 MOVOU -32(CX)(R8*1), X4 // R8 is the byte offset just past the 32-byte chunk being copied
16779 MOVOU -16(CX)(R8*1), X5
16780 MOVOA X4, -32(AX)(R8*1)
16781 MOVOA X5, -16(AX)(R8*1)
16782 ADDQ $0x20, R8
16783 CMPQ BX, R8
16784 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 // loop while the next chunk ends within BX bytes
16785 MOVOU X0, (AX) // finally write the saved head and tail with unaligned stores
16786 MOVOU X1, 16(AX)
16787 MOVOU X2, -32(AX)(BX*1)
16788 MOVOU X3, -16(AX)(BX*1)
16789 MOVQ DX, AX // advance the dst cursor past the literals
16790
16791emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
16792 MOVQ dst_base+0(FP), CX
16793 SUBQ CX, AX // AX = bytes written = dst cursor - dst_base
16794 MOVQ AX, ret+48(FP)
16795 RET
16796
16797// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
16798// Requires: BMI, SSE2
16799TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56
16800 MOVQ dst_base+0(FP), AX
16801 MOVQ $0x000000a0, CX
16802 LEAQ 24(SP), DX
16803 PXOR X0, X0
16804
16805zero_loop_encodeSnappyBetterBlockAsm10B:
16806 MOVOU X0, (DX)
16807 MOVOU X0, 16(DX)
16808 MOVOU X0, 32(DX)
16809 MOVOU X0, 48(DX)
16810 MOVOU X0, 64(DX)
16811 MOVOU X0, 80(DX)
16812 MOVOU X0, 96(DX)
16813 MOVOU X0, 112(DX)
16814 ADDQ $0x80, DX
16815 DECQ CX
16816 JNZ zero_loop_encodeSnappyBetterBlockAsm10B
16817 MOVL $0x00000000, 12(SP)
16818 MOVQ src_len+32(FP), CX
16819 LEAQ -9(CX), DX
16820 LEAQ -8(CX), BX
16821 MOVL BX, 8(SP)
16822 SHRQ $0x05, CX
16823 SUBL CX, DX
16824 LEAQ (AX)(DX*1), DX
16825 MOVQ DX, (SP)
16826 MOVL $0x00000001, CX
16827 MOVL $0x00000000, 16(SP)
16828 MOVQ src_base+24(FP), DX
16829
16830search_loop_encodeSnappyBetterBlockAsm10B:
16831 MOVL CX, BX
16832 SUBL 12(SP), BX
16833 SHRL $0x05, BX
16834 LEAL 1(CX)(BX*1), BX
16835 CMPL BX, 8(SP)
16836 JAE emit_remainder_encodeSnappyBetterBlockAsm10B
16837 MOVQ (DX)(CX*1), SI
16838 MOVL BX, 20(SP)
16839 MOVQ $0x0000cf1bbcdcbf9b, R8
16840 MOVQ $0x9e3779b1, BX
16841 MOVQ SI, R9
16842 MOVQ SI, R10
16843 SHLQ $0x10, R9
16844 IMULQ R8, R9
16845 SHRQ $0x34, R9
16846 SHLQ $0x20, R10
16847 IMULQ BX, R10
16848 SHRQ $0x36, R10
16849 MOVL 24(SP)(R9*4), BX
16850 MOVL 16408(SP)(R10*4), DI
16851 MOVL CX, 24(SP)(R9*4)
16852 MOVL CX, 16408(SP)(R10*4)
16853 MOVQ (DX)(BX*1), R9
16854 MOVQ (DX)(DI*1), R10
16855 CMPQ R9, SI
16856 JEQ candidate_match_encodeSnappyBetterBlockAsm10B
16857 CMPQ R10, SI
16858 JNE no_short_found_encodeSnappyBetterBlockAsm10B
16859 MOVL DI, BX
16860 JMP candidate_match_encodeSnappyBetterBlockAsm10B
16861
16862no_short_found_encodeSnappyBetterBlockAsm10B:
16863 CMPL R9, SI
16864 JEQ candidate_match_encodeSnappyBetterBlockAsm10B
16865 CMPL R10, SI
16866 JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
16867 MOVL 20(SP), CX
16868 JMP search_loop_encodeSnappyBetterBlockAsm10B
16869
16870candidateS_match_encodeSnappyBetterBlockAsm10B:
16871 SHRQ $0x08, SI
16872 MOVQ SI, R9
16873 SHLQ $0x10, R9
16874 IMULQ R8, R9
16875 SHRQ $0x34, R9
16876 MOVL 24(SP)(R9*4), BX
16877 INCL CX
16878 MOVL CX, 24(SP)(R9*4)
16879 CMPL (DX)(BX*1), SI
16880 JEQ candidate_match_encodeSnappyBetterBlockAsm10B
16881 DECL CX
16882 MOVL DI, BX
16883
16884candidate_match_encodeSnappyBetterBlockAsm10B:
16885 MOVL 12(SP), SI
16886 TESTL BX, BX
16887 JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
16888
16889match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
16890 CMPL CX, SI
16891 JBE match_extend_back_end_encodeSnappyBetterBlockAsm10B
16892 MOVB -1(DX)(BX*1), DI
16893 MOVB -1(DX)(CX*1), R8
16894 CMPB DI, R8
16895 JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B
16896 LEAL -1(CX), CX
16897 DECL BX
16898 JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
16899 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B
16900
16901match_extend_back_end_encodeSnappyBetterBlockAsm10B:
16902 MOVL CX, SI
16903 SUBL 12(SP), SI
16904 LEAQ 3(AX)(SI*1), SI
16905 CMPQ SI, (SP)
16906 JB match_dst_size_check_encodeSnappyBetterBlockAsm10B
16907 MOVQ $0x00000000, ret+48(FP)
16908 RET
16909
16910match_dst_size_check_encodeSnappyBetterBlockAsm10B:
16911 MOVL CX, SI
16912 ADDL $0x04, CX
16913 ADDL $0x04, BX
16914 MOVQ src_len+32(FP), DI
16915 SUBL CX, DI
16916 LEAQ (DX)(CX*1), R8
16917 LEAQ (DX)(BX*1), R9
16918
16919 // matchLen
16920 XORL R11, R11
16921
16922matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B:
16923 CMPL DI, $0x10
16924 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B
16925 MOVQ (R8)(R11*1), R10
16926 MOVQ 8(R8)(R11*1), R12
16927 XORQ (R9)(R11*1), R10
16928 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
16929 XORQ 8(R9)(R11*1), R12
16930 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B
16931 LEAL -16(DI), DI
16932 LEAL 16(R11), R11
16933 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B
16934
16935matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B:
16936#ifdef GOAMD64_v3
16937 TZCNTQ R12, R12
16938
16939#else
16940 BSFQ R12, R12
16941
16942#endif
16943 SARQ $0x03, R12
16944 LEAL 8(R11)(R12*1), R11
16945 JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
16946
16947matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B:
16948 CMPL DI, $0x08
16949 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
16950 MOVQ (R8)(R11*1), R10
16951 XORQ (R9)(R11*1), R10
16952 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
16953 LEAL -8(DI), DI
16954 LEAL 8(R11), R11
16955 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
16956
16957matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B:
16958#ifdef GOAMD64_v3
16959 TZCNTQ R10, R10
16960
16961#else
16962 BSFQ R10, R10
16963
16964#endif
16965 SARQ $0x03, R10
16966 LEAL (R11)(R10*1), R11
16967 JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
16968
16969matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B:
16970 CMPL DI, $0x04
16971 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
16972 MOVL (R8)(R11*1), R10
16973 CMPL (R9)(R11*1), R10
16974 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
16975 LEAL -4(DI), DI
16976 LEAL 4(R11), R11
16977
16978matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B:
16979 CMPL DI, $0x01
16980 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
16981 JB match_nolit_end_encodeSnappyBetterBlockAsm10B
16982 MOVW (R8)(R11*1), R10
16983 CMPW (R9)(R11*1), R10
16984 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
16985 LEAL 2(R11), R11
16986 SUBL $0x02, DI
16987 JZ match_nolit_end_encodeSnappyBetterBlockAsm10B
16988
16989matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B:
16990 MOVB (R8)(R11*1), R10
16991 CMPB (R9)(R11*1), R10
16992 JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
16993 LEAL 1(R11), R11
16994
16995match_nolit_end_encodeSnappyBetterBlockAsm10B:
16996 MOVL CX, DI
16997 SUBL BX, DI
16998
16999 // Check if repeat
17000 MOVL DI, 16(SP)
17001 MOVL 12(SP), BX
17002 CMPL BX, SI
17003 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
17004 MOVL SI, R8
17005 MOVL SI, 12(SP)
17006 LEAQ (DX)(BX*1), R9
17007 SUBL BX, R8
17008 LEAL -1(R8), BX
17009 CMPL BX, $0x3c
17010 JB one_byte_match_emit_encodeSnappyBetterBlockAsm10B
17011 CMPL BX, $0x00000100
17012 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
17013 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm10B
17014
17015three_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
17016 MOVB $0xf4, (AX)
17017 MOVW BX, 1(AX)
17018 ADDQ $0x03, AX
17019 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
17020
17021two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
17022 MOVB $0xf0, (AX)
17023 MOVB BL, 1(AX)
17024 ADDQ $0x02, AX
17025 CMPL BX, $0x40
17026 JB memmove_match_emit_encodeSnappyBetterBlockAsm10B
17027 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
17028
17029one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
17030 SHLB $0x02, BL
17031 MOVB BL, (AX)
17032 ADDQ $0x01, AX
17033
17034memmove_match_emit_encodeSnappyBetterBlockAsm10B:
17035 LEAQ (AX)(R8*1), BX
17036
17037 // genMemMoveShort
17038 CMPQ R8, $0x08
17039 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
17040 CMPQ R8, $0x10
17041 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
17042 CMPQ R8, $0x20
17043 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
17044 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
17045
17046emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
17047 MOVQ (R9), R10
17048 MOVQ R10, (AX)
17049 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
17050
17051emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
17052 MOVQ (R9), R10
17053 MOVQ -8(R9)(R8*1), R9
17054 MOVQ R10, (AX)
17055 MOVQ R9, -8(AX)(R8*1)
17056 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
17057
17058emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
17059 MOVOU (R9), X0
17060 MOVOU -16(R9)(R8*1), X1
17061 MOVOU X0, (AX)
17062 MOVOU X1, -16(AX)(R8*1)
17063 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
17064
17065emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
17066 MOVOU (R9), X0
17067 MOVOU 16(R9), X1
17068 MOVOU -32(R9)(R8*1), X2
17069 MOVOU -16(R9)(R8*1), X3
17070 MOVOU X0, (AX)
17071 MOVOU X1, 16(AX)
17072 MOVOU X2, -32(AX)(R8*1)
17073 MOVOU X3, -16(AX)(R8*1)
17074
17075memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
17076 MOVQ BX, AX
17077 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
17078
17079memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
17080 LEAQ (AX)(R8*1), BX
17081
17082 // genMemMoveLong
17083 MOVOU (R9), X0
17084 MOVOU 16(R9), X1
17085 MOVOU -32(R9)(R8*1), X2
17086 MOVOU -16(R9)(R8*1), X3
17087 MOVQ R8, R12
17088 SHRQ $0x05, R12
17089 MOVQ AX, R10
17090 ANDL $0x0000001f, R10
17091 MOVQ $0x00000040, R13
17092 SUBQ R10, R13
17093 DECQ R12
17094 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
17095 LEAQ -32(R9)(R13*1), R10
17096 LEAQ -32(AX)(R13*1), R14
17097
17098emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
17099 MOVOU (R10), X4
17100 MOVOU 16(R10), X5
17101 MOVOA X4, (R14)
17102 MOVOA X5, 16(R14)
17103 ADDQ $0x20, R14
17104 ADDQ $0x20, R10
17105 ADDQ $0x20, R13
17106 DECQ R12
17107 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
17108
17109emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
17110 MOVOU -32(R9)(R13*1), X4
17111 MOVOU -16(R9)(R13*1), X5
17112 MOVOA X4, -32(AX)(R13*1)
17113 MOVOA X5, -16(AX)(R13*1)
17114 ADDQ $0x20, R13
17115 CMPQ R8, R13
17116 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
17117 MOVOU X0, (AX)
17118 MOVOU X1, 16(AX)
17119 MOVOU X2, -32(AX)(R8*1)
17120 MOVOU X3, -16(AX)(R8*1)
17121 MOVQ BX, AX
17122
17123emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
17124 ADDL R11, CX
17125 ADDL $0x04, R11
17126 MOVL CX, 12(SP)
17127
17128 // emitCopy
17129two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
17130 CMPL R11, $0x40
17131 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
17132 MOVB $0xee, (AX)
17133 MOVW DI, 1(AX)
17134 LEAL -60(R11), R11
17135 ADDQ $0x03, AX
17136 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B
17137
17138two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
17139 MOVL R11, BX
17140 SHLL $0x02, BX
17141 CMPL R11, $0x0c
17142 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
17143 CMPL DI, $0x00000800
17144 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
17145 LEAL -15(BX), BX
17146 MOVB DI, 1(AX)
17147 SHRL $0x08, DI
17148 SHLL $0x05, DI
17149 ORL DI, BX
17150 MOVB BL, (AX)
17151 ADDQ $0x02, AX
17152 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B
17153
17154emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
17155 LEAL -2(BX), BX
17156 MOVB BL, (AX)
17157 MOVW DI, 1(AX)
17158 ADDQ $0x03, AX
17159
17160match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
17161 CMPL CX, 8(SP)
17162 JAE emit_remainder_encodeSnappyBetterBlockAsm10B
17163 CMPQ AX, (SP)
17164 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
17165 MOVQ $0x00000000, ret+48(FP)
17166 RET
17167
17168match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
17169 MOVQ $0x0000cf1bbcdcbf9b, BX
17170 MOVQ $0x9e3779b1, DI
17171 LEAQ 1(SI), SI
17172 LEAQ -2(CX), R8
17173 MOVQ (DX)(SI*1), R9
17174 MOVQ 1(DX)(SI*1), R10
17175 MOVQ (DX)(R8*1), R11
17176 MOVQ 1(DX)(R8*1), R12
17177 SHLQ $0x10, R9
17178 IMULQ BX, R9
17179 SHRQ $0x34, R9
17180 SHLQ $0x20, R10
17181 IMULQ DI, R10
17182 SHRQ $0x36, R10
17183 SHLQ $0x10, R11
17184 IMULQ BX, R11
17185 SHRQ $0x34, R11
17186 SHLQ $0x20, R12
17187 IMULQ DI, R12
17188 SHRQ $0x36, R12
17189 LEAQ 1(SI), DI
17190 LEAQ 1(R8), R13
17191 MOVL SI, 24(SP)(R9*4)
17192 MOVL R8, 24(SP)(R11*4)
17193 MOVL DI, 16408(SP)(R10*4)
17194 MOVL R13, 16408(SP)(R12*4)
17195 LEAQ 1(R8)(SI*1), DI
17196 SHRQ $0x01, DI
17197 ADDQ $0x01, SI
17198 SUBQ $0x01, R8
17199
17200index_loop_encodeSnappyBetterBlockAsm10B:
17201 CMPQ DI, R8
17202 JAE search_loop_encodeSnappyBetterBlockAsm10B
17203 MOVQ (DX)(SI*1), R9
17204 MOVQ (DX)(DI*1), R10
17205 SHLQ $0x10, R9
17206 IMULQ BX, R9
17207 SHRQ $0x34, R9
17208 SHLQ $0x10, R10
17209 IMULQ BX, R10
17210 SHRQ $0x34, R10
17211 MOVL SI, 24(SP)(R9*4)
17212 MOVL DI, 24(SP)(R10*4)
17213 ADDQ $0x02, SI
17214 ADDQ $0x02, DI
17215 JMP index_loop_encodeSnappyBetterBlockAsm10B
17216
17217emit_remainder_encodeSnappyBetterBlockAsm10B:
17218 MOVQ src_len+32(FP), CX
17219 SUBL 12(SP), CX
17220 LEAQ 3(AX)(CX*1), CX
17221 CMPQ CX, (SP)
17222 JB emit_remainder_ok_encodeSnappyBetterBlockAsm10B
17223 MOVQ $0x00000000, ret+48(FP)
17224 RET
17225
17226emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
17227 MOVQ src_len+32(FP), CX
17228 MOVL 12(SP), BX
17229 CMPL BX, CX
17230 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
17231 MOVL CX, SI
17232 MOVL CX, 12(SP)
17233 LEAQ (DX)(BX*1), CX
17234 SUBL BX, SI
17235 LEAL -1(SI), DX
17236 CMPL DX, $0x3c
17237 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
17238 CMPL DX, $0x00000100
17239 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
17240 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
17241
17242three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
17243 MOVB $0xf4, (AX)
17244 MOVW DX, 1(AX)
17245 ADDQ $0x03, AX
17246 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
17247
17248two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
17249 MOVB $0xf0, (AX)
17250 MOVB DL, 1(AX)
17251 ADDQ $0x02, AX
17252 CMPL DX, $0x40
17253 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
17254 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
17255
17256one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
17257 SHLB $0x02, DL
17258 MOVB DL, (AX)
17259 ADDQ $0x01, AX
17260
17261memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
17262 LEAQ (AX)(SI*1), DX
17263 MOVL SI, BX
17264
17265 // genMemMoveShort
17266 CMPQ BX, $0x03
17267 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2
17268 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3
17269 CMPQ BX, $0x08
17270 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7
17271 CMPQ BX, $0x10
17272 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
17273 CMPQ BX, $0x20
17274 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
17275 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
17276
17277emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2:
17278 MOVB (CX), SI
17279 MOVB -1(CX)(BX*1), CL
17280 MOVB SI, (AX)
17281 MOVB CL, -1(AX)(BX*1)
17282 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17283
17284emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3:
17285 MOVW (CX), SI
17286 MOVB 2(CX), CL
17287 MOVW SI, (AX)
17288 MOVB CL, 2(AX)
17289 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17290
17291emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7:
17292 MOVL (CX), SI
17293 MOVL -4(CX)(BX*1), CX
17294 MOVL SI, (AX)
17295 MOVL CX, -4(AX)(BX*1)
17296 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17297
17298emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
17299 MOVQ (CX), SI
17300 MOVQ -8(CX)(BX*1), CX
17301 MOVQ SI, (AX)
17302 MOVQ CX, -8(AX)(BX*1)
17303 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17304
17305emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
17306 MOVOU (CX), X0
17307 MOVOU -16(CX)(BX*1), X1
17308 MOVOU X0, (AX)
17309 MOVOU X1, -16(AX)(BX*1)
17310 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17311
17312emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
17313 MOVOU (CX), X0
17314 MOVOU 16(CX), X1
17315 MOVOU -32(CX)(BX*1), X2
17316 MOVOU -16(CX)(BX*1), X3
17317 MOVOU X0, (AX)
17318 MOVOU X1, 16(AX)
17319 MOVOU X2, -32(AX)(BX*1)
17320 MOVOU X3, -16(AX)(BX*1)
17321
17322memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
17323 MOVQ DX, AX
17324 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
17325
17326memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
17327 LEAQ (AX)(SI*1), DX
17328 MOVL SI, BX
17329
17330 // genMemMoveLong
17331 MOVOU (CX), X0
17332 MOVOU 16(CX), X1
17333 MOVOU -32(CX)(BX*1), X2
17334 MOVOU -16(CX)(BX*1), X3
17335 MOVQ BX, DI
17336 SHRQ $0x05, DI
17337 MOVQ AX, SI
17338 ANDL $0x0000001f, SI
17339 MOVQ $0x00000040, R8
17340 SUBQ SI, R8
17341 DECQ DI
17342 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
17343 LEAQ -32(CX)(R8*1), SI
17344 LEAQ -32(AX)(R8*1), R9
17345
17346emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
17347 MOVOU (SI), X4
17348 MOVOU 16(SI), X5
17349 MOVOA X4, (R9)
17350 MOVOA X5, 16(R9)
17351 ADDQ $0x20, R9
17352 ADDQ $0x20, SI
17353 ADDQ $0x20, R8
17354 DECQ DI
17355 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
17356
17357emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
17358 MOVOU -32(CX)(R8*1), X4
17359 MOVOU -16(CX)(R8*1), X5
17360 MOVOA X4, -32(AX)(R8*1)
17361 MOVOA X5, -16(AX)(R8*1)
17362 ADDQ $0x20, R8
17363 CMPQ BX, R8
17364 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
17365 MOVOU X0, (AX)
17366 MOVOU X1, 16(AX)
17367 MOVOU X2, -32(AX)(BX*1)
17368 MOVOU X3, -16(AX)(BX*1)
17369 MOVQ DX, AX
17370
17371emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
17372 MOVQ dst_base+0(FP), CX
17373 SUBQ CX, AX
17374 MOVQ AX, ret+48(FP)
17375 RET
17376
17377// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
17378// Requires: BMI, SSE2
//
// Encodes src into dst and stores the number of bytes written in ret.
// Stores 0 in ret and returns early whenever the projected output position
// reaches the limit computed at entry.
//
// Frame layout (offsets from SP), as used by the code below:
//   0(SP)     8 bytes  dst limit pointer: dst_base + (src_len - 9 - src_len/32)
//   8(SP)     4 bytes  src_len - 8; searching stops once the next position reaches it
//   12(SP)    4 bytes  src offset of the first byte that has not been emitted yet
//   16(SP)    4 bytes  offset (distance) of the most recent match, zeroed at entry
//   20(SP)    4 bytes  next search position, reloaded when no candidate matches
//   24(SP)    4096 B   1024 x uint32 table indexed by a 10-bit hash of 6 input bytes
//   4120(SP)  1024 B   256 x uint32 table indexed by an 8-bit hash of 4 input bytes
//
// Register roles in the main loop: AX = dst write cursor, DX = src base pointer,
// CX = current src offset. DX is reused as scratch once emit_remainder is reached.
17379TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56
17380 MOVQ dst_base+0(FP), AX  // AX = dst write cursor
17381 MOVQ $0x00000028, CX  // 40 iterations x 128 bytes = 5120 bytes (both tables)
17382 LEAQ 24(SP), DX  // DX = start of the table area
17383 PXOR X0, X0
17384
 // Clear both hash tables, 128 bytes per iteration.
17385zero_loop_encodeSnappyBetterBlockAsm8B:
17386 MOVOU X0, (DX)
17387 MOVOU X0, 16(DX)
17388 MOVOU X0, 32(DX)
17389 MOVOU X0, 48(DX)
17390 MOVOU X0, 64(DX)
17391 MOVOU X0, 80(DX)
17392 MOVOU X0, 96(DX)
17393 MOVOU X0, 112(DX)
17394 ADDQ $0x80, DX
17395 DECQ CX
17396 JNZ zero_loop_encodeSnappyBetterBlockAsm8B
17397 MOVL $0x00000000, 12(SP)  // nothing emitted yet
17398 MOVQ src_len+32(FP), CX
17399 LEAQ -9(CX), DX  // DX = src_len - 9
17400 LEAQ -8(CX), BX
17401 MOVL BX, 8(SP)  // search limit = src_len - 8
17402 SHRQ $0x05, CX  // CX = src_len / 32
17403 SUBL CX, DX  // DX = src_len - 9 - src_len/32
17404 LEAQ (AX)(DX*1), DX
17405 MOVQ DX, (SP)  // dst limit pointer
17406 MOVL $0x00000001, CX  // searching starts at src offset 1
17407 MOVL $0x00000000, 16(SP)  // no previous match offset
17408 MOVQ src_base+24(FP), DX  // DX = src base pointer
17409
 // Main search loop: look up the 8 bytes at src[CX] in both tables.
17410search_loop_encodeSnappyBetterBlockAsm8B:
17411 MOVL CX, BX
17412 SUBL 12(SP), BX  // BX = bytes since the last emit
17413 SHRL $0x04, BX
17414 LEAL 1(CX)(BX*1), BX  // next position = CX + 1 + (pending bytes >> 4)
17415 CMPL BX, 8(SP)
17416 JAE emit_remainder_encodeSnappyBetterBlockAsm8B  // next position at/after search limit
17417 MOVQ (DX)(CX*1), SI  // SI = 8 bytes at src[CX]
17418 MOVL BX, 20(SP)  // remember the next position
17419 MOVQ $0x0000cf1bbcdcbf9b, R8  // multiplier for the 6-byte hash
17420 MOVQ $0x9e3779b1, BX  // multiplier for the 4-byte hash
17421 MOVQ SI, R9
17422 MOVQ SI, R10
17423 SHLQ $0x10, R9  // drop the top 2 bytes: hash covers the low 6 bytes
17424 IMULQ R8, R9
17425 SHRQ $0x36, R9  // keep the top 10 bits
17426 SHLQ $0x20, R10  // drop the top 4 bytes: hash covers the low 4 bytes
17427 IMULQ BX, R10
17428 SHRQ $0x38, R10  // keep the top 8 bits
17429 MOVL 24(SP)(R9*4), BX  // BX = candidate from the 6-byte table
17430 MOVL 4120(SP)(R10*4), DI  // DI = candidate from the 4-byte table
17431 MOVL CX, 24(SP)(R9*4)  // record the current position in both tables
17432 MOVL CX, 4120(SP)(R10*4)
17433 MOVQ (DX)(BX*1), R9  // R9 = 8 bytes at the 6-byte-table candidate
17434 MOVQ (DX)(DI*1), R10  // R10 = 8 bytes at the 4-byte-table candidate
17435 CMPQ R9, SI
17436 JEQ candidate_match_encodeSnappyBetterBlockAsm8B  // full 8-byte match at BX
17437 CMPQ R10, SI
17438 JNE no_short_found_encodeSnappyBetterBlockAsm8B
17439 MOVL DI, BX  // full 8-byte match at the 4-byte-table candidate
17440 JMP candidate_match_encodeSnappyBetterBlockAsm8B
17441
 // No 8-byte match: fall back to comparing only the low 4 bytes.
17442no_short_found_encodeSnappyBetterBlockAsm8B:
17443 CMPL R9, SI
17444 JEQ candidate_match_encodeSnappyBetterBlockAsm8B
17445 CMPL R10, SI
17446 JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
17447 MOVL 20(SP), CX  // no match: advance to the saved next position
17448 JMP search_loop_encodeSnappyBetterBlockAsm8B
17449
 // 4-byte match from the 4-byte table: first probe the 6-byte table at CX+1.
17450candidateS_match_encodeSnappyBetterBlockAsm8B:
17451 SHRQ $0x08, SI  // SI = the loaded bytes starting at src[CX+1]
17452 MOVQ SI, R9
17453 SHLQ $0x10, R9
17454 IMULQ R8, R9
17455 SHRQ $0x36, R9
17456 MOVL 24(SP)(R9*4), BX  // BX = 6-byte-table candidate for CX+1
17457 INCL CX
17458 MOVL CX, 24(SP)(R9*4)
17459 CMPL (DX)(BX*1), SI
17460 JEQ candidate_match_encodeSnappyBetterBlockAsm8B  // use the match at CX+1
17461 DECL CX  // otherwise stay at CX with the 4-byte-table candidate
17462 MOVL DI, BX
17463
 // CX = match position, BX = candidate position.
17464candidate_match_encodeSnappyBetterBlockAsm8B:
17465 MOVL 12(SP), SI  // SI = first unemitted offset (lower bound for CX)
17466 TESTL BX, BX
17467 JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B  // candidate at offset 0 cannot move back
17468
 // Extend the match backwards while the preceding bytes are equal.
17469match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
17470 CMPL CX, SI
17471 JBE match_extend_back_end_encodeSnappyBetterBlockAsm8B
17472 MOVB -1(DX)(BX*1), DI
17473 MOVB -1(DX)(CX*1), R8
17474 CMPB DI, R8
17475 JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B
17476 LEAL -1(CX), CX
17477 DECL BX
17478 JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
17479 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B
17480
 // Bail out with 0 if cursor + 3 + pending literal bytes reaches the dst limit.
17481match_extend_back_end_encodeSnappyBetterBlockAsm8B:
17482 MOVL CX, SI
17483 SUBL 12(SP), SI  // SI = number of pending literal bytes
17484 LEAQ 3(AX)(SI*1), SI
17485 CMPQ SI, (SP)
17486 JB match_dst_size_check_encodeSnappyBetterBlockAsm8B
17487 MOVQ $0x00000000, ret+48(FP)
17488 RET
17489
17490match_dst_size_check_encodeSnappyBetterBlockAsm8B:
17491 MOVL CX, SI  // SI = match start (end of pending literals)
17492 ADDL $0x04, CX  // skip the 4 bytes already known to match
17493 ADDL $0x04, BX
17494 MOVQ src_len+32(FP), DI
17495 SUBL CX, DI  // DI = bytes remaining after CX
17496 LEAQ (DX)(CX*1), R8  // R8 = &src[CX]
17497 LEAQ (DX)(BX*1), R9  // R9 = &src[BX]
17498
17499 // matchLen
17500 XORL R11, R11  // R11 = count of additional matching bytes
17501
 // Compare 16 bytes per iteration while at least 16 remain.
17502matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B:
17503 CMPL DI, $0x10
17504 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B
17505 MOVQ (R8)(R11*1), R10
17506 MOVQ 8(R8)(R11*1), R12
17507 XORQ (R9)(R11*1), R10  // non-zero means a difference in the first 8 bytes
17508 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B
17509 XORQ 8(R9)(R11*1), R12  // non-zero means a difference in the second 8 bytes
17510 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B
17511 LEAL -16(DI), DI
17512 LEAL 16(R11), R11
17513 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B
17514
 // Difference in the second 8 bytes: add 8 plus (lowest set bit index / 8).
17515matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B:
17516#ifdef GOAMD64_v3
17517 TZCNTQ R12, R12
17518
17519#else
17520 BSFQ R12, R12
17521
17522#endif
17523 SARQ $0x03, R12  // bit index -> byte index
17524 LEAL 8(R11)(R12*1), R11
17525 JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
17526
17527matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B:
17528 CMPL DI, $0x08
17529 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
17530 MOVQ (R8)(R11*1), R10
17531 XORQ (R9)(R11*1), R10
17532 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B
17533 LEAL -8(DI), DI
17534 LEAL 8(R11), R11
17535 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
17536
 // Difference in the first 8 bytes: add (lowest set bit index / 8).
17537matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B:
17538#ifdef GOAMD64_v3
17539 TZCNTQ R10, R10
17540
17541#else
17542 BSFQ R10, R10
17543
17544#endif
17545 SARQ $0x03, R10  // bit index -> byte index
17546 LEAL (R11)(R10*1), R11
17547 JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
17548
17549matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B:
17550 CMPL DI, $0x04
17551 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
17552 MOVL (R8)(R11*1), R10
17553 CMPL (R9)(R11*1), R10
17554 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
17555 LEAL -4(DI), DI
17556 LEAL 4(R11), R11
17557
17558matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B:
17559 CMPL DI, $0x01
17560 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B  // exactly one byte left
17561 JB match_nolit_end_encodeSnappyBetterBlockAsm8B  // nothing left
17562 MOVW (R8)(R11*1), R10
17563 CMPW (R9)(R11*1), R10
17564 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
17565 LEAL 2(R11), R11
17566 SUBL $0x02, DI
17567 JZ match_nolit_end_encodeSnappyBetterBlockAsm8B
17568
17569matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B:
17570 MOVB (R8)(R11*1), R10
17571 CMPB (R9)(R11*1), R10
17572 JNE match_nolit_end_encodeSnappyBetterBlockAsm8B
17573 LEAL 1(R11), R11
17574
17575match_nolit_end_encodeSnappyBetterBlockAsm8B:
17576 MOVL CX, DI
17577 SUBL BX, DI  // DI = match offset (distance back to the candidate)
17578
17579 // Check if repeat
17580 MOVL DI, 16(SP)  // remember the offset
17581 MOVL 12(SP), BX  // BX = first unemitted offset
17582 CMPL BX, SI
17583 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B  // no pending literals
17584 MOVL SI, R8
17585 MOVL SI, 12(SP)
17586 LEAQ (DX)(BX*1), R9  // R9 = pointer to the pending literal bytes
17587 SUBL BX, R8  // R8 = literal length
17588 LEAL -1(R8), BX  // BX = literal length - 1 (the encoded value)
17589 CMPL BX, $0x3c
17590 JB one_byte_match_emit_encodeSnappyBetterBlockAsm8B
17591 CMPL BX, $0x00000100
17592 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
17593 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm8B  // same condition as the JB above, so never taken; falls through
17594
 // Literal header: tag 0xf4 followed by the 16-bit (length - 1).
17595three_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
17596 MOVB $0xf4, (AX)
17597 MOVW BX, 1(AX)
17598 ADDQ $0x03, AX
17599 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
17600
 // Literal header: tag 0xf0 followed by the 8-bit (length - 1).
17601two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
17602 MOVB $0xf0, (AX)
17603 MOVB BL, 1(AX)
17604 ADDQ $0x02, AX
17605 CMPL BX, $0x40
17606 JB memmove_match_emit_encodeSnappyBetterBlockAsm8B
17607 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
17608
 // Literal header: single byte holding (length - 1) << 2.
17609one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
17610 SHLB $0x02, BL
17611 MOVB BL, (AX)
17612 ADDQ $0x01, AX
17613
17614memmove_match_emit_encodeSnappyBetterBlockAsm8B:
17615 LEAQ (AX)(R8*1), BX  // BX = dst cursor after the literal bytes
17616
17617 // genMemMoveShort
17618 CMPQ R8, $0x08
17619 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
17620 CMPQ R8, $0x10
17621 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
17622 CMPQ R8, $0x20
17623 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
17624 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
17625
 // Length <= 8: always moves a full 8 bytes; the cursor still advances by R8 only.
 // NOTE(review): this reads/writes past the literal when R8 < 8 — presumably covered by the margins checked above; confirm.
17626emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
17627 MOVQ (R9), R10
17628 MOVQ R10, (AX)
17629 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
17630
 // First 8 and last 8 bytes (the two moves may overlap).
17631emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
17632 MOVQ (R9), R10
17633 MOVQ -8(R9)(R8*1), R9
17634 MOVQ R10, (AX)
17635 MOVQ R9, -8(AX)(R8*1)
17636 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
17637
 // First 16 and last 16 bytes (the two moves may overlap).
17638emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
17639 MOVOU (R9), X0
17640 MOVOU -16(R9)(R8*1), X1
17641 MOVOU X0, (AX)
17642 MOVOU X1, -16(AX)(R8*1)
17643 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
17644
 // First 32 and last 32 bytes (the two halves may overlap).
17645emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
17646 MOVOU (R9), X0
17647 MOVOU 16(R9), X1
17648 MOVOU -32(R9)(R8*1), X2
17649 MOVOU -16(R9)(R8*1), X3
17650 MOVOU X0, (AX)
17651 MOVOU X1, 16(AX)
17652 MOVOU X2, -32(AX)(R8*1)
17653 MOVOU X3, -16(AX)(R8*1)
17654
17655memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
17656 MOVQ BX, AX  // advance the dst cursor past the literal
17657 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
17658
17659memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
17660 LEAQ (AX)(R8*1), BX  // BX = dst cursor after the literal bytes
17661
17662 // genMemMoveLong
 // The first and last 32 source bytes are loaded up front and stored last.
17663 MOVOU (R9), X0
17664 MOVOU 16(R9), X1
17665 MOVOU -32(R9)(R8*1), X2
17666 MOVOU -16(R9)(R8*1), X3
17667 MOVQ R8, R12
17668 SHRQ $0x05, R12  // R12 = length / 32
17669 MOVQ AX, R10
17670 ANDL $0x0000001f, R10  // R10 = dst cursor modulo 32
17671 MOVQ $0x00000040, R13
17672 SUBQ R10, R13  // R13 = 64 - (AX & 31), so AX + R13 - 32 is 32-byte aligned
17673 DECQ R12
17674 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32  // NOTE(review): condition depends on flags left by DECQ/SUBQ — confirm intent
17675 LEAQ -32(R9)(R13*1), R10
17676 LEAQ -32(AX)(R13*1), R14
17677
17678emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
17679 MOVOU (R10), X4
17680 MOVOU 16(R10), X5
17681 MOVOA X4, (R14)
17682 MOVOA X5, 16(R14)
17683 ADDQ $0x20, R14
17684 ADDQ $0x20, R10
17685 ADDQ $0x20, R13
17686 DECQ R12
17687 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
17688
 // Copy 32 bytes per iteration with aligned stores while R13 <= length.
17689emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
17690 MOVOU -32(R9)(R13*1), X4
17691 MOVOU -16(R9)(R13*1), X5
17692 MOVOA X4, -32(AX)(R13*1)
17693 MOVOA X5, -16(AX)(R13*1)
17694 ADDQ $0x20, R13
17695 CMPQ R8, R13
17696 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
17697 MOVOU X0, (AX)  // store the saved head and tail blocks
17698 MOVOU X1, 16(AX)
17699 MOVOU X2, -32(AX)(R8*1)
17700 MOVOU X3, -16(AX)(R8*1)
17701 MOVQ BX, AX  // advance the dst cursor past the literal
17702
17703emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
17704 ADDL R11, CX  // CX = end of the match
17705 ADDL $0x04, R11  // R11 = total match length (4 + extension)
17706 MOVL CX, 12(SP)  // everything before CX is now emitted
17707
17708 // emitCopy
 // While more than 64 bytes remain, emit 3-byte copies of length 60 (tag 0xee + 16-bit offset).
17709two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
17710 CMPL R11, $0x40
17711 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
17712 MOVB $0xee, (AX)
17713 MOVW DI, 1(AX)
17714 LEAL -60(R11), R11
17715 ADDQ $0x03, AX
17716 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B
17717
17718two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
17719 MOVL R11, BX
17720 SHLL $0x02, BX  // BX = length * 4
17721 CMPL R11, $0x0c
17722 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B  // length >= 12 uses the 3-byte form
 // 2-byte copy: tag = (length*4 - 15) | ((offset >> 8) << 5), then the low offset byte.
17723 LEAL -15(BX), BX
17724 MOVB DI, 1(AX)
17725 SHRL $0x08, DI
17726 SHLL $0x05, DI
17727 ORL DI, BX
17728 MOVB BL, (AX)
17729 ADDQ $0x02, AX
17730 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B
17731
 // 3-byte copy: tag = length*4 - 2, then the 16-bit offset.
17732emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
17733 LEAL -2(BX), BX
17734 MOVB BL, (AX)
17735 MOVW DI, 1(AX)
17736 ADDQ $0x03, AX
17737
17738match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
17739 CMPL CX, 8(SP)
17740 JAE emit_remainder_encodeSnappyBetterBlockAsm8B  // reached the search limit
17741 CMPQ AX, (SP)
17742 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
17743 MOVQ $0x00000000, ret+48(FP)  // dst cursor reached the limit: return 0
17744 RET
17745
 // Index positions inside the match just emitted (SI = match start, CX = match end).
17746match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
17747 MOVQ $0x0000cf1bbcdcbf9b, BX  // multiplier for the 6-byte hash
17748 MOVQ $0x9e3779b1, DI  // multiplier for the 4-byte hash
17749 LEAQ 1(SI), SI  // SI = match start + 1
17750 LEAQ -2(CX), R8  // R8 = match end - 2
17751 MOVQ (DX)(SI*1), R9
17752 MOVQ 1(DX)(SI*1), R10
17753 MOVQ (DX)(R8*1), R11
17754 MOVQ 1(DX)(R8*1), R12
17755 SHLQ $0x10, R9
17756 IMULQ BX, R9
17757 SHRQ $0x36, R9  // R9 = 6-byte hash at SI
17758 SHLQ $0x20, R10
17759 IMULQ DI, R10
17760 SHRQ $0x38, R10  // R10 = 4-byte hash at SI+1
17761 SHLQ $0x10, R11
17762 IMULQ BX, R11
17763 SHRQ $0x36, R11  // R11 = 6-byte hash at R8
17764 SHLQ $0x20, R12
17765 IMULQ DI, R12
17766 SHRQ $0x38, R12  // R12 = 4-byte hash at R8+1
17767 LEAQ 1(SI), DI
17768 LEAQ 1(R8), R13
17769 MOVL SI, 24(SP)(R9*4)
17770 MOVL R8, 24(SP)(R11*4)
17771 MOVL DI, 4120(SP)(R10*4)
17772 MOVL R13, 4120(SP)(R12*4)
17773 LEAQ 1(R8)(SI*1), DI
17774 SHRQ $0x01, DI  // DI = (SI + R8 + 1) / 2, a second cursor starting mid-range
17775 ADDQ $0x01, SI
17776 SUBQ $0x01, R8
17777
 // Add 6-byte-table entries at SI and DI, stepping both by 2, until DI reaches R8.
17778index_loop_encodeSnappyBetterBlockAsm8B:
17779 CMPQ DI, R8
17780 JAE search_loop_encodeSnappyBetterBlockAsm8B
17781 MOVQ (DX)(SI*1), R9
17782 MOVQ (DX)(DI*1), R10
17783 SHLQ $0x10, R9
17784 IMULQ BX, R9
17785 SHRQ $0x36, R9
17786 SHLQ $0x10, R10
17787 IMULQ BX, R10
17788 SHRQ $0x36, R10
17789 MOVL SI, 24(SP)(R9*4)
17790 MOVL DI, 24(SP)(R10*4)
17791 ADDQ $0x02, SI
17792 ADDQ $0x02, DI
17793 JMP index_loop_encodeSnappyBetterBlockAsm8B
17794
 // Emit everything from the first unemitted offset to the end of src as one literal.
17795emit_remainder_encodeSnappyBetterBlockAsm8B:
17796 MOVQ src_len+32(FP), CX
17797 SUBL 12(SP), CX  // CX = number of trailing literal bytes
17798 LEAQ 3(AX)(CX*1), CX
17799 CMPQ CX, (SP)
17800 JB emit_remainder_ok_encodeSnappyBetterBlockAsm8B
17801 MOVQ $0x00000000, ret+48(FP)  // would reach the dst limit: return 0
17802 RET
17803
17804emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
17805 MOVQ src_len+32(FP), CX
17806 MOVL 12(SP), BX
17807 CMPL BX, CX
17808 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B  // nothing left to emit
17809 MOVL CX, SI
17810 MOVL CX, 12(SP)
17811 LEAQ (DX)(BX*1), CX  // CX = pointer to the trailing literal bytes
17812 SUBL BX, SI  // SI = literal length
17813 LEAL -1(SI), DX  // DX = literal length - 1 (src base is no longer needed)
17814 CMPL DX, $0x3c
17815 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
17816 CMPL DX, $0x00000100
17817 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
17818 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B  // same condition as the JB above, so never taken; falls through
17819
 // Literal header: tag 0xf4 followed by the 16-bit (length - 1).
17820three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
17821 MOVB $0xf4, (AX)
17822 MOVW DX, 1(AX)
17823 ADDQ $0x03, AX
17824 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
17825
 // Literal header: tag 0xf0 followed by the 8-bit (length - 1).
17826two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
17827 MOVB $0xf0, (AX)
17828 MOVB DL, 1(AX)
17829 ADDQ $0x02, AX
17830 CMPL DX, $0x40
17831 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
17832 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
17833
 // Literal header: single byte holding (length - 1) << 2.
17834one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:
17835 SHLB $0x02, DL
17836 MOVB DL, (AX)
17837 ADDQ $0x01, AX
17838
17839memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
17840 LEAQ (AX)(SI*1), DX  // DX = dst cursor after the literal bytes
17841 MOVL SI, BX  // BX = literal length
17842
17843 // genMemMoveShort
 // Unlike the match-emit copy above, this variant dispatches on the length so no byte past it is touched.
17844 CMPQ BX, $0x03
17845 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2
17846 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3
17847 CMPQ BX, $0x08
17848 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7
17849 CMPQ BX, $0x10
17850 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
17851 CMPQ BX, $0x20
17852 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
17853 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
17854
 // First and last byte (the same byte when the length is 1).
17855emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2:
17856 MOVB (CX), SI
17857 MOVB -1(CX)(BX*1), CL
17858 MOVB SI, (AX)
17859 MOVB CL, -1(AX)(BX*1)
17860 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17861
17862emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3:
17863 MOVW (CX), SI
17864 MOVB 2(CX), CL
17865 MOVW SI, (AX)
17866 MOVB CL, 2(AX)
17867 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17868
 // First 4 and last 4 bytes (the two moves may overlap).
17869emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7:
17870 MOVL (CX), SI
17871 MOVL -4(CX)(BX*1), CX
17872 MOVL SI, (AX)
17873 MOVL CX, -4(AX)(BX*1)
17874 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17875
 // First 8 and last 8 bytes (the two moves may overlap).
17876emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
17877 MOVQ (CX), SI
17878 MOVQ -8(CX)(BX*1), CX
17879 MOVQ SI, (AX)
17880 MOVQ CX, -8(AX)(BX*1)
17881 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17882
 // First 16 and last 16 bytes (the two moves may overlap).
17883emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
17884 MOVOU (CX), X0
17885 MOVOU -16(CX)(BX*1), X1
17886 MOVOU X0, (AX)
17887 MOVOU X1, -16(AX)(BX*1)
17888 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17889
 // First 32 and last 32 bytes (the two halves may overlap).
17890emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
17891 MOVOU (CX), X0
17892 MOVOU 16(CX), X1
17893 MOVOU -32(CX)(BX*1), X2
17894 MOVOU -16(CX)(BX*1), X3
17895 MOVOU X0, (AX)
17896 MOVOU X1, 16(AX)
17897 MOVOU X2, -32(AX)(BX*1)
17898 MOVOU X3, -16(AX)(BX*1)
17899
17900memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
17901 MOVQ DX, AX  // advance the dst cursor past the literal
17902 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
17903
17904memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
17905 LEAQ (AX)(SI*1), DX  // DX = dst cursor after the literal bytes
17906 MOVL SI, BX  // BX = literal length
17907
17908 // genMemMoveLong
 // The first and last 32 source bytes are loaded up front and stored last.
17909 MOVOU (CX), X0
17910 MOVOU 16(CX), X1
17911 MOVOU -32(CX)(BX*1), X2
17912 MOVOU -16(CX)(BX*1), X3
17913 MOVQ BX, DI
17914 SHRQ $0x05, DI  // DI = length / 32
17915 MOVQ AX, SI
17916 ANDL $0x0000001f, SI  // SI = dst cursor modulo 32
17917 MOVQ $0x00000040, R8
17918 SUBQ SI, R8  // R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte aligned
17919 DECQ DI
17920 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32  // NOTE(review): condition depends on flags left by DECQ/SUBQ — confirm intent
17921 LEAQ -32(CX)(R8*1), SI
17922 LEAQ -32(AX)(R8*1), R9
17923
17924emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
17925 MOVOU (SI), X4
17926 MOVOU 16(SI), X5
17927 MOVOA X4, (R9)
17928 MOVOA X5, 16(R9)
17929 ADDQ $0x20, R9
17930 ADDQ $0x20, SI
17931 ADDQ $0x20, R8
17932 DECQ DI
17933 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
17934
 // Copy 32 bytes per iteration with aligned stores while R8 <= length.
17935emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
17936 MOVOU -32(CX)(R8*1), X4
17937 MOVOU -16(CX)(R8*1), X5
17938 MOVOA X4, -32(AX)(R8*1)
17939 MOVOA X5, -16(AX)(R8*1)
17940 ADDQ $0x20, R8
17941 CMPQ BX, R8
17942 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
17943 MOVOU X0, (AX)  // store the saved head and tail blocks
17944 MOVOU X1, 16(AX)
17945 MOVOU X2, -32(AX)(BX*1)
17946 MOVOU X3, -16(AX)(BX*1)
17947 MOVQ DX, AX  // advance the dst cursor past the literal
17948
 // Return the number of bytes written: dst cursor minus dst base.
17949emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
17950 MOVQ dst_base+0(FP), CX
17951 SUBQ CX, AX
17952 MOVQ AX, ret+48(FP)
17953 RET
17954
17955// func calcBlockSize(src []byte) int
17956// Requires: BMI, SSE2
17957TEXT ·calcBlockSize(SB), $32792-32
17958 XORQ AX, AX
17959 MOVQ $0x00000100, CX
17960 LEAQ 24(SP), DX
17961 PXOR X0, X0
17962
17963zero_loop_calcBlockSize:
17964 MOVOU X0, (DX)
17965 MOVOU X0, 16(DX)
17966 MOVOU X0, 32(DX)
17967 MOVOU X0, 48(DX)
17968 MOVOU X0, 64(DX)
17969 MOVOU X0, 80(DX)
17970 MOVOU X0, 96(DX)
17971 MOVOU X0, 112(DX)
17972 ADDQ $0x80, DX
17973 DECQ CX
17974 JNZ zero_loop_calcBlockSize
17975 MOVL $0x00000000, 12(SP)
17976 MOVQ src_len+8(FP), CX
17977 LEAQ -9(CX), DX
17978 LEAQ -8(CX), BX
17979 MOVL BX, 8(SP)
17980 SHRQ $0x05, CX
17981 SUBL CX, DX
17982 LEAQ (AX)(DX*1), DX
17983 MOVQ DX, (SP)
17984 MOVL $0x00000001, CX
17985 MOVL CX, 16(SP)
17986 MOVQ src_base+0(FP), DX
17987
17988search_loop_calcBlockSize:
17989 MOVL CX, BX
17990 SUBL 12(SP), BX
17991 SHRL $0x05, BX
17992 LEAL 4(CX)(BX*1), BX
17993 CMPL BX, 8(SP)
17994 JAE emit_remainder_calcBlockSize
17995 MOVQ (DX)(CX*1), SI
17996 MOVL BX, 20(SP)
17997 MOVQ $0x0000cf1bbcdcbf9b, R8
17998 MOVQ SI, R9
17999 MOVQ SI, R10
18000 SHRQ $0x08, R10
18001 SHLQ $0x10, R9
18002 IMULQ R8, R9
18003 SHRQ $0x33, R9
18004 SHLQ $0x10, R10
18005 IMULQ R8, R10
18006 SHRQ $0x33, R10
18007 MOVL 24(SP)(R9*4), BX
18008 MOVL 24(SP)(R10*4), DI
18009 MOVL CX, 24(SP)(R9*4)
18010 LEAL 1(CX), R9
18011 MOVL R9, 24(SP)(R10*4)
18012 MOVQ SI, R9
18013 SHRQ $0x10, R9
18014 SHLQ $0x10, R9
18015 IMULQ R8, R9
18016 SHRQ $0x33, R9
18017 MOVL CX, R8
18018 SUBL 16(SP), R8
18019 MOVL 1(DX)(R8*1), R10
18020 MOVQ SI, R8
18021 SHRQ $0x08, R8
18022 CMPL R8, R10
18023 JNE no_repeat_found_calcBlockSize
18024 LEAL 1(CX), SI
18025 MOVL 12(SP), BX
18026 MOVL SI, DI
18027 SUBL 16(SP), DI
18028 JZ repeat_extend_back_end_calcBlockSize
18029
18030repeat_extend_back_loop_calcBlockSize:
18031 CMPL SI, BX
18032 JBE repeat_extend_back_end_calcBlockSize
18033 MOVB -1(DX)(DI*1), R8
18034 MOVB -1(DX)(SI*1), R9
18035 CMPB R8, R9
18036 JNE repeat_extend_back_end_calcBlockSize
18037 LEAL -1(SI), SI
18038 DECL DI
18039 JNZ repeat_extend_back_loop_calcBlockSize
18040
18041repeat_extend_back_end_calcBlockSize:
18042 MOVL SI, BX
18043 SUBL 12(SP), BX
18044 LEAQ 5(AX)(BX*1), BX
18045 CMPQ BX, (SP)
18046 JB repeat_dst_size_check_calcBlockSize
18047 MOVQ $0x00000000, ret+24(FP)
18048 RET
18049
18050repeat_dst_size_check_calcBlockSize:
18051 MOVL 12(SP), BX
18052 CMPL BX, SI
18053 JEQ emit_literal_done_repeat_emit_calcBlockSize
18054 MOVL SI, DI
18055 MOVL SI, 12(SP)
18056 LEAQ (DX)(BX*1), R8
18057 SUBL BX, DI
18058 LEAL -1(DI), BX
18059 CMPL BX, $0x3c
18060 JB one_byte_repeat_emit_calcBlockSize
18061 CMPL BX, $0x00000100
18062 JB two_bytes_repeat_emit_calcBlockSize
18063 CMPL BX, $0x00010000
18064 JB three_bytes_repeat_emit_calcBlockSize
18065 CMPL BX, $0x01000000
18066 JB four_bytes_repeat_emit_calcBlockSize
18067 ADDQ $0x05, AX
18068 JMP memmove_long_repeat_emit_calcBlockSize
18069
18070four_bytes_repeat_emit_calcBlockSize:
18071 ADDQ $0x04, AX
18072 JMP memmove_long_repeat_emit_calcBlockSize
18073
18074three_bytes_repeat_emit_calcBlockSize:
18075 ADDQ $0x03, AX
18076 JMP memmove_long_repeat_emit_calcBlockSize
18077
18078two_bytes_repeat_emit_calcBlockSize:
18079 ADDQ $0x02, AX
18080 CMPL BX, $0x40
18081 JB memmove_repeat_emit_calcBlockSize
18082 JMP memmove_long_repeat_emit_calcBlockSize
18083
18084one_byte_repeat_emit_calcBlockSize:
18085 ADDQ $0x01, AX
18086
18087memmove_repeat_emit_calcBlockSize:
18088 LEAQ (AX)(DI*1), AX
18089 JMP emit_literal_done_repeat_emit_calcBlockSize
18090
18091memmove_long_repeat_emit_calcBlockSize:
18092 LEAQ (AX)(DI*1), AX
18093
18094emit_literal_done_repeat_emit_calcBlockSize:
18095 ADDL $0x05, CX
18096 MOVL CX, BX
18097 SUBL 16(SP), BX
18098 MOVQ src_len+8(FP), DI
18099 SUBL CX, DI
18100 LEAQ (DX)(CX*1), R8
18101 LEAQ (DX)(BX*1), BX
18102
18103 // matchLen
18104 XORL R10, R10
18105
18106matchlen_loopback_16_repeat_extend_calcBlockSize:
18107 CMPL DI, $0x10
18108 JB matchlen_match8_repeat_extend_calcBlockSize
18109 MOVQ (R8)(R10*1), R9
18110 MOVQ 8(R8)(R10*1), R11
18111 XORQ (BX)(R10*1), R9
18112 JNZ matchlen_bsf_8_repeat_extend_calcBlockSize
18113 XORQ 8(BX)(R10*1), R11
18114 JNZ matchlen_bsf_16repeat_extend_calcBlockSize
18115 LEAL -16(DI), DI
18116 LEAL 16(R10), R10
18117 JMP matchlen_loopback_16_repeat_extend_calcBlockSize
18118
18119matchlen_bsf_16repeat_extend_calcBlockSize:
18120#ifdef GOAMD64_v3
18121 TZCNTQ R11, R11
18122
18123#else
18124 BSFQ R11, R11
18125
18126#endif
18127 SARQ $0x03, R11
18128 LEAL 8(R10)(R11*1), R10
18129 JMP repeat_extend_forward_end_calcBlockSize
18130
18131matchlen_match8_repeat_extend_calcBlockSize:
18132 CMPL DI, $0x08
18133 JB matchlen_match4_repeat_extend_calcBlockSize
18134 MOVQ (R8)(R10*1), R9
18135 XORQ (BX)(R10*1), R9
18136 JNZ matchlen_bsf_8_repeat_extend_calcBlockSize
18137 LEAL -8(DI), DI
18138 LEAL 8(R10), R10
18139 JMP matchlen_match4_repeat_extend_calcBlockSize
18140
18141matchlen_bsf_8_repeat_extend_calcBlockSize:
18142#ifdef GOAMD64_v3
18143 TZCNTQ R9, R9
18144
18145#else
18146 BSFQ R9, R9
18147
18148#endif
18149 SARQ $0x03, R9
18150 LEAL (R10)(R9*1), R10
18151 JMP repeat_extend_forward_end_calcBlockSize
18152
18153matchlen_match4_repeat_extend_calcBlockSize:
18154 CMPL DI, $0x04
18155 JB matchlen_match2_repeat_extend_calcBlockSize
18156 MOVL (R8)(R10*1), R9
18157 CMPL (BX)(R10*1), R9
18158 JNE matchlen_match2_repeat_extend_calcBlockSize
18159 LEAL -4(DI), DI
18160 LEAL 4(R10), R10
18161
18162matchlen_match2_repeat_extend_calcBlockSize:
18163 CMPL DI, $0x01
18164 JE matchlen_match1_repeat_extend_calcBlockSize
18165 JB repeat_extend_forward_end_calcBlockSize
18166 MOVW (R8)(R10*1), R9
18167 CMPW (BX)(R10*1), R9
18168 JNE matchlen_match1_repeat_extend_calcBlockSize
18169 LEAL 2(R10), R10
18170 SUBL $0x02, DI
18171 JZ repeat_extend_forward_end_calcBlockSize
18172
18173matchlen_match1_repeat_extend_calcBlockSize:
18174 MOVB (R8)(R10*1), R9
18175 CMPB (BX)(R10*1), R9
18176 JNE repeat_extend_forward_end_calcBlockSize
18177 LEAL 1(R10), R10
18178
18179repeat_extend_forward_end_calcBlockSize:
18180 ADDL R10, CX
18181 MOVL CX, BX
18182 SUBL SI, BX
18183 MOVL 16(SP), SI
18184
18185 // emitCopy
18186 CMPL SI, $0x00010000
18187 JB two_byte_offset_repeat_as_copy_calcBlockSize
18188
18189four_bytes_loop_back_repeat_as_copy_calcBlockSize:
18190 CMPL BX, $0x40
18191 JBE four_bytes_remain_repeat_as_copy_calcBlockSize
18192 LEAL -64(BX), BX
18193 ADDQ $0x05, AX
18194 CMPL BX, $0x04
18195 JB four_bytes_remain_repeat_as_copy_calcBlockSize
18196 JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize
18197
18198four_bytes_remain_repeat_as_copy_calcBlockSize:
18199 TESTL BX, BX
18200 JZ repeat_end_emit_calcBlockSize
18201 XORL BX, BX
18202 ADDQ $0x05, AX
18203 JMP repeat_end_emit_calcBlockSize
18204
18205two_byte_offset_repeat_as_copy_calcBlockSize:
18206 CMPL BX, $0x40
18207 JBE two_byte_offset_short_repeat_as_copy_calcBlockSize
18208 LEAL -60(BX), BX
18209 ADDQ $0x03, AX
18210 JMP two_byte_offset_repeat_as_copy_calcBlockSize
18211
18212two_byte_offset_short_repeat_as_copy_calcBlockSize:
18213 MOVL BX, DI
18214 SHLL $0x02, DI
18215 CMPL BX, $0x0c
18216 JAE emit_copy_three_repeat_as_copy_calcBlockSize
18217 CMPL SI, $0x00000800
18218 JAE emit_copy_three_repeat_as_copy_calcBlockSize
18219 ADDQ $0x02, AX
18220 JMP repeat_end_emit_calcBlockSize
18221
18222emit_copy_three_repeat_as_copy_calcBlockSize:
18223 ADDQ $0x03, AX
18224
18225repeat_end_emit_calcBlockSize:
18226 MOVL CX, 12(SP)
18227 JMP search_loop_calcBlockSize
18228
18229no_repeat_found_calcBlockSize:
18230 CMPL (DX)(BX*1), SI
18231 JEQ candidate_match_calcBlockSize
18232 SHRQ $0x08, SI
18233 MOVL 24(SP)(R9*4), BX
18234 LEAL 2(CX), R8
18235 CMPL (DX)(DI*1), SI
18236 JEQ candidate2_match_calcBlockSize
18237 MOVL R8, 24(SP)(R9*4)
18238 SHRQ $0x08, SI
18239 CMPL (DX)(BX*1), SI
18240 JEQ candidate3_match_calcBlockSize
18241 MOVL 20(SP), CX
18242 JMP search_loop_calcBlockSize
18243
18244candidate3_match_calcBlockSize:
18245 ADDL $0x02, CX
18246 JMP candidate_match_calcBlockSize
18247
18248candidate2_match_calcBlockSize:
18249 MOVL R8, 24(SP)(R9*4)
18250 INCL CX
18251 MOVL DI, BX
18252
18253candidate_match_calcBlockSize:
18254 MOVL 12(SP), SI
18255 TESTL BX, BX
18256 JZ match_extend_back_end_calcBlockSize
18257
18258match_extend_back_loop_calcBlockSize:
18259 CMPL CX, SI
18260 JBE match_extend_back_end_calcBlockSize
18261 MOVB -1(DX)(BX*1), DI
18262 MOVB -1(DX)(CX*1), R8
18263 CMPB DI, R8
18264 JNE match_extend_back_end_calcBlockSize
18265 LEAL -1(CX), CX
18266 DECL BX
18267 JZ match_extend_back_end_calcBlockSize
18268 JMP match_extend_back_loop_calcBlockSize
18269
18270match_extend_back_end_calcBlockSize:
18271 MOVL CX, SI
18272 SUBL 12(SP), SI
18273 LEAQ 5(AX)(SI*1), SI
18274 CMPQ SI, (SP)
18275 JB match_dst_size_check_calcBlockSize
18276 MOVQ $0x00000000, ret+24(FP)
18277 RET
18278
18279match_dst_size_check_calcBlockSize:
18280 MOVL CX, SI
18281 MOVL 12(SP), DI
18282 CMPL DI, SI
18283 JEQ emit_literal_done_match_emit_calcBlockSize
18284 MOVL SI, R8
18285 MOVL SI, 12(SP)
18286 LEAQ (DX)(DI*1), SI
18287 SUBL DI, R8
18288 LEAL -1(R8), SI
18289 CMPL SI, $0x3c
18290 JB one_byte_match_emit_calcBlockSize
18291 CMPL SI, $0x00000100
18292 JB two_bytes_match_emit_calcBlockSize
18293 CMPL SI, $0x00010000
18294 JB three_bytes_match_emit_calcBlockSize
18295 CMPL SI, $0x01000000
18296 JB four_bytes_match_emit_calcBlockSize
18297 ADDQ $0x05, AX
18298 JMP memmove_long_match_emit_calcBlockSize
18299
18300four_bytes_match_emit_calcBlockSize:
18301 ADDQ $0x04, AX
18302 JMP memmove_long_match_emit_calcBlockSize
18303
18304three_bytes_match_emit_calcBlockSize:
18305 ADDQ $0x03, AX
18306 JMP memmove_long_match_emit_calcBlockSize
18307
18308two_bytes_match_emit_calcBlockSize:
18309 ADDQ $0x02, AX
18310 CMPL SI, $0x40
18311 JB memmove_match_emit_calcBlockSize
18312 JMP memmove_long_match_emit_calcBlockSize
18313
18314one_byte_match_emit_calcBlockSize:
18315 ADDQ $0x01, AX
18316
18317memmove_match_emit_calcBlockSize:
18318 LEAQ (AX)(R8*1), AX
18319 JMP emit_literal_done_match_emit_calcBlockSize
18320
18321memmove_long_match_emit_calcBlockSize:
18322 LEAQ (AX)(R8*1), AX
18323
18324emit_literal_done_match_emit_calcBlockSize:
18325match_nolit_loop_calcBlockSize:
18326 MOVL CX, SI
18327 SUBL BX, SI
18328 MOVL SI, 16(SP)
18329 ADDL $0x04, CX
18330 ADDL $0x04, BX
18331 MOVQ src_len+8(FP), SI
18332 SUBL CX, SI
18333 LEAQ (DX)(CX*1), DI
18334 LEAQ (DX)(BX*1), BX
18335
18336 // matchLen
18337 XORL R9, R9
18338
18339matchlen_loopback_16_match_nolit_calcBlockSize:
18340 CMPL SI, $0x10
18341 JB matchlen_match8_match_nolit_calcBlockSize
18342 MOVQ (DI)(R9*1), R8
18343 MOVQ 8(DI)(R9*1), R10
18344 XORQ (BX)(R9*1), R8
18345 JNZ matchlen_bsf_8_match_nolit_calcBlockSize
18346 XORQ 8(BX)(R9*1), R10
18347 JNZ matchlen_bsf_16match_nolit_calcBlockSize
18348 LEAL -16(SI), SI
18349 LEAL 16(R9), R9
18350 JMP matchlen_loopback_16_match_nolit_calcBlockSize
18351
18352matchlen_bsf_16match_nolit_calcBlockSize:
18353#ifdef GOAMD64_v3
18354 TZCNTQ R10, R10
18355
18356#else
18357 BSFQ R10, R10
18358
18359#endif
18360 SARQ $0x03, R10
18361 LEAL 8(R9)(R10*1), R9
18362 JMP match_nolit_end_calcBlockSize
18363
18364matchlen_match8_match_nolit_calcBlockSize:
18365 CMPL SI, $0x08
18366 JB matchlen_match4_match_nolit_calcBlockSize
18367 MOVQ (DI)(R9*1), R8
18368 XORQ (BX)(R9*1), R8
18369 JNZ matchlen_bsf_8_match_nolit_calcBlockSize
18370 LEAL -8(SI), SI
18371 LEAL 8(R9), R9
18372 JMP matchlen_match4_match_nolit_calcBlockSize
18373
18374matchlen_bsf_8_match_nolit_calcBlockSize:
18375#ifdef GOAMD64_v3
18376 TZCNTQ R8, R8
18377
18378#else
18379 BSFQ R8, R8
18380
18381#endif
18382 SARQ $0x03, R8
18383 LEAL (R9)(R8*1), R9
18384 JMP match_nolit_end_calcBlockSize
18385
18386matchlen_match4_match_nolit_calcBlockSize:
18387 CMPL SI, $0x04
18388 JB matchlen_match2_match_nolit_calcBlockSize
18389 MOVL (DI)(R9*1), R8
18390 CMPL (BX)(R9*1), R8
18391 JNE matchlen_match2_match_nolit_calcBlockSize
18392 LEAL -4(SI), SI
18393 LEAL 4(R9), R9
18394
18395matchlen_match2_match_nolit_calcBlockSize:
18396 CMPL SI, $0x01
18397 JE matchlen_match1_match_nolit_calcBlockSize
18398 JB match_nolit_end_calcBlockSize
18399 MOVW (DI)(R9*1), R8
18400 CMPW (BX)(R9*1), R8
18401 JNE matchlen_match1_match_nolit_calcBlockSize
18402 LEAL 2(R9), R9
18403 SUBL $0x02, SI
18404 JZ match_nolit_end_calcBlockSize
18405
18406matchlen_match1_match_nolit_calcBlockSize:
18407 MOVB (DI)(R9*1), R8
18408 CMPB (BX)(R9*1), R8
18409 JNE match_nolit_end_calcBlockSize
18410 LEAL 1(R9), R9
18411
18412match_nolit_end_calcBlockSize:
18413 ADDL R9, CX
18414 MOVL 16(SP), BX
18415 ADDL $0x04, R9
18416 MOVL CX, 12(SP)
18417
18418 // emitCopy
18419 CMPL BX, $0x00010000
18420 JB two_byte_offset_match_nolit_calcBlockSize
18421
18422four_bytes_loop_back_match_nolit_calcBlockSize:
18423 CMPL R9, $0x40
18424 JBE four_bytes_remain_match_nolit_calcBlockSize
18425 LEAL -64(R9), R9
18426 ADDQ $0x05, AX
18427 CMPL R9, $0x04
18428 JB four_bytes_remain_match_nolit_calcBlockSize
18429 JMP four_bytes_loop_back_match_nolit_calcBlockSize
18430
18431four_bytes_remain_match_nolit_calcBlockSize:
18432 TESTL R9, R9
18433 JZ match_nolit_emitcopy_end_calcBlockSize
18434 XORL BX, BX
18435 ADDQ $0x05, AX
18436 JMP match_nolit_emitcopy_end_calcBlockSize
18437
18438two_byte_offset_match_nolit_calcBlockSize:
18439 CMPL R9, $0x40
18440 JBE two_byte_offset_short_match_nolit_calcBlockSize
18441 LEAL -60(R9), R9
18442 ADDQ $0x03, AX
18443 JMP two_byte_offset_match_nolit_calcBlockSize
18444
18445two_byte_offset_short_match_nolit_calcBlockSize:
18446 MOVL R9, SI
18447 SHLL $0x02, SI
18448 CMPL R9, $0x0c
18449 JAE emit_copy_three_match_nolit_calcBlockSize
18450 CMPL BX, $0x00000800
18451 JAE emit_copy_three_match_nolit_calcBlockSize
18452 ADDQ $0x02, AX
18453 JMP match_nolit_emitcopy_end_calcBlockSize
18454
18455emit_copy_three_match_nolit_calcBlockSize:
18456 ADDQ $0x03, AX
18457
18458match_nolit_emitcopy_end_calcBlockSize:
18459 CMPL CX, 8(SP)
18460 JAE emit_remainder_calcBlockSize
18461 MOVQ -2(DX)(CX*1), SI
18462 CMPQ AX, (SP)
18463 JB match_nolit_dst_ok_calcBlockSize
18464 MOVQ $0x00000000, ret+24(FP)
18465 RET
18466
18467match_nolit_dst_ok_calcBlockSize:
18468 MOVQ $0x0000cf1bbcdcbf9b, R8
18469 MOVQ SI, DI
18470 SHRQ $0x10, SI
18471 MOVQ SI, BX
18472 SHLQ $0x10, DI
18473 IMULQ R8, DI
18474 SHRQ $0x33, DI
18475 SHLQ $0x10, BX
18476 IMULQ R8, BX
18477 SHRQ $0x33, BX
18478 LEAL -2(CX), R8
18479 LEAQ 24(SP)(BX*4), R9
18480 MOVL (R9), BX
18481 MOVL R8, 24(SP)(DI*4)
18482 MOVL CX, (R9)
18483 CMPL (DX)(BX*1), SI
18484 JEQ match_nolit_loop_calcBlockSize
18485 INCL CX
18486 JMP search_loop_calcBlockSize
18487
18488emit_remainder_calcBlockSize:
18489 MOVQ src_len+8(FP), CX
18490 SUBL 12(SP), CX
18491 LEAQ 5(AX)(CX*1), CX
18492 CMPQ CX, (SP)
18493 JB emit_remainder_ok_calcBlockSize
18494 MOVQ $0x00000000, ret+24(FP)
18495 RET
18496
18497emit_remainder_ok_calcBlockSize:
18498 MOVQ src_len+8(FP), CX
18499 MOVL 12(SP), BX
18500 CMPL BX, CX
18501 JEQ emit_literal_done_emit_remainder_calcBlockSize
18502 MOVL CX, SI
18503 MOVL CX, 12(SP)
18504 LEAQ (DX)(BX*1), CX
18505 SUBL BX, SI
18506 LEAL -1(SI), CX
18507 CMPL CX, $0x3c
18508 JB one_byte_emit_remainder_calcBlockSize
18509 CMPL CX, $0x00000100
18510 JB two_bytes_emit_remainder_calcBlockSize
18511 CMPL CX, $0x00010000
18512 JB three_bytes_emit_remainder_calcBlockSize
18513 CMPL CX, $0x01000000
18514 JB four_bytes_emit_remainder_calcBlockSize
18515 ADDQ $0x05, AX
18516 JMP memmove_long_emit_remainder_calcBlockSize
18517
18518four_bytes_emit_remainder_calcBlockSize:
18519 ADDQ $0x04, AX
18520 JMP memmove_long_emit_remainder_calcBlockSize
18521
18522three_bytes_emit_remainder_calcBlockSize:
18523 ADDQ $0x03, AX
18524 JMP memmove_long_emit_remainder_calcBlockSize
18525
18526two_bytes_emit_remainder_calcBlockSize:
18527 ADDQ $0x02, AX
18528 CMPL CX, $0x40
18529 JB memmove_emit_remainder_calcBlockSize
18530 JMP memmove_long_emit_remainder_calcBlockSize
18531
18532one_byte_emit_remainder_calcBlockSize:
18533 ADDQ $0x01, AX
18534
18535memmove_emit_remainder_calcBlockSize:
18536 LEAQ (AX)(SI*1), AX
18537 JMP emit_literal_done_emit_remainder_calcBlockSize
18538
18539memmove_long_emit_remainder_calcBlockSize:
18540 LEAQ (AX)(SI*1), AX
18541
18542emit_literal_done_emit_remainder_calcBlockSize:
18543 MOVQ AX, ret+24(FP)
18544 RET
18545
// func calcBlockSizeSmall(src []byte) int
// Requires: BMI, SSE2
//
// calcBlockSizeSmall runs a hash-table match search over src and adds up the
// number of bytes the literal headers, literal payloads and copy operations
// would occupy. Nothing is written to an output buffer; only the running
// size in AX is maintained.
//
// Result: the accumulated size, or 0 as soon as a projected size reaches the
// limit stored at (SP) (src_len - 9 - src_len>>5).
//
// Stack frame (2072 bytes):
//   0(SP)   8 bytes  size limit compared against AX
//   8(SP)   4 bytes  src_len - 8, upper bound for the search position
//   12(SP)  4 bytes  start of the literals not yet accounted for ("nextEmit")
//   16(SP)  4 bytes  current repeat offset, starts at 1
//   20(SP)  4 bytes  position to continue from when no candidate matches
//   24(SP)  2048 B   hash table: 512 x uint32 positions into src
//
// Long-lived registers: AX = size so far, CX = current position in src,
// DX = src base pointer (after the prologue).
TEXT ·calcBlockSizeSmall(SB), $2072-32
	XORQ AX, AX // size accumulator starts at zero
	MOVQ $0x00000010, CX // 16 iterations x 128 bytes clears the 2048-byte table
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_calcBlockSizeSmall:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_calcBlockSizeSmall
	MOVL $0x00000000, 12(SP) // nextEmit = 0
	MOVQ src_len+8(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP) // search bound = src_len - 8
	SHRQ $0x05, CX
	SUBL CX, DX // DX = src_len - 9 - src_len>>5
	LEAQ (AX)(DX*1), DX // AX is still 0 here, so this leaves DX unchanged
	MOVQ DX, (SP) // size limit
	MOVL $0x00000001, CX // first search position is 1
	MOVL CX, 16(SP) // initial repeat offset is 1
	MOVQ src_base+0(FP), DX

	// One search step: derive the fallback next position, hash the bytes at
	// s, s+1 and s+2, update the table, then test the repeat offset first.
search_loop_calcBlockSizeSmall:
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x04, BX // skip = (s - nextEmit) >> 4
	LEAL 4(CX)(BX*1), BX // next position = s + 4 + skip
	CMPL BX, 8(SP)
	JAE emit_remainder_calcBlockSizeSmall
	MOVQ (DX)(CX*1), SI // SI = 8 bytes at src[s]
	MOVL BX, 20(SP)
	MOVQ $0x9e3779b1, R8 // hash multiplier
	MOVQ SI, R9
	MOVQ SI, R10
	SHRQ $0x08, R10 // bytes starting at s+1
	SHLQ $0x20, R9 // drop the upper 4 bytes so only 4 bytes are hashed
	IMULQ R8, R9
	SHRQ $0x37, R9 // keep the top 9 bits: index 0..511
	SHLQ $0x20, R10
	IMULQ R8, R10
	SHRQ $0x37, R10
	MOVL 24(SP)(R9*4), BX // candidate for s
	MOVL 24(SP)(R10*4), DI // candidate for s+1
	MOVL CX, 24(SP)(R9*4) // table[hash(s)] = s
	LEAL 1(CX), R9
	MOVL R9, 24(SP)(R10*4) // table[hash(s+1)] = s+1
	MOVQ SI, R9
	SHRQ $0x10, R9 // bytes starting at s+2
	SHLQ $0x20, R9
	IMULQ R8, R9
	SHRQ $0x37, R9 // R9 = hash(s+2), consumed in no_repeat_found
	MOVL CX, R8
	SUBL 16(SP), R8
	MOVL 1(DX)(R8*1), R10 // 4 bytes at s + 1 - repeat
	MOVQ SI, R8
	SHRQ $0x08, R8
	CMPL R8, R10 // compare with the 4 bytes at s+1
	JNE no_repeat_found_calcBlockSizeSmall
	LEAL 1(CX), SI // SI = start of the repeat match (s+1)
	MOVL 12(SP), BX
	MOVL SI, DI
	SUBL 16(SP), DI // DI = position the repeat refers back to
	JZ repeat_extend_back_end_calcBlockSizeSmall

	// Extend the repeat match backwards while the preceding bytes agree, the
	// match start stays above nextEmit (BX) and DI has not reached 0.
repeat_extend_back_loop_calcBlockSizeSmall:
	CMPL SI, BX
	JBE repeat_extend_back_end_calcBlockSizeSmall
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE repeat_extend_back_end_calcBlockSizeSmall
	LEAL -1(SI), SI
	DECL DI
	JNZ repeat_extend_back_loop_calcBlockSizeSmall

repeat_extend_back_end_calcBlockSizeSmall:
	MOVL SI, BX
	SUBL 12(SP), BX
	LEAQ 3(AX)(BX*1), BX // size so far + pending literal length + 3
	CMPQ BX, (SP)
	JB repeat_dst_size_check_calcBlockSizeSmall
	MOVQ $0x00000000, ret+24(FP) // limit reached: report 0
	RET

	// Account for the literals between nextEmit and the match start (SI).
repeat_dst_size_check_calcBlockSizeSmall:
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall
	MOVL SI, DI
	MOVL SI, 12(SP) // nextEmit = match start
	LEAQ (DX)(BX*1), R8 // literal pointer; R8 is overwritten before it is read again
	SUBL BX, DI // DI = literal length
	LEAL -1(DI), BX // BX = literal length - 1, selects the header size
	CMPL BX, $0x3c
	JB one_byte_repeat_emit_calcBlockSizeSmall
	CMPL BX, $0x00000100
	JB two_bytes_repeat_emit_calcBlockSizeSmall
	JB three_bytes_repeat_emit_calcBlockSizeSmall // same flags as the JB above, so never taken; execution falls through to the same label

three_bytes_repeat_emit_calcBlockSizeSmall:
	ADDQ $0x03, AX
	JMP memmove_long_repeat_emit_calcBlockSizeSmall

two_bytes_repeat_emit_calcBlockSizeSmall:
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB memmove_repeat_emit_calcBlockSizeSmall
	JMP memmove_long_repeat_emit_calcBlockSizeSmall

one_byte_repeat_emit_calcBlockSizeSmall:
	ADDQ $0x01, AX

	// Both "memmove" labels only add the literal length to the size.
memmove_repeat_emit_calcBlockSizeSmall:
	LEAQ (AX)(DI*1), AX
	JMP emit_literal_done_repeat_emit_calcBlockSizeSmall

memmove_long_repeat_emit_calcBlockSizeSmall:
	LEAQ (AX)(DI*1), AX

emit_literal_done_repeat_emit_calcBlockSizeSmall:
	ADDL $0x05, CX // skip s plus the 4 bytes at s+1 that already compared equal
	MOVL CX, BX
	SUBL 16(SP), BX // BX = position one repeat offset behind CX
	MOVQ src_len+8(FP), DI
	SUBL CX, DI // DI = bytes left in src
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R10 counts equal bytes between (R8) and (BX); DI is the remaining
	// byte budget. Compared in steps of 16, 8, 4, 2 and 1 bytes.
	XORL R10, R10

matchlen_loopback_16_repeat_extend_calcBlockSizeSmall:
	CMPL DI, $0x10
	JB matchlen_match8_repeat_extend_calcBlockSizeSmall
	MOVQ (R8)(R10*1), R9
	MOVQ 8(R8)(R10*1), R11
	XORQ (BX)(R10*1), R9
	JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall
	XORQ 8(BX)(R10*1), R11
	JNZ matchlen_bsf_16repeat_extend_calcBlockSizeSmall
	LEAL -16(DI), DI
	LEAL 16(R10), R10
	JMP matchlen_loopback_16_repeat_extend_calcBlockSizeSmall

	// Mismatch in the second 8 bytes: lowest set bit / 8 = equal byte count.
matchlen_bsf_16repeat_extend_calcBlockSizeSmall:
#ifdef GOAMD64_v3
	TZCNTQ R11, R11

#else
	BSFQ R11, R11

#endif
	SARQ $0x03, R11
	LEAL 8(R10)(R11*1), R10
	JMP repeat_extend_forward_end_calcBlockSizeSmall

matchlen_match8_repeat_extend_calcBlockSizeSmall:
	CMPL DI, $0x08
	JB matchlen_match4_repeat_extend_calcBlockSizeSmall
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	JMP matchlen_match4_repeat_extend_calcBlockSizeSmall

	// Mismatch in the first 8 bytes.
matchlen_bsf_8_repeat_extend_calcBlockSizeSmall:
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP repeat_extend_forward_end_calcBlockSizeSmall

matchlen_match4_repeat_extend_calcBlockSizeSmall:
	CMPL DI, $0x04
	JB matchlen_match2_repeat_extend_calcBlockSizeSmall
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE matchlen_match2_repeat_extend_calcBlockSizeSmall
	LEAL -4(DI), DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_calcBlockSizeSmall:
	CMPL DI, $0x01
	JE matchlen_match1_repeat_extend_calcBlockSizeSmall // exactly one byte left
	JB repeat_extend_forward_end_calcBlockSizeSmall // nothing left
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE matchlen_match1_repeat_extend_calcBlockSizeSmall
	LEAL 2(R10), R10
	SUBL $0x02, DI
	JZ repeat_extend_forward_end_calcBlockSizeSmall

matchlen_match1_repeat_extend_calcBlockSizeSmall:
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE repeat_extend_forward_end_calcBlockSizeSmall
	LEAL 1(R10), R10

repeat_extend_forward_end_calcBlockSizeSmall:
	ADDL R10, CX // advance past the matched bytes
	MOVL CX, BX
	SUBL SI, BX // BX = total match length from the (possibly extended) start
	MOVL 16(SP), SI

	// emitCopy
	// Size only: 3 bytes for every 60 bytes taken off while the length is
	// above 64, then 3 bytes if the rest is >= 12, otherwise 2 bytes.
two_byte_offset_repeat_as_copy_calcBlockSizeSmall:
	CMPL BX, $0x40
	JBE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall

two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall:
	MOVL BX, SI
	SHLL $0x02, SI // SI is not read again before it is reloaded
	CMPL BX, $0x0c
	JAE emit_copy_three_repeat_as_copy_calcBlockSizeSmall
	ADDQ $0x02, AX
	JMP repeat_end_emit_calcBlockSizeSmall

emit_copy_three_repeat_as_copy_calcBlockSizeSmall:
	ADDQ $0x03, AX

repeat_end_emit_calcBlockSizeSmall:
	MOVL CX, 12(SP) // nextEmit = end of the match
	JMP search_loop_calcBlockSizeSmall

	// No repeat: try the table candidates for s (BX), s+1 (DI) and s+2.
no_repeat_found_calcBlockSizeSmall:
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_calcBlockSizeSmall
	SHRQ $0x08, SI // SI now starts at s+1
	MOVL 24(SP)(R9*4), BX // candidate for s+2
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ candidate2_match_calcBlockSizeSmall
	MOVL R8, 24(SP)(R9*4) // table[hash(s+2)] = s+2
	SHRQ $0x08, SI // SI now starts at s+2
	CMPL (DX)(BX*1), SI
	JEQ candidate3_match_calcBlockSizeSmall
	MOVL 20(SP), CX // nothing matched: continue at the precomputed position
	JMP search_loop_calcBlockSizeSmall

candidate3_match_calcBlockSizeSmall:
	ADDL $0x02, CX // the match is at s+2; BX already holds its candidate
	JMP candidate_match_calcBlockSizeSmall

candidate2_match_calcBlockSizeSmall:
	MOVL R8, 24(SP)(R9*4) // table[hash(s+2)] = s+2
	INCL CX // the match is at s+1
	MOVL DI, BX

	// CX = match position, BX = candidate position.
candidate_match_calcBlockSizeSmall:
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_calcBlockSizeSmall

	// Extend backwards while bytes agree, CX stays above nextEmit (SI) and
	// the candidate position has not reached 0.
match_extend_back_loop_calcBlockSizeSmall:
	CMPL CX, SI
	JBE match_extend_back_end_calcBlockSizeSmall
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_calcBlockSizeSmall
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_calcBlockSizeSmall
	JMP match_extend_back_loop_calcBlockSizeSmall

match_extend_back_end_calcBlockSizeSmall:
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI // size so far + pending literal length + 3
	CMPQ SI, (SP)
	JB match_dst_size_check_calcBlockSizeSmall
	MOVQ $0x00000000, ret+24(FP) // limit reached: report 0
	RET

	// Account for the literals between nextEmit and the match start (CX).
match_dst_size_check_calcBlockSizeSmall:
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ emit_literal_done_match_emit_calcBlockSizeSmall
	MOVL SI, R8
	MOVL SI, 12(SP) // nextEmit = match start
	LEAQ (DX)(DI*1), SI // literal pointer; overwritten two instructions below without being read
	SUBL DI, R8 // R8 = literal length
	LEAL -1(R8), SI // SI = literal length - 1, selects the header size
	CMPL SI, $0x3c
	JB one_byte_match_emit_calcBlockSizeSmall
	CMPL SI, $0x00000100
	JB two_bytes_match_emit_calcBlockSizeSmall
	JB three_bytes_match_emit_calcBlockSizeSmall // same flags as the JB above, so never taken; execution falls through to the same label

three_bytes_match_emit_calcBlockSizeSmall:
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_calcBlockSizeSmall

two_bytes_match_emit_calcBlockSizeSmall:
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JB memmove_match_emit_calcBlockSizeSmall
	JMP memmove_long_match_emit_calcBlockSizeSmall

one_byte_match_emit_calcBlockSizeSmall:
	ADDQ $0x01, AX

	// Both "memmove" labels only add the literal length to the size.
memmove_match_emit_calcBlockSizeSmall:
	LEAQ (AX)(R8*1), AX
	JMP emit_literal_done_match_emit_calcBlockSizeSmall

memmove_long_match_emit_calcBlockSizeSmall:
	LEAQ (AX)(R8*1), AX

	// A 4-byte match at CX against candidate BX is established here; record
	// the new offset and measure how far the match continues.
emit_literal_done_match_emit_calcBlockSizeSmall:
match_nolit_loop_calcBlockSizeSmall:
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP) // repeat offset = CX - candidate
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+8(FP), SI
	SUBL CX, SI // SI = bytes left in src
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 counts equal bytes between (DI) and (BX); SI is the remaining
	// byte budget.
	XORL R9, R9

matchlen_loopback_16_match_nolit_calcBlockSizeSmall:
	CMPL SI, $0x10
	JB matchlen_match8_match_nolit_calcBlockSizeSmall
	MOVQ (DI)(R9*1), R8
	MOVQ 8(DI)(R9*1), R10
	XORQ (BX)(R9*1), R8
	JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall
	XORQ 8(BX)(R9*1), R10
	JNZ matchlen_bsf_16match_nolit_calcBlockSizeSmall
	LEAL -16(SI), SI
	LEAL 16(R9), R9
	JMP matchlen_loopback_16_match_nolit_calcBlockSizeSmall

	// Mismatch in the second 8 bytes: lowest set bit / 8 = equal byte count.
matchlen_bsf_16match_nolit_calcBlockSizeSmall:
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL 8(R9)(R10*1), R9
	JMP match_nolit_end_calcBlockSizeSmall

matchlen_match8_match_nolit_calcBlockSizeSmall:
	CMPL SI, $0x08
	JB matchlen_match4_match_nolit_calcBlockSizeSmall
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	JMP matchlen_match4_match_nolit_calcBlockSizeSmall

	// Mismatch in the first 8 bytes.
matchlen_bsf_8_match_nolit_calcBlockSizeSmall:
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP match_nolit_end_calcBlockSizeSmall

matchlen_match4_match_nolit_calcBlockSizeSmall:
	CMPL SI, $0x04
	JB matchlen_match2_match_nolit_calcBlockSizeSmall
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE matchlen_match2_match_nolit_calcBlockSizeSmall
	LEAL -4(SI), SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_calcBlockSizeSmall:
	CMPL SI, $0x01
	JE matchlen_match1_match_nolit_calcBlockSizeSmall // exactly one byte left
	JB match_nolit_end_calcBlockSizeSmall // nothing left
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE matchlen_match1_match_nolit_calcBlockSizeSmall
	LEAL 2(R9), R9
	SUBL $0x02, SI
	JZ match_nolit_end_calcBlockSizeSmall

matchlen_match1_match_nolit_calcBlockSizeSmall:
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE match_nolit_end_calcBlockSizeSmall
	LEAL 1(R9), R9

match_nolit_end_calcBlockSizeSmall:
	ADDL R9, CX // advance past the extra matched bytes
	MOVL 16(SP), BX
	ADDL $0x04, R9 // total length includes the initial 4 bytes
	MOVL CX, 12(SP) // nextEmit = end of the match

	// emitCopy
	// Size only: 3 bytes for every 60 bytes taken off while the length is
	// above 64, then 3 bytes if the rest is >= 12, otherwise 2 bytes.
two_byte_offset_match_nolit_calcBlockSizeSmall:
	CMPL R9, $0x40
	JBE two_byte_offset_short_match_nolit_calcBlockSizeSmall
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP two_byte_offset_match_nolit_calcBlockSizeSmall

two_byte_offset_short_match_nolit_calcBlockSizeSmall:
	MOVL R9, BX
	SHLL $0x02, BX // BX is not read again before it is reloaded
	CMPL R9, $0x0c
	JAE emit_copy_three_match_nolit_calcBlockSizeSmall
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_calcBlockSizeSmall

emit_copy_three_match_nolit_calcBlockSizeSmall:
	ADDQ $0x03, AX

match_nolit_emitcopy_end_calcBlockSizeSmall:
	CMPL CX, 8(SP)
	JAE emit_remainder_calcBlockSizeSmall
	MOVQ -2(DX)(CX*1), SI // 8 bytes starting at CX-2
	CMPQ AX, (SP)
	JB match_nolit_dst_ok_calcBlockSizeSmall
	MOVQ $0x00000000, ret+24(FP) // limit reached: report 0
	RET

	// Index positions CX-2 and CX, then check whether the table entry for
	// CX yields another match right away.
match_nolit_dst_ok_calcBlockSizeSmall:
	MOVQ $0x9e3779b1, R8
	MOVQ SI, DI
	SHRQ $0x10, SI // SI = bytes starting at CX
	MOVQ SI, BX
	SHLQ $0x20, DI
	IMULQ R8, DI
	SHRQ $0x37, DI // DI = hash(CX-2)
	SHLQ $0x20, BX
	IMULQ R8, BX
	SHRQ $0x37, BX // BX = hash(CX)
	LEAL -2(CX), R8
	LEAQ 24(SP)(BX*4), R9
	MOVL (R9), BX // candidate for CX
	MOVL R8, 24(SP)(DI*4) // table[hash(CX-2)] = CX-2
	MOVL CX, (R9) // table[hash(CX)] = CX
	CMPL (DX)(BX*1), SI
	JEQ match_nolit_loop_calcBlockSizeSmall
	INCL CX
	JMP search_loop_calcBlockSizeSmall

	// Everything from nextEmit to the end of src is counted as one literal.
emit_remainder_calcBlockSizeSmall:
	MOVQ src_len+8(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX // size so far + trailing literal length + 3
	CMPQ CX, (SP)
	JB emit_remainder_ok_calcBlockSizeSmall
	MOVQ $0x00000000, ret+24(FP) // limit reached: report 0
	RET

emit_remainder_ok_calcBlockSizeSmall:
	MOVQ src_len+8(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX // literal pointer; overwritten two instructions below without being read
	SUBL BX, SI // SI = literal length
	LEAL -1(SI), CX // CX = literal length - 1, selects the header size
	CMPL CX, $0x3c
	JB one_byte_emit_remainder_calcBlockSizeSmall
	CMPL CX, $0x00000100
	JB two_bytes_emit_remainder_calcBlockSizeSmall
	JB three_bytes_emit_remainder_calcBlockSizeSmall // same flags as the JB above, so never taken; execution falls through to the same label

three_bytes_emit_remainder_calcBlockSizeSmall:
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_calcBlockSizeSmall

two_bytes_emit_remainder_calcBlockSizeSmall:
	ADDQ $0x02, AX
	CMPL CX, $0x40
	JB memmove_emit_remainder_calcBlockSizeSmall
	JMP memmove_long_emit_remainder_calcBlockSizeSmall

one_byte_emit_remainder_calcBlockSizeSmall:
	ADDQ $0x01, AX

	// Both "memmove" labels only add the literal length to the size.
memmove_emit_remainder_calcBlockSizeSmall:
	LEAQ (AX)(SI*1), AX
	JMP emit_literal_done_emit_remainder_calcBlockSizeSmall

memmove_long_emit_remainder_calcBlockSizeSmall:
	LEAQ (AX)(SI*1), AX

emit_literal_done_emit_remainder_calcBlockSizeSmall:
	MOVQ AX, ret+24(FP) // return the accumulated size
	RET
19067
// func emitLiteral(dst []byte, lit []byte) int
// Requires: SSE2
//
// emitLiteral writes a length header followed by the bytes of lit to dst and
// returns the total number of bytes written (header + len(lit)). An empty
// lit writes nothing and returns 0.
//
// Header, with n = len(lit) - 1:
//   n < 60       1 byte : n<<2
//   n < 1<<8     2 bytes: 0xf0, n
//   n < 1<<16    3 bytes: 0xf4, n as 16-bit little-endian
//   n < 1<<24    4 bytes: 0xf8, n as 24-bit little-endian
//   otherwise    5 bytes: 0xfc, n as 32-bit little-endian
//
// Registers: AX = dst write pointer, CX = lit read pointer, DX = len(lit),
// BX = return value, SI = n / scratch.
// No bounds checks are performed on dst here.
TEXT ·emitLiteral(SB), NOSPLIT, $0-56
	MOVQ lit_len+32(FP), DX
	MOVQ dst_base+0(FP), AX
	MOVQ lit_base+24(FP), CX
	TESTQ DX, DX
	JZ emit_literal_end_standalone_skip
	MOVL DX, BX // result starts at len(lit); the header size is added below
	LEAL -1(DX), SI // SI = n = len(lit) - 1
	CMPL SI, $0x3c
	JB one_byte_standalone
	CMPL SI, $0x00000100
	JB two_bytes_standalone
	CMPL SI, $0x00010000
	JB three_bytes_standalone
	CMPL SI, $0x01000000
	JB four_bytes_standalone
	MOVB $0xfc, (AX) // 5-byte header
	MOVL SI, 1(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP memmove_long_standalone

four_bytes_standalone:
	MOVL SI, DI
	SHRL $0x10, DI // DI = bits 16..23 of n
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB DI, 3(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP memmove_long_standalone

three_bytes_standalone:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP memmove_long_standalone

two_bytes_standalone:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	CMPL SI, $0x40 // n < 64, i.e. at most 64 bytes: the short copy suffices
	JB memmove_standalone
	JMP memmove_long_standalone

one_byte_standalone:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, BX
	ADDQ $0x01, AX

	// Copy of 1..64 bytes. Every case loads a head and a tail piece (which
	// may overlap) before storing either of them.
memmove_standalone:
	// genMemMoveShort
	CMPQ DX, $0x03
	JB emit_lit_memmove_standalone_memmove_move_1or2
	JE emit_lit_memmove_standalone_memmove_move_3
	CMPQ DX, $0x08
	JB emit_lit_memmove_standalone_memmove_move_4through7
	CMPQ DX, $0x10
	JBE emit_lit_memmove_standalone_memmove_move_8through16
	CMPQ DX, $0x20
	JBE emit_lit_memmove_standalone_memmove_move_17through32
	JMP emit_lit_memmove_standalone_memmove_move_33through64

emit_lit_memmove_standalone_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(DX*1), CL // last byte; overwrites the low byte of the source pointer, which is no longer needed
	MOVB SI, (AX)
	MOVB CL, -1(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(DX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(DX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(DX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP emit_literal_end_standalone
	JMP emit_literal_end_standalone // unreachable duplicate of the jump above

	// Long copy. The first and last 32 bytes are loaded up front and stored
	// at the very end; the middle is copied in 32-byte steps with stores to
	// 32-byte-aligned destination addresses.
memmove_long_standalone:
	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVQ DX, DI
	SHRQ $0x05, DI // DI = len / 32
	MOVQ AX, SI
	ANDL $0x0000001f, SI // SI = dst & 31
	MOVQ $0x00000040, R8
	SUBQ SI, R8 // R8 = 64 - (dst & 31), so dst + R8 - 32 is 32-byte aligned
	DECQ DI
	JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 // DECQ leaves CF untouched: CF comes from the SUBQ above, ZF from DECQ
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_standalonelarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_standalonelarge_big_loop_back // CF comes from the ADDQ above, ZF from DECQ

emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ DX, R8
	JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 // continue while len >= R8
	MOVOU X0, (AX) // finally store the saved head ...
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1) // ... and the saved tail
	MOVOU X3, -16(AX)(DX*1)
	JMP emit_literal_end_standalone
	JMP emit_literal_end_standalone // unreachable duplicate of the jump above

emit_literal_end_standalone_skip:
	XORQ BX, BX // empty literal: return 0

emit_literal_end_standalone:
	MOVQ BX, ret+48(FP)
	RET
19233
// func emitRepeat(dst []byte, offset int, length int) int
//
// emitRepeat writes the encoding of a repeat of `length` bytes to dst and
// returns the number of bytes written. No bounds checks are performed on dst.
//
// Forms, with v = length - 4:
//   length <= 8                      2 bytes: 16-bit LE word (v<<2)|1
//   8 < length < 12, offset < 2048   2 bytes: (v<<2)|1|((offset>>8)<<5), offset&0xff
//   v < 0x104                        3 bytes: 0x15 0x00, v-4
//   v < 0x10100                      4 bytes: 0x19 0x00, (v-256) as 16-bit LE
//   v < 0x100ffff                    5 bytes: 0x1d 0x00, (v-65536) as 24-bit LE
//   otherwise                        5 bytes: 0x1d 0x00 0xfb 0xff 0xff, then the
//                                    loop restarts with length = v - 0x100fffb
//
// Registers: DI = write cursor, R8 = bytes written, CX = offset,
// DX = length / v, R9 = scratch.
TEXT ·emitRepeat(SB), NOSPLIT, $0-48
	MOVQ length+32(FP), DX
	MOVQ offset+24(FP), CX
	MOVQ dst_base+0(FP), DI
	XORQ R8, R8

emit_repeat_next:
	MOVL DX, R9 // R9 = length as passed in for this round
	LEAL -4(DX), DX // DX = v
	CMPL R9, $0x08
	JBE emit_repeat_short
	CMPL R9, $0x0c
	JAE emit_repeat_wide
	CMPL CX, $0x00000800
	JB emit_repeat_short_with_offset

emit_repeat_wide:
	CMPL DX, $0x00000104
	JB emit_repeat_3bytes
	CMPL DX, $0x00010100
	JB emit_repeat_4bytes
	CMPL DX, $0x0100ffff
	JB emit_repeat_5bytes

	// Too long for a single op: emit the maximal 5-byte form and go again.
	MOVL $0xfffb001d, (DI)
	MOVB $0xff, 4(DI)
	LEAL -16842747(DX), DX
	ADDQ $0x05, R8
	ADDQ $0x05, DI
	JMP emit_repeat_next

emit_repeat_5bytes:
	LEAL -65536(DX), DX
	MOVW $0x001d, (DI)
	MOVW DX, 2(DI) // low 16 bits
	MOVL DX, R9
	SARL $0x10, R9
	MOVB R9, 4(DI) // bits 16..23
	ADDQ $0x05, DI
	ADDQ $0x05, R8
	JMP emit_repeat_done

emit_repeat_4bytes:
	LEAL -256(DX), DX
	MOVW $0x0019, (DI)
	MOVW DX, 2(DI)
	ADDQ $0x04, DI
	ADDQ $0x04, R8
	JMP emit_repeat_done

emit_repeat_3bytes:
	LEAL -4(DX), DX
	MOVW $0x0015, (DI)
	MOVB DL, 2(DI)
	ADDQ $0x03, DI
	ADDQ $0x03, R8
	JMP emit_repeat_done

emit_repeat_short:
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (DI)
	ADDQ $0x02, DI
	ADDQ $0x02, R8
	JMP emit_repeat_done

emit_repeat_short_with_offset:
	MOVB CL, 1(DI) // low 8 bits of the offset
	SARL $0x08, CX
	SHLL $0x05, CX // remaining offset bits go to bit 5 and up of the tag
	SHLL $0x02, DX
	ORL $0x01, DX // (v<<2)|1
	ORL CX, DX
	MOVB DL, (DI)
	ADDQ $0x02, DI
	ADDQ $0x02, R8

emit_repeat_done:
	MOVQ R8, ret+40(FP)
	RET
19315
// func emitCopy(dst []byte, offset int, length int) int
//
// Writes the encoding of a copy of `length` bytes at distance `offset` to dst
// and returns the number of bytes written. After the first copy op, any length
// that is left over is encoded with the inlined emitRepeat forms.
//
// Register roles:
//   AX = write cursor into dst      BX = bytes written so far (the result)
//   CX = offset                     DX = length still to encode
//   SI, DI = scratch
//
// dst_len is never read: nothing here checks the writes against len(dst).
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	// Offsets below 0x10000 use the two-byte-offset forms further down.
	CMPL CX, $0x00010000
	JB two_byte_offset_standalone

	// 32-bit offset: a length of at most 64 is finished by a single 5-byte op.
	CMPL DX, $0x40
	JBE four_bytes_remain_standalone

	// Tag 0xff followed by the 4-byte offset; accounts for 64 bytes of length.
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX

	// emitRepeat below starts by subtracting 4 from the length, so remainders
	// shorter than 4 are finished by another 5-byte op instead.
	CMPL DX, $0x04
	JB four_bytes_remain_standalone

	// emitRepeat
emit_repeat_again_standalone_emit_copy:
	// SI = remaining length, DX = remaining length - 4.
	MOVL DX, SI
	LEAL -4(DX), DX

	// length <= 8: plain 2-byte form.
	CMPL SI, $0x08
	JBE repeat_two_standalone_emit_copy

	// length < 12 and offset < 2048: 2-byte form that also carries the offset.
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_standalone_emit_copy
	CMPL CX, $0x00000800
	JB repeat_two_offset_standalone_emit_copy

cant_repeat_two_offset_standalone_emit_copy:
	// Pick the 3-, 4- or 5-byte form from the size of DX (= length - 4).
	CMPL DX, $0x00000104
	JB repeat_three_standalone_emit_copy
	CMPL DX, $0x00010100
	JB repeat_four_standalone_emit_copy
	CMPL DX, $0x0100ffff
	JB repeat_five_standalone_emit_copy

	// Too long for one op: write the bytes 1d 00 fb ff ff, take 16842747
	// (0x0100fffb) off DX and run emitRepeat again on what is left.
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone_emit_copy

repeat_five_standalone_emit_copy:
	// 5 bytes: word 0x001d, then the low 24 bits of (DX - 65536).
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

repeat_four_standalone_emit_copy:
	// 4 bytes: word 0x0019, then the 16-bit value (DX - 256).
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_standalone_emit_copy:
	// 3 bytes: word 0x0015, then the single byte (DX - 4).
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

repeat_two_standalone_emit_copy:
	// 2 bytes: the 16-bit word (DX << 2) | 1.
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_two_offset_standalone_emit_copy:
	// 2 bytes: byte 0 = 1 + DX*4, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset. SI is zeroed only to serve as LEAL base.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

four_bytes_remain_standalone:
	// Nothing left: return what has been written so far.
	TESTL DX, DX
	JZ gen_emit_copy_end

	// Final 5-byte op: tag byte = length*4 - 1, then the 32-bit offset.
	XORL SI, SI
	LEAL -1(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVL CX, 1(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

two_byte_offset_standalone:
	// Offset fits in 16 bits. Lengths up to 64 are a single op.
	CMPL DX, $0x40
	JBE two_byte_offset_short_standalone

	// Longer copy with an offset of 2048 or more: 3-byte op first.
	CMPL CX, $0x00000800
	JAE long_offset_short_standalone

	// Longer copy with offset < 2048: emit a 2-byte op whose tag is
	// 0x11 | (offset >> 8) << 5 with the offset's low byte after it, charge 8
	// bytes of length to it, and encode the rest through emitRepeat.
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB CL, 1(AX)
	MOVL CX, DI
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	SUBL $0x08, DX

	// emitRepeat
	// Enters past the two short forms: only the "length - 4" adjustment is
	// applied before jumping to the 3/4/5-byte selection.
	LEAL -4(DX), DX
	JMP cant_repeat_two_offset_standalone_emit_copy_short_2b

emit_repeat_again_standalone_emit_copy_short_2b:
	// Reached only from the "too long for one op" loop below.
	// SI = remaining length, DX = remaining length - 4.
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE repeat_two_standalone_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_standalone_emit_copy_short_2b
	CMPL CX, $0x00000800
	JB repeat_two_offset_standalone_emit_copy_short_2b

cant_repeat_two_offset_standalone_emit_copy_short_2b:
	// Pick the 3-, 4- or 5-byte form from the size of DX.
	CMPL DX, $0x00000104
	JB repeat_three_standalone_emit_copy_short_2b
	CMPL DX, $0x00010100
	JB repeat_four_standalone_emit_copy_short_2b
	CMPL DX, $0x0100ffff
	JB repeat_five_standalone_emit_copy_short_2b

	// Too long for one op: bytes 1d 00 fb ff ff, DX -= 16842747, retry.
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone_emit_copy_short_2b

repeat_five_standalone_emit_copy_short_2b:
	// 5 bytes: word 0x001d, then the low 24 bits of (DX - 65536).
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

repeat_four_standalone_emit_copy_short_2b:
	// 4 bytes: word 0x0019, then the 16-bit value (DX - 256).
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_standalone_emit_copy_short_2b:
	// 3 bytes: word 0x0015, then the single byte (DX - 4).
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

repeat_two_standalone_emit_copy_short_2b:
	// 2 bytes: the 16-bit word (DX << 2) | 1.
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short_2b:
	// 2 bytes: byte 0 = 1 + DX*4, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

long_offset_short_standalone:
	// 3-byte op: tag 0xee followed by the 16-bit offset; accounts for 60
	// bytes of length. The rest goes through emitRepeat.
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX

	// emitRepeat
emit_repeat_again_standalone_emit_copy_short:
	// SI = remaining length, DX = remaining length - 4.
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE repeat_two_standalone_emit_copy_short
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_standalone_emit_copy_short
	CMPL CX, $0x00000800
	JB repeat_two_offset_standalone_emit_copy_short

cant_repeat_two_offset_standalone_emit_copy_short:
	// Pick the 3-, 4- or 5-byte form from the size of DX.
	CMPL DX, $0x00000104
	JB repeat_three_standalone_emit_copy_short
	CMPL DX, $0x00010100
	JB repeat_four_standalone_emit_copy_short
	CMPL DX, $0x0100ffff
	JB repeat_five_standalone_emit_copy_short

	// Too long for one op: bytes 1d 00 fb ff ff, DX -= 16842747, retry.
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone_emit_copy_short

repeat_five_standalone_emit_copy_short:
	// 5 bytes: word 0x001d, then the low 24 bits of (DX - 65536).
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

repeat_four_standalone_emit_copy_short:
	// 4 bytes: word 0x0019, then the 16-bit value (DX - 256).
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_standalone_emit_copy_short:
	// 3 bytes: word 0x0015, then the single byte (DX - 4).
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

repeat_two_standalone_emit_copy_short:
	// 2 bytes: the 16-bit word (DX << 2) | 1.
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short:
	// 2 bytes: byte 0 = 1 + DX*4, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

two_byte_offset_short_standalone:
	// Single op for length <= 64. SI = length * 4 is the base of the tag.
	MOVL DX, SI
	SHLL $0x02, SI

	// The 2-byte op is used only when length < 12 and offset < 2048.
	CMPL DX, $0x0c
	JAE emit_copy_three_standalone
	CMPL CX, $0x00000800
	JAE emit_copy_three_standalone

	// 2-byte op: tag = length*4 - 15, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset.
	LEAL -15(SI), SI
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL CX, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

emit_copy_three_standalone:
	// 3-byte op: tag = length*4 - 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end:
	// Return the number of bytes written.
	MOVQ BX, ret+40(FP)
	RET
19617
// func emitCopyNoRepeat(dst []byte, offset int, length int) int
//
// Encodes a copy of `length` bytes at distance `offset` into dst using copy
// ops only (no repeat ops) and returns the number of bytes written.
//
//   AX = write cursor into dst      BX = bytes written so far (the result)
//   CX = offset                     DX = length still to encode
//   SI = scratch for building the tag byte
//
// dst_len is never read: nothing here checks the writes against len(dst).
TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX
	XORQ BX, BX

	// emitCopy
	// Offsets below 0x10000 use the 16-bit-offset ops.
	CMPL CX, $0x00010000
	JB copy2_chunk_snappy

copy4_chunk_snappy:
	// 32-bit offset: peel off 64 bytes per 5-byte op while more than 64 remain.
	CMPL DX, $0x40
	JBE copy4_tail_snappy
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	LEAL -64(DX), DX

	// Keep looping while at least 4 bytes remain; otherwise fall into the tail.
	CMPL DX, $0x04
	JAE copy4_chunk_snappy

copy4_tail_snappy:
	// Zero remaining bytes means nothing more to write.
	TESTL DX, DX
	JZ copy_done_snappy

	// Last 5-byte op: tag byte = length*4 - 1, then the 32-bit offset.
	XORL SI, SI
	LEAL -1(SI)(DX*4), DX
	MOVL CX, 1(AX)
	MOVB DL, (AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP copy_done_snappy

copy2_chunk_snappy:
	// 16-bit offset: peel off 60 bytes per 3-byte op (tag 0xee) while more
	// than 64 remain.
	CMPL DX, $0x40
	JBE copy2_last_snappy
	MOVW CX, 1(AX)
	MOVB $0xee, (AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	LEAL -60(DX), DX
	JMP copy2_chunk_snappy

copy2_last_snappy:
	// SI = length * 4; both final forms derive their tag from it.
	MOVL DX, SI
	SHLL $0x02, SI

	// The 2-byte op needs length < 12 and offset < 2048.
	CMPL DX, $0x0c
	JAE copy2_three_snappy
	CMPL CX, $0x00000800
	JAE copy2_three_snappy

	// 2-byte op: tag = length*4 - 15, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset.
	MOVB CL, 1(AX)
	LEAL -15(SI), SI
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL CX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	ADDQ $0x02, BX
	JMP copy_done_snappy

copy2_three_snappy:
	// 3-byte op: tag = length*4 - 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVW CX, 1(AX)
	MOVB SI, (AX)
	ADDQ $0x03, AX
	ADDQ $0x03, BX

copy_done_snappy:
	MOVQ BX, ret+40(FP)
	RET
19689
// func matchLen(a []byte, b []byte) int
// Requires: BMI
//
// Returns how many leading bytes of a and b are equal, looking at no more
// than len(a) bytes. Only a's length is loaded; b's length is never read.
//
//   AX = &a[0]    CX = &b[0]    DX = bytes of a not yet compared
//   SI = equal bytes found so far (the result)    BX, DI = scratch
TEXT ·matchLen(SB), NOSPLIT, $0-56
	MOVQ a_len+8(FP), DX
	MOVQ b_base+24(FP), CX
	MOVQ a_base+0(FP), AX

	// matchLen
	XORL SI, SI

matchlen_wide_loop:
	// 16 bytes per round, as two 64-bit XOR compares.
	CMPL DX, $0x10
	JB matchlen_tail8
	MOVQ (AX)(SI*1), BX
	XORQ (CX)(SI*1), BX
	JNZ matchlen_diff_low
	MOVQ 8(AX)(SI*1), DI
	XORQ 8(CX)(SI*1), DI
	JNZ matchlen_diff_high
	LEAL 16(SI), SI
	LEAL -16(DX), DX
	JMP matchlen_wide_loop

matchlen_diff_high:
	// Difference is in the second word: (index of lowest set bit) / 8 gives
	// the equal bytes inside it, on top of the 8 equal bytes before it.
#ifdef GOAMD64_v3
	TZCNTQ DI, DI

#else
	BSFQ DI, DI

#endif
	SARQ $0x03, DI
	LEAL 8(SI)(DI*1), SI
	JMP matchlen_done

matchlen_tail8:
	// Fewer than 16 bytes left: try one 8-byte compare.
	CMPL DX, $0x08
	JB matchlen_tail4
	MOVQ (AX)(SI*1), BX
	XORQ (CX)(SI*1), BX
	JNZ matchlen_diff_low
	LEAL 8(SI), SI
	LEAL -8(DX), DX
	JMP matchlen_tail4

matchlen_diff_low:
	// Difference is in the word at SI: add (index of lowest set bit) / 8.
#ifdef GOAMD64_v3
	TZCNTQ BX, BX

#else
	BSFQ BX, BX

#endif
	SARQ $0x03, BX
	LEAL (SI)(BX*1), SI
	JMP matchlen_done

matchlen_tail4:
	// A 4-byte mismatch is narrowed down by the 2- and 1-byte steps below.
	CMPL DX, $0x04
	JB matchlen_tail2
	MOVL (AX)(SI*1), BX
	CMPL (CX)(SI*1), BX
	JNE matchlen_tail2
	LEAL 4(SI), SI
	LEAL -4(DX), DX

matchlen_tail2:
	// 0 bytes left: finished. Exactly 1 left: go straight to the byte compare.
	CMPL DX, $0x01
	JB matchlen_done
	JE matchlen_tail1
	MOVW (AX)(SI*1), BX
	CMPW (CX)(SI*1), BX
	JNE matchlen_tail1

	// LEAL leaves the flags from SUBL intact for the JZ.
	SUBL $0x02, DX
	LEAL 2(SI), SI
	JZ matchlen_done

matchlen_tail1:
	MOVB (AX)(SI*1), BL
	CMPB (CX)(SI*1), BL
	JNE matchlen_done
	LEAL 1(SI), SI

matchlen_done:
	MOVQ SI, ret+48(FP)
	RET
19776
// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// Reads the LZ4 block in src one sequence at a time (token, literals, 16-bit
// match offset, match length) and re-emits it into dst as literal, copy and
// repeat ops, using the same tag layouts as emitCopy/emitRepeat in this file.
//
// Results:
//   success:        uncompressed = total literal + match bytes described,
//                   dstUsed      = number of bytes written to dst
//   corrupt input:  uncompressed = -1, dstUsed = 0
//   dst too small:  uncompressed = -2, dstUsed = 0
//
// Register roles:
//   AX  = write cursor into dst      CX  = dst_base + dst_len - 10 (write limit)
//   DX  = read cursor into src       BX  = src_base + src_len (end of input)
//   SI  = uncompressed byte count    DI  = offset of the previous match
//   R9  = literal length, later the match offset
//   R10 = match length (token low nibble + 4, plus extension bytes)
//   R8, R11-R15, X0-X5 = scratch
TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -10(AX)(CX*1), CX
	XORQ DI, DI

lz4_s2_loop:
	// Each sequence needs at least one input byte and room below the limit.
	CMPQ DX, BX
	JAE lz4_s2_corrupt
	CMPQ AX, CX
	JAE lz4_s2_dstfull

	// Token: high nibble -> literal length (R9), low nibble -> match length (R10).
	MOVBQZX (DX), R8
	MOVQ R8, R9
	MOVQ R8, R10
	SHRQ $0x04, R9
	ANDQ $0x0f, R10

	// A literal nibble of 15 (token >= 0xf0) is followed by extension bytes.
	CMPQ R8, $0xf0
	JB lz4_s2_ll_end

lz4_s2_ll_loop:
	// Add extension bytes to the literal length until one is not 0xff.
	INCQ DX
	CMPQ DX, BX
	JAE lz4_s2_corrupt
	MOVBQZX (DX), R8
	ADDQ R8, R9
	CMPQ R8, $0xff
	JEQ lz4_s2_ll_loop

lz4_s2_ll_end:
	// DX still points at the last length byte, so R8 = DX + R9 is the address
	// of the last literal byte; it must lie inside src.
	LEAQ (DX)(R9*1), R8
	ADDQ $0x04, R10
	CMPQ R8, BX
	JAE lz4_s2_corrupt

	// DX -> first literal byte, R8 -> first byte after the literals.
	INCQ DX
	INCQ R8
	TESTQ R9, R9
	JZ lz4_s2_lits_done

	// The literal bytes must end below the dst limit.
	LEAQ (AX)(R9*1), R11
	CMPQ R11, CX
	JAE lz4_s2_dstfull
	ADDQ R9, SI

	// Literal header, sized by R11 = literal length - 1.
	LEAL -1(R9), R11
	CMPL R11, $0x3c
	JB one_byte_lz4_s2
	CMPL R11, $0x00000100
	JB two_bytes_lz4_s2
	CMPL R11, $0x00010000
	JB three_bytes_lz4_s2
	CMPL R11, $0x01000000
	JB four_bytes_lz4_s2

	// 5-byte header: tag 0xfc + 32-bit (length - 1).
	MOVB $0xfc, (AX)
	MOVL R11, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_lz4_s2

four_bytes_lz4_s2:
	// 4-byte header: tag 0xf8 + 24-bit (length - 1).
	MOVL R11, R12
	SHRL $0x10, R12
	MOVB $0xf8, (AX)
	MOVW R11, 1(AX)
	MOVB R12, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_lz4_s2

three_bytes_lz4_s2:
	// 3-byte header: tag 0xf4 + 16-bit (length - 1).
	MOVB $0xf4, (AX)
	MOVW R11, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_lz4_s2

two_bytes_lz4_s2:
	// 2-byte header: tag 0xf0 + 8-bit (length - 1).
	MOVB $0xf0, (AX)
	MOVB R11, 1(AX)
	ADDQ $0x02, AX

	// Up to 64 literal bytes still fit the short copy routine.
	CMPL R11, $0x40
	JB memmove_lz4_s2
	JMP memmove_long_lz4_s2

one_byte_lz4_s2:
	// 1-byte header: (length - 1) << 2.
	SHLB $0x02, R11
	MOVB R11, (AX)
	ADDQ $0x01, AX

memmove_lz4_s2:
	// R11 = dst cursor after the literals.
	LEAQ (AX)(R9*1), R11

	// genMemMoveShort
	// Copies R9 (<= 64) bytes using possibly overlapping head/tail moves.
	CMPQ R9, $0x08
	JBE emit_lit_memmove_lz4_s2_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_lz4_s2_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_lz4_s2_memmove_move_17through32
	JMP emit_lit_memmove_lz4_s2_memmove_move_33through64

emit_lit_memmove_lz4_s2_memmove_move_8:
	// Lengths up to 8: one 8-byte load and store, whatever the exact length.
	MOVQ (DX), R12
	MOVQ R12, (AX)
	JMP memmove_end_copy_lz4_s2

emit_lit_memmove_lz4_s2_memmove_move_8through16:
	// First 8 and last 8 bytes. DX is overwritten here; it is reloaded from R8
	// at lz4_s2_lits_emit_done.
	MOVQ (DX), R12
	MOVQ -8(DX)(R9*1), DX
	MOVQ R12, (AX)
	MOVQ DX, -8(AX)(R9*1)
	JMP memmove_end_copy_lz4_s2

emit_lit_memmove_lz4_s2_memmove_move_17through32:
	// First 16 and last 16 bytes.
	MOVOU (DX), X0
	MOVOU -16(DX)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP memmove_end_copy_lz4_s2

emit_lit_memmove_lz4_s2_memmove_move_33through64:
	// First 32 and last 32 bytes.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_lz4_s2:
	MOVQ R11, AX
	JMP lz4_s2_lits_emit_done

memmove_long_lz4_s2:
	// R11 = dst cursor after the literals.
	LEAQ (AX)(R9*1), R11

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front and stored last with
	// unaligned moves; the middle is copied in 32-byte chunks.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3

	// R13 = length / 32. R14 = 64 - (AX & 31), so AX + R14 - 32 is 32-byte
	// aligned and the MOVOA stores below hit aligned addresses.
	MOVQ R9, R13
	SHRQ $0x05, R13
	MOVQ AX, R12
	ANDL $0x0000001f, R12
	MOVQ $0x00000040, R14
	SUBQ R12, R14

	// NOTE(review): DECQ does not modify CF, so JA/JNA here combine ZF from the
	// DECQ with the carry left by the preceding SUBQ/ADDQ; kept as generated.
	DECQ R13
	JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
	LEAQ -32(DX)(R14*1), R12
	LEAQ -32(AX)(R14*1), R15

emit_lit_memmove_long_lz4_s2large_big_loop_back:
	MOVOU (R12), X4
	MOVOU 16(R12), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ $0x20, R15
	ADDQ $0x20, R12
	ADDQ $0x20, R14
	DECQ R13
	JNA emit_lit_memmove_long_lz4_s2large_big_loop_back

emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R14 while R14 has not passed the length.
	MOVOU -32(DX)(R14*1), X4
	MOVOU -16(DX)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ $0x20, R14
	CMPQ R9, R14
	JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32

	// Finish with the preloaded head and tail.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ R11, AX

lz4_s2_lits_emit_done:
	// Continue reading right after the literals.
	MOVQ R8, DX

lz4_s2_lits_done:
	// Input exhausted after the literals: valid only when the token's match
	// nibble was zero (R10 == 4).
	CMPQ DX, BX
	JNE lz4_s2_match
	CMPQ R10, $0x04
	JEQ lz4_s2_done
	JMP lz4_s2_corrupt

lz4_s2_match:
	// 16-bit match offset; at least one more byte must follow it.
	LEAQ 2(DX), R8
	CMPQ R8, BX
	JAE lz4_s2_corrupt
	MOVWQZX (DX), R9
	MOVQ R8, DX

	// Reject a zero offset and an offset beyond the bytes produced so far.
	TESTQ R9, R9
	JZ lz4_s2_corrupt
	CMPQ R9, SI
	JA lz4_s2_corrupt

	// A match nibble of 15 (R10 == 15 + 4) is followed by extension bytes.
	CMPQ R10, $0x13
	JNE lz4_s2_ml_done

lz4_s2_ml_loop:
	// Add extension bytes to the match length until one is not 0xff.
	MOVBQZX (DX), R8
	INCQ DX
	ADDQ R8, R10
	CMPQ DX, BX
	JAE lz4_s2_corrupt
	CMPQ R8, $0xff
	JEQ lz4_s2_ml_loop

lz4_s2_ml_done:
	ADDQ R10, SI

	// Same offset as the previous match: encode as a repeat, else as a copy.
	CMPQ R9, DI
	JNE lz4_s2_docopy

	// emitRepeat
emit_repeat_again_lz4_s2:
	// R8 = remaining length, R10 = remaining length - 4.
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE repeat_two_lz4_s2
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_lz4_s2
	CMPL R9, $0x00000800
	JB repeat_two_offset_lz4_s2

cant_repeat_two_offset_lz4_s2:
	// Pick the 3-, 4- or 5-byte form from the size of R10.
	CMPL R10, $0x00000104
	JB repeat_three_lz4_s2
	CMPL R10, $0x00010100
	JB repeat_four_lz4_s2
	CMPL R10, $0x0100ffff
	JB repeat_five_lz4_s2

	// Too long for one op: bytes 1d 00 fb ff ff, R10 -= 16842747, retry.
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_lz4_s2

repeat_five_lz4_s2:
	// 5 bytes: word 0x001d, then the low 24 bits of (R10 - 65536).
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP lz4_s2_loop

repeat_four_lz4_s2:
	// 4 bytes: word 0x0019, then the 16-bit value (R10 - 256).
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP lz4_s2_loop

repeat_three_lz4_s2:
	// 3 bytes: word 0x0015, then the single byte (R10 - 4).
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP lz4_s2_loop

repeat_two_lz4_s2:
	// 2 bytes: the 16-bit word (R10 << 2) | 1.
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

repeat_two_offset_lz4_s2:
	// 2 bytes: byte 0 = 1 + R10*4, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset.
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

lz4_s2_docopy:
	// Remember this offset for repeat detection on the next match.
	MOVQ R9, DI

	// emitCopy
	// The offset was read as 16 bits, so only the two-byte-offset forms occur.
	CMPL R10, $0x40
	JBE two_byte_offset_short_lz4_s2
	CMPL R9, $0x00000800
	JAE long_offset_short_lz4_s2

	// Length > 64, offset < 2048: 2-byte op with tag 0x11 | (offset >> 8) << 5
	// charged with 8 bytes of length, then emitRepeat for the rest.
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB R9, 1(AX)
	MOVL R9, R11
	SHRL $0x08, R11
	SHLL $0x05, R11
	ORL R11, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R10

	// emitRepeat
	LEAL -4(R10), R10
	JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b

emit_repeat_again_lz4_s2_emit_copy_short_2b:
	// R8 = remaining length, R10 = remaining length - 4.
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE repeat_two_lz4_s2_emit_copy_short_2b
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
	CMPL R9, $0x00000800
	JB repeat_two_offset_lz4_s2_emit_copy_short_2b

cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
	// Pick the 3-, 4- or 5-byte form from the size of R10.
	CMPL R10, $0x00000104
	JB repeat_three_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x00010100
	JB repeat_four_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x0100ffff
	JB repeat_five_lz4_s2_emit_copy_short_2b

	// Too long for one op: bytes 1d 00 fb ff ff, R10 -= 16842747, retry.
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_lz4_s2_emit_copy_short_2b

repeat_five_lz4_s2_emit_copy_short_2b:
	// 5 bytes: word 0x001d, then the low 24 bits of (R10 - 65536).
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP lz4_s2_loop

repeat_four_lz4_s2_emit_copy_short_2b:
	// 4 bytes: word 0x0019, then the 16-bit value (R10 - 256).
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP lz4_s2_loop

repeat_three_lz4_s2_emit_copy_short_2b:
	// 3 bytes: word 0x0015, then the single byte (R10 - 4).
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP lz4_s2_loop

repeat_two_lz4_s2_emit_copy_short_2b:
	// 2 bytes: the 16-bit word (R10 << 2) | 1.
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short_2b:
	// 2 bytes: byte 0 = 1 + R10*4, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset.
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

long_offset_short_lz4_s2:
	// Length > 64, offset >= 2048: 3-byte op (tag 0xee + 16-bit offset)
	// charged with 60 bytes of length, then emitRepeat for the rest.
	MOVB $0xee, (AX)
	MOVW R9, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_lz4_s2_emit_copy_short:
	// R8 = remaining length, R10 = remaining length - 4.
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE repeat_two_lz4_s2_emit_copy_short
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_lz4_s2_emit_copy_short
	CMPL R9, $0x00000800
	JB repeat_two_offset_lz4_s2_emit_copy_short

cant_repeat_two_offset_lz4_s2_emit_copy_short:
	// Pick the 3-, 4- or 5-byte form from the size of R10.
	CMPL R10, $0x00000104
	JB repeat_three_lz4_s2_emit_copy_short
	CMPL R10, $0x00010100
	JB repeat_four_lz4_s2_emit_copy_short
	CMPL R10, $0x0100ffff
	JB repeat_five_lz4_s2_emit_copy_short

	// Too long for one op: bytes 1d 00 fb ff ff, R10 -= 16842747, retry.
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_lz4_s2_emit_copy_short

repeat_five_lz4_s2_emit_copy_short:
	// 5 bytes: word 0x001d, then the low 24 bits of (R10 - 65536).
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP lz4_s2_loop

repeat_four_lz4_s2_emit_copy_short:
	// 4 bytes: word 0x0019, then the 16-bit value (R10 - 256).
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP lz4_s2_loop

repeat_three_lz4_s2_emit_copy_short:
	// 3 bytes: word 0x0015, then the single byte (R10 - 4).
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP lz4_s2_loop

repeat_two_lz4_s2_emit_copy_short:
	// 2 bytes: the 16-bit word (R10 << 2) | 1.
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short:
	// 2 bytes: byte 0 = 1 + R10*4, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset.
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

two_byte_offset_short_lz4_s2:
	// Single op for length <= 64. R8 = length * 4 is the base of the tag.
	MOVL R10, R8
	SHLL $0x02, R8

	// The 2-byte op needs length < 12 and offset < 2048.
	CMPL R10, $0x0c
	JAE emit_copy_three_lz4_s2
	CMPL R9, $0x00000800
	JAE emit_copy_three_lz4_s2

	// 2-byte op: tag = length*4 - 15, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset.
	LEAL -15(R8), R8
	MOVB R9, 1(AX)
	SHRL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

emit_copy_three_lz4_s2:
	// 3-byte op: tag = length*4 - 2, then the 16-bit offset.
	LEAL -2(R8), R8
	MOVB R8, (AX)
	MOVW R9, 1(AX)
	ADDQ $0x03, AX
	JMP lz4_s2_loop

lz4_s2_done:
	// dstUsed = write cursor - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4_s2_corrupt:
	// uncompressed = -1. dstUsed is stored as 0 so the second result is
	// defined on this path instead of being left as stale stack contents.
	// NOTE(review): this file is generated (see the file header); mirror this
	// store in the generator so it survives regeneration.
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4_s2_dstfull:
	// uncompressed = -2, dstUsed = 0 (see lz4_s2_corrupt).
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET
20259
// func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// Variant of cvtLZ4BlockAsm for the "LZ4s" layout. It differs in three places:
// the base match length added to the token's low nibble is 3 instead of 4,
// match-length extension bytes follow when that sum is 0x12, and a sequence
// whose match nibble is zero (R10 == 3) carries no match at all, so the loop
// moves straight on to the next token.
//
// Results:
//   success:        uncompressed = total literal + match bytes described,
//                   dstUsed      = number of bytes written to dst
//   corrupt input:  uncompressed = -1, dstUsed = 0
//   dst too small:  uncompressed = -2, dstUsed = 0
//
// Register roles:
//   AX  = write cursor into dst      CX  = dst_base + dst_len - 10 (write limit)
//   DX  = read cursor into src       BX  = src_base + src_len (end of input)
//   SI  = uncompressed byte count    DI  = offset of the previous match
//   R9  = literal length, later the match offset
//   R10 = match length (token low nibble + 3, plus extension bytes)
//   R8, R11-R15, X0-X5 = scratch
TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -10(AX)(CX*1), CX
	XORQ DI, DI

lz4s_s2_loop:
	// Each sequence needs at least one input byte and room below the limit.
	CMPQ DX, BX
	JAE lz4s_s2_corrupt
	CMPQ AX, CX
	JAE lz4s_s2_dstfull

	// Token: high nibble -> literal length (R9), low nibble -> match length (R10).
	MOVBQZX (DX), R8
	MOVQ R8, R9
	MOVQ R8, R10
	SHRQ $0x04, R9
	ANDQ $0x0f, R10

	// A literal nibble of 15 (token >= 0xf0) is followed by extension bytes.
	CMPQ R8, $0xf0
	JB lz4s_s2_ll_end

lz4s_s2_ll_loop:
	// Add extension bytes to the literal length until one is not 0xff.
	INCQ DX
	CMPQ DX, BX
	JAE lz4s_s2_corrupt
	MOVBQZX (DX), R8
	ADDQ R8, R9
	CMPQ R8, $0xff
	JEQ lz4s_s2_ll_loop

lz4s_s2_ll_end:
	// DX still points at the last length byte, so R8 = DX + R9 is the address
	// of the last literal byte; it must lie inside src.
	LEAQ (DX)(R9*1), R8
	ADDQ $0x03, R10
	CMPQ R8, BX
	JAE lz4s_s2_corrupt

	// DX -> first literal byte, R8 -> first byte after the literals.
	INCQ DX
	INCQ R8
	TESTQ R9, R9
	JZ lz4s_s2_lits_done

	// The literal bytes must end below the dst limit.
	LEAQ (AX)(R9*1), R11
	CMPQ R11, CX
	JAE lz4s_s2_dstfull
	ADDQ R9, SI

	// Literal header, sized by R11 = literal length - 1.
	LEAL -1(R9), R11
	CMPL R11, $0x3c
	JB one_byte_lz4s_s2
	CMPL R11, $0x00000100
	JB two_bytes_lz4s_s2
	CMPL R11, $0x00010000
	JB three_bytes_lz4s_s2
	CMPL R11, $0x01000000
	JB four_bytes_lz4s_s2

	// 5-byte header: tag 0xfc + 32-bit (length - 1).
	MOVB $0xfc, (AX)
	MOVL R11, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_lz4s_s2

four_bytes_lz4s_s2:
	// 4-byte header: tag 0xf8 + 24-bit (length - 1).
	MOVL R11, R12
	SHRL $0x10, R12
	MOVB $0xf8, (AX)
	MOVW R11, 1(AX)
	MOVB R12, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_lz4s_s2

three_bytes_lz4s_s2:
	// 3-byte header: tag 0xf4 + 16-bit (length - 1).
	MOVB $0xf4, (AX)
	MOVW R11, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_lz4s_s2

two_bytes_lz4s_s2:
	// 2-byte header: tag 0xf0 + 8-bit (length - 1).
	MOVB $0xf0, (AX)
	MOVB R11, 1(AX)
	ADDQ $0x02, AX

	// Up to 64 literal bytes still fit the short copy routine.
	CMPL R11, $0x40
	JB memmove_lz4s_s2
	JMP memmove_long_lz4s_s2

one_byte_lz4s_s2:
	// 1-byte header: (length - 1) << 2.
	SHLB $0x02, R11
	MOVB R11, (AX)
	ADDQ $0x01, AX

memmove_lz4s_s2:
	// R11 = dst cursor after the literals.
	LEAQ (AX)(R9*1), R11

	// genMemMoveShort
	// Copies R9 (<= 64) bytes using possibly overlapping head/tail moves.
	CMPQ R9, $0x08
	JBE emit_lit_memmove_lz4s_s2_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32
	JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64

emit_lit_memmove_lz4s_s2_memmove_move_8:
	// Lengths up to 8: one 8-byte load and store, whatever the exact length.
	MOVQ (DX), R12
	MOVQ R12, (AX)
	JMP memmove_end_copy_lz4s_s2

emit_lit_memmove_lz4s_s2_memmove_move_8through16:
	// First 8 and last 8 bytes. DX is overwritten here; it is reloaded from R8
	// at lz4s_s2_lits_emit_done.
	MOVQ (DX), R12
	MOVQ -8(DX)(R9*1), DX
	MOVQ R12, (AX)
	MOVQ DX, -8(AX)(R9*1)
	JMP memmove_end_copy_lz4s_s2

emit_lit_memmove_lz4s_s2_memmove_move_17through32:
	// First 16 and last 16 bytes.
	MOVOU (DX), X0
	MOVOU -16(DX)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP memmove_end_copy_lz4s_s2

emit_lit_memmove_lz4s_s2_memmove_move_33through64:
	// First 32 and last 32 bytes.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_lz4s_s2:
	MOVQ R11, AX
	JMP lz4s_s2_lits_emit_done

memmove_long_lz4s_s2:
	// R11 = dst cursor after the literals.
	LEAQ (AX)(R9*1), R11

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front and stored last with
	// unaligned moves; the middle is copied in 32-byte chunks.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3

	// R13 = length / 32. R14 = 64 - (AX & 31), so AX + R14 - 32 is 32-byte
	// aligned and the MOVOA stores below hit aligned addresses.
	MOVQ R9, R13
	SHRQ $0x05, R13
	MOVQ AX, R12
	ANDL $0x0000001f, R12
	MOVQ $0x00000040, R14
	SUBQ R12, R14

	// NOTE(review): DECQ does not modify CF, so JA/JNA here combine ZF from the
	// DECQ with the carry left by the preceding SUBQ/ADDQ; kept as generated.
	DECQ R13
	JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
	LEAQ -32(DX)(R14*1), R12
	LEAQ -32(AX)(R14*1), R15

emit_lit_memmove_long_lz4s_s2large_big_loop_back:
	MOVOU (R12), X4
	MOVOU 16(R12), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ $0x20, R15
	ADDQ $0x20, R12
	ADDQ $0x20, R14
	DECQ R13
	JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back

emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R14 while R14 has not passed the length.
	MOVOU -32(DX)(R14*1), X4
	MOVOU -16(DX)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ $0x20, R14
	CMPQ R9, R14
	JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32

	// Finish with the preloaded head and tail.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ R11, AX

lz4s_s2_lits_emit_done:
	// Continue reading right after the literals.
	MOVQ R8, DX

lz4s_s2_lits_done:
	// Input exhausted after the literals: valid only when the token's match
	// nibble was zero (R10 == 3).
	CMPQ DX, BX
	JNE lz4s_s2_match
	CMPQ R10, $0x03
	JEQ lz4s_s2_done
	JMP lz4s_s2_corrupt

lz4s_s2_match:
	// Match nibble zero: this sequence has no match, go to the next token.
	CMPQ R10, $0x03
	JEQ lz4s_s2_loop

	// 16-bit match offset; at least one more byte must follow it.
	LEAQ 2(DX), R8
	CMPQ R8, BX
	JAE lz4s_s2_corrupt
	MOVWQZX (DX), R9
	MOVQ R8, DX

	// Reject a zero offset and an offset beyond the bytes produced so far.
	TESTQ R9, R9
	JZ lz4s_s2_corrupt
	CMPQ R9, SI
	JA lz4s_s2_corrupt

	// A match nibble of 15 (R10 == 15 + 3) is followed by extension bytes.
	CMPQ R10, $0x12
	JNE lz4s_s2_ml_done

lz4s_s2_ml_loop:
	// Add extension bytes to the match length until one is not 0xff.
	MOVBQZX (DX), R8
	INCQ DX
	ADDQ R8, R10
	CMPQ DX, BX
	JAE lz4s_s2_corrupt
	CMPQ R8, $0xff
	JEQ lz4s_s2_ml_loop

lz4s_s2_ml_done:
	ADDQ R10, SI

	// Same offset as the previous match: encode as a repeat, else as a copy.
	CMPQ R9, DI
	JNE lz4s_s2_docopy

	// emitRepeat
emit_repeat_again_lz4_s2:
	// R8 = remaining length, R10 = remaining length - 4.
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE repeat_two_lz4_s2
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_lz4_s2
	CMPL R9, $0x00000800
	JB repeat_two_offset_lz4_s2

cant_repeat_two_offset_lz4_s2:
	// Pick the 3-, 4- or 5-byte form from the size of R10.
	CMPL R10, $0x00000104
	JB repeat_three_lz4_s2
	CMPL R10, $0x00010100
	JB repeat_four_lz4_s2
	CMPL R10, $0x0100ffff
	JB repeat_five_lz4_s2

	// Too long for one op: bytes 1d 00 fb ff ff, R10 -= 16842747, retry.
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_lz4_s2

repeat_five_lz4_s2:
	// 5 bytes: word 0x001d, then the low 24 bits of (R10 - 65536).
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP lz4s_s2_loop

repeat_four_lz4_s2:
	// 4 bytes: word 0x0019, then the 16-bit value (R10 - 256).
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP lz4s_s2_loop

repeat_three_lz4_s2:
	// 3 bytes: word 0x0015, then the single byte (R10 - 4).
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP lz4s_s2_loop

repeat_two_lz4_s2:
	// 2 bytes: the 16-bit word (R10 << 2) | 1.
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

repeat_two_offset_lz4_s2:
	// 2 bytes: byte 0 = 1 + R10*4, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset.
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

lz4s_s2_docopy:
	// Remember this offset for repeat detection on the next match.
	MOVQ R9, DI

	// emitCopy
	// The offset was read as 16 bits, so only the two-byte-offset forms occur.
	CMPL R10, $0x40
	JBE two_byte_offset_short_lz4_s2
	CMPL R9, $0x00000800
	JAE long_offset_short_lz4_s2

	// Length > 64, offset < 2048: 2-byte op with tag 0x11 | (offset >> 8) << 5
	// charged with 8 bytes of length, then emitRepeat for the rest.
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB R9, 1(AX)
	MOVL R9, R11
	SHRL $0x08, R11
	SHLL $0x05, R11
	ORL R11, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R10

	// emitRepeat
	LEAL -4(R10), R10
	JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b

emit_repeat_again_lz4_s2_emit_copy_short_2b:
	// R8 = remaining length, R10 = remaining length - 4.
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE repeat_two_lz4_s2_emit_copy_short_2b
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
	CMPL R9, $0x00000800
	JB repeat_two_offset_lz4_s2_emit_copy_short_2b

cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
	// Pick the 3-, 4- or 5-byte form from the size of R10.
	CMPL R10, $0x00000104
	JB repeat_three_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x00010100
	JB repeat_four_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x0100ffff
	JB repeat_five_lz4_s2_emit_copy_short_2b

	// Too long for one op: bytes 1d 00 fb ff ff, R10 -= 16842747, retry.
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_lz4_s2_emit_copy_short_2b

repeat_five_lz4_s2_emit_copy_short_2b:
	// 5 bytes: word 0x001d, then the low 24 bits of (R10 - 65536).
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP lz4s_s2_loop

repeat_four_lz4_s2_emit_copy_short_2b:
	// 4 bytes: word 0x0019, then the 16-bit value (R10 - 256).
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP lz4s_s2_loop

repeat_three_lz4_s2_emit_copy_short_2b:
	// 3 bytes: word 0x0015, then the single byte (R10 - 4).
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP lz4s_s2_loop

repeat_two_lz4_s2_emit_copy_short_2b:
	// 2 bytes: the 16-bit word (R10 << 2) | 1.
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short_2b:
	// 2 bytes: byte 0 = 1 + R10*4, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset.
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

long_offset_short_lz4_s2:
	// Length > 64, offset >= 2048: 3-byte op (tag 0xee + 16-bit offset)
	// charged with 60 bytes of length, then emitRepeat for the rest.
	MOVB $0xee, (AX)
	MOVW R9, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_lz4_s2_emit_copy_short:
	// R8 = remaining length, R10 = remaining length - 4.
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE repeat_two_lz4_s2_emit_copy_short
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_lz4_s2_emit_copy_short
	CMPL R9, $0x00000800
	JB repeat_two_offset_lz4_s2_emit_copy_short

cant_repeat_two_offset_lz4_s2_emit_copy_short:
	// Pick the 3-, 4- or 5-byte form from the size of R10.
	CMPL R10, $0x00000104
	JB repeat_three_lz4_s2_emit_copy_short
	CMPL R10, $0x00010100
	JB repeat_four_lz4_s2_emit_copy_short
	CMPL R10, $0x0100ffff
	JB repeat_five_lz4_s2_emit_copy_short

	// Too long for one op: bytes 1d 00 fb ff ff, R10 -= 16842747, retry.
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_lz4_s2_emit_copy_short

repeat_five_lz4_s2_emit_copy_short:
	// 5 bytes: word 0x001d, then the low 24 bits of (R10 - 65536).
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP lz4s_s2_loop

repeat_four_lz4_s2_emit_copy_short:
	// 4 bytes: word 0x0019, then the 16-bit value (R10 - 256).
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP lz4s_s2_loop

repeat_three_lz4_s2_emit_copy_short:
	// 3 bytes: word 0x0015, then the single byte (R10 - 4).
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP lz4s_s2_loop

repeat_two_lz4_s2_emit_copy_short:
	// 2 bytes: the 16-bit word (R10 << 2) | 1.
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short:
	// 2 bytes: byte 0 = 1 + R10*4, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset.
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

two_byte_offset_short_lz4_s2:
	// Single op for length <= 64. R8 = length * 4 is the base of the tag.
	MOVL R10, R8
	SHLL $0x02, R8

	// The 2-byte op needs length < 12 and offset < 2048.
	CMPL R10, $0x0c
	JAE emit_copy_three_lz4_s2
	CMPL R9, $0x00000800
	JAE emit_copy_three_lz4_s2

	// 2-byte op: tag = length*4 - 15, OR'ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset.
	LEAL -15(R8), R8
	MOVB R9, 1(AX)
	SHRL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

emit_copy_three_lz4_s2:
	// 3-byte op: tag = length*4 - 2, then the 16-bit offset.
	LEAL -2(R8), R8
	MOVB R8, (AX)
	MOVW R9, 1(AX)
	ADDQ $0x03, AX
	JMP lz4s_s2_loop

lz4s_s2_done:
	// dstUsed = write cursor - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4s_s2_corrupt:
	// uncompressed = -1. dstUsed is stored as 0 so the second result is
	// defined on this path instead of being left as stale stack contents.
	// NOTE(review): this file is generated (see the file header); mirror this
	// store in the generator so it survives regeneration.
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4s_s2_dstfull:
	// uncompressed = -2, dstUsed = 0 (see lz4s_s2_corrupt).
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET
20744
20745// func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
20746// Requires: SSE2
//
// Reads src as a series of sequences, each starting with a token byte whose
// high nibble is a literal length and whose low nibble is a match length, and
// re-emits every sequence into dst as a literal tag plus bytes followed by one
// or more copy tags.
//
// Results:
//   success:            uncompressed = sum of all literal and match lengths,
//                       dstUsed      = number of bytes written to dst.
//   malformed input:    uncompressed = -1 (lz4_snappy_corrupt).
//   output limit hit:   uncompressed = -2 (lz4_snappy_dstfull).
// NOTE(review): the two error paths never store dstUsed+56(FP); callers
// presumably ignore dstUsed when uncompressed < 0 - confirm.
//
// Register roles (as used below):
//   AX  dst write cursor            CX  dst limit = dst_base + dst_len - 10
//   DX  src read cursor             BX  src end   = src_base + src_len
//   SI  running uncompressed count
//   R8  literal length, later the 16-bit match offset
//   R9  match length (token low nibble + 4, plus extension bytes)
//   DI, R10-R14, X0-X5  scratch
20747TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64
20748 XORQ SI, SI
20749 MOVQ dst_base+0(FP), AX
20750 MOVQ dst_len+8(FP), CX
20751 MOVQ src_base+24(FP), DX
20752 MOVQ src_len+32(FP), BX
20753 LEAQ (DX)(BX*1), BX
20754 LEAQ -10(AX)(CX*1), CX
20755
 // Top of the per-sequence loop: bail out if src is exhausted or the output
 // limit has been reached, then split the token into R8 (high nibble) and
 // R9 (low nibble).
20756lz4_snappy_loop:
20757 CMPQ DX, BX
20758 JAE lz4_snappy_corrupt
20759 CMPQ AX, CX
20760 JAE lz4_snappy_dstfull
20761 MOVBQZX (DX), DI
20762 MOVQ DI, R8
20763 MOVQ DI, R9
20764 SHRQ $0x04, R8
20765 ANDQ $0x0f, R9
 // Token below 0xf0 means the high nibble is not 15, so the literal length
 // has no extension bytes.
20766 CMPQ DI, $0xf0
20767 JB lz4_snappy_ll_end
20768
 // Extended literal length: keep adding the following bytes to R8 for as
 // long as each byte read equals 0xff; every read is bounds-checked first.
20769lz4_snappy_ll_loop:
20770 INCQ DX
20771 CMPQ DX, BX
20772 JAE lz4_snappy_corrupt
20773 MOVBQZX (DX), DI
20774 ADDQ DI, R8
20775 CMPQ DI, $0xff
20776 JEQ lz4_snappy_ll_loop
20777
 // DI = DX + R8 must stay below the end of src. After both increments DX
 // points at the first literal byte and DI just past the last one. R9 gets
 // the fixed +4 added to the match nibble.
20778lz4_snappy_ll_end:
20779 LEAQ (DX)(R8*1), DI
20780 ADDQ $0x04, R9
20781 CMPQ DI, BX
20782 JAE lz4_snappy_corrupt
20783 INCQ DX
20784 INCQ DI
20785 TESTQ R8, R8
20786 JZ lz4_snappy_lits_done
 // The literals must end before the dst limit, otherwise report dstfull.
20787 LEAQ (AX)(R8*1), R10
20788 CMPQ R10, CX
20789 JAE lz4_snappy_dstfull
20790 ADDQ R8, SI
 // R10 = length-1. Pick the literal tag form by magnitude: below 60 it fits
 // in the tag byte itself, otherwise tag 0xf0/0xf4/0xf8/0xfc is followed by
 // 1/2/3/4 bytes holding R10.
20791 LEAL -1(R8), R10
20792 CMPL R10, $0x3c
20793 JB one_byte_lz4_snappy
20794 CMPL R10, $0x00000100
20795 JB two_bytes_lz4_snappy
20796 CMPL R10, $0x00010000
20797 JB three_bytes_lz4_snappy
20798 CMPL R10, $0x01000000
20799 JB four_bytes_lz4_snappy
20800 MOVB $0xfc, (AX)
20801 MOVL R10, 1(AX)
20802 ADDQ $0x05, AX
20803 JMP memmove_long_lz4_snappy
20804
 // Tag 0xf8 followed by the low 24 bits of R10 (16 bits, then bits 16-23).
20805four_bytes_lz4_snappy:
20806 MOVL R10, R11
20807 SHRL $0x10, R11
20808 MOVB $0xf8, (AX)
20809 MOVW R10, 1(AX)
20810 MOVB R11, 3(AX)
20811 ADDQ $0x04, AX
20812 JMP memmove_long_lz4_snappy
20813
 // Tag 0xf4 followed by the low 16 bits of R10.
20814three_bytes_lz4_snappy:
20815 MOVB $0xf4, (AX)
20816 MOVW R10, 1(AX)
20817 ADDQ $0x03, AX
20818 JMP memmove_long_lz4_snappy
20819
 // Tag 0xf0 followed by the low 8 bits of R10. R10 < 0x40 means at most 64
 // literal bytes, which the short copy below can handle.
20820two_bytes_lz4_snappy:
20821 MOVB $0xf0, (AX)
20822 MOVB R10, 1(AX)
20823 ADDQ $0x02, AX
20824 CMPL R10, $0x40
20825 JB memmove_lz4_snappy
20826 JMP memmove_long_lz4_snappy
20827
 // Single tag byte: (length-1) << 2. Falls through into the short copy.
20828one_byte_lz4_snappy:
20829 SHLB $0x02, R10
20830 MOVB R10, (AX)
20831 ADDQ $0x01, AX
20832
 // Short literal copy (reached with R8 <= 64). R10 remembers where dst must
 // end up; the copies below may write more bytes than R8 and AX is then set
 // to R10 rather than advanced by the amount stored.
20833memmove_lz4_snappy:
20834 LEAQ (AX)(R8*1), R10
20835
20836 // genMemMoveShort
20837 CMPQ R8, $0x08
20838 JBE emit_lit_memmove_lz4_snappy_memmove_move_8
20839 CMPQ R8, $0x10
20840 JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16
20841 CMPQ R8, $0x20
20842 JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32
20843 JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64
20844
 // R8 <= 8: one 8-byte load/store regardless of the exact length.
 // NOTE(review): this reads 8 bytes from src even when fewer literal bytes
 // remain; presumably the caller guarantees that is safe - confirm.
20845emit_lit_memmove_lz4_snappy_memmove_move_8:
20846 MOVQ (DX), R11
20847 MOVQ R11, (AX)
20848 JMP memmove_end_copy_lz4_snappy
20849
 // R8 in 9..16: first 8 and last 8 bytes, overlapping in the middle.
 // DX is reused as a data register here; it is reloaded from DI at
 // lz4_snappy_lits_emit_done.
20850emit_lit_memmove_lz4_snappy_memmove_move_8through16:
20851 MOVQ (DX), R11
20852 MOVQ -8(DX)(R8*1), DX
20853 MOVQ R11, (AX)
20854 MOVQ DX, -8(AX)(R8*1)
20855 JMP memmove_end_copy_lz4_snappy
20856
 // R8 in 17..32: first 16 and last 16 bytes, overlapping in the middle.
20857emit_lit_memmove_lz4_snappy_memmove_move_17through32:
20858 MOVOU (DX), X0
20859 MOVOU -16(DX)(R8*1), X1
20860 MOVOU X0, (AX)
20861 MOVOU X1, -16(AX)(R8*1)
20862 JMP memmove_end_copy_lz4_snappy
20863
 // R8 in 33..64: first 32 and last 32 bytes, overlapping in the middle.
20864emit_lit_memmove_lz4_snappy_memmove_move_33through64:
20865 MOVOU (DX), X0
20866 MOVOU 16(DX), X1
20867 MOVOU -32(DX)(R8*1), X2
20868 MOVOU -16(DX)(R8*1), X3
20869 MOVOU X0, (AX)
20870 MOVOU X1, 16(AX)
20871 MOVOU X2, -32(AX)(R8*1)
20872 MOVOU X3, -16(AX)(R8*1)
20873
20874memmove_end_copy_lz4_snappy:
20875 MOVQ R10, AX
20876 JMP lz4_snappy_lits_emit_done
20877
 // Long literal copy. The first and last 32 bytes of the source run are
 // loaded into X0..X3 up front and stored last with unaligned stores; the
 // loop in between uses aligned stores (MOVOA).
20878memmove_long_lz4_snappy:
20879 LEAQ (AX)(R8*1), R10
20880
20881 // genMemMoveLong
20882 MOVOU (DX), X0
20883 MOVOU 16(DX), X1
20884 MOVOU -32(DX)(R8*1), X2
20885 MOVOU -16(DX)(R8*1), X3
 // R12 = R8 / 32. R13 = 64 - (AX & 31), so that AX + R13 - 32 (the first
 // address stored to by the loop) is a multiple of 32.
20886 MOVQ R8, R12
20887 SHRQ $0x05, R12
20888 MOVQ AX, R11
20889 ANDL $0x0000001f, R11
20890 MOVQ $0x00000040, R13
20891 SUBQ R11, R13
20892 DECQ R12
20893 JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
20894 LEAQ -32(DX)(R13*1), R11
20895 LEAQ -32(AX)(R13*1), R14
20896
 // Pointer-based variant of the 32-byte copy step (R11 = src, R14 = dst);
 // reached only when the JA above is not taken.
20897emit_lit_memmove_long_lz4_snappylarge_big_loop_back:
20898 MOVOU (R11), X4
20899 MOVOU 16(R11), X5
20900 MOVOA X4, (R14)
20901 MOVOA X5, 16(R14)
20902 ADDQ $0x20, R14
20903 ADDQ $0x20, R11
20904 ADDQ $0x20, R13
20905 DECQ R12
20906 JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back
20907
 // Copy 32 bytes per iteration at offset R13-32 while R8 >= R13, then write
 // the saved head (X0, X1) and tail (X2, X3) and move AX to the end (R10).
20908emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32:
20909 MOVOU -32(DX)(R13*1), X4
20910 MOVOU -16(DX)(R13*1), X5
20911 MOVOA X4, -32(AX)(R13*1)
20912 MOVOA X5, -16(AX)(R13*1)
20913 ADDQ $0x20, R13
20914 CMPQ R8, R13
20915 JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
20916 MOVOU X0, (AX)
20917 MOVOU X1, 16(AX)
20918 MOVOU X2, -32(AX)(R8*1)
20919 MOVOU X3, -16(AX)(R8*1)
20920 MOVQ R10, AX
20921
 // Advance the src cursor past the literals just emitted.
20922lz4_snappy_lits_emit_done:
20923 MOVQ DI, DX
20924
 // If src is fully consumed here the block ends without a match; that is
 // accepted only when R9 == 4, i.e. the token's low nibble was zero.
20925lz4_snappy_lits_done:
20926 CMPQ DX, BX
20927 JNE lz4_snappy_match
20928 CMPQ R9, $0x04
20929 JEQ lz4_snappy_done
20930 JMP lz4_snappy_corrupt
20931
 // Read the 16-bit offset into R8 and step DX past it. Rejected as corrupt:
 // offset bytes reaching the end of src, offset == 0, or offset greater than
 // the number of bytes produced so far (SI).
20932lz4_snappy_match:
20933 LEAQ 2(DX), DI
20934 CMPQ DI, BX
20935 JAE lz4_snappy_corrupt
20936 MOVWQZX (DX), R8
20937 MOVQ DI, DX
20938 TESTQ R8, R8
20939 JZ lz4_snappy_corrupt
20940 CMPQ R8, SI
20941 JA lz4_snappy_corrupt
 // R9 == 0x13 means the low nibble was 15 (15 + 4), so extension bytes follow.
20942 CMPQ R9, $0x13
20943 JNE lz4_snappy_ml_done
20944
 // Extended match length: add bytes to R9 while each byte equals 0xff.
 // Reaching the end of src after consuming a byte is treated as corrupt.
20945lz4_snappy_ml_loop:
20946 MOVBQZX (DX), DI
20947 INCQ DX
20948 ADDQ DI, R9
20949 CMPQ DX, BX
20950 JAE lz4_snappy_corrupt
20951 CMPQ DI, $0xff
20952 JEQ lz4_snappy_ml_loop
20953
20954lz4_snappy_ml_done:
20955 ADDQ R9, SI
20956
20957 // emitCopy
 // While more than 64 bytes remain, emit tag 0xee (= (60-1)<<2 | 2) plus the
 // 16-bit offset and subtract 60 from R9. If the dst limit is reached the
 // code jumps back to lz4_snappy_loop, whose entry checks then return one of
 // the error codes.
20958two_byte_offset_lz4_s2:
20959 CMPL R9, $0x40
20960 JBE two_byte_offset_short_lz4_s2
20961 MOVB $0xee, (AX)
20962 MOVW R8, 1(AX)
20963 LEAL -60(R9), R9
20964 ADDQ $0x03, AX
20965 CMPQ AX, CX
20966 JAE lz4_snappy_loop
20967 JMP two_byte_offset_lz4_s2
20968
 // Remaining length <= 64. With length < 12 and offset < 2048 use the 2-byte
 // form: tag = 4*len - 15 (= (len-4)<<2 | 1) with offset bits 8..10 placed in
 // tag bits 5..7, followed by the low offset byte. Otherwise use the 3-byte
 // form below.
20969two_byte_offset_short_lz4_s2:
20970 MOVL R9, DI
20971 SHLL $0x02, DI
20972 CMPL R9, $0x0c
20973 JAE emit_copy_three_lz4_s2
20974 CMPL R8, $0x00000800
20975 JAE emit_copy_three_lz4_s2
20976 LEAL -15(DI), DI
20977 MOVB R8, 1(AX)
20978 SHRL $0x08, R8
20979 SHLL $0x05, R8
20980 ORL R8, DI
20981 MOVB DI, (AX)
20982 ADDQ $0x02, AX
20983 JMP lz4_snappy_loop
20984
 // 3-byte form: tag = 4*len - 2 (= (len-1)<<2 | 2) followed by the 16-bit offset.
20985emit_copy_three_lz4_s2:
20986 LEAL -2(DI), DI
20987 MOVB DI, (AX)
20988 MOVW R8, 1(AX)
20989 ADDQ $0x03, AX
20990 JMP lz4_snappy_loop
20991
 // Success: dstUsed = AX - dst_base, uncompressed = SI.
20992lz4_snappy_done:
20993 MOVQ dst_base+0(FP), CX
20994 SUBQ CX, AX
20995 MOVQ SI, uncompressed+48(FP)
20996 MOVQ AX, dstUsed+56(FP)
20997 RET
20998
 // Malformed input: uncompressed = -1 (dstUsed is not stored).
20999lz4_snappy_corrupt:
21000 XORQ AX, AX
21001 LEAQ -1(AX), SI
21002 MOVQ SI, uncompressed+48(FP)
21003 RET
21004
 // Output limit reached: uncompressed = -2 (dstUsed is not stored).
21005lz4_snappy_dstfull:
21006 XORQ AX, AX
21007 LEAQ -2(AX), SI
21008 MOVQ SI, uncompressed+48(FP)
21009 RET
21010
21011// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
21012// Requires: SSE2
//
// Reads src as a series of sequences, each starting with a token byte whose
// high nibble is a literal length and whose low nibble is a match length, and
// re-emits every sequence into dst as a literal tag plus bytes followed by
// copy tags. In this routine the fixed amount added to the match nibble is 3,
// and a sequence whose match nibble is zero carries no offset/match at all:
// the loop goes straight on to the next token.
//
// Results:
//   success:            uncompressed = sum of all literal and match lengths,
//                       dstUsed      = number of bytes written to dst.
//   malformed input:    uncompressed = -1 (lz4s_snappy_corrupt).
//   output limit hit:   uncompressed = -2 (lz4s_snappy_dstfull).
// NOTE(review): the two error paths never store dstUsed+56(FP); callers
// presumably ignore dstUsed when uncompressed < 0 - confirm.
//
// Register roles (as used below):
//   AX  dst write cursor            CX  dst limit = dst_base + dst_len - 10
//   DX  src read cursor             BX  src end   = src_base + src_len
//   SI  running uncompressed count
//   R8  literal length, later the 16-bit match offset
//   R9  match length (token low nibble + 3, plus extension bytes)
//   DI, R10-R14, X0-X5  scratch
21013TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64
21014 XORQ SI, SI
21015 MOVQ dst_base+0(FP), AX
21016 MOVQ dst_len+8(FP), CX
21017 MOVQ src_base+24(FP), DX
21018 MOVQ src_len+32(FP), BX
21019 LEAQ (DX)(BX*1), BX
21020 LEAQ -10(AX)(CX*1), CX
21021
 // Top of the per-sequence loop: bail out if src is exhausted or the output
 // limit has been reached, then split the token into R8 (high nibble) and
 // R9 (low nibble).
21022lz4s_snappy_loop:
21023 CMPQ DX, BX
21024 JAE lz4s_snappy_corrupt
21025 CMPQ AX, CX
21026 JAE lz4s_snappy_dstfull
21027 MOVBQZX (DX), DI
21028 MOVQ DI, R8
21029 MOVQ DI, R9
21030 SHRQ $0x04, R8
21031 ANDQ $0x0f, R9
 // Token below 0xf0 means the high nibble is not 15, so the literal length
 // has no extension bytes.
21032 CMPQ DI, $0xf0
21033 JB lz4s_snappy_ll_end
21034
 // Extended literal length: keep adding the following bytes to R8 for as
 // long as each byte read equals 0xff; every read is bounds-checked first.
21035lz4s_snappy_ll_loop:
21036 INCQ DX
21037 CMPQ DX, BX
21038 JAE lz4s_snappy_corrupt
21039 MOVBQZX (DX), DI
21040 ADDQ DI, R8
21041 CMPQ DI, $0xff
21042 JEQ lz4s_snappy_ll_loop
21043
 // DI = DX + R8 must stay below the end of src. After both increments DX
 // points at the first literal byte and DI just past the last one. R9 gets
 // the fixed +3 added to the match nibble.
21044lz4s_snappy_ll_end:
21045 LEAQ (DX)(R8*1), DI
21046 ADDQ $0x03, R9
21047 CMPQ DI, BX
21048 JAE lz4s_snappy_corrupt
21049 INCQ DX
21050 INCQ DI
21051 TESTQ R8, R8
21052 JZ lz4s_snappy_lits_done
 // The literals must end before the dst limit, otherwise report dstfull.
21053 LEAQ (AX)(R8*1), R10
21054 CMPQ R10, CX
21055 JAE lz4s_snappy_dstfull
21056 ADDQ R8, SI
 // R10 = length-1. Pick the literal tag form by magnitude: below 60 it fits
 // in the tag byte itself, otherwise tag 0xf0/0xf4/0xf8/0xfc is followed by
 // 1/2/3/4 bytes holding R10.
21057 LEAL -1(R8), R10
21058 CMPL R10, $0x3c
21059 JB one_byte_lz4s_snappy
21060 CMPL R10, $0x00000100
21061 JB two_bytes_lz4s_snappy
21062 CMPL R10, $0x00010000
21063 JB three_bytes_lz4s_snappy
21064 CMPL R10, $0x01000000
21065 JB four_bytes_lz4s_snappy
21066 MOVB $0xfc, (AX)
21067 MOVL R10, 1(AX)
21068 ADDQ $0x05, AX
21069 JMP memmove_long_lz4s_snappy
21070
 // Tag 0xf8 followed by the low 24 bits of R10 (16 bits, then bits 16-23).
21071four_bytes_lz4s_snappy:
21072 MOVL R10, R11
21073 SHRL $0x10, R11
21074 MOVB $0xf8, (AX)
21075 MOVW R10, 1(AX)
21076 MOVB R11, 3(AX)
21077 ADDQ $0x04, AX
21078 JMP memmove_long_lz4s_snappy
21079
 // Tag 0xf4 followed by the low 16 bits of R10.
21080three_bytes_lz4s_snappy:
21081 MOVB $0xf4, (AX)
21082 MOVW R10, 1(AX)
21083 ADDQ $0x03, AX
21084 JMP memmove_long_lz4s_snappy
21085
 // Tag 0xf0 followed by the low 8 bits of R10. R10 < 0x40 means at most 64
 // literal bytes, which the short copy below can handle.
21086two_bytes_lz4s_snappy:
21087 MOVB $0xf0, (AX)
21088 MOVB R10, 1(AX)
21089 ADDQ $0x02, AX
21090 CMPL R10, $0x40
21091 JB memmove_lz4s_snappy
21092 JMP memmove_long_lz4s_snappy
21093
 // Single tag byte: (length-1) << 2. Falls through into the short copy.
21094one_byte_lz4s_snappy:
21095 SHLB $0x02, R10
21096 MOVB R10, (AX)
21097 ADDQ $0x01, AX
21098
 // Short literal copy (reached with R8 <= 64). R10 remembers where dst must
 // end up; the copies below may write more bytes than R8 and AX is then set
 // to R10 rather than advanced by the amount stored.
21099memmove_lz4s_snappy:
21100 LEAQ (AX)(R8*1), R10
21101
21102 // genMemMoveShort
21103 CMPQ R8, $0x08
21104 JBE emit_lit_memmove_lz4s_snappy_memmove_move_8
21105 CMPQ R8, $0x10
21106 JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16
21107 CMPQ R8, $0x20
21108 JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32
21109 JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64
21110
 // R8 <= 8: one 8-byte load/store regardless of the exact length.
 // NOTE(review): this reads 8 bytes from src even when fewer literal bytes
 // remain; presumably the caller guarantees that is safe - confirm.
21111emit_lit_memmove_lz4s_snappy_memmove_move_8:
21112 MOVQ (DX), R11
21113 MOVQ R11, (AX)
21114 JMP memmove_end_copy_lz4s_snappy
21115
 // R8 in 9..16: first 8 and last 8 bytes, overlapping in the middle.
 // DX is reused as a data register here; it is reloaded from DI at
 // lz4s_snappy_lits_emit_done.
21116emit_lit_memmove_lz4s_snappy_memmove_move_8through16:
21117 MOVQ (DX), R11
21118 MOVQ -8(DX)(R8*1), DX
21119 MOVQ R11, (AX)
21120 MOVQ DX, -8(AX)(R8*1)
21121 JMP memmove_end_copy_lz4s_snappy
21122
 // R8 in 17..32: first 16 and last 16 bytes, overlapping in the middle.
21123emit_lit_memmove_lz4s_snappy_memmove_move_17through32:
21124 MOVOU (DX), X0
21125 MOVOU -16(DX)(R8*1), X1
21126 MOVOU X0, (AX)
21127 MOVOU X1, -16(AX)(R8*1)
21128 JMP memmove_end_copy_lz4s_snappy
21129
 // R8 in 33..64: first 32 and last 32 bytes, overlapping in the middle.
21130emit_lit_memmove_lz4s_snappy_memmove_move_33through64:
21131 MOVOU (DX), X0
21132 MOVOU 16(DX), X1
21133 MOVOU -32(DX)(R8*1), X2
21134 MOVOU -16(DX)(R8*1), X3
21135 MOVOU X0, (AX)
21136 MOVOU X1, 16(AX)
21137 MOVOU X2, -32(AX)(R8*1)
21138 MOVOU X3, -16(AX)(R8*1)
21139
21140memmove_end_copy_lz4s_snappy:
21141 MOVQ R10, AX
21142 JMP lz4s_snappy_lits_emit_done
21143
 // Long literal copy. The first and last 32 bytes of the source run are
 // loaded into X0..X3 up front and stored last with unaligned stores; the
 // loop in between uses aligned stores (MOVOA).
21144memmove_long_lz4s_snappy:
21145 LEAQ (AX)(R8*1), R10
21146
21147 // genMemMoveLong
21148 MOVOU (DX), X0
21149 MOVOU 16(DX), X1
21150 MOVOU -32(DX)(R8*1), X2
21151 MOVOU -16(DX)(R8*1), X3
 // R12 = R8 / 32. R13 = 64 - (AX & 31), so that AX + R13 - 32 (the first
 // address stored to by the loop) is a multiple of 32.
21152 MOVQ R8, R12
21153 SHRQ $0x05, R12
21154 MOVQ AX, R11
21155 ANDL $0x0000001f, R11
21156 MOVQ $0x00000040, R13
21157 SUBQ R11, R13
21158 DECQ R12
21159 JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
21160 LEAQ -32(DX)(R13*1), R11
21161 LEAQ -32(AX)(R13*1), R14
21162
 // Pointer-based variant of the 32-byte copy step (R11 = src, R14 = dst);
 // reached only when the JA above is not taken.
21163emit_lit_memmove_long_lz4s_snappylarge_big_loop_back:
21164 MOVOU (R11), X4
21165 MOVOU 16(R11), X5
21166 MOVOA X4, (R14)
21167 MOVOA X5, 16(R14)
21168 ADDQ $0x20, R14
21169 ADDQ $0x20, R11
21170 ADDQ $0x20, R13
21171 DECQ R12
21172 JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back
21173
 // Copy 32 bytes per iteration at offset R13-32 while R8 >= R13, then write
 // the saved head (X0, X1) and tail (X2, X3) and move AX to the end (R10).
21174emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32:
21175 MOVOU -32(DX)(R13*1), X4
21176 MOVOU -16(DX)(R13*1), X5
21177 MOVOA X4, -32(AX)(R13*1)
21178 MOVOA X5, -16(AX)(R13*1)
21179 ADDQ $0x20, R13
21180 CMPQ R8, R13
21181 JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
21182 MOVOU X0, (AX)
21183 MOVOU X1, 16(AX)
21184 MOVOU X2, -32(AX)(R8*1)
21185 MOVOU X3, -16(AX)(R8*1)
21186 MOVQ R10, AX
21187
 // Advance the src cursor past the literals just emitted.
21188lz4s_snappy_lits_emit_done:
21189 MOVQ DI, DX
21190
 // If src is fully consumed here the block ends without a match; that is
 // accepted only when R9 == 3, i.e. the token's low nibble was zero.
21191lz4s_snappy_lits_done:
21192 CMPQ DX, BX
21193 JNE lz4s_snappy_match
21194 CMPQ R9, $0x03
21195 JEQ lz4s_snappy_done
21196 JMP lz4s_snappy_corrupt
21197
 // R9 == 3 (match nibble zero): no offset is read, continue with the next
 // token. Otherwise read the 16-bit offset into R8 and step DX past it.
 // Rejected as corrupt: offset bytes reaching the end of src, offset == 0,
 // or offset greater than the number of bytes produced so far (SI).
21198lz4s_snappy_match:
21199 CMPQ R9, $0x03
21200 JEQ lz4s_snappy_loop
21201 LEAQ 2(DX), DI
21202 CMPQ DI, BX
21203 JAE lz4s_snappy_corrupt
21204 MOVWQZX (DX), R8
21205 MOVQ DI, DX
21206 TESTQ R8, R8
21207 JZ lz4s_snappy_corrupt
21208 CMPQ R8, SI
21209 JA lz4s_snappy_corrupt
 // R9 == 0x12 means the low nibble was 15 (15 + 3), so extension bytes follow.
21210 CMPQ R9, $0x12
21211 JNE lz4s_snappy_ml_done
21212
 // Extended match length: add bytes to R9 while each byte equals 0xff.
 // Reaching the end of src after consuming a byte is treated as corrupt.
21213lz4s_snappy_ml_loop:
21214 MOVBQZX (DX), DI
21215 INCQ DX
21216 ADDQ DI, R9
21217 CMPQ DX, BX
21218 JAE lz4s_snappy_corrupt
21219 CMPQ DI, $0xff
21220 JEQ lz4s_snappy_ml_loop
21221
21222lz4s_snappy_ml_done:
21223 ADDQ R9, SI
21224
21225 // emitCopy
 // While more than 64 bytes remain, emit tag 0xee (= (60-1)<<2 | 2) plus the
 // 16-bit offset and subtract 60 from R9. If the dst limit is reached the
 // code jumps back to lz4s_snappy_loop, whose entry checks then return one
 // of the error codes.
21226two_byte_offset_lz4_s2:
21227 CMPL R9, $0x40
21228 JBE two_byte_offset_short_lz4_s2
21229 MOVB $0xee, (AX)
21230 MOVW R8, 1(AX)
21231 LEAL -60(R9), R9
21232 ADDQ $0x03, AX
21233 CMPQ AX, CX
21234 JAE lz4s_snappy_loop
21235 JMP two_byte_offset_lz4_s2
21236
 // Remaining length <= 64. With length < 12 and offset < 2048 use the 2-byte
 // form: tag = 4*len - 15 with offset bits 8..10 placed in tag bits 5..7,
 // followed by the low offset byte. Otherwise use the 3-byte form below.
21237two_byte_offset_short_lz4_s2:
21238 MOVL R9, DI
21239 SHLL $0x02, DI
21240 CMPL R9, $0x0c
21241 JAE emit_copy_three_lz4_s2
21242 CMPL R8, $0x00000800
21243 JAE emit_copy_three_lz4_s2
21244 LEAL -15(DI), DI
21245 MOVB R8, 1(AX)
21246 SHRL $0x08, R8
21247 SHLL $0x05, R8
21248 ORL R8, DI
21249 MOVB DI, (AX)
21250 ADDQ $0x02, AX
21251 JMP lz4s_snappy_loop
21252
 // 3-byte form: tag = 4*len - 2 (= (len-1)<<2 | 2) followed by the 16-bit offset.
21253emit_copy_three_lz4_s2:
21254 LEAL -2(DI), DI
21255 MOVB DI, (AX)
21256 MOVW R8, 1(AX)
21257 ADDQ $0x03, AX
21258 JMP lz4s_snappy_loop
21259
 // Success: dstUsed = AX - dst_base, uncompressed = SI.
21260lz4s_snappy_done:
21261 MOVQ dst_base+0(FP), CX
21262 SUBQ CX, AX
21263 MOVQ SI, uncompressed+48(FP)
21264 MOVQ AX, dstUsed+56(FP)
21265 RET
21266
 // Malformed input: uncompressed = -1 (dstUsed is not stored).
21267lz4s_snappy_corrupt:
21268 XORQ AX, AX
21269 LEAQ -1(AX), SI
21270 MOVQ SI, uncompressed+48(FP)
21271 RET
21272
 // Output limit reached: uncompressed = -2 (dstUsed is not stored).
21273lz4s_snappy_dstfull:
21274 XORQ AX, AX
21275 LEAQ -2(AX), SI
21276 MOVQ SI, uncompressed+48(FP)
21277 RET
View as plain text