...

Text file src/github.com/klauspost/compress/zstd/seqdec_amd64.s

Documentation: github.com/klauspost/compress/zstd

     1// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
     2
     3//go:build !appengine && !noasm && gc && !noasm
     4
     5// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
     6// Requires: CMOV
      //
      // Decodes sequences until the counter at 96(ctx) goes negative. Each pass
      // reads offset, match length and literal length from the bit stream into
      // the 24-byte record at the cursor loaded from 104(ctx) (literal length at
      // +0, match length at +8, offset at +16), advances the three decoder
      // states, resolves the offset against the three history values kept at
      // 144/152/160(s), and validates the result.
      //
      // Returns 0 on success, 1 if a non-zero match length comes with a zero
      // offset, 2 if the match length exceeds 0x20002, 4 if the literal budget
      // at 128(ctx) goes negative, 6 if no input is left and more than 64 bits
      // have been consumed. On an error return the bit reader fields and the
      // offset history are not written back.
      //
      // Register roles: DX = bit buffer, BX = bits already consumed from DX,
      // SI = input bytes left, (SP) = saved read pointer, DI/R8/R9 =
      // literal-length / match-length / offset state entries, R10 = output
      // cursor, R11/R12/R13 = offset history (newest first).
      //
      // NOTE(review): the field names behind the numeric struct offsets are not
      // visible in this file; confirm them against the Go struct layouts.
     7TEXT ·sequenceDecs_decode_amd64(SB), $8-32
     8	MOVQ    br+8(FP), CX
     9	MOVQ    24(CX), DX // DX = bit buffer
    10	MOVBQZX 32(CX), BX // BX = bits already consumed from DX
    11	MOVQ    (CX), AX
    12	MOVQ    8(CX), SI // SI = input bytes not yet loaded
    13	ADDQ    SI, AX
    14	MOVQ    AX, (SP) // read pointer = base + remaining; it only ever moves downward
    15	MOVQ    ctx+16(FP), AX
    16	MOVQ    72(AX), DI // literal-length state entry
    17	MOVQ    80(AX), R8 // match-length state entry
    18	MOVQ    88(AX), R9 // offset state entry
    19	MOVQ    104(AX), R10 // cursor into the output records
    20	MOVQ    s+0(FP), AX
    21	MOVQ    144(AX), R11 // offset history, newest first
    22	MOVQ    152(AX), R12
    23	MOVQ    160(AX), R13
    24
    25sequenceDecs_decode_amd64_main_loop:
    26	MOVQ (SP), R14 // R14 = read pointer for this pass
    27
    28	// Fill bitreader to have enough for the offset and match length.
    29	CMPQ SI, $0x08
    30	JL   sequenceDecs_decode_amd64_fill_byte_by_byte
    31	MOVQ BX, AX
    32	SHRQ $0x03, AX // whole bytes consumed so far
    33	SUBQ AX, R14
    34	MOVQ (R14), DX // reload 8 bytes from the moved-back pointer
    35	SUBQ AX, SI
    36	ANDQ $0x07, BX // only the sub-byte bit position stays consumed
    37	JMP  sequenceDecs_decode_amd64_fill_end
    38
    39sequenceDecs_decode_amd64_fill_byte_by_byte:
    40	CMPQ    SI, $0x00
    41	JLE     sequenceDecs_decode_amd64_fill_check_overread
    42	CMPQ    BX, $0x07
    43	JLE     sequenceDecs_decode_amd64_fill_end // less than one whole byte consumed: done
    44	SHLQ    $0x08, DX
    45	SUBQ    $0x01, R14
    46	SUBQ    $0x01, SI
    47	SUBQ    $0x08, BX
    48	MOVBQZX (R14), AX
    49	ORQ     AX, DX // next lower-address byte enters at the bottom of the buffer
    50	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte
    51
    52sequenceDecs_decode_amd64_fill_check_overread:
    53	CMPQ BX, $0x40
    54	JA   error_overread // no input left and more than 64 bits consumed
    55
    56sequenceDecs_decode_amd64_fill_end:
    57	// Update offset
    58	MOVQ  R9, AX
    59	MOVQ  BX, CX
    60	MOVQ  DX, R15
    61	SHLQ  CL, R15 // drop the bits already consumed
    62	MOVB  AH, CL // CL = byte 1 of the state entry: number of extra bits n
    63	SHRQ  $0x20, AX // AX = upper 32 bits of the state entry: base value
    64	TESTQ CX, CX
    65	JZ    sequenceDecs_decode_amd64_of_update_zero // no extra bits: the value is the base alone
    66	ADDQ  CX, BX
    67	CMPQ  BX, $0x40
    68	JA    sequenceDecs_decode_amd64_of_update_zero
    69	CMPQ  CX, $0x40
    70	JAE   sequenceDecs_decode_amd64_of_update_zero
    71	NEGQ  CX
    72	SHRQ  CL, R15 // count is taken mod 64, so this is >> (64 - n): keeps the top n bits
    73	ADDQ  R15, AX
    74
    75sequenceDecs_decode_amd64_of_update_zero:
    76	MOVQ AX, 16(R10)
    77
    78	// Update match length
    79	MOVQ  R8, AX
    80	MOVQ  BX, CX
    81	MOVQ  DX, R15
    82	SHLQ  CL, R15
    83	MOVB  AH, CL
    84	SHRQ  $0x20, AX
    85	TESTQ CX, CX
    86	JZ    sequenceDecs_decode_amd64_ml_update_zero
    87	ADDQ  CX, BX
    88	CMPQ  BX, $0x40
    89	JA    sequenceDecs_decode_amd64_ml_update_zero
    90	CMPQ  CX, $0x40
    91	JAE   sequenceDecs_decode_amd64_ml_update_zero
    92	NEGQ  CX
    93	SHRQ  CL, R15
    94	ADDQ  R15, AX
    95
    96sequenceDecs_decode_amd64_ml_update_zero:
    97	MOVQ AX, 8(R10)
    98
    99	// Fill bitreader to have enough for the remaining
   100	CMPQ SI, $0x08
   101	JL   sequenceDecs_decode_amd64_fill_2_byte_by_byte
   102	MOVQ BX, AX
   103	SHRQ $0x03, AX
   104	SUBQ AX, R14
   105	MOVQ (R14), DX
   106	SUBQ AX, SI
   107	ANDQ $0x07, BX
   108	JMP  sequenceDecs_decode_amd64_fill_2_end
   109
   110sequenceDecs_decode_amd64_fill_2_byte_by_byte:
   111	CMPQ    SI, $0x00
   112	JLE     sequenceDecs_decode_amd64_fill_2_check_overread
   113	CMPQ    BX, $0x07
   114	JLE     sequenceDecs_decode_amd64_fill_2_end
   115	SHLQ    $0x08, DX
   116	SUBQ    $0x01, R14
   117	SUBQ    $0x01, SI
   118	SUBQ    $0x08, BX
   119	MOVBQZX (R14), AX
   120	ORQ     AX, DX
   121	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte
   122
   123sequenceDecs_decode_amd64_fill_2_check_overread:
   124	CMPQ BX, $0x40
   125	JA   error_overread
   126
   127sequenceDecs_decode_amd64_fill_2_end:
   128	// Update literal length
   129	MOVQ  DI, AX
   130	MOVQ  BX, CX
   131	MOVQ  DX, R15
   132	SHLQ  CL, R15
   133	MOVB  AH, CL
   134	SHRQ  $0x20, AX
   135	TESTQ CX, CX
   136	JZ    sequenceDecs_decode_amd64_ll_update_zero
   137	ADDQ  CX, BX
   138	CMPQ  BX, $0x40
   139	JA    sequenceDecs_decode_amd64_ll_update_zero
   140	CMPQ  CX, $0x40
   141	JAE   sequenceDecs_decode_amd64_ll_update_zero
   142	NEGQ  CX
   143	SHRQ  CL, R15
   144	ADDQ  R15, AX
   145
   146sequenceDecs_decode_amd64_ll_update_zero:
   147	MOVQ AX, (R10)
   148
   149	// Fill bitreader for state updates
   150	MOVQ    R14, (SP) // save the read pointer; R14 is reused as scratch below
   151	MOVQ    R9, AX
   152	SHRQ    $0x08, AX
   153	MOVBQZX AL, AX // AX = offset extra-bit count, taken before R9 is replaced below
   154	MOVQ    ctx+16(FP), CX
   155	CMPQ    96(CX), $0x00
   156	JZ      sequenceDecs_decode_amd64_skip_update // counter is zero: no state update on this pass
   157
   158	// Update Literal Length State
   159	MOVBQZX DI, R14 // R14 = byte 0 of the state entry: bits to read for the next state
   160	SHRL    $0x10, DI // DI = bits 16..31 of the entry
   161	LEAQ    (BX)(R14*1), CX
   162	MOVQ    DX, R15
   163	MOVQ    CX, BX // advance the consumed-bit count
   164	ROLQ    CL, R15 // rotate so the wanted bits end up lowest
   165	MOVL    $0x00000001, BP
   166	MOVB    R14, CL
   167	SHLL    CL, BP
   168	DECL    BP // BP = (1 << bits) - 1
   169	ANDQ    BP, R15
   170	ADDQ    R15, DI // table index = bits 16..31 of the entry + bits just read
   171
   172	// Load ctx.llTable
   173	MOVQ ctx+16(FP), CX
   174	MOVQ (CX), CX
   175	MOVQ (CX)(DI*8), DI // next state entry, 8 bytes per entry
   176
   177	// Update Match Length State
   178	MOVBQZX R8, R14
   179	SHRL    $0x10, R8
   180	LEAQ    (BX)(R14*1), CX
   181	MOVQ    DX, R15
   182	MOVQ    CX, BX
   183	ROLQ    CL, R15
   184	MOVL    $0x00000001, BP
   185	MOVB    R14, CL
   186	SHLL    CL, BP
   187	DECL    BP
   188	ANDQ    BP, R15
   189	ADDQ    R15, R8
   190
   191	// Load ctx.mlTable
   192	MOVQ ctx+16(FP), CX
   193	MOVQ 24(CX), CX
   194	MOVQ (CX)(R8*8), R8
   195
   196	// Update Offset State
   197	MOVBQZX R9, R14
   198	SHRL    $0x10, R9
   199	LEAQ    (BX)(R14*1), CX
   200	MOVQ    DX, R15
   201	MOVQ    CX, BX
   202	ROLQ    CL, R15
   203	MOVL    $0x00000001, BP
   204	MOVB    R14, CL
   205	SHLL    CL, BP
   206	DECL    BP
   207	ANDQ    BP, R15
   208	ADDQ    R15, R9
   209
   210	// Load ctx.ofTable
   211	MOVQ ctx+16(FP), CX
   212	MOVQ 48(CX), CX
   213	MOVQ (CX)(R9*8), R9
   214
   215sequenceDecs_decode_amd64_skip_update:
   216	// Adjust offset
   217	MOVQ 16(R10), CX // CX = offset value decoded above
   218	CMPQ AX, $0x01
   219	JBE  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 // at most one extra bit: value selects a history slot
   220	MOVQ R12, R13
   221	MOVQ R11, R12
   222	MOVQ CX, R11 // otherwise a new offset: push it onto the history
   223	JMP  sequenceDecs_decode_amd64_after_adjust
   224
   225sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
   226	CMPQ (R10), $0x00000000
   227	JNE  sequenceDecs_decode_amd64_adjust_offset_maybezero
   228	INCQ CX // literal length is zero: the selector is shifted up by one
   229	JMP  sequenceDecs_decode_amd64_adjust_offset_nonzero
   230
   231sequenceDecs_decode_amd64_adjust_offset_maybezero:
   232	TESTQ CX, CX
   233	JNZ   sequenceDecs_decode_amd64_adjust_offset_nonzero
   234	MOVQ  R11, CX // selector 0 with literals: reuse the newest offset, history unchanged
   235	JMP   sequenceDecs_decode_amd64_after_adjust
   236
   237sequenceDecs_decode_amd64_adjust_offset_nonzero:
   238	CMPQ CX, $0x01
   239	JB   sequenceDecs_decode_amd64_adjust_zero
   240	JEQ  sequenceDecs_decode_amd64_adjust_one
   241	CMPQ CX, $0x02
   242	JA   sequenceDecs_decode_amd64_adjust_three
   243	JMP  sequenceDecs_decode_amd64_adjust_two
   244
   245sequenceDecs_decode_amd64_adjust_zero:
   246	MOVQ R11, AX // NOTE(review): looks unreachable unless the INCQ above wraps to 0
   247	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
   248
   249sequenceDecs_decode_amd64_adjust_one:
   250	MOVQ R12, AX
   251	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
   252
   253sequenceDecs_decode_amd64_adjust_two:
   254	MOVQ R13, AX
   255	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
   256
   257sequenceDecs_decode_amd64_adjust_three:
   258	LEAQ -1(R11), AX // selector above 2: newest offset minus one
   259
   260sequenceDecs_decode_amd64_adjust_test_temp_valid:
   261	TESTQ AX, AX
   262	JNZ   sequenceDecs_decode_amd64_adjust_temp_valid
   263	MOVQ  $0x00000001, AX // a chosen offset of 0 is replaced by 1
   264
   265sequenceDecs_decode_amd64_adjust_temp_valid:
   266	CMPQ    CX, $0x01
   267	CMOVQNE R12, R13 // selector != 1: the second slot moves down to the third
   268	MOVQ    R11, R12
   269	MOVQ    AX, R11 // the chosen offset becomes the newest
   270	MOVQ    AX, CX
   271
   272sequenceDecs_decode_amd64_after_adjust:
   273	MOVQ CX, 16(R10)
   274
   275	// Check values
   276	MOVQ  8(R10), AX // AX = match length
   277	MOVQ  (R10), R14 // R14 = literal length
   278	LEAQ  (AX)(R14*1), R15
   279	MOVQ  s+0(FP), BP
   280	ADDQ  R15, 256(BP) // running total of match + literal lengths
   281	MOVQ  ctx+16(FP), R15
   282	SUBQ  R14, 128(R15) // charge the literals against the remaining budget
   283	JS    error_not_enough_literals
   284	CMPQ  AX, $0x00020002
   285	JA    sequenceDecs_decode_amd64_error_match_len_too_big
   286	TESTQ CX, CX
   287	JNZ   sequenceDecs_decode_amd64_match_len_ofs_ok
   288	TESTQ AX, AX
   289	JNZ   sequenceDecs_decode_amd64_error_match_len_ofs_mismatch // zero offset with a non-zero match length
   290
   291sequenceDecs_decode_amd64_match_len_ofs_ok:
   292	ADDQ $0x18, R10 // step to the next 24-byte record
   293	MOVQ ctx+16(FP), AX
   294	DECQ 96(AX)
   295	JNS  sequenceDecs_decode_amd64_main_loop // keep going while the counter is still >= 0
   296	MOVQ s+0(FP), AX
   297	MOVQ R11, 144(AX) // write back the offset history
   298	MOVQ R12, 152(AX)
   299	MOVQ R13, 160(AX)
   300	MOVQ br+8(FP), AX
   301	MOVQ DX, 24(AX) // write back the bit reader: buffer, consumed bits, bytes left
   302	MOVB BL, 32(AX)
   303	MOVQ SI, 8(AX)
   304
   305	// Return success
   306	MOVQ $0x00000000, ret+24(FP)
   307	RET
   308
   309	// Return with match length error
   310sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
   311	MOVQ $0x00000001, ret+24(FP)
   312	RET
   313
   314	// Return with match too long error
   315sequenceDecs_decode_amd64_error_match_len_too_big:
   316	MOVQ $0x00000002, ret+24(FP)
   317	RET
   318
   319	// Return with match offset too long error (no label here: nothing in this function branches to it)
   320	MOVQ $0x00000003, ret+24(FP)
   321	RET
   322
   323	// Return with not enough literals error
   324error_not_enough_literals:
   325	MOVQ $0x00000004, ret+24(FP)
   326	RET
   327
   328	// Return with overread error
   329error_overread:
   330	MOVQ $0x00000006, ret+24(FP)
   331	RET
   332
   333// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
   334// Requires: CMOV
      //
      // Decodes sequences until the counter at 96(ctx) goes negative. Each pass
      // reads offset, match length and literal length from the bit stream into
      // the 24-byte record at the cursor loaded from 104(ctx) (literal length at
      // +0, match length at +8, offset at +16), advances the three decoder
      // states, resolves the offset against the three history values kept at
      // 144/152/160(s), and validates the result.
      //
      // The bit buffer is refilled once per pass, before the offset is read;
      // there is no second refill before the literal length.
      // NOTE(review): presumably the caller only selects this variant when all
      // three values fit in the bits a single refill provides - confirm.
      //
      // Returns 0 on success, 1 if a non-zero match length comes with a zero
      // offset, 2 if the match length exceeds 0x20002, 4 if the literal budget
      // at 128(ctx) goes negative, 6 if no input is left and more than 64 bits
      // have been consumed. On an error return the bit reader fields and the
      // offset history are not written back.
      //
      // Register roles: DX = bit buffer, BX = bits already consumed from DX,
      // SI = input bytes left, (SP) = saved read pointer, DI/R8/R9 =
      // literal-length / match-length / offset state entries, R10 = output
      // cursor, R11/R12/R13 = offset history (newest first).
   335TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
   336	MOVQ    br+8(FP), CX
   337	MOVQ    24(CX), DX // DX = bit buffer
   338	MOVBQZX 32(CX), BX // BX = bits already consumed from DX
   339	MOVQ    (CX), AX
   340	MOVQ    8(CX), SI // SI = input bytes not yet loaded
   341	ADDQ    SI, AX
   342	MOVQ    AX, (SP) // read pointer = base + remaining; it only ever moves downward
   343	MOVQ    ctx+16(FP), AX
   344	MOVQ    72(AX), DI // literal-length state entry
   345	MOVQ    80(AX), R8 // match-length state entry
   346	MOVQ    88(AX), R9 // offset state entry
   347	MOVQ    104(AX), R10 // cursor into the output records
   348	MOVQ    s+0(FP), AX
   349	MOVQ    144(AX), R11 // offset history, newest first
   350	MOVQ    152(AX), R12
   351	MOVQ    160(AX), R13
   352
   353sequenceDecs_decode_56_amd64_main_loop:
   354	MOVQ (SP), R14 // R14 = read pointer for this pass
   355
   356	// Fill bitreader to have enough for the offset and match length.
   357	CMPQ SI, $0x08
   358	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte
   359	MOVQ BX, AX
   360	SHRQ $0x03, AX // whole bytes consumed so far
   361	SUBQ AX, R14
   362	MOVQ (R14), DX // reload 8 bytes from the moved-back pointer
   363	SUBQ AX, SI
   364	ANDQ $0x07, BX // only the sub-byte bit position stays consumed
   365	JMP  sequenceDecs_decode_56_amd64_fill_end
   366
   367sequenceDecs_decode_56_amd64_fill_byte_by_byte:
   368	CMPQ    SI, $0x00
   369	JLE     sequenceDecs_decode_56_amd64_fill_check_overread
   370	CMPQ    BX, $0x07
   371	JLE     sequenceDecs_decode_56_amd64_fill_end // less than one whole byte consumed: done
   372	SHLQ    $0x08, DX
   373	SUBQ    $0x01, R14
   374	SUBQ    $0x01, SI
   375	SUBQ    $0x08, BX
   376	MOVBQZX (R14), AX
   377	ORQ     AX, DX // next lower-address byte enters at the bottom of the buffer
   378	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte
   379
   380sequenceDecs_decode_56_amd64_fill_check_overread:
   381	CMPQ BX, $0x40
   382	JA   error_overread // no input left and more than 64 bits consumed
   383
   384sequenceDecs_decode_56_amd64_fill_end:
   385	// Update offset
   386	MOVQ  R9, AX
   387	MOVQ  BX, CX
   388	MOVQ  DX, R15
   389	SHLQ  CL, R15 // drop the bits already consumed
   390	MOVB  AH, CL // CL = byte 1 of the state entry: number of extra bits n
   391	SHRQ  $0x20, AX // AX = upper 32 bits of the state entry: base value
   392	TESTQ CX, CX
   393	JZ    sequenceDecs_decode_56_amd64_of_update_zero // no extra bits: the value is the base alone
   394	ADDQ  CX, BX
   395	CMPQ  BX, $0x40
   396	JA    sequenceDecs_decode_56_amd64_of_update_zero
   397	CMPQ  CX, $0x40
   398	JAE   sequenceDecs_decode_56_amd64_of_update_zero
   399	NEGQ  CX
   400	SHRQ  CL, R15 // count is taken mod 64, so this is >> (64 - n): keeps the top n bits
   401	ADDQ  R15, AX
   402
   403sequenceDecs_decode_56_amd64_of_update_zero:
   404	MOVQ AX, 16(R10)
   405
   406	// Update match length
   407	MOVQ  R8, AX
   408	MOVQ  BX, CX
   409	MOVQ  DX, R15
   410	SHLQ  CL, R15
   411	MOVB  AH, CL
   412	SHRQ  $0x20, AX
   413	TESTQ CX, CX
   414	JZ    sequenceDecs_decode_56_amd64_ml_update_zero
   415	ADDQ  CX, BX
   416	CMPQ  BX, $0x40
   417	JA    sequenceDecs_decode_56_amd64_ml_update_zero
   418	CMPQ  CX, $0x40
   419	JAE   sequenceDecs_decode_56_amd64_ml_update_zero
   420	NEGQ  CX
   421	SHRQ  CL, R15
   422	ADDQ  R15, AX
   423
   424sequenceDecs_decode_56_amd64_ml_update_zero:
   425	MOVQ AX, 8(R10)
   426
   427	// Update literal length
   428	MOVQ  DI, AX
   429	MOVQ  BX, CX
   430	MOVQ  DX, R15
   431	SHLQ  CL, R15
   432	MOVB  AH, CL
   433	SHRQ  $0x20, AX
   434	TESTQ CX, CX
   435	JZ    sequenceDecs_decode_56_amd64_ll_update_zero
   436	ADDQ  CX, BX
   437	CMPQ  BX, $0x40
   438	JA    sequenceDecs_decode_56_amd64_ll_update_zero
   439	CMPQ  CX, $0x40
   440	JAE   sequenceDecs_decode_56_amd64_ll_update_zero
   441	NEGQ  CX
   442	SHRQ  CL, R15
   443	ADDQ  R15, AX
   444
   445sequenceDecs_decode_56_amd64_ll_update_zero:
   446	MOVQ AX, (R10)
   447
   448	// Fill bitreader for state updates
   449	MOVQ    R14, (SP) // save the read pointer; R14 is reused as scratch below
   450	MOVQ    R9, AX
   451	SHRQ    $0x08, AX
   452	MOVBQZX AL, AX // AX = offset extra-bit count, taken before R9 is replaced below
   453	MOVQ    ctx+16(FP), CX
   454	CMPQ    96(CX), $0x00
   455	JZ      sequenceDecs_decode_56_amd64_skip_update // counter is zero: no state update on this pass
   456
   457	// Update Literal Length State
   458	MOVBQZX DI, R14 // R14 = byte 0 of the state entry: bits to read for the next state
   459	SHRL    $0x10, DI // DI = bits 16..31 of the entry
   460	LEAQ    (BX)(R14*1), CX
   461	MOVQ    DX, R15
   462	MOVQ    CX, BX // advance the consumed-bit count
   463	ROLQ    CL, R15 // rotate so the wanted bits end up lowest
   464	MOVL    $0x00000001, BP
   465	MOVB    R14, CL
   466	SHLL    CL, BP
   467	DECL    BP // BP = (1 << bits) - 1
   468	ANDQ    BP, R15
   469	ADDQ    R15, DI // table index = bits 16..31 of the entry + bits just read
   470
   471	// Load ctx.llTable
   472	MOVQ ctx+16(FP), CX
   473	MOVQ (CX), CX
   474	MOVQ (CX)(DI*8), DI // next state entry, 8 bytes per entry
   475
   476	// Update Match Length State
   477	MOVBQZX R8, R14
   478	SHRL    $0x10, R8
   479	LEAQ    (BX)(R14*1), CX
   480	MOVQ    DX, R15
   481	MOVQ    CX, BX
   482	ROLQ    CL, R15
   483	MOVL    $0x00000001, BP
   484	MOVB    R14, CL
   485	SHLL    CL, BP
   486	DECL    BP
   487	ANDQ    BP, R15
   488	ADDQ    R15, R8
   489
   490	// Load ctx.mlTable
   491	MOVQ ctx+16(FP), CX
   492	MOVQ 24(CX), CX
   493	MOVQ (CX)(R8*8), R8
   494
   495	// Update Offset State
   496	MOVBQZX R9, R14
   497	SHRL    $0x10, R9
   498	LEAQ    (BX)(R14*1), CX
   499	MOVQ    DX, R15
   500	MOVQ    CX, BX
   501	ROLQ    CL, R15
   502	MOVL    $0x00000001, BP
   503	MOVB    R14, CL
   504	SHLL    CL, BP
   505	DECL    BP
   506	ANDQ    BP, R15
   507	ADDQ    R15, R9
   508
   509	// Load ctx.ofTable
   510	MOVQ ctx+16(FP), CX
   511	MOVQ 48(CX), CX
   512	MOVQ (CX)(R9*8), R9
   513
   514sequenceDecs_decode_56_amd64_skip_update:
   515	// Adjust offset
   516	MOVQ 16(R10), CX // CX = offset value decoded above
   517	CMPQ AX, $0x01
   518	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0 // at most one extra bit: value selects a history slot
   519	MOVQ R12, R13
   520	MOVQ R11, R12
   521	MOVQ CX, R11 // otherwise a new offset: push it onto the history
   522	JMP  sequenceDecs_decode_56_amd64_after_adjust
   523
   524sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
   525	CMPQ (R10), $0x00000000
   526	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
   527	INCQ CX // literal length is zero: the selector is shifted up by one
   528	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero
   529
   530sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
   531	TESTQ CX, CX
   532	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
   533	MOVQ  R11, CX // selector 0 with literals: reuse the newest offset, history unchanged
   534	JMP   sequenceDecs_decode_56_amd64_after_adjust
   535
   536sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
   537	CMPQ CX, $0x01
   538	JB   sequenceDecs_decode_56_amd64_adjust_zero
   539	JEQ  sequenceDecs_decode_56_amd64_adjust_one
   540	CMPQ CX, $0x02
   541	JA   sequenceDecs_decode_56_amd64_adjust_three
   542	JMP  sequenceDecs_decode_56_amd64_adjust_two
   543
   544sequenceDecs_decode_56_amd64_adjust_zero:
   545	MOVQ R11, AX // NOTE(review): looks unreachable unless the INCQ above wraps to 0
   546	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
   547
   548sequenceDecs_decode_56_amd64_adjust_one:
   549	MOVQ R12, AX
   550	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
   551
   552sequenceDecs_decode_56_amd64_adjust_two:
   553	MOVQ R13, AX
   554	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
   555
   556sequenceDecs_decode_56_amd64_adjust_three:
   557	LEAQ -1(R11), AX // selector above 2: newest offset minus one
   558
   559sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
   560	TESTQ AX, AX
   561	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
   562	MOVQ  $0x00000001, AX // a chosen offset of 0 is replaced by 1
   563
   564sequenceDecs_decode_56_amd64_adjust_temp_valid:
   565	CMPQ    CX, $0x01
   566	CMOVQNE R12, R13 // selector != 1: the second slot moves down to the third
   567	MOVQ    R11, R12
   568	MOVQ    AX, R11 // the chosen offset becomes the newest
   569	MOVQ    AX, CX
   570
   571sequenceDecs_decode_56_amd64_after_adjust:
   572	MOVQ CX, 16(R10)
   573
   574	// Check values
   575	MOVQ  8(R10), AX // AX = match length
   576	MOVQ  (R10), R14 // R14 = literal length
   577	LEAQ  (AX)(R14*1), R15
   578	MOVQ  s+0(FP), BP
   579	ADDQ  R15, 256(BP) // running total of match + literal lengths
   580	MOVQ  ctx+16(FP), R15
   581	SUBQ  R14, 128(R15) // charge the literals against the remaining budget
   582	JS    error_not_enough_literals
   583	CMPQ  AX, $0x00020002
   584	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big
   585	TESTQ CX, CX
   586	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
   587	TESTQ AX, AX
   588	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch // zero offset with a non-zero match length
   589
   590sequenceDecs_decode_56_amd64_match_len_ofs_ok:
   591	ADDQ $0x18, R10 // step to the next 24-byte record
   592	MOVQ ctx+16(FP), AX
   593	DECQ 96(AX)
   594	JNS  sequenceDecs_decode_56_amd64_main_loop // keep going while the counter is still >= 0
   595	MOVQ s+0(FP), AX
   596	MOVQ R11, 144(AX) // write back the offset history
   597	MOVQ R12, 152(AX)
   598	MOVQ R13, 160(AX)
   599	MOVQ br+8(FP), AX
   600	MOVQ DX, 24(AX) // write back the bit reader: buffer, consumed bits, bytes left
   601	MOVB BL, 32(AX)
   602	MOVQ SI, 8(AX)
   603
   604	// Return success
   605	MOVQ $0x00000000, ret+24(FP)
   606	RET
   607
   608	// Return with match length error
   609sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
   610	MOVQ $0x00000001, ret+24(FP)
   611	RET
   612
   613	// Return with match too long error
   614sequenceDecs_decode_56_amd64_error_match_len_too_big:
   615	MOVQ $0x00000002, ret+24(FP)
   616	RET
   617
   618	// Return with match offset too long error (no label here: nothing in this function branches to it)
   619	MOVQ $0x00000003, ret+24(FP)
   620	RET
   621
   622	// Return with not enough literals error
   623error_not_enough_literals:
   624	MOVQ $0x00000004, ret+24(FP)
   625	RET
   626
   627	// Return with overread error
   628error_overread:
   629	MOVQ $0x00000006, ret+24(FP)
   630	RET
   631
   632// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
   633// Requires: BMI, BMI2, CMOV
      //
      // Decodes sequences until the counter at 96(ctx) goes negative, using
      // BEXTR/BZHI/SHRX for the bit-field work. Each pass reads offset, match
      // length and literal length from the bit stream into the 24-byte record
      // at the cursor loaded from 104(ctx) (literal length at +0, match length
      // at +8, offset at +16), advances the three decoder states, resolves the
      // offset against the three history values kept at 144/152/160(s), and
      // validates the result.
      //
      // Returns 0 on success, 1 if a non-zero match length comes with a zero
      // offset, 2 if the match length exceeds 0x20002, 4 if the literal budget
      // at 128(ctx) goes negative, 6 if no input is left and more than 64 bits
      // have been consumed. On an error return the bit reader fields and the
      // offset history are not written back.
      //
      // Register roles: AX = bit buffer, DX = bits already consumed from AX,
      // BX = input bytes left, (SP) = saved read pointer, SI/DI/R8 =
      // literal-length / match-length / offset state entries, R9 = output
      // cursor, R10/R11/R12 = offset history (newest first).
      //
      // NOTE(review): the field names behind the numeric struct offsets are not
      // visible in this file; confirm them against the Go struct layouts.
   634TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
   635	MOVQ    br+8(FP), BX
   636	MOVQ    24(BX), AX // AX = bit buffer
   637	MOVBQZX 32(BX), DX // DX = bits already consumed from AX
   638	MOVQ    (BX), CX
   639	MOVQ    8(BX), BX // BX = input bytes not yet loaded
   640	ADDQ    BX, CX
   641	MOVQ    CX, (SP) // read pointer = base + remaining; it only ever moves downward
   642	MOVQ    ctx+16(FP), CX
   643	MOVQ    72(CX), SI // literal-length state entry
   644	MOVQ    80(CX), DI // match-length state entry
   645	MOVQ    88(CX), R8 // offset state entry
   646	MOVQ    104(CX), R9 // cursor into the output records
   647	MOVQ    s+0(FP), CX
   648	MOVQ    144(CX), R10 // offset history, newest first
   649	MOVQ    152(CX), R11
   650	MOVQ    160(CX), R12
   651
   652sequenceDecs_decode_bmi2_main_loop:
   653	MOVQ (SP), R13 // R13 = read pointer for this pass
   654
   655	// Fill bitreader to have enough for the offset and match length.
   656	CMPQ BX, $0x08
   657	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte
   658	MOVQ DX, CX
   659	SHRQ $0x03, CX // whole bytes consumed so far
   660	SUBQ CX, R13
   661	MOVQ (R13), AX // reload 8 bytes from the moved-back pointer
   662	SUBQ CX, BX
   663	ANDQ $0x07, DX // only the sub-byte bit position stays consumed
   664	JMP  sequenceDecs_decode_bmi2_fill_end
   665
   666sequenceDecs_decode_bmi2_fill_byte_by_byte:
   667	CMPQ    BX, $0x00
   668	JLE     sequenceDecs_decode_bmi2_fill_check_overread
   669	CMPQ    DX, $0x07
   670	JLE     sequenceDecs_decode_bmi2_fill_end // less than one whole byte consumed: done
   671	SHLQ    $0x08, AX
   672	SUBQ    $0x01, R13
   673	SUBQ    $0x01, BX
   674	SUBQ    $0x08, DX
   675	MOVBQZX (R13), CX
   676	ORQ     CX, AX // next lower-address byte enters at the bottom of the buffer
   677	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte
   678
   679sequenceDecs_decode_bmi2_fill_check_overread:
   680	CMPQ DX, $0x40
   681	JA   error_overread // no input left and more than 64 bits consumed
   682
   683sequenceDecs_decode_bmi2_fill_end:
   684	// Update offset
   685	MOVQ   $0x00000808, CX // BEXTR control word: start bit 8, length 8
   686	BEXTRQ CX, R8, R14 // R14 = byte 1 of the state entry: number of extra bits n
   687	MOVQ   AX, R15
   688	LEAQ   (DX)(R14*1), CX // consumed bits + n
   689	ROLQ   CL, R15 // rotate so the n wanted bits end up lowest
   690	BZHIQ  R14, R15, R15 // clear everything from bit n upward (yields 0 when n == 0)
   691	MOVQ   CX, DX // advance the consumed-bit count
   692	MOVQ   R8, CX
   693	SHRQ   $0x20, CX // upper 32 bits of the state entry: base value
   694	ADDQ   R15, CX
   695	MOVQ   CX, 16(R9)
   696
   697	// Update match length
   698	MOVQ   $0x00000808, CX
   699	BEXTRQ CX, DI, R14
   700	MOVQ   AX, R15
   701	LEAQ   (DX)(R14*1), CX
   702	ROLQ   CL, R15
   703	BZHIQ  R14, R15, R15
   704	MOVQ   CX, DX
   705	MOVQ   DI, CX
   706	SHRQ   $0x20, CX
   707	ADDQ   R15, CX
   708	MOVQ   CX, 8(R9)
   709
   710	// Fill bitreader to have enough for the remaining
   711	CMPQ BX, $0x08
   712	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
   713	MOVQ DX, CX
   714	SHRQ $0x03, CX
   715	SUBQ CX, R13
   716	MOVQ (R13), AX
   717	SUBQ CX, BX
   718	ANDQ $0x07, DX
   719	JMP  sequenceDecs_decode_bmi2_fill_2_end
   720
   721sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
   722	CMPQ    BX, $0x00
   723	JLE     sequenceDecs_decode_bmi2_fill_2_check_overread
   724	CMPQ    DX, $0x07
   725	JLE     sequenceDecs_decode_bmi2_fill_2_end
   726	SHLQ    $0x08, AX
   727	SUBQ    $0x01, R13
   728	SUBQ    $0x01, BX
   729	SUBQ    $0x08, DX
   730	MOVBQZX (R13), CX
   731	ORQ     CX, AX
   732	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte
   733
   734sequenceDecs_decode_bmi2_fill_2_check_overread:
   735	CMPQ DX, $0x40
   736	JA   error_overread
   737
   738sequenceDecs_decode_bmi2_fill_2_end:
   739	// Update literal length
   740	MOVQ   $0x00000808, CX
   741	BEXTRQ CX, SI, R14
   742	MOVQ   AX, R15
   743	LEAQ   (DX)(R14*1), CX
   744	ROLQ   CL, R15
   745	BZHIQ  R14, R15, R15
   746	MOVQ   CX, DX
   747	MOVQ   SI, CX
   748	SHRQ   $0x20, CX
   749	ADDQ   R15, CX
   750	MOVQ   CX, (R9)
   751
   752	// Fill bitreader for state updates
   753	MOVQ    R13, (SP) // save the read pointer; R13 is reused below
   754	MOVQ    $0x00000808, CX
   755	BEXTRQ  CX, R8, R13 // R13 = offset extra-bit count, taken before R8 is replaced below
   756	MOVQ    ctx+16(FP), CX
   757	CMPQ    96(CX), $0x00
   758	JZ      sequenceDecs_decode_bmi2_skip_update // counter is zero: no state update on this pass
   759	LEAQ    (SI)(DI*1), R14
   760	ADDQ    R8, R14
   761	MOVBQZX R14, R14 // low byte of the sum = byte 0 of the three entries added up (mod 256)
   762	LEAQ    (DX)(R14*1), CX
   763	MOVQ    AX, R15
   764	MOVQ    CX, DX // advance the consumed-bit count by the combined total
   765	ROLQ    CL, R15
   766	BZHIQ   R14, R15, R15 // R15 = the state bits for all three states, read in one go
   767
   768	// Update Offset State
   769	BZHIQ R8, R15, CX // lowest bits go to the offset state; the count is byte 0 of R8
   770	SHRXQ R8, R15, R15 // drop them, leaving the bits for the other two states
   771	SHRL  $0x10, R8
   772	ADDQ  CX, R8 // table index = bits 16..31 of the entry + bits read
   773
   774	// Load ctx.ofTable
   775	MOVQ ctx+16(FP), CX
   776	MOVQ 48(CX), CX
   777	MOVQ (CX)(R8*8), R8 // next state entry, 8 bytes per entry
   778
   779	// Update Match Length State
   780	BZHIQ DI, R15, CX
   781	SHRXQ DI, R15, R15
   782	SHRL  $0x10, DI
   783	ADDQ  CX, DI
   784
   785	// Load ctx.mlTable
   786	MOVQ ctx+16(FP), CX
   787	MOVQ 24(CX), CX
   788	MOVQ (CX)(DI*8), DI
   789
   790	// Update Literal Length State
   791	BZHIQ SI, R15, CX
   792	SHRL  $0x10, SI
   793	ADDQ  CX, SI
   794
   795	// Load ctx.llTable
   796	MOVQ ctx+16(FP), CX
   797	MOVQ (CX), CX
   798	MOVQ (CX)(SI*8), SI
   799
   800sequenceDecs_decode_bmi2_skip_update:
   801	// Adjust offset
   802	MOVQ 16(R9), CX // CX = offset value decoded above
   803	CMPQ R13, $0x01
   804	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 // at most one extra bit: value selects a history slot
   805	MOVQ R11, R12
   806	MOVQ R10, R11
   807	MOVQ CX, R10 // otherwise a new offset: push it onto the history
   808	JMP  sequenceDecs_decode_bmi2_after_adjust
   809
   810sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
   811	CMPQ (R9), $0x00000000
   812	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
   813	INCQ CX // literal length is zero: the selector is shifted up by one
   814	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero
   815
   816sequenceDecs_decode_bmi2_adjust_offset_maybezero:
   817	TESTQ CX, CX
   818	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
   819	MOVQ  R10, CX // selector 0 with literals: reuse the newest offset, history unchanged
   820	JMP   sequenceDecs_decode_bmi2_after_adjust
   821
   822sequenceDecs_decode_bmi2_adjust_offset_nonzero:
   823	CMPQ CX, $0x01
   824	JB   sequenceDecs_decode_bmi2_adjust_zero
   825	JEQ  sequenceDecs_decode_bmi2_adjust_one
   826	CMPQ CX, $0x02
   827	JA   sequenceDecs_decode_bmi2_adjust_three
   828	JMP  sequenceDecs_decode_bmi2_adjust_two
   829
   830sequenceDecs_decode_bmi2_adjust_zero:
   831	MOVQ R10, R13 // NOTE(review): looks unreachable unless the INCQ above wraps to 0
   832	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
   833
   834sequenceDecs_decode_bmi2_adjust_one:
   835	MOVQ R11, R13
   836	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
   837
   838sequenceDecs_decode_bmi2_adjust_two:
   839	MOVQ R12, R13
   840	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
   841
   842sequenceDecs_decode_bmi2_adjust_three:
   843	LEAQ -1(R10), R13 // selector above 2: newest offset minus one
   844
   845sequenceDecs_decode_bmi2_adjust_test_temp_valid:
   846	TESTQ R13, R13
   847	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
   848	MOVQ  $0x00000001, R13 // a chosen offset of 0 is replaced by 1
   849
   850sequenceDecs_decode_bmi2_adjust_temp_valid:
   851	CMPQ    CX, $0x01
   852	CMOVQNE R11, R12 // selector != 1: the second slot moves down to the third
   853	MOVQ    R10, R11
   854	MOVQ    R13, R10 // the chosen offset becomes the newest
   855	MOVQ    R13, CX
   856
   857sequenceDecs_decode_bmi2_after_adjust:
   858	MOVQ CX, 16(R9)
   859
   860	// Check values
   861	MOVQ  8(R9), R13 // R13 = match length
   862	MOVQ  (R9), R14 // R14 = literal length
   863	LEAQ  (R13)(R14*1), R15
   864	MOVQ  s+0(FP), BP
   865	ADDQ  R15, 256(BP) // running total of match + literal lengths
   866	MOVQ  ctx+16(FP), R15
   867	SUBQ  R14, 128(R15) // charge the literals against the remaining budget
   868	JS    error_not_enough_literals
   869	CMPQ  R13, $0x00020002
   870	JA    sequenceDecs_decode_bmi2_error_match_len_too_big
   871	TESTQ CX, CX
   872	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
   873	TESTQ R13, R13
   874	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch // zero offset with a non-zero match length
   875
   876sequenceDecs_decode_bmi2_match_len_ofs_ok:
   877	ADDQ $0x18, R9 // step to the next 24-byte record
   878	MOVQ ctx+16(FP), CX
   879	DECQ 96(CX)
   880	JNS  sequenceDecs_decode_bmi2_main_loop // keep going while the counter is still >= 0
   881	MOVQ s+0(FP), CX
   882	MOVQ R10, 144(CX) // write back the offset history
   883	MOVQ R11, 152(CX)
   884	MOVQ R12, 160(CX)
   885	MOVQ br+8(FP), CX
   886	MOVQ AX, 24(CX) // write back the bit reader: buffer, consumed bits, bytes left
   887	MOVB DL, 32(CX)
   888	MOVQ BX, 8(CX)
   889
   890	// Return success
   891	MOVQ $0x00000000, ret+24(FP)
   892	RET
   893
   894	// Return with match length error
   895sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
   896	MOVQ $0x00000001, ret+24(FP)
   897	RET
   898
   899	// Return with match too long error
   900sequenceDecs_decode_bmi2_error_match_len_too_big:
   901	MOVQ $0x00000002, ret+24(FP)
   902	RET
   903
   904	// Return with match offset too long error (no label here: nothing in this function branches to it)
   905	MOVQ $0x00000003, ret+24(FP)
   906	RET
   907
   908	// Return with not enough literals error
   909error_not_enough_literals:
   910	MOVQ $0x00000004, ret+24(FP)
   911	RET
   912
   913	// Return with overread error
   914error_overread:
   915	MOVQ $0x00000006, ret+24(FP)
   916	RET
   917
   918// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
   919// Requires: BMI, BMI2, CMOV
      //
      // Decodes sequences until the counter at 96(ctx) goes negative, using
      // BEXTR/BZHI/SHRX for the bit-field work. Each pass reads offset, match
      // length and literal length from the bit stream into the 24-byte record
      // at the cursor loaded from 104(ctx) (literal length at +0, match length
      // at +8, offset at +16), advances the three decoder states, resolves the
      // offset against the three history values kept at 144/152/160(s), and
      // validates the result.
      //
      // The bit buffer is refilled once per pass, before the offset is read;
      // there is no second refill before the literal length.
      // NOTE(review): presumably the caller only selects this variant when all
      // three values fit in the bits a single refill provides - confirm.
      //
      // Returns 0 on success, 1 if a non-zero match length comes with a zero
      // offset, 2 if the match length exceeds 0x20002, 4 if the literal budget
      // at 128(ctx) goes negative, 6 if no input is left and more than 64 bits
      // have been consumed. On an error return the bit reader fields and the
      // offset history are not written back.
      //
      // Register roles: AX = bit buffer, DX = bits already consumed from AX,
      // BX = input bytes left, (SP) = saved read pointer, SI/DI/R8 =
      // literal-length / match-length / offset state entries, R9 = output
      // cursor, R10/R11/R12 = offset history (newest first).
   920TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
   921	MOVQ    br+8(FP), BX
   922	MOVQ    24(BX), AX // AX = bit buffer
   923	MOVBQZX 32(BX), DX // DX = bits already consumed from AX
   924	MOVQ    (BX), CX
   925	MOVQ    8(BX), BX // BX = input bytes not yet loaded
   926	ADDQ    BX, CX
   927	MOVQ    CX, (SP) // read pointer = base + remaining; it only ever moves downward
   928	MOVQ    ctx+16(FP), CX
   929	MOVQ    72(CX), SI // literal-length state entry
   930	MOVQ    80(CX), DI // match-length state entry
   931	MOVQ    88(CX), R8 // offset state entry
   932	MOVQ    104(CX), R9 // cursor into the output records
   933	MOVQ    s+0(FP), CX
   934	MOVQ    144(CX), R10 // offset history, newest first
   935	MOVQ    152(CX), R11
   936	MOVQ    160(CX), R12
   937
   938sequenceDecs_decode_56_bmi2_main_loop:
   939	MOVQ (SP), R13 // R13 = read pointer for this pass
   940
   941	// Fill bitreader to have enough for the offset and match length.
   942	CMPQ BX, $0x08
   943	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte
   944	MOVQ DX, CX
   945	SHRQ $0x03, CX // whole bytes consumed so far
   946	SUBQ CX, R13
   947	MOVQ (R13), AX // reload 8 bytes from the moved-back pointer
   948	SUBQ CX, BX
   949	ANDQ $0x07, DX // only the sub-byte bit position stays consumed
   950	JMP  sequenceDecs_decode_56_bmi2_fill_end
   951
   952sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
   953	CMPQ    BX, $0x00
   954	JLE     sequenceDecs_decode_56_bmi2_fill_check_overread
   955	CMPQ    DX, $0x07
   956	JLE     sequenceDecs_decode_56_bmi2_fill_end // less than one whole byte consumed: done
   957	SHLQ    $0x08, AX
   958	SUBQ    $0x01, R13
   959	SUBQ    $0x01, BX
   960	SUBQ    $0x08, DX
   961	MOVBQZX (R13), CX
   962	ORQ     CX, AX // next lower-address byte enters at the bottom of the buffer
   963	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte
   964
   965sequenceDecs_decode_56_bmi2_fill_check_overread:
   966	CMPQ DX, $0x40
   967	JA   error_overread // no input left and more than 64 bits consumed
   968
   969sequenceDecs_decode_56_bmi2_fill_end:
   970	// Update offset
   971	MOVQ   $0x00000808, CX // BEXTR control word: start bit 8, length 8
   972	BEXTRQ CX, R8, R14 // R14 = byte 1 of the state entry: number of extra bits n
   973	MOVQ   AX, R15
   974	LEAQ   (DX)(R14*1), CX // consumed bits + n
   975	ROLQ   CL, R15 // rotate so the n wanted bits end up lowest
   976	BZHIQ  R14, R15, R15 // clear everything from bit n upward (yields 0 when n == 0)
   977	MOVQ   CX, DX // advance the consumed-bit count
   978	MOVQ   R8, CX
   979	SHRQ   $0x20, CX // upper 32 bits of the state entry: base value
   980	ADDQ   R15, CX
   981	MOVQ   CX, 16(R9)
   982
   983	// Update match length
   984	MOVQ   $0x00000808, CX
   985	BEXTRQ CX, DI, R14
   986	MOVQ   AX, R15
   987	LEAQ   (DX)(R14*1), CX
   988	ROLQ   CL, R15
   989	BZHIQ  R14, R15, R15
   990	MOVQ   CX, DX
   991	MOVQ   DI, CX
   992	SHRQ   $0x20, CX
   993	ADDQ   R15, CX
   994	MOVQ   CX, 8(R9)
   995
   996	// Update literal length
   997	MOVQ   $0x00000808, CX
   998	BEXTRQ CX, SI, R14
   999	MOVQ   AX, R15
  1000	LEAQ   (DX)(R14*1), CX
  1001	ROLQ   CL, R15
  1002	BZHIQ  R14, R15, R15
  1003	MOVQ   CX, DX
  1004	MOVQ   SI, CX
  1005	SHRQ   $0x20, CX
  1006	ADDQ   R15, CX
  1007	MOVQ   CX, (R9)
  1008
  1009	// Fill bitreader for state updates
  1010	MOVQ    R13, (SP) // save the read pointer; R13 is reused below
  1011	MOVQ    $0x00000808, CX
  1012	BEXTRQ  CX, R8, R13 // R13 = offset extra-bit count, taken before R8 is replaced below
  1013	MOVQ    ctx+16(FP), CX
  1014	CMPQ    96(CX), $0x00
  1015	JZ      sequenceDecs_decode_56_bmi2_skip_update // counter is zero: no state update on this pass
  1016	LEAQ    (SI)(DI*1), R14
  1017	ADDQ    R8, R14
  1018	MOVBQZX R14, R14 // low byte of the sum = byte 0 of the three entries added up (mod 256)
  1019	LEAQ    (DX)(R14*1), CX
  1020	MOVQ    AX, R15
  1021	MOVQ    CX, DX // advance the consumed-bit count by the combined total
  1022	ROLQ    CL, R15
  1023	BZHIQ   R14, R15, R15 // R15 = the state bits for all three states, read in one go
  1024
  1025	// Update Offset State
  1026	BZHIQ R8, R15, CX // lowest bits go to the offset state; the count is byte 0 of R8
  1027	SHRXQ R8, R15, R15 // drop them, leaving the bits for the other two states
  1028	SHRL  $0x10, R8
  1029	ADDQ  CX, R8 // table index = bits 16..31 of the entry + bits read
  1030
  1031	// Load ctx.ofTable
  1032	MOVQ ctx+16(FP), CX
  1033	MOVQ 48(CX), CX
  1034	MOVQ (CX)(R8*8), R8 // next state entry, 8 bytes per entry
  1035
  1036	// Update Match Length State
  1037	BZHIQ DI, R15, CX
  1038	SHRXQ DI, R15, R15
  1039	SHRL  $0x10, DI
  1040	ADDQ  CX, DI
  1041
  1042	// Load ctx.mlTable
  1043	MOVQ ctx+16(FP), CX
  1044	MOVQ 24(CX), CX
  1045	MOVQ (CX)(DI*8), DI
  1046
  1047	// Update Literal Length State
  1048	BZHIQ SI, R15, CX
  1049	SHRL  $0x10, SI
  1050	ADDQ  CX, SI
  1051
  1052	// Load ctx.llTable
  1053	MOVQ ctx+16(FP), CX
  1054	MOVQ (CX), CX
  1055	MOVQ (CX)(SI*8), SI
  1056
  1057sequenceDecs_decode_56_bmi2_skip_update:
  1058	// Adjust offset
  1059	MOVQ 16(R9), CX // CX = offset value decoded above
  1060	CMPQ R13, $0x01
  1061	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 // at most one extra bit: value selects a history slot
  1062	MOVQ R11, R12
  1063	MOVQ R10, R11
  1064	MOVQ CX, R10 // otherwise a new offset: push it onto the history
  1065	JMP  sequenceDecs_decode_56_bmi2_after_adjust
  1066
  1067sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
  1068	CMPQ (R9), $0x00000000
  1069	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
  1070	INCQ CX // literal length is zero: the selector is shifted up by one
  1071	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  1072
  1073sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
  1074	TESTQ CX, CX
  1075	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  1076	MOVQ  R10, CX // selector 0 with literals: reuse the newest offset, history unchanged
  1077	JMP   sequenceDecs_decode_56_bmi2_after_adjust
  1078
  1079sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
  1080	CMPQ CX, $0x01
  1081	JB   sequenceDecs_decode_56_bmi2_adjust_zero
  1082	JEQ  sequenceDecs_decode_56_bmi2_adjust_one
  1083	CMPQ CX, $0x02
  1084	JA   sequenceDecs_decode_56_bmi2_adjust_three
  1085	JMP  sequenceDecs_decode_56_bmi2_adjust_two
  1086
  1087sequenceDecs_decode_56_bmi2_adjust_zero:
  1088	MOVQ R10, R13 // NOTE(review): looks unreachable unless the INCQ above wraps to 0
  1089	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  1090
  1091sequenceDecs_decode_56_bmi2_adjust_one:
  1092	MOVQ R11, R13
  1093	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  1094
  1095sequenceDecs_decode_56_bmi2_adjust_two:
  1096	MOVQ R12, R13
  1097	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  1098
  1099sequenceDecs_decode_56_bmi2_adjust_three:
  1100	LEAQ -1(R10), R13 // selector above 2: newest offset minus one
  1101
  1102sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
  1103	TESTQ R13, R13
  1104	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
  1105	MOVQ  $0x00000001, R13 // a chosen offset of 0 is replaced by 1
  1106
  1107sequenceDecs_decode_56_bmi2_adjust_temp_valid:
  1108	CMPQ    CX, $0x01
  1109	CMOVQNE R11, R12 // selector != 1: the second slot moves down to the third
  1110	MOVQ    R10, R11
  1111	MOVQ    R13, R10 // the chosen offset becomes the newest
  1112	MOVQ    R13, CX
  1113
  1114sequenceDecs_decode_56_bmi2_after_adjust:
  1115	MOVQ CX, 16(R9)
  1116
  1117	// Check values
  1118	MOVQ  8(R9), R13 // R13 = match length
  1119	MOVQ  (R9), R14 // R14 = literal length
  1120	LEAQ  (R13)(R14*1), R15
  1121	MOVQ  s+0(FP), BP
  1122	ADDQ  R15, 256(BP) // running total of match + literal lengths
  1123	MOVQ  ctx+16(FP), R15
  1124	SUBQ  R14, 128(R15) // charge the literals against the remaining budget
  1125	JS    error_not_enough_literals
  1126	CMPQ  R13, $0x00020002
  1127	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big
  1128	TESTQ CX, CX
  1129	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
  1130	TESTQ R13, R13
  1131	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch // zero offset with a non-zero match length
  1132
  1133sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
  1134	ADDQ $0x18, R9 // step to the next 24-byte record
  1135	MOVQ ctx+16(FP), CX
  1136	DECQ 96(CX)
  1137	JNS  sequenceDecs_decode_56_bmi2_main_loop // keep going while the counter is still >= 0
  1138	MOVQ s+0(FP), CX
  1139	MOVQ R10, 144(CX) // write back the offset history
  1140	MOVQ R11, 152(CX)
  1141	MOVQ R12, 160(CX)
  1142	MOVQ br+8(FP), CX
  1143	MOVQ AX, 24(CX) // write back the bit reader: buffer, consumed bits, bytes left
  1144	MOVB DL, 32(CX)
  1145	MOVQ BX, 8(CX)
  1146
  1147	// Return success
  1148	MOVQ $0x00000000, ret+24(FP)
  1149	RET
  1150
  1151	// Return with match length error
  1152sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
  1153	MOVQ $0x00000001, ret+24(FP)
  1154	RET
  1155
  1156	// Return with match too long error
  1157sequenceDecs_decode_56_bmi2_error_match_len_too_big:
  1158	MOVQ $0x00000002, ret+24(FP)
  1159	RET
  1160
  1161	// Return with match offset too long error (no label here: nothing in this function branches to it)
  1162	MOVQ $0x00000003, ret+24(FP)
  1163	RET
  1164
  1165	// Return with not enough literals error
  1166error_not_enough_literals:
  1167	MOVQ $0x00000004, ret+24(FP)
  1168	RET
  1169
  1170	// Return with overread error
  1171error_overread:
  1172	MOVQ $0x00000006, ret+24(FP)
  1173	RET
  1174
  1175// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
  1176// Requires: SSE
      //
      // Walks the 24-byte sequence records starting at seqsBase + 24*seqIndex and,
      // for each record, copies its literals and then its match to the output.
      //
      // Result (ret+8(FP)):
      //   1 when all sequences were executed, or when the count at 8(ctx) is zero;
      //   0 when a match offset fails the check at check_offset.
      // Except on the empty_seqs path, the context is updated before returning:
      //   24(ctx) = seqIndex, 104(ctx) = outPosition, 112(ctx) = SI - 80(ctx).
      //
      // Register roles, as established by the loads and comments below:
      //   AX  = pointer to the current sequence record
      //   CX  = sequence count (the loop runs while DX < CX)
      //   DX  = seqIndex
      //   BX  = output write pointer (outBase + outPosition)
      //   SI  = literals read pointer
      //   DI  = outPosition
      //   R8  = 120(ctx), the window size limit used in check_offset
      //   R9  = 56(ctx) + 64(ctx); history copies read from R9 - n
      //         (presumably the end of the history buffer - confirm)
      //   R10 = 64(ctx), used as len(hist) in check_offset
      //   R11 / R12 / R13 = record fields at +0 / +16 / +8:
      //         literal length / match offset / match length
      //
      // NOTE(review): copy_1 and copy_2 always move whole 16-byte chunks, so they
      // can read and write up to 15 bytes beyond the requested length. Presumably
      // the caller guarantees that much slack in both buffers - confirm.
  1177TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
  1178	MOVQ  ctx+0(FP), R10
  1179	MOVQ  8(R10), CX
  1180	TESTQ CX, CX
  1181	JZ    empty_seqs
  1182	MOVQ  (R10), AX
  1183	MOVQ  24(R10), DX
  1184	MOVQ  32(R10), BX
  1185	MOVQ  80(R10), SI
  1186	MOVQ  104(R10), DI
  1187	MOVQ  120(R10), R8
  1188	MOVQ  56(R10), R9
  1189	MOVQ  64(R10), R10
  1190	ADDQ  R10, R9
  1191
  1192	// seqsBase += 24 * seqIndex
  1193	LEAQ (DX)(DX*2), R11
  1194	SHLQ $0x03, R11
  1195	ADDQ R11, AX
  1196
  1197	// outBase += outPosition
  1198	ADDQ DI, BX
  1199
  1200main_loop:
	// Load the current record: R11 = literal length, R12 = match offset,
	// R13 = match length.
  1201	MOVQ (AX), R11
  1202	MOVQ 16(AX), R12
  1203	MOVQ 8(AX), R13
  1204
  1205	// Copy literals
  1206	TESTQ R11, R11
  1207	JZ    check_offset
  1208	XORQ  R14, R14
  1209
  1210copy_1:
	// 16 bytes per step until at least R11 bytes were moved (may overshoot).
  1211	MOVUPS (SI)(R14*1), X0
  1212	MOVUPS X0, (BX)(R14*1)
  1213	ADDQ   $0x10, R14
  1214	CMPQ   R14, R11
  1215	JB     copy_1
	// Advance the pointers by the exact literal length, not by the bytes moved.
  1216	ADDQ   R11, SI
  1217	ADDQ   R11, BX
  1218	ADDQ   R11, DI
  1219
  1220	// Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
  1221check_offset:
  1222	LEAQ (DI)(R10*1), R11
  1223	CMPQ R12, R11
  1224	JG   error_match_off_too_big
  1225	CMPQ R12, R8
  1226	JG   error_match_off_too_big
  1227
  1228	// Copy match from history
	// R11 = offset - outPosition. If that is <= 0 the match starts inside the
	// output written so far and no history bytes are needed.
  1229	MOVQ R12, R11
  1230	SUBQ DI, R11
  1231	JLS  copy_match
	// R14 = R9 - R11: source address of the match inside the history.
  1232	MOVQ R9, R14
  1233	SUBQ R11, R14
	// Match longer than the R11 bytes available from R14: copy those R11 bytes
	// first, the remainder is taken from the current buffer (copy_match).
  1234	CMPQ R13, R11
  1235	JG   copy_all_from_history
	// Whole match comes from history. R11 = length - 16; a borrow means the
	// length is below 16 and the small-size paths are used.
  1236	MOVQ R13, R11
  1237	SUBQ $0x10, R11
  1238	JB   copy_4_small
  1239
  1240copy_4_loop:
  1241	MOVUPS (R14), X0
  1242	MOVUPS X0, (BX)
  1243	ADDQ   $0x10, R14
  1244	ADDQ   $0x10, BX
  1245	SUBQ   $0x10, R11
  1246	JAE    copy_4_loop
	// R11 is now in [-16, -1]: move both pointers to the exact end of the
	// copy and redo the final 16 bytes (overlapping the previous store).
  1247	LEAQ   16(R14)(R11*1), R14
  1248	LEAQ   16(BX)(R11*1), BX
  1249	MOVUPS -16(R14), X0
  1250	MOVUPS X0, -16(BX)
  1251	JMP    copy_4_end
  1252
  1253copy_4_small:
	// NOTE(review): there is no case for lengths 1 or 2 here; a length below
	// 3 would take the 4..7 path. Presumably match lengths are always >= 3
	// at this point - confirm.
  1254	CMPQ R13, $0x03
  1255	JE   copy_4_move_3
  1256	CMPQ R13, $0x08
  1257	JB   copy_4_move_4through7
  1258	JMP  copy_4_move_8through16
  1259
  1260copy_4_move_3:
  1261	MOVW (R14), R11
  1262	MOVB 2(R14), R12
  1263	MOVW R11, (BX)
  1264	MOVB R12, 2(BX)
  1265	ADDQ R13, R14
  1266	ADDQ R13, BX
  1267	JMP  copy_4_end
  1268
  1269copy_4_move_4through7:
	// Two possibly overlapping 4-byte moves: the first and the last 4 bytes.
  1270	MOVL (R14), R11
  1271	MOVL -4(R14)(R13*1), R12
  1272	MOVL R11, (BX)
  1273	MOVL R12, -4(BX)(R13*1)
  1274	ADDQ R13, R14
  1275	ADDQ R13, BX
  1276	JMP  copy_4_end
  1277
  1278copy_4_move_8through16:
	// Two possibly overlapping 8-byte moves: the first and the last 8 bytes.
  1279	MOVQ (R14), R11
  1280	MOVQ -8(R14)(R13*1), R12
  1281	MOVQ R11, (BX)
  1282	MOVQ R12, -8(BX)(R13*1)
  1283	ADDQ R13, R14
  1284	ADDQ R13, BX
  1285
  1286copy_4_end:
	// Sequence done: advance outPosition, step to the next record and loop.
  1287	ADDQ R13, DI
  1288	ADDQ $0x18, AX
  1289	INCQ DX
  1290	CMPQ DX, CX
  1291	JB   main_loop
  1292	JMP  loop_finished
  1293
  1294copy_all_from_history:
	// Copy exactly R11 bytes from history (same scheme as copy_4, with R15 as
	// the running counter), then continue with the rest of the match.
  1295	MOVQ R11, R15
  1296	SUBQ $0x10, R15
  1297	JB   copy_5_small
  1298
  1299copy_5_loop:
  1300	MOVUPS (R14), X0
  1301	MOVUPS X0, (BX)
  1302	ADDQ   $0x10, R14
  1303	ADDQ   $0x10, BX
  1304	SUBQ   $0x10, R15
  1305	JAE    copy_5_loop
  1306	LEAQ   16(R14)(R15*1), R14
  1307	LEAQ   16(BX)(R15*1), BX
  1308	MOVUPS -16(R14), X0
  1309	MOVUPS X0, -16(BX)
  1310	JMP    copy_5_end
  1311
  1312copy_5_small:
  1313	CMPQ R11, $0x03
  1314	JE   copy_5_move_3
  1315	JB   copy_5_move_1or2
  1316	CMPQ R11, $0x08
  1317	JB   copy_5_move_4through7
  1318	JMP  copy_5_move_8through16
  1319
  1320copy_5_move_1or2:
	// First and last byte; for a length of 1 both moves hit the same byte.
  1321	MOVB (R14), R15
  1322	MOVB -1(R14)(R11*1), BP
  1323	MOVB R15, (BX)
  1324	MOVB BP, -1(BX)(R11*1)
  1325	ADDQ R11, R14
  1326	ADDQ R11, BX
  1327	JMP  copy_5_end
  1328
  1329copy_5_move_3:
  1330	MOVW (R14), R15
  1331	MOVB 2(R14), BP
  1332	MOVW R15, (BX)
  1333	MOVB BP, 2(BX)
  1334	ADDQ R11, R14
  1335	ADDQ R11, BX
  1336	JMP  copy_5_end
  1337
  1338copy_5_move_4through7:
  1339	MOVL (R14), R15
  1340	MOVL -4(R14)(R11*1), BP
  1341	MOVL R15, (BX)
  1342	MOVL BP, -4(BX)(R11*1)
  1343	ADDQ R11, R14
  1344	ADDQ R11, BX
  1345	JMP  copy_5_end
  1346
  1347copy_5_move_8through16:
  1348	MOVQ (R14), R15
  1349	MOVQ -8(R14)(R11*1), BP
  1350	MOVQ R15, (BX)
  1351	MOVQ BP, -8(BX)(R11*1)
  1352	ADDQ R11, R14
  1353	ADDQ R11, BX
  1354
  1355copy_5_end:
	// Account for the history part; R13 keeps the length still to be copied.
  1356	ADDQ R11, DI
  1357	SUBQ R11, R13
  1358
  1359	// Copy match from the current buffer
  1360copy_match:
	// R11 = source address = write pointer - offset.
  1361	MOVQ BX, R11
  1362	SUBQ R12, R11
  1363
  1364	// ml <= mo
  1365	CMPQ R13, R12
  1366	JA   copy_overlapping_match
  1367
  1368	// Copy non-overlapping match
  1369	ADDQ R13, DI
  1370	MOVQ BX, R12
  1371	ADDQ R13, BX
  1372
  1373copy_2:
	// 16 bytes per step through R12; BX already points past the match, so
	// any overshoot is overwritten by whatever is written next.
  1374	MOVUPS (R11), X0
  1375	MOVUPS X0, (R12)
  1376	ADDQ   $0x10, R11
  1377	ADDQ   $0x10, R12
  1378	SUBQ   $0x10, R13
  1379	JHI    copy_2
  1380	JMP    handle_loop
  1381
  1382	// Copy overlapping match
  1383copy_overlapping_match:
  1384	ADDQ R13, DI
  1385
  1386copy_slow_3:
	// Byte-by-byte so that bytes written earlier in this loop can be read
	// again as source (length > offset).
  1387	MOVB (R11), R12
  1388	MOVB R12, (BX)
  1389	INCQ R11
  1390	INCQ BX
  1391	DECQ R13
  1392	JNZ  copy_slow_3
  1393
  1394handle_loop:
  1395	ADDQ $0x18, AX
  1396	INCQ DX
  1397	CMPQ DX, CX
  1398	JB   main_loop
  1399
  1400loop_finished:
  1401	// Return value
  1402	MOVB $0x01, ret+8(FP)
  1403
  1404	// Update the context
  1405	MOVQ ctx+0(FP), AX
  1406	MOVQ DX, 24(AX)
  1407	MOVQ DI, 104(AX)
	// 112(ctx) = literals pointer minus its initial value from 80(ctx).
  1408	SUBQ 80(AX), SI
  1409	MOVQ SI, 112(AX)
  1410	RET
  1411
  1412error_match_off_too_big:
  1413	// Return value
  1414	MOVB $0x00, ret+8(FP)
  1415
  1416	// Update the context
  1417	MOVQ ctx+0(FP), AX
  1418	MOVQ DX, 24(AX)
  1419	MOVQ DI, 104(AX)
  1420	SUBQ 80(AX), SI
  1421	MOVQ SI, 112(AX)
  1422	RET
  1423
  1424empty_seqs:
  1425	// Return value
  1426	MOVB $0x01, ret+8(FP)
  1427	RET
  1428
  1429// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
  1430// Requires: SSE
      //
      // Walks the 24-byte sequence records starting at seqsBase + 24*seqIndex and,
      // for each record, copies its literals and then its match to the output.
      // Every bulk copy in this function is exact-length: 16-byte chunks followed
      // by an overlapping final chunk, or a dedicated path for lengths below 16,
      // so no byte beyond the requested length is read or written.
      //
      // Result (ret+8(FP)):
      //   1 when all sequences were executed, or when the count at 8(ctx) is zero;
      //   0 when a match offset fails the check at check_offset.
      // Except on the empty_seqs path, the context is updated before returning:
      //   24(ctx) = seqIndex, 104(ctx) = outPosition, 112(ctx) = SI - 80(ctx).
      //
      // Register roles, as established by the loads and comments below:
      //   AX  = pointer to the current sequence record
      //   CX  = sequence count (the loop runs while DX < CX)
      //   DX  = seqIndex
      //   BX  = output write pointer (outBase + outPosition)
      //   SI  = literals read pointer
      //   DI  = outPosition
      //   R8  = 120(ctx), the window size limit used in check_offset
      //   R9  = 56(ctx) + 64(ctx); history copies read from R9 - n
      //         (presumably the end of the history buffer - confirm)
      //   R10 = 64(ctx), used as len(hist) in check_offset
      //   R11 / R12 / R13 = record fields at +0 / +16 / +8:
      //         literal length / match offset / match length
  1431TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
  1432	MOVQ  ctx+0(FP), R10
  1433	MOVQ  8(R10), CX
  1434	TESTQ CX, CX
  1435	JZ    empty_seqs
  1436	MOVQ  (R10), AX
  1437	MOVQ  24(R10), DX
  1438	MOVQ  32(R10), BX
  1439	MOVQ  80(R10), SI
  1440	MOVQ  104(R10), DI
  1441	MOVQ  120(R10), R8
  1442	MOVQ  56(R10), R9
  1443	MOVQ  64(R10), R10
  1444	ADDQ  R10, R9
  1445
  1446	// seqsBase += 24 * seqIndex
  1447	LEAQ (DX)(DX*2), R11
  1448	SHLQ $0x03, R11
  1449	ADDQ R11, AX
  1450
  1451	// outBase += outPosition
  1452	ADDQ DI, BX
  1453
  1454main_loop:
	// Load the current record: R11 = literal length, R12 = match offset,
	// R13 = match length.
  1455	MOVQ (AX), R11
  1456	MOVQ 16(AX), R12
  1457	MOVQ 8(AX), R13
  1458
  1459	// Copy literals
  1460	TESTQ R11, R11
  1461	JZ    check_offset
	// R14 = length - 16; a borrow means fewer than 16 literal bytes.
  1462	MOVQ  R11, R14
  1463	SUBQ  $0x10, R14
  1464	JB    copy_1_small
  1465
  1466copy_1_loop:
  1467	MOVUPS (SI), X0
  1468	MOVUPS X0, (BX)
  1469	ADDQ   $0x10, SI
  1470	ADDQ   $0x10, BX
  1471	SUBQ   $0x10, R14
  1472	JAE    copy_1_loop
	// R14 is now in [-16, -1]: move both pointers to the exact end of the
	// copy and redo the final 16 bytes (overlapping the previous store).
  1473	LEAQ   16(SI)(R14*1), SI
  1474	LEAQ   16(BX)(R14*1), BX
  1475	MOVUPS -16(SI), X0
  1476	MOVUPS X0, -16(BX)
  1477	JMP    copy_1_end
  1478
  1479copy_1_small:
  1480	CMPQ R11, $0x03
  1481	JE   copy_1_move_3
  1482	JB   copy_1_move_1or2
  1483	CMPQ R11, $0x08
  1484	JB   copy_1_move_4through7
  1485	JMP  copy_1_move_8through16
  1486
  1487copy_1_move_1or2:
	// First and last byte; for a length of 1 both moves hit the same byte.
  1488	MOVB (SI), R14
  1489	MOVB -1(SI)(R11*1), R15
  1490	MOVB R14, (BX)
  1491	MOVB R15, -1(BX)(R11*1)
  1492	ADDQ R11, SI
  1493	ADDQ R11, BX
  1494	JMP  copy_1_end
  1495
  1496copy_1_move_3:
  1497	MOVW (SI), R14
  1498	MOVB 2(SI), R15
  1499	MOVW R14, (BX)
  1500	MOVB R15, 2(BX)
  1501	ADDQ R11, SI
  1502	ADDQ R11, BX
  1503	JMP  copy_1_end
  1504
  1505copy_1_move_4through7:
	// Two possibly overlapping 4-byte moves: the first and the last 4 bytes.
  1506	MOVL (SI), R14
  1507	MOVL -4(SI)(R11*1), R15
  1508	MOVL R14, (BX)
  1509	MOVL R15, -4(BX)(R11*1)
  1510	ADDQ R11, SI
  1511	ADDQ R11, BX
  1512	JMP  copy_1_end
  1513
  1514copy_1_move_8through16:
	// Two possibly overlapping 8-byte moves: the first and the last 8 bytes.
  1515	MOVQ (SI), R14
  1516	MOVQ -8(SI)(R11*1), R15
  1517	MOVQ R14, (BX)
  1518	MOVQ R15, -8(BX)(R11*1)
  1519	ADDQ R11, SI
  1520	ADDQ R11, BX
  1521
  1522copy_1_end:
  1523	ADDQ R11, DI
  1524
  1525	// Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
  1526check_offset:
  1527	LEAQ (DI)(R10*1), R11
  1528	CMPQ R12, R11
  1529	JG   error_match_off_too_big
  1530	CMPQ R12, R8
  1531	JG   error_match_off_too_big
  1532
  1533	// Copy match from history
	// R11 = offset - outPosition. If that is <= 0 the match starts inside the
	// output written so far and no history bytes are needed.
  1534	MOVQ R12, R11
  1535	SUBQ DI, R11
  1536	JLS  copy_match
	// R14 = R9 - R11: source address of the match inside the history.
  1537	MOVQ R9, R14
  1538	SUBQ R11, R14
	// Match longer than the R11 bytes available from R14: copy those R11 bytes
	// first, the remainder is taken from the current buffer (copy_match).
  1539	CMPQ R13, R11
  1540	JG   copy_all_from_history
  1541	MOVQ R13, R11
  1542	SUBQ $0x10, R11
  1543	JB   copy_4_small
  1544
  1545copy_4_loop:
  1546	MOVUPS (R14), X0
  1547	MOVUPS X0, (BX)
  1548	ADDQ   $0x10, R14
  1549	ADDQ   $0x10, BX
  1550	SUBQ   $0x10, R11
  1551	JAE    copy_4_loop
  1552	LEAQ   16(R14)(R11*1), R14
  1553	LEAQ   16(BX)(R11*1), BX
  1554	MOVUPS -16(R14), X0
  1555	MOVUPS X0, -16(BX)
  1556	JMP    copy_4_end
  1557
  1558copy_4_small:
	// NOTE(review): there is no case for lengths 1 or 2 here; a length below
	// 3 would take the 4..7 path. Presumably match lengths are always >= 3
	// at this point - confirm.
  1559	CMPQ R13, $0x03
  1560	JE   copy_4_move_3
  1561	CMPQ R13, $0x08
  1562	JB   copy_4_move_4through7
  1563	JMP  copy_4_move_8through16
  1564
  1565copy_4_move_3:
  1566	MOVW (R14), R11
  1567	MOVB 2(R14), R12
  1568	MOVW R11, (BX)
  1569	MOVB R12, 2(BX)
  1570	ADDQ R13, R14
  1571	ADDQ R13, BX
  1572	JMP  copy_4_end
  1573
  1574copy_4_move_4through7:
  1575	MOVL (R14), R11
  1576	MOVL -4(R14)(R13*1), R12
  1577	MOVL R11, (BX)
  1578	MOVL R12, -4(BX)(R13*1)
  1579	ADDQ R13, R14
  1580	ADDQ R13, BX
  1581	JMP  copy_4_end
  1582
  1583copy_4_move_8through16:
  1584	MOVQ (R14), R11
  1585	MOVQ -8(R14)(R13*1), R12
  1586	MOVQ R11, (BX)
  1587	MOVQ R12, -8(BX)(R13*1)
  1588	ADDQ R13, R14
  1589	ADDQ R13, BX
  1590
  1591copy_4_end:
	// Sequence done: advance outPosition, step to the next record and loop.
  1592	ADDQ R13, DI
  1593	ADDQ $0x18, AX
  1594	INCQ DX
  1595	CMPQ DX, CX
  1596	JB   main_loop
  1597	JMP  loop_finished
  1598
  1599copy_all_from_history:
	// Copy exactly R11 bytes from history (R15 is the running counter), then
	// continue with the rest of the match.
  1600	MOVQ R11, R15
  1601	SUBQ $0x10, R15
  1602	JB   copy_5_small
  1603
  1604copy_5_loop:
  1605	MOVUPS (R14), X0
  1606	MOVUPS X0, (BX)
  1607	ADDQ   $0x10, R14
  1608	ADDQ   $0x10, BX
  1609	SUBQ   $0x10, R15
  1610	JAE    copy_5_loop
  1611	LEAQ   16(R14)(R15*1), R14
  1612	LEAQ   16(BX)(R15*1), BX
  1613	MOVUPS -16(R14), X0
  1614	MOVUPS X0, -16(BX)
  1615	JMP    copy_5_end
  1616
  1617copy_5_small:
  1618	CMPQ R11, $0x03
  1619	JE   copy_5_move_3
  1620	JB   copy_5_move_1or2
  1621	CMPQ R11, $0x08
  1622	JB   copy_5_move_4through7
  1623	JMP  copy_5_move_8through16
  1624
  1625copy_5_move_1or2:
  1626	MOVB (R14), R15
  1627	MOVB -1(R14)(R11*1), BP
  1628	MOVB R15, (BX)
  1629	MOVB BP, -1(BX)(R11*1)
  1630	ADDQ R11, R14
  1631	ADDQ R11, BX
  1632	JMP  copy_5_end
  1633
  1634copy_5_move_3:
  1635	MOVW (R14), R15
  1636	MOVB 2(R14), BP
  1637	MOVW R15, (BX)
  1638	MOVB BP, 2(BX)
  1639	ADDQ R11, R14
  1640	ADDQ R11, BX
  1641	JMP  copy_5_end
  1642
  1643copy_5_move_4through7:
  1644	MOVL (R14), R15
  1645	MOVL -4(R14)(R11*1), BP
  1646	MOVL R15, (BX)
  1647	MOVL BP, -4(BX)(R11*1)
  1648	ADDQ R11, R14
  1649	ADDQ R11, BX
  1650	JMP  copy_5_end
  1651
  1652copy_5_move_8through16:
  1653	MOVQ (R14), R15
  1654	MOVQ -8(R14)(R11*1), BP
  1655	MOVQ R15, (BX)
  1656	MOVQ BP, -8(BX)(R11*1)
  1657	ADDQ R11, R14
  1658	ADDQ R11, BX
  1659
  1660copy_5_end:
	// Account for the history part; R13 keeps the length still to be copied.
  1661	ADDQ R11, DI
  1662	SUBQ R11, R13
  1663
  1664	// Copy match from the current buffer
  1665copy_match:
	// R11 = source address = write pointer - offset.
  1666	MOVQ BX, R11
  1667	SUBQ R12, R11
  1668
  1669	// ml <= mo
  1670	CMPQ R13, R12
  1671	JA   copy_overlapping_match
  1672
  1673	// Copy non-overlapping match
  1674	ADDQ R13, DI
	// R12 (the offset) is no longer needed and becomes the chunk counter.
  1675	MOVQ R13, R12
  1676	SUBQ $0x10, R12
  1677	JB   copy_2_small
  1678
  1679copy_2_loop:
  1680	MOVUPS (R11), X0
  1681	MOVUPS X0, (BX)
  1682	ADDQ   $0x10, R11
  1683	ADDQ   $0x10, BX
  1684	SUBQ   $0x10, R12
  1685	JAE    copy_2_loop
  1686	LEAQ   16(R11)(R12*1), R11
  1687	LEAQ   16(BX)(R12*1), BX
  1688	MOVUPS -16(R11), X0
  1689	MOVUPS X0, -16(BX)
  1690	JMP    copy_2_end
  1691
  1692copy_2_small:
  1693	CMPQ R13, $0x03
  1694	JE   copy_2_move_3
  1695	JB   copy_2_move_1or2
  1696	CMPQ R13, $0x08
  1697	JB   copy_2_move_4through7
  1698	JMP  copy_2_move_8through16
  1699
  1700copy_2_move_1or2:
  1701	MOVB (R11), R12
  1702	MOVB -1(R11)(R13*1), R14
  1703	MOVB R12, (BX)
  1704	MOVB R14, -1(BX)(R13*1)
  1705	ADDQ R13, R11
  1706	ADDQ R13, BX
  1707	JMP  copy_2_end
  1708
  1709copy_2_move_3:
  1710	MOVW (R11), R12
  1711	MOVB 2(R11), R14
  1712	MOVW R12, (BX)
  1713	MOVB R14, 2(BX)
  1714	ADDQ R13, R11
  1715	ADDQ R13, BX
  1716	JMP  copy_2_end
  1717
  1718copy_2_move_4through7:
  1719	MOVL (R11), R12
  1720	MOVL -4(R11)(R13*1), R14
  1721	MOVL R12, (BX)
  1722	MOVL R14, -4(BX)(R13*1)
  1723	ADDQ R13, R11
  1724	ADDQ R13, BX
  1725	JMP  copy_2_end
  1726
  1727copy_2_move_8through16:
  1728	MOVQ (R11), R12
  1729	MOVQ -8(R11)(R13*1), R14
  1730	MOVQ R12, (BX)
  1731	MOVQ R14, -8(BX)(R13*1)
  1732	ADDQ R13, R11
  1733	ADDQ R13, BX
  1734
  1735copy_2_end:
  1736	JMP handle_loop
  1737
  1738	// Copy overlapping match
  1739copy_overlapping_match:
  1740	ADDQ R13, DI
  1741
  1742copy_slow_3:
	// Byte-by-byte so that bytes written earlier in this loop can be read
	// again as source (length > offset).
  1743	MOVB (R11), R12
  1744	MOVB R12, (BX)
  1745	INCQ R11
  1746	INCQ BX
  1747	DECQ R13
  1748	JNZ  copy_slow_3
  1749
  1750handle_loop:
  1751	ADDQ $0x18, AX
  1752	INCQ DX
  1753	CMPQ DX, CX
  1754	JB   main_loop
  1755
  1756loop_finished:
  1757	// Return value
  1758	MOVB $0x01, ret+8(FP)
  1759
  1760	// Update the context
  1761	MOVQ ctx+0(FP), AX
  1762	MOVQ DX, 24(AX)
  1763	MOVQ DI, 104(AX)
	// 112(ctx) = literals pointer minus its initial value from 80(ctx).
  1764	SUBQ 80(AX), SI
  1765	MOVQ SI, 112(AX)
  1766	RET
  1767
  1768error_match_off_too_big:
  1769	// Return value
  1770	MOVB $0x00, ret+8(FP)
  1771
  1772	// Update the context
  1773	MOVQ ctx+0(FP), AX
  1774	MOVQ DX, 24(AX)
  1775	MOVQ DI, 104(AX)
  1776	SUBQ 80(AX), SI
  1777	MOVQ SI, 112(AX)
  1778	RET
  1779
  1780empty_seqs:
  1781	// Return value
  1782	MOVB $0x01, ret+8(FP)
  1783	RET
  1784
  1785// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  1786// Requires: CMOV, SSE
      //
      // Decodes sequences from the bit reader and executes each one (literal copy
      // followed by match copy) inside the same loop. The loop counter lives at
      // 96(ctx): it is decremented after every sequence and the loop repeats while
      // the result is non-negative.
      //
      // Return codes written to ret+24(FP):
      //   0 success                          4 not enough literals
      //   1 match length/offset mismatch     5 not enough output space
      //   2 match length too big             6 bit reader overread
      //   3 match offset too big
      //
      // Stack slots:
      //   (SP)    read pointer into the bit reader input ((br) + 8(br) on entry)
      //   8(SP)   offset of the current sequence
      //   16(SP)  match length of the current sequence
      //   24(SP)  literal length of the current sequence
      //   32(SP)  past-end pointer of s.out
      //   40(SP)  184(ctx), used as len(hist) in check_offset
      //   48(SP)  176(ctx) + 184(ctx); history copies read from 48(SP) - n
      //   56(SP)  200(ctx), used as the window size limit in check_offset
      //
      // Registers held across iterations:
      //   DX  = 64-bit bit reader value (24(br))
      //   BX  = bit position inside DX (32(br)); reduced by 8 per byte refilled
      //   SI  = 8(br); decremented for every input byte loaded
      //   DI / R8 / R9 = literal length / match length / offset decoder state
      //   R10 = output write pointer, R11 = literals read pointer
      //   R12 = outPosition
  1787TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
  1788	MOVQ    br+8(FP), CX
  1789	MOVQ    24(CX), DX
  1790	MOVBQZX 32(CX), BX
  1791	MOVQ    (CX), AX
  1792	MOVQ    8(CX), SI
  1793	ADDQ    SI, AX
  1794	MOVQ    AX, (SP)
  1795	MOVQ    ctx+16(FP), AX
  1796	MOVQ    72(AX), DI
  1797	MOVQ    80(AX), R8
  1798	MOVQ    88(AX), R9
  1799	XORQ    CX, CX
  1800	MOVQ    CX, 8(SP)
  1801	MOVQ    CX, 16(SP)
  1802	MOVQ    CX, 24(SP)
  1803	MOVQ    112(AX), R10
  1804	MOVQ    128(AX), CX
  1805	MOVQ    CX, 32(SP)
  1806	MOVQ    144(AX), R11
  1807	MOVQ    136(AX), R12
  1808	MOVQ    200(AX), CX
  1809	MOVQ    CX, 56(SP)
  1810	MOVQ    176(AX), CX
  1811	MOVQ    CX, 48(SP)
  1812	MOVQ    184(AX), AX
  1813	MOVQ    AX, 40(SP)
  1814	MOVQ    40(SP), AX
  1815	ADDQ    AX, 48(SP)
  1816
  1817	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  1818	ADDQ R10, 32(SP)
  1819
  1820	// outBase += outPosition
  1821	ADDQ R12, R10
  1822
  1823sequenceDecs_decodeSync_amd64_main_loop:
  1824	MOVQ (SP), R13
  1825
  1826	// Fill bitreader to have enough for the offset and match length.
  1827	CMPQ SI, $0x08
  1828	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
	// Fast path: step the read pointer back by BX/8 whole bytes, reload a
	// full 64-bit word and keep only the sub-byte part of the bit position.
  1829	MOVQ BX, AX
  1830	SHRQ $0x03, AX
  1831	SUBQ AX, R13
  1832	MOVQ (R13), DX
  1833	SUBQ AX, SI
  1834	ANDQ $0x07, BX
  1835	JMP  sequenceDecs_decodeSync_amd64_fill_end
  1836
  1837sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
	// Slow path: shift in one byte at a time until SI reaches zero or the
	// bit position drops to 7 or below.
  1838	CMPQ    SI, $0x00
  1839	JLE     sequenceDecs_decodeSync_amd64_fill_check_overread
  1840	CMPQ    BX, $0x07
  1841	JLE     sequenceDecs_decodeSync_amd64_fill_end
  1842	SHLQ    $0x08, DX
  1843	SUBQ    $0x01, R13
  1844	SUBQ    $0x01, SI
  1845	SUBQ    $0x08, BX
  1846	MOVBQZX (R13), AX
  1847	ORQ     AX, DX
  1848	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  1849
  1850sequenceDecs_decodeSync_amd64_fill_check_overread:
	// No input left: a bit position above 64 is reported as an overread.
  1851	CMPQ BX, $0x40
  1852	JA   error_overread
  1853
  1854sequenceDecs_decodeSync_amd64_fill_end:
  1855	// Update offset
	// AH (byte 1 of the state) is the bit count to read, the upper 32 bits
	// of the state are the base value. The bits are taken from DX at
	// position BX and added to the base, unless the count is zero or the
	// new bit position would exceed 64.
  1856	MOVQ  R9, AX
  1857	MOVQ  BX, CX
  1858	MOVQ  DX, R14
  1859	SHLQ  CL, R14
  1860	MOVB  AH, CL
  1861	SHRQ  $0x20, AX
  1862	TESTQ CX, CX
  1863	JZ    sequenceDecs_decodeSync_amd64_of_update_zero
  1864	ADDQ  CX, BX
  1865	CMPQ  BX, $0x40
  1866	JA    sequenceDecs_decodeSync_amd64_of_update_zero
  1867	CMPQ  CX, $0x40
  1868	JAE   sequenceDecs_decodeSync_amd64_of_update_zero
  1869	NEGQ  CX
  1870	SHRQ  CL, R14
  1871	ADDQ  R14, AX
  1872
  1873sequenceDecs_decodeSync_amd64_of_update_zero:
  1874	MOVQ AX, 8(SP)
  1875
  1876	// Update match length
  1877	MOVQ  R8, AX
  1878	MOVQ  BX, CX
  1879	MOVQ  DX, R14
  1880	SHLQ  CL, R14
  1881	MOVB  AH, CL
  1882	SHRQ  $0x20, AX
  1883	TESTQ CX, CX
  1884	JZ    sequenceDecs_decodeSync_amd64_ml_update_zero
  1885	ADDQ  CX, BX
  1886	CMPQ  BX, $0x40
  1887	JA    sequenceDecs_decodeSync_amd64_ml_update_zero
  1888	CMPQ  CX, $0x40
  1889	JAE   sequenceDecs_decodeSync_amd64_ml_update_zero
  1890	NEGQ  CX
  1891	SHRQ  CL, R14
  1892	ADDQ  R14, AX
  1893
  1894sequenceDecs_decodeSync_amd64_ml_update_zero:
  1895	MOVQ AX, 16(SP)
  1896
  1897	// Fill bitreader to have enough for the remaining
  1898	CMPQ SI, $0x08
  1899	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1900	MOVQ BX, AX
  1901	SHRQ $0x03, AX
  1902	SUBQ AX, R13
  1903	MOVQ (R13), DX
  1904	SUBQ AX, SI
  1905	ANDQ $0x07, BX
  1906	JMP  sequenceDecs_decodeSync_amd64_fill_2_end
  1907
  1908sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
  1909	CMPQ    SI, $0x00
  1910	JLE     sequenceDecs_decodeSync_amd64_fill_2_check_overread
  1911	CMPQ    BX, $0x07
  1912	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
  1913	SHLQ    $0x08, DX
  1914	SUBQ    $0x01, R13
  1915	SUBQ    $0x01, SI
  1916	SUBQ    $0x08, BX
  1917	MOVBQZX (R13), AX
  1918	ORQ     AX, DX
  1919	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1920
  1921sequenceDecs_decodeSync_amd64_fill_2_check_overread:
  1922	CMPQ BX, $0x40
  1923	JA   error_overread
  1924
  1925sequenceDecs_decodeSync_amd64_fill_2_end:
  1926	// Update literal length
  1927	MOVQ  DI, AX
  1928	MOVQ  BX, CX
  1929	MOVQ  DX, R14
  1930	SHLQ  CL, R14
  1931	MOVB  AH, CL
  1932	SHRQ  $0x20, AX
  1933	TESTQ CX, CX
  1934	JZ    sequenceDecs_decodeSync_amd64_ll_update_zero
  1935	ADDQ  CX, BX
  1936	CMPQ  BX, $0x40
  1937	JA    sequenceDecs_decodeSync_amd64_ll_update_zero
  1938	CMPQ  CX, $0x40
  1939	JAE   sequenceDecs_decodeSync_amd64_ll_update_zero
  1940	NEGQ  CX
  1941	SHRQ  CL, R14
  1942	ADDQ  R14, AX
  1943
  1944sequenceDecs_decodeSync_amd64_ll_update_zero:
  1945	MOVQ AX, 24(SP)
  1946
  1947	// Fill bitreader for state updates
	// Save the read pointer. AX = byte 1 of the offset state; it selects the
	// offset adjustment path after skip_update. When the counter at 96(ctx)
	// is zero this is the last sequence and the state updates are skipped.
  1948	MOVQ    R13, (SP)
  1949	MOVQ    R9, AX
  1950	SHRQ    $0x08, AX
  1951	MOVBQZX AL, AX
  1952	MOVQ    ctx+16(FP), CX
  1953	CMPQ    96(CX), $0x00
  1954	JZ      sequenceDecs_decodeSync_amd64_skip_update
  1955
  1956	// Update Literal Length State
	// R13 = low byte of the state = bit count n. The next n bits of DX are
	// added to bits 16..31 of the state to form the table index.
  1957	MOVBQZX DI, R13
  1958	SHRL    $0x10, DI
  1959	LEAQ    (BX)(R13*1), CX
  1960	MOVQ    DX, R14
  1961	MOVQ    CX, BX
  1962	ROLQ    CL, R14
  1963	MOVL    $0x00000001, R15
  1964	MOVB    R13, CL
  1965	SHLL    CL, R15
  1966	DECL    R15
  1967	ANDQ    R15, R14
  1968	ADDQ    R14, DI
  1969
  1970	// Load ctx.llTable
  1971	MOVQ ctx+16(FP), CX
  1972	MOVQ (CX), CX
  1973	MOVQ (CX)(DI*8), DI
  1974
  1975	// Update Match Length State
  1976	MOVBQZX R8, R13
  1977	SHRL    $0x10, R8
  1978	LEAQ    (BX)(R13*1), CX
  1979	MOVQ    DX, R14
  1980	MOVQ    CX, BX
  1981	ROLQ    CL, R14
  1982	MOVL    $0x00000001, R15
  1983	MOVB    R13, CL
  1984	SHLL    CL, R15
  1985	DECL    R15
  1986	ANDQ    R15, R14
  1987	ADDQ    R14, R8
  1988
  1989	// Load ctx.mlTable
  1990	MOVQ ctx+16(FP), CX
  1991	MOVQ 24(CX), CX
  1992	MOVQ (CX)(R8*8), R8
  1993
  1994	// Update Offset State
  1995	MOVBQZX R9, R13
  1996	SHRL    $0x10, R9
  1997	LEAQ    (BX)(R13*1), CX
  1998	MOVQ    DX, R14
  1999	MOVQ    CX, BX
  2000	ROLQ    CL, R14
  2001	MOVL    $0x00000001, R15
  2002	MOVB    R13, CL
  2003	SHLL    CL, R15
  2004	DECL    R15
  2005	ANDQ    R15, R14
  2006	ADDQ    R14, R9
  2007
  2008	// Load ctx.ofTable
  2009	MOVQ ctx+16(FP), CX
  2010	MOVQ 48(CX), CX
  2011	MOVQ (CX)(R9*8), R9
  2012
  2013sequenceDecs_decodeSync_amd64_skip_update:
  2014	// Adjust offset
	// The three 8-byte values at 144/152/160(s) are the previous offsets
	// (presumably s.prevOffset - confirm). With AX > 1 the decoded offset is
	// used as is: the values at 144 and 152 move to 152 and 160 and the new
	// offset is stored at 144.
  2015	MOVQ   s+0(FP), CX
  2016	MOVQ   8(SP), R13
  2017	CMPQ   AX, $0x01
  2018	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
  2019	MOVUPS 144(CX), X0
  2020	MOVQ   R13, 144(CX)
  2021	MOVUPS X0, 152(CX)
  2022	JMP    sequenceDecs_decodeSync_amd64_after_adjust
  2023
  2024sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
	// A literal length of zero bumps the decoded offset value by one.
  2025	CMPQ 24(SP), $0x00000000
  2026	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
  2027	INCQ R13
  2028	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  2029
  2030sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
	// Offset value 0: reuse the most recent offset, history stays untouched.
  2031	TESTQ R13, R13
  2032	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  2033	MOVQ  144(CX), R13
  2034	JMP   sequenceDecs_decodeSync_amd64_after_adjust
  2035
  2036sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
	// R14 = previous offset number R13, except that for R13 == 3 it is the
	// most recent offset minus one. A result of zero is replaced by 1.
  2037	MOVQ    R13, AX
  2038	XORQ    R14, R14
  2039	MOVQ    $-1, R15
  2040	CMPQ    R13, $0x03
  2041	CMOVQEQ R14, AX
  2042	CMOVQEQ R15, R14
  2043	ADDQ    144(CX)(AX*8), R14
  2044	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
  2045	MOVQ    $0x00000001, R14
  2046
  2047sequenceDecs_decodeSync_amd64_adjust_temp_valid:
	// Rotate the previous offsets: the slot at 160 only changes when R13 != 1.
  2048	CMPQ R13, $0x01
  2049	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
  2050	MOVQ 152(CX), AX
  2051	MOVQ AX, 160(CX)
  2052
  2053sequenceDecs_decodeSync_amd64_adjust_skip:
  2054	MOVQ 144(CX), AX
  2055	MOVQ AX, 152(CX)
  2056	MOVQ R14, 144(CX)
  2057	MOVQ R14, R13
  2058
  2059sequenceDecs_decodeSync_amd64_after_adjust:
  2060	MOVQ R13, 8(SP)
  2061
  2062	// Check values
	// AX = match length, CX = literal length, R13 = final offset.
	// 256(s) accumulates ll+ml; 104(ctx) is reduced by ll and must not go
	// negative. Match lengths above 0x20002 are rejected, and so is a
	// non-zero match length combined with a zero offset.
  2063	MOVQ  16(SP), AX
  2064	MOVQ  24(SP), CX
  2065	LEAQ  (AX)(CX*1), R14
  2066	MOVQ  s+0(FP), R15
  2067	ADDQ  R14, 256(R15)
  2068	MOVQ  ctx+16(FP), R14
  2069	SUBQ  CX, 104(R14)
  2070	JS    error_not_enough_literals
  2071	CMPQ  AX, $0x00020002
  2072	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
  2073	TESTQ R13, R13
  2074	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
  2075	TESTQ AX, AX
  2076	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
  2077
  2078sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
	// From here on: AX = literal length, CX = offset, R13 = match length.
  2079	MOVQ 24(SP), AX
  2080	MOVQ 8(SP), CX
  2081	MOVQ 16(SP), R13
  2082
  2083	// Check if we have enough space in s.out
  2084	LEAQ (AX)(R13*1), R14
  2085	ADDQ R10, R14
  2086	CMPQ R14, 32(SP)
  2087	JA   error_not_enough_space
  2088
  2089	// Copy literals
  2090	TESTQ AX, AX
  2091	JZ    check_offset
  2092	XORQ  R14, R14
  2093
  2094copy_1:
	// 16 bytes per step until at least AX bytes were moved (may overshoot).
	// NOTE(review): presumably both buffers have enough slack for the
	// overshoot - confirm.
  2095	MOVUPS (R11)(R14*1), X0
  2096	MOVUPS X0, (R10)(R14*1)
  2097	ADDQ   $0x10, R14
  2098	CMPQ   R14, AX
  2099	JB     copy_1
  2100	ADDQ   AX, R11
  2101	ADDQ   AX, R10
  2102	ADDQ   AX, R12
  2103
  2104	// Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
  2105check_offset:
  2106	MOVQ R12, AX
  2107	ADDQ 40(SP), AX
  2108	CMPQ CX, AX
  2109	JG   error_match_off_too_big
  2110	CMPQ CX, 56(SP)
  2111	JG   error_match_off_too_big
  2112
  2113	// Copy match from history
	// AX = offset - outPosition. If that is <= 0 the match starts inside the
	// output written so far and no history bytes are needed.
  2114	MOVQ CX, AX
  2115	SUBQ R12, AX
  2116	JLS  copy_match
	// R14 = 48(SP) - AX: source address of the match inside the history.
  2117	MOVQ 48(SP), R14
  2118	SUBQ AX, R14
	// Match longer than the AX bytes available from R14: copy those AX bytes
	// first, the remainder is taken from the current buffer (copy_match).
  2119	CMPQ R13, AX
  2120	JG   copy_all_from_history
  2121	MOVQ R13, AX
  2122	SUBQ $0x10, AX
  2123	JB   copy_4_small
  2124
  2125copy_4_loop:
  2126	MOVUPS (R14), X0
  2127	MOVUPS X0, (R10)
  2128	ADDQ   $0x10, R14
  2129	ADDQ   $0x10, R10
  2130	SUBQ   $0x10, AX
  2131	JAE    copy_4_loop
	// AX is now in [-16, -1]: move both pointers to the exact end of the
	// copy and redo the final 16 bytes (overlapping the previous store).
  2132	LEAQ   16(R14)(AX*1), R14
  2133	LEAQ   16(R10)(AX*1), R10
  2134	MOVUPS -16(R14), X0
  2135	MOVUPS X0, -16(R10)
  2136	JMP    copy_4_end
  2137
  2138copy_4_small:
	// NOTE(review): there is no case for lengths 1 or 2 here; a length below
	// 3 would take the 4..7 path. Presumably match lengths are always >= 3
	// at this point - confirm.
  2139	CMPQ R13, $0x03
  2140	JE   copy_4_move_3
  2141	CMPQ R13, $0x08
  2142	JB   copy_4_move_4through7
  2143	JMP  copy_4_move_8through16
  2144
  2145copy_4_move_3:
  2146	MOVW (R14), AX
  2147	MOVB 2(R14), CL
  2148	MOVW AX, (R10)
  2149	MOVB CL, 2(R10)
  2150	ADDQ R13, R14
  2151	ADDQ R13, R10
  2152	JMP  copy_4_end
  2153
  2154copy_4_move_4through7:
  2155	MOVL (R14), AX
  2156	MOVL -4(R14)(R13*1), CX
  2157	MOVL AX, (R10)
  2158	MOVL CX, -4(R10)(R13*1)
  2159	ADDQ R13, R14
  2160	ADDQ R13, R10
  2161	JMP  copy_4_end
  2162
  2163copy_4_move_8through16:
  2164	MOVQ (R14), AX
  2165	MOVQ -8(R14)(R13*1), CX
  2166	MOVQ AX, (R10)
  2167	MOVQ CX, -8(R10)(R13*1)
  2168	ADDQ R13, R14
  2169	ADDQ R13, R10
  2170
  2171copy_4_end:
  2172	ADDQ R13, R12
  2173	JMP  handle_loop
	// Never reached: the jump above is unconditional.
  2174	JMP loop_finished
  2175
  2176copy_all_from_history:
	// Copy exactly AX bytes from history (R15 is the running counter), then
	// continue with the rest of the match.
  2177	MOVQ AX, R15
  2178	SUBQ $0x10, R15
  2179	JB   copy_5_small
  2180
  2181copy_5_loop:
  2182	MOVUPS (R14), X0
  2183	MOVUPS X0, (R10)
  2184	ADDQ   $0x10, R14
  2185	ADDQ   $0x10, R10
  2186	SUBQ   $0x10, R15
  2187	JAE    copy_5_loop
  2188	LEAQ   16(R14)(R15*1), R14
  2189	LEAQ   16(R10)(R15*1), R10
  2190	MOVUPS -16(R14), X0
  2191	MOVUPS X0, -16(R10)
  2192	JMP    copy_5_end
  2193
  2194copy_5_small:
  2195	CMPQ AX, $0x03
  2196	JE   copy_5_move_3
  2197	JB   copy_5_move_1or2
  2198	CMPQ AX, $0x08
  2199	JB   copy_5_move_4through7
  2200	JMP  copy_5_move_8through16
  2201
  2202copy_5_move_1or2:
	// First and last byte; for a length of 1 both moves hit the same byte.
  2203	MOVB (R14), R15
  2204	MOVB -1(R14)(AX*1), BP
  2205	MOVB R15, (R10)
  2206	MOVB BP, -1(R10)(AX*1)
  2207	ADDQ AX, R14
  2208	ADDQ AX, R10
  2209	JMP  copy_5_end
  2210
  2211copy_5_move_3:
  2212	MOVW (R14), R15
  2213	MOVB 2(R14), BP
  2214	MOVW R15, (R10)
  2215	MOVB BP, 2(R10)
  2216	ADDQ AX, R14
  2217	ADDQ AX, R10
  2218	JMP  copy_5_end
  2219
  2220copy_5_move_4through7:
  2221	MOVL (R14), R15
  2222	MOVL -4(R14)(AX*1), BP
  2223	MOVL R15, (R10)
  2224	MOVL BP, -4(R10)(AX*1)
  2225	ADDQ AX, R14
  2226	ADDQ AX, R10
  2227	JMP  copy_5_end
  2228
  2229copy_5_move_8through16:
  2230	MOVQ (R14), R15
  2231	MOVQ -8(R14)(AX*1), BP
  2232	MOVQ R15, (R10)
  2233	MOVQ BP, -8(R10)(AX*1)
  2234	ADDQ AX, R14
  2235	ADDQ AX, R10
  2236
  2237copy_5_end:
	// Account for the history part; R13 keeps the length still to be copied.
  2238	ADDQ AX, R12
  2239	SUBQ AX, R13
  2240
  2241	// Copy match from the current buffer
  2242copy_match:
	// AX = source address = write pointer - offset.
  2243	MOVQ R10, AX
  2244	SUBQ CX, AX
  2245
  2246	// ml <= mo
  2247	CMPQ R13, CX
  2248	JA   copy_overlapping_match
  2249
  2250	// Copy non-overlapping match
  2251	ADDQ R13, R12
  2252	MOVQ R10, CX
  2253	ADDQ R13, R10
  2254
  2255copy_2:
	// 16 bytes per step through CX; R10 already points past the match, so
	// any overshoot is overwritten by whatever is written next.
  2256	MOVUPS (AX), X0
  2257	MOVUPS X0, (CX)
  2258	ADDQ   $0x10, AX
  2259	ADDQ   $0x10, CX
  2260	SUBQ   $0x10, R13
  2261	JHI    copy_2
  2262	JMP    handle_loop
  2263
  2264	// Copy overlapping match
  2265copy_overlapping_match:
  2266	ADDQ R13, R12
  2267
  2268copy_slow_3:
	// Byte-by-byte so that bytes written earlier in this loop can be read
	// again as source (length > offset).
  2269	MOVB (AX), CL
  2270	MOVB CL, (R10)
  2271	INCQ AX
  2272	INCQ R10
  2273	DECQ R13
  2274	JNZ  copy_slow_3
  2275
  2276handle_loop:
	// Decrement the counter at 96(ctx); loop while it stays non-negative.
  2277	MOVQ ctx+16(FP), AX
  2278	DECQ 96(AX)
  2279	JNS  sequenceDecs_decodeSync_amd64_main_loop
  2280
  2281loop_finished:
	// Write the bit reader state back: value, bit position and 8(br).
  2282	MOVQ br+8(FP), AX
  2283	MOVQ DX, 24(AX)
  2284	MOVB BL, 32(AX)
  2285	MOVQ SI, 8(AX)
  2286
  2287	// Update the context
	// 136(ctx) = outPosition, 168(ctx) = literals pointer minus 144(ctx).
  2288	MOVQ ctx+16(FP), AX
  2289	MOVQ R12, 136(AX)
  2290	MOVQ 144(AX), CX
  2291	SUBQ CX, R11
  2292	MOVQ R11, 168(AX)
  2293
  2294	// Return success
  2295	MOVQ $0x00000000, ret+24(FP)
  2296	RET
  2297
  2298	// Return with match length error
  2299sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
	// The offending match length is reported through 216(ctx).
  2300	MOVQ 16(SP), AX
  2301	MOVQ ctx+16(FP), CX
  2302	MOVQ AX, 216(CX)
  2303	MOVQ $0x00000001, ret+24(FP)
  2304	RET
  2305
  2306	// Return with match too long error
  2307sequenceDecs_decodeSync_amd64_error_match_len_too_big:
  2308	MOVQ ctx+16(FP), AX
  2309	MOVQ 16(SP), CX
  2310	MOVQ CX, 216(AX)
  2311	MOVQ $0x00000002, ret+24(FP)
  2312	RET
  2313
  2314	// Return with match offset too long error
  2315error_match_off_too_big:
	// The offending offset goes to 224(ctx), outPosition to 136(ctx).
  2316	MOVQ ctx+16(FP), AX
  2317	MOVQ 8(SP), CX
  2318	MOVQ CX, 224(AX)
  2319	MOVQ R12, 136(AX)
  2320	MOVQ $0x00000003, ret+24(FP)
  2321	RET
  2322
  2323	// Return with not enough literals error
  2324error_not_enough_literals:
	// The literal length goes to 208(ctx).
  2325	MOVQ ctx+16(FP), AX
  2326	MOVQ 24(SP), CX
  2327	MOVQ CX, 208(AX)
  2328	MOVQ $0x00000004, ret+24(FP)
  2329	RET
  2330
  2331	// Return with overread error
  2332error_overread:
  2333	MOVQ $0x00000006, ret+24(FP)
  2334	RET
  2335
  2336	// Return with not enough output space error
  2337error_not_enough_space:
	// Literal length, match length and outPosition go to 208/216/136(ctx).
  2338	MOVQ ctx+16(FP), AX
  2339	MOVQ 24(SP), CX
  2340	MOVQ CX, 208(AX)
  2341	MOVQ 16(SP), CX
  2342	MOVQ CX, 216(AX)
  2343	MOVQ R12, 136(AX)
  2344	MOVQ $0x00000005, ret+24(FP)
  2345	RET
  2346
  2347// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2348// Requires: BMI, BMI2, CMOV, SSE
  2349TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
  2350	MOVQ    br+8(FP), BX
  2351	MOVQ    24(BX), AX
  2352	MOVBQZX 32(BX), DX
  2353	MOVQ    (BX), CX
  2354	MOVQ    8(BX), BX
  2355	ADDQ    BX, CX
  2356	MOVQ    CX, (SP)
  2357	MOVQ    ctx+16(FP), CX
  2358	MOVQ    72(CX), SI
  2359	MOVQ    80(CX), DI
  2360	MOVQ    88(CX), R8
  2361	XORQ    R9, R9
  2362	MOVQ    R9, 8(SP)
  2363	MOVQ    R9, 16(SP)
  2364	MOVQ    R9, 24(SP)
  2365	MOVQ    112(CX), R9
  2366	MOVQ    128(CX), R10
  2367	MOVQ    R10, 32(SP)
  2368	MOVQ    144(CX), R10
  2369	MOVQ    136(CX), R11
  2370	MOVQ    200(CX), R12
  2371	MOVQ    R12, 56(SP)
  2372	MOVQ    176(CX), R12
  2373	MOVQ    R12, 48(SP)
  2374	MOVQ    184(CX), CX
  2375	MOVQ    CX, 40(SP)
  2376	MOVQ    40(SP), CX
  2377	ADDQ    CX, 48(SP)
  2378
  2379	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2380	ADDQ R9, 32(SP)
  2381
  2382	// outBase += outPosition
  2383	ADDQ R11, R9
  2384
  2385sequenceDecs_decodeSync_bmi2_main_loop:
  2386	MOVQ (SP), R12
  2387
  2388	// Fill bitreader to have enough for the offset and match length.
  2389	CMPQ BX, $0x08
  2390	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2391	MOVQ DX, CX
  2392	SHRQ $0x03, CX
  2393	SUBQ CX, R12
  2394	MOVQ (R12), AX
  2395	SUBQ CX, BX
  2396	ANDQ $0x07, DX
  2397	JMP  sequenceDecs_decodeSync_bmi2_fill_end
  2398
  2399sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
  2400	CMPQ    BX, $0x00
  2401	JLE     sequenceDecs_decodeSync_bmi2_fill_check_overread
  2402	CMPQ    DX, $0x07
  2403	JLE     sequenceDecs_decodeSync_bmi2_fill_end
  2404	SHLQ    $0x08, AX
  2405	SUBQ    $0x01, R12
  2406	SUBQ    $0x01, BX
  2407	SUBQ    $0x08, DX
  2408	MOVBQZX (R12), CX
  2409	ORQ     CX, AX
  2410	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2411
  2412sequenceDecs_decodeSync_bmi2_fill_check_overread:
  2413	CMPQ DX, $0x40
  2414	JA   error_overread
  2415
  2416sequenceDecs_decodeSync_bmi2_fill_end:
  2417	// Update offset
  2418	MOVQ   $0x00000808, CX
  2419	BEXTRQ CX, R8, R13
  2420	MOVQ   AX, R14
  2421	LEAQ   (DX)(R13*1), CX
  2422	ROLQ   CL, R14
  2423	BZHIQ  R13, R14, R14
  2424	MOVQ   CX, DX
  2425	MOVQ   R8, CX
  2426	SHRQ   $0x20, CX
  2427	ADDQ   R14, CX
  2428	MOVQ   CX, 8(SP)
  2429
  2430	// Update match length
  2431	MOVQ   $0x00000808, CX
  2432	BEXTRQ CX, DI, R13
  2433	MOVQ   AX, R14
  2434	LEAQ   (DX)(R13*1), CX
  2435	ROLQ   CL, R14
  2436	BZHIQ  R13, R14, R14
  2437	MOVQ   CX, DX
  2438	MOVQ   DI, CX
  2439	SHRQ   $0x20, CX
  2440	ADDQ   R14, CX
  2441	MOVQ   CX, 16(SP)
  2442
  2443	// Fill bitreader to have enough for the remaining
  2444	CMPQ BX, $0x08
  2445	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2446	MOVQ DX, CX
  2447	SHRQ $0x03, CX
  2448	SUBQ CX, R12
  2449	MOVQ (R12), AX
  2450	SUBQ CX, BX
  2451	ANDQ $0x07, DX
  2452	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end
  2453
  2454sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
  2455	CMPQ    BX, $0x00
  2456	JLE     sequenceDecs_decodeSync_bmi2_fill_2_check_overread
  2457	CMPQ    DX, $0x07
  2458	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
  2459	SHLQ    $0x08, AX
  2460	SUBQ    $0x01, R12
  2461	SUBQ    $0x01, BX
  2462	SUBQ    $0x08, DX
  2463	MOVBQZX (R12), CX
  2464	ORQ     CX, AX
  2465	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2466
  2467sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
  2468	CMPQ DX, $0x40
  2469	JA   error_overread
  2470
  2471sequenceDecs_decodeSync_bmi2_fill_2_end:
  2472	// Update literal length
  2473	MOVQ   $0x00000808, CX
  2474	BEXTRQ CX, SI, R13
  2475	MOVQ   AX, R14
  2476	LEAQ   (DX)(R13*1), CX
  2477	ROLQ   CL, R14
  2478	BZHIQ  R13, R14, R14
  2479	MOVQ   CX, DX
  2480	MOVQ   SI, CX
  2481	SHRQ   $0x20, CX
  2482	ADDQ   R14, CX
  2483	MOVQ   CX, 24(SP)
  2484
  2485	// Fill bitreader for state updates
  2486	MOVQ    R12, (SP)
  2487	MOVQ    $0x00000808, CX
  2488	BEXTRQ  CX, R8, R12
  2489	MOVQ    ctx+16(FP), CX
  2490	CMPQ    96(CX), $0x00
  2491	JZ      sequenceDecs_decodeSync_bmi2_skip_update
  2492	LEAQ    (SI)(DI*1), R13
  2493	ADDQ    R8, R13
  2494	MOVBQZX R13, R13
  2495	LEAQ    (DX)(R13*1), CX
  2496	MOVQ    AX, R14
  2497	MOVQ    CX, DX
  2498	ROLQ    CL, R14
  2499	BZHIQ   R13, R14, R14
  2500
  2501	// Update Offset State
  2502	BZHIQ R8, R14, CX
  2503	SHRXQ R8, R14, R14
  2504	SHRL  $0x10, R8
  2505	ADDQ  CX, R8
  2506
  2507	// Load ctx.ofTable
  2508	MOVQ ctx+16(FP), CX
  2509	MOVQ 48(CX), CX
  2510	MOVQ (CX)(R8*8), R8
  2511
  2512	// Update Match Length State
  2513	BZHIQ DI, R14, CX
  2514	SHRXQ DI, R14, R14
  2515	SHRL  $0x10, DI
  2516	ADDQ  CX, DI
  2517
  2518	// Load ctx.mlTable
  2519	MOVQ ctx+16(FP), CX
  2520	MOVQ 24(CX), CX
  2521	MOVQ (CX)(DI*8), DI
  2522
  2523	// Update Literal Length State
  2524	BZHIQ SI, R14, CX
  2525	SHRL  $0x10, SI
  2526	ADDQ  CX, SI
  2527
  2528	// Load ctx.llTable
  2529	MOVQ ctx+16(FP), CX
  2530	MOVQ (CX), CX
  2531	MOVQ (CX)(SI*8), SI
  2532
  2533sequenceDecs_decodeSync_bmi2_skip_update:
  2534	// Adjust offset
  2535	MOVQ   s+0(FP), CX
  2536	MOVQ   8(SP), R13
  2537	CMPQ   R12, $0x01
  2538	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
  2539	MOVUPS 144(CX), X0
  2540	MOVQ   R13, 144(CX)
  2541	MOVUPS X0, 152(CX)
  2542	JMP    sequenceDecs_decodeSync_bmi2_after_adjust
  2543
  2544sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
  2545	CMPQ 24(SP), $0x00000000
  2546	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
  2547	INCQ R13
  2548	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2549
  2550sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
  2551	TESTQ R13, R13
  2552	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2553	MOVQ  144(CX), R13
  2554	JMP   sequenceDecs_decodeSync_bmi2_after_adjust
  2555
  2556sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
  2557	MOVQ    R13, R12
  2558	XORQ    R14, R14
  2559	MOVQ    $-1, R15
  2560	CMPQ    R13, $0x03
  2561	CMOVQEQ R14, R12
  2562	CMOVQEQ R15, R14
  2563	ADDQ    144(CX)(R12*8), R14
  2564	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
  2565	MOVQ    $0x00000001, R14
  2566
  2567sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
  2568	CMPQ R13, $0x01
  2569	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip
  2570	MOVQ 152(CX), R12
  2571	MOVQ R12, 160(CX)
  2572
  2573sequenceDecs_decodeSync_bmi2_adjust_skip:
  2574	MOVQ 144(CX), R12
  2575	MOVQ R12, 152(CX)
  2576	MOVQ R14, 144(CX)
  2577	MOVQ R14, R13
  2578
  2579sequenceDecs_decodeSync_bmi2_after_adjust:
  2580	MOVQ R13, 8(SP)
  2581
  2582	// Check values
  2583	MOVQ  16(SP), CX
  2584	MOVQ  24(SP), R12
  2585	LEAQ  (CX)(R12*1), R14
  2586	MOVQ  s+0(FP), R15
  2587	ADDQ  R14, 256(R15)
  2588	MOVQ  ctx+16(FP), R14
  2589	SUBQ  R12, 104(R14)
  2590	JS    error_not_enough_literals
  2591	CMPQ  CX, $0x00020002
  2592	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
  2593	TESTQ R13, R13
  2594	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
  2595	TESTQ CX, CX
  2596	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
  2597
  2598sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
  2599	MOVQ 24(SP), CX
  2600	MOVQ 8(SP), R12
  2601	MOVQ 16(SP), R13
  2602
  2603	// Check if we have enough space in s.out
  2604	LEAQ (CX)(R13*1), R14
  2605	ADDQ R9, R14
  2606	CMPQ R14, 32(SP)
  2607	JA   error_not_enough_space
  2608
  2609	// Copy literals
  2610	TESTQ CX, CX
  2611	JZ    check_offset
  2612	XORQ  R14, R14
  2613
  2614copy_1:
  2615	MOVUPS (R10)(R14*1), X0
  2616	MOVUPS X0, (R9)(R14*1)
  2617	ADDQ   $0x10, R14
  2618	CMPQ   R14, CX
  2619	JB     copy_1
  2620	ADDQ   CX, R10
  2621	ADDQ   CX, R9
  2622	ADDQ   CX, R11
  2623
  2624	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  2625check_offset:
  2626	MOVQ R11, CX
  2627	ADDQ 40(SP), CX
  2628	CMPQ R12, CX
  2629	JG   error_match_off_too_big
  2630	CMPQ R12, 56(SP)
  2631	JG   error_match_off_too_big
  2632
  2633	// Copy match from history
  2634	MOVQ R12, CX
  2635	SUBQ R11, CX
  2636	JLS  copy_match
  2637	MOVQ 48(SP), R14
  2638	SUBQ CX, R14
  2639	CMPQ R13, CX
  2640	JG   copy_all_from_history
  2641	MOVQ R13, CX
  2642	SUBQ $0x10, CX
  2643	JB   copy_4_small
  2644
  2645copy_4_loop:
  2646	MOVUPS (R14), X0
  2647	MOVUPS X0, (R9)
  2648	ADDQ   $0x10, R14
  2649	ADDQ   $0x10, R9
  2650	SUBQ   $0x10, CX
  2651	JAE    copy_4_loop
  2652	LEAQ   16(R14)(CX*1), R14
  2653	LEAQ   16(R9)(CX*1), R9
  2654	MOVUPS -16(R14), X0
  2655	MOVUPS X0, -16(R9)
  2656	JMP    copy_4_end
  2657
  2658copy_4_small:
  2659	CMPQ R13, $0x03
  2660	JE   copy_4_move_3
  2661	CMPQ R13, $0x08
  2662	JB   copy_4_move_4through7
  2663	JMP  copy_4_move_8through16
  2664
  2665copy_4_move_3:
  2666	MOVW (R14), CX
  2667	MOVB 2(R14), R12
  2668	MOVW CX, (R9)
  2669	MOVB R12, 2(R9)
  2670	ADDQ R13, R14
  2671	ADDQ R13, R9
  2672	JMP  copy_4_end
  2673
  2674copy_4_move_4through7:
  2675	MOVL (R14), CX
  2676	MOVL -4(R14)(R13*1), R12
  2677	MOVL CX, (R9)
  2678	MOVL R12, -4(R9)(R13*1)
  2679	ADDQ R13, R14
  2680	ADDQ R13, R9
  2681	JMP  copy_4_end
  2682
  2683copy_4_move_8through16:
  2684	MOVQ (R14), CX
  2685	MOVQ -8(R14)(R13*1), R12
  2686	MOVQ CX, (R9)
  2687	MOVQ R12, -8(R9)(R13*1)
  2688	ADDQ R13, R14
  2689	ADDQ R13, R9
  2690
  2691copy_4_end:
  2692	ADDQ R13, R11
  2693	JMP  handle_loop
  2694	JMP loop_finished
  2695
  2696copy_all_from_history:
  2697	MOVQ CX, R15
  2698	SUBQ $0x10, R15
  2699	JB   copy_5_small
  2700
  2701copy_5_loop:
  2702	MOVUPS (R14), X0
  2703	MOVUPS X0, (R9)
  2704	ADDQ   $0x10, R14
  2705	ADDQ   $0x10, R9
  2706	SUBQ   $0x10, R15
  2707	JAE    copy_5_loop
  2708	LEAQ   16(R14)(R15*1), R14
  2709	LEAQ   16(R9)(R15*1), R9
  2710	MOVUPS -16(R14), X0
  2711	MOVUPS X0, -16(R9)
  2712	JMP    copy_5_end
  2713
  2714copy_5_small:
  2715	CMPQ CX, $0x03
  2716	JE   copy_5_move_3
  2717	JB   copy_5_move_1or2
  2718	CMPQ CX, $0x08
  2719	JB   copy_5_move_4through7
  2720	JMP  copy_5_move_8through16
  2721
  2722copy_5_move_1or2:
  2723	MOVB (R14), R15
  2724	MOVB -1(R14)(CX*1), BP
  2725	MOVB R15, (R9)
  2726	MOVB BP, -1(R9)(CX*1)
  2727	ADDQ CX, R14
  2728	ADDQ CX, R9
  2729	JMP  copy_5_end
  2730
  2731copy_5_move_3:
  2732	MOVW (R14), R15
  2733	MOVB 2(R14), BP
  2734	MOVW R15, (R9)
  2735	MOVB BP, 2(R9)
  2736	ADDQ CX, R14
  2737	ADDQ CX, R9
  2738	JMP  copy_5_end
  2739
  2740copy_5_move_4through7:
  2741	MOVL (R14), R15
  2742	MOVL -4(R14)(CX*1), BP
  2743	MOVL R15, (R9)
  2744	MOVL BP, -4(R9)(CX*1)
  2745	ADDQ CX, R14
  2746	ADDQ CX, R9
  2747	JMP  copy_5_end
  2748
  2749copy_5_move_8through16:
  2750	MOVQ (R14), R15
  2751	MOVQ -8(R14)(CX*1), BP
  2752	MOVQ R15, (R9)
  2753	MOVQ BP, -8(R9)(CX*1)
  2754	ADDQ CX, R14
  2755	ADDQ CX, R9
  2756
  2757copy_5_end:
  2758	ADDQ CX, R11
  2759	SUBQ CX, R13
  2760
  2761	// Copy match from the current buffer
  2762copy_match:
  2763	MOVQ R9, CX
  2764	SUBQ R12, CX
  2765
  2766	// ml <= mo
  2767	CMPQ R13, R12
  2768	JA   copy_overlapping_match
  2769
  2770	// Copy non-overlapping match
  2771	ADDQ R13, R11
  2772	MOVQ R9, R12
  2773	ADDQ R13, R9
  2774
  2775copy_2:
  2776	MOVUPS (CX), X0
  2777	MOVUPS X0, (R12)
  2778	ADDQ   $0x10, CX
  2779	ADDQ   $0x10, R12
  2780	SUBQ   $0x10, R13
  2781	JHI    copy_2
  2782	JMP    handle_loop
  2783
  2784	// Copy overlapping match
  2785copy_overlapping_match:
  2786	ADDQ R13, R11
  2787
  2788copy_slow_3:
  2789	MOVB (CX), R12
  2790	MOVB R12, (R9)
  2791	INCQ CX
  2792	INCQ R9
  2793	DECQ R13
  2794	JNZ  copy_slow_3
  2795
  2796handle_loop:
  2797	MOVQ ctx+16(FP), CX
  2798	DECQ 96(CX)
  2799	JNS  sequenceDecs_decodeSync_bmi2_main_loop
  2800
  2801loop_finished:
  2802	MOVQ br+8(FP), CX
  2803	MOVQ AX, 24(CX)
  2804	MOVB DL, 32(CX)
  2805	MOVQ BX, 8(CX)
  2806
  2807	// Update the context
  2808	MOVQ ctx+16(FP), AX
  2809	MOVQ R11, 136(AX)
  2810	MOVQ 144(AX), CX
  2811	SUBQ CX, R10
  2812	MOVQ R10, 168(AX)
  2813
  2814	// Return success
  2815	MOVQ $0x00000000, ret+24(FP)
  2816	RET
  2817
  2818	// Return with match length error
  2819sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
  2820	MOVQ 16(SP), AX
  2821	MOVQ ctx+16(FP), CX
  2822	MOVQ AX, 216(CX)
  2823	MOVQ $0x00000001, ret+24(FP)
  2824	RET
  2825
  2826	// Return with match too long error
  2827sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
  2828	MOVQ ctx+16(FP), AX
  2829	MOVQ 16(SP), CX
  2830	MOVQ CX, 216(AX)
  2831	MOVQ $0x00000002, ret+24(FP)
  2832	RET
  2833
  2834	// Return with match offset too long error
  2835error_match_off_too_big:
  2836	MOVQ ctx+16(FP), AX
  2837	MOVQ 8(SP), CX
  2838	MOVQ CX, 224(AX)
  2839	MOVQ R11, 136(AX)
  2840	MOVQ $0x00000003, ret+24(FP)
  2841	RET
  2842
  2843	// Return with not enough literals error
  2844error_not_enough_literals:
  2845	MOVQ ctx+16(FP), AX
  2846	MOVQ 24(SP), CX
  2847	MOVQ CX, 208(AX)
  2848	MOVQ $0x00000004, ret+24(FP)
  2849	RET
  2850
  2851	// Return with overread error
  2852error_overread:
  2853	MOVQ $0x00000006, ret+24(FP)
  2854	RET
  2855
  2856	// Return with not enough output space error
  2857error_not_enough_space:
  2858	MOVQ ctx+16(FP), AX
  2859	MOVQ 24(SP), CX
  2860	MOVQ CX, 208(AX)
  2861	MOVQ 16(SP), CX
  2862	MOVQ CX, 216(AX)
  2863	MOVQ R11, 136(AX)
  2864	MOVQ $0x00000005, ret+24(FP)
  2865	RET
  2866
  2867// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2868// Requires: CMOV, SSE
      //
      // Decodes sequences from the bit reader and executes each one right away:
      // the literals are copied to the output, then the match is copied from the
      // history buffer and/or from output that was already written. The loop runs
      // until the counter at 96(ctx) becomes negative.
      //
      // Result (ret+24(FP)): 0 success, 1 match length without an offset,
      // 2 match length too big, 3 match offset too big, 4 not enough literals,
      // 5 not enough output space, 6 bit reader overread.
      //
      // Registers inside the loop:
      //   DX  bit container            BX  bits consumed from DX
      //   SI  input bytes not yet loaded into the container
      //   DI / R8 / R9  literal-length / match-length / offset state entries
      //   R10 output write pointer     R11 literals read pointer
      //   R12 output position
      // Stack slots:
      //   0(SP)  input read pointer    8(SP)  match offset
      //   16(SP) match length          24(SP) literal length
      //   32(SP) end of output capacity
      //   40(SP) history length        48(SP) end of history
      //   56(SP) window size
      // A state entry is one 64-bit word: bits 0-7 = bit count for the state
      // update, bits 8-15 = extra-bit count of the value, bits 16-31 = base of
      // the next state, bits 32-63 = value baseline.
      //
      // NOTE(review): the Go field names behind the numeric struct offsets are
      // not visible in this file; confirm them against the struct definitions.
  2869TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
  2870	MOVQ    br+8(FP), CX
  2871	MOVQ    24(CX), DX // DX = bit container
  2872	MOVBQZX 32(CX), BX // BX = bits already consumed from DX
  2873	MOVQ    (CX), AX // AX = input base pointer
  2874	MOVQ    8(CX), SI // SI = input bytes not yet loaded
  2875	ADDQ    SI, AX
  2876	MOVQ    AX, (SP) // 0(SP) = base + count: the current read pointer
  2877	MOVQ    ctx+16(FP), AX
  2878	MOVQ    72(AX), DI // literal-length state entry
  2879	MOVQ    80(AX), R8 // match-length state entry
  2880	MOVQ    88(AX), R9 // offset state entry
  2881	XORQ    CX, CX
  2882	MOVQ    CX, 8(SP) // clear the offset / match length / literal length slots
  2883	MOVQ    CX, 16(SP)
  2884	MOVQ    CX, 24(SP)
  2885	MOVQ    112(AX), R10 // output base pointer
  2886	MOVQ    128(AX), CX
  2887	MOVQ    CX, 32(SP) // output capacity (converted to a pointer below)
  2888	MOVQ    144(AX), R11 // literals read pointer
  2889	MOVQ    136(AX), R12 // output position
  2890	MOVQ    200(AX), CX
  2891	MOVQ    CX, 56(SP) // window size
  2892	MOVQ    176(AX), CX
  2893	MOVQ    CX, 48(SP) // history base pointer
  2894	MOVQ    184(AX), AX
  2895	MOVQ    AX, 40(SP) // history length
  2896	MOVQ    40(SP), AX
  2897	ADDQ    AX, 48(SP) // 48(SP) = history base + length (end of history)
  2898
  2899	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2900	ADDQ R10, 32(SP)
  2901
  2902	// outBase += outPosition
  2903	ADDQ R12, R10
  2904
  2905sequenceDecs_decodeSync_safe_amd64_main_loop:
  2906	MOVQ (SP), R13 // R13 = input read pointer for this iteration
  2907
  2908	// Fill bitreader to have enough for the offset and match length.
  2909	CMPQ SI, $0x08
  2910	JL   sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte // fewer than 8 bytes left: refill one byte at a time
  2911	MOVQ BX, AX
  2912	SHRQ $0x03, AX // AX = whole bytes consumed
  2913	SUBQ AX, R13
  2914	MOVQ (R13), DX // reload 8 bytes from that many bytes lower
  2915	SUBQ AX, SI
  2916	ANDQ $0x07, BX // keep only the sub-byte bit position
  2917	JMP  sequenceDecs_decodeSync_safe_amd64_fill_end
  2918
  2919sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
  2920	CMPQ    SI, $0x00
  2921	JLE     sequenceDecs_decodeSync_safe_amd64_fill_check_overread // input exhausted
  2922	CMPQ    BX, $0x07
  2923	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end // less than one whole byte consumed: done
  2924	SHLQ    $0x08, DX // make room for one byte at the bottom of the container
  2925	SUBQ    $0x01, R13
  2926	SUBQ    $0x01, SI
  2927	SUBQ    $0x08, BX
  2928	MOVBQZX (R13), AX
  2929	ORQ     AX, DX
  2930	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
  2931
  2932sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
  2933	CMPQ BX, $0x40
  2934	JA   error_overread // more than 64 bits consumed with no input left
  2935
  2936sequenceDecs_decodeSync_safe_amd64_fill_end:
  2937	// Update offset: 8(SP) = baseline + extra bits read from the container
  2938	MOVQ  R9, AX
  2939	MOVQ  BX, CX
  2940	MOVQ  DX, R14
  2941	SHLQ  CL, R14 // drop the bits that were already consumed
  2942	MOVB  AH, CL // CL = extra-bit count (byte 1 of the state entry)
  2943	SHRQ  $0x20, AX // AX = baseline (upper 32 bits of the entry)
  2944	TESTQ CX, CX
  2945	JZ    sequenceDecs_decodeSync_safe_amd64_of_update_zero // no extra bits: the value is the baseline
  2946	ADDQ  CX, BX
  2947	CMPQ  BX, $0x40
  2948	JA    sequenceDecs_decodeSync_safe_amd64_of_update_zero // would run past the 64-bit container: add nothing
  2949	CMPQ  CX, $0x40
  2950	JAE   sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2951	NEGQ  CX
  2952	SHRQ  CL, R14 // shift right by 64-n so only the top n bits remain
  2953	ADDQ  R14, AX
  2954
  2955sequenceDecs_decodeSync_safe_amd64_of_update_zero:
  2956	MOVQ AX, 8(SP)
  2957
  2958	// Update match length: 16(SP) = baseline + extra bits (same steps as for the offset)
  2959	MOVQ  R8, AX
  2960	MOVQ  BX, CX
  2961	MOVQ  DX, R14
  2962	SHLQ  CL, R14
  2963	MOVB  AH, CL
  2964	SHRQ  $0x20, AX
  2965	TESTQ CX, CX
  2966	JZ    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2967	ADDQ  CX, BX
  2968	CMPQ  BX, $0x40
  2969	JA    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2970	CMPQ  CX, $0x40
  2971	JAE   sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2972	NEGQ  CX
  2973	SHRQ  CL, R14
  2974	ADDQ  R14, AX
  2975
  2976sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
  2977	MOVQ AX, 16(SP)
  2978
  2979	// Fill bitreader to have enough for the remaining
  2980	CMPQ SI, $0x08
  2981	JL   sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  2982	MOVQ BX, AX
  2983	SHRQ $0x03, AX
  2984	SUBQ AX, R13
  2985	MOVQ (R13), DX
  2986	SUBQ AX, SI
  2987	ANDQ $0x07, BX
  2988	JMP  sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2989
  2990sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
  2991	CMPQ    SI, $0x00
  2992	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
  2993	CMPQ    BX, $0x07
  2994	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2995	SHLQ    $0x08, DX
  2996	SUBQ    $0x01, R13
  2997	SUBQ    $0x01, SI
  2998	SUBQ    $0x08, BX
  2999	MOVBQZX (R13), AX
  3000	ORQ     AX, DX
  3001	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  3002
  3003sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
  3004	CMPQ BX, $0x40
  3005	JA   error_overread
  3006
  3007sequenceDecs_decodeSync_safe_amd64_fill_2_end:
  3008	// Update literal length: 24(SP) = baseline + extra bits (same steps as for the offset)
  3009	MOVQ  DI, AX
  3010	MOVQ  BX, CX
  3011	MOVQ  DX, R14
  3012	SHLQ  CL, R14
  3013	MOVB  AH, CL
  3014	SHRQ  $0x20, AX
  3015	TESTQ CX, CX
  3016	JZ    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  3017	ADDQ  CX, BX
  3018	CMPQ  BX, $0x40
  3019	JA    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  3020	CMPQ  CX, $0x40
  3021	JAE   sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  3022	NEGQ  CX
  3023	SHRQ  CL, R14
  3024	ADDQ  R14, AX
  3025
  3026sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
  3027	MOVQ AX, 24(SP)
  3028
  3029	// Fill bitreader for state updates
  3030	MOVQ    R13, (SP) // save the input read pointer for the next iteration
  3031	MOVQ    R9, AX
  3032	SHRQ    $0x08, AX
  3033	MOVBQZX AL, AX // AX = extra-bit count of the current offset entry; tested in "Adjust offset" below
  3034	MOVQ    ctx+16(FP), CX
  3035	CMPQ    96(CX), $0x00
  3036	JZ      sequenceDecs_decodeSync_safe_amd64_skip_update // counter is zero: this is the last sequence, keep the states
  3037
  3038	// Update Literal Length State: next index = next-state base + n bits from the container
  3039	MOVBQZX DI, R13 // R13 = n, the bit count in the low byte of the entry
  3040	SHRL    $0x10, DI // DI = next-state base (bits 16-31 of the entry)
  3041	LEAQ    (BX)(R13*1), CX
  3042	MOVQ    DX, R14
  3043	MOVQ    CX, BX // bits consumed += n
  3044	ROLQ    CL, R14 // rotate the n wanted bits into the low end
  3045	MOVL    $0x00000001, R15
  3046	MOVB    R13, CL
  3047	SHLL    CL, R15
  3048	DECL    R15 // R15 = (1 << n) - 1
  3049	ANDQ    R15, R14
  3050	ADDQ    R14, DI
  3051
  3052	// Load ctx.llTable
  3053	MOVQ ctx+16(FP), CX
  3054	MOVQ (CX), CX
  3055	MOVQ (CX)(DI*8), DI
  3056
  3057	// Update Match Length State (same steps as for the literal length state)
  3058	MOVBQZX R8, R13
  3059	SHRL    $0x10, R8
  3060	LEAQ    (BX)(R13*1), CX
  3061	MOVQ    DX, R14
  3062	MOVQ    CX, BX
  3063	ROLQ    CL, R14
  3064	MOVL    $0x00000001, R15
  3065	MOVB    R13, CL
  3066	SHLL    CL, R15
  3067	DECL    R15
  3068	ANDQ    R15, R14
  3069	ADDQ    R14, R8
  3070
  3071	// Load ctx.mlTable
  3072	MOVQ ctx+16(FP), CX
  3073	MOVQ 24(CX), CX
  3074	MOVQ (CX)(R8*8), R8
  3075
  3076	// Update Offset State (same steps as for the literal length state)
  3077	MOVBQZX R9, R13
  3078	SHRL    $0x10, R9
  3079	LEAQ    (BX)(R13*1), CX
  3080	MOVQ    DX, R14
  3081	MOVQ    CX, BX
  3082	ROLQ    CL, R14
  3083	MOVL    $0x00000001, R15
  3084	MOVB    R13, CL
  3085	SHLL    CL, R15
  3086	DECL    R15
  3087	ANDQ    R15, R14
  3088	ADDQ    R14, R9
  3089
  3090	// Load ctx.ofTable
  3091	MOVQ ctx+16(FP), CX
  3092	MOVQ 48(CX), CX
  3093	MOVQ (CX)(R9*8), R9
  3094
  3095sequenceDecs_decodeSync_safe_amd64_skip_update:
  3096	// Adjust offset: resolve the decoded value against the three previous offsets at 144/152/160(s)
  3097	MOVQ   s+0(FP), CX
  3098	MOVQ   8(SP), R13
  3099	CMPQ   AX, $0x01
  3100	JBE    sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0 // 0 or 1 extra bits: value refers to a previous offset
  3101	MOVUPS 144(CX), X0 // X0 = the two newest previous offsets
  3102	MOVQ   R13, 144(CX) // the new offset becomes the newest
  3103	MOVUPS X0, 152(CX) // the other two move down one slot, the oldest is dropped
  3104	JMP    sequenceDecs_decodeSync_safe_amd64_after_adjust
  3105
  3106sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
  3107	CMPQ 24(SP), $0x00000000
  3108	JNE  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
  3109	INCQ R13 // literal length is zero: the index is shifted by one
  3110	JMP  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  3111
  3112sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
  3113	TESTQ R13, R13
  3114	JNZ   sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  3115	MOVQ  144(CX), R13 // index 0: reuse the newest previous offset, history unchanged
  3116	JMP   sequenceDecs_decodeSync_safe_amd64_after_adjust
  3117
  3118sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
  3119	MOVQ    R13, AX
  3120	XORQ    R14, R14
  3121	MOVQ    $-1, R15
  3122	CMPQ    R13, $0x03
  3123	CMOVQEQ R14, AX // index 3 selects slot 0 ...
  3124	CMOVQEQ R15, R14 // ... with an adjustment of -1
  3125	ADDQ    144(CX)(AX*8), R14 // R14 = selected previous offset (+ adjustment)
  3126	JNZ     sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
  3127	MOVQ    $0x00000001, R14 // a result of zero is replaced by 1
  3128
  3129sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
  3130	CMPQ R13, $0x01
  3131	JZ   sequenceDecs_decodeSync_safe_amd64_adjust_skip // index 1: the oldest slot is left alone
  3132	MOVQ 152(CX), AX
  3133	MOVQ AX, 160(CX)
  3134
  3135sequenceDecs_decodeSync_safe_amd64_adjust_skip:
  3136	MOVQ 144(CX), AX
  3137	MOVQ AX, 152(CX)
  3138	MOVQ R14, 144(CX)
  3139	MOVQ R14, R13
  3140
  3141sequenceDecs_decodeSync_safe_amd64_after_adjust:
  3142	MOVQ R13, 8(SP) // 8(SP) = final match offset
  3143
  3144	// Check values
  3145	MOVQ  16(SP), AX
  3146	MOVQ  24(SP), CX
  3147	LEAQ  (AX)(CX*1), R14
  3148	MOVQ  s+0(FP), R15
  3149	ADDQ  R14, 256(R15) // add match length + literal length to the running total at 256(s)
  3150	MOVQ  ctx+16(FP), R14
  3151	SUBQ  CX, 104(R14) // take the literals out of the remaining-literals counter
  3152	JS    error_not_enough_literals
  3153	CMPQ  AX, $0x00020002
  3154	JA    sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big // match length above 131074
  3155	TESTQ R13, R13
  3156	JNZ   sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
  3157	TESTQ AX, AX
  3158	JNZ   sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch // offset is zero but match length is not
  3159
  3160sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
  3161	MOVQ 24(SP), AX // AX = literal length
  3162	MOVQ 8(SP), CX // CX = match offset
  3163	MOVQ 16(SP), R13 // R13 = match length
  3164
  3165	// Check if we have enough space in s.out
  3166	LEAQ (AX)(R13*1), R14
  3167	ADDQ R10, R14
  3168	CMPQ R14, 32(SP)
  3169	JA   error_not_enough_space
  3170
  3171	// Copy literals: exactly AX bytes from R11 to R10, never writing beyond that length
  3172	TESTQ AX, AX
  3173	JZ    check_offset
  3174	MOVQ  AX, R14
  3175	SUBQ  $0x10, R14
  3176	JB    copy_1_small // fewer than 16 bytes
  3177
  3178copy_1_loop:
  3179	MOVUPS (R11), X0
  3180	MOVUPS X0, (R10)
  3181	ADDQ   $0x10, R11
  3182	ADDQ   $0x10, R10
  3183	SUBQ   $0x10, R14
  3184	JAE    copy_1_loop
  3185	LEAQ   16(R11)(R14*1), R11 // step both pointers to the exact end of the copy
  3186	LEAQ   16(R10)(R14*1), R10
  3187	MOVUPS -16(R11), X0 // final 16 bytes, overlapping the previous block when needed
  3188	MOVUPS X0, -16(R10)
  3189	JMP    copy_1_end
  3190
  3191copy_1_small:
  3192	CMPQ AX, $0x03
  3193	JE   copy_1_move_3
  3194	JB   copy_1_move_1or2
  3195	CMPQ AX, $0x08
  3196	JB   copy_1_move_4through7
  3197	JMP  copy_1_move_8through16
  3198
  3199copy_1_move_1or2:
  3200	MOVB (R11), R14 // first byte and last byte (the same byte when the length is 1)
  3201	MOVB -1(R11)(AX*1), R15
  3202	MOVB R14, (R10)
  3203	MOVB R15, -1(R10)(AX*1)
  3204	ADDQ AX, R11
  3205	ADDQ AX, R10
  3206	JMP  copy_1_end
  3207
  3208copy_1_move_3:
  3209	MOVW (R11), R14
  3210	MOVB 2(R11), R15
  3211	MOVW R14, (R10)
  3212	MOVB R15, 2(R10)
  3213	ADDQ AX, R11
  3214	ADDQ AX, R10
  3215	JMP  copy_1_end
  3216
  3217copy_1_move_4through7:
  3218	MOVL (R11), R14 // first 4 and last 4 bytes; the two loads may overlap
  3219	MOVL -4(R11)(AX*1), R15
  3220	MOVL R14, (R10)
  3221	MOVL R15, -4(R10)(AX*1)
  3222	ADDQ AX, R11
  3223	ADDQ AX, R10
  3224	JMP  copy_1_end
  3225
  3226copy_1_move_8through16:
  3227	MOVQ (R11), R14 // first 8 and last 8 bytes; only lengths 8..15 get here
  3228	MOVQ -8(R11)(AX*1), R15
  3229	MOVQ R14, (R10)
  3230	MOVQ R15, -8(R10)(AX*1)
  3231	ADDQ AX, R11
  3232	ADDQ AX, R10
  3233
  3234copy_1_end:
  3235	ADDQ AX, R12 // output position += literal length
  3236
  3237	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  3238check_offset:
  3239	MOVQ R12, AX
  3240	ADDQ 40(SP), AX
  3241	CMPQ CX, AX
  3242	JG   error_match_off_too_big
  3243	CMPQ CX, 56(SP)
  3244	JG   error_match_off_too_big
  3245
  3246	// Copy match from history
  3247	MOVQ CX, AX
  3248	SUBQ R12, AX // AX = how far the match starts before the current output
  3249	JLS  copy_match // offset <= output position: nothing comes from history
  3250	MOVQ 48(SP), R14
  3251	SUBQ AX, R14 // R14 = source pointer inside the history
  3252	CMPQ R13, AX
  3253	JG   copy_all_from_history // match is longer than the history part: it continues in the output
  3254	MOVQ R13, AX
  3255	SUBQ $0x10, AX
  3256	JB   copy_4_small
  3257
  3258copy_4_loop:
  3259	MOVUPS (R14), X0
  3260	MOVUPS X0, (R10)
  3261	ADDQ   $0x10, R14
  3262	ADDQ   $0x10, R10
  3263	SUBQ   $0x10, AX
  3264	JAE    copy_4_loop
  3265	LEAQ   16(R14)(AX*1), R14
  3266	LEAQ   16(R10)(AX*1), R10
  3267	MOVUPS -16(R14), X0
  3268	MOVUPS X0, -16(R10)
  3269	JMP    copy_4_end
  3270
  3271copy_4_small:
  3272	// No 1-2 byte case here. NOTE(review): presumably the match length is at least 3 on this path - confirm.
  3273	CMPQ R13, $0x03
  3274	JE   copy_4_move_3
  3275	CMPQ R13, $0x08
  3276	JB   copy_4_move_4through7
  3277	JMP  copy_4_move_8through16
  3278
  3279copy_4_move_3:
  3280	MOVW (R14), AX
  3281	MOVB 2(R14), CL
  3282	MOVW AX, (R10)
  3283	MOVB CL, 2(R10)
  3284	ADDQ R13, R14
  3285	ADDQ R13, R10
  3286	JMP  copy_4_end
  3287
  3288copy_4_move_4through7:
  3289	MOVL (R14), AX
  3290	MOVL -4(R14)(R13*1), CX
  3291	MOVL AX, (R10)
  3292	MOVL CX, -4(R10)(R13*1)
  3293	ADDQ R13, R14
  3294	ADDQ R13, R10
  3295	JMP  copy_4_end
  3296
  3297copy_4_move_8through16:
  3298	MOVQ (R14), AX
  3299	MOVQ -8(R14)(R13*1), CX
  3300	MOVQ AX, (R10)
  3301	MOVQ CX, -8(R10)(R13*1)
  3302	ADDQ R13, R14
  3303	ADDQ R13, R10
  3304
  3305copy_4_end:
  3306	ADDQ R13, R12 // output position += match length
  3307	JMP  handle_loop
  3308	JMP loop_finished // never executed: it directly follows an unconditional JMP
  3309
  3310copy_all_from_history:
  3311	// Copy the AX bytes that lie in the history; the rest of the match is handled at copy_match.
  3312	MOVQ AX, R15
  3313	SUBQ $0x10, R15
  3314	JB   copy_5_small
  3315
  3316copy_5_loop:
  3317	MOVUPS (R14), X0
  3318	MOVUPS X0, (R10)
  3319	ADDQ   $0x10, R14
  3320	ADDQ   $0x10, R10
  3321	SUBQ   $0x10, R15
  3322	JAE    copy_5_loop
  3323	LEAQ   16(R14)(R15*1), R14
  3324	LEAQ   16(R10)(R15*1), R10
  3325	MOVUPS -16(R14), X0
  3326	MOVUPS X0, -16(R10)
  3327	JMP    copy_5_end
  3328
  3329copy_5_small:
  3330	CMPQ AX, $0x03
  3331	JE   copy_5_move_3
  3332	JB   copy_5_move_1or2
  3333	CMPQ AX, $0x08
  3334	JB   copy_5_move_4through7
  3335	JMP  copy_5_move_8through16
  3336
  3337copy_5_move_1or2:
  3338	MOVB (R14), R15
  3339	MOVB -1(R14)(AX*1), BP
  3340	MOVB R15, (R10)
  3341	MOVB BP, -1(R10)(AX*1)
  3342	ADDQ AX, R14
  3343	ADDQ AX, R10
  3344	JMP  copy_5_end
  3345
  3346copy_5_move_3:
  3347	MOVW (R14), R15
  3348	MOVB 2(R14), BP
  3349	MOVW R15, (R10)
  3350	MOVB BP, 2(R10)
  3351	ADDQ AX, R14
  3352	ADDQ AX, R10
  3353	JMP  copy_5_end
  3354
  3355copy_5_move_4through7:
  3356	MOVL (R14), R15
  3357	MOVL -4(R14)(AX*1), BP
  3358	MOVL R15, (R10)
  3359	MOVL BP, -4(R10)(AX*1)
  3360	ADDQ AX, R14
  3361	ADDQ AX, R10
  3362	JMP  copy_5_end
  3363
  3364copy_5_move_8through16:
  3365	MOVQ (R14), R15
  3366	MOVQ -8(R14)(AX*1), BP
  3367	MOVQ R15, (R10)
  3368	MOVQ BP, -8(R10)(AX*1)
  3369	ADDQ AX, R14
  3370	ADDQ AX, R10
  3371
  3372copy_5_end:
  3373	ADDQ AX, R12 // output position += bytes taken from history
  3374	SUBQ AX, R13 // match length still to copy
  3375
  3376	// Copy match from the current buffer
  3377copy_match:
  3378	MOVQ R10, AX
  3379	SUBQ CX, AX // AX = source pointer: write pointer minus match offset
  3380
  3381	// ml <= mo
  3382	CMPQ R13, CX
  3383	JA   copy_overlapping_match // ml > mo: source and destination overlap
  3384
  3385	// Copy non-overlapping match
  3386	ADDQ R13, R12 // output position += match length
  3387	MOVQ R13, CX
  3388	SUBQ $0x10, CX
  3389	JB   copy_2_small
  3390
  3391copy_2_loop:
  3392	MOVUPS (AX), X0
  3393	MOVUPS X0, (R10)
  3394	ADDQ   $0x10, AX
  3395	ADDQ   $0x10, R10
  3396	SUBQ   $0x10, CX
  3397	JAE    copy_2_loop
  3398	LEAQ   16(AX)(CX*1), AX
  3399	LEAQ   16(R10)(CX*1), R10
  3400	MOVUPS -16(AX), X0
  3401	MOVUPS X0, -16(R10)
  3402	JMP    copy_2_end
  3403
  3404copy_2_small:
  3405	CMPQ R13, $0x03
  3406	JE   copy_2_move_3
  3407	JB   copy_2_move_1or2
  3408	CMPQ R13, $0x08
  3409	JB   copy_2_move_4through7
  3410	JMP  copy_2_move_8through16
  3411
  3412copy_2_move_1or2:
  3413	MOVB (AX), CL
  3414	MOVB -1(AX)(R13*1), R14
  3415	MOVB CL, (R10)
  3416	MOVB R14, -1(R10)(R13*1)
  3417	ADDQ R13, AX
  3418	ADDQ R13, R10
  3419	JMP  copy_2_end
  3420
  3421copy_2_move_3:
  3422	MOVW (AX), CX
  3423	MOVB 2(AX), R14
  3424	MOVW CX, (R10)
  3425	MOVB R14, 2(R10)
  3426	ADDQ R13, AX
  3427	ADDQ R13, R10
  3428	JMP  copy_2_end
  3429
  3430copy_2_move_4through7:
  3431	MOVL (AX), CX
  3432	MOVL -4(AX)(R13*1), R14
  3433	MOVL CX, (R10)
  3434	MOVL R14, -4(R10)(R13*1)
  3435	ADDQ R13, AX
  3436	ADDQ R13, R10
  3437	JMP  copy_2_end
  3438
  3439copy_2_move_8through16:
  3440	MOVQ (AX), CX
  3441	MOVQ -8(AX)(R13*1), R14
  3442	MOVQ CX, (R10)
  3443	MOVQ R14, -8(R10)(R13*1)
  3444	ADDQ R13, AX
  3445	ADDQ R13, R10
  3446
  3447copy_2_end:
  3448	JMP handle_loop
  3449
  3450	// Copy overlapping match
  3451copy_overlapping_match:
  3452	ADDQ R13, R12 // output position += match length
  3453
  3454copy_slow_3:
  3455	// One byte at a time, so bytes written earlier in this loop can be read again as source.
  3456	MOVB (AX), CL
  3457	MOVB CL, (R10)
  3458	INCQ AX
  3459	INCQ R10
  3460	DECQ R13
  3461	JNZ  copy_slow_3
  3462
  3463handle_loop:
  3464	MOVQ ctx+16(FP), AX
  3465	DECQ 96(AX)
  3466	JNS  sequenceDecs_decodeSync_safe_amd64_main_loop // keep going while the counter is still >= 0
  3467
  3468loop_finished:
  3469	// Write the bit reader state back
  3470	MOVQ br+8(FP), AX
  3471	MOVQ DX, 24(AX)
  3472	MOVB BL, 32(AX)
  3473	MOVQ SI, 8(AX)
  3474
  3475	// Update the context
  3476	MOVQ ctx+16(FP), AX
  3477	MOVQ R12, 136(AX) // output position
  3478	MOVQ 144(AX), CX
  3479	SUBQ CX, R11
  3480	MOVQ R11, 168(AX) // literal bytes consumed: read pointer minus its start value
  3481
  3482	// Return success
  3483	MOVQ $0x00000000, ret+24(FP)
  3484	RET
  3485
  3486	// Return with match length error (code 1); the match length is stored at 216(ctx)
  3487sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
  3488	MOVQ 16(SP), AX
  3489	MOVQ ctx+16(FP), CX
  3490	MOVQ AX, 216(CX)
  3491	MOVQ $0x00000001, ret+24(FP)
  3492	RET
  3493
  3494	// Return with match too long error (code 2); the match length is stored at 216(ctx)
  3495sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
  3496	MOVQ ctx+16(FP), AX
  3497	MOVQ 16(SP), CX
  3498	MOVQ CX, 216(AX)
  3499	MOVQ $0x00000002, ret+24(FP)
  3500	RET
  3501
  3502	// Return with match offset too long error (code 3); offset goes to 224(ctx), output position to 136(ctx)
  3503error_match_off_too_big:
  3504	MOVQ ctx+16(FP), AX
  3505	MOVQ 8(SP), CX
  3506	MOVQ CX, 224(AX)
  3507	MOVQ R12, 136(AX)
  3508	MOVQ $0x00000003, ret+24(FP)
  3509	RET
  3510
  3511	// Return with not enough literals error (code 4); the literal length is stored at 208(ctx)
  3512error_not_enough_literals:
  3513	MOVQ ctx+16(FP), AX
  3514	MOVQ 24(SP), CX
  3515	MOVQ CX, 208(AX)
  3516	MOVQ $0x00000004, ret+24(FP)
  3517	RET
  3518
  3519	// Return with overread error (code 6)
  3520error_overread:
  3521	MOVQ $0x00000006, ret+24(FP)
  3522	RET
  3523
  3524	// Return with not enough output space error (code 5); literal length, match length and output position are stored in ctx
  3525error_not_enough_space:
  3526	MOVQ ctx+16(FP), AX
  3527	MOVQ 24(SP), CX
  3528	MOVQ CX, 208(AX)
  3529	MOVQ 16(SP), CX
  3530	MOVQ CX, 216(AX)
  3531	MOVQ R12, 136(AX)
  3532	MOVQ $0x00000005, ret+24(FP)
  3533	RET
  3530
  3531// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  3532// Requires: BMI, BMI2, CMOV, SSE
  3533TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
  3534	MOVQ    br+8(FP), BX
  3535	MOVQ    24(BX), AX
  3536	MOVBQZX 32(BX), DX
  3537	MOVQ    (BX), CX
  3538	MOVQ    8(BX), BX
  3539	ADDQ    BX, CX
  3540	MOVQ    CX, (SP)
  3541	MOVQ    ctx+16(FP), CX
  3542	MOVQ    72(CX), SI
  3543	MOVQ    80(CX), DI
  3544	MOVQ    88(CX), R8
  3545	XORQ    R9, R9
  3546	MOVQ    R9, 8(SP)
  3547	MOVQ    R9, 16(SP)
  3548	MOVQ    R9, 24(SP)
  3549	MOVQ    112(CX), R9
  3550	MOVQ    128(CX), R10
  3551	MOVQ    R10, 32(SP)
  3552	MOVQ    144(CX), R10
  3553	MOVQ    136(CX), R11
  3554	MOVQ    200(CX), R12
  3555	MOVQ    R12, 56(SP)
  3556	MOVQ    176(CX), R12
  3557	MOVQ    R12, 48(SP)
  3558	MOVQ    184(CX), CX
  3559	MOVQ    CX, 40(SP)
  3560	MOVQ    40(SP), CX
  3561	ADDQ    CX, 48(SP)
  3562
  3563	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  3564	ADDQ R9, 32(SP)
  3565
  3566	// outBase += outPosition
  3567	ADDQ R11, R9
  3568
  3569sequenceDecs_decodeSync_safe_bmi2_main_loop:
  3570	MOVQ (SP), R12
  3571
  3572	// Fill bitreader to have enough for the offset and match length.
  3573	CMPQ BX, $0x08
  3574	JL   sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3575	MOVQ DX, CX
  3576	SHRQ $0x03, CX
  3577	SUBQ CX, R12
  3578	MOVQ (R12), AX
  3579	SUBQ CX, BX
  3580	ANDQ $0x07, DX
  3581	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_end
  3582
  3583sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
  3584	CMPQ    BX, $0x00
  3585	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
  3586	CMPQ    DX, $0x07
  3587	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
  3588	SHLQ    $0x08, AX
  3589	SUBQ    $0x01, R12
  3590	SUBQ    $0x01, BX
  3591	SUBQ    $0x08, DX
  3592	MOVBQZX (R12), CX
  3593	ORQ     CX, AX
  3594	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3595
  3596sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
  3597	CMPQ DX, $0x40
  3598	JA   error_overread
  3599
  3600sequenceDecs_decodeSync_safe_bmi2_fill_end:
  3601	// Update offset
  3602	MOVQ   $0x00000808, CX
  3603	BEXTRQ CX, R8, R13
  3604	MOVQ   AX, R14
  3605	LEAQ   (DX)(R13*1), CX
  3606	ROLQ   CL, R14
  3607	BZHIQ  R13, R14, R14
  3608	MOVQ   CX, DX
  3609	MOVQ   R8, CX
  3610	SHRQ   $0x20, CX
  3611	ADDQ   R14, CX
  3612	MOVQ   CX, 8(SP)
  3613
  3614	// Update match length
  3615	MOVQ   $0x00000808, CX
  3616	BEXTRQ CX, DI, R13
  3617	MOVQ   AX, R14
  3618	LEAQ   (DX)(R13*1), CX
  3619	ROLQ   CL, R14
  3620	BZHIQ  R13, R14, R14
  3621	MOVQ   CX, DX
  3622	MOVQ   DI, CX
  3623	SHRQ   $0x20, CX
  3624	ADDQ   R14, CX
  3625	MOVQ   CX, 16(SP)
  3626
  3627	// Fill bitreader to have enough for the remaining
  3628	CMPQ BX, $0x08
  3629	JL   sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3630	MOVQ DX, CX
  3631	SHRQ $0x03, CX
  3632	SUBQ CX, R12
  3633	MOVQ (R12), AX
  3634	SUBQ CX, BX
  3635	ANDQ $0x07, DX
  3636	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3637
  3638sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
  3639	CMPQ    BX, $0x00
  3640	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
  3641	CMPQ    DX, $0x07
  3642	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3643	SHLQ    $0x08, AX
  3644	SUBQ    $0x01, R12
  3645	SUBQ    $0x01, BX
  3646	SUBQ    $0x08, DX
  3647	MOVBQZX (R12), CX
  3648	ORQ     CX, AX
  3649	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3650
  3651sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
  3652	CMPQ DX, $0x40
  3653	JA   error_overread
  3654
  3655sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
  3656	// Update literal length
  3657	MOVQ   $0x00000808, CX
  3658	BEXTRQ CX, SI, R13
  3659	MOVQ   AX, R14
  3660	LEAQ   (DX)(R13*1), CX
  3661	ROLQ   CL, R14
  3662	BZHIQ  R13, R14, R14
  3663	MOVQ   CX, DX
  3664	MOVQ   SI, CX
  3665	SHRQ   $0x20, CX
  3666	ADDQ   R14, CX
  3667	MOVQ   CX, 24(SP)
  3668
  3669	// Fill bitreader for state updates
  3670	MOVQ    R12, (SP)
  3671	MOVQ    $0x00000808, CX
  3672	BEXTRQ  CX, R8, R12
  3673	MOVQ    ctx+16(FP), CX
  3674	CMPQ    96(CX), $0x00
  3675	JZ      sequenceDecs_decodeSync_safe_bmi2_skip_update
  3676	LEAQ    (SI)(DI*1), R13
  3677	ADDQ    R8, R13
  3678	MOVBQZX R13, R13
  3679	LEAQ    (DX)(R13*1), CX
  3680	MOVQ    AX, R14
  3681	MOVQ    CX, DX
  3682	ROLQ    CL, R14
  3683	BZHIQ   R13, R14, R14
  3684
  3685	// Update Offset State
  3686	BZHIQ R8, R14, CX
  3687	SHRXQ R8, R14, R14
  3688	SHRL  $0x10, R8
  3689	ADDQ  CX, R8
  3690
  3691	// Load ctx.ofTable
  3692	MOVQ ctx+16(FP), CX
  3693	MOVQ 48(CX), CX
  3694	MOVQ (CX)(R8*8), R8
  3695
  3696	// Update Match Length State
  3697	BZHIQ DI, R14, CX
  3698	SHRXQ DI, R14, R14
  3699	SHRL  $0x10, DI
  3700	ADDQ  CX, DI
  3701
  3702	// Load ctx.mlTable
  3703	MOVQ ctx+16(FP), CX
  3704	MOVQ 24(CX), CX
  3705	MOVQ (CX)(DI*8), DI
  3706
  3707	// Update Literal Length State
  3708	BZHIQ SI, R14, CX
  3709	SHRL  $0x10, SI
  3710	ADDQ  CX, SI
  3711
  3712	// Load ctx.llTable
  3713	MOVQ ctx+16(FP), CX
  3714	MOVQ (CX), CX
  3715	MOVQ (CX)(SI*8), SI
  3716
  3717sequenceDecs_decodeSync_safe_bmi2_skip_update:
  3718	// Adjust offset
  3719	MOVQ   s+0(FP), CX
  3720	MOVQ   8(SP), R13
  3721	CMPQ   R12, $0x01
  3722	JBE    sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
  3723	MOVUPS 144(CX), X0
  3724	MOVQ   R13, 144(CX)
  3725	MOVUPS X0, 152(CX)
  3726	JMP    sequenceDecs_decodeSync_safe_bmi2_after_adjust
  3727
  3728sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
  3729	CMPQ 24(SP), $0x00000000
  3730	JNE  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
  3731	INCQ R13
  3732	JMP  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3733
  3734sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
  3735	TESTQ R13, R13
  3736	JNZ   sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3737	MOVQ  144(CX), R13
  3738	JMP   sequenceDecs_decodeSync_safe_bmi2_after_adjust
  3739
  3740sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
  3741	MOVQ    R13, R12
  3742	XORQ    R14, R14
  3743	MOVQ    $-1, R15
  3744	CMPQ    R13, $0x03
  3745	CMOVQEQ R14, R12
  3746	CMOVQEQ R15, R14
  3747	ADDQ    144(CX)(R12*8), R14
  3748	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
  3749	MOVQ    $0x00000001, R14
  3750
  3751sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
  3752	CMPQ R13, $0x01
  3753	JZ   sequenceDecs_decodeSync_safe_bmi2_adjust_skip
  3754	MOVQ 152(CX), R12
  3755	MOVQ R12, 160(CX)
  3756
  3757sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
  3758	MOVQ 144(CX), R12
  3759	MOVQ R12, 152(CX)
  3760	MOVQ R14, 144(CX)
  3761	MOVQ R14, R13
  3762
  3763sequenceDecs_decodeSync_safe_bmi2_after_adjust:
  3764	MOVQ R13, 8(SP)
  3765
  3766	// Check values
  3767	MOVQ  16(SP), CX
  3768	MOVQ  24(SP), R12
  3769	LEAQ  (CX)(R12*1), R14
  3770	MOVQ  s+0(FP), R15
  3771	ADDQ  R14, 256(R15)
  3772	MOVQ  ctx+16(FP), R14
  3773	SUBQ  R12, 104(R14)
  3774	JS    error_not_enough_literals
  3775	CMPQ  CX, $0x00020002
  3776	JA    sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
  3777	TESTQ R13, R13
  3778	JNZ   sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
  3779	TESTQ CX, CX
  3780	JNZ   sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
  3781
  3782sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
  3783	MOVQ 24(SP), CX
  3784	MOVQ 8(SP), R12
  3785	MOVQ 16(SP), R13
  3786
  3787	// Check if we have enough space in s.out
  3788	LEAQ (CX)(R13*1), R14
  3789	ADDQ R9, R14
  3790	CMPQ R14, 32(SP)
  3791	JA   error_not_enough_space
  3792
  3793	// Copy literals
  3794	TESTQ CX, CX
  3795	JZ    check_offset
  3796	MOVQ  CX, R14
  3797	SUBQ  $0x10, R14
  3798	JB    copy_1_small
  3799
  3800copy_1_loop:
  3801	MOVUPS (R10), X0
  3802	MOVUPS X0, (R9)
  3803	ADDQ   $0x10, R10
  3804	ADDQ   $0x10, R9
  3805	SUBQ   $0x10, R14
  3806	JAE    copy_1_loop
  3807	LEAQ   16(R10)(R14*1), R10
  3808	LEAQ   16(R9)(R14*1), R9
  3809	MOVUPS -16(R10), X0
  3810	MOVUPS X0, -16(R9)
  3811	JMP    copy_1_end
  3812
  3813copy_1_small:
  3814	CMPQ CX, $0x03
  3815	JE   copy_1_move_3
  3816	JB   copy_1_move_1or2
  3817	CMPQ CX, $0x08
  3818	JB   copy_1_move_4through7
  3819	JMP  copy_1_move_8through16
  3820
  3821copy_1_move_1or2:
  3822	MOVB (R10), R14
  3823	MOVB -1(R10)(CX*1), R15
  3824	MOVB R14, (R9)
  3825	MOVB R15, -1(R9)(CX*1)
  3826	ADDQ CX, R10
  3827	ADDQ CX, R9
  3828	JMP  copy_1_end
  3829
  3830copy_1_move_3:
  3831	MOVW (R10), R14
  3832	MOVB 2(R10), R15
  3833	MOVW R14, (R9)
  3834	MOVB R15, 2(R9)
  3835	ADDQ CX, R10
  3836	ADDQ CX, R9
  3837	JMP  copy_1_end
  3838
  3839copy_1_move_4through7:
  3840	MOVL (R10), R14
  3841	MOVL -4(R10)(CX*1), R15
  3842	MOVL R14, (R9)
  3843	MOVL R15, -4(R9)(CX*1)
  3844	ADDQ CX, R10
  3845	ADDQ CX, R9
  3846	JMP  copy_1_end
  3847
  3848copy_1_move_8through16:
  3849	MOVQ (R10), R14
  3850	MOVQ -8(R10)(CX*1), R15
  3851	MOVQ R14, (R9)
  3852	MOVQ R15, -8(R9)(CX*1)
  3853	ADDQ CX, R10
  3854	ADDQ CX, R9
  3855
  3856copy_1_end:
  3857	ADDQ CX, R11
  3858
  3859	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  3860check_offset:
  3861	MOVQ R11, CX
  3862	ADDQ 40(SP), CX
  3863	CMPQ R12, CX
  3864	JG   error_match_off_too_big
  3865	CMPQ R12, 56(SP)
  3866	JG   error_match_off_too_big
  3867
  3868	// Copy match from history
  3869	MOVQ R12, CX
  3870	SUBQ R11, CX
  3871	JLS  copy_match
  3872	MOVQ 48(SP), R14
  3873	SUBQ CX, R14
  3874	CMPQ R13, CX
  3875	JG   copy_all_from_history
  3876	MOVQ R13, CX
  3877	SUBQ $0x10, CX
  3878	JB   copy_4_small
  3879
  3880copy_4_loop:
  3881	MOVUPS (R14), X0
  3882	MOVUPS X0, (R9)
  3883	ADDQ   $0x10, R14
  3884	ADDQ   $0x10, R9
  3885	SUBQ   $0x10, CX
  3886	JAE    copy_4_loop
  3887	LEAQ   16(R14)(CX*1), R14
  3888	LEAQ   16(R9)(CX*1), R9
  3889	MOVUPS -16(R14), X0
  3890	MOVUPS X0, -16(R9)
  3891	JMP    copy_4_end
  3892
  3893copy_4_small:
  3894	CMPQ R13, $0x03
  3895	JE   copy_4_move_3
  3896	CMPQ R13, $0x08
  3897	JB   copy_4_move_4through7
  3898	JMP  copy_4_move_8through16
  3899
  3900copy_4_move_3:
  3901	MOVW (R14), CX
  3902	MOVB 2(R14), R12
  3903	MOVW CX, (R9)
  3904	MOVB R12, 2(R9)
  3905	ADDQ R13, R14
  3906	ADDQ R13, R9
  3907	JMP  copy_4_end
  3908
  3909copy_4_move_4through7:
  3910	MOVL (R14), CX
  3911	MOVL -4(R14)(R13*1), R12
  3912	MOVL CX, (R9)
  3913	MOVL R12, -4(R9)(R13*1)
  3914	ADDQ R13, R14
  3915	ADDQ R13, R9
  3916	JMP  copy_4_end
  3917
  3918copy_4_move_8through16:
  3919	MOVQ (R14), CX
  3920	MOVQ -8(R14)(R13*1), R12
  3921	MOVQ CX, (R9)
  3922	MOVQ R12, -8(R9)(R13*1)
  3923	ADDQ R13, R14
  3924	ADDQ R13, R9
  3925
  3926copy_4_end:
  3927	ADDQ R13, R11
  3928	JMP  handle_loop
  3929	JMP loop_finished
  3930
  3931copy_all_from_history:
  3932	MOVQ CX, R15
  3933	SUBQ $0x10, R15
  3934	JB   copy_5_small
  3935
  3936copy_5_loop:
  3937	MOVUPS (R14), X0
  3938	MOVUPS X0, (R9)
  3939	ADDQ   $0x10, R14
  3940	ADDQ   $0x10, R9
  3941	SUBQ   $0x10, R15
  3942	JAE    copy_5_loop
  3943	LEAQ   16(R14)(R15*1), R14
  3944	LEAQ   16(R9)(R15*1), R9
  3945	MOVUPS -16(R14), X0
  3946	MOVUPS X0, -16(R9)
  3947	JMP    copy_5_end
  3948
  3949copy_5_small:
  3950	CMPQ CX, $0x03
  3951	JE   copy_5_move_3
  3952	JB   copy_5_move_1or2
  3953	CMPQ CX, $0x08
  3954	JB   copy_5_move_4through7
  3955	JMP  copy_5_move_8through16
  3956
  3957copy_5_move_1or2:
  3958	MOVB (R14), R15
  3959	MOVB -1(R14)(CX*1), BP
  3960	MOVB R15, (R9)
  3961	MOVB BP, -1(R9)(CX*1)
  3962	ADDQ CX, R14
  3963	ADDQ CX, R9
  3964	JMP  copy_5_end
  3965
  3966copy_5_move_3:
  3967	MOVW (R14), R15
  3968	MOVB 2(R14), BP
  3969	MOVW R15, (R9)
  3970	MOVB BP, 2(R9)
  3971	ADDQ CX, R14
  3972	ADDQ CX, R9
  3973	JMP  copy_5_end
  3974
  3975copy_5_move_4through7:
  3976	MOVL (R14), R15
  3977	MOVL -4(R14)(CX*1), BP
  3978	MOVL R15, (R9)
  3979	MOVL BP, -4(R9)(CX*1)
  3980	ADDQ CX, R14
  3981	ADDQ CX, R9
  3982	JMP  copy_5_end
  3983
  3984copy_5_move_8through16:
  3985	MOVQ (R14), R15
  3986	MOVQ -8(R14)(CX*1), BP
  3987	MOVQ R15, (R9)
  3988	MOVQ BP, -8(R9)(CX*1)
  3989	ADDQ CX, R14
  3990	ADDQ CX, R9
  3991
  3992copy_5_end:
  3993	ADDQ CX, R11
  3994	SUBQ CX, R13
  3995
  3996	// Copy match from the current buffer
  3997copy_match:
  3998	MOVQ R9, CX
  3999	SUBQ R12, CX
  4000
  4001	// ml <= mo
  4002	CMPQ R13, R12
  4003	JA   copy_overlapping_match
  4004
  4005	// Copy non-overlapping match
  4006	ADDQ R13, R11
  4007	MOVQ R13, R12
  4008	SUBQ $0x10, R12
  4009	JB   copy_2_small
  4010
  4011copy_2_loop:
  4012	MOVUPS (CX), X0
  4013	MOVUPS X0, (R9)
  4014	ADDQ   $0x10, CX
  4015	ADDQ   $0x10, R9
  4016	SUBQ   $0x10, R12
  4017	JAE    copy_2_loop
  4018	LEAQ   16(CX)(R12*1), CX
  4019	LEAQ   16(R9)(R12*1), R9
  4020	MOVUPS -16(CX), X0
  4021	MOVUPS X0, -16(R9)
  4022	JMP    copy_2_end
  4023
  4024copy_2_small:
  4025	CMPQ R13, $0x03
  4026	JE   copy_2_move_3
  4027	JB   copy_2_move_1or2
  4028	CMPQ R13, $0x08
  4029	JB   copy_2_move_4through7
  4030	JMP  copy_2_move_8through16
  4031
  4032copy_2_move_1or2:
  4033	MOVB (CX), R12
  4034	MOVB -1(CX)(R13*1), R14
  4035	MOVB R12, (R9)
  4036	MOVB R14, -1(R9)(R13*1)
  4037	ADDQ R13, CX
  4038	ADDQ R13, R9
  4039	JMP  copy_2_end
  4040
  4041copy_2_move_3:
  4042	MOVW (CX), R12
  4043	MOVB 2(CX), R14
  4044	MOVW R12, (R9)
  4045	MOVB R14, 2(R9)
  4046	ADDQ R13, CX
  4047	ADDQ R13, R9
  4048	JMP  copy_2_end
  4049
  4050copy_2_move_4through7:
  4051	MOVL (CX), R12
  4052	MOVL -4(CX)(R13*1), R14
  4053	MOVL R12, (R9)
  4054	MOVL R14, -4(R9)(R13*1)
  4055	ADDQ R13, CX
  4056	ADDQ R13, R9
  4057	JMP  copy_2_end
  4058
  4059copy_2_move_8through16:
  4060	MOVQ (CX), R12
  4061	MOVQ -8(CX)(R13*1), R14
  4062	MOVQ R12, (R9)
  4063	MOVQ R14, -8(R9)(R13*1)
  4064	ADDQ R13, CX
  4065	ADDQ R13, R9
  4066
  4067copy_2_end:
  4068	JMP handle_loop
  4069
  4070	// Copy overlapping match
  4071copy_overlapping_match:
  4072	ADDQ R13, R11
  4073
  4074copy_slow_3:
  4075	MOVB (CX), R12
  4076	MOVB R12, (R9)
  4077	INCQ CX
  4078	INCQ R9
  4079	DECQ R13
  4080	JNZ  copy_slow_3
  4081
  4082handle_loop:
  4083	MOVQ ctx+16(FP), CX
  4084	DECQ 96(CX)
  4085	JNS  sequenceDecs_decodeSync_safe_bmi2_main_loop
  4086
  4087loop_finished:
  4088	MOVQ br+8(FP), CX
  4089	MOVQ AX, 24(CX)
  4090	MOVB DL, 32(CX)
  4091	MOVQ BX, 8(CX)
  4092
  4093	// Update the context
  4094	MOVQ ctx+16(FP), AX
  4095	MOVQ R11, 136(AX)
  4096	MOVQ 144(AX), CX
  4097	SUBQ CX, R10
  4098	MOVQ R10, 168(AX)
  4099
  4100	// Return success
  4101	MOVQ $0x00000000, ret+24(FP)
  4102	RET
  4103
  4104	// Return with match length error
  4105sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
  4106	MOVQ 16(SP), AX
  4107	MOVQ ctx+16(FP), CX
  4108	MOVQ AX, 216(CX)
  4109	MOVQ $0x00000001, ret+24(FP)
  4110	RET
  4111
  4112	// Return with match too long error
  4113sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
  4114	MOVQ ctx+16(FP), AX
  4115	MOVQ 16(SP), CX
  4116	MOVQ CX, 216(AX)
  4117	MOVQ $0x00000002, ret+24(FP)
  4118	RET
  4119
  4120	// Return with match offset too long error
  4121error_match_off_too_big:
  4122	MOVQ ctx+16(FP), AX
  4123	MOVQ 8(SP), CX
  4124	MOVQ CX, 224(AX)
  4125	MOVQ R11, 136(AX)
  4126	MOVQ $0x00000003, ret+24(FP)
  4127	RET
  4128
  4129	// Return with not enough literals error
  4130error_not_enough_literals:
  4131	MOVQ ctx+16(FP), AX
  4132	MOVQ 24(SP), CX
  4133	MOVQ CX, 208(AX)
  4134	MOVQ $0x00000004, ret+24(FP)
  4135	RET
  4136
  4137	// Return with overread error
  4138error_overread:
  4139	MOVQ $0x00000006, ret+24(FP)
  4140	RET
  4141
  4142	// Return with not enough output space error
  4143error_not_enough_space:
  4144	MOVQ ctx+16(FP), AX
  4145	MOVQ 24(SP), CX
  4146	MOVQ CX, 208(AX)
  4147	MOVQ 16(SP), CX
  4148	MOVQ CX, 216(AX)
  4149	MOVQ R11, 136(AX)
  4150	MOVQ $0x00000005, ret+24(FP)
  4151	RET

View as plain text