...

Text file src/github.com/klauspost/compress/huff0/decompress_amd64.s

Documentation: github.com/klauspost/compress/huff0

     1// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
     2
     3//go:build amd64 && !appengine && !noasm && gc
     4
     5// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
      //
      // Runs the four-stream decode loop. Each iteration refills each of the four
      // bit readers when needed, decodes two symbols from each reader and stores
      // them with one 16-bit write per stream. The iteration that starts with the
      // output pointer at or past 48(ctx), or in which a refill leaves a reader's
      // offset below 4, is the last one. On return 40(ctx) holds four times the
      // number of bytes the output pointer advanced.
      //
      // Register roles (field names are taken from the generated comments; the Go
      // struct definitions are not visible in this file):
      //   DI  = byte at 8(ctx), used as the right-shift count for peeking (peekBits)
      //   BX  = 16(ctx), current output pointer
      //   SI  = 48(ctx), output pointer limit
      //   R8  = 24(ctx), distance in bytes between the stream outputs (dstEvery)
      //   R9  = 32(ctx), base of the table of 16-bit entries
      //   R10 = (ctx), base of the four bit readers, laid out 48 bytes apart:
      //         +0 input base, +24 offset, +32 bit container, +40 bitsRead (byte)
      //   DL  = exit flag, non-zero when the loop must stop
     6TEXT ·decompress4x_main_loop_amd64(SB), $0-8
     7	// Preload values
     8	MOVQ    ctx+0(FP), AX
     9	MOVBQZX 8(AX), DI
    10	MOVQ    16(AX), BX
    11	MOVQ    48(AX), SI
    12	MOVQ    24(AX), R8
    13	MOVQ    32(AX), R9
    14	MOVQ    (AX), R10
    15
    16	// Main loop
      	// DL = 1 when the output pointer has reached the limit (signed compare),
      	// otherwise 0; the refill blocks below add 1 more when a reader runs low.
    17main_loop:
    18	XORL  DX, DX
    19	CMPQ  BX, SI
    20	SETGE DL
    21
    22	// br0.fillFast32()
      	// The refill is skipped unless bitsRead > 32 (unsigned compare).
    23	MOVQ    32(R10), R11
    24	MOVBQZX 40(R10), R12
    25	CMPQ    R12, $0x20
    26	JBE     skip_fill0
    27	MOVQ    24(R10), AX
    28	SUBQ    $0x20, R12
    29	SUBQ    $0x04, AX
    30	MOVQ    (R10), R13
    31
    32	// b.value |= uint64(low) << (b.bitsRead & 63)
    33	MOVL (AX)(R13*1), R13
    34	MOVQ R12, CX
    35	SHLQ CL, R13
    36	MOVQ AX, 24(R10)
    37	ORQ  R13, R11
    38
    39	// exhausted += (br0.off < 4)
      	// CMPQ sets the carry flag when the offset is below 4; ADCB adds it to DL.
    40	CMPQ AX, $0x04
    41	ADCB $+0, DL
    42
    43skip_fill0:
    44	// val0 := br0.peekTopBits(peekBits)
    45	MOVQ R11, R13
    46	MOVQ DI, CX
    47	SHRQ CL, R13
    48
    49	// v0 := table[val0&mask]
    50	MOVW (R9)(R13*2), CX
    51
    52	// br0.advance(uint8(v0.entry))
      	// CL (low byte of the entry) is the bit count, CH (high byte) the symbol.
    53	MOVB CH, AL
    54	SHLQ CL, R11
    55	ADDB CL, R12
    56
    57	// val1 := br0.peekTopBits(peekBits)
    58	MOVQ DI, CX
    59	MOVQ R11, R13
    60	SHRQ CL, R13
    61
    62	// v1 := table[val1&mask]
    63	MOVW (R9)(R13*2), CX
    64
    65	// br0.advance(uint8(v1.entry))
    66	MOVB CH, AH
    67	SHLQ CL, R11
    68	ADDB CL, R12
    69
    70	// these two writes get coalesced
    71	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
    72	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
    73	MOVW AX, (BX)
    74
    75	// update the bitreader structure
    76	MOVQ R11, 32(R10)
    77	MOVB R12, 40(R10)
    78
    79	// br1.fillFast32()
    80	MOVQ    80(R10), R11
    81	MOVBQZX 88(R10), R12
    82	CMPQ    R12, $0x20
    83	JBE     skip_fill1
    84	MOVQ    72(R10), AX
    85	SUBQ    $0x20, R12
    86	SUBQ    $0x04, AX
    87	MOVQ    48(R10), R13
    88
    89	// b.value |= uint64(low) << (b.bitsRead & 63)
    90	MOVL (AX)(R13*1), R13
    91	MOVQ R12, CX
    92	SHLQ CL, R13
    93	MOVQ AX, 72(R10)
    94	ORQ  R13, R11
    95
    96	// exhausted += (br1.off < 4)
    97	CMPQ AX, $0x04
    98	ADCB $+0, DL
    99
   100skip_fill1:
   101	// val0 := br1.peekTopBits(peekBits)
   102	MOVQ R11, R13
   103	MOVQ DI, CX
   104	SHRQ CL, R13
   105
   106	// v0 := table[val0&mask]
   107	MOVW (R9)(R13*2), CX
   108
   109	// br1.advance(uint8(v0.entry))
   110	MOVB CH, AL
   111	SHLQ CL, R11
   112	ADDB CL, R12
   113
   114	// val1 := br1.peekTopBits(peekBits)
   115	MOVQ DI, CX
   116	MOVQ R11, R13
   117	SHRQ CL, R13
   118
   119	// v1 := table[val1&mask]
   120	MOVW (R9)(R13*2), CX
   121
   122	// br1.advance(uint8(v1.entry))
   123	MOVB CH, AH
   124	SHLQ CL, R11
   125	ADDB CL, R12
   126
   127	// these two writes get coalesced
   128	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   129	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   130	MOVW AX, (BX)(R8*1)
   131
   132	// update the bitreader structure
   133	MOVQ R11, 80(R10)
   134	MOVB R12, 88(R10)
   135
   136	// br2.fillFast32()
   137	MOVQ    128(R10), R11
   138	MOVBQZX 136(R10), R12
   139	CMPQ    R12, $0x20
   140	JBE     skip_fill2
   141	MOVQ    120(R10), AX
   142	SUBQ    $0x20, R12
   143	SUBQ    $0x04, AX
   144	MOVQ    96(R10), R13
   145
   146	// b.value |= uint64(low) << (b.bitsRead & 63)
   147	MOVL (AX)(R13*1), R13
   148	MOVQ R12, CX
   149	SHLQ CL, R13
   150	MOVQ AX, 120(R10)
   151	ORQ  R13, R11
   152
   153	// exhausted += (br2.off < 4)
   154	CMPQ AX, $0x04
   155	ADCB $+0, DL
   156
   157skip_fill2:
   158	// val0 := br2.peekTopBits(peekBits)
   159	MOVQ R11, R13
   160	MOVQ DI, CX
   161	SHRQ CL, R13
   162
   163	// v0 := table[val0&mask]
   164	MOVW (R9)(R13*2), CX
   165
   166	// br2.advance(uint8(v0.entry))
   167	MOVB CH, AL
   168	SHLQ CL, R11
   169	ADDB CL, R12
   170
   171	// val1 := br2.peekTopBits(peekBits)
   172	MOVQ DI, CX
   173	MOVQ R11, R13
   174	SHRQ CL, R13
   175
   176	// v1 := table[val1&mask]
   177	MOVW (R9)(R13*2), CX
   178
   179	// br2.advance(uint8(v1.entry))
   180	MOVB CH, AH
   181	SHLQ CL, R11
   182	ADDB CL, R12
   183
   184	// these two writes get coalesced
   185	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   186	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   187	MOVW AX, (BX)(R8*2)
   188
   189	// update the bitreader structure
   190	MOVQ R11, 128(R10)
   191	MOVB R12, 136(R10)
   192
   193	// br3.fillFast32()
   194	MOVQ    176(R10), R11
   195	MOVBQZX 184(R10), R12
   196	CMPQ    R12, $0x20
   197	JBE     skip_fill3
   198	MOVQ    168(R10), AX
   199	SUBQ    $0x20, R12
   200	SUBQ    $0x04, AX
   201	MOVQ    144(R10), R13
   202
   203	// b.value |= uint64(low) << (b.bitsRead & 63)
   204	MOVL (AX)(R13*1), R13
   205	MOVQ R12, CX
   206	SHLQ CL, R13
   207	MOVQ AX, 168(R10)
   208	ORQ  R13, R11
   209
   210	// exhausted += (br3.off < 4)
   211	CMPQ AX, $0x04
   212	ADCB $+0, DL
   213
   214skip_fill3:
   215	// val0 := br3.peekTopBits(peekBits)
   216	MOVQ R11, R13
   217	MOVQ DI, CX
   218	SHRQ CL, R13
   219
   220	// v0 := table[val0&mask]
   221	MOVW (R9)(R13*2), CX
   222
   223	// br3.advance(uint8(v0.entry))
   224	MOVB CH, AL
   225	SHLQ CL, R11
   226	ADDB CL, R12
   227
   228	// val1 := br3.peekTopBits(peekBits)
   229	MOVQ DI, CX
   230	MOVQ R11, R13
   231	SHRQ CL, R13
   232
   233	// v1 := table[val1&mask]
   234	MOVW (R9)(R13*2), CX
   235
   236	// br3.advance(uint8(v1.entry))
   237	MOVB CH, AH
   238	SHLQ CL, R11
   239	ADDB CL, R12
   240
   241	// these two writes get coalesced
   242	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   243	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
      	// CX = 3 * R8, the byte offset of the fourth stream's output.
   244	LEAQ (R8)(R8*2), CX
   245	MOVW AX, (BX)(CX*1)
   246
   247	// update the bitreader structure
      	// Then advance the output pointer by the two bytes written per stream and
      	// repeat while the exit flag is still zero. After the loop, 40(ctx)
      	// receives 4 * (output pointer - 16(ctx)).
   248	MOVQ  R11, 176(R10)
   249	MOVB  R12, 184(R10)
   250	ADDQ  $0x02, BX
   251	TESTB DL, DL
   252	JZ    main_loop
   253	MOVQ  ctx+0(FP), AX
   254	SUBQ  16(AX), BX
   255	SHLQ  $0x02, BX
   256	MOVQ  BX, 40(AX)
   257	RET
   258
   259// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
      //
      // Runs the four-stream decode loop, decoding four symbols per reader per
      // iteration and storing them with one 32-bit write per stream. Each reader
      // is refilled at most once per iteration, before its four symbols are
      // decoded. The iteration that starts with the output pointer at or past
      // 48(ctx), or in which a refill leaves a reader's offset below 4, is the
      // last one. On return 40(ctx) holds four times the number of bytes the
      // output pointer advanced.
      //
      // Register roles (field names are taken from the generated comments; the Go
      // struct definitions are not visible in this file):
      //   DI  = byte at 8(ctx), used as the right-shift count for peeking (peekBits)
      //   BX  = 16(ctx), current output pointer
      //   SI  = 48(ctx), output pointer limit
      //   R8  = 24(ctx), distance in bytes between the stream outputs (dstEvery)
      //   R9  = 32(ctx), base of the table of 16-bit entries
      //   R10 = (ctx), base of the four bit readers, laid out 48 bytes apart:
      //         +0 input base, +24 offset, +32 bit container, +40 bitsRead (byte)
      //   DL  = exit flag, non-zero when the loop must stop
      //
      // Symbol packing: v0 goes to AL and v1 to AH, then BSWAPL moves them to the
      // top two bytes of EAX; v2 goes to AH and v3 to AL, and the second BSWAPL
      // leaves v0, v1, v2, v3 in ascending byte order for the 32-bit store.
   260TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
   261	// Preload values
   262	MOVQ    ctx+0(FP), CX
   263	MOVBQZX 8(CX), DI
   264	MOVQ    16(CX), BX
   265	MOVQ    48(CX), SI
   266	MOVQ    24(CX), R8
   267	MOVQ    32(CX), R9
   268	MOVQ    (CX), R10
   269
   270	// Main loop
      	// DL = 1 when the output pointer has reached the limit (signed compare),
      	// otherwise 0; the refill blocks below add 1 more when a reader runs low.
   271main_loop:
   272	XORL  DX, DX
   273	CMPQ  BX, SI
   274	SETGE DL
   275
   276	// br0.fillFast32()
      	// The refill is skipped unless bitsRead > 32 (unsigned compare).
   277	MOVQ    32(R10), R11
   278	MOVBQZX 40(R10), R12
   279	CMPQ    R12, $0x20
   280	JBE     skip_fill0
   281	MOVQ    24(R10), R13
   282	SUBQ    $0x20, R12
   283	SUBQ    $0x04, R13
   284	MOVQ    (R10), R14
   285
   286	// b.value |= uint64(low) << (b.bitsRead & 63)
   287	MOVL (R13)(R14*1), R14
   288	MOVQ R12, CX
   289	SHLQ CL, R14
   290	MOVQ R13, 24(R10)
   291	ORQ  R14, R11
   292
   293	// exhausted += (br0.off < 4)
      	// CMPQ sets the carry flag when the offset is below 4; ADCB adds it to DL.
   294	CMPQ R13, $0x04
   295	ADCB $+0, DL
   296
   297skip_fill0:
   298	// val0 := br0.peekTopBits(peekBits)
   299	MOVQ R11, R13
   300	MOVQ DI, CX
   301	SHRQ CL, R13
   302
   303	// v0 := table[val0&mask]
   304	MOVW (R9)(R13*2), CX
   305
   306	// br0.advance(uint8(v0.entry))
      	// CL (low byte of the entry) is the bit count, CH (high byte) the symbol.
   307	MOVB CH, AL
   308	SHLQ CL, R11
   309	ADDB CL, R12
   310
   311	// val1 := br0.peekTopBits(peekBits)
   312	MOVQ R11, R13
   313	MOVQ DI, CX
   314	SHRQ CL, R13
   315
   316	// v1 := table[val1&mask]
   317	MOVW (R9)(R13*2), CX
   318
   319	// br0.advance(uint8(v1.entry))
   320	MOVB   CH, AH
   321	SHLQ   CL, R11
   322	ADDB   CL, R12
   323	BSWAPL AX
   324
   325	// val2 := br0.peekTopBits(peekBits)
   326	MOVQ R11, R13
   327	MOVQ DI, CX
   328	SHRQ CL, R13
   329
   330	// v2 := table[val2&mask]
   331	MOVW (R9)(R13*2), CX
   332
   333	// br0.advance(uint8(v2.entry))
   334	MOVB CH, AH
   335	SHLQ CL, R11
   336	ADDB CL, R12
   337
   338	// val3 := br0.peekTopBits(peekBits)
   339	MOVQ R11, R13
   340	MOVQ DI, CX
   341	SHRQ CL, R13
   342
   343	// v3 := table[val3&mask]
   344	MOVW (R9)(R13*2), CX
   345
   346	// br0.advance(uint8(v3.entry))
   347	MOVB   CH, AL
   348	SHLQ   CL, R11
   349	ADDB   CL, R12
   350	BSWAPL AX
   351
   352	// these four writes get coalesced
   353	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   354	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   355	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
   356	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
   357	MOVL AX, (BX)
   358
   359	// update the bitreader structure
   360	MOVQ R11, 32(R10)
   361	MOVB R12, 40(R10)
   362
   363	// br1.fillFast32()
   364	MOVQ    80(R10), R11
   365	MOVBQZX 88(R10), R12
   366	CMPQ    R12, $0x20
   367	JBE     skip_fill1
   368	MOVQ    72(R10), R13
   369	SUBQ    $0x20, R12
   370	SUBQ    $0x04, R13
   371	MOVQ    48(R10), R14
   372
   373	// b.value |= uint64(low) << (b.bitsRead & 63)
   374	MOVL (R13)(R14*1), R14
   375	MOVQ R12, CX
   376	SHLQ CL, R14
   377	MOVQ R13, 72(R10)
   378	ORQ  R14, R11
   379
   380	// exhausted += (br1.off < 4)
   381	CMPQ R13, $0x04
   382	ADCB $+0, DL
   383
   384skip_fill1:
   385	// val0 := br1.peekTopBits(peekBits)
   386	MOVQ R11, R13
   387	MOVQ DI, CX
   388	SHRQ CL, R13
   389
   390	// v0 := table[val0&mask]
   391	MOVW (R9)(R13*2), CX
   392
   393	// br1.advance(uint8(v0.entry))
   394	MOVB CH, AL
   395	SHLQ CL, R11
   396	ADDB CL, R12
   397
   398	// val1 := br1.peekTopBits(peekBits)
   399	MOVQ R11, R13
   400	MOVQ DI, CX
   401	SHRQ CL, R13
   402
   403	// v1 := table[val1&mask]
   404	MOVW (R9)(R13*2), CX
   405
   406	// br1.advance(uint8(v1.entry))
   407	MOVB   CH, AH
   408	SHLQ   CL, R11
   409	ADDB   CL, R12
   410	BSWAPL AX
   411
   412	// val2 := br1.peekTopBits(peekBits)
   413	MOVQ R11, R13
   414	MOVQ DI, CX
   415	SHRQ CL, R13
   416
   417	// v2 := table[val2&mask]
   418	MOVW (R9)(R13*2), CX
   419
   420	// br1.advance(uint8(v2.entry))
   421	MOVB CH, AH
   422	SHLQ CL, R11
   423	ADDB CL, R12
   424
   425	// val3 := br1.peekTopBits(peekBits)
   426	MOVQ R11, R13
   427	MOVQ DI, CX
   428	SHRQ CL, R13
   429
   430	// v3 := table[val3&mask]
   431	MOVW (R9)(R13*2), CX
   432
   433	// br1.advance(uint8(v3.entry))
   434	MOVB   CH, AL
   435	SHLQ   CL, R11
   436	ADDB   CL, R12
   437	BSWAPL AX
   438
   439	// these four writes get coalesced
   440	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   441	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   442	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
   443	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
   444	MOVL AX, (BX)(R8*1)
   445
   446	// update the bitreader structure
   447	MOVQ R11, 80(R10)
   448	MOVB R12, 88(R10)
   449
   450	// br2.fillFast32()
   451	MOVQ    128(R10), R11
   452	MOVBQZX 136(R10), R12
   453	CMPQ    R12, $0x20
   454	JBE     skip_fill2
   455	MOVQ    120(R10), R13
   456	SUBQ    $0x20, R12
   457	SUBQ    $0x04, R13
   458	MOVQ    96(R10), R14
   459
   460	// b.value |= uint64(low) << (b.bitsRead & 63)
   461	MOVL (R13)(R14*1), R14
   462	MOVQ R12, CX
   463	SHLQ CL, R14
   464	MOVQ R13, 120(R10)
   465	ORQ  R14, R11
   466
   467	// exhausted += (br2.off < 4)
   468	CMPQ R13, $0x04
   469	ADCB $+0, DL
   470
   471skip_fill2:
   472	// val0 := br2.peekTopBits(peekBits)
   473	MOVQ R11, R13
   474	MOVQ DI, CX
   475	SHRQ CL, R13
   476
   477	// v0 := table[val0&mask]
   478	MOVW (R9)(R13*2), CX
   479
   480	// br2.advance(uint8(v0.entry))
   481	MOVB CH, AL
   482	SHLQ CL, R11
   483	ADDB CL, R12
   484
   485	// val1 := br2.peekTopBits(peekBits)
   486	MOVQ R11, R13
   487	MOVQ DI, CX
   488	SHRQ CL, R13
   489
   490	// v1 := table[val1&mask]
   491	MOVW (R9)(R13*2), CX
   492
   493	// br2.advance(uint8(v1.entry))
   494	MOVB   CH, AH
   495	SHLQ   CL, R11
   496	ADDB   CL, R12
   497	BSWAPL AX
   498
   499	// val2 := br2.peekTopBits(peekBits)
   500	MOVQ R11, R13
   501	MOVQ DI, CX
   502	SHRQ CL, R13
   503
   504	// v2 := table[val2&mask]
   505	MOVW (R9)(R13*2), CX
   506
   507	// br2.advance(uint8(v2.entry))
   508	MOVB CH, AH
   509	SHLQ CL, R11
   510	ADDB CL, R12
   511
   512	// val3 := br2.peekTopBits(peekBits)
   513	MOVQ R11, R13
   514	MOVQ DI, CX
   515	SHRQ CL, R13
   516
   517	// v3 := table[val3&mask]
   518	MOVW (R9)(R13*2), CX
   519
   520	// br2.advance(uint8(v3.entry))
   521	MOVB   CH, AL
   522	SHLQ   CL, R11
   523	ADDB   CL, R12
   524	BSWAPL AX
   525
   526	// these four writes get coalesced
   527	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   528	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   529	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
   530	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
   531	MOVL AX, (BX)(R8*2)
   532
   533	// update the bitreader structure
   534	MOVQ R11, 128(R10)
   535	MOVB R12, 136(R10)
   536
   537	// br3.fillFast32()
   538	MOVQ    176(R10), R11
   539	MOVBQZX 184(R10), R12
   540	CMPQ    R12, $0x20
   541	JBE     skip_fill3
   542	MOVQ    168(R10), R13
   543	SUBQ    $0x20, R12
   544	SUBQ    $0x04, R13
   545	MOVQ    144(R10), R14
   546
   547	// b.value |= uint64(low) << (b.bitsRead & 63)
   548	MOVL (R13)(R14*1), R14
   549	MOVQ R12, CX
   550	SHLQ CL, R14
   551	MOVQ R13, 168(R10)
   552	ORQ  R14, R11
   553
   554	// exhausted += (br3.off < 4)
   555	CMPQ R13, $0x04
   556	ADCB $+0, DL
   557
   558skip_fill3:
   559	// val0 := br3.peekTopBits(peekBits)
   560	MOVQ R11, R13
   561	MOVQ DI, CX
   562	SHRQ CL, R13
   563
   564	// v0 := table[val0&mask]
   565	MOVW (R9)(R13*2), CX
   566
   567	// br3.advance(uint8(v0.entry))
   568	MOVB CH, AL
   569	SHLQ CL, R11
   570	ADDB CL, R12
   571
   572	// val1 := br3.peekTopBits(peekBits)
   573	MOVQ R11, R13
   574	MOVQ DI, CX
   575	SHRQ CL, R13
   576
   577	// v1 := table[val1&mask]
   578	MOVW (R9)(R13*2), CX
   579
   580	// br3.advance(uint8(v1.entry))
   581	MOVB   CH, AH
   582	SHLQ   CL, R11
   583	ADDB   CL, R12
   584	BSWAPL AX
   585
   586	// val2 := br3.peekTopBits(peekBits)
   587	MOVQ R11, R13
   588	MOVQ DI, CX
   589	SHRQ CL, R13
   590
   591	// v2 := table[val2&mask]
   592	MOVW (R9)(R13*2), CX
   593
   594	// br3.advance(uint8(v2.entry))
   595	MOVB CH, AH
   596	SHLQ CL, R11
   597	ADDB CL, R12
   598
   599	// val3 := br3.peekTopBits(peekBits)
   600	MOVQ R11, R13
   601	MOVQ DI, CX
   602	SHRQ CL, R13
   603
   604	// v3 := table[val3&mask]
   605	MOVW (R9)(R13*2), CX
   606
   607	// br3.advance(uint8(v3.entry))
   608	MOVB   CH, AL
   609	SHLQ   CL, R11
   610	ADDB   CL, R12
   611	BSWAPL AX
   612
   613	// these four writes get coalesced
   614	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
   615	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
   616	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
   617	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
      	// CX = 3 * R8, the byte offset of the fourth stream's output.
   618	LEAQ (R8)(R8*2), CX
   619	MOVL AX, (BX)(CX*1)
   620
   621	// update the bitreader structure
      	// Then advance the output pointer by the four bytes written per stream and
      	// repeat while the exit flag is still zero. After the loop, 40(ctx)
      	// receives 4 * (output pointer - 16(ctx)).
   622	MOVQ  R11, 176(R10)
   623	MOVB  R12, 184(R10)
   624	ADDQ  $0x04, BX
   625	TESTB DL, DL
   626	JZ    main_loop
   627	MOVQ  ctx+0(FP), AX
   628	SUBQ  16(AX), BX
   629	SHLQ  $0x02, BX
   630	MOVQ  BX, 40(AX)
   631	RET
   632
   633// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
      //
      // Decodes a single bit stream four symbols per iteration into the buffer at
      // 16(ctx), whose capacity in bytes is 24(ctx). The loop runs while the bit
      // reader's offset is at least 8 (signed compare).
      //
      // On normal exit 40(ctx) receives the number of bytes written and the bit
      // reader's offset, bit container and bit count are stored back. If the
      // capacity is below 4, or an iteration starts with output pointer + 4 at or
      // past the end of the buffer, 40(ctx) is set to -1 and the bit reader is
      // not updated; bytes stored by earlier iterations remain in the buffer.
      //
      // Register roles (the Go struct definitions are not visible in this file;
      // the names peekBits and bitsRead follow the comments in the 4x routines):
      //   DX  = 16(ctx), current output pointer
      //   BX  = output pointer + 24(ctx), end of the output buffer
      //   R8  = input base, read from offset 0 of the bit reader at (ctx)
      //   R9  = bit reader offset (reader+24)
      //   R10 = bit container (reader+32)
      //   R11 = byte at reader+40, presumably bitsRead
      //   SI  = bit reader pointer during setup, then 32(ctx), the table base
      //   DI  = byte at 8(ctx), right-shift count for peeking (peekBits)
   634TEXT ·decompress1x_main_loop_amd64(SB), $0-8
   635	MOVQ    ctx+0(FP), CX
   636	MOVQ    16(CX), DX
   637	MOVQ    24(CX), BX
   638	CMPQ    BX, $0x04
   639	JB      error_max_decoded_size_exceeded
   640	LEAQ    (DX)(BX*1), BX
   641	MOVQ    (CX), SI
   642	MOVQ    (SI), R8
   643	MOVQ    24(SI), R9
   644	MOVQ    32(SI), R10
   645	MOVBQZX 40(SI), R11
   646	MOVQ    32(CX), SI
   647	MOVBQZX 8(CX), DI
   648	JMP     loop_condition
   649
   650main_loop:
   651	// Check if we have room for 4 bytes in the output buffer
      	// (the error is taken when output pointer + 4 >= end, signed compare)
   652	LEAQ 4(DX), CX
   653	CMPQ CX, BX
   654	JGE  error_max_decoded_size_exceeded
   655
   656	// Decode 4 values
      	// Refill first when R11 >= 32: step the offset back by 4 bytes, load 32
      	// bits from the input and OR them into the container shifted left by the
      	// reduced bit count.
   657	CMPQ R11, $0x20
   658	JL   bitReader_fillFast_1_end
   659	SUBQ $0x20, R11
   660	SUBQ $0x04, R9
   661	MOVL (R8)(R9*1), R12
   662	MOVQ R11, CX
   663	SHLQ CL, R12
   664	ORQ  R12, R10
   665
   666bitReader_fillFast_1_end:
      	// For each symbol: index = container >> peekBits, load the 16-bit table
      	// entry, take the symbol from its high byte (CH) and consume the number of
      	// bits given by its low byte (CL). The first two symbols go to AL and AH,
      	// and BSWAPL moves them to the top two bytes of EAX.
   667	MOVQ    DI, CX
   668	MOVQ    R10, R12
   669	SHRQ    CL, R12
   670	MOVW    (SI)(R12*2), CX
   671	MOVB    CH, AL
   672	MOVBQZX CL, CX
   673	ADDQ    CX, R11
   674	SHLQ    CL, R10
   675	MOVQ    DI, CX
   676	MOVQ    R10, R12
   677	SHRQ    CL, R12
   678	MOVW    (SI)(R12*2), CX
   679	MOVB    CH, AH
   680	MOVBQZX CL, CX
   681	ADDQ    CX, R11
   682	SHLQ    CL, R10
   683	BSWAPL  AX
   684	CMPQ    R11, $0x20
   685	JL      bitReader_fillFast_2_end
   686	SUBQ    $0x20, R11
   687	SUBQ    $0x04, R9
   688	MOVL    (R8)(R9*1), R12
   689	MOVQ    R11, CX
   690	SHLQ    CL, R12
   691	ORQ     R12, R10
   692
   693bitReader_fillFast_2_end:
      	// The third symbol goes to AH and the fourth to AL; the second BSWAPL
      	// leaves the four symbols in decode order, lowest byte first.
   694	MOVQ    DI, CX
   695	MOVQ    R10, R12
   696	SHRQ    CL, R12
   697	MOVW    (SI)(R12*2), CX
   698	MOVB    CH, AH
   699	MOVBQZX CL, CX
   700	ADDQ    CX, R11
   701	SHLQ    CL, R10
   702	MOVQ    DI, CX
   703	MOVQ    R10, R12
   704	SHRQ    CL, R12
   705	MOVW    (SI)(R12*2), CX
   706	MOVB    CH, AL
   707	MOVBQZX CL, CX
   708	ADDQ    CX, R11
   709	SHLQ    CL, R10
   710	BSWAPL  AX
   711
   712	// Store the decoded values
   713	MOVL AX, (DX)
   714	ADDQ $0x04, DX
   715
   716loop_condition:
      	// Keep decoding while the bit reader offset is at least 8.
   717	CMPQ R9, $0x08
   718	JGE  main_loop
   719
   720	// Update ctx structure
      	// 40(ctx) = bytes written; then store the offset, bit container and bit
      	// count back into the bit reader at (ctx).
   721	MOVQ ctx+0(FP), AX
   722	SUBQ 16(AX), DX
   723	MOVQ DX, 40(AX)
   724	MOVQ (AX), AX
   725	MOVQ R9, 24(AX)
   726	MOVQ R10, 32(AX)
   727	MOVB R11, 40(AX)
   728	RET
   729
   730	// Report error
      	// 40(ctx) = -1; the bit reader state is not written back on this path.
   731error_max_decoded_size_exceeded:
   732	MOVQ ctx+0(FP), AX
   733	MOVQ $-1, CX
   734	MOVQ CX, 40(AX)
   735	RET
   736
   737// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
   738// Requires: BMI2
      //
      // Decodes a single bit stream four symbols per iteration into the buffer at
      // 16(ctx), whose capacity in bytes is 24(ctx). Shifts use SHLXQ/SHRXQ, which
      // take the shift count from any register, so CX is free to hold the loaded
      // word or table entry. The loop runs while the bit reader's offset is at
      // least 8 (signed compare).
      //
      // On normal exit 40(ctx) receives the number of bytes written and the bit
      // reader's offset, bit container and bit count are stored back. If the
      // capacity is below 4, or an iteration starts with output pointer + 4 at or
      // past the end of the buffer, 40(ctx) is set to -1 and the bit reader is
      // not updated; bytes stored by earlier iterations remain in the buffer.
      //
      // Register roles (the Go struct definitions are not visible in this file;
      // the names peekBits and bitsRead follow the comments in the 4x routines):
      //   DX  = 16(ctx), current output pointer
      //   BX  = output pointer + 24(ctx), end of the output buffer
      //   R8  = input base, read from offset 0 of the bit reader at (ctx)
      //   R9  = bit reader offset (reader+24)
      //   R10 = bit container (reader+32)
      //   R11 = byte at reader+40, presumably bitsRead
      //   SI  = bit reader pointer during setup, then 32(ctx), the table base
      //   DI  = byte at 8(ctx), right-shift count for peeking (peekBits)
   739TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
   740	MOVQ    ctx+0(FP), CX
   741	MOVQ    16(CX), DX
   742	MOVQ    24(CX), BX
   743	CMPQ    BX, $0x04
   744	JB      error_max_decoded_size_exceeded
   745	LEAQ    (DX)(BX*1), BX
   746	MOVQ    (CX), SI
   747	MOVQ    (SI), R8
   748	MOVQ    24(SI), R9
   749	MOVQ    32(SI), R10
   750	MOVBQZX 40(SI), R11
   751	MOVQ    32(CX), SI
   752	MOVBQZX 8(CX), DI
   753	JMP     loop_condition
   754
   755main_loop:
   756	// Check if we have room for 4 bytes in the output buffer
      	// (the error is taken when output pointer + 4 >= end, signed compare)
   757	LEAQ 4(DX), CX
   758	CMPQ CX, BX
   759	JGE  error_max_decoded_size_exceeded
   760
   761	// Decode 4 values
      	// Refill first when R11 >= 32: step the offset back by 4 bytes, load 32
      	// bits from the input and OR them into the container shifted left by the
      	// reduced bit count.
   762	CMPQ  R11, $0x20
   763	JL    bitReader_fillFast_1_end
   764	SUBQ  $0x20, R11
   765	SUBQ  $0x04, R9
   766	MOVL  (R8)(R9*1), CX
   767	SHLXQ R11, CX, CX
   768	ORQ   CX, R10
   769
   770bitReader_fillFast_1_end:
      	// For each symbol: index = container >> peekBits, load the 16-bit table
      	// entry, take the symbol from its high byte (CH) and consume the number of
      	// bits given by its low byte (CL). The first two symbols go to AL and AH,
      	// and BSWAPL moves them to the top two bytes of EAX.
   771	SHRXQ   DI, R10, CX
   772	MOVW    (SI)(CX*2), CX
   773	MOVB    CH, AL
   774	MOVBQZX CL, CX
   775	ADDQ    CX, R11
   776	SHLXQ   CX, R10, R10
   777	SHRXQ   DI, R10, CX
   778	MOVW    (SI)(CX*2), CX
   779	MOVB    CH, AH
   780	MOVBQZX CL, CX
   781	ADDQ    CX, R11
   782	SHLXQ   CX, R10, R10
   783	BSWAPL  AX
   784	CMPQ    R11, $0x20
   785	JL      bitReader_fillFast_2_end
   786	SUBQ    $0x20, R11
   787	SUBQ    $0x04, R9
   788	MOVL    (R8)(R9*1), CX
   789	SHLXQ   R11, CX, CX
   790	ORQ     CX, R10
   791
   792bitReader_fillFast_2_end:
      	// The third symbol goes to AH and the fourth to AL; the second BSWAPL
      	// leaves the four symbols in decode order, lowest byte first.
   793	SHRXQ   DI, R10, CX
   794	MOVW    (SI)(CX*2), CX
   795	MOVB    CH, AH
   796	MOVBQZX CL, CX
   797	ADDQ    CX, R11
   798	SHLXQ   CX, R10, R10
   799	SHRXQ   DI, R10, CX
   800	MOVW    (SI)(CX*2), CX
   801	MOVB    CH, AL
   802	MOVBQZX CL, CX
   803	ADDQ    CX, R11
   804	SHLXQ   CX, R10, R10
   805	BSWAPL  AX
   806
   807	// Store the decoded values
   808	MOVL AX, (DX)
   809	ADDQ $0x04, DX
   810
   811loop_condition:
      	// Keep decoding while the bit reader offset is at least 8.
   812	CMPQ R9, $0x08
   813	JGE  main_loop
   814
   815	// Update ctx structure
      	// 40(ctx) = bytes written; then store the offset, bit container and bit
      	// count back into the bit reader at (ctx).
   816	MOVQ ctx+0(FP), AX
   817	SUBQ 16(AX), DX
   818	MOVQ DX, 40(AX)
   819	MOVQ (AX), AX
   820	MOVQ R9, 24(AX)
   821	MOVQ R10, 32(AX)
   822	MOVB R11, 40(AX)
   823	RET
   824
   825	// Report error
      	// 40(ctx) = -1; the bit reader state is not written back on this path.
   826error_max_decoded_size_exceeded:
   827	MOVQ ctx+0(FP), AX
   828	MOVQ $-1, CX
   829	MOVQ CX, 40(AX)
   830	RET

View as plain text