
Text file src/github.com/klauspost/compress/s2/decode_amd64.s

Documentation: github.com/klauspost/compress/s2

     1// Copyright 2016 The Go Authors. All rights reserved.
     2// Copyright (c) 2019 Klaus Post. All rights reserved.
     3// Use of this source code is governed by a BSD-style
     4// license that can be found in the LICENSE file.
     5
     6// +build !appengine
     7// +build gc
     8// +build !noasm
     9
    10#include "textflag.h"
    11
    12#define R_TMP0 AX
    13#define R_TMP1 BX
    14#define R_LEN CX
    15#define R_OFF DX
    16#define R_SRC SI
    17#define R_DST DI
    18#define R_DBASE R8
    19#define R_DLEN R9
    20#define R_DEND R10
    21#define R_SBASE R11
    22#define R_SLEN R12
    23#define R_SEND R13
    24#define R_TMP2 R14
    25#define R_TMP3 R15
    26
    27// The asm code generally follows the pure Go code in decode_other.go, except
    28// where marked with a "!!!".
    29
    30// func s2Decode(dst, src []byte) int
    31//
    32// All local variables fit into registers. The non-zero stack size is only to
    33// spill registers and push args when issuing a CALL. The register allocation:
    34//	- R_TMP0	scratch
    35//	- R_TMP1	scratch
    36//	- R_LEN	    length or x (shared)
    37//	- R_OFF	    offset
    38//	- R_SRC	    &src[s]
    39//	- R_DST	    &dst[d]
    40//	+ R_DBASE	dst_base
    41//	+ R_DLEN	dst_len
    42//	+ R_DEND	dst_base + dst_len
    43//	+ R_SBASE	src_base
    44//	+ R_SLEN	src_len
    45//	+ R_SEND	src_base + src_len
    46//	- R_TMP2	used by doCopy
    47//	- R_TMP3	used by doCopy
    48//
    49// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
    50// function, and after a CALL returns, and are not otherwise modified.
    51//
    52// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
    53// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
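//
// For orientation, the pure Go code this assembly follows has roughly this
// shape (a simplified sketch, not the exact decode_other.go source):
//
//	for s < len(src) {
//		switch src[s] & 0x03 {
//		case tagLiteral:
//			// Decode the literal length, then copy that many bytes
//			// from src[s:] to dst[d:].
//		default: // tagCopy1, tagCopy2, tagCopy4
//			// Decode offset and length (or a repeat code), then copy
//			// length bytes from dst[d-offset:] to dst[d:].
//		}
//	}
//	if d != len(dst) {
//		return decodeErrCodeCorrupt
//	}
//	return 0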
    54TEXT ·s2Decode(SB), NOSPLIT, $48-56
    55	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
    56	MOVQ dst_base+0(FP), R_DBASE
    57	MOVQ dst_len+8(FP), R_DLEN
    58	MOVQ R_DBASE, R_DST
    59	MOVQ R_DBASE, R_DEND
    60	ADDQ R_DLEN, R_DEND
    61	MOVQ src_base+24(FP), R_SBASE
    62	MOVQ src_len+32(FP), R_SLEN
    63	MOVQ R_SBASE, R_SRC
    64	MOVQ R_SBASE, R_SEND
    65	ADDQ R_SLEN, R_SEND
    66	XORQ R_OFF, R_OFF
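	// R_OFF also serves as the "last offset" reused by repeat codes; starting
	// it at zero means a repeat tag seen before any regular copy is rejected
	// by the offset <= 0 check in doCopyRepeat.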
    67
    68loop:
    69	// for s < len(src)
    70	CMPQ R_SRC, R_SEND
    71	JEQ  end
    72
    73	// R_LEN = uint32(src[s])
    74	//
    75	// switch src[s] & 0x03
    76	MOVBLZX (R_SRC), R_LEN
    77	MOVL    R_LEN, R_TMP1
    78	ANDL    $3, R_TMP1
    79	CMPL    R_TMP1, $1
    80	JAE     tagCopy
    81
    82	// ----------------------------------------
    83	// The code below handles literal tags.
    84
    85	// case tagLiteral:
    86	// x := uint32(src[s] >> 2)
    87	// switch
    88	SHRL $2, R_LEN
    89	CMPL R_LEN, $60
    90	JAE  tagLit60Plus
    91
    92	// case x < 60:
    93	// s++
    94	INCQ R_SRC
    95
    96doLit:
    97	// This is the end of the inner "switch", when we have a literal tag.
    98	//
    99	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
   100	// used in the pure Go decode_other.go code.
   101
   102	// length = int(x) + 1
   103	//
   104	// Unlike the pure Go code, we don't need to check if length <= 0 because
   105	// R_LEN can hold 64 bits, so the increment cannot overflow.
   106	INCQ R_LEN
   107
   108	// Prepare to check if copying length bytes will run past the end of dst or
   109	// src.
   110	//
   111	// R_TMP0 = len(dst) - d
   112	// R_TMP1 = len(src) - s
   113	MOVQ R_DEND, R_TMP0
   114	SUBQ R_DST, R_TMP0
   115	MOVQ R_SEND, R_TMP1
   116	SUBQ R_SRC, R_TMP1
   117
   118	// !!! Try a faster technique for short (16 or fewer bytes) copies.
   119	//
   120	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
   121	//   goto callMemmove // Fall back on calling runtime·memmove.
   122	// }
   123	//
   124	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
   125	// against 21 instead of 16, because it cannot assume that all of its input
   126	// is contiguous in memory and so it needs to leave enough source bytes to
   127	// read the next tag without refilling buffers, but Go's Decode assumes
   128	// contiguousness (the src argument is a []byte).
   129	CMPQ R_LEN, $16
   130	JGT  callMemmove
   131	CMPQ R_TMP0, $16
   132	JLT  callMemmove
   133	CMPQ R_TMP1, $16
   134	JLT  callMemmove
   135
   136	// !!! Implement the copy from src to dst as a 16-byte load and store.
   137	// (Decode's documentation says that dst and src must not overlap.)
   138	//
   139	// This always copies 16 bytes, instead of only length bytes, but that's
   140	// OK. If the input is a valid Snappy encoding then subsequent iterations
   141	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
   142	// non-nil error), so the overrun will be ignored.
   143	//
   144	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
   145	// 16-byte loads and stores. This technique probably wouldn't be as
   146	// effective on architectures that are fussier about alignment.
   147	MOVOU 0(R_SRC), X0
   148	MOVOU X0, 0(R_DST)
   149
   150	// d += length
   151	// s += length
   152	ADDQ R_LEN, R_DST
   153	ADDQ R_LEN, R_SRC
   154	JMP  loop
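
	// A rough Go equivalent of the fast path above (a sketch; up to 16-length
	// extra bytes are written, which, as explained above, is harmless):
	//
	//	if length <= 16 && len(dst)-d >= 16 && len(src)-s >= 16 {
	//		copy(dst[d:d+16], src[s:s+16])
	//		d += length
	//		s += length
	//		continue
	//	}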
   155
   156callMemmove:
   157	// if length > len(dst)-d || length > len(src)-s { etc }
   158	CMPQ R_LEN, R_TMP0
   159	JGT  errCorrupt
   160	CMPQ R_LEN, R_TMP1
   161	JGT  errCorrupt
   162
   163	// copy(dst[d:], src[s:s+length])
   164	//
   165	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
   166	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
   167	// three registers to the stack, to save local variables across the CALL.
   168	MOVQ R_DST, 0(SP)
   169	MOVQ R_SRC, 8(SP)
   170	MOVQ R_LEN, 16(SP)
   171	MOVQ R_DST, 24(SP)
   172	MOVQ R_SRC, 32(SP)
   173	MOVQ R_LEN, 40(SP)
   174	MOVQ R_OFF, 48(SP)
   175	CALL runtime·memmove(SB)
   176
   177	// Restore local variables: unspill registers from the stack and
   178	// re-calculate R_DBASE-R_SEND.
   179	MOVQ 24(SP), R_DST
   180	MOVQ 32(SP), R_SRC
   181	MOVQ 40(SP), R_LEN
   182	MOVQ 48(SP), R_OFF
   183	MOVQ dst_base+0(FP), R_DBASE
   184	MOVQ dst_len+8(FP), R_DLEN
   185	MOVQ R_DBASE, R_DEND
   186	ADDQ R_DLEN, R_DEND
   187	MOVQ src_base+24(FP), R_SBASE
   188	MOVQ src_len+32(FP), R_SLEN
   189	MOVQ R_SBASE, R_SEND
   190	ADDQ R_SLEN, R_SEND
   191
   192	// d += length
   193	// s += length
   194	ADDQ R_LEN, R_DST
   195	ADDQ R_LEN, R_SRC
   196	JMP  loop
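
	// For reference, the function called above is declared in the runtime as
	//
	//	func memmove(to, from unsafe.Pointer, n uintptr)
	//
	// and the CALL goes through the stack-based assembly calling convention
	// (ABI0), which may clobber every general purpose register; that is why
	// the local registers are unspilled and R_DBASE-R_SEND re-derived above.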
   197
   198tagLit60Plus:
   199	// !!! This fragment does the
   200	//
   201	// s += x - 58; if uint(s) > uint(len(src)) { etc }
   202	//
   203	// checks. In the asm version, we code it once instead of once per switch case.
   204	ADDQ R_LEN, R_SRC
   205	SUBQ $58, R_SRC
   206	CMPQ R_SRC, R_SEND
   207	JA   errCorrupt
   208
   209	// case x == 60:
   210	CMPL R_LEN, $61
   211	JEQ  tagLit61
   212	JA   tagLit62Plus
   213
   214	// x = uint32(src[s-1])
   215	MOVBLZX -1(R_SRC), R_LEN
   216	JMP     doLit
   217
   218tagLit61:
   219	// case x == 61:
   220	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
   221	MOVWLZX -2(R_SRC), R_LEN
   222	JMP     doLit
   223
   224tagLit62Plus:
   225	CMPL R_LEN, $62
   226	JA   tagLit63
   227
   228	// case x == 62:
   229	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
   230	// We read 4 bytes, one byte further back than needed; the extra byte is just the tag and is shifted out below.
   231	// x = binary.LittleEndian.Uint32(src[s-4:]) >> 8
   232	MOVL -4(R_SRC), R_LEN
   233	SHRL $8, R_LEN
   234	JMP  doLit
   235
   236tagLit63:
   237	// case x == 63:
   238	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
   239	MOVL -4(R_SRC), R_LEN
   240	JMP  doLit
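
	// Taken together, the tagLit60Plus cases decode the extended literal
	// lengths roughly as follows (a sketch; in each case s has already been
	// advanced by x - 58, i.e. past the tag and its extra bytes):
	//
	//	switch x {
	//	case 60:
	//		x = uint32(src[s-1])
	//	case 61:
	//		x = uint32(src[s-2]) | uint32(src[s-1])<<8
	//	case 62:
	//		x = binary.LittleEndian.Uint32(src[s-4:]) >> 8
	//	case 63:
	//		x = binary.LittleEndian.Uint32(src[s-4:])
	//	}
	//	// followed by doLit: length = int(x) + 1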
   241
   242// The code above handles literal tags.
   243// ----------------------------------------
   244// The code below handles copy tags.
   245
   246tagCopy4:
   247	// case tagCopy4:
   248	// s += 5
   249	ADDQ $5, R_SRC
   250
   251	// if uint(s) > uint(len(src)) { etc }
   252	CMPQ R_SRC, R_SEND
   253	JA   errCorrupt
   254
   255	// length = 1 + int(src[s-5])>>2
   256	SHRQ $2, R_LEN
   257	INCQ R_LEN
   258
   259	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
   260	MOVLQZX -4(R_SRC), R_OFF
   261	JMP     doCopy
   262
   263tagCopy2:
   264	// case tagCopy2:
   265	// s += 3
   266	ADDQ $3, R_SRC
   267
   268	// if uint(s) > uint(len(src)) { etc }
   269	CMPQ R_SRC, R_SEND
   270	JA   errCorrupt
   271
   272	// length = 1 + int(src[s-3])>>2
   273	SHRQ $2, R_LEN
   274	INCQ R_LEN
   275
   276	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
   277	MOVWQZX -2(R_SRC), R_OFF
   278	JMP     doCopy
   279
   280tagCopy:
   281	// We have a copy tag. We assume that:
   282	//	- R_TMP1 == src[s] & 0x03
   283	//	- R_LEN == src[s]
   284	CMPQ R_TMP1, $2
   285	JEQ  tagCopy2
   286	JA   tagCopy4
   287
   288	// case tagCopy1:
   289	// s += 2
   290	ADDQ $2, R_SRC
   291
   292	// if uint(s) > uint(len(src)) { etc }
   293	CMPQ R_SRC, R_SEND
   294	JA   errCorrupt
   295
   296	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
   297	// length = 4 + int(src[s-2])>>2&0x7
   298	MOVBQZX -1(R_SRC), R_TMP1
   299	MOVQ    R_LEN, R_TMP0
   300	SHRQ    $2, R_LEN
   301	ANDQ    $0xe0, R_TMP0
   302	ANDQ    $7, R_LEN
   303	SHLQ    $3, R_TMP0
   304	ADDQ    $4, R_LEN
   305	ORQ     R_TMP1, R_TMP0
   306
   307	// Check whether this is a repeat code (assembled offset == 0); ZF was set by the ORQ above.
   308	JZ repeatCode
   309
   310	// This is a regular copy; transfer the assembled value to R_OFF (the offset).
   311	MOVQ R_TMP0, R_OFF
   312	JMP  doCopy
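
	// In Go terms, the dispatch above is roughly (a sketch):
	//
	//	if offset == 0 {
	//		// Repeat code: reuse the offset of the previous copy; the
	//		// length (currently 4-11) may be extended below.
	//		goto repeatCode
	//	}
	//	goto doCopy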
   313
   314// This is a repeat code.
   315repeatCode:
   316	// If length < 9, reuse last offset, with the length already calculated.
   317	CMPQ R_LEN, $9
   318	JL   doCopyRepeat
   319
   320	// Read additional bytes for length.
   321	JE repeatLen1
   322
   323	// Rare, so the extra branch shouldn't hurt too much.
   324	CMPQ R_LEN, $10
   325	JE   repeatLen2
   326	JMP  repeatLen3
   327
   328// Read repeat lengths.
   329repeatLen1:
   330	// s++
   331	ADDQ $1, R_SRC
   332
   333	// if uint(s) > uint(len(src)) { etc }
   334	CMPQ R_SRC, R_SEND
   335	JA   errCorrupt
   336
   337	// length = src[s-1] + 8
   338	MOVBQZX -1(R_SRC), R_LEN
   339	ADDL    $8, R_LEN
   340	JMP     doCopyRepeat
   341
   342repeatLen2:
   343	// s += 2
   344	ADDQ $2, R_SRC
   345
   346	// if uint(s) > uint(len(src)) { etc }
   347	CMPQ R_SRC, R_SEND
   348	JA   errCorrupt
   349
   350	// length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
   351	MOVWQZX -2(R_SRC), R_LEN
   352	ADDL    $260, R_LEN
   353	JMP     doCopyRepeat
   354
   355repeatLen3:
   356	// s += 3
   357	ADDQ $3, R_SRC
   358
   359	// if uint(s) > uint(len(src)) { etc }
   360	CMPQ R_SRC, R_SEND
   361	JA   errCorrupt
   362
   363	// length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
   364	// Read one byte further back (just part of the tag, shifted out)
   365	MOVL -4(R_SRC), R_LEN
   366	SHRL $8, R_LEN
   367	ADDL $65540, R_LEN
   368	JMP  doCopyRepeat
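
	// Taken together, the repeat length extensions above are, in Go (a
	// sketch; the +4 from the copy-1 tag is folded into the constants, and s
	// has already been advanced by 1, 2 or 3 in the respective case):
	//
	//	switch length { // 4 + the 3-bit length field from the tag
	//	case 9:
	//		length = int(src[s-1]) + 8
	//	case 10:
	//		length = int(uint32(src[s-2])|uint32(src[s-1])<<8) + 260
	//	case 11:
	//		length = int(uint32(src[s-3])|uint32(src[s-2])<<8|uint32(src[s-1])<<16) + 65540
	//	}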
   369
   370doCopy:
   371	// This is the end of the outer "switch", when we have a copy tag.
   372	//
   373	// We assume that:
   374	//	- R_LEN == length && R_LEN > 0
   375	//	- R_OFF == offset
   376
   377	// if d < offset { etc }
   378	MOVQ R_DST, R_TMP1
   379	SUBQ R_DBASE, R_TMP1
   380	CMPQ R_TMP1, R_OFF
   381	JLT  errCorrupt
   382
   383	// Repeat copies can skip the test above: the last offset was already validated against d when it was first used, and d only grows.
   384doCopyRepeat:
   385	// if offset <= 0 { etc }
   386	CMPQ R_OFF, $0
   387	JLE  errCorrupt
   388
   389	// if length > len(dst)-d { etc }
   390	MOVQ R_DEND, R_TMP1
   391	SUBQ R_DST, R_TMP1
   392	CMPQ R_LEN, R_TMP1
   393	JGT  errCorrupt
   394
   395	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
   396	//
   397	// Set:
   398	//	- R_TMP2 = len(dst)-d
   399	//	- R_TMP3 = &dst[d-offset]
   400	MOVQ R_DEND, R_TMP2
   401	SUBQ R_DST, R_TMP2
   402	MOVQ R_DST, R_TMP3
   403	SUBQ R_OFF, R_TMP3
   404
   405	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
   406	//
   407	// First, try using two 8-byte load/stores, similar to the doLit technique
   408	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
   409	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
   410	// and not one 16-byte load/store, and the first store has to be before the
   411	// second load, due to the overlap if offset is in the range [8, 16).
   412	//
   413	// if length > 16 || offset < 8 || len(dst)-d < 16 {
   414	//   goto slowForwardCopy
   415	// }
   416	// copy 16 bytes
   417	// d += length
   418	CMPQ R_LEN, $16
   419	JGT  slowForwardCopy
   420	CMPQ R_OFF, $8
   421	JLT  slowForwardCopy
   422	CMPQ R_TMP2, $16
   423	JLT  slowForwardCopy
   424	MOVQ 0(R_TMP3), R_TMP0
   425	MOVQ R_TMP0, 0(R_DST)
   426	MOVQ 8(R_TMP3), R_TMP1
   427	MOVQ R_TMP1, 8(R_DST)
   428	ADDQ R_LEN, R_DST
   429	JMP  loop
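
	// A rough Go equivalent of the fast path above (a sketch; the first
	// 8-byte copy must complete before the second, so that when offset is in
	// [8, 16) the second copy re-reads bytes just written, repeating the
	// pattern correctly):
	//
	//	if length <= 16 && offset >= 8 && len(dst)-d >= 16 {
	//		copy(dst[d:d+8], dst[d-offset:d-offset+8])
	//		copy(dst[d+8:d+16], dst[d-offset+8:d-offset+16])
	//		d += length
	//		continue
	//	}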
   430
   431slowForwardCopy:
   432	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
   433	// can still try 8-byte load stores, provided we can overrun up to 10 extra
   434	// bytes. As above, the overrun will be fixed up by subsequent iterations
   435	// of the outermost loop.
   436	//
   437	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
   438	// commentary says:
   439	//
   440	// ----
   441	//
   442	// The main part of this loop is a simple copy of eight bytes at a time
   443	// until we've copied (at least) the requested amount of bytes.  However,
   444	// if d and d-offset are less than eight bytes apart (indicating a
   445	// repeating pattern of length < 8), we first need to expand the pattern in
   446	// order to get the correct results. For instance, if the buffer looks like
   447	// this, with the eight-byte <d-offset> and <d> patterns marked as
   448	// intervals:
   449	//
   450	//    abxxxxxxxxxxxx
   451	//    [------]           d-offset
   452	//      [------]         d
   453	//
   454	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
   455	// once, after which we can move <d> two bytes without moving <d-offset>:
   456	//
   457	//    ababxxxxxxxxxx
   458	//    [------]           d-offset
   459	//        [------]       d
   460	//
   461	// and repeat the exercise until the two no longer overlap.
   462	//
   463	// This allows us to do very well in the special case of one single byte
   464	// repeated many times, without taking a big hit for more general cases.
   465	//
   466	// The worst case of extra writing past the end of the match occurs when
   467	// offset == 1 and length == 1; the last copy will read from byte positions
   468	// [0..7] and write to [4..11], whereas it was only supposed to write to
   469	// position 1. Thus, ten excess bytes.
   470	//
   471	// ----
   472	//
   473	// That "10 byte overrun" worst case is confirmed by Go's
   474	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
   475	// and finishSlowForwardCopy algorithm.
   476	//
   477	// if length > len(dst)-d-10 {
   478	//   goto verySlowForwardCopy
   479	// }
   480	SUBQ $10, R_TMP2
   481	CMPQ R_LEN, R_TMP2
   482	JGT  verySlowForwardCopy
   483
   484	// We want to keep the offset, so we use R_TMP2 from here.
   485	MOVQ R_OFF, R_TMP2
   486
   487makeOffsetAtLeast8:
   488	// !!! As above, expand the pattern so that offset >= 8 and we can use
   489	// 8-byte load/stores.
   490	//
   491	// for offset < 8 {
   492	//   copy 8 bytes from dst[d-offset:] to dst[d:]
   493	//   length -= offset
   494	//   d      += offset
   495	//   offset += offset
   496	//   // The two previous lines together means that d-offset, and therefore
   497	//   // R_TMP3, is unchanged.
   498	// }
   499	CMPQ R_TMP2, $8
   500	JGE  fixUpSlowForwardCopy
   501	MOVQ (R_TMP3), R_TMP1
   502	MOVQ R_TMP1, (R_DST)
   503	SUBQ R_TMP2, R_LEN
   504	ADDQ R_TMP2, R_DST
   505	ADDQ R_TMP2, R_TMP2
   506	JMP  makeOffsetAtLeast8
   507
   508fixUpSlowForwardCopy:
   509	// !!! Add length (which might be negative now) to d (implied by R_DST being
   510	// &dst[d]) so that d ends up at the right place when we jump back to the
   511	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
   512	// length is positive, copying the remaining length bytes will write to the
   513	// right place.
   514	MOVQ R_DST, R_TMP0
   515	ADDQ R_LEN, R_DST
   516
   517finishSlowForwardCopy:
   518	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
   519	// length means that we overrun, but as above, that will be fixed up by
   520	// subsequent iterations of the outermost loop.
   521	CMPQ R_LEN, $0
   522	JLE  loop
   523	MOVQ (R_TMP3), R_TMP1
   524	MOVQ R_TMP1, (R_TMP0)
   525	ADDQ $8, R_TMP3
   526	ADDQ $8, R_TMP0
   527	SUBQ $8, R_LEN
   528	JMP  finishSlowForwardCopy
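
	// As Go, makeOffsetAtLeast8, fixUpSlowForwardCopy and
	// finishSlowForwardCopy together amount to roughly (a sketch; d-offset is
	// invariant in the first loop, and the second loop may overrun by up to
	// 10 bytes as discussed above):
	//
	//	for offset < 8 {
	//		copy(dst[d:d+8], dst[d-offset:d-offset+8]) // repeats the pattern
	//		length -= offset
	//		d += offset
	//		offset += offset
	//	}
	//	for i := 0; i < length; i += 8 {
	//		copy(dst[d+i:d+i+8], dst[d-offset+i:d-offset+i+8])
	//	}
	//	d += length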
   529
   530verySlowForwardCopy:
   531	// verySlowForwardCopy is a simple implementation of forward copy. In C
   532	// parlance, this is a do/while loop instead of a while loop, since we know
   533	// that length > 0. In Go syntax:
   534	//
   535	// for {
   536	//   dst[d] = dst[d - offset]
   537	//   d++
   538	//   length--
   539	//   if length == 0 {
   540	//     break
   541	//   }
   542	// }
   543	MOVB (R_TMP3), R_TMP1
   544	MOVB R_TMP1, (R_DST)
   545	INCQ R_TMP3
   546	INCQ R_DST
   547	DECQ R_LEN
   548	JNZ  verySlowForwardCopy
   549	JMP  loop
   550
   551// The code above handles copy tags.
   552// ----------------------------------------
   553
   554end:
   555	// This is the end of the "for s < len(src)".
   556	//
   557	// if d != len(dst) { etc }
   558	CMPQ R_DST, R_DEND
   559	JNE  errCorrupt
   560
   561	// return 0
   562	MOVQ $0, ret+48(FP)
   563	RET
   564
   565errCorrupt:
   566	// return decodeErrCodeCorrupt
   567	MOVQ $1, ret+48(FP)
   568	RET
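
// On the Go side, the return value is consumed roughly like this (a
// simplified sketch of the caller; the real Decode also reads the decoded
// length header and sizes dst before calling in here):
//
//	if s2Decode(dst, src) != 0 {
//		return nil, ErrCorrupt
//	}
//	return dst, nil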
