decode_amd64.s

Documentation: github.com/golang/snappy

     1// Copyright 2016 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// +build !appengine
     6// +build gc
     7// +build !noasm
     8
     9#include "textflag.h"
    10
    11// The asm code generally follows the pure Go code in decode_other.go, except
    12// where marked with a "!!!".
    13
    14// func decode(dst, src []byte) int
    15//
    16// All local variables fit into registers. The non-zero stack size is only to
    17// spill registers and push args when issuing a CALL. The register allocation:
    18//	- AX	scratch
    19//	- BX	scratch
    20//	- CX	length or x
    21//	- DX	offset
    22//	- SI	&src[s]
    23//	- DI	&dst[d]
    24//	+ R8	dst_base
    25//	+ R9	dst_len
    26//	+ R10	dst_base + dst_len
    27//	+ R11	src_base
    28//	+ R12	src_len
    29//	+ R13	src_base + src_len
    30//	- R14	used by doCopy
    31//	- R15	used by doCopy
    32//
    33// The registers R8-R13 (marked with a "+") are set at the start of the
    34// function, and after a CALL returns, and are not otherwise modified.
    35//
    36// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
    37// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
    38TEXT ·decode(SB), NOSPLIT, $48-56
    39	// Initialize SI, DI and R8-R13.
    40	MOVQ dst_base+0(FP), R8
    41	MOVQ dst_len+8(FP), R9
    42	MOVQ R8, DI
    43	MOVQ R8, R10
    44	ADDQ R9, R10
    45	MOVQ src_base+24(FP), R11
    46	MOVQ src_len+32(FP), R12
    47	MOVQ R11, SI
    48	MOVQ R11, R13
    49	ADDQ R12, R13
    50
    51loop:
    52	// for s < len(src)
    53	CMPQ SI, R13
    54	JEQ  end
    55
    56	// CX = uint32(src[s])
    57	//
    58	// switch src[s] & 0x03
    59	MOVBLZX (SI), CX
    60	MOVL    CX, BX
    61	ANDL    $3, BX
    62	CMPL    BX, $1
    63	JAE     tagCopy
    64
    65	// ----------------------------------------
    66	// The code below handles literal tags.
    67
    68	// case tagLiteral:
    69	// x := uint32(src[s] >> 2)
    70	// switch
    71	SHRL $2, CX
    72	CMPL CX, $60
    73	JAE  tagLit60Plus
    74
    75	// case x < 60:
    76	// s++
    77	INCQ SI
    78
    79doLit:
    80	// This is the end of the inner "switch", when we have a literal tag.
    81	//
    82	// We assume that CX == x and x fits in a uint32, where x is the variable
    83	// used in the pure Go decode_other.go code.
    84
    85	// length = int(x) + 1
    86	//
    87	// Unlike the pure Go code, we don't need to check if length <= 0 because
    88	// CX can hold 64 bits, so the increment cannot overflow.
    89	INCQ CX
    90
    91	// Prepare to check if copying length bytes will run past the end of dst or
    92	// src.
    93	//
    94	// AX = len(dst) - d
    95	// BX = len(src) - s
    96	MOVQ R10, AX
    97	SUBQ DI, AX
    98	MOVQ R13, BX
    99	SUBQ SI, BX
   100
   101	// !!! Try a faster technique for short (16 or fewer bytes) copies.
   102	//
   103	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
   104	//   goto callMemmove // Fall back on calling runtime·memmove.
   105	// }
   106	//
   107	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
   108	// against 21 instead of 16, because it cannot assume that all of its input
   109	// is contiguous in memory and so it needs to leave enough source bytes to
   110	// read the next tag without refilling buffers, but Go's Decode assumes
   111	// contiguousness (the src argument is a []byte).
   112	CMPQ CX, $16
   113	JGT  callMemmove
   114	CMPQ AX, $16
   115	JLT  callMemmove
   116	CMPQ BX, $16
   117	JLT  callMemmove
   118
   119	// !!! Implement the copy from src to dst as a 16-byte load and store.
   120	// (Decode's documentation says that dst and src must not overlap.)
   121	//
   122	// This always copies 16 bytes, instead of only length bytes, but that's
   123	// OK. If the input is a valid Snappy encoding then subsequent iterations
   124	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
   125	// non-nil error), so the overrun will be ignored.
   126	//
   127	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
   128	// 16-byte loads and stores. This technique probably wouldn't be as
   129	// effective on architectures that are fussier about alignment.
   130	MOVOU 0(SI), X0
   131	MOVOU X0, 0(DI)
   132
   133	// d += length
   134	// s += length
   135	ADDQ CX, DI
   136	ADDQ CX, SI
   137	JMP  loop
   138
   139callMemmove:
   140	// if length > len(dst)-d || length > len(src)-s { etc }
   141	CMPQ CX, AX
   142	JGT  errCorrupt
   143	CMPQ CX, BX
   144	JGT  errCorrupt
   145
   146	// copy(dst[d:], src[s:s+length])
   147	//
   148	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
   149	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
   150	// three registers to the stack, to save local variables across the CALL.
   151	MOVQ DI, 0(SP)
   152	MOVQ SI, 8(SP)
   153	MOVQ CX, 16(SP)
   154	MOVQ DI, 24(SP)
   155	MOVQ SI, 32(SP)
   156	MOVQ CX, 40(SP)
   157	CALL runtime·memmove(SB)
   158
   159	// Restore local variables: unspill registers from the stack and
   160	// re-calculate R8-R13.
   161	MOVQ 24(SP), DI
   162	MOVQ 32(SP), SI
   163	MOVQ 40(SP), CX
   164	MOVQ dst_base+0(FP), R8
   165	MOVQ dst_len+8(FP), R9
   166	MOVQ R8, R10
   167	ADDQ R9, R10
   168	MOVQ src_base+24(FP), R11
   169	MOVQ src_len+32(FP), R12
   170	MOVQ R11, R13
   171	ADDQ R12, R13
   172
   173	// d += length
   174	// s += length
   175	ADDQ CX, DI
   176	ADDQ CX, SI
   177	JMP  loop
   178
   179tagLit60Plus:
   180	// !!! This fragment does the
   181	//
   182	// s += x - 58; if uint(s) > uint(len(src)) { etc }
   183	//
   184	// checks. In the asm version, we code it once instead of once per switch case.
   185	ADDQ CX, SI
   186	SUBQ $58, SI
   187	MOVQ SI, BX
   188	SUBQ R11, BX
   189	CMPQ BX, R12
   190	JA   errCorrupt
   191
   192	// case x == 60:
   193	CMPL CX, $61
   194	JEQ  tagLit61
   195	JA   tagLit62Plus
   196
   197	// x = uint32(src[s-1])
   198	MOVBLZX -1(SI), CX
   199	JMP     doLit
   200
   201tagLit61:
   202	// case x == 61:
   203	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
   204	MOVWLZX -2(SI), CX
   205	JMP     doLit
   206
   207tagLit62Plus:
   208	CMPL CX, $62
   209	JA   tagLit63
   210
   211	// case x == 62:
   212	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
   213	MOVWLZX -3(SI), CX
   214	MOVBLZX -1(SI), BX
   215	SHLL    $16, BX
   216	ORL     BX, CX
   217	JMP     doLit
   218
   219tagLit63:
   220	// case x == 63:
   221	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
   222	MOVL -4(SI), CX
   223	JMP  doLit
   224
   225// The code above handles literal tags.
   226// ----------------------------------------
   227// The code below handles copy tags.
   228
   229tagCopy4:
   230	// case tagCopy4:
   231	// s += 5
   232	ADDQ $5, SI
   233
   234	// if uint(s) > uint(len(src)) { etc }
   235	MOVQ SI, BX
   236	SUBQ R11, BX
   237	CMPQ BX, R12
   238	JA   errCorrupt
   239
   240	// length = 1 + int(src[s-5])>>2
   241	SHRQ $2, CX
   242	INCQ CX
   243
   244	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
   245	MOVLQZX -4(SI), DX
   246	JMP     doCopy
   247
   248tagCopy2:
   249	// case tagCopy2:
   250	// s += 3
   251	ADDQ $3, SI
   252
   253	// if uint(s) > uint(len(src)) { etc }
   254	MOVQ SI, BX
   255	SUBQ R11, BX
   256	CMPQ BX, R12
   257	JA   errCorrupt
   258
   259	// length = 1 + int(src[s-3])>>2
   260	SHRQ $2, CX
   261	INCQ CX
   262
   263	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
   264	MOVWQZX -2(SI), DX
   265	JMP     doCopy
   266
   267tagCopy:
   268	// We have a copy tag. We assume that:
   269	//	- BX == src[s] & 0x03
   270	//	- CX == src[s]
   271	CMPQ BX, $2
   272	JEQ  tagCopy2
   273	JA   tagCopy4
   274
   275	// case tagCopy1:
   276	// s += 2
   277	ADDQ $2, SI
   278
   279	// if uint(s) > uint(len(src)) { etc }
   280	MOVQ SI, BX
   281	SUBQ R11, BX
   282	CMPQ BX, R12
   283	JA   errCorrupt
   284
   285	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
   286	MOVQ    CX, DX
   287	ANDQ    $0xe0, DX
   288	SHLQ    $3, DX
   289	MOVBQZX -1(SI), BX
   290	ORQ     BX, DX
   291
   292	// length = 4 + int(src[s-2])>>2&0x7
   293	SHRQ $2, CX
   294	ANDQ $7, CX
   295	ADDQ $4, CX
   296
   297doCopy:
   298	// This is the end of the outer "switch", when we have a copy tag.
   299	//
   300	// We assume that:
   301	//	- CX == length && CX > 0
   302	//	- DX == offset
   303
   304	// if offset <= 0 { etc }
   305	CMPQ DX, $0
   306	JLE  errCorrupt
   307
   308	// if d < offset { etc }
   309	MOVQ DI, BX
   310	SUBQ R8, BX
   311	CMPQ BX, DX
   312	JLT  errCorrupt
   313
   314	// if length > len(dst)-d { etc }
   315	MOVQ R10, BX
   316	SUBQ DI, BX
   317	CMPQ CX, BX
   318	JGT  errCorrupt
   319
   320	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
   321	//
   322	// Set:
   323	//	- R14 = len(dst)-d
   324	//	- R15 = &dst[d-offset]
   325	MOVQ R10, R14
   326	SUBQ DI, R14
   327	MOVQ DI, R15
   328	SUBQ DX, R15
   329
   330	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
   331	//
   332	// First, try using two 8-byte load/stores, similar to the doLit technique
   333	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
   334	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
   335	// and not one 16-byte load/store, and the first store has to be before the
   336	// second load, due to the overlap if offset is in the range [8, 16).
   337	//
   338	// if length > 16 || offset < 8 || len(dst)-d < 16 {
   339	//   goto slowForwardCopy
   340	// }
   341	// copy 16 bytes
   342	// d += length
   343	CMPQ CX, $16
   344	JGT  slowForwardCopy
   345	CMPQ DX, $8
   346	JLT  slowForwardCopy
   347	CMPQ R14, $16
   348	JLT  slowForwardCopy
   349	MOVQ 0(R15), AX
   350	MOVQ AX, 0(DI)
   351	MOVQ 8(R15), BX
   352	MOVQ BX, 8(DI)
   353	ADDQ CX, DI
   354	JMP  loop
   355
   356slowForwardCopy:
   357	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
   358	// can still try 8-byte load stores, provided we can overrun up to 10 extra
   359	// bytes. As above, the overrun will be fixed up by subsequent iterations
   360	// of the outermost loop.
   361	//
   362	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
   363	// commentary says:
   364	//
   365	// ----
   366	//
   367	// The main part of this loop is a simple copy of eight bytes at a time
   368	// until we've copied (at least) the requested amount of bytes.  However,
   369	// if d and d-offset are less than eight bytes apart (indicating a
   370	// repeating pattern of length < 8), we first need to expand the pattern in
   371	// order to get the correct results. For instance, if the buffer looks like
   372	// this, with the eight-byte <d-offset> and <d> patterns marked as
   373	// intervals:
   374	//
   375	//    abxxxxxxxxxxxx
   376	//    [------]           d-offset
   377	//      [------]         d
   378	//
   379	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
   380	// once, after which we can move <d> two bytes without moving <d-offset>:
   381	//
   382	//    ababxxxxxxxxxx
   383	//    [------]           d-offset
   384	//        [------]       d
   385	//
   386	// and repeat the exercise until the two no longer overlap.
   387	//
   388	// This allows us to do very well in the special case of one single byte
   389	// repeated many times, without taking a big hit for more general cases.
   390	//
   391	// The worst case of extra writing past the end of the match occurs when
   392	// offset == 1 and length == 1; the last copy will read from byte positions
   393	// [0..7] and write to [4..11], whereas it was only supposed to write to
   394	// position 1. Thus, ten excess bytes.
   395	//
   396	// ----
   397	//
   398	// That "10 byte overrun" worst case is confirmed by Go's
   399	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
   400	// and finishSlowForwardCopy algorithm.
   401	//
   402	// if length > len(dst)-d-10 {
   403	//   goto verySlowForwardCopy
   404	// }
   405	SUBQ $10, R14
   406	CMPQ CX, R14
   407	JGT  verySlowForwardCopy
   408
   409makeOffsetAtLeast8:
   410	// !!! As above, expand the pattern so that offset >= 8 and we can use
   411	// 8-byte load/stores.
   412	//
   413	// for offset < 8 {
   414	//   copy 8 bytes from dst[d-offset:] to dst[d:]
   415	//   length -= offset
   416	//   d      += offset
   417	//   offset += offset
   418	//   // The two previous lines together means that d-offset, and therefore
   419	//   // R15, is unchanged.
   420	// }
   421	CMPQ DX, $8
   422	JGE  fixUpSlowForwardCopy
   423	MOVQ (R15), BX
   424	MOVQ BX, (DI)
   425	SUBQ DX, CX
   426	ADDQ DX, DI
   427	ADDQ DX, DX
   428	JMP  makeOffsetAtLeast8
   429
   430fixUpSlowForwardCopy:
   431	// !!! Add length (which might be negative now) to d (implied by DI being
   432	// &dst[d]) so that d ends up at the right place when we jump back to the
   433	// top of the loop. Before we do that, though, we save DI to AX so that, if
   434	// length is positive, copying the remaining length bytes will write to the
   435	// right place.
   436	MOVQ DI, AX
   437	ADDQ CX, DI
   438
   439finishSlowForwardCopy:
   440	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
   441	// length means that we overrun, but as above, that will be fixed up by
   442	// subsequent iterations of the outermost loop.
   443	CMPQ CX, $0
   444	JLE  loop
   445	MOVQ (R15), BX
   446	MOVQ BX, (AX)
   447	ADDQ $8, R15
   448	ADDQ $8, AX
   449	SUBQ $8, CX
   450	JMP  finishSlowForwardCopy
   451
   452verySlowForwardCopy:
   453	// verySlowForwardCopy is a simple implementation of forward copy. In C
   454	// parlance, this is a do/while loop instead of a while loop, since we know
   455	// that length > 0. In Go syntax:
   456	//
   457	// for {
   458	//   dst[d] = dst[d - offset]
   459	//   d++
   460	//   length--
   461	//   if length == 0 {
   462	//     break
   463	//   }
   464	// }
   465	MOVB (R15), BX
   466	MOVB BX, (DI)
   467	INCQ R15
   468	INCQ DI
   469	DECQ CX
   470	JNZ  verySlowForwardCopy
   471	JMP  loop
   472
   473// The code above handles copy tags.
   474// ----------------------------------------
   475
   476end:
   477	// This is the end of the "for s < len(src)".
   478	//
   479	// if d != len(dst) { etc }
   480	CMPQ DI, R10
   481	JNE  errCorrupt
   482
   483	// return 0
   484	MOVQ $0, ret+48(FP)
   485	RET
   486
   487errCorrupt:
   488	// return decodeErrCodeCorrupt
   489	MOVQ $1, ret+48(FP)
   490	RET
View as plain text