decode_arm64.s

Documentation: github.com/klauspost/compress/s2

     1// Copyright 2020 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// +build !appengine
     6// +build gc
     7// +build !noasm
     8
     9#include "textflag.h"
    10
    11#define R_TMP0 R2
    12#define R_TMP1 R3
    13#define R_LEN R4
    14#define R_OFF R5
    15#define R_SRC R6
    16#define R_DST R7
    17#define R_DBASE R8
    18#define R_DLEN R9
    19#define R_DEND R10
    20#define R_SBASE R11
    21#define R_SLEN R12
    22#define R_SEND R13
    23#define R_TMP2 R14
    24#define R_TMP3 R15
    25
    26// TEST_SRC will check if R_SRC is <= SRC_END
    27#define TEST_SRC() \
    28	CMP R_SEND, R_SRC \
    29	BGT errCorrupt
    30
    31// MOVD R_SRC, R_TMP1
    32// SUB  R_SBASE, R_TMP1, R_TMP1
    33// CMP  R_SLEN, R_TMP1
    34// BGT  errCorrupt
    35
    36// The asm code generally follows the pure Go code in decode_other.go, except
    37// where marked with a "!!!".
    38
    39// func decode(dst, src []byte) int
    40//
    41// All local variables fit into registers. The non-zero stack size is only to
    42// spill registers and push args when issuing a CALL. The register allocation:
    43//	- R_TMP0	scratch
    44//	- R_TMP1	scratch
    45//	- R_LEN	length or x
    46//	- R_OFF	offset
    47//	- R_SRC	&src[s]
    48//	- R_DST	&dst[d]
    49//	+ R_DBASE	dst_base
    50//	+ R_DLEN	dst_len
    51//	+ R_DEND	dst_base + dst_len
    52//	+ R_SBASE	src_base
    53//	+ R_SLEN	src_len
    54//	+ R_SEND	src_base + src_len
    55//	- R_TMP2	used by doCopy
    56//	- R_TMP3	used by doCopy
    57//
    58// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
    59// function, and after a CALL returns, and are not otherwise modified.
    60//
    61// The d variable is implicitly R_DST - R_DBASE,  and len(dst)-d is R_DEND - R_DST.
    62// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
    63TEXT ·s2Decode(SB), NOSPLIT, $56-64
    64	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
    65	MOVD dst_base+0(FP), R_DBASE
    66	MOVD dst_len+8(FP), R_DLEN
    67	MOVD R_DBASE, R_DST
    68	MOVD R_DBASE, R_DEND
    69	ADD  R_DLEN, R_DEND, R_DEND
    70	MOVD src_base+24(FP), R_SBASE
    71	MOVD src_len+32(FP), R_SLEN
    72	MOVD R_SBASE, R_SRC
    73	MOVD R_SBASE, R_SEND
    74	ADD  R_SLEN, R_SEND, R_SEND
    75	MOVD $0, R_OFF
    76
    77loop:
    78	// for s < len(src)
    79	CMP R_SEND, R_SRC
    80	BEQ end
    81
    82	// R_LEN = uint32(src[s])
    83	//
    84	// switch src[s] & 0x03
    85	MOVBU (R_SRC), R_LEN
    86	MOVW  R_LEN, R_TMP1
    87	ANDW  $3, R_TMP1
    88	MOVW  $1, R1
    89	CMPW  R1, R_TMP1
    90	BGE   tagCopy
    91
    92	// ----------------------------------------
    93	// The code below handles literal tags.
    94
    95	// case tagLiteral:
    96	// x := uint32(src[s] >> 2)
    97	// switch
    98	MOVW $60, R1
    99	LSRW $2, R_LEN, R_LEN
   100	CMPW R_LEN, R1
   101	BLS  tagLit60Plus
   102
   103	// case x < 60:
   104	// s++
   105	ADD $1, R_SRC, R_SRC
   106
   107doLit:
   108	// This is the end of the inner "switch", when we have a literal tag.
   109	//
   110	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
   111	// used in the pure Go decode_other.go code.
   112
   113	// length = int(x) + 1
   114	//
   115	// Unlike the pure Go code, we don't need to check if length <= 0 because
   116	// R_LEN can hold 64 bits, so the increment cannot overflow.
   117	ADD $1, R_LEN, R_LEN
   118
   119	// Prepare to check if copying length bytes will run past the end of dst or
   120	// src.
   121	//
   122	// R_TMP0 = len(dst) - d
   123	// R_TMP1 = len(src) - s
   124	MOVD R_DEND, R_TMP0
   125	SUB  R_DST, R_TMP0, R_TMP0
   126	MOVD R_SEND, R_TMP1
   127	SUB  R_SRC, R_TMP1, R_TMP1
   128
   129	// !!! Try a faster technique for short (16 or fewer bytes) copies.
   130	//
   131	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
   132	//   goto callMemmove // Fall back on calling runtime·memmove.
   133	// }
   134	//
   135	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
   136	// against 21 instead of 16, because it cannot assume that all of its input
   137	// is contiguous in memory and so it needs to leave enough source bytes to
   138	// read the next tag without refilling buffers, but Go's Decode assumes
   139	// contiguousness (the src argument is a []byte).
   140	CMP $16, R_LEN
   141	BGT callMemmove
   142	CMP $16, R_TMP0
   143	BLT callMemmove
   144	CMP $16, R_TMP1
   145	BLT callMemmove
   146
   147	// !!! Implement the copy from src to dst as a 16-byte load and store.
   148	// (Decode's documentation says that dst and src must not overlap.)
   149	//
   150	// This always copies 16 bytes, instead of only length bytes, but that's
   151	// OK. If the input is a valid Snappy encoding then subsequent iterations
   152	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
   153	// non-nil error), so the overrun will be ignored.
   154	//
   155	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
   156	// 16-byte loads and stores. This technique probably wouldn't be as
   157	// effective on architectures that are fussier about alignment.
   158	LDP 0(R_SRC), (R_TMP2, R_TMP3)
   159	STP (R_TMP2, R_TMP3), 0(R_DST)
   160
   161	// d += length
   162	// s += length
   163	ADD R_LEN, R_DST, R_DST
   164	ADD R_LEN, R_SRC, R_SRC
   165	B   loop
   166
   167callMemmove:
   168	// if length > len(dst)-d || length > len(src)-s { etc }
   169	CMP R_TMP0, R_LEN
   170	BGT errCorrupt
   171	CMP R_TMP1, R_LEN
   172	BGT errCorrupt
   173
   174	// copy(dst[d:], src[s:s+length])
   175	//
   176	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
   177	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
   178	// three registers to the stack, to save local variables across the CALL.
   179	MOVD R_DST, 8(RSP)
   180	MOVD R_SRC, 16(RSP)
   181	MOVD R_LEN, 24(RSP)
   182	MOVD R_DST, 32(RSP)
   183	MOVD R_SRC, 40(RSP)
   184	MOVD R_LEN, 48(RSP)
   185	MOVD R_OFF, 56(RSP)
   186	CALL runtime·memmove(SB)
   187
   188	// Restore local variables: unspill registers from the stack and
   189	// re-calculate R_DBASE-R_SEND.
   190	MOVD 32(RSP), R_DST
   191	MOVD 40(RSP), R_SRC
   192	MOVD 48(RSP), R_LEN
   193	MOVD 56(RSP), R_OFF
   194	MOVD dst_base+0(FP), R_DBASE
   195	MOVD dst_len+8(FP), R_DLEN
   196	MOVD R_DBASE, R_DEND
   197	ADD  R_DLEN, R_DEND, R_DEND
   198	MOVD src_base+24(FP), R_SBASE
   199	MOVD src_len+32(FP), R_SLEN
   200	MOVD R_SBASE, R_SEND
   201	ADD  R_SLEN, R_SEND, R_SEND
   202
   203	// d += length
   204	// s += length
   205	ADD R_LEN, R_DST, R_DST
   206	ADD R_LEN, R_SRC, R_SRC
   207	B   loop
   208
   209tagLit60Plus:
   210	// !!! This fragment does the
   211	//
   212	// s += x - 58; if uint(s) > uint(len(src)) { etc }
   213	//
   214	// checks. In the asm version, we code it once instead of once per switch case.
   215	ADD R_LEN, R_SRC, R_SRC
   216	SUB $58, R_SRC, R_SRC
   217	TEST_SRC()
   218
   219	// case x == 60:
   220	MOVW $61, R1
   221	CMPW R1, R_LEN
   222	BEQ  tagLit61
   223	BGT  tagLit62Plus
   224
   225	// x = uint32(src[s-1])
   226	MOVBU -1(R_SRC), R_LEN
   227	B     doLit
   228
   229tagLit61:
   230	// case x == 61:
   231	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
   232	MOVHU -2(R_SRC), R_LEN
   233	B     doLit
   234
   235tagLit62Plus:
   236	CMPW $62, R_LEN
   237	BHI  tagLit63
   238
   239	// case x == 62:
   240	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
   241	MOVHU -3(R_SRC), R_LEN
   242	MOVBU -1(R_SRC), R_TMP1
   243	ORR   R_TMP1<<16, R_LEN
   244	B     doLit
   245
   246tagLit63:
   247	// case x == 63:
   248	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
   249	MOVWU -4(R_SRC), R_LEN
   250	B     doLit
   251
   252	// The code above handles literal tags.
   253	// ----------------------------------------
   254	// The code below handles copy tags.
   255
   256tagCopy4:
   257	// case tagCopy4:
   258	// s += 5
   259	ADD $5, R_SRC, R_SRC
   260
   261	// if uint(s) > uint(len(src)) { etc }
   262	MOVD R_SRC, R_TMP1
   263	SUB  R_SBASE, R_TMP1, R_TMP1
   264	CMP  R_SLEN, R_TMP1
   265	BGT  errCorrupt
   266
   267	// length = 1 + int(src[s-5])>>2
   268	MOVD $1, R1
   269	ADD  R_LEN>>2, R1, R_LEN
   270
   271	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
   272	MOVWU -4(R_SRC), R_OFF
   273	B     doCopy
   274
   275tagCopy2:
   276	// case tagCopy2:
   277	// s += 3
   278	ADD $3, R_SRC, R_SRC
   279
   280	// if uint(s) > uint(len(src)) { etc }
   281	TEST_SRC()
   282
   283	// length = 1 + int(src[s-3])>>2
   284	MOVD $1, R1
   285	ADD  R_LEN>>2, R1, R_LEN
   286
   287	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
   288	MOVHU -2(R_SRC), R_OFF
   289	B     doCopy
   290
   291tagCopy:
   292	// We have a copy tag. We assume that:
   293	//	- R_TMP1 == src[s] & 0x03
   294	//	- R_LEN == src[s]
   295	CMP $2, R_TMP1
   296	BEQ tagCopy2
   297	BGT tagCopy4
   298
   299	// case tagCopy1:
   300	// s += 2
   301	ADD $2, R_SRC, R_SRC
   302
   303	// if uint(s) > uint(len(src)) { etc }
   304	TEST_SRC()
   305
   306	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
   307	// Calculate offset in R_TMP0 in case it is a repeat.
   308	MOVD  R_LEN, R_TMP0
   309	AND   $0xe0, R_TMP0
   310	MOVBU -1(R_SRC), R_TMP1
   311	ORR   R_TMP0<<3, R_TMP1, R_TMP0
   312
   313	// length = 4 + int(src[s-2])>>2&0x7
   314	MOVD $7, R1
   315	AND  R_LEN>>2, R1, R_LEN
   316	ADD  $4, R_LEN, R_LEN
   317
   318	// check if repeat code with offset 0.
   319	CMP $0, R_TMP0
   320	BEQ repeatCode
   321
   322	// This is a regular copy, transfer our temporary value to R_OFF (offset)
   323	MOVD R_TMP0, R_OFF
   324	B    doCopy
   325
   326	// This is a repeat code.
   327repeatCode:
   328	// If length < 9, reuse last offset, with the length already calculated.
   329	CMP $9, R_LEN
   330	BLT doCopyRepeat
   331	BEQ repeatLen1
   332	CMP $10, R_LEN
   333	BEQ repeatLen2
   334
   335repeatLen3:
   336	// s +=3
   337	ADD $3, R_SRC, R_SRC
   338
   339	// if uint(s) > uint(len(src)) { etc }
   340	TEST_SRC()
   341
   342	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
   343	MOVBU -1(R_SRC), R_TMP0
   344	MOVHU -3(R_SRC), R_LEN
   345	ORR   R_TMP0<<16, R_LEN, R_LEN
   346	ADD   $65540, R_LEN, R_LEN
   347	B     doCopyRepeat
   348
   349repeatLen2:
   350	// s +=2
   351	ADD $2, R_SRC, R_SRC
   352
   353	// if uint(s) > uint(len(src)) { etc }
   354	TEST_SRC()
   355
   356	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
   357	MOVHU -2(R_SRC), R_LEN
   358	ADD   $260, R_LEN, R_LEN
   359	B     doCopyRepeat
   360
   361repeatLen1:
   362	// s +=1
   363	ADD $1, R_SRC, R_SRC
   364
   365	// if uint(s) > uint(len(src)) { etc }
   366	TEST_SRC()
   367
   368	// length = src[s-1] + 8
   369	MOVBU -1(R_SRC), R_LEN
   370	ADD   $8, R_LEN, R_LEN
   371	B     doCopyRepeat
   372
   373doCopy:
   374	// This is the end of the outer "switch", when we have a copy tag.
   375	//
   376	// We assume that:
   377	//	- R_LEN == length && R_LEN > 0
   378	//	- R_OFF == offset
   379
   380	// if d < offset { etc }
   381	MOVD R_DST, R_TMP1
   382	SUB  R_DBASE, R_TMP1, R_TMP1
   383	CMP  R_OFF, R_TMP1
   384	BLT  errCorrupt
   385
   386	// Repeat values can skip the test above, since any offset > 0 will be in dst.
   387doCopyRepeat:
   388
   389	// if offset <= 0 { etc }
   390	CMP $0, R_OFF
   391	BLE errCorrupt
   392
   393	// if length > len(dst)-d { etc }
   394	MOVD R_DEND, R_TMP1
   395	SUB  R_DST, R_TMP1, R_TMP1
   396	CMP  R_TMP1, R_LEN
   397	BGT  errCorrupt
   398
   399	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
   400	//
   401	// Set:
   402	//	- R_TMP2 = len(dst)-d
   403	//	- R_TMP3 = &dst[d-offset]
   404	MOVD R_DEND, R_TMP2
   405	SUB  R_DST, R_TMP2, R_TMP2
   406	MOVD R_DST, R_TMP3
   407	SUB  R_OFF, R_TMP3, R_TMP3
   408
   409	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
   410	//
   411	// First, try using two 8-byte load/stores, similar to the doLit technique
   412	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
   413	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
   414	// and not one 16-byte load/store, and the first store has to be before the
   415	// second load, due to the overlap if offset is in the range [8, 16).
   416	//
   417	// if length > 16 || offset < 8 || len(dst)-d < 16 {
   418	//   goto slowForwardCopy
   419	// }
   420	// copy 16 bytes
   421	// d += length
   422	CMP  $16, R_LEN
   423	BGT  slowForwardCopy
   424	CMP  $8, R_OFF
   425	BLT  slowForwardCopy
   426	CMP  $16, R_TMP2
   427	BLT  slowForwardCopy
   428	MOVD 0(R_TMP3), R_TMP0
   429	MOVD R_TMP0, 0(R_DST)
   430	MOVD 8(R_TMP3), R_TMP1
   431	MOVD R_TMP1, 8(R_DST)
   432	ADD  R_LEN, R_DST, R_DST
   433	B    loop
   434
   435slowForwardCopy:
   436	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
   437	// can still try 8-byte load stores, provided we can overrun up to 10 extra
   438	// bytes. As above, the overrun will be fixed up by subsequent iterations
   439	// of the outermost loop.
   440	//
   441	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
   442	// commentary says:
   443	//
   444	// ----
   445	//
   446	// The main part of this loop is a simple copy of eight bytes at a time
   447	// until we've copied (at least) the requested amount of bytes.  However,
   448	// if d and d-offset are less than eight bytes apart (indicating a
   449	// repeating pattern of length < 8), we first need to expand the pattern in
   450	// order to get the correct results. For instance, if the buffer looks like
   451	// this, with the eight-byte <d-offset> and <d> patterns marked as
   452	// intervals:
   453	//
   454	//    abxxxxxxxxxxxx
   455	//    [------]           d-offset
   456	//      [------]         d
   457	//
   458	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
   459	// once, after which we can move <d> two bytes without moving <d-offset>:
   460	//
   461	//    ababxxxxxxxxxx
   462	//    [------]           d-offset
   463	//        [------]       d
   464	//
   465	// and repeat the exercise until the two no longer overlap.
   466	//
   467	// This allows us to do very well in the special case of one single byte
   468	// repeated many times, without taking a big hit for more general cases.
   469	//
   470	// The worst case of extra writing past the end of the match occurs when
   471	// offset == 1 and length == 1; the last copy will read from byte positions
   472	// [0..7] and write to [4..11], whereas it was only supposed to write to
   473	// position 1. Thus, ten excess bytes.
   474	//
   475	// ----
   476	//
   477	// That "10 byte overrun" worst case is confirmed by Go's
   478	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
   479	// and finishSlowForwardCopy algorithm.
   480	//
   481	// if length > len(dst)-d-10 {
   482	//   goto verySlowForwardCopy
   483	// }
   484	SUB $10, R_TMP2, R_TMP2
   485	CMP R_TMP2, R_LEN
   486	BGT verySlowForwardCopy
   487
   488	// We want to keep the offset, so we use R_TMP2 from here.
   489	MOVD R_OFF, R_TMP2
   490
   491makeOffsetAtLeast8:
   492	// !!! As above, expand the pattern so that offset >= 8 and we can use
   493	// 8-byte load/stores.
   494	//
   495	// for offset < 8 {
   496	//   copy 8 bytes from dst[d-offset:] to dst[d:]
   497	//   length -= offset
   498	//   d      += offset
   499	//   offset += offset
   500	//   // The two previous lines together means that d-offset, and therefore
   501	//   // R_TMP3, is unchanged.
   502	// }
   503	CMP  $8, R_TMP2
   504	BGE  fixUpSlowForwardCopy
   505	MOVD (R_TMP3), R_TMP1
   506	MOVD R_TMP1, (R_DST)
   507	SUB  R_TMP2, R_LEN, R_LEN
   508	ADD  R_TMP2, R_DST, R_DST
   509	ADD  R_TMP2, R_TMP2, R_TMP2
   510	B    makeOffsetAtLeast8
   511
   512fixUpSlowForwardCopy:
   513	// !!! Add length (which might be negative now) to d (implied by R_DST being
   514	// &dst[d]) so that d ends up at the right place when we jump back to the
   515	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
   516	// length is positive, copying the remaining length bytes will write to the
   517	// right place.
   518	MOVD R_DST, R_TMP0
   519	ADD  R_LEN, R_DST, R_DST
   520
   521finishSlowForwardCopy:
   522	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
   523	// length means that we overrun, but as above, that will be fixed up by
   524	// subsequent iterations of the outermost loop.
   525	MOVD $0, R1
   526	CMP  R1, R_LEN
   527	BLE  loop
   528	MOVD (R_TMP3), R_TMP1
   529	MOVD R_TMP1, (R_TMP0)
   530	ADD  $8, R_TMP3, R_TMP3
   531	ADD  $8, R_TMP0, R_TMP0
   532	SUB  $8, R_LEN, R_LEN
   533	B    finishSlowForwardCopy
   534
   535verySlowForwardCopy:
   536	// verySlowForwardCopy is a simple implementation of forward copy. In C
   537	// parlance, this is a do/while loop instead of a while loop, since we know
   538	// that length > 0. In Go syntax:
   539	//
   540	// for {
   541	//   dst[d] = dst[d - offset]
   542	//   d++
   543	//   length--
   544	//   if length == 0 {
   545	//     break
   546	//   }
   547	// }
   548	MOVB (R_TMP3), R_TMP1
   549	MOVB R_TMP1, (R_DST)
   550	ADD  $1, R_TMP3, R_TMP3
   551	ADD  $1, R_DST, R_DST
   552	SUB  $1, R_LEN, R_LEN
   553	CBNZ R_LEN, verySlowForwardCopy
   554	B    loop
   555
   556	// The code above handles copy tags.
   557	// ----------------------------------------
   558
   559end:
   560	// This is the end of the "for s < len(src)".
   561	//
   562	// if d != len(dst) { etc }
   563	CMP R_DEND, R_DST
   564	BNE errCorrupt
   565
   566	// return 0
   567	MOVD $0, ret+48(FP)
   568	RET
   569
   570errCorrupt:
   571	// return decodeErrCodeCorrupt
   572	MOVD $1, R_TMP0
   573	MOVD R_TMP0, ret+48(FP)
   574	RET
View as plain text