decode_arm64.s

Documentation: github.com/golang/snappy

     1// Copyright 2020 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// +build !appengine
     6// +build gc
     7// +build !noasm
     8
     9#include "textflag.h"
    10
    11// The asm code generally follows the pure Go code in decode_other.go, except
    12// where marked with a "!!!".
    13
    14// func decode(dst, src []byte) int
    15//
    16// All local variables fit into registers. The non-zero stack size is only to
    17// spill registers and push args when issuing a CALL. The register allocation:
    18//	- R2	scratch
    19//	- R3	scratch
    20//	- R4	length or x
    21//	- R5	offset
    22//	- R6	&src[s]
    23//	- R7	&dst[d]
    24//	+ R8	dst_base
    25//	+ R9	dst_len
    26//	+ R10	dst_base + dst_len
    27//	+ R11	src_base
    28//	+ R12	src_len
    29//	+ R13	src_base + src_len
    30//	- R14	used by doCopy
    31//	- R15	used by doCopy
    32//
    33// The registers R8-R13 (marked with a "+") are set at the start of the
    34// function, and after a CALL returns, and are not otherwise modified.
    35//
    36// The d variable is implicitly R7 - R8,  and len(dst)-d is R10 - R7.
    37// The s variable is implicitly R6 - R11, and len(src)-s is R13 - R6.
    38TEXT ·decode(SB), NOSPLIT, $56-56
    39	// Initialize R6, R7 and R8-R13.
    40	MOVD dst_base+0(FP), R8
    41	MOVD dst_len+8(FP), R9
    42	MOVD R8, R7
    43	MOVD R8, R10
    44	ADD  R9, R10, R10
    45	MOVD src_base+24(FP), R11
    46	MOVD src_len+32(FP), R12
    47	MOVD R11, R6
    48	MOVD R11, R13
    49	ADD  R12, R13, R13
    50
    51loop:
    52	// for s < len(src)
    53	CMP R13, R6
    54	BEQ end
    55
    56	// R4 = uint32(src[s])
    57	//
    58	// switch src[s] & 0x03
    59	MOVBU (R6), R4
    60	MOVW  R4, R3
    61	ANDW  $3, R3
    62	MOVW  $1, R1
    63	CMPW  R1, R3
    64	BGE   tagCopy
    65
    66	// ----------------------------------------
    67	// The code below handles literal tags.
    68
    69	// case tagLiteral:
    70	// x := uint32(src[s] >> 2)
    71	// switch
    72	MOVW $60, R1
    73	LSRW $2, R4, R4
    74	CMPW R4, R1
    75	BLS  tagLit60Plus
    76
    77	// case x < 60:
    78	// s++
    79	ADD $1, R6, R6
    80
    81doLit:
    82	// This is the end of the inner "switch", when we have a literal tag.
    83	//
    84	// We assume that R4 == x and x fits in a uint32, where x is the variable
    85	// used in the pure Go decode_other.go code.
    86
    87	// length = int(x) + 1
    88	//
    89	// Unlike the pure Go code, we don't need to check if length <= 0 because
    90	// R4 can hold 64 bits, so the increment cannot overflow.
    91	ADD $1, R4, R4
    92
    93	// Prepare to check if copying length bytes will run past the end of dst or
    94	// src.
    95	//
    96	// R2 = len(dst) - d
    97	// R3 = len(src) - s
    98	MOVD R10, R2
    99	SUB  R7, R2, R2
   100	MOVD R13, R3
   101	SUB  R6, R3, R3
   102
   103	// !!! Try a faster technique for short (16 or fewer bytes) copies.
   104	//
   105	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
   106	//   goto callMemmove // Fall back on calling runtime·memmove.
   107	// }
   108	//
   109	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
   110	// against 21 instead of 16, because it cannot assume that all of its input
   111	// is contiguous in memory and so it needs to leave enough source bytes to
   112	// read the next tag without refilling buffers, but Go's Decode assumes
   113	// contiguousness (the src argument is a []byte).
   114	CMP $16, R4
   115	BGT callMemmove
   116	CMP $16, R2
   117	BLT callMemmove
   118	CMP $16, R3
   119	BLT callMemmove
   120
   121	// !!! Implement the copy from src to dst as a 16-byte load and store.
   122	// (Decode's documentation says that dst and src must not overlap.)
   123	//
   124	// This always copies 16 bytes, instead of only length bytes, but that's
   125	// OK. If the input is a valid Snappy encoding then subsequent iterations
   126	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
   127	// non-nil error), so the overrun will be ignored.
   128	//
   129	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
   130	// 16-byte loads and stores. This technique probably wouldn't be as
   131	// effective on architectures that are fussier about alignment.
   132	LDP 0(R6), (R14, R15)
   133	STP (R14, R15), 0(R7)
   134
   135	// d += length
   136	// s += length
   137	ADD R4, R7, R7
   138	ADD R4, R6, R6
   139	B   loop
   140
   141callMemmove:
   142	// if length > len(dst)-d || length > len(src)-s { etc }
   143	CMP R2, R4
   144	BGT errCorrupt
   145	CMP R3, R4
   146	BGT errCorrupt
   147
   148	// copy(dst[d:], src[s:s+length])
   149	//
   150	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
   151	// R7, R6 and R4 as arguments. Coincidentally, we also need to spill those
   152	// three registers to the stack, to save local variables across the CALL.
   153	MOVD R7, 8(RSP)
   154	MOVD R6, 16(RSP)
   155	MOVD R4, 24(RSP)
   156	MOVD R7, 32(RSP)
   157	MOVD R6, 40(RSP)
   158	MOVD R4, 48(RSP)
   159	CALL runtime·memmove(SB)
   160
   161	// Restore local variables: unspill registers from the stack and
   162	// re-calculate R8-R13.
   163	MOVD 32(RSP), R7
   164	MOVD 40(RSP), R6
   165	MOVD 48(RSP), R4
   166	MOVD dst_base+0(FP), R8
   167	MOVD dst_len+8(FP), R9
   168	MOVD R8, R10
   169	ADD  R9, R10, R10
   170	MOVD src_base+24(FP), R11
   171	MOVD src_len+32(FP), R12
   172	MOVD R11, R13
   173	ADD  R12, R13, R13
   174
   175	// d += length
   176	// s += length
   177	ADD R4, R7, R7
   178	ADD R4, R6, R6
   179	B   loop
   180
   181tagLit60Plus:
   182	// !!! This fragment does the
   183	//
   184	// s += x - 58; if uint(s) > uint(len(src)) { etc }
   185	//
   186	// checks. In the asm version, we code it once instead of once per switch case.
   187	ADD  R4, R6, R6
   188	SUB  $58, R6, R6
   189	MOVD R6, R3
   190	SUB  R11, R3, R3
   191	CMP  R12, R3
   192	BGT  errCorrupt
   193
   194	// case x == 60:
   195	MOVW $61, R1
   196	CMPW R1, R4
   197	BEQ  tagLit61
   198	BGT  tagLit62Plus
   199
   200	// x = uint32(src[s-1])
   201	MOVBU -1(R6), R4
   202	B     doLit
   203
   204tagLit61:
   205	// case x == 61:
   206	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
   207	MOVHU -2(R6), R4
   208	B     doLit
   209
   210tagLit62Plus:
   211	CMPW $62, R4
   212	BHI  tagLit63
   213
   214	// case x == 62:
   215	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
   216	MOVHU -3(R6), R4
   217	MOVBU -1(R6), R3
   218	ORR   R3<<16, R4
   219	B     doLit
   220
   221tagLit63:
   222	// case x == 63:
   223	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
   224	MOVWU -4(R6), R4
   225	B     doLit
   226
   227	// The code above handles literal tags.
   228	// ----------------------------------------
   229	// The code below handles copy tags.
   230
   231tagCopy4:
   232	// case tagCopy4:
   233	// s += 5
   234	ADD $5, R6, R6
   235
   236	// if uint(s) > uint(len(src)) { etc }
   237	MOVD R6, R3
   238	SUB  R11, R3, R3
   239	CMP  R12, R3
   240	BGT  errCorrupt
   241
   242	// length = 1 + int(src[s-5])>>2
   243	MOVD $1, R1
   244	ADD  R4>>2, R1, R4
   245
   246	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
   247	MOVWU -4(R6), R5
   248	B     doCopy
   249
   250tagCopy2:
   251	// case tagCopy2:
   252	// s += 3
   253	ADD $3, R6, R6
   254
   255	// if uint(s) > uint(len(src)) { etc }
   256	MOVD R6, R3
   257	SUB  R11, R3, R3
   258	CMP  R12, R3
   259	BGT  errCorrupt
   260
   261	// length = 1 + int(src[s-3])>>2
   262	MOVD $1, R1
   263	ADD  R4>>2, R1, R4
   264
   265	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
   266	MOVHU -2(R6), R5
   267	B     doCopy
   268
   269tagCopy:
   270	// We have a copy tag. We assume that:
   271	//	- R3 == src[s] & 0x03
   272	//	- R4 == src[s]
   273	CMP $2, R3
   274	BEQ tagCopy2
   275	BGT tagCopy4
   276
   277	// case tagCopy1:
   278	// s += 2
   279	ADD $2, R6, R6
   280
   281	// if uint(s) > uint(len(src)) { etc }
   282	MOVD R6, R3
   283	SUB  R11, R3, R3
   284	CMP  R12, R3
   285	BGT  errCorrupt
   286
   287	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
   288	MOVD  R4, R5
   289	AND   $0xe0, R5
   290	MOVBU -1(R6), R3
   291	ORR   R5<<3, R3, R5
   292
   293	// length = 4 + int(src[s-2])>>2&0x7
   294	MOVD $7, R1
   295	AND  R4>>2, R1, R4
   296	ADD  $4, R4, R4
   297
   298doCopy:
   299	// This is the end of the outer "switch", when we have a copy tag.
   300	//
   301	// We assume that:
   302	//	- R4 == length && R4 > 0
   303	//	- R5 == offset
   304
   305	// if offset <= 0 { etc }
   306	MOVD $0, R1
   307	CMP  R1, R5
   308	BLE  errCorrupt
   309
   310	// if d < offset { etc }
   311	MOVD R7, R3
   312	SUB  R8, R3, R3
   313	CMP  R5, R3
   314	BLT  errCorrupt
   315
   316	// if length > len(dst)-d { etc }
   317	MOVD R10, R3
   318	SUB  R7, R3, R3
   319	CMP  R3, R4
   320	BGT  errCorrupt
   321
   322	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
   323	//
   324	// Set:
   325	//	- R14 = len(dst)-d
   326	//	- R15 = &dst[d-offset]
   327	MOVD R10, R14
   328	SUB  R7, R14, R14
   329	MOVD R7, R15
   330	SUB  R5, R15, R15
   331
   332	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
   333	//
   334	// First, try using two 8-byte load/stores, similar to the doLit technique
   335	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
   336	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
   337	// and not one 16-byte load/store, and the first store has to be before the
   338	// second load, due to the overlap if offset is in the range [8, 16).
   339	//
   340	// if length > 16 || offset < 8 || len(dst)-d < 16 {
   341	//   goto slowForwardCopy
   342	// }
   343	// copy 16 bytes
   344	// d += length
   345	CMP  $16, R4
   346	BGT  slowForwardCopy
   347	CMP  $8, R5
   348	BLT  slowForwardCopy
   349	CMP  $16, R14
   350	BLT  slowForwardCopy
   351	MOVD 0(R15), R2
   352	MOVD R2, 0(R7)
   353	MOVD 8(R15), R3
   354	MOVD R3, 8(R7)
   355	ADD  R4, R7, R7
   356	B    loop
   357
   358slowForwardCopy:
   359	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
   360	// can still try 8-byte load stores, provided we can overrun up to 10 extra
   361	// bytes. As above, the overrun will be fixed up by subsequent iterations
   362	// of the outermost loop.
   363	//
   364	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
   365	// commentary says:
   366	//
   367	// ----
   368	//
   369	// The main part of this loop is a simple copy of eight bytes at a time
   370	// until we've copied (at least) the requested amount of bytes.  However,
   371	// if d and d-offset are less than eight bytes apart (indicating a
   372	// repeating pattern of length < 8), we first need to expand the pattern in
   373	// order to get the correct results. For instance, if the buffer looks like
   374	// this, with the eight-byte <d-offset> and <d> patterns marked as
   375	// intervals:
   376	//
   377	//    abxxxxxxxxxxxx
   378	//    [------]           d-offset
   379	//      [------]         d
   380	//
   381	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
   382	// once, after which we can move <d> two bytes without moving <d-offset>:
   383	//
   384	//    ababxxxxxxxxxx
   385	//    [------]           d-offset
   386	//        [------]       d
   387	//
   388	// and repeat the exercise until the two no longer overlap.
   389	//
   390	// This allows us to do very well in the special case of one single byte
   391	// repeated many times, without taking a big hit for more general cases.
   392	//
   393	// The worst case of extra writing past the end of the match occurs when
   394	// offset == 1 and length == 1; the last copy will read from byte positions
   395	// [0..7] and write to [4..11], whereas it was only supposed to write to
   396	// position 1. Thus, ten excess bytes.
   397	//
   398	// ----
   399	//
   400	// That "10 byte overrun" worst case is confirmed by Go's
   401	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
   402	// and finishSlowForwardCopy algorithm.
   403	//
   404	// if length > len(dst)-d-10 {
   405	//   goto verySlowForwardCopy
   406	// }
   407	SUB $10, R14, R14
   408	CMP R14, R4
   409	BGT verySlowForwardCopy
   410
   411makeOffsetAtLeast8:
   412	// !!! As above, expand the pattern so that offset >= 8 and we can use
   413	// 8-byte load/stores.
   414	//
   415	// for offset < 8 {
   416	//   copy 8 bytes from dst[d-offset:] to dst[d:]
   417	//   length -= offset
   418	//   d      += offset
   419	//   offset += offset
   420	//   // The two previous lines together means that d-offset, and therefore
   421	//   // R15, is unchanged.
   422	// }
   423	CMP  $8, R5
   424	BGE  fixUpSlowForwardCopy
   425	MOVD (R15), R3
   426	MOVD R3, (R7)
   427	SUB  R5, R4, R4
   428	ADD  R5, R7, R7
   429	ADD  R5, R5, R5
   430	B    makeOffsetAtLeast8
   431
   432fixUpSlowForwardCopy:
   433	// !!! Add length (which might be negative now) to d (implied by R7 being
   434	// &dst[d]) so that d ends up at the right place when we jump back to the
   435	// top of the loop. Before we do that, though, we save R7 to R2 so that, if
   436	// length is positive, copying the remaining length bytes will write to the
   437	// right place.
   438	MOVD R7, R2
   439	ADD  R4, R7, R7
   440
   441finishSlowForwardCopy:
   442	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
   443	// length means that we overrun, but as above, that will be fixed up by
   444	// subsequent iterations of the outermost loop.
   445	MOVD $0, R1
   446	CMP  R1, R4
   447	BLE  loop
   448	MOVD (R15), R3
   449	MOVD R3, (R2)
   450	ADD  $8, R15, R15
   451	ADD  $8, R2, R2
   452	SUB  $8, R4, R4
   453	B    finishSlowForwardCopy
   454
   455verySlowForwardCopy:
   456	// verySlowForwardCopy is a simple implementation of forward copy. In C
   457	// parlance, this is a do/while loop instead of a while loop, since we know
   458	// that length > 0. In Go syntax:
   459	//
   460	// for {
   461	//   dst[d] = dst[d - offset]
   462	//   d++
   463	//   length--
   464	//   if length == 0 {
   465	//     break
   466	//   }
   467	// }
   468	MOVB (R15), R3
   469	MOVB R3, (R7)
   470	ADD  $1, R15, R15
   471	ADD  $1, R7, R7
   472	SUB  $1, R4, R4
   473	CBNZ R4, verySlowForwardCopy
   474	B    loop
   475
   476	// The code above handles copy tags.
   477	// ----------------------------------------
   478
   479end:
   480	// This is the end of the "for s < len(src)".
   481	//
   482	// if d != len(dst) { etc }
   483	CMP R10, R7
   484	BNE errCorrupt
   485
   486	// return 0
   487	MOVD $0, ret+48(FP)
   488	RET
   489
   490errCorrupt:
   491	// return decodeErrCodeCorrupt
   492	MOVD $1, R2
   493	MOVD R2, ret+48(FP)
   494	RET
View as plain text