...

Text file src/github.com/golang/snappy/encode_amd64.s

Documentation: github.com/golang/snappy

     1// Copyright 2016 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5// +build !appengine
     6// +build gc
     7// +build !noasm
     8
     9#include "textflag.h"
    10
    11// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
    12// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
    13// https://github.com/golang/snappy/issues/29
    14//
    15// As a workaround, the package was built with a known good assembler, and
    16// those instructions were disassembled by "objdump -d" to yield the
    17//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
    18// style comments, in AT&T asm syntax. Note that rsp here is a physical
    19// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
    20// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
    21// fine on Go 1.6.
    22
    23// The asm code generally follows the pure Go code in encode_other.go, except
    24// where marked with a "!!!".
    25
    26// ----------------------------------------------------------------------------
    27
    28// func emitLiteral(dst, lit []byte) int
    29//
    30// All local variables fit into registers. The register allocation:
    31//	- AX	len(lit)
    32//	- BX	n
    33//	- DX	return value
    34//	- DI	&dst[i]
    35//	- R10	&lit[0]
    36//
    37// The 24 bytes of stack space is to call runtime·memmove.
    38//
    39// The unusual register allocation of local variables, such as R10 for the
    40// source pointer, matches the allocation used at the call site in encodeBlock,
    41// which makes it easier to manually inline this function.
     42TEXT ·emitLiteral(SB), NOSPLIT, $24-56
     43	MOVQ dst_base+0(FP), DI
     44	MOVQ lit_base+24(FP), R10
     45	MOVQ lit_len+32(FP), AX
     46	MOVQ AX, DX	// DX = return value = len(lit); the tag size is added below.
     47	MOVL AX, BX
     48	SUBL $1, BX	// BX = n = len(lit)-1, the value encoded in the literal tag.
     49
     50	CMPL BX, $60
     51	JLT  oneByte
     52	CMPL BX, $256
     53	JLT  twoBytes
     54
     55threeBytes:
     56	MOVB $0xf4, 0(DI)	// 0xf4 == 61<<2 | tagLiteral: n is in the next 2 bytes.
     57	MOVW BX, 1(DI)
     58	ADDQ $3, DI
     59	ADDQ $3, DX
     60	JMP  memmove
     61
     62twoBytes:
     63	MOVB $0xf0, 0(DI)	// 0xf0 == 60<<2 | tagLiteral: n is in the next 1 byte.
     64	MOVB BX, 1(DI)
     65	ADDQ $2, DI
     66	ADDQ $2, DX
     67	JMP  memmove
     68
     69oneByte:
     70	SHLB $2, BX	// Tag is n<<2 | tagLiteral (tagLiteral == 0), all in one byte.
     71	MOVB BX, 0(DI)
     72	ADDQ $1, DI
     73	ADDQ $1, DX
     74
     75memmove:
     76	MOVQ DX, ret+48(FP)
     77
     78	// copy(dst[i:], lit)
     79	//
     80	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
     81	// DI, R10 and AX as arguments.
     82	MOVQ DI, 0(SP)
     83	MOVQ R10, 8(SP)
     84	MOVQ AX, 16(SP)
     85	CALL runtime·memmove(SB)
     86	RET
    87
    88// ----------------------------------------------------------------------------
    89
    90// func emitCopy(dst []byte, offset, length int) int
    91//
    92// All local variables fit into registers. The register allocation:
    93//	- AX	length
    94//	- SI	&dst[0]
    95//	- DI	&dst[i]
    96//	- R11	offset
    97//
    98// The unusual register allocation of local variables, such as R11 for the
    99// offset, matches the allocation used at the call site in encodeBlock, which
   100// makes it easier to manually inline this function.
    101TEXT ·emitCopy(SB), NOSPLIT, $0-48
    102	MOVQ dst_base+0(FP), DI
    103	MOVQ DI, SI	// SI = &dst[0], kept to compute the byte count returned below.
    104	MOVQ offset+24(FP), R11
    105	MOVQ length+32(FP), AX
    106
    107loop0:
    108	// for length >= 68 { etc }
    109	CMPL AX, $68
    110	JLT  step1
    111
    112	// Emit a length 64 copy, encoded as 3 bytes.
    113	MOVB $0xfe, 0(DI)	// 0xfe == 63<<2 | tagCopy2: encoded length is 63+1 == 64.
    114	MOVW R11, 1(DI)
    115	ADDQ $3, DI
    116	SUBL $64, AX
    117	JMP  loop0
    118
    119step1:
    120	// if length > 64 { etc }
    121	CMPL AX, $64
    122	JLE  step2
    123
    124	// Emit a length 60 copy, encoded as 3 bytes.
    125	MOVB $0xee, 0(DI)	// 0xee == 59<<2 | tagCopy2: encoded length is 59+1 == 60.
    126	MOVW R11, 1(DI)
    127	ADDQ $3, DI
    128	SUBL $60, AX
    129
    130step2:
    131	// if length >= 12 || offset >= 2048 { goto step3 }
    132	CMPL AX, $12
    133	JGE  step3
    134	CMPL R11, $2048
    135	JGE  step3
    136
    137	// Emit the remaining copy, encoded as 2 bytes.
    138	// tagCopy1 layout: offset's low 8 bits in byte 1; byte 0 holds
    139	// (offset>>8)<<5 | (length-4)<<2 | tagCopy1 (tagCopy1 == 1).
    140	MOVB R11, 1(DI)
    141	SHRL $8, R11
    142	SHLB $5, R11	// High 3 bits of offset move into bits 5-7 of the tag byte.
    143	SUBB $4, AX
    144	SHLB $2, AX
    145	ORB  AX, R11
    146	ORB  $1, R11
    147	MOVB R11, 0(DI)
    148	ADDQ $2, DI
    149
    150	// Return the number of bytes written.
    151	SUBQ SI, DI
    152	MOVQ DI, ret+40(FP)
    153	RET
    154
    155step3:
    156	// Emit the remaining copy, encoded as 3 bytes.
    157	SUBL $1, AX	// tagCopy2 encodes length-1 in the tag byte's high 6 bits.
    158	SHLB $2, AX
    159	ORB  $2, AX	// tagCopy2 == 2.
    160	MOVB AX, 0(DI)
    161	MOVW R11, 1(DI)
    162	ADDQ $3, DI
    163
    164	// Return the number of bytes written.
    165	SUBQ SI, DI
    166	MOVQ DI, ret+40(FP)
    167	RET
   166
   167// ----------------------------------------------------------------------------
   168
   169// func extendMatch(src []byte, i, j int) int
   170//
   171// All local variables fit into registers. The register allocation:
   172//	- DX	&src[0]
   173//	- SI	&src[j]
   174//	- R13	&src[len(src) - 8]
   175//	- R14	&src[len(src)]
   176//	- R15	&src[i]
   177//
   178// The unusual register allocation of local variables, such as R15 for a source
   179// pointer, matches the allocation used at the call site in encodeBlock, which
   180// makes it easier to manually inline this function.
    181TEXT ·extendMatch(SB), NOSPLIT, $0-48
    182	MOVQ src_base+0(FP), DX
    183	MOVQ src_len+8(FP), R14
    184	MOVQ i+24(FP), R15
    185	MOVQ j+32(FP), SI
    186	ADDQ DX, R14	// R14 = &src[len(src)], one past the end.
    187	ADDQ DX, R15	// R15 = &src[i].
    188	ADDQ DX, SI	// SI = &src[j].
    189	MOVQ R14, R13
    190	SUBQ $8, R13	// R13 = &src[len(src)-8], the last position safe for an 8-byte load.
    191
    192cmp8:
    193	// As long as we are 8 or more bytes before the end of src, we can load and
    194	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
    195	CMPQ SI, R13
    196	JA   cmp1
    197	MOVQ (R15), AX
    198	MOVQ (SI), BX
    199	CMPQ AX, BX
    200	JNE  bsf
    201	ADDQ $8, R15
    202	ADDQ $8, SI
    203	JMP  cmp8
    204
    205bsf:
    206	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
    207	// the index of the first byte that differs. The BSF instruction finds the
    208	// least significant 1 bit, the amd64 architecture is little-endian, and
    209	// the shift by 3 converts a bit index to a byte index.
    210	XORQ AX, BX
    211	BSFQ BX, BX
    212	SHRQ $3, BX
    213	ADDQ BX, SI
    214
    215	// Convert from &src[ret] to ret.
    216	SUBQ DX, SI
    217	MOVQ SI, ret+40(FP)
    218	RET
    219
    220cmp1:
    221	// In src's tail, compare 1 byte at a time.
    222	CMPQ SI, R14
    223	JAE  extendMatchEnd
    224	MOVB (R15), AX
    225	MOVB (SI), BX
    226	CMPB AX, BX
    227	JNE  extendMatchEnd
    228	ADDQ $1, R15
    229	ADDQ $1, SI
    230	JMP  cmp1
    231
    232extendMatchEnd:
    233	// Convert from &src[ret] to ret.
    234	SUBQ DX, SI
    235	MOVQ SI, ret+40(FP)
    236	RET
   237
   238// ----------------------------------------------------------------------------
   239
   240// func encodeBlock(dst, src []byte) (d int)
   241//
   242// All local variables fit into registers, other than "var table". The register
   243// allocation:
   244//	- AX	.	.
   245//	- BX	.	.
   246//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
   247//	- DX	64	&src[0], tableSize
   248//	- SI	72	&src[s]
   249//	- DI	80	&dst[d]
   250//	- R9	88	sLimit
   251//	- R10	.	&src[nextEmit]
   252//	- R11	96	prevHash, currHash, nextHash, offset
   253//	- R12	104	&src[base], skip
   254//	- R13	.	&src[nextS], &src[len(src) - 8]
   255//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
   256//	- R15	112	candidate
   257//
   258// The second column (56, 64, etc) is the stack offset to spill the registers
   259// when calling other functions. We could pack this slightly tighter, but it's
   260// simpler to have a dedicated spill map independent of the function called.
   261//
   262// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
   263// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
   264// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
    265TEXT ·encodeBlock(SB), 0, $32888-56
    266	MOVQ dst_base+0(FP), DI
    267	MOVQ src_base+24(FP), SI
    268	MOVQ src_len+32(FP), R14
    269
    270	// shift, tableSize := uint32(32-8), 1<<8
    271	MOVQ $24, CX
    272	MOVQ $256, DX
    273
    274calcShift:
    275	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
    276	//	shift--
    277	// }
    278	CMPQ DX, $16384	// maxTableSize == 1<<14.
    279	JGE  varTable
    280	CMPQ DX, R14
    281	JGE  varTable
    282	SUBQ $1, CX
    283	SHLQ $1, DX
    284	JMP  calcShift
    285
    286varTable:
    287	// var table [maxTableSize]uint16
    288	//
    289	// In the asm code, unlike the Go code, we can zero-initialize only the
    290	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
    291	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
    292	// 2048 writes that would zero-initialize all of table's 32768 bytes.
    293	SHRQ $3, DX	// DX = tableSize/8 = loop count; tableSize >= 256, so DX >= 32.
    294	LEAQ table-32768(SP), BX
    295	PXOR X0, X0
    296
    297memclr:
    298	MOVOU X0, 0(BX)
    299	ADDQ  $16, BX
    300	SUBQ  $1, DX
    301	JNZ   memclr
    302
    303	// !!! DX = &src[0]
    304	MOVQ SI, DX
    305
    306	// sLimit := len(src) - inputMargin
    307	MOVQ R14, R9
    308	SUBQ $15, R9	// inputMargin == 15.
    309
    310	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
    311	// change for the rest of the function.
    312	MOVQ CX, 56(SP)
    313	MOVQ DX, 64(SP)
    314	MOVQ R9, 88(SP)
    315
    316	// nextEmit := 0
    317	MOVQ DX, R10
    318
    319	// s := 1
    320	ADDQ $1, SI
    321
    322	// nextHash := hash(load32(src, s), shift)
    323	MOVL  0(SI), R11
    324	IMULL $0x1e35a7bd, R11	// The hash multiplier used by hash() in encode_other.go.
    325	SHRL  CX, R11
    326
    327outer:
    328	// for { etc }
    329
    330	// skip := 32
    331	MOVQ $32, R12
    332
    333	// nextS := s
    334	MOVQ SI, R13
    335
    336	// candidate := 0
    337	MOVQ $0, R15
    338
    339inner0:
    340	// for { etc }
    341
    342	// s := nextS
    343	MOVQ R13, SI
    344
    345	// bytesBetweenHashLookups := skip >> 5
    346	MOVQ R12, R14
    347	SHRQ $5, R14
    348
    349	// nextS = s + bytesBetweenHashLookups
    350	ADDQ R14, R13
    351
    352	// skip += bytesBetweenHashLookups
    353	ADDQ R14, R12
    354
    355	// if nextS > sLimit { goto emitRemainder }
    356	MOVQ R13, AX
    357	SUBQ DX, AX
    358	CMPQ AX, R9
    359	JA   emitRemainder
    360
    361	// candidate = int(table[nextHash])
    362	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
    363	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
    364	BYTE $0x4e
    365	BYTE $0x0f
    366	BYTE $0xb7
    367	BYTE $0x7c
    368	BYTE $0x5c
    369	BYTE $0x78
    370
    371	// table[nextHash] = uint16(s)
    372	MOVQ SI, AX
    373	SUBQ DX, AX
    374
    375	// XXX: MOVW AX, table-32768(SP)(R11*2)
    376	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
    377	BYTE $0x66
    378	BYTE $0x42
    379	BYTE $0x89
    380	BYTE $0x44
    381	BYTE $0x5c
    382	BYTE $0x78
    383
    384	// nextHash = hash(load32(src, nextS), shift)
    385	MOVL  0(R13), R11
    386	IMULL $0x1e35a7bd, R11
    387	SHRL  CX, R11
    388
    389	// if load32(src, s) != load32(src, candidate) { continue } break
    390	MOVL 0(SI), AX
    391	MOVL (DX)(R15*1), BX
    392	CMPL AX, BX
    393	JNE  inner0
    394
    395fourByteMatch:
    396	// As per the encode_other.go code:
    397	//
    398	// A 4-byte match has been found. We'll later see etc.
    399
    400	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
    401	// on inputMargin in encode.go.
    402	MOVQ SI, AX
    403	SUBQ R10, AX	// AX = s - nextEmit = len of the pending literal.
    404	CMPQ AX, $16
    405	JLE  emitLiteralFastPath
    406
    407	// ----------------------------------------
    408	// Begin inline of the emitLiteral call.
    409	//
    410	// d += emitLiteral(dst[d:], src[nextEmit:s])
    411
    412	MOVL AX, BX
    413	SUBL $1, BX
    414
    415	CMPL BX, $60
    416	JLT  inlineEmitLiteralOneByte
    417	CMPL BX, $256
    418	JLT  inlineEmitLiteralTwoBytes
    419
    420inlineEmitLiteralThreeBytes:
    421	MOVB $0xf4, 0(DI)	// 0xf4 == 61<<2 | tagLiteral, as in emitLiteral above.
    422	MOVW BX, 1(DI)
    423	ADDQ $3, DI
    424	JMP  inlineEmitLiteralMemmove
    425
    426inlineEmitLiteralTwoBytes:
    427	MOVB $0xf0, 0(DI)	// 0xf0 == 60<<2 | tagLiteral.
    428	MOVB BX, 1(DI)
    429	ADDQ $2, DI
    430	JMP  inlineEmitLiteralMemmove
    431
    432inlineEmitLiteralOneByte:
    433	SHLB $2, BX
    434	MOVB BX, 0(DI)
    435	ADDQ $1, DI
    436
    437inlineEmitLiteralMemmove:
    438	// Spill local variables (registers) onto the stack; call; unspill.
    439	//
    440	// copy(dst[i:], lit)
    441	//
    442	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
    443	// DI, R10 and AX as arguments.
    444	MOVQ DI, 0(SP)
    445	MOVQ R10, 8(SP)
    446	MOVQ AX, 16(SP)
    447	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
    448	MOVQ SI, 72(SP)
    449	MOVQ DI, 80(SP)
    450	MOVQ R15, 112(SP)
    451	CALL runtime·memmove(SB)
    452	MOVQ 56(SP), CX
    453	MOVQ 64(SP), DX
    454	MOVQ 72(SP), SI
    455	MOVQ 80(SP), DI
    456	MOVQ 88(SP), R9
    457	MOVQ 112(SP), R15
    458	JMP  inner1
    459
    460inlineEmitLiteralEnd:
    461	// End inline of the emitLiteral call.
    462	// ----------------------------------------
    463
    464emitLiteralFastPath:
    465	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
    466	MOVB AX, BX
    467	SUBB $1, BX
    468	SHLB $2, BX
    469	MOVB BX, (DI)
    470	ADDQ $1, DI
    471
    472	// !!! Implement the copy from lit to dst as a 16-byte load and store.
    473	// (Encode's documentation says that dst and src must not overlap.)
    474	//
    475	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
    476	// OK. Subsequent iterations will fix up the overrun.
    477	//
    478	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
    479	// 16-byte loads and stores. This technique probably wouldn't be as
    480	// effective on architectures that are fussier about alignment.
    481	MOVOU 0(R10), X0
    482	MOVOU X0, 0(DI)
    483	ADDQ  AX, DI
    484
    485inner1:
    486	// for { etc }
    487
    488	// base := s
    489	MOVQ SI, R12
    490
    491	// !!! offset := base - candidate
    492	MOVQ R12, R11
    493	SUBQ R15, R11
    494	SUBQ DX, R11
    495
    496	// ----------------------------------------
    497	// Begin inline of the extendMatch call.
    498	//
    499	// s = extendMatch(src, candidate+4, s+4)
    500
    501	// !!! R14 = &src[len(src)]
    502	MOVQ src_len+32(FP), R14
    503	ADDQ DX, R14
    504
    505	// !!! R13 = &src[len(src) - 8]
    506	MOVQ R14, R13
    507	SUBQ $8, R13
    508
    509	// !!! R15 = &src[candidate + 4]
    510	ADDQ $4, R15
    511	ADDQ DX, R15
    512
    513	// !!! s += 4
    514	ADDQ $4, SI
    515
    516inlineExtendMatchCmp8:
    517	// As long as we are 8 or more bytes before the end of src, we can load and
    518	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
    519	CMPQ SI, R13
    520	JA   inlineExtendMatchCmp1
    521	MOVQ (R15), AX
    522	MOVQ (SI), BX
    523	CMPQ AX, BX
    524	JNE  inlineExtendMatchBSF
    525	ADDQ $8, R15
    526	ADDQ $8, SI
    527	JMP  inlineExtendMatchCmp8
    528
    529inlineExtendMatchBSF:
    530	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
    531	// the index of the first byte that differs. The BSF instruction finds the
    532	// least significant 1 bit, the amd64 architecture is little-endian, and
    533	// the shift by 3 converts a bit index to a byte index.
    534	XORQ AX, BX
    535	BSFQ BX, BX
    536	SHRQ $3, BX
    537	ADDQ BX, SI
    538	JMP  inlineExtendMatchEnd
    539
    540inlineExtendMatchCmp1:
    541	// In src's tail, compare 1 byte at a time.
    542	CMPQ SI, R14
    543	JAE  inlineExtendMatchEnd
    544	MOVB (R15), AX
    545	MOVB (SI), BX
    546	CMPB AX, BX
    547	JNE  inlineExtendMatchEnd
    548	ADDQ $1, R15
    549	ADDQ $1, SI
    550	JMP  inlineExtendMatchCmp1
    551
    552inlineExtendMatchEnd:
    553	// End inline of the extendMatch call.
    554	// ----------------------------------------
    555
    556	// ----------------------------------------
    557	// Begin inline of the emitCopy call.
    558	//
    559	// d += emitCopy(dst[d:], base-candidate, s-base)
    560
    561	// !!! length := s - base
    562	MOVQ SI, AX
    563	SUBQ R12, AX
    564
    565inlineEmitCopyLoop0:
    566	// for length >= 68 { etc }
    567	CMPL AX, $68
    568	JLT  inlineEmitCopyStep1
    569
    570	// Emit a length 64 copy, encoded as 3 bytes.
    571	MOVB $0xfe, 0(DI)	// 0xfe == 63<<2 | tagCopy2, as in emitCopy above.
    572	MOVW R11, 1(DI)
    573	ADDQ $3, DI
    574	SUBL $64, AX
    575	JMP  inlineEmitCopyLoop0
    576
    577inlineEmitCopyStep1:
    578	// if length > 64 { etc }
    579	CMPL AX, $64
    580	JLE  inlineEmitCopyStep2
    581
    582	// Emit a length 60 copy, encoded as 3 bytes.
    583	MOVB $0xee, 0(DI)	// 0xee == 59<<2 | tagCopy2.
    584	MOVW R11, 1(DI)
    585	ADDQ $3, DI
    586	SUBL $60, AX
    587
    588inlineEmitCopyStep2:
    589	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
    590	CMPL AX, $12
    591	JGE  inlineEmitCopyStep3
    592	CMPL R11, $2048
    593	JGE  inlineEmitCopyStep3
    594
    595	// Emit the remaining copy, encoded as 2 bytes (the tagCopy1 form).
    596	MOVB R11, 1(DI)
    597	SHRL $8, R11
    598	SHLB $5, R11
    599	SUBB $4, AX
    600	SHLB $2, AX
    601	ORB  AX, R11
    602	ORB  $1, R11
    603	MOVB R11, 0(DI)
    604	ADDQ $2, DI
    605	JMP  inlineEmitCopyEnd
    606
    607inlineEmitCopyStep3:
    608	// Emit the remaining copy, encoded as 3 bytes (the tagCopy2 form).
    609	SUBL $1, AX
    610	SHLB $2, AX
    611	ORB  $2, AX
    612	MOVB AX, 0(DI)
    613	MOVW R11, 1(DI)
    614	ADDQ $3, DI
    615
    616inlineEmitCopyEnd:
    617	// End inline of the emitCopy call.
    618	// ----------------------------------------
    619
    620	// nextEmit = s
    621	MOVQ SI, R10
    622
    623	// if s >= sLimit { goto emitRemainder }
    624	MOVQ SI, AX
    625	SUBQ DX, AX
    626	CMPQ AX, R9
    627	JAE  emitRemainder
    628
    629	// As per the encode_other.go code:
    630	//
    631	// We could immediately etc.
    632
    633	// x := load64(src, s-1)
    634	MOVQ -1(SI), R14
    635
    636	// prevHash := hash(uint32(x>>0), shift)
    637	MOVL  R14, R11
    638	IMULL $0x1e35a7bd, R11
    639	SHRL  CX, R11
    640
    641	// table[prevHash] = uint16(s-1)
    642	MOVQ SI, AX
    643	SUBQ DX, AX
    644	SUBQ $1, AX
    645
    646	// XXX: MOVW AX, table-32768(SP)(R11*2)
    647	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
    648	BYTE $0x66
    649	BYTE $0x42
    650	BYTE $0x89
    651	BYTE $0x44
    652	BYTE $0x5c
    653	BYTE $0x78
    654
    655	// currHash := hash(uint32(x>>8), shift)
    656	SHRQ  $8, R14
    657	MOVL  R14, R11
    658	IMULL $0x1e35a7bd, R11
    659	SHRL  CX, R11
    660
    661	// candidate = int(table[currHash])
    662	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
    663	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
    664	BYTE $0x4e
    665	BYTE $0x0f
    666	BYTE $0xb7
    667	BYTE $0x7c
    668	BYTE $0x5c
    669	BYTE $0x78
    670
    671	// table[currHash] = uint16(s)
    672	ADDQ $1, AX	// AX was uint16(s-1) from above; bump it back to uint16(s).
    673
    674	// XXX: MOVW AX, table-32768(SP)(R11*2)
    675	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
    676	BYTE $0x66
    677	BYTE $0x42
    678	BYTE $0x89
    679	BYTE $0x44
    680	BYTE $0x5c
    681	BYTE $0x78
    682
    683	// if uint32(x>>8) == load32(src, candidate) { continue }
    684	MOVL (DX)(R15*1), BX
    685	CMPL R14, BX
    686	JEQ  inner1
    687
    688	// nextHash = hash(uint32(x>>16), shift)
    689	SHRQ  $8, R14	// R14 already held x>>8, so this yields x>>16.
    690	MOVL  R14, R11
    691	IMULL $0x1e35a7bd, R11
    692	SHRL  CX, R11
    693
    694	// s++
    695	ADDQ $1, SI
    696
    697	// break out of the inner1 for loop, i.e. continue the outer loop.
    698	JMP outer
    699
    700emitRemainder:
    701	// if nextEmit < len(src) { etc }
    702	MOVQ src_len+32(FP), AX
    703	ADDQ DX, AX
    704	CMPQ R10, AX
    705	JEQ  encodeBlockEnd
    706
    707	// d += emitLiteral(dst[d:], src[nextEmit:])
    708	//
    709	// Push args.
    710	MOVQ DI, 0(SP)
    711	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
    712	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
    713	MOVQ R10, 24(SP)
    714	SUBQ R10, AX
    715	MOVQ AX, 32(SP)
    716	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
    717
    718	// Spill local variables (registers) onto the stack; call; unspill.
    719	MOVQ DI, 80(SP)
    720	CALL ·emitLiteral(SB)
    721	MOVQ 80(SP), DI
    722
    723	// Finish the "d +=" part of "d += emitLiteral(etc)".
    724	ADDQ 48(SP), DI
    725
    726encodeBlockEnd:
    727	MOVQ dst_base+0(FP), AX
    728	SUBQ AX, DI
    729	MOVQ DI, d+48(FP)
    730	RET

View as plain text