...

Text file src/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s

Documentation: github.com/klauspost/compress/zstd/internal/xxhash

     1//go:build !appengine && gc && !purego && !noasm
     2// +build !appengine
     3// +build gc
     4// +build !purego
     5// +build !noasm
     6
     7#include "textflag.h"
     8
     9// Registers:
      // Symbolic names for the fixed register assignment shared by the macros and
      // both TEXT bodies in this file. h and d both alias AX: h is used only by
      // Sum64 and d only by writeBlocks.
    10#define h      AX // running hash value (Sum64)
    11#define d      AX // pointer to the Digest state words (writeBlocks)
    12#define p      SI // pointer to advance through b
    13#define n      DX // len(b)
    14#define end    BX // loop end
    15#define v1     R8 // v1..v4: the four accumulators updated by blockLoop
    16#define v2     R9
    17#define v3     R10
    18#define v4     R11
    19#define x      R12 // scratch: holds each input word; clobbered by the round macros
    20#define prime1 R13 // loaded from ·primes+0(SB)
    21#define prime2 R14 // loaded from ·primes+8(SB)
    22#define prime4 DI // loaded from ·primes+24(SB); only Sum64 loads it
    23
    // round mixes one 8-byte input word x into the accumulator acc:
    //   acc = rotl64(acc + x*prime2, 31) * prime1
    // x is overwritten with x*prime2. It assumes that prime1 and prime2 have
    // been loaded.
    24#define round(acc, x) \
    25	IMULQ prime2, x   \
    26	ADDQ  x, acc      \
    27	ROLQ  $31, acc    \
    28	IMULQ prime1, acc
    29
    30// round0 performs the operation x = round(0, x).
      // That is, x = rotl64(x*prime2, 31) * prime1, computed in place in x.
      // It assumes that prime1 and prime2 have been loaded.
    31#define round0(x) \
    32	IMULQ prime2, x \
    33	ROLQ  $31, x    \
    34	IMULQ prime1, x
    35
    36// mergeRound applies a merge round on the two registers acc and x.
    37// It assumes that prime1, prime2, and prime4 have been loaded.
      // Effect: x = round0(x), then acc = (acc ^ x)*prime1 + prime4.
      // x is left holding round0(x), not its original value.
    38#define mergeRound(acc, x) \
    39	round0(x)         \
    40	XORQ  x, acc      \
    41	IMULQ prime1, acc \
    42	ADDQ  prime4, acc
    43
    44// blockLoop processes as many 32-byte blocks as possible,
    45// updating v1, v2, v3, and v4. It assumes that there is at least one block
    46// to process.
      //
      // On entry p points at the first block and end holds the loop limit; both
      // callers in this file set end = base + len - 32. Each iteration loads the
      // four 8-byte words at p+0, p+8, p+16, and p+24, feeds them through round
      // into v1, v2, v3, and v4 respectively, and advances p by 32. The loop
      // repeats while p <= end (signed comparison), so the body always runs once
      // before the first test, and p is greater than end on exit.
      //
      // x is clobbered. prime1 and prime2 must already be loaded for round.
      //
      // The expansion defines a label named loop; each TEXT body in this file
      // expands the macro exactly once.
    47#define blockLoop() \
    48loop:  \
    49	MOVQ +0(p), x  \
    50	round(v1, x)   \
    51	MOVQ +8(p), x  \
    52	round(v2, x)   \
    53	MOVQ +16(p), x \
    54	round(v3, x)   \
    55	MOVQ +24(p), x \
    56	round(v4, x)   \
    57	ADDQ $32, p    \
    58	CMPQ p, end    \
    59	JLE  loop
    60
    61// func Sum64(b []byte) uint64
      //
      // Sum64 hashes the bytes of b and stores the 64-bit result in ret+24(FP).
      // The slice is read from b_base+0(FP) and b_len+8(FP). The TEXT directive
      // declares no local stack ($0) and 32 bytes of arguments plus result.
      //
      // Outline:
      //   1. If len(b) >= 32, run blockLoop over the 32-byte blocks and fold
      //      v1..v4 into h. Otherwise h starts as the word at ·primes+32(SB).
      //   2. h += len(b).
      //   3. Consume the remaining input 8 bytes at a time (loop8), then at most
      //      one 4-byte word (try4), then single bytes (loop1).
      //   4. Apply the shift/xor/multiply finalization and return h.
      //
      // ·primes is defined outside this file. The words at offsets 16 and 32 are
      // used directly as memory operands (presumably prime3 and prime5; confirm
      // against the Go declaration).
      //
      // NOTE(review): every pointer test below uses a signed jump (JG/JLE/JGE/JL).
      // end is derived by subtracting from base+len, so it can be below p, and it
      // would be negative if b's base pointer were zero (e.g. a nil slice, if a
      // caller can pass one). Presumably that is why signed jumps are used;
      // confirm before changing them to unsigned forms.
    62TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
    63	// Load fixed primes.
    64	MOVQ ·primes+0(SB), prime1
    65	MOVQ ·primes+8(SB), prime2
    66	MOVQ ·primes+24(SB), prime4
    67
    68	// Load slice.
    69	MOVQ b_base+0(FP), p
    70	MOVQ b_len+8(FP), n
    71	LEAQ (p)(n*1), end // end = base + len(b)
    72
    73	// The first loop limit will be len(b)-32.
    74	SUBQ $32, end // end = base + len(b) - 32
    75
    76	// Check whether we have at least one block.
    77	CMPQ n, $32
    78	JLT  noBlocks
    79
    80	// Set up initial state (v1, v2, v3, v4).
    81	MOVQ prime1, v1
    82	ADDQ prime2, v1 // v1 = prime1 + prime2
    83	MOVQ prime2, v2 // v2 = prime2
    84	XORQ v3, v3 // v3 = 0
    85	XORQ v4, v4
    86	SUBQ prime1, v4 // v4 = 0 - prime1
    87
      	// Consume every full 32-byte block; afterwards p points just past the
      	// last block processed.
    88	blockLoop()
    89
      	// Combine the four accumulators:
      	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18).
      	// v1 is copied into h; v2..v4 are rotated in the scratch register x, so
      	// v1..v4 themselves are still intact for the merge rounds below.
    90	MOVQ v1, h
    91	ROLQ $1, h
    92	MOVQ v2, x
    93	ROLQ $7, x
    94	ADDQ x, h
    95	MOVQ v3, x
    96	ROLQ $12, x
    97	ADDQ x, h
    98	MOVQ v4, x
    99	ROLQ $18, x
   100	ADDQ x, h
   101
      	// Fold each accumulator into h. Each mergeRound overwrites its vN
      	// argument, which is not read again afterwards.
   102	mergeRound(h, v1)
   103	mergeRound(h, v2)
   104	mergeRound(h, v3)
   105	mergeRound(h, v4)
   106
   107	JMP afterBlocks
   108
      	// Fewer than 32 bytes of input: h starts from the word at ·primes+32(SB)
      	// and p still points at the first byte of b.
   109noBlocks:
   110	MOVQ ·primes+32(SB), h
   111
   112afterBlocks:
   113	ADDQ n, h // h += len(b)
   114
   115	ADDQ $24, end // end = base + len(b) - 8
   116	CMPQ p, end
   117	JG   try4 // fewer than 8 bytes remain
   118
      	// For each remaining 8-byte word w:
      	// h = rotl(h ^ round0(w), 27)*prime1 + prime4.
   119loop8:
   120	MOVQ  (p), x
   121	ADDQ  $8, p
   122	round0(x)
   123	XORQ  x, h
   124	ROLQ  $27, h
   125	IMULQ prime1, h
   126	ADDQ  prime4, h
   127
   128	CMPQ p, end
   129	JLE  loop8
   130
   131try4:
   132	ADDQ $4, end // end = base + len(b) - 4
   133	CMPQ p, end
   134	JG   try1 // fewer than 4 bytes remain
   135
      	// One 4-byte word w (32-bit load into x):
      	// h = rotl(h ^ w*prime1, 23)*prime2 + (word at ·primes+16(SB)).
   136	MOVL  (p), x
   137	ADDQ  $4, p
   138	IMULQ prime1, x
   139	XORQ  x, h
   140
   141	ROLQ  $23, h
   142	IMULQ prime2, h
   143	ADDQ  ·primes+16(SB), h
   144
   145try1:
   146	ADDQ $4, end // end = base + len(b), one past the last input byte
   147	CMPQ p, end
   148	JGE  finalize // no bytes remain
   149
      	// For each remaining byte c (zero-extended into x):
      	// h = rotl(h ^ c*(word at ·primes+32(SB)), 11)*prime1.
   150loop1:
   151	MOVBQZX (p), x
   152	ADDQ    $1, p
   153	IMULQ   ·primes+32(SB), x
   154	XORQ    x, h
   155	ROLQ    $11, h
   156	IMULQ   prime1, h
   157
   158	CMPQ p, end
   159	JL   loop1
   160
      	// Final mixing:
      	// h ^= h >> 33; h *= prime2; h ^= h >> 29;
      	// h *= (word at ·primes+16(SB)); h ^= h >> 32.
   161finalize:
   162	MOVQ  h, x
   163	SHRQ  $33, x
   164	XORQ  x, h
   165	IMULQ prime2, h
   166	MOVQ  h, x
   167	SHRQ  $29, x
   168	XORQ  x, h
   169	IMULQ ·primes+16(SB), h
   170	MOVQ  h, x
   171	SHRQ  $32, x
   172	XORQ  x, h
   173
   174	MOVQ h, ret+24(FP)
   175	RET
   176
   177// func writeBlocks(d *Digest, b []byte) int
      //
      // writeBlocks runs blockLoop over b, starting from the four 8-byte state
      // words stored at offsets 0, 8, 16, and 24 of the Digest, and writes the
      // updated words back to the same offsets. It returns p minus b's base
      // pointer, i.e. the number of bytes consumed; p only advances in steps of
      // 32, so the result is a multiple of 32.
      //
      // Frame layout ($0-40): s+0(FP) is the Digest pointer, b_base+8(FP) and
      // b_len+16(FP) describe b, and ret+32(FP) receives the result.
      //
      // There is no length check: blockLoop executes its body once before testing
      // the limit, so 32 bytes are read even if len(b) < 32. Callers must pass at
      // least one full block.
   178TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
   179	// Load fixed primes needed for round.
      	// prime4 is not loaded: round uses only prime1 and prime2.
   180	MOVQ ·primes+0(SB), prime1
   181	MOVQ ·primes+8(SB), prime2
   182
   183	// Load slice.
   184	MOVQ b_base+8(FP), p
   185	MOVQ b_len+16(FP), n
   186	LEAQ (p)(n*1), end
   187	SUBQ $32, end // end = base + len(b) - 32, the limit blockLoop tests against
   188
   189	// Load vN from d.
      	// NOTE(review): the frame slot is named s, not d as in the prototype
      	// comment above. d is a register macro (AX) in this file, so d+0(FP)
      	// would not name the argument here; confirm the parameter name used in
      	// the Go declaration.
   190	MOVQ s+0(FP), d
   191	MOVQ 0(d), v1
   192	MOVQ 8(d), v2
   193	MOVQ 16(d), v3
   194	MOVQ 24(d), v4
   195
   196	// We don't need to check the loop condition here; this function is
   197	// always called with at least one block of data to process.
   198	blockLoop()
   199
   200	// Copy vN back to d.
   201	MOVQ v1, 0(d)
   202	MOVQ v2, 8(d)
   203	MOVQ v3, 16(d)
   204	MOVQ v4, 24(d)
   205
   206	// The number of bytes written is p minus the old base pointer.
   207	SUBQ b_base+8(FP), p
   208	MOVQ p, ret+32(FP)
   209
   210	RET

View as plain text