1//go:build !appengine && gc && !purego && !noasm
2// +build !appengine
3// +build gc
4// +build !purego
5// +build !noasm
6
7#include "textflag.h"
8
// Register aliases shared by the macros, Sum64 and writeBlocks.

// Pointers, lengths and the running result.
#define digest	R1 // pointer to the digest state (writeBlocks)
#define h	R2 // running hash; Sum64's return value
#define p	R3 // input pointer, advanced as bytes are consumed
#define n	R4 // input length in bytes
#define nblocks	R5 // n / 32, the count of whole 32-byte blocks

// Multiplicative constants, loaded from ·primes.
#define prime1	R7
#define prime2	R8
#define prime3	R9
#define prime4	R10
#define prime5	R11

// The four accumulator lanes updated by blockLoop.
#define v1	R12
#define v2	R13
#define v3	R14
#define v4	R15

// Scratch registers holding freshly loaded input words.
#define x1	R20
#define x2	R21
#define x3	R22
#define x4	R23
28
// round mixes one 64-bit input word into an accumulator lane:
//
//	lane = rotl64(lane + word*prime2, 31) * prime1
//
// prime1 and prime2 must already be loaded. Only lane is written.
#define round(lane, word) \
	MADD prime2, lane, word, lane \
	ROR  $64-31, lane, lane       \
	MUL  prime1, lane, lane
33
// round0 is round with a zero accumulator, computed in place:
//
//	word = rotl64(word*prime2, 31) * prime1
//
// prime1 and prime2 must already be loaded.
#define round0(word) \
	MUL prime2, word, word  \
	ROR $64-31, word, word  \
	MUL prime1, word, word
39
// mergeRound folds a finished lane into the hash:
//
//	lane = rotl64(lane*prime2, 31) * prime1   (i.e. round0, in place)
//	hash = (hash ^ lane)*prime1 + prime4
//
// lane is overwritten with its round0 value. prime1, prime2 and prime4
// must already be loaded.
#define mergeRound(hash, lane) \
	MUL  prime2, lane, lane          \
	ROR  $64-31, lane, lane          \
	MUL  prime1, lane, lane          \
	EOR  lane, hash, hash            \
	MADD hash, prime4, prime1, hash
44
// blockLoop consumes every whole 32-byte block of the input, feeding one
// 8-byte word per block into each of v1, v2, v3 and v4.
//
// On entry p points at the input and n holds its length; n must be at
// least 32, because the loop body always executes once before nblocks is
// tested. On exit p has advanced past the consumed blocks, nblocks is 0,
// and x1..x4 hold the words of the last block. n is not modified.
#define blockLoop() \
	LSR     $5, n, nblocks        \
	PCALIGN $16                   \
	blockLoopTop:                 \
	LDP.P   16(p), (x1, x2)       \
	LDP.P   16(p), (x3, x4)       \
	round(v1, x1)                 \
	round(v2, x2)                 \
	round(v3, x3)                 \
	round(v4, x4)                 \
	SUB     $1, nblocks, nblocks  \
	CBNZ    nblocks, blockLoopTop
59
// func Sum64(b []byte) uint64
//
// Sum64 hashes b in a single pass and returns the 64-bit result.
// Whole 32-byte blocks go through blockLoop; the remaining n%32 bytes
// are then consumed in 16-, 8-, 4-, 2- and 1-byte steps selected by the
// low bits of n, followed by a final bit-mixing stage.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	LDP	·primes+0(SB), (prime1, prime2)
	LDP	·primes+16(SB), (prime3, prime4)
	MOVD	·primes+32(SB), prime5

	LDP	b_base+0(FP), (p, n)

	// Inputs shorter than one block bypass the lanes entirely and start
	// from h = prime5; otherwise h starts at 0 and is rebuilt below.
	// CSEL leaves the flags from CMP intact for the branch.
	CMP	$32, n
	CSEL	LT, prime5, ZR, h
	BLT	tail

	// Seed the four lanes: prime1+prime2, prime2, 0, -prime1.
	NEG	prime1, v4
	MOVD	ZR, v3
	MOVD	prime2, v2
	ADD	prime1, prime2, v1

	blockLoop()

	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18)
	ROR	$64-1, v1, x1
	ROR	$64-7, v2, x2
	ROR	$64-12, v3, x3
	ROR	$64-18, v4, x4
	ADD	x1, x2, x2
	ADD	x3, x4, x4
	ADD	x2, x4, h

	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

tail:
	ADD	n, h, h

	// Bit 4 of n set: two more 8-byte words remain.
	TBZ	$4, n, tail8
	LDP.P	16(p), (x1, x2)

	round0(x1)

	// NOTE: throughout the tail, h is rotated first and the input is then
	// XORed in through a rotated-register operand. Ordering the EOR after
	// the ROR is worth a small but measurable speedup on small inputs.
	ROR	$64-27, h, h
	EOR	x1 @> 64-27, h, h
	MADD	h, prime4, prime1, h

	round0(x2)
	ROR	$64-27, h, h
	EOR	x2 @> 64-27, h, h
	MADD	h, prime4, prime1, h

tail8:
	// Bit 3 of n set: one more 8-byte word.
	TBZ	$3, n, tail4
	MOVD.P	8(p), x1

	round0(x1)
	ROR	$64-27, h, h
	EOR	x1 @> 64-27, h, h
	MADD	h, prime4, prime1, h

tail4:
	// Bit 2 of n set: one zero-extended 4-byte word.
	TBZ	$2, n, tail2
	MOVWU.P	4(p), x2

	MUL	prime1, x2, x2
	ROR	$64-23, h, h
	EOR	x2 @> 64-23, h, h
	MADD	h, prime3, prime2, h

tail2:
	// Bit 1 of n set: two bytes, fetched with one halfword load and then
	// mixed one at a time, low byte (x1) first, high byte (x2) second.
	TBZ	$1, n, tail1
	MOVHU.P	2(p), x3
	LSR	$8, x3, x2
	AND	$255, x3, x1

	MUL	prime5, x1, x1
	ROR	$64-11, h, h
	EOR	x1 @> 64-11, h, h
	MUL	prime1, h, h

	MUL	prime5, x2, x2
	ROR	$64-11, h, h
	EOR	x2 @> 64-11, h, h
	MUL	prime1, h, h

tail1:
	// Bit 0 of n set: one final byte.
	TBZ	$0, n, avalanche
	MOVBU	(p), x4

	MUL	prime5, x4, x4
	ROR	$64-11, h, h
	EOR	x4 @> 64-11, h, h
	MUL	prime1, h, h

avalanche:
	// Final mixing: alternate xor-shifts with multiplications.
	EOR	h >> 33, h, h
	MUL	prime2, h, h
	EOR	h >> 29, h, h
	MUL	prime3, h, h
	EOR	h >> 32, h, h

	MOVD	h, ret+24(FP)
	RET
164
// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks feeds every whole 32-byte block of b into the four
// accumulator lanes held in d and returns the number of bytes consumed,
// i.e. len(b) rounded down to a multiple of 32.
//
// If len(b) < 32 there is nothing to consume: d is left untouched and 0
// is returned. The explicit check is required because blockLoop always
// executes its body once before testing the block count, so entering it
// with fewer than 32 bytes would read past the end of b and then keep
// looping after the count wraps around.
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	LDP	b_base+8(FP), (p, n)

	// Enforce blockLoop's n >= 32 precondition (same test as Sum64).
	CMP	$32, n
	BLT	done

	LDP	·primes+0(SB), (prime1, prime2)

	// Load state. Assume v[1-4] are stored contiguously at the start of
	// the digest.
	MOVD	d+0(FP), digest
	LDP	0(digest), (v1, v2)
	LDP	16(digest), (v3, v4)

	blockLoop()

	// Store updated state.
	STP	(v1, v2), 0(digest)
	STP	(v3, v4), 16(digest)

done:
	// Bytes consumed = n with its low five bits cleared (0 when n < 32).
	BIC	$31, n
	MOVD	n, ret+32(FP)
	RET
View as plain text