//go:build !appengine && gc && !purego
// +build !appengine
// +build gc
// +build !purego

#include "textflag.h"

// Registers:
//
// These aliases are shared by every macro and TEXT block in this file.
#define digest	R1 // pointer argument d of writeBlocks
#define h	R2 // return value
#define p	R3 // input pointer
#define n	R4 // input length
#define nblocks	R5 // n / 32

// Multiplicative constants, loaded from the ·primes array.
#define prime1	R7
#define prime2	R8
#define prime3	R9
#define prime4	R10
#define prime5	R11

// The four accumulators updated by blockLoop.
#define v1	R12
#define v2	R13
#define v3	R14
#define v4	R15

// Input words loaded from p; also used as scratch in Sum64.
#define x1	R20
#define x2	R21
#define x3	R22
#define x4	R23
27
// round folds the input word x into the accumulator acc:
//
//	acc = rotl(acc + x*prime2, 31) * prime1
//
// (ROR by 64-31 is a rotate left by 31.) x is left unmodified.
// prime1 and prime2 must already be loaded.
#define round(acc, x) \
	MADD prime2, acc, x, acc \
	ROR  $64-31, acc         \
	MUL  prime1, acc
32
// round0 performs the operation x = round(0, x), i.e.
//
//	x = rotl(x*prime2, 31) * prime1
//
// The result overwrites x. prime1 and prime2 must already be loaded.
#define round0(x) \
	MUL prime2, x \
	ROR $64-31, x \
	MUL prime1, x
38
// mergeRound mixes the value x into acc:
//
//	x   = round0(x)
//	acc = (acc ^ x) * prime1 + prime4
//
// x is clobbered (it is overwritten by round0). prime1, prime2 and prime4
// must already be loaded.
#define mergeRound(acc, x) \
	round0(x)                     \
	EOR  x, acc                   \
	MADD acc, prime4, prime1, acc
43
// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that n >= 32.
//
// The block count is tested only after it has been decremented, so the
// body always runs at least once; with n < 32 the count would start at
// zero and wrap instead of terminating.
//
// On exit p has been advanced by 32 bytes per block processed and n is
// unchanged. Clobbers nblocks and x1-x4. Requires prime1 and prime2 to be
// loaded. The macro defines the label "loop" and is expanded once in each
// TEXT block that uses it.
#define blockLoop() \
	LSR     $5, n, nblocks  \
	PCALIGN $16             \
	loop:                   \
	LDP.P   16(p), (x1, x2) \
	LDP.P   16(p), (x3, x4) \
	round(v1, x1)           \
	round(v2, x2)           \
	round(v3, x3)           \
	round(v4, x4)           \
	SUB     $1, nblocks     \
	CBNZ    nblocks, loop
58
// Sum64 hashes the byte slice b and returns a 64-bit value.
//
// Outline:
//   - If len(b) >= 32, four accumulators are seeded from prime1/prime2,
//     run over every full 32-byte block (blockLoop), and merged into h.
//     Otherwise h starts as prime5.
//   - len(b) is added to h.
//   - The remaining len(b) % 32 bytes are consumed in chunks of 16, 8, 4,
//     2 and 1 bytes, selected by testing bits 4..0 of n.
//   - h is passed through a final xor-shift/multiply mix.
//
// The structure follows XXH64, assuming ·primes (declared outside this
// file) holds the five XXH64 primes in order.
//
// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	// p = base pointer of b, n = len(b).
	LDP b_base+0(FP), (p, n)

	LDP  ·primes+0(SB), (prime1, prime2)
	LDP  ·primes+16(SB), (prime3, prime4)
	MOVD ·primes+32(SB), prime5

	CMP  $32, n
	CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
	BLT  afterLoop

	// Seed the accumulators:
	// v1 = prime1 + prime2, v2 = prime2, v3 = 0, v4 = -prime1.
	ADD  prime1, prime2, v1
	MOVD prime2, v2
	MOVD $0, v3
	NEG  prime1, v4

	blockLoop()

	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18)
	ROR $64-1, v1, x1
	ROR $64-7, v2, x2
	ADD x1, x2
	ROR $64-12, v3, x3
	ROR $64-18, v4, x4
	ADD x3, x4
	ADD x2, x4, h

	// Fold each accumulator into h; v1-v4 are clobbered by mergeRound.
	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

afterLoop:
	// p points at the first byte not consumed by the block loop (the
	// start of b when the loop was skipped).
	ADD n, h

	// 16-byte chunk (bit 4 of n): two 8-byte words, each applied as
	// h = rotl(h ^ round0(x), 27) * prime1 + prime4.
	TBZ   $4, n, try8
	LDP.P 16(p), (x1, x2)

	round0(x1)

	// NOTE: here and below, sequencing the EOR after the ROR (using a
	// rotated register) is worth a small but measurable speedup for small
	// inputs.
	ROR  $64-27, h
	EOR  x1 @> 64-27, h, h
	MADD h, prime4, prime1, h

	round0(x2)
	ROR  $64-27, h
	EOR  x2 @> 64-27, h, h
	MADD h, prime4, prime1, h

try8:
	// 8-byte chunk (bit 3 of n): same step as above for one word.
	TBZ    $3, n, try4
	MOVD.P 8(p), x1

	round0(x1)
	ROR  $64-27, h
	EOR  x1 @> 64-27, h, h
	MADD h, prime4, prime1, h

try4:
	// 4-byte chunk (bit 2 of n), zero-extended:
	// h = rotl(h ^ (x * prime1), 23) * prime2 + prime3.
	TBZ     $2, n, try2
	MOVWU.P 4(p), x2

	MUL  prime1, x2
	ROR  $64-23, h
	EOR  x2 @> 64-23, h, h
	MADD h, prime3, prime2, h

try2:
	// 2-byte chunk (bit 1 of n): one halfword load split into its low
	// byte (x1) and high byte (x2), then each byte is applied as
	// h = rotl(h ^ (byte * prime5), 11) * prime1.
	TBZ     $1, n, try1
	MOVHU.P 2(p), x3
	AND     $255, x3, x1
	LSR     $8, x3, x2

	MUL prime5, x1
	ROR $64-11, h
	EOR x1 @> 64-11, h, h
	MUL prime1, h

	MUL prime5, x2
	ROR $64-11, h
	EOR x2 @> 64-11, h, h
	MUL prime1, h

try1:
	// Final single byte (bit 0 of n), same per-byte step as above.
	TBZ   $0, n, finalize
	MOVBU (p), x4

	MUL prime5, x4
	ROR $64-11, h
	EOR x4 @> 64-11, h, h
	MUL prime1, h

finalize:
	// Final mix:
	// h ^= h >> 33; h *= prime2; h ^= h >> 29; h *= prime3; h ^= h >> 32.
	EOR h >> 33, h
	MUL prime2, h
	EOR h >> 29, h
	MUL prime3, h
	EOR h >> 32, h

	MOVD h, ret+24(FP)
	RET
163
// writeBlocks runs the 32-byte block loop over b, updating the four
// accumulators held in the first 32 bytes of *d, and returns len(b)
// rounded down to a multiple of 32 (len(b) with its low five bits
// cleared).
//
// There is no length check here: blockLoop requires len(b) >= 32, so the
// caller must guarantee it.
//
// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// Only prime1 and prime2 are needed by round().
	LDP ·primes+0(SB), (prime1, prime2)

	// Load state. Assume v[1-4] are stored contiguously.
	MOVD d+0(FP), digest
	LDP  0(digest), (v1, v2)
	LDP  16(digest), (v3, v4)

	// p = base pointer of b, n = len(b).
	LDP b_base+8(FP), (p, n)

	blockLoop()

	// Store updated state.
	STP (v1, v2), 0(digest)
	STP (v3, v4), 16(digest)

	// Return n &^ 31.
	BIC  $31, n
	MOVD n, ret+32(FP)
	RET