//go:build !appengine && gc && !purego && !noasm
// +build !appengine
// +build gc
// +build !purego
// +build !noasm

#include "textflag.h"

// Registers:
//
// Symbolic names for the machine registers used by the routines in this file.
// h and d deliberately share AX: Sum64 only uses h (the running hash) and
// writeBlocks only uses d (the digest pointer), so the two never coexist.
// prime1, prime2 and prime4 hold constants loaded from the primes table at
// the start of each routine; x is the scratch register the round macros
// consume.
#define h      AX
#define d      AX
#define p      SI // pointer to advance through b
#define n      DX
#define end    BX // loop end
#define v1     R8
#define v2     R9
#define v3     R10
#define v4     R11
#define x      R12
#define prime1 R13
#define prime2 R14
#define prime4 DI

// round folds the input word x into the accumulator acc:
//
//	acc = rotl64(acc + x*prime2, 31) * prime1
//
// It assumes that prime1 and prime2 have been loaded. x is clobbered (it is
// left holding x*prime2).
#define round(acc, x) \
	IMULQ prime2, x   \
	ADDQ  x, acc      \
	ROLQ  $31, acc    \
	IMULQ prime1, acc

// round0 performs the operation x = round(0, x), i.e.
//
//	x = rotl64(x*prime2, 31) * prime1
//
// It works in place on x and needs no separate accumulator. It assumes that
// prime1 and prime2 have been loaded.
#define round0(x) \
	IMULQ prime2, x \
	ROLQ  $31, x    \
	IMULQ prime1, x

// mergeRound applies a merge round on the two registers acc and x:
//
//	x   = round0(x)
//	acc = (acc ^ x)*prime1 + prime4
//
// It assumes that prime1, prime2, and prime4 have been loaded.
// x is clobbered (it is left holding round0(x)).
#define mergeRound(acc, x) \
	round0(x)         \
	XORQ  x, acc      \
	IMULQ prime1, acc \
	ADDQ  prime4, acc

// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
//
// Contract with the caller:
//   - p points at the first unread byte and end holds base+len-32, so the
//     loop keeps running while a full 32-byte block is still available
//     (p <= end, signed comparison).
//   - The body runs once before the condition is first tested, which is why
//     at least one full block must be present.
//   - prime1 and prime2 must be loaded (they are used by round).
//   - On exit p has advanced by 32 for every block consumed; x is clobbered.
//
// The expansion defines the label "loop", so the macro can be used only once
// per TEXT block.
#define blockLoop() \
loop:  \
	MOVQ +0(p), x  \
	round(v1, x)   \
	MOVQ +8(p), x  \
	round(v2, x)   \
	MOVQ +16(p), x \
	round(v3, x)   \
	MOVQ +24(p), x \
	round(v4, x)   \
	ADDQ $32, p    \
	CMPQ p, end    \
	JLE  loop

// func Sum64(b []byte) uint64
//
// Sum64 hashes b in a single pass and returns the 64-bit digest.
//
// Stack layout ($0-32, no local frame): b_base+0, b_len+8, (cap at +16 is
// not read), ret+24.
//
// Phases:
//  1. If len(b) >= 32, run blockLoop over every full 32-byte block and merge
//     the four lanes v1..v4 into h; otherwise seed h from primes+32.
//  2. Add len(b) to h.
//  3. Consume the tail: 8 bytes at a time, then at most one 4-byte word,
//     then single bytes.
//  4. Apply the final shift/xor/multiply mixing and store the result.
//
// NOTE(review): the pointer comparisons below are signed (JG/JLE/JGE/JL).
// That looks deliberate: for short inputs "end" is temporarily below the
// base pointer, and for a nil base with len 0 it is negative, so an unsigned
// compare would wrongly enter the tail loops. Confirm before changing them.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	// Load fixed primes.
	MOVQ ·primes+0(SB), prime1
	MOVQ ·primes+8(SB), prime2
	MOVQ ·primes+24(SB), prime4

	// Load slice.
	MOVQ b_base+0(FP), p
	MOVQ b_len+8(FP), n
	LEAQ (p)(n*1), end

	// The first loop limit will be len(b)-32.
	SUBQ $32, end

	// Check whether we have at least one block.
	CMPQ n, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	//   v1 = prime1 + prime2, v2 = prime2, v3 = 0, v4 = -prime1.
	MOVQ prime1, v1
	ADDQ prime2, v1
	MOVQ prime2, v2
	XORQ v3, v3
	XORQ v4, v4
	SUBQ prime1, v4

	blockLoop()

	// Combine the lanes:
	//   h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18).
	// The rotations are done on copies in x so v1..v4 stay intact for the
	// merge rounds that follow.
	MOVQ v1, h
	ROLQ $1, h
	MOVQ v2, x
	ROLQ $7, x
	ADDQ x, h
	MOVQ v3, x
	ROLQ $12, x
	ADDQ x, h
	MOVQ v4, x
	ROLQ $18, x
	ADDQ x, h

	// Mix each lane into h once more (each vN is clobbered by its merge).
	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes: start from the primes entry at byte offset 32.
	MOVQ ·primes+32(SB), h

afterBlocks:
	// Fold the total input length into the hash.
	ADDQ n, h

	// end was base+len-32; it becomes base+len-8, the last position from
	// which a full 8-byte word can still be read.
	ADDQ $24, end
	CMPQ p, end
	JG   try4

loop8:
	// h = rotl(h ^ round0(word), 27)*prime1 + prime4, 8 bytes per iteration.
	MOVQ  (p), x
	ADDQ  $8, p
	round0(x)
	XORQ  x, h
	ROLQ  $27, h
	IMULQ prime1, h
	ADDQ  prime4, h

	CMPQ p, end
	JLE  loop8

try4:
	// end becomes base+len-4: is there room for one 4-byte word?
	ADDQ $4, end
	CMPQ p, end
	JG   try1

	// 32-bit load; the upper half of x is zeroed by the MOVL.
	MOVL  (p), x
	ADDQ  $4, p
	IMULQ prime1, x
	XORQ  x, h

	// h = rotl(h, 23)*prime2 + (primes entry at byte offset 16).
	ROLQ  $23, h
	IMULQ prime2, h
	ADDQ  ·primes+16(SB), h

try1:
	// end becomes base+len, the true end of the input.
	ADDQ $4, end
	CMPQ p, end
	JGE  finalize

loop1:
	// One byte per iteration:
	//   h = rotl(h ^ byte*(primes entry at offset 32), 11) * prime1.
	MOVBQZX (p), x
	ADDQ    $1, p
	IMULQ   ·primes+32(SB), x
	XORQ    x, h
	ROLQ    $11, h
	IMULQ   prime1, h

	CMPQ p, end
	JL   loop1

finalize:
	// Final mixing:
	//   h ^= h >> 33; h *= prime2;
	//   h ^= h >> 29; h *= (primes entry at offset 16);
	//   h ^= h >> 32.
	MOVQ  h, x
	SHRQ  $33, x
	XORQ  x, h
	IMULQ prime2, h
	MOVQ  h, x
	SHRQ  $29, x
	XORQ  x, h
	IMULQ ·primes+16(SB), h
	MOVQ  h, x
	SHRQ  $32, x
	XORQ  x, h

	MOVQ h, ret+24(FP)
	RET

// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks runs blockLoop over every full 32-byte block of b, using the
// four 8-byte words at offsets 0, 8, 16 and 24 of *d as the lanes v1..v4,
// and writes the updated lanes back to the same offsets. It returns the
// number of bytes consumed, which is a multiple of 32; any trailing partial
// block is left untouched.
//
// Stack layout ($0-40, no local frame): digest pointer at +0, b_base+8,
// b_len+16, (cap at +24 is not read), ret+32.
//
// Precondition: len(b) >= 32. The loop body executes before its condition
// is tested, so a shorter input would be read out of bounds.
//
// NOTE(review): the first argument is addressed as s+0(FP) even though the
// Go-side parameter is named d. This appears intentional: d is a
// preprocessor macro for AX in this file, so writing d+0(FP) would be
// macro-expanded. Confirm before "fixing" the name.
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// Load fixed primes needed for round.
	MOVQ ·primes+0(SB), prime1
	MOVQ ·primes+8(SB), prime2

	// Load slice. end = base+len-32 is the last position at which a full
	// block still fits, as blockLoop expects.
	MOVQ b_base+8(FP), p
	MOVQ b_len+16(FP), n
	LEAQ (p)(n*1), end
	SUBQ $32, end

	// Load vN from d.
	MOVQ s+0(FP), d
	MOVQ 0(d), v1
	MOVQ 8(d), v2
	MOVQ 16(d), v3
	MOVQ 24(d), v4

	// We don't need to check the loop condition here; this function is
	// always called with at least one block of data to process.
	blockLoop()

	// Copy vN back to d.
	MOVQ v1, 0(d)
	MOVQ v2, 8(d)
	MOVQ v3, 16(d)
	MOVQ v4, 24(d)

	// The number of bytes written is p minus the old base pointer.
	SUBQ b_base+8(FP), p
	MOVQ p, ret+32(FP)

	RET