1//go:build !appengine && gc && !purego
2// +build !appengine
3// +build gc
4// +build !purego
5
6#include "textflag.h"
7
// Register assignments shared by Sum64 and writeBlocks.

// AX does double duty: h is the running hash in Sum64 and d is the digest
// pointer in writeBlocks. Neither function uses both names.
#define h AX
#define d AX

// p is the cursor that advances through b, n is the length of b, and end
// is the bound the loops compare p against.
#define p   SI
#define n   DX
#define end BX

// The four accumulators updated by the 32-byte block loop.
#define v1 R8
#define v2 R9
#define v3 R10
#define v4 R11

// Scratch register holding the input value currently being mixed in.
#define x R12

// Constants loaded from ·primes at function entry.
#define prime1 R13
#define prime2 R14
#define prime4 DI
22
// round folds one 8-byte input value into an accumulator:
//   acc = rotl(acc + in*prime2, 31) * prime1
// The input register is overwritten with in*prime2. It assumes that prime1
// and prime2 have been loaded.
#define round(acc, in) \
	IMULQ prime2, in  \
	ADDQ  in, acc     \
	ROLQ  $31, acc    \
	IMULQ prime1, acc
28
// round0 is round with a zero accumulator, computed in place:
//   v = rotl(v*prime2, 31) * prime1
// It assumes that prime1 and prime2 have been loaded.
#define round0(v) \
	IMULQ prime2, v \
	ROLQ  $31, v    \
	IMULQ prime1, v
34
// mergeRound mixes the register val into the register acc:
//   val = rotl(val*prime2, 31) * prime1   (the round0 step, written out)
//   acc = (acc ^ val) * prime1 + prime4
// val is clobbered. prime1, prime2, and prime4 must already be loaded.
#define mergeRound(acc, val) \
	IMULQ prime2, val \
	ROLQ  $31, val    \
	IMULQ prime1, val \
	XORQ  val, acc    \
	IMULQ prime1, acc \
	ADDQ  prime4, acc
42
// blockLoop consumes 32-byte blocks starting at p for as long as p <= end,
// feeding the four 8-byte words of each block into v1, v2, v3, and v4 via
// round and advancing p by 32 per iteration. The body runs before the first
// bounds check, so the caller must guarantee at least one full block, and
// end must be the address of the last position where a block may start.
// x is used as scratch.
#define blockLoop() \
nextBlock: \
	MOVQ 0(p), x  \
	round(v1, x)  \
	MOVQ 8(p), x  \
	round(v2, x)  \
	MOVQ 16(p), x \
	round(v3, x)  \
	MOVQ 24(p), x \
	round(v4, x)  \
	ADDQ $32, p   \
	CMPQ p, end   \
	JLE  nextBlock
59
// func Sum64(b []byte) uint64
//
// Sum64 hashes b in one pass. Inputs of 32 bytes or more first run through
// the four-accumulator block loop; whatever is left is then absorbed 8 bytes,
// 4 bytes, and 1 byte at a time before the final mixing sequence.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	// Fetch the slice. end starts out at base+len(b)-32, the last address
	// at which a full 32-byte block can begin.
	MOVQ b_base+0(FP), p
	MOVQ b_len+8(FP), n
	LEAQ -32(p)(n*1), end

	// Keep the primes used by the round macros in registers.
	MOVQ ·primes+0(SB), prime1
	MOVQ ·primes+8(SB), prime2
	MOVQ ·primes+24(SB), prime4

	// Inputs shorter than one block skip the accumulator stage entirely.
	CMPQ n, $32
	JLT  shortInput

	// Seed the accumulators:
	//   v1 = prime1 + prime2, v2 = prime2, v3 = 0, v4 = -prime1
	MOVQ prime2, v2
	MOVQ prime1, v1
	ADDQ prime2, v1
	XORQ v3, v3
	MOVQ prime1, v4
	NEGQ v4

	blockLoop()

	// Combine the accumulators:
	//   h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18)
	MOVQ v1, h
	ROLQ $1, h
	MOVQ v2, x
	ROLQ $7, x
	ADDQ x, h
	MOVQ v3, x
	ROLQ $12, x
	ADDQ x, h
	MOVQ v4, x
	ROLQ $18, x
	ADDQ x, h

	// Then merge each accumulator into h (this clobbers v1..v4).
	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

	JMP addLength

shortInput:
	// No blocks were processed: h starts from the constant at primes+32.
	MOVQ ·primes+32(SB), h

addLength:
	ADDQ n, h

	// Move end to base+len(b)-8. With fewer than 8 bytes left end is
	// below p, and the signed comparison skips the 8-byte loop.
	ADDQ $24, end
	CMPQ p, end
	JGT  tail4

tail8:
	// Absorb one 8-byte word:
	//   h = rotl(h ^ round0(word), 27) * prime1 + prime4
	MOVQ  (p), x
	round0(x)
	XORQ  x, h
	ROLQ  $27, h
	IMULQ prime1, h
	ADDQ  prime4, h

	ADDQ $8, p
	CMPQ p, end
	JLE  tail8

tail4:
	// Move end to base+len(b)-4; at most one 4-byte word can remain.
	ADDQ $4, end
	CMPQ p, end
	JGT  tail1

	// MOVL zero-extends the 32-bit load into x:
	//   h = rotl(h ^ (word * prime1), 23) * prime2 + primes[2]
	MOVL  (p), x
	IMULQ prime1, x
	XORQ  x, h
	ROLQ  $23, h
	IMULQ prime2, h
	ADDQ  ·primes+16(SB), h
	ADDQ  $4, p

tail1:
	// Move end to base+len(b), one past the final byte.
	ADDQ $4, end
	CMPQ p, end
	JGE  mix

tailByte:
	// Absorb one byte:
	//   h = rotl(h ^ (byte * primes[4]), 11) * prime1
	MOVBQZX (p), x
	IMULQ   ·primes+32(SB), x
	XORQ    x, h
	ROLQ    $11, h
	IMULQ   prime1, h

	ADDQ $1, p
	CMPQ p, end
	JLT  tailByte

mix:
	// Final mixing:
	//   h ^= h >> 33; h *= prime2
	//   h ^= h >> 29; h *= primes[2]
	//   h ^= h >> 32
	MOVQ  h, x
	SHRQ  $33, x
	XORQ  x, h
	IMULQ prime2, h
	MOVQ  h, x
	SHRQ  $29, x
	XORQ  x, h
	IMULQ ·primes+16(SB), h
	MOVQ  h, x
	SHRQ  $32, x
	XORQ  x, h

	MOVQ h, ret+24(FP)
	RET
175
// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks runs the 32-byte block loop over b using the four accumulators
// stored in the first 32 bytes of *d, writes the updated accumulators back,
// and returns how many bytes of b were consumed. There is no length check:
// the caller must pass at least one full block.
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// Pull the accumulators out of the digest.
	// NOTE(review): the argument is spelled s+0(FP) rather than d+0(FP),
	// presumably because d is #defined as a register in this file and
	// would be macro-expanded inside the operand. Confirm before renaming.
	MOVQ s+0(FP), d
	MOVQ 0(d), v1
	MOVQ 8(d), v2
	MOVQ 16(d), v3
	MOVQ 24(d), v4

	// Fetch the slice. end is base+len(b)-32, the last address at which a
	// full block can begin.
	MOVQ b_base+8(FP), p
	MOVQ b_len+16(FP), n
	LEAQ -32(p)(n*1), end

	// round only needs prime1 and prime2.
	MOVQ ·primes+0(SB), prime1
	MOVQ ·primes+8(SB), prime2

	blockLoop()

	// Store the updated accumulators back into the digest.
	MOVQ v1, 0(d)
	MOVQ v2, 8(d)
	MOVQ v3, 16(d)
	MOVQ v4, 24(d)

	// Bytes consumed = final cursor minus the original base pointer.
	SUBQ b_base+8(FP), p
	MOVQ p, ret+32(FP)

	RET
View as plain text