1 package xxh3
2
3
4
// avx512Switch is the constant 1 << 10 (1024).
//
// NOTE(review): nothing in this part of the file refers to it; going by the
// name it is presumably the input-size threshold at which an AVX-512 code
// path is selected — confirm against its users.
const avx512Switch = 1 << 10
6
7 func accumScalar(accs *[8]u64, p, secret ptr, l u64) {
8 if secret != key {
9 accumScalarSeed(accs, p, secret, l)
10 return
11 }
12 for l > _block {
13 k := secret
14
15
16 for i := 0; i < 16; i++ {
17 dv0 := readU64(p, 8*0)
18 dk0 := dv0 ^ readU64(k, 8*0)
19 accs[1] += dv0
20 accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
21
22 dv1 := readU64(p, 8*1)
23 dk1 := dv1 ^ readU64(k, 8*1)
24 accs[0] += dv1
25 accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
26
27 dv2 := readU64(p, 8*2)
28 dk2 := dv2 ^ readU64(k, 8*2)
29 accs[3] += dv2
30 accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
31
32 dv3 := readU64(p, 8*3)
33 dk3 := dv3 ^ readU64(k, 8*3)
34 accs[2] += dv3
35 accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
36
37 dv4 := readU64(p, 8*4)
38 dk4 := dv4 ^ readU64(k, 8*4)
39 accs[5] += dv4
40 accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
41
42 dv5 := readU64(p, 8*5)
43 dk5 := dv5 ^ readU64(k, 8*5)
44 accs[4] += dv5
45 accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
46
47 dv6 := readU64(p, 8*6)
48 dk6 := dv6 ^ readU64(k, 8*6)
49 accs[7] += dv6
50 accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
51
52 dv7 := readU64(p, 8*7)
53 dk7 := dv7 ^ readU64(k, 8*7)
54 accs[6] += dv7
55 accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
56
57 l -= _stripe
58 if l > 0 {
59 p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
60 }
61 }
62
63
64 accs[0] ^= accs[0] >> 47
65 accs[0] ^= key64_128
66 accs[0] *= prime32_1
67
68 accs[1] ^= accs[1] >> 47
69 accs[1] ^= key64_136
70 accs[1] *= prime32_1
71
72 accs[2] ^= accs[2] >> 47
73 accs[2] ^= key64_144
74 accs[2] *= prime32_1
75
76 accs[3] ^= accs[3] >> 47
77 accs[3] ^= key64_152
78 accs[3] *= prime32_1
79
80 accs[4] ^= accs[4] >> 47
81 accs[4] ^= key64_160
82 accs[4] *= prime32_1
83
84 accs[5] ^= accs[5] >> 47
85 accs[5] ^= key64_168
86 accs[5] *= prime32_1
87
88 accs[6] ^= accs[6] >> 47
89 accs[6] ^= key64_176
90 accs[6] *= prime32_1
91
92 accs[7] ^= accs[7] >> 47
93 accs[7] ^= key64_184
94 accs[7] *= prime32_1
95 }
96
97 if l > 0 {
98 t, k := (l-1)/_stripe, secret
99
100 for i := u64(0); i < t; i++ {
101 dv0 := readU64(p, 8*0)
102 dk0 := dv0 ^ readU64(k, 8*0)
103 accs[1] += dv0
104 accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
105
106 dv1 := readU64(p, 8*1)
107 dk1 := dv1 ^ readU64(k, 8*1)
108 accs[0] += dv1
109 accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
110
111 dv2 := readU64(p, 8*2)
112 dk2 := dv2 ^ readU64(k, 8*2)
113 accs[3] += dv2
114 accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
115
116 dv3 := readU64(p, 8*3)
117 dk3 := dv3 ^ readU64(k, 8*3)
118 accs[2] += dv3
119 accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
120
121 dv4 := readU64(p, 8*4)
122 dk4 := dv4 ^ readU64(k, 8*4)
123 accs[5] += dv4
124 accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
125
126 dv5 := readU64(p, 8*5)
127 dk5 := dv5 ^ readU64(k, 8*5)
128 accs[4] += dv5
129 accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
130
131 dv6 := readU64(p, 8*6)
132 dk6 := dv6 ^ readU64(k, 8*6)
133 accs[7] += dv6
134 accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
135
136 dv7 := readU64(p, 8*7)
137 dk7 := dv7 ^ readU64(k, 8*7)
138 accs[6] += dv7
139 accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
140
141 l -= _stripe
142 if l > 0 {
143 p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
144 }
145 }
146
147 if l > 0 {
148 p = ptr(ui(p) - uintptr(_stripe-l))
149
150 dv0 := readU64(p, 8*0)
151 dk0 := dv0 ^ key64_121
152 accs[1] += dv0
153 accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
154
155 dv1 := readU64(p, 8*1)
156 dk1 := dv1 ^ key64_129
157 accs[0] += dv1
158 accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
159
160 dv2 := readU64(p, 8*2)
161 dk2 := dv2 ^ key64_137
162 accs[3] += dv2
163 accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
164
165 dv3 := readU64(p, 8*3)
166 dk3 := dv3 ^ key64_145
167 accs[2] += dv3
168 accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
169
170 dv4 := readU64(p, 8*4)
171 dk4 := dv4 ^ key64_153
172 accs[5] += dv4
173 accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
174
175 dv5 := readU64(p, 8*5)
176 dk5 := dv5 ^ key64_161
177 accs[4] += dv5
178 accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
179
180 dv6 := readU64(p, 8*6)
181 dk6 := dv6 ^ key64_169
182 accs[7] += dv6
183 accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
184
185 dv7 := readU64(p, 8*7)
186 dk7 := dv7 ^ key64_177
187 accs[6] += dv7
188 accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
189 }
190 }
191 }
192
193 func accumBlockScalar(accs *[8]u64, p, secret ptr) {
194 if secret != key {
195 accumBlockScalarSeed(accs, p, secret)
196 return
197 }
198
199 for i := 0; i < 16; i++ {
200 dv0 := readU64(p, 8*0)
201 dk0 := dv0 ^ readU64(secret, 8*0)
202 accs[1] += dv0
203 accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
204
205 dv1 := readU64(p, 8*1)
206 dk1 := dv1 ^ readU64(secret, 8*1)
207 accs[0] += dv1
208 accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
209
210 dv2 := readU64(p, 8*2)
211 dk2 := dv2 ^ readU64(secret, 8*2)
212 accs[3] += dv2
213 accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
214
215 dv3 := readU64(p, 8*3)
216 dk3 := dv3 ^ readU64(secret, 8*3)
217 accs[2] += dv3
218 accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
219
220 dv4 := readU64(p, 8*4)
221 dk4 := dv4 ^ readU64(secret, 8*4)
222 accs[5] += dv4
223 accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
224
225 dv5 := readU64(p, 8*5)
226 dk5 := dv5 ^ readU64(secret, 8*5)
227 accs[4] += dv5
228 accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
229
230 dv6 := readU64(p, 8*6)
231 dk6 := dv6 ^ readU64(secret, 8*6)
232 accs[7] += dv6
233 accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
234
235 dv7 := readU64(p, 8*7)
236 dk7 := dv7 ^ readU64(secret, 8*7)
237 accs[6] += dv7
238 accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
239
240 p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
241 }
242
243
244 accs[0] ^= accs[0] >> 47
245 accs[0] ^= key64_128
246 accs[0] *= prime32_1
247
248 accs[1] ^= accs[1] >> 47
249 accs[1] ^= key64_136
250 accs[1] *= prime32_1
251
252 accs[2] ^= accs[2] >> 47
253 accs[2] ^= key64_144
254 accs[2] *= prime32_1
255
256 accs[3] ^= accs[3] >> 47
257 accs[3] ^= key64_152
258 accs[3] *= prime32_1
259
260 accs[4] ^= accs[4] >> 47
261 accs[4] ^= key64_160
262 accs[4] *= prime32_1
263
264 accs[5] ^= accs[5] >> 47
265 accs[5] ^= key64_168
266 accs[5] *= prime32_1
267
268 accs[6] ^= accs[6] >> 47
269 accs[6] ^= key64_176
270 accs[6] *= prime32_1
271
272 accs[7] ^= accs[7] >> 47
273 accs[7] ^= key64_184
274 accs[7] *= prime32_1
275 }
276
277
278 func accumScalarSeed(accs *[8]u64, p, secret ptr, l u64) {
279 for l > _block {
280 k := secret
281
282
283 for i := 0; i < 16; i++ {
284 dv0 := readU64(p, 8*0)
285 dk0 := dv0 ^ readU64(k, 8*0)
286 accs[1] += dv0
287 accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
288
289 dv1 := readU64(p, 8*1)
290 dk1 := dv1 ^ readU64(k, 8*1)
291 accs[0] += dv1
292 accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
293
294 dv2 := readU64(p, 8*2)
295 dk2 := dv2 ^ readU64(k, 8*2)
296 accs[3] += dv2
297 accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
298
299 dv3 := readU64(p, 8*3)
300 dk3 := dv3 ^ readU64(k, 8*3)
301 accs[2] += dv3
302 accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
303
304 dv4 := readU64(p, 8*4)
305 dk4 := dv4 ^ readU64(k, 8*4)
306 accs[5] += dv4
307 accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
308
309 dv5 := readU64(p, 8*5)
310 dk5 := dv5 ^ readU64(k, 8*5)
311 accs[4] += dv5
312 accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
313
314 dv6 := readU64(p, 8*6)
315 dk6 := dv6 ^ readU64(k, 8*6)
316 accs[7] += dv6
317 accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
318
319 dv7 := readU64(p, 8*7)
320 dk7 := dv7 ^ readU64(k, 8*7)
321 accs[6] += dv7
322 accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
323
324 l -= _stripe
325 if l > 0 {
326 p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
327 }
328 }
329
330
331 accs[0] ^= accs[0] >> 47
332 accs[0] ^= readU64(secret, 128)
333 accs[0] *= prime32_1
334
335 accs[1] ^= accs[1] >> 47
336 accs[1] ^= readU64(secret, 136)
337 accs[1] *= prime32_1
338
339 accs[2] ^= accs[2] >> 47
340 accs[2] ^= readU64(secret, 144)
341 accs[2] *= prime32_1
342
343 accs[3] ^= accs[3] >> 47
344 accs[3] ^= readU64(secret, 152)
345 accs[3] *= prime32_1
346
347 accs[4] ^= accs[4] >> 47
348 accs[4] ^= readU64(secret, 160)
349 accs[4] *= prime32_1
350
351 accs[5] ^= accs[5] >> 47
352 accs[5] ^= readU64(secret, 168)
353 accs[5] *= prime32_1
354
355 accs[6] ^= accs[6] >> 47
356 accs[6] ^= readU64(secret, 176)
357 accs[6] *= prime32_1
358
359 accs[7] ^= accs[7] >> 47
360 accs[7] ^= readU64(secret, 184)
361 accs[7] *= prime32_1
362 }
363
364 if l > 0 {
365 t, k := (l-1)/_stripe, secret
366
367 for i := u64(0); i < t; i++ {
368 dv0 := readU64(p, 8*0)
369 dk0 := dv0 ^ readU64(k, 8*0)
370 accs[1] += dv0
371 accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
372
373 dv1 := readU64(p, 8*1)
374 dk1 := dv1 ^ readU64(k, 8*1)
375 accs[0] += dv1
376 accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
377
378 dv2 := readU64(p, 8*2)
379 dk2 := dv2 ^ readU64(k, 8*2)
380 accs[3] += dv2
381 accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
382
383 dv3 := readU64(p, 8*3)
384 dk3 := dv3 ^ readU64(k, 8*3)
385 accs[2] += dv3
386 accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
387
388 dv4 := readU64(p, 8*4)
389 dk4 := dv4 ^ readU64(k, 8*4)
390 accs[5] += dv4
391 accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
392
393 dv5 := readU64(p, 8*5)
394 dk5 := dv5 ^ readU64(k, 8*5)
395 accs[4] += dv5
396 accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
397
398 dv6 := readU64(p, 8*6)
399 dk6 := dv6 ^ readU64(k, 8*6)
400 accs[7] += dv6
401 accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
402
403 dv7 := readU64(p, 8*7)
404 dk7 := dv7 ^ readU64(k, 8*7)
405 accs[6] += dv7
406 accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
407
408 l -= _stripe
409 if l > 0 {
410 p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
411 }
412 }
413
414 if l > 0 {
415 p = ptr(ui(p) - uintptr(_stripe-l))
416
417 dv0 := readU64(p, 8*0)
418 dk0 := dv0 ^ readU64(secret, 121)
419 accs[1] += dv0
420 accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
421
422 dv1 := readU64(p, 8*1)
423 dk1 := dv1 ^ readU64(secret, 129)
424 accs[0] += dv1
425 accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
426
427 dv2 := readU64(p, 8*2)
428 dk2 := dv2 ^ readU64(secret, 137)
429 accs[3] += dv2
430 accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
431
432 dv3 := readU64(p, 8*3)
433 dk3 := dv3 ^ readU64(secret, 145)
434 accs[2] += dv3
435 accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
436
437 dv4 := readU64(p, 8*4)
438 dk4 := dv4 ^ readU64(secret, 153)
439 accs[5] += dv4
440 accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
441
442 dv5 := readU64(p, 8*5)
443 dk5 := dv5 ^ readU64(secret, 161)
444 accs[4] += dv5
445 accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
446
447 dv6 := readU64(p, 8*6)
448 dk6 := dv6 ^ readU64(secret, 169)
449 accs[7] += dv6
450 accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
451
452 dv7 := readU64(p, 8*7)
453 dk7 := dv7 ^ readU64(secret, 177)
454 accs[6] += dv7
455 accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
456 }
457 }
458 }
459
460
461 func accumBlockScalarSeed(accs *[8]u64, p, secret ptr) {
462
463 {
464 secret := secret
465 for i := 0; i < 16; i++ {
466 dv0 := readU64(p, 8*0)
467 dk0 := dv0 ^ readU64(secret, 8*0)
468 accs[1] += dv0
469 accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
470
471 dv1 := readU64(p, 8*1)
472 dk1 := dv1 ^ readU64(secret, 8*1)
473 accs[0] += dv1
474 accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
475
476 dv2 := readU64(p, 8*2)
477 dk2 := dv2 ^ readU64(secret, 8*2)
478 accs[3] += dv2
479 accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
480
481 dv3 := readU64(p, 8*3)
482 dk3 := dv3 ^ readU64(secret, 8*3)
483 accs[2] += dv3
484 accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
485
486 dv4 := readU64(p, 8*4)
487 dk4 := dv4 ^ readU64(secret, 8*4)
488 accs[5] += dv4
489 accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
490
491 dv5 := readU64(p, 8*5)
492 dk5 := dv5 ^ readU64(secret, 8*5)
493 accs[4] += dv5
494 accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
495
496 dv6 := readU64(p, 8*6)
497 dk6 := dv6 ^ readU64(secret, 8*6)
498 accs[7] += dv6
499 accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
500
501 dv7 := readU64(p, 8*7)
502 dk7 := dv7 ^ readU64(secret, 8*7)
503 accs[6] += dv7
504 accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
505
506 p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
507 }
508 }
509
510
511 accs[0] ^= accs[0] >> 47
512 accs[0] ^= readU64(secret, 128)
513 accs[0] *= prime32_1
514
515 accs[1] ^= accs[1] >> 47
516 accs[1] ^= readU64(secret, 136)
517 accs[1] *= prime32_1
518
519 accs[2] ^= accs[2] >> 47
520 accs[2] ^= readU64(secret, 144)
521 accs[2] *= prime32_1
522
523 accs[3] ^= accs[3] >> 47
524 accs[3] ^= readU64(secret, 152)
525 accs[3] *= prime32_1
526
527 accs[4] ^= accs[4] >> 47
528 accs[4] ^= readU64(secret, 160)
529 accs[4] *= prime32_1
530
531 accs[5] ^= accs[5] >> 47
532 accs[5] ^= readU64(secret, 168)
533 accs[5] *= prime32_1
534
535 accs[6] ^= accs[6] >> 47
536 accs[6] ^= readU64(secret, 176)
537 accs[6] *= prime32_1
538
539 accs[7] ^= accs[7] >> 47
540 accs[7] ^= readU64(secret, 184)
541 accs[7] *= prime32_1
542 }
543
View as plain text