1// +build arm64,!noasm
2
3#include "textflag.h"
4
// fp384Cmov overwrites x with y when b is non-zero and rewrites x with its
// own value when b is zero. There is no branch on b: each pair of limbs of x
// and y is loaded, one of the two is picked with CSEL, and the result is
// stored back to x.
//
// Only the 32-bit word at b+16(FP) is inspected (MOVW).
// NOTE(review): unlike the other TEXT directives in this file, this one does
// not state an argument size; confirm against the Go declaration.
TEXT ·fp384Cmov(SB), NOSPLIT, $0
	MOVD	x+0(FP), R0
	MOVD	y+8(FP), R1
	MOVW	b+16(FP), R2
	CMP	$0, R2			// EQ <=> b == 0; LDP/CSEL/STP below leave the flags alone

	// limbs 0 and 1
	LDP	0(R0), (R9, R10)
	LDP	0(R1), (R11, R12)
	CSEL	EQ, R9, R11, R13
	CSEL	EQ, R10, R12, R14
	STP	(R13, R14), 0(R0)

	// limbs 2 and 3
	LDP	16(R0), (R9, R10)
	LDP	16(R1), (R11, R12)
	CSEL	EQ, R9, R11, R13
	CSEL	EQ, R10, R12, R14
	STP	(R13, R14), 16(R0)

	// limbs 4 and 5
	LDP	32(R0), (R9, R10)
	LDP	32(R1), (R11, R12)
	CSEL	EQ, R9, R11, R13
	CSEL	EQ, R10, R12, R14
	STP	(R13, R14), 32(R0)

	RET
26
// fp384Neg sets c = -a mod p.
//
// It forms d = p-a and then e = d-p. e is returned only when d-p does not
// borrow (which, for a already reduced, happens for a = 0); otherwise d is
// returned. The choice is made with CSEL, not with a branch.
TEXT ·fp384Neg(SB), NOSPLIT, $0-16
	MOVD	c+0(FP), R0
	MOVD	a+8(FP), R1

	// Modulus p -> R2-R7, operand a -> R8-R13
	LDP	·p+0(SB), (R2, R3)
	LDP	·p+16(SB), (R4, R5)
	LDP	·p+32(SB), (R6, R7)
	LDP	0(R1), (R8, R9)
	LDP	16(R1), (R10, R11)
	LDP	32(R1), (R12, R13)

	// d = p - a, left in R8-R13
	SUBS	R8, R2, R8
	SBCS	R9, R3, R9
	SBCS	R10, R4, R10
	SBCS	R11, R5, R11
	SBCS	R12, R6, R12
	SBC	R13, R7, R13

	// e = d - p, left in R2-R7; the final SBCS leaves "no borrow" in C
	SUBS	R2, R8, R2
	SBCS	R3, R9, R3
	SBCS	R4, R10, R4
	SBCS	R5, R11, R5
	SBCS	R6, R12, R6
	SBCS	R7, R13, R7

	// C set   (d >= p): keep e
	// C clear (d <  p): take d
	CSEL	CS, R2, R8, R2
	CSEL	CS, R3, R9, R3
	CSEL	CS, R4, R10, R4
	CSEL	CS, R5, R11, R5
	CSEL	CS, R6, R12, R6
	CSEL	CS, R7, R13, R7

	// Write the result to c
	STP	(R2, R3), 0(R0)
	STP	(R4, R5), 16(R0)
	STP	(R6, R7), 32(R0)

	RET
69
// fp384Add sets c = a+b mod p.
//
// The 385-bit sum s = a+b is formed first, then d = s-p over seven limbs.
// If that subtraction borrows, s was already below p and s is returned;
// otherwise d is returned. The choice is made with CSEL, not with a branch.
TEXT ·fp384Add(SB), NOSPLIT, $0-24
	MOVD	c+0(FP), R0
	MOVD	a+8(FP), R1
	MOVD	b+16(FP), R2

	// a -> R3-R8, b -> R9-R14
	LDP	0(R1), (R3, R4)
	LDP	16(R1), (R5, R6)
	LDP	32(R1), (R7, R8)
	LDP	0(R2), (R9, R10)
	LDP	16(R2), (R11, R12)
	LDP	32(R2), (R13, R14)

	// s = a + b: six limbs in R3-R8, carry-out (0 or 1) in R9
	ADDS	R9, R3, R3
	ADCS	R10, R4, R4
	ADCS	R11, R5, R5
	ADCS	R12, R6, R6
	ADCS	R13, R7, R7
	ADCS	R14, R8, R8
	ADC	ZR, ZR, R9

	// p -> R10-R15
	LDP	·p+0(SB), (R10, R11)
	LDP	·p+16(SB), (R12, R13)
	LDP	·p+32(SB), (R14, R15)

	// d = s - p: low six limbs in R10-R15; the seventh limb goes to R16 and
	// is computed only so that the flags reflect the full 7-limb subtraction
	SUBS	R10, R3, R10
	SBCS	R11, R4, R11
	SBCS	R12, R5, R12
	SBCS	R13, R6, R13
	SBCS	R14, R7, R14
	SBCS	R15, R8, R15
	SBCS	ZR, R9, R16

	// C set   (no borrow, s >= p): take d
	// C clear (borrow,    s <  p): keep s
	CSEL	CS, R10, R3, R3
	CSEL	CS, R11, R4, R4
	CSEL	CS, R12, R5, R5
	CSEL	CS, R13, R6, R6
	CSEL	CS, R14, R7, R7
	CSEL	CS, R15, R8, R8

	// Write the result to c
	STP	(R3, R4), 0(R0)
	STP	(R5, R6), 16(R0)
	STP	(R7, R8), 32(R0)

	RET
119
// fp384Sub sets c = a-b mod p.
//
// d = a-b is computed over six limbs. The final borrow is turned into a mask
// (0 when a >= b, all ones when the subtraction borrowed); p is ANDed with
// that mask and added back, so p is added exactly when d went negative.
TEXT ·fp384Sub(SB), NOSPLIT, $0-24
	MOVD	c+0(FP), R0
	MOVD	a+8(FP), R1
	MOVD	b+16(FP), R2

	// a -> R3-R8, b -> R9-R14
	LDP	0(R1), (R3, R4)
	LDP	16(R1), (R5, R6)
	LDP	32(R1), (R7, R8)
	LDP	0(R2), (R9, R10)
	LDP	16(R2), (R11, R12)
	LDP	32(R2), (R13, R14)

	// d = a - b in R3-R8; R9 = 0 - 0 - borrow, i.e. 0 or 0xFFFF...FFFF
	SUBS	R9, R3, R3
	SBCS	R10, R4, R4
	SBCS	R11, R5, R5
	SBCS	R12, R6, R6
	SBCS	R13, R7, R7
	SBCS	R14, R8, R8
	SBC	ZR, ZR, R9

	// p -> R10-R15, then cleared to zero unless the subtraction borrowed
	LDP	·p+0(SB), (R10, R11)
	LDP	·p+16(SB), (R12, R13)
	LDP	·p+32(SB), (R14, R15)
	AND	R9, R10, R10
	AND	R9, R11, R11
	AND	R9, R12, R12
	AND	R9, R13, R13
	AND	R9, R14, R14
	AND	R9, R15, R15

	// c = d + (p or 0); the carry out of the top limb is dropped
	ADDS	R10, R3, R3
	ADCS	R11, R4, R4
	ADCS	R12, R5, R5
	ADCS	R13, R6, R6
	ADCS	R14, R7, R7
	ADC	R15, R8, R8

	// Write the result to c
	STP	(R3, R4), 0(R0)
	STP	(R5, R6), 16(R0)
	STP	(R7, R8), 32(R0)

	RET
166
// mul192x192comba: 192x192 -> 384-bit multiplication that accumulates the
// 64x64 partial products column by column.
//
// Preconditions (set up by the invoker before the macro):
//   W0 = low  half of M0*N0      W3 = high half of M0*N0
//   W1 = low  half of M0*N1      W2 = high half of M0*N1
// W0 is neither read nor written here; it already is the lowest result word.
//
// On exit W0-W5 hold (M0,M1,M2) * (N0,N1,N2), least significant word first.
// M0-M2 and N0-N2 are only read. K0-K3 are scratch and are overwritten.
// Each MUL/UMULH pair is issued one step ahead of the additions that consume it.
#define mul192x192comba(M0,M1,M2, N0,N1,N2, W0,W1,W2,W3,W4,W5, K0,K1,K2,K3) \
	MUL	M1, N0, K2	\
	UMULH	M1, N0, K3	\
	/* word 1 += high(M0*N0); W3 restarts as the carry out of word 2 */ \
	ADDS	W3, W1		\
	ADCS	ZR, W2		\
	ADC	ZR, ZR, W3	\
	\
	MUL	M0, N2, K0	\
	UMULH	M0, N2, K1	\
	/* add M1*N0 at word 1 */ \
	ADDS	K2, W1		\
	ADCS	K3, W2		\
	ADC	ZR, W3		\
	\
	MUL	M1, N1, K2	\
	UMULH	M1, N1, K3	\
	/* add M0*N2 at word 2; W4 starts as the carry out of word 3 */ \
	ADDS	K0, W2		\
	ADCS	K1, W3		\
	ADC	ZR, ZR, W4	\
	\
	MUL	M2, N0, K0	\
	UMULH	M2, N0, K1	\
	/* add M1*N1 at word 2 */ \
	ADDS	K2, W2		\
	ADCS	K3, W3		\
	ADC	ZR, W4		\
	\
	MUL	M1, N2, K2	\
	UMULH	M1, N2, K3	\
	/* add M2*N0 at word 2 */ \
	ADDS	K0, W2		\
	ADCS	K1, W3		\
	ADC	ZR, W4		\
	\
	MUL	M2, N1, K0	\
	UMULH	M2, N1, K1	\
	/* add M1*N2 at word 3; W5 starts as the carry out of word 4 */ \
	ADDS	K2, W3		\
	ADCS	K3, W4		\
	ADC	ZR, ZR, W5	\
	\
	MUL	M2, N2, K2	\
	UMULH	M2, N2, K3	\
	/* add M2*N1 at word 3 */ \
	ADDS	K0, W3		\
	ADCS	K1, W4		\
	ADC	ZR, W5		\
	/* add M2*N2 at word 4 */ \
	ADDS	K2, W4		\
	ADC	K3, W5
223
224
// mul384x384karatsuba multiplies the two 384-bit integers stored at 0(X) and
// 0(Y) using one level of Karatsuba on top of mul192x192comba:
//   a*b = aL*bL + 2^192*(aL*bL + aH*bH + (aL-aH)*(bH-bL)) + 2^384*aH*bH
//
// Scratch memory: 96 bytes at 16(RSP)..111(RSP). The invoking function must
// declare a frame that covers this range (at least 104 bytes of locals) and
// must not keep live data there across the macro. The scratch area is placed
// inside the frame on purpose: memory below RSP is not owned by the function,
// and -8(RSP) is where the Go arm64 prologue saves the frame pointer.
//
// Expects that X and Y point to input.
// X and Y get overwritten; the 768-bit product is returned in Y,Z1-Z11
// (least significant word in Y).
// T0-T12 are scratch and are overwritten. All loads, stores and immediates
// used here encode directly, which matters if T12 is mapped to the
// assembler's temporary register.
#define mul384x384karatsuba(X,Y, Z1,Z2,Z3,Z4,Z5,Z6,Z7,Z8,Z9,Z10,Z11, T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12) \
	/* Load a in Z1-Z6, b in T12,Z7-Z11 */ \
	/* The MUL/UMULH in between prepare aL0*bL0 (T0,T3) and aL0*bL1 (T1,T2) for the comba macro */ \
	LDP	0(X), (Z1, Z2)		\
	LDP	0(Y), (T12, Z7)		\
	MUL	Z1, Z7, T1		\
	UMULH	Z1, T12, T3		\
	LDP	16(X), (Z3, Z4)		\
	LDP	16(Y), (Z8, Z9)		\
	MUL	Z1, T12, T0		\
	UMULH	Z1, Z7, T2		\
	LDP	32(X), (Z5, Z6)		\
	LDP	32(Y), (Z10, Z11)	\
	\
	/* Compute aL*bL in T0-T5 */ \
	mul192x192comba(Z1,Z2,Z3, T12,Z7,Z8, T0,T1,T2,T3,T4,T5, T6,T7,T8,T9) \
	\
	/* Compute aH*bH in T6-T11, destroys aL and bL */ \
	MUL	Z4, Z10, T7		\
	MUL	Z4, Z9, T6		\
	UMULH	Z4, Z9, T9		\
	UMULH	Z4, Z10, T8		\
	mul192x192comba(Z4,Z5,Z6, Z9,Z10,Z11, T6,T7,T8,T9,T10,T11, Z1,Z2,T12,Z7) \
	\
	/* Compute aL*bL + aH*bH in Z1-Z6,T12, destroys aH */ \
	ADDS	T0, T6, Z1		\
	ADCS	T1, T7, Z2		\
	ADCS	T2, T8, Z3		\
	ADCS	T3, T9, Z4		\
	ADCS	T4, T10, Z5		\
	ADCS	T5, T11, Z6		\
	ADC	ZR, ZR, T12		\
	\
	/* Add that sum to T3-T8 (i.e. shifted by 192 bits) and park T0-T11 in the frame */ \
	/* T12 collects the carries that still have to be added at word 9 */ \
	STP	(T0, T1), 16(RSP)	\
	ADDS	Z1, T3			\
	STP	(T2, T3), 32(RSP)	\
	ADCS	Z2, T4			\
	ADCS	Z3, T5			\
	STP	(T4, T5), 48(RSP)	\
	ADCS	Z4, T6			\
	ADCS	Z5, T7			\
	STP	(T6, T7), 64(RSP)	\
	ADCS	Z6, T8			\
	ADC	ZR, T12			\
	STP	(T8, T9), 80(RSP)	\
	STP	(T10, T11), 96(RSP)	\
	\
	/* Load a to Z1-Z6 */ \
	LDP	0(X), (Z1, Z2)		\
	LDP	16(X), (Z3, Z4)		\
	LDP	32(X), (Z5, Z6)		\
	\
	/* Compute |aL-aH| to Z1-Z3, keep borrow in X */ \
	/* After ADDS $1: X = 0 (and EQ) if aL < aH, X = 1 otherwise */ \
	SUBS	Z4, Z1			\
	SBCS	Z5, Z2			\
	SBCS	Z6, Z3			\
	SBC	ZR, ZR, X		\
	NEGS	Z1, Z4			\
	NGCS	Z2, Z5			\
	NGC	Z3, Z6			\
	ADDS	$1, X			\
	\
	/* Load b to Z7-Z11,T0 (loads leave the flags untouched) */ \
	LDP	0(Y), (Z7, Z8)		\
	LDP	16(Y), (Z9, Z10)	\
	LDP	32(Y), (Z11, T0)	\
	\
	/* Take the negated difference when aL-aH was negative */ \
	CSEL	EQ, Z4, Z1, Z1		\
	CSEL	EQ, Z5, Z2, Z2		\
	CSEL	EQ, Z6, Z3, Z3		\
	\
	/* Compute |bH-bL| to Z7-Z9, keep borrow in Y */ \
	/* After ADDS $1: Y = 0 (and EQ) if bH < bL, Y = 1 otherwise */ \
	SUBS	Z7, Z10			\
	SBCS	Z8, Z11			\
	SBCS	Z9, T0			\
	SBC	ZR, ZR, Y		\
	NEGS	Z10, Z7			\
	NGCS	Z11, Z8			\
	NGC	T0, Z9			\
	ADDS	$1, Y			\
	CSEL	EQ, Z7, Z10, Z7		\
	CSEL	EQ, Z8, Z11, Z8		\
	CSEL	EQ, Z9, T0, Z9		\
	\
	/* Combine borrows: X = 1 iff exactly one of the two differences was negative */ \
	EOR	Y, X			\
	\
	/* Compute |aL-aH|*|bH-bL| to Z10,Z11,T0-T3 */ \
	MUL	Z1, Z8, Z11		\
	MUL	Z1, Z7, Z10		\
	UMULH	Z1, Z8, T0		\
	UMULH	Z1, Z7, T1		\
	mul192x192comba(Z1,Z2,Z3, Z7,Z8,Z9, Z10,Z11,T0,T1,T2,T3, T4,T5,T6,T7) \
	\
	/* The result has to be negated if exactly one of the operands was negative */ \
	/* Negated product goes to Y,Z1-Z5; T4 = all ones iff the product is non-zero */ \
	NEGS	Z10, Y			\
	NGCS	Z11, Z1			\
	NGCS	T0, Z2			\
	NGCS	T1, Z3			\
	NGCS	T2, Z4			\
	NGCS	T3, Z5			\
	NGC	ZR, T4			\
	\
	/* X = 1 iff the negated (and non-zero) product must be used */ \
	AND	T4, X			\
	CMP	$1, X			\
	CSEL	EQ, Y, Z10, Z10		\
	CSEL	EQ, Z1, Z11, Z11	\
	CSEL	EQ, Z2, T0, T0		\
	CSEL	EQ, Z3, T1, T1		\
	CSEL	EQ, Z4, T2, T2		\
	CSEL	EQ, Z5, T3, T3		\
	\
	/* Add that to the middle part (words 3-8), then the pending carries T12 at word 9 */ \
	/* Finally subtract X at word 9: the sign extension of a negated product */ \
	LDP	16(RSP), (Y, Z1)	\
	LDP	32(RSP), (Z2, Z3)	\
	LDP	48(RSP), (Z4, Z5)	\
	ADDS	Z10, Z3			\
	ADCS	Z11, Z4			\
	LDP	64(RSP), (Z6, Z7)	\
	ADCS	T0, Z5			\
	ADCS	T1, Z6			\
	LDP	80(RSP), (Z8, Z9)	\
	ADCS	T2, Z7			\
	ADCS	T3, Z8			\
	LDP	96(RSP), (Z10, Z11)	\
	ADCS	T12, Z9			\
	ADCS	ZR, Z10			\
	ADC	ZR, Z11			\
	SUBS	X, Z9			\
	SBCS	ZR, Z10			\
	SBC	ZR, Z11
359
// Compute c = a*b*R^-1 mod p
//
// Montgomery multiplication; the reduction drops the low 384 bits, i.e. it
// divides by 2^384:
//   1. t = a*b                      (768 bits)
//   2. m = t*pp mod 2^384           (pp is read from ·pp; ·pp+8 is 1)
//   3. u = (t + m*p) / 2^384        (the low half cancels and is discarded)
//   4. c = u - p if u >= p, else u  (selected with CSEL, no branch)
//
// Stack usage: t is parked at 112(RSP)..207(RSP), inside the 200-byte frame
// declared below (locals span 8(RSP)..207(RSP); 0(RSP) holds the saved link
// register), rather than below RSP where the memory is not owned by this
// function. This range is disjoint from the scratch area used by
// mul384x384karatsuba.
// c is used as temporary storage for m, and is only written after a and b
// have been fully consumed by the first multiplication.
// NOTE(review): the first macro invocation maps T12 to R27, which the Go
// arm64 toolchain reserves as its temporary register; confirm that no
// instruction inside the macro requires the assembler to synthesise one.
TEXT ·fp384Mul(SB), NOSPLIT, $200-24
	MOVD	c+0(FP), R0
	MOVD	a+8(FP), R1
	MOVD	b+16(FP), R2

	// Compute a*b in R2-R13
	mul384x384karatsuba(R1, R2, R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,R13, R14,R15,R16,R17,R19,R20,R21,R22,R23,R24,R25,R26,R27)

	// Store a*b in the frame
	STP	(R2, R3), 112(RSP)
	STP	(R4, R5), 128(RSP)
	STP	(R6, R7), 144(RSP)
	STP	(R8, R9), 160(RSP)
	STP	(R10, R11), 176(RSP)
	STP	(R12, R13), 192(RSP)

	// Compute m = a*b*pp mod 2^384 in R19-R24
	// Store it temporarily in c
	// Only product words below 2^384 are needed, so each row of the
	// schoolbook product is truncated and its top word uses MUL/MADD only.

	// Row pp[0]: pp[0] * t[0..5] at word 0
	MOVD	·pp+0(SB), R14
	MUL	R14, R2, R19
	UMULH	R14, R2, R20

	MUL	R14, R3, R16
	UMULH	R14, R3, R21
	ADDS	R16, R20
	ADC	ZR, R21

	MUL	R14, R4, R16
	UMULH	R14, R4, R22
	ADDS	R16, R21
	ADC	ZR, R22

	MUL	R14, R5, R16
	UMULH	R14, R5, R23
	ADDS	R16, R22
	ADC	ZR, R23

	MUL	R14, R6, R16
	UMULH	R14, R6, R24
	ADDS	R16, R23
	ADC	ZR, R24

	MADD	R14, R24, R7, R24	// R24 += pp[0]*t[5] (low half only)

	// ·pp+8(SB) = 1, so we can just add
	// Row pp[1]: t[0..4] at word 1
	ADDS	R2, R20
	STP	(R19, R20), 0(R0)	// m[0], m[1] are final
	ADCS	R3, R21
	ADCS	R4, R22
	ADCS	R5, R23
	ADC	R6, R24

	// Row pp[2]: pp[2] * t[0..3] at word 2, partial sums in R8-R11
	LDP	·pp+16(SB), (R14, R15)
	MUL	R14, R2, R8
	UMULH	R14, R2, R9

	MUL	R14, R3, R16
	UMULH	R14, R3, R10
	ADDS	R16, R9
	ADC	ZR, R10

	MUL	R14, R4, R16
	UMULH	R14, R4, R11
	ADDS	R16, R10
	ADC	ZR, R11

	MUL	R14, R5, R16
	ADD	R16, R11

	ADDS	R8, R21
	ADCS	R9, R22
	ADCS	R10, R23
	ADC	R11, R24

	// Row pp[3]: pp[3] * t[0..2] at word 3, partial sums in R8-R10
	MUL	R15, R2, R8
	UMULH	R15, R2, R9

	MUL	R15, R3, R16
	UMULH	R15, R3, R10
	ADDS	R16, R9
	ADC	ZR, R10

	MADD	R15, R10, R4, R10	// R10 += pp[3]*t[2] (low half only)

	ADDS	R8, R22
	STP	(R21, R22), 16(R0)	// m[2], m[3] are final
	ADCS	R9, R23
	ADC	R10, R24

	// Row pp[4]: pp[4] * t[0..1] at word 4
	LDP	·pp+32(SB), (R14, R15)
	MUL	R14, R2, R8
	UMULH	R14, R2, R9

	MADD	R14, R9, R3, R9		// R9 += pp[4]*t[1] (low half only)

	ADDS	R8, R23
	ADC	R9, R24

	// Row pp[5]: pp[5] * t[0] at word 5
	MADD	R15, R24, R2, R24
	STP	(R23, R24), 32(R0)	// m[4], m[5] are final

	// Compute m*p in R1-R12
	MOVD	$·p(SB), R1
	mul384x384karatsuba(R0, R1, R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12, R13,R14,R15,R16,R17,R19,R20,R21,R22,R23,R24,R25,R26)

	// Add a*b to m*p in R1-R12,R26
	LDP	112(RSP), (R13, R14)
	ADDS	R13, R1
	LDP	128(RSP), (R15, R16)
	ADCS	R14, R2
	ADCS	R15, R3
	LDP	144(RSP), (R17, R19)
	ADCS	R16, R4
	ADCS	R17, R5
	LDP	160(RSP), (R20, R21)
	ADCS	R19, R6
	ADCS	R20, R7
	LDP	176(RSP), (R22, R23)
	ADCS	R21, R8
	ADCS	R22, R9
	LDP	192(RSP), (R24, R25)
	ADCS	R23, R10
	ADCS	R24, R11
	ADCS	R25, R12
	ADC	ZR, ZR, R26

	// Reduce the top half mod p
	// u = R7-R12 with carry R26; u-p goes to R13-R19, borrow ends up in C
	LDP	·p+ 0(SB), (R13, R14)
	SUBS	R13, R7, R13
	LDP	·p+16(SB), (R15, R16)
	SBCS	R14, R8, R14
	SBCS	R15, R9, R15
	LDP	·p+32(SB), (R17, R19)
	SBCS	R16, R10, R16
	SBCS	R17, R11, R17
	SBCS	R19, R12, R19
	SBCS	ZR, R26

	// Store result in c
	// R0 was consumed by the second multiplication, so reload the pointer.
	// C clear (borrow, u < p): keep u; otherwise take u-p.
	MOVD	c+0(FP), R0
	CSEL	CC, R7, R13, R7
	CSEL	CC, R8, R14, R8
	STP	(R7, R8), 0(R0)
	CSEL	CC, R9, R15, R9
	CSEL	CC, R10, R16, R10
	STP	(R9, R10), 16(R0)
	CSEL	CC, R11, R17, R11
	CSEL	CC, R12, R19, R12
	STP	(R11, R12), 32(R0)

	RET
View as plain text