1// +build amd64,!noasm
2
3#include "textflag.h"
4
// storeBlock writes the six 64-bit values w0..w5 to six consecutive 8-byte
// slots starting at the memory operand dst (w0 at offset 0, w5 at offset 40).
// Only MOVQ is used, so the flags are left untouched.
#define storeBlock(w0,w1,w2,w3,w4,w5, dst) \
	MOVQ w0,  0+dst \
	MOVQ w1,  8+dst \
	MOVQ w2, 16+dst \
	MOVQ w3, 24+dst \
	MOVQ w4, 32+dst \
	MOVQ w5, 40+dst
12
// loadBlock reads six consecutive 64-bit words starting at the memory operand
// src into w0..w5 (offset 0 into w0, offset 40 into w5).
// Only MOVQ is used, so the flags are left untouched.
#define loadBlock(src, w0,w1,w2,w3,w4,w5) \
	MOVQ  0+src, w0 \
	MOVQ  8+src, w1 \
	MOVQ 16+src, w2 \
	MOVQ 24+src, w3 \
	MOVQ 32+src, w4 \
	MOVQ 40+src, w5
20
// fp384Carry conditionally subtracts the modulus ·p from the seven-word value
// x6:x5:x4:x3:x2:x1:x0 (x0 least significant).
//
// The difference x - p is formed in the scratch operands t0..t6. If the
// subtraction does not borrow (carry flag clear), the low six words of the
// difference replace x0..x5; otherwise x0..x5 keep their values.
// x6 is only read. t0..t6 and the flags are clobbered.
// The x and t operands must not overlap.
#define fp384Carry(x0,x1,x2,x3,x4,x5,x6, t0,t1,t2,t3,t4,t5,t6) \
	\ // t = x - p, limb by limb (MOVQ does not disturb the borrow chain)
	MOVQ x0, t0 \
	SUBQ ·p+0(SB), t0 \
	MOVQ x1, t1 \
	SBBQ ·p+8(SB), t1 \
	MOVQ x2, t2 \
	SBBQ ·p+16(SB), t2 \
	MOVQ x3, t3 \
	SBBQ ·p+24(SB), t3 \
	MOVQ x4, t4 \
	SBBQ ·p+32(SB), t4 \
	MOVQ x5, t5 \
	SBBQ ·p+40(SB), t5 \
	MOVQ x6, t6 \
	SBBQ $0, t6 \
	\
	\ // borrow set   => t is negative, keep x
	\ // borrow clear => take t
	CMOVQCC t0, x0 \
	CMOVQCC t1, x1 \
	CMOVQCC t2, x2 \
	CMOVQCC t3, x3 \
	CMOVQCC t4, x4 \
	CMOVQCC t5, x5
47
// mulRow forms the seven-word product ai * rb[0..5] using the one-operand
// MULQ and leaves it in R14:R13:R12:R11:R10:R9:R8 (R8 least significant).
// ai is a single 64-bit operand, rb the memory operand of a six-word value.
// Clobbers AX, DX and the flags.
#define mulRow(ai, rb) \
	MOVQ ai, AX \
	MULQ 0+rb \
	MOVQ AX, R8 \
	MOVQ DX, R9 \
	MOVQ ai, AX \
	MULQ 8+rb \
	ADDQ AX, R9 \
	ADCQ $0, DX \
	MOVQ DX, R10 \
	MOVQ ai, AX \
	MULQ 16+rb \
	ADDQ AX, R10 \
	ADCQ $0, DX \
	MOVQ DX, R11 \
	MOVQ ai, AX \
	MULQ 24+rb \
	ADDQ AX, R11 \
	ADCQ $0, DX \
	MOVQ DX, R12 \
	MOVQ ai, AX \
	MULQ 32+rb \
	ADDQ AX, R12 \
	ADCQ $0, DX \
	MOVQ DX, R13 \
	MOVQ ai, AX \
	MULQ 40+rb \
	ADDQ AX, R13 \
	ADCQ $0, DX \
	MOVQ DX, R14

// mulAccRow adds the six words stored at acc into R8..R13, propagates the
// carry into R14, then writes the seven-word sum back: R8..R13 to acc and
// R14 to the word that follows (48+acc).
#define mulAccRow(acc) \
	ADDQ 0+acc, R8 \
	ADCQ 8+acc, R9 \
	ADCQ 16+acc, R10 \
	ADCQ 24+acc, R11 \
	ADCQ 32+acc, R12 \
	ADCQ 40+acc, R13 \
	ADCQ $0, R14 \
	storeBlock(R8,R9,R10,R11,R12,R13, acc) \
	MOVQ R14, 48+acc

// mul computes the full 12-word product of the six-word value a5:...:a0 and
// the six-word value at memory operand rb, one row per word of a, and writes
// it to the 96 bytes starting at memory operand stack (offsets 0..88).
// Uses the one-operand MULQ only.
// Clobbers AX, DX, R8-R14 and the flags.
#define mul(a0,a1,a2,a3,a4,a5, rb, stack) \
	\ // row 0 initialises words 0..6 of the result
	mulRow(a0, rb) \
	storeBlock(R8,R9,R10,R11,R12,R13, 0+stack) \
	MOVQ R14, 48+stack \
	\
	\ // row k (k = 1..5) is accumulated into words k..k+6
	mulRow(a1, rb) \
	mulAccRow(8+stack) \
	\
	mulRow(a2, rb) \
	mulAccRow(16+stack) \
	\
	mulRow(a3, rb) \
	mulAccRow(24+stack) \
	\
	mulRow(a4, rb) \
	mulAccRow(32+stack) \
	\
	mulRow(a5, rb) \
	mulAccRow(40+stack)
287
// fp384MontFactor computes m = (T * P') mod R, i.e. the low six words of the
// product of T[0..5] (the six words at 0..40+stack) and ·pp, and stores m at
// 96..136+stack. Only partial products that land below word 6 are formed.
// The second word of ·pp is never multiplied: T is added in at a one-word
// offset instead. NOTE(review): that is only correct if ·pp+8(SB) is 1;
// confirm against the definition of pp.
// Clobbers AX, BX, CX, DX, R8-R14 and the flags.
#define fp384MontFactor(stack) \
	\ // pp[0] * T[0..5] -> R8..R13 (R8 is parked in memory: it doubles as a temporary below)
	MOVQ ·pp+0(SB), AX \
	MULQ 0+stack \
	MOVQ AX, R8 \
	MOVQ R8, 96+stack \
	MOVQ DX, R9 \
	MOVQ ·pp+0(SB), AX \
	MULQ 8+stack \
	ADDQ AX, R9 \
	ADCQ $0, DX \
	MOVQ DX, R10 \
	MOVQ ·pp+0(SB), AX \
	MULQ 16+stack \
	ADDQ AX, R10 \
	ADCQ $0, DX \
	MOVQ DX, R11 \
	MOVQ ·pp+0(SB), AX \
	MULQ 24+stack \
	ADDQ AX, R11 \
	ADCQ $0, DX \
	MOVQ DX, R12 \
	MOVQ ·pp+0(SB), AX \
	MULQ 32+stack \
	ADDQ AX, R12 \
	ADCQ $0, DX \
	MOVQ DX, R13 \
	MOVQ ·pp+0(SB), AX \
	MULQ 40+stack \
	ADDQ AX, R13 \
	\
	\ // T[0..4] added one word up (takes the place of the pp[1] row)
	ADDQ 0+stack, R9 \
	ADCQ 8+stack, R10 \
	ADCQ 16+stack, R11 \
	ADCQ 24+stack, R12 \
	ADCQ 32+stack, R13 \
	\
	\ // pp[2] * T[0..3] -> words 2..5
	MOVQ ·pp+16(SB), AX \
	MULQ 0+stack \
	MOVQ AX, R14 \
	MOVQ DX, R8 \
	MOVQ ·pp+16(SB), AX \
	MULQ 8+stack \
	ADDQ AX, R8 \
	ADCQ $0, DX \
	MOVQ DX, BX \
	MOVQ ·pp+16(SB), AX \
	MULQ 16+stack \
	ADDQ AX, BX \
	ADCQ $0, DX \
	MOVQ DX, CX \
	MOVQ ·pp+16(SB), AX \
	MULQ 24+stack \
	ADDQ AX, CX \
	\
	ADDQ R14, R10 \
	ADCQ R8, R11 \
	ADCQ BX, R12 \
	ADCQ CX, R13 \
	\
	\ // pp[3] * T[0..2] -> words 3..5
	MOVQ ·pp+24(SB), AX \
	MULQ 0+stack \
	MOVQ AX, R14 \
	MOVQ DX, R8 \
	MOVQ ·pp+24(SB), AX \
	MULQ 8+stack \
	ADDQ AX, R8 \
	ADCQ $0, DX \
	MOVQ DX, BX \
	MOVQ ·pp+24(SB), AX \
	MULQ 16+stack \
	ADDQ AX, BX \
	\
	ADDQ R14, R11 \
	ADCQ R8, R12 \
	ADCQ BX, R13 \
	\
	\ // pp[4] * T[0..1] -> words 4..5
	MOVQ ·pp+32(SB), AX \
	MULQ 0+stack \
	MOVQ AX, R14 \
	MOVQ DX, R8 \
	MOVQ ·pp+32(SB), AX \
	MULQ 8+stack \
	ADDQ AX, R8 \
	\
	ADDQ R14, R12 \
	ADCQ R8, R13 \
	\
	\ // pp[5] * T[0] -> word 5
	MOVQ ·pp+40(SB), AX \
	MULQ 0+stack \
	ADDQ AX, R13 \
	\
	\ // bring word 0 back and write out all of m
	MOVQ 96+stack, R8 \
	storeBlock(R8,R9,R10,R11,R12,R13, 96+stack)

// fp384Reduce reduces the 12-word value T stored at 0..88+stack (MULQ-only
// variant).
//
// Steps: compute m with fp384MontFactor, form the 12-word product m*p at
// 144..232+stack, add T to it, and keep the upper six words plus the carry,
// which fp384Carry then brings below p with one conditional subtraction.
// The lower six words of the sum are computed only for their carry.
//
// Memory use relative to stack: 0..95 input T, 96..143 m, 144..239 m*p.
// Result: R14, SI, AX, BX, CX, DX (R14 least significant).
// Clobbers AX, BX, CX, DX, SI, DI, R8-R15 and the flags.
#define fp384Reduce(stack) \
	fp384MontFactor(stack) \
	\
	\ // m * P
	mul(·p+0(SB),·p+8(SB),·p+16(SB),·p+24(SB),·p+32(SB),·p+40(SB), 96+stack, 144+stack) \
	\
	\ // T + m*P: low half in R8..R13, high half in R14,SI,AX,BX,CX,DX, carry in R15
	loadBlock(144+stack, R8,R9,R10,R11,R12,R13) \
	loadBlock(192+stack, R14,SI,AX,BX,CX,DX) \
	MOVQ $0, R15 \
	\
	ADDQ 0+stack, R8 \
	ADCQ 8+stack, R9 \
	ADCQ 16+stack, R10 \
	ADCQ 24+stack, R11 \
	ADCQ 32+stack, R12 \
	ADCQ 40+stack, R13 \
	ADCQ 48+stack, R14 \
	ADCQ 56+stack, SI \
	ADCQ 64+stack, AX \
	ADCQ 72+stack, BX \
	ADCQ 80+stack, CX \
	ADCQ 88+stack, DX \
	ADCQ $0, R15 \
	\
	\ // R8..R13 are free again and serve as scratch for the final subtraction
	fp384Carry(R14,SI,AX,BX,CX,DX,R15, R8,R9,R10,R11,R12,R13,DI)
406
// mulxRow multiplies ai by the six words at rb with MULX and adds the
// seven-word product into the running accumulator c6:c5:c4:c3:c2:c1:c0
// (c0 least significant).
//
// After the first partial product c0 is final: it is written to out and the
// register is cleared, so that on exit it holds the carry out of the row and
// becomes the top word for the next row. Products with even-numbered words of
// rb are added in one carry chain, odd-numbered ones in a second chain.
// Clobbers AX, BX, DX and the flags.
#define mulxRow(ai, rb, out, c0,c1,c2,c3,c4,c5,c6) \
	MOVQ ai, DX \
	MULXQ 0+rb, AX, BX \
	ADDQ AX, c0 \
	MOVQ c0, out \
	MOVL $0, c0 \
	ADCQ BX, c1 \
	MULXQ 16+rb, AX, BX \
	ADCQ AX, c2 \
	ADCQ BX, c3 \
	MULXQ 32+rb, AX, BX \
	ADCQ AX, c4 \
	ADCQ BX, c5 \
	ADCQ $0, c6 \
	MULXQ 8+rb, AX, BX \
	ADDQ AX, c1 \
	ADCQ BX, c2 \
	MULXQ 24+rb, AX, BX \
	ADCQ AX, c3 \
	ADCQ BX, c4 \
	MULXQ 40+rb, AX, BX \
	ADCQ AX, c5 \
	ADCQ BX, c6 \
	ADCQ $0, c0

// mulBMI2 computes the 12-word product of the six-word value a5:...:a0 and
// the six-word value at memory operand rb using MULX (BMI2).
//
// The low six words are written to 0..40+stack; the high six words are left
// in registers R14, R8, R9, R10, R11, R12 (R14 least significant) and are
// NOT stored by this macro.
// Clobbers AX, BX, DX, R8-R14 and the flags.
#define mulBMI2(a0,a1,a2,a3,a4,a5, rb, stack) \
	\ // row 0: a0 * rb starts the accumulator in R9..R14; word 0 is final at once
	MOVQ a0, DX \
	MULXQ 0+rb, R8, R9 \
	MOVQ R8, 0+stack \
	MOVQ $0, R8 \
	MULXQ 8+rb, AX, R10 \
	ADDQ AX, R9 \
	MULXQ 16+rb, AX, R11 \
	ADCQ AX, R10 \
	MULXQ 24+rb, AX, R12 \
	ADCQ AX, R11 \
	MULXQ 32+rb, AX, R13 \
	ADCQ AX, R12 \
	MULXQ 40+rb, AX, R14 \
	ADCQ AX, R13 \
	ADCQ $0, R14 \
	\
	\ // rows 1..4: the accumulator window rotates by one register per row
	mulxRow(a1, rb,  8+stack, R9,R10,R11,R12,R13,R14,R8) \
	mulxRow(a2, rb, 16+stack, R10,R11,R12,R13,R14,R8,R9) \
	mulxRow(a3, rb, 24+stack, R11,R12,R13,R14,R8,R9,R10) \
	mulxRow(a4, rb, 32+stack, R12,R13,R14,R8,R9,R10,R11) \
	\
	\ // row 5: like the rows above, but R13 is not cleared and no carry-out word is produced
	MOVQ a5, DX \
	MULXQ 0+rb, AX, BX \
	ADDQ AX, R13 \
	MOVQ R13, 40+stack \
	ADCQ BX, R14 \
	MULXQ 16+rb, AX, BX \
	ADCQ AX, R8 \
	ADCQ BX, R9 \
	MULXQ 32+rb, AX, BX \
	ADCQ AX, R10 \
	ADCQ BX, R11 \
	ADCQ $0, R12 \
	MULXQ 8+rb, AX, BX \
	ADDQ AX, R14 \
	ADCQ BX, R8 \
	MULXQ 24+rb, AX, BX \
	ADCQ AX, R9 \
	ADCQ BX, R10 \
	MULXQ 40+rb, AX, BX \
	ADCQ AX, R11 \
	ADCQ BX, R12
530
// fp384MontFactorBMI2 computes m = (T * P') mod R with MULX: the low six
// words of the product of T[0..5] (the six words at 0..40+stack) and ·pp.
// m is stored at 96..136+stack. Only partial products that land below word 6
// are accumulated; high halves that would land at word 6 or above are
// produced in BX and ignored.
// The second word of ·pp is never multiplied: T is added in at a one-word
// offset instead. NOTE(review): that is only correct if ·pp+8(SB) is 1;
// confirm against the definition of pp.
// Clobbers AX, BX, DX, R8-R13 and the flags.
#define fp384MontFactorBMI2(stack) \
	\ // pp[0] * T[0..5] -> R8..R13
	MOVQ ·pp+0(SB), DX \
	MULXQ 0+stack, R8, R9 \
	MULXQ 8+stack, AX, R10 \
	ADDQ AX, R9 \
	MULXQ 16+stack, AX, R11 \
	ADCQ AX, R10 \
	MULXQ 24+stack, AX, R12 \
	ADCQ AX, R11 \
	MULXQ 32+stack, AX, R13 \
	ADCQ AX, R12 \
	MULXQ 40+stack, AX, BX \
	ADCQ AX, R13 \
	\
	\ // T[0..4] added one word up (takes the place of the pp[1] row)
	ADDQ 0+stack, R9 \
	ADCQ 8+stack, R10 \
	ADCQ 16+stack, R11 \
	ADCQ 24+stack, R12 \
	ADCQ 32+stack, R13 \
	\
	\ // pp[2] * T[0..3] -> words 2..5 (even words of T first, then odd)
	MOVQ ·pp+16(SB), DX \
	MULXQ 0+stack, AX, BX \
	ADDQ AX, R10 \
	ADCQ BX, R11 \
	MULXQ 16+stack, AX, BX \
	ADCQ AX, R12 \
	ADCQ BX, R13 \
	MULXQ 8+stack, AX, BX \
	ADDQ AX, R11 \
	ADCQ BX, R12 \
	MULXQ 24+stack, AX, BX \
	ADCQ AX, R13 \
	\
	\ // pp[3] * T[0..2] -> words 3..5
	MOVQ ·pp+24(SB), DX \
	MULXQ 0+stack, AX, BX \
	ADDQ AX, R11 \
	ADCQ BX, R12 \
	MULXQ 16+stack, AX, BX \
	ADCQ AX, R13 \
	MULXQ 8+stack, AX, BX \
	ADDQ AX, R12 \
	ADCQ BX, R13 \
	\
	\ // pp[4] * T[0..1] -> words 4..5
	MOVQ ·pp+32(SB), DX \
	MULXQ 0+stack, AX, BX \
	ADDQ AX, R12 \
	ADCQ BX, R13 \
	MULXQ 8+stack, AX, BX \
	ADDQ AX, R13 \
	\
	\ // pp[5] * T[0] -> word 5
	MOVQ ·pp+40(SB), DX \
	MULXQ 0+stack, AX, BX \
	ADDQ AX, R13 \
	\
	storeBlock(R8,R9,R10,R11,R12,R13, 96+stack)

// fp384ReduceBMI2 reduces the 12-word value T stored at 0..88+stack (MULX
// variant).
//
// Steps: compute m with fp384MontFactorBMI2, form m*p with mulBMI2 (low six
// words at 144..184+stack, high six words in R14,R8,R9,R10,R11,R12), add T,
// and keep the upper six words plus the carry, which fp384Carry then brings
// below p with one conditional subtraction. The lower six words of the sum
// are computed only for their carry.
//
// The final carry word is kept in memory at 0+stack, overwriting T[0] once it
// has been consumed.
// Memory use relative to stack: 0..95 input T, 96..143 m, 144..191 low m*p.
// Result: R14, R8, R9, R10, R11, R12 (R14 least significant).
// Clobbers AX, BX, CX, DX, SI, DI, R8-R14 and the flags.
#define fp384ReduceBMI2(stack) \
	fp384MontFactorBMI2(stack) \
	\
	\ // m * P
	mulBMI2(·p+0(SB),·p+8(SB),·p+16(SB),·p+24(SB),·p+32(SB),·p+40(SB), 96+stack, 144+stack) \
	\
	\ // T + m*P: low half in scratch registers, high half stays in R14,R8..R12
	loadBlock(144+stack, AX,R13,BX,CX,DX,DI) \
	\
	ADDQ 0+stack, AX \
	ADCQ 8+stack, R13 \
	ADCQ 16+stack, BX \
	ADCQ 24+stack, CX \
	ADCQ 32+stack, DX \
	ADCQ 40+stack, DI \
	ADCQ 48+stack, R14 \
	ADCQ 56+stack, R8 \
	ADCQ 64+stack, R9 \
	ADCQ 72+stack, R10 \
	ADCQ 80+stack, R11 \
	ADCQ 88+stack, R12 \
	\ // carry word goes to memory; MOVQ keeps CF alive for the ADCQ after it
	MOVQ $0, 0+stack \
	ADCQ $0, 0+stack \
	\
	fp384Carry(R14,R8,R9,R10,R11,R12, 0+stack, AX,R13,BX,CX,DX,DI,SI)
610
// fp384Neg stores p - a into c, followed by one conditional subtraction of p
// (so a = 0 yields 0 rather than p).
//
// Arguments (16 bytes): c+0(FP) pointer to the six-word result,
//                       a+8(FP) pointer to the six-word operand.
// The borrow out of p - a is not examined, so a is presumably expected to be
// at most p -- confirm with callers.
TEXT ·fp384Neg(SB), NOSPLIT, $0-16
	MOVQ a+8(FP), DI
	MOVQ $0, R15 // seventh (carry) word handed to fp384Carry

	// R13:...:R8 = p
	MOVQ ·p+0(SB), R8
	MOVQ ·p+8(SB), R9
	MOVQ ·p+16(SB), R10
	MOVQ ·p+24(SB), R11
	MOVQ ·p+32(SB), R12
	MOVQ ·p+40(SB), R13

	// R13:...:R8 -= a
	SUBQ 0(DI), R8
	SBBQ 8(DI), R9
	SBBQ 16(DI), R10
	SBBQ 24(DI), R11
	SBBQ 32(DI), R12
	SBBQ 40(DI), R13

	// DI and SI are scratch here, so the result pointer is fetched afterwards.
	fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11,R12,R13, 0(DI))
	RET
633
// fp384Add stores a + b into c, followed by one conditional subtraction of p.
//
// Arguments (24 bytes): c+0(FP) pointer to the six-word result,
//                       a+8(FP) and b+16(FP) pointers to the six-word operands.
TEXT ·fp384Add(SB), NOSPLIT, $0-24
	MOVQ $0, R15 // receives the carry out of the 384-bit addition
	MOVQ b+16(FP), SI
	MOVQ a+8(FP), DI

	// R13:...:R8 = a
	loadBlock(0(DI), R8,R9,R10,R11,R12,R13)

	// R15:R13:...:R8 = a + b
	ADDQ 0(SI), R8
	ADCQ 8(SI), R9
	ADCQ 16(SI), R10
	ADCQ 24(SI), R11
	ADCQ 32(SI), R12
	ADCQ 40(SI), R13
	ADCQ $0, R15

	// DI and SI are scratch here, so the result pointer is fetched afterwards.
	fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11,R12,R13, 0(DI))
	RET
654
// fp384Sub stores a - b into c, computed as (p - b) + a followed by one
// conditional subtraction of p.
//
// Arguments (24 bytes): c+0(FP) pointer to the six-word result,
//                       a+8(FP) and b+16(FP) pointers to the six-word operands.
// The borrow out of p - b is not examined, so b is presumably expected to be
// at most p -- confirm with callers.
TEXT ·fp384Sub(SB), NOSPLIT, $0-24
	MOVQ b+16(FP), DI
	MOVQ $0, R15 // receives the carry out of the addition below

	// R13:...:R8 = p
	MOVQ ·p+0(SB), R8
	MOVQ ·p+8(SB), R9
	MOVQ ·p+16(SB), R10
	MOVQ ·p+24(SB), R11
	MOVQ ·p+32(SB), R12
	MOVQ ·p+40(SB), R13

	// R13:...:R8 -= b
	SUBQ 0(DI), R8
	SBBQ 8(DI), R9
	SBBQ 16(DI), R10
	SBBQ 24(DI), R11
	SBBQ 32(DI), R12
	SBBQ 40(DI), R13

	// R15:R13:...:R8 += a
	MOVQ a+8(FP), DI
	ADDQ 0(DI), R8
	ADCQ 8(DI), R9
	ADCQ 16(DI), R10
	ADCQ 24(DI), R11
	ADCQ 32(DI), R12
	ADCQ 40(DI), R13
	ADCQ $0, R15

	// DI and SI are scratch here, so the result pointer is fetched afterwards.
	fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11,R12,R13, 0(DI))
	RET
686
// fp384Mul multiplies the six-word values at a and b and stores the reduced
// six-word product at c.
//
// Arguments (24 bytes): c+0(FP) result pointer, a+8(FP) and b+16(FP) operand
// pointers.
// Frame (240 bytes): 0..95(SP) holds the 12-word product T; the reduction
// macros use 96..143(SP) for m and 144..239(SP) for m*p.
// Two code paths exist, selected at run time by the byte ·hasBMI2: one built
// on MULX, one on the classic MULQ.
TEXT ·fp384Mul(SB), NOSPLIT, $240-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	// MULX is unavailable when hasBMI2 is zero: take the MULQ path.
	CMPB ·hasBMI2(SB), $0
	JE   mulqPath

	// T = a * b. mulBMI2 stores only the low half; the high half arrives in
	// registers and is spilled to 48..88(SP) here.
	mulBMI2(0(DI),8(DI),16(DI),24(DI),32(DI),40(DI), 0(SI), 0(SP))
	storeBlock(R14,R8,R9,R10,R11,R12, 48(SP))

	// Reduce T; the result comes back in R14,R8,R9,R10,R11,R12.
	fp384ReduceBMI2(0(SP))

	MOVQ c+0(FP), DI
	storeBlock(R14,R8,R9,R10,R11,R12, 0(DI))
	RET

mulqPath:
	// T = a * b, written in full to 0..88(SP).
	mul(0(DI),8(DI),16(DI),24(DI),32(DI),40(DI), 0(SI), 0(SP))

	// Reduce T; the result comes back in R14,SI,AX,BX,CX,DX.
	fp384Reduce(0(SP))

	MOVQ c+0(FP), DI
	storeBlock(R14,SI,AX,BX,CX,DX, 0(DI))
	RET
718
// fp384Cmov conditionally copies the six 64-bit words at y over the six words
// at x: when b is non-zero x receives y, when b is zero every word of x is
// rewritten with its own value.
//
// Arguments (24 bytes): x+0(FP) destination pointer, y+8(FP) source pointer,
//                       b+16(FP) 64-bit selector.
// The selection uses CMOVQNE on the flags set once by TESTQ, so there is no
// branch on b and the same loads and stores are issued for either value.
// Clobbers AX, BX, DX, SI, DI and the flags.
//
// The argument size is spelled out ($0-24) like in the other TEXT directives
// of this file; the body reads exactly three 8-byte arguments.
TEXT ·fp384Cmov(SB), NOSPLIT, $0-24
	MOVQ x+0(FP), DI
	MOVQ y+8(FP), SI
	MOVQ b+16(FP), BX
	TESTQ BX, BX // ZF = (b == 0); MOVQ and CMOVQNE below leave the flags alone

	MOVQ 0(DI), AX
	MOVQ 0(SI), DX
	CMOVQNE DX, AX
	MOVQ AX, 0(DI)

	MOVQ 8(DI), AX
	MOVQ 8(SI), DX
	CMOVQNE DX, AX
	MOVQ AX, 8(DI)

	MOVQ 16(DI), AX
	MOVQ 16(SI), DX
	CMOVQNE DX, AX
	MOVQ AX, 16(DI)

	MOVQ 24(DI), AX
	MOVQ 24(SI), DX
	CMOVQNE DX, AX
	MOVQ AX, 24(DI)

	MOVQ 32(DI), AX
	MOVQ 32(SI), DX
	CMOVQNE DX, AX
	MOVQ AX, 32(DI)

	MOVQ 40(DI), AX
	MOVQ 40(SI), DX
	CMOVQNE DX, AX
	MOVQ AX, 40(DI)
	RET
View as plain text