1// +build amd64
2
3#include "textflag.h"
4
// mul512Amd64 multiplies a 512-bit value by a 64-bit value and stores the
// low 512 bits of the product:
//
//	a = (b * c) mod 2^512
//
// b is read as eight 64-bit limbs at offsets 0, 8, ..., 56 and a is written
// at the same offsets. The high half of each 64x64 partial product is carried
// into the next limb; the high half of the last partial product is dropped.
// Each limb of b is read before the matching limb of a is written.
//
// Two code paths exist. If the byte ·hasBMI2 equals 1 the MULX based path is
// taken, otherwise the MULQ based path runs.
//
// Registers used: AX, DX, SI, DI, R10, R11
//
// func mul512Amd64(a, b *Fp, c uint64)
TEXT ·mul512Amd64(SB), NOSPLIT, $0-24
	MOVQ a+0(FP), DI // DI = a (destination)
	MOVQ b+8(FP), SI // SI = b (512-bit multiplicand)

	// Select the implementation based on the BMI2 capability flag.
	CMPB ·hasBMI2(SB), $1
	JE   mulx_path

	// MULQ path.
	// MULQ multiplies AX by its operand and leaves the product in DX:AX,
	// so AX is reloaded from R10 before each step. R11 carries the high
	// half of the previous partial product.
	MOVQ c+16(FP), R10 // R10 = c (64-bit multiplier)

	// a[0]
	MOVQ R10, AX
	MULQ 0(SI)
	MOVQ DX, R11
	MOVQ AX, 0(DI)

	// a[1]
	MOVQ R10, AX
	MULQ 8(SI)
	ADDQ R11, AX
	ADCQ $0, DX
	MOVQ DX, R11
	MOVQ AX, 8(DI)

	// a[2]
	MOVQ R10, AX
	MULQ 16(SI)
	ADDQ R11, AX
	ADCQ $0, DX
	MOVQ DX, R11
	MOVQ AX, 16(DI)

	// a[3]
	MOVQ R10, AX
	MULQ 24(SI)
	ADDQ R11, AX
	ADCQ $0, DX
	MOVQ DX, R11
	MOVQ AX, 24(DI)

	// a[4]
	MOVQ R10, AX
	MULQ 32(SI)
	ADDQ R11, AX
	ADCQ $0, DX
	MOVQ DX, R11
	MOVQ AX, 32(DI)

	// a[5]
	MOVQ R10, AX
	MULQ 40(SI)
	ADDQ R11, AX
	ADCQ $0, DX
	MOVQ DX, R11
	MOVQ AX, 40(DI)

	// a[6]
	MOVQ R10, AX
	MULQ 48(SI)
	ADDQ R11, AX
	ADCQ $0, DX
	MOVQ DX, R11
	MOVQ AX, 48(DI)

	// a[7]: the high half in DX is beyond 2^512 and is not kept.
	MOVQ R10, AX
	MULQ 56(SI)
	ADDQ R11, AX
	MOVQ AX, 56(DI)
	RET

// MULX path, taken when ·hasBMI2 is 1.
// MULXQ takes its implicit multiplier from DX and does not modify flags, and
// neither does MOVQ, so the carry produced by ADDQ/ADCQ for one limb is still
// live when the ADCQ of the next limb executes. R10 and R11 alternate as the
// holder of the high half of the previous partial product.
mulx_path:
	MOVQ c+16(FP), DX // DX = c (64-bit multiplier)

	// a[0]
	MULXQ 0(SI), AX, R10
	MOVQ  AX, 0(DI)

	// a[1]: starts the carry chain.
	MULXQ 8(SI), AX, R11
	ADDQ  R10, AX
	MOVQ  AX, 8(DI)

	// a[2]
	MULXQ 16(SI), AX, R10
	ADCQ  R11, AX
	MOVQ  AX, 16(DI)

	// a[3]
	MULXQ 24(SI), AX, R11
	ADCQ  R10, AX
	MOVQ  AX, 24(DI)

	// a[4]
	MULXQ 32(SI), AX, R10
	ADCQ  R11, AX
	MOVQ  AX, 32(DI)

	// a[5]
	MULXQ 40(SI), AX, R11
	ADCQ  R10, AX
	MOVQ  AX, 40(DI)

	// a[6]
	MULXQ 48(SI), AX, R10
	ADCQ  R11, AX
	MOVQ  AX, 48(DI)

	// a[7]: the high half in R11 is beyond 2^512 and is not kept.
	MULXQ 56(SI), AX, R11
	ADCQ  R10, AX
	MOVQ  AX, 56(DI)
	RET
44
// cswap512Amd64 conditionally exchanges the 64 bytes at x with the 64 bytes
// at y, depending on the byte argument choice:
//
//	choice == 0: both buffers are rewritten with their current contents
//	choice == 1: the contents of x and y are exchanged
//
// The exchange is done with a mask, so no branch and no memory address
// depends on choice. The mask is only all-zeros or all-ones when choice is
// 0 or 1; other values are not handled.
//
// Arguments (from the frame layout): x at +0, y at +8, choice (1 byte) at +16.
// NOTE(review): presumably declared in Go as
// func cswap512Amd64(x, y *Fp, choice uint8) - confirm against the Go stub.
//
// Registers used: AX, SI, DI, X0, X1, X2, X15
TEXT ·cswap512Amd64(SB),NOSPLIT,$0-17
	MOVQ    x+0(FP), DI
	MOVQ    y+8(FP), SI
	MOVBLZX choice+16(FP), AX // zero-extended: AX = 0 or 1

	// Turn the selector into a 64-bit mask:
	// AX = 0 stays 0, AX = 1 becomes 0xFFFFFFFFFFFFFFFF.
	NEGQ AX

	// Move the mask into the low 64 bits of X15; the upper 64 bits
	// are cleared by this move.
	MOVQ AX, X15

	// Broadcast the lowest 32-bit lane of X15 into all four lanes, so
	// the whole 128-bit register is either all zeros or all ones.
	PSHUFD $0, X15, X15

// CSWAP_BLOCK(idx) processes the idx-th 16-byte chunk of both buffers:
//
//	X2 = (chunk_x XOR chunk_y) AND mask
//	chunk_x = chunk_x XOR X2
//	chunk_y = chunk_y XOR X2
//
// With an all-zero mask X2 is zero and both chunks are stored unchanged;
// with an all-ones mask the two chunks trade places.
// It expects DI = x, SI = y, X15 = mask and clobbers X0, X1, X2.
#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
	MOVOU (idx*16)(DI), X0 \
	MOVOU (idx*16)(SI), X1 \
	\
	MOVO X0, X2 \
	PXOR X1, X2 \
	PAND X15, X2 \
	\
	PXOR X2, X0 \
	PXOR X2, X1 \
	\
	MOVOU X0, (idx*16)(DI) \
	MOVOU X1, (idx*16)(SI)
#endif

	// 4 chunks * 16 bytes = 64 bytes = 512 bits.
	CSWAP_BLOCK(0)
	CSWAP_BLOCK(1)
	CSWAP_BLOCK(2)
	CSWAP_BLOCK(3)

	RET
86
// mulBmiAsm implements Montgomery multiplication interleaved with
// Montgomery reduction. It uses the MULX and ADCX/ADOX instructions.
// The implementation is specific to the 511-bit prime 'p'.
//
// x is read through DI and y through SI, both as eight 64-bit limbs. The
// running value is kept in nine registers (R8-R14, CX, BP) whose roles
// rotate by one position on every iteration. Eight 64-bit limbs are written
// to res. No final conditional subtraction is performed in this routine;
// per the original note, the value stored in res still needs to be reduced
// if it is greater than p.
//
// Registers used: AX, BX, CX, DX, SI, DI, BP, R8-R14
// Stack: one 8-byte slot at 0(SP) holding the incoming BP.
//
// func mulBmiAsm(res, x, y *fp)
TEXT ·mulBmiAsm(SB),NOSPLIT,$8-24

	// BP is used as an accumulator below, so keep its incoming value
	// in the local stack slot until the end of the routine.
	MOVQ BP, 0(SP)

	MOVQ x+8(FP), DI  // multiplicand
	MOVQ y+16(FP), SI // multiplier

	// Clear all nine accumulator registers.
	XORQ R8, R8
	XORQ R9, R9
	XORQ R10, R10
	XORQ R11, R11
	XORQ R12, R12
	XORQ R13, R13
	XORQ R14, R14
	XORQ CX, CX
	XORQ BP, BP

// MULS_MULX_512(limb, acc0..acc8) performs one iteration for x[limb].
//
// Reduction step:
//   DX = low64(x[limb] * y[0]) + acc0
//   DX = low64(DX * pNegInv)
//   then DX * p[0..7] is added into acc0..acc8.
// Multiplication step:
//   DX = x[limb]
//   then DX * y[0..7] is added into acc0..acc8.
//
// In both steps the low halves of the partial products are added with ADOXQ
// (carry kept in OF) and the high halves with ADCXQ (carry kept in CF), so
// two independent carry chains run side by side. XORQ AX, AX clears both
// flags before each chain starts. The trailing zero is loaded with MOVQ, not
// XORQ, because MOVQ leaves the flags untouched and the pending OF carry
// still has to be added into acc8.
//
// Uses BMI2 (MULX). Clobbers AX, BX, DX and the flags.
#ifdef MULS_MULX_512
#undef MULS_MULX_512
#endif
#define MULS_MULX_512(limb, acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8) \
	\ // Reduction step
	MOVQ (0)(SI), DX \
	MULXQ (8*limb)(DI), DX, AX \
	ADDQ acc0, DX \
	MOVQ ·pNegInv(SB), AX \
	MULXQ AX, DX, AX \
	\
	XORQ AX, AX \
	\
	MOVQ ·p+0(SB), AX \
	MULXQ AX, AX, BX \
	ADOXQ AX, acc0 \
	ADCXQ BX, acc1 \
	\
	MOVQ ·p+8(SB), AX \
	MULXQ AX, AX, BX \
	ADOXQ AX, acc1 \
	ADCXQ BX, acc2 \
	\
	MOVQ ·p+16(SB), AX \
	MULXQ AX, AX, BX \
	ADOXQ AX, acc2 \
	ADCXQ BX, acc3 \
	\
	MOVQ ·p+24(SB), AX \
	MULXQ AX, AX, BX \
	ADOXQ AX, acc3 \
	ADCXQ BX, acc4 \
	\
	MOVQ ·p+32(SB), AX \
	MULXQ AX, AX, BX \
	ADOXQ AX, acc4 \
	ADCXQ BX, acc5 \
	\
	MOVQ ·p+40(SB), AX \
	MULXQ AX, AX, BX \
	ADOXQ AX, acc5 \
	ADCXQ BX, acc6 \
	\
	MOVQ ·p+48(SB), AX \
	MULXQ AX, AX, BX \
	ADOXQ AX, acc6 \
	ADCXQ BX, acc7 \
	\
	MOVQ ·p+56(SB), AX \
	MULXQ AX, AX, BX \
	ADOXQ AX, acc7 \
	ADCXQ BX, acc8 \
	\
	MOVQ $0, AX \
	ADOXQ AX, acc8 \
	\ // Multiplication step
	MOVQ (8*limb)(DI), DX \
	\
	XORQ AX, AX \
	\
	MULXQ (0)(SI), AX, BX \
	ADOXQ AX, acc0 \
	ADCXQ BX, acc1 \
	\
	MULXQ (8)(SI), AX, BX \
	ADOXQ AX, acc1 \
	ADCXQ BX, acc2 \
	\
	MULXQ (16)(SI), AX, BX \
	ADOXQ AX, acc2 \
	ADCXQ BX, acc3 \
	\
	MULXQ (24)(SI), AX, BX \
	ADOXQ AX, acc3 \
	ADCXQ BX, acc4 \
	\
	MULXQ (32)(SI), AX, BX \
	ADOXQ AX, acc4 \
	ADCXQ BX, acc5 \
	\
	MULXQ (40)(SI), AX, BX \
	ADOXQ AX, acc5 \
	ADCXQ BX, acc6 \
	\
	MULXQ (48)(SI), AX, BX \
	ADOXQ AX, acc6 \
	ADCXQ BX, acc7 \
	\
	MULXQ (56)(SI), AX, BX \
	ADOXQ AX, acc7 \
	ADCXQ BX, acc8 \
	\
	MOVQ $0, AX \
	ADOXQ AX, acc8

	// One iteration per limb of x. The accumulator list is rotated left by
	// one register each time, so the register that was acc0 in one
	// iteration becomes acc8 in the next.
	MULS_MULX_512(0, R8, R9, R10, R11, R12, R13, R14, CX, BP)
	MULS_MULX_512(1, R9, R10, R11, R12, R13, R14, CX, BP, R8)
	MULS_MULX_512(2, R10, R11, R12, R13, R14, CX, BP, R8, R9)
	MULS_MULX_512(3, R11, R12, R13, R14, CX, BP, R8, R9, R10)
	MULS_MULX_512(4, R12, R13, R14, CX, BP, R8, R9, R10, R11)
	MULS_MULX_512(5, R13, R14, CX, BP, R8, R9, R10, R11, R12)
	MULS_MULX_512(6, R14, CX, BP, R8, R9, R10, R11, R12, R13)
	MULS_MULX_512(7, CX, BP, R8, R9, R10, R11, R12, R13, R14)
#undef MULS_MULX_512

	// Write the eight result limbs to res, lowest limb first.
	MOVQ res+0(FP), DI
	MOVQ BP, (0)(DI)
	MOVQ R8, (8)(DI)
	MOVQ R9, (16)(DI)
	MOVQ R10, (24)(DI)
	MOVQ R11, (32)(DI)
	MOVQ R12, (40)(DI)
	MOVQ R13, (48)(DI)
	MOVQ R14, (56)(DI)

	// Restore the BP value saved on entry.
	MOVQ 0(SP), BP

	// The value now stored at res (DI) still needs to be reduced if > p.
	RET
View as plain text