// This code was imported from https://github.com/armfazh/rfc7748_precomputed

// CHECK_BMI2ADX dispatches between two implementations of one function:
// if the CPU supports BMI2+ADX (cached in the global ·hasBmi2Adx flag),
// the bmi2adx code runs; otherwise control jumps to label and the legacy
// code runs. Both paths end in RET, so this macro must form the entire
// function body.
#define CHECK_BMI2ADX(label, legacy, bmi2adx) \
 CMPB ·hasBmi2Adx(SB), $0 \
 JE label \
 bmi2adx \
 RET \
 label: \
 legacy \
 RET

// cselect is a conditional move over 448-bit values held as seven
// 64-bit limbs at byte offsets 0,8,...,48:
// if b=1: it copies y into x;
// if b=0: x remains with the same value;
// if b<> 0,1: undefined.
// TESTQ sets the flags once; MOVQ does not modify flags, so every
// CMOVQNE below observes the same condition. Branch-free by design.
// Uses: AX, DX, FLAGS
// Instr: x86_64, cmov
#define cselect(x,y,b) \
 TESTQ b, b \
 MOVQ 0+x, AX; MOVQ 0+y, DX; CMOVQNE DX, AX; MOVQ AX, 0+x; \
 MOVQ 8+x, AX; MOVQ 8+y, DX; CMOVQNE DX, AX; MOVQ AX, 8+x; \
 MOVQ 16+x, AX; MOVQ 16+y, DX; CMOVQNE DX, AX; MOVQ AX, 16+x; \
 MOVQ 24+x, AX; MOVQ 24+y, DX; CMOVQNE DX, AX; MOVQ AX, 24+x; \
 MOVQ 32+x, AX; MOVQ 32+y, DX; CMOVQNE DX, AX; MOVQ AX, 32+x; \
 MOVQ 40+x, AX; MOVQ 40+y, DX; CMOVQNE DX, AX; MOVQ AX, 40+x; \
 MOVQ 48+x, AX; MOVQ 48+y, DX; CMOVQNE DX, AX; MOVQ AX, 48+x;

// cswap is a conditional swap over 448-bit values (seven 64-bit limbs):
// if b=1: x,y <- y,x;
// if b=0: x,y remain with the same values;
// if b<> 0,1: undefined.
// TESTQ sets the flags once; the intervening MOVQs preserve them, so
// all CMOVQNEs use the same condition. R8 preserves the original limb
// of x so both cross-moves can complete. Branch-free by design.
// Uses: AX, DX, R8, FLAGS
// Instr: x86_64, cmov
#define cswap(x,y,b) \
 TESTQ b, b \
 MOVQ 0+x, AX; MOVQ AX, R8; MOVQ 0+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 0+x; MOVQ DX, 0+y; \
 MOVQ 8+x, AX; MOVQ AX, R8; MOVQ 8+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 8+x; MOVQ DX, 8+y; \
 MOVQ 16+x, AX; MOVQ AX, R8; MOVQ 16+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 16+x; MOVQ DX, 16+y; \
 MOVQ 24+x, AX; MOVQ AX, R8; MOVQ 24+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 24+x; MOVQ DX, 24+y; \
 MOVQ 32+x, AX; MOVQ AX, R8; MOVQ 32+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 32+x; MOVQ DX, 32+y; \
 MOVQ 40+x, AX; MOVQ AX, R8; MOVQ 40+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 40+x; MOVQ DX, 40+y; \
 MOVQ 48+x, AX; MOVQ AX, R8; MOVQ 48+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 48+x; MOVQ DX, 48+y;

// additionLeg adds the 7-limb (448-bit) values x and y and stores the
// result in z. The carry out of limb 6 is folded back in at bit 0 and
// at bit 224 (DX = carry<<32 is added into limb 3, i.e. bit 192+32),
// which matches reduction modulo the Goldilocks prime
// p = 2^448 - 2^224 - 1. The fold is performed twice because the first
// fold can itself produce a carry; the result fits in seven limbs but
// is not necessarily fully reduced.
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64
#define additionLeg(z,x,y) \
 MOVQ 0+x, R8; ADDQ 0+y, R8; \
 MOVQ 8+x, R9; ADCQ 8+y, R9; \
 MOVQ 16+x, R10; ADCQ 16+y, R10; \
 MOVQ 24+x, R11; ADCQ 24+y, R11; \
 MOVQ 32+x, R12; ADCQ 32+y, R12; \
 MOVQ 40+x, R13; ADCQ 40+y, R13; \
 MOVQ 48+x, R14; ADCQ 48+y, R14; \
 MOVQ $0, AX; ADCQ $0, AX; \
 MOVQ AX, DX; \
 SHLQ $32, DX; \
 ADDQ AX, R8; MOVQ $0, AX; \
 ADCQ $0, R9; \
 ADCQ $0, R10; \
 ADCQ DX, R11; \
 ADCQ $0, R12; \
 ADCQ $0, R13; \
 ADCQ $0, R14; \
 ADCQ $0, AX; \
 MOVQ AX, DX; \
 SHLQ $32, DX; \
 ADDQ AX, R8; MOVQ R8, 0+z; \
 ADCQ $0, R9; MOVQ R9, 8+z; \
 ADCQ $0, R10; MOVQ R10, 16+z; \
 ADCQ DX, R11; MOVQ R11, 24+z; \
 ADCQ $0, R12; MOVQ R12, 32+z; \
 ADCQ $0, R13; MOVQ R13, 40+z; \
 ADCQ $0, R14; MOVQ R14, 48+z;


// additionAdx adds the 7-limb (448-bit) values x and y and stores the
// result in z. Same double carry-fold scheme as additionLeg (carry
// re-added at bit 0 and at bit 224, twice), implemented with the ADX
// extension: XORL clears CF so each ADCXQ chain starts clean, and R15
// holds the constant shift amount 32 for SHLXQ. Runs of ';' are empty
// statements used only to align columns.
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, adx
#define additionAdx(z,x,y) \
 MOVL $32, R15; \
 XORL DX, DX; \
 MOVQ 0+x, R8; ADCXQ 0+y, R8; \
 MOVQ 8+x, R9; ADCXQ 8+y, R9; \
 MOVQ 16+x, R10; ADCXQ 16+y, R10; \
 MOVQ 24+x, R11; ADCXQ 24+y, R11; \
 MOVQ 32+x, R12; ADCXQ 32+y, R12; \
 MOVQ 40+x, R13; ADCXQ 40+y, R13; \
 MOVQ 48+x, R14; ADCXQ 48+y, R14; \
 ;;;;;;;;;;;;;;; ADCXQ DX, DX; \
 XORL AX, AX; \
 ADCXQ DX, R8; SHLXQ R15, DX, DX; \
 ADCXQ AX, R9; \
 ADCXQ AX, R10; \
 ADCXQ DX, R11; \
 ADCXQ AX, R12; \
 ADCXQ AX, R13; \
 ADCXQ AX, R14; \
 ADCXQ AX, AX; \
 XORL DX, DX; \
 ADCXQ AX, R8; MOVQ R8, 0+z; SHLXQ R15, AX, AX; \
 ADCXQ DX, R9; MOVQ R9, 8+z; \
 ADCXQ DX, R10; MOVQ R10, 16+z; \
 ADCXQ AX, R11; MOVQ R11, 24+z; \
 ADCXQ DX, R12; MOVQ R12, 32+z; \
 ADCXQ DX, R13; MOVQ R13, 40+z; \
 ADCXQ DX, R14; MOVQ R14, 48+z;

// subtraction subtracts the 7-limb value y from x and stores the result
// in z. A borrow out of limb 6 is folded back by subtracting it at bit 0
// and at bit 224 (DX = borrow<<32 subtracted from limb 3), mirroring
// additionLeg's fold for p = 2^448 - 2^224 - 1. SETCS writes CF into the
// low byte of AX; AX is zeroed beforehand so the full register holds 0
// or 1. As with addition, the fold is performed twice.
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64
#define subtraction(z,x,y) \
 MOVQ 0+x, R8; SUBQ 0+y, R8; \
 MOVQ 8+x, R9; SBBQ 8+y, R9; \
 MOVQ 16+x, R10; SBBQ 16+y, R10; \
 MOVQ 24+x, R11; SBBQ 24+y, R11; \
 MOVQ 32+x, R12; SBBQ 32+y, R12; \
 MOVQ 40+x, R13; SBBQ 40+y, R13; \
 MOVQ 48+x, R14; SBBQ 48+y, R14; \
 MOVQ $0, AX; SETCS AX; \
 MOVQ AX, DX; \
 SHLQ $32, DX; \
 SUBQ AX, R8; MOVQ $0, AX; \
 SBBQ $0, R9; \
 SBBQ $0, R10; \
 SBBQ DX, R11; \
 SBBQ $0, R12; \
 SBBQ $0, R13; \
 SBBQ $0, R14; \
 SETCS AX; \
 MOVQ AX, DX; \
 SHLQ $32, DX; \
 SUBQ AX, R8; MOVQ R8, 0+z; \
 SBBQ $0, R9; MOVQ R9, 8+z; \
 SBBQ $0, R10; MOVQ R10, 16+z; \
 SBBQ DX, R11; MOVQ R11, 24+z; \
 SBBQ $0, R12; MOVQ R12, 32+z; \
 SBBQ $0, R13; MOVQ R13, 40+z; \
 SBBQ $0, R14; MOVQ R14, 48+z;

// maddBmi2Adx multiplies the 7-limb value x by the i-th limb of y and
// accumulates the product into the caller-supplied running limbs
// r0..r6, storing the completed lowest limb at offset i of z. The
// XORL clears both CF and OF so the two independent carry chains start
// clean: ADOXQ (OF) accumulates the low halves and ADCXQ (CF) the high
// halves of each MULXQ product. R8 is zeroed mid-stream and used at the
// end to flush both chains into r0. Runs of ';' are empty statements
// used only to align columns.
// Uses: AX, DX, R8, FLAGS (plus the caller-supplied r0..r6)
// Instr: x86_64, bmi2, adx
#define maddBmi2Adx(z,x,y,i,r0,r1,r2,r3,r4,r5,r6) \
 MOVQ i+y, DX; XORL AX, AX; \
 MULXQ 0+x, AX, R8; ADOXQ AX, r0; ADCXQ R8, r1; MOVQ r0,i+z; \
 MULXQ 8+x, AX, r0; ADOXQ AX, r1; ADCXQ r0, r2; MOVQ $0, R8; \
 MULXQ 16+x, AX, r0; ADOXQ AX, r2; ADCXQ r0, r3; \
 MULXQ 24+x, AX, r0; ADOXQ AX, r3; ADCXQ r0, r4; \
 MULXQ 32+x, AX, r0; ADOXQ AX, r4; ADCXQ r0, r5; \
 MULXQ 40+x, AX, r0; ADOXQ AX, r5; ADCXQ r0, r6; \
 MULXQ 48+x, AX, r0; ADOXQ AX, r6; ADCXQ R8, r0; \
 ;;;;;;;;;;;;;;;;;;; ADOXQ R8, r0;

// integerMulAdx multiplies the 7-limb (448-bit) values x and y and
// stores the full 14-limb (896-bit) product in z[0..104]. The first
// row (y[0]*x) is computed inline; each remaining row is added by
// maddBmi2Adx, with the register window R9..R15 rotated one position
// per row so the lowest live limb is always passed as r0 (and stored
// first). The final seven limbs are written out at the end.
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerMulAdx(z,x,y) \
 MOVL $0,R15; \
 MOVQ 0+y, DX; XORL AX, AX; MOVQ $0, R8; \
 MULXQ 0+x, AX, R9; MOVQ AX, 0+z; \
 MULXQ 8+x, AX, R10; ADCXQ AX, R9; \
 MULXQ 16+x, AX, R11; ADCXQ AX, R10; \
 MULXQ 24+x, AX, R12; ADCXQ AX, R11; \
 MULXQ 32+x, AX, R13; ADCXQ AX, R12; \
 MULXQ 40+x, AX, R14; ADCXQ AX, R13; \
 MULXQ 48+x, AX, R15; ADCXQ AX, R14; \
 ;;;;;;;;;;;;;;;;;;;; ADCXQ R8, R15; \
 maddBmi2Adx(z,x,y, 8, R9,R10,R11,R12,R13,R14,R15) \
 maddBmi2Adx(z,x,y,16,R10,R11,R12,R13,R14,R15, R9) \
 maddBmi2Adx(z,x,y,24,R11,R12,R13,R14,R15, R9,R10) \
 maddBmi2Adx(z,x,y,32,R12,R13,R14,R15, R9,R10,R11) \
 maddBmi2Adx(z,x,y,40,R13,R14,R15, R9,R10,R11,R12) \
 maddBmi2Adx(z,x,y,48,R14,R15, R9,R10,R11,R12,R13) \
 MOVQ R15, 56+z; \
 MOVQ R9, 64+z; \
 MOVQ R10, 72+z; \
 MOVQ R11, 80+z; \
 MOVQ R12, 88+z; \
 MOVQ R13, 96+z; \
 MOVQ R14, 104+z;

// maddLegacy multiplies the 7-limb value x by the i-th limb of y using
// the widening MULQ (DX:AX = AX * R15) and adds the resulting 8-limb
// partial product into z at byte offset i. R8-R14 hold the partial
// limbs; the final ADCQ $0, DX captures the top carry into z[56+i].
// Runs of ';' are empty statements used only to align columns.
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define maddLegacy(z,x,y,i) \
 MOVQ i+y, R15; \
 MOVQ 0+x, AX; MULQ R15; MOVQ AX, R8; ;;;;;;;;;;;; MOVQ DX, R9; \
 MOVQ 8+x, AX; MULQ R15; ADDQ AX, R9; ADCQ $0, DX; MOVQ DX, R10; \
 MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
 MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
 MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
 MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
 MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
 ADDQ 0+i+z, R8; MOVQ R8, 0+i+z; \
 ADCQ 8+i+z, R9; MOVQ R9, 8+i+z; \
 ADCQ 16+i+z, R10; MOVQ R10, 16+i+z; \
 ADCQ 24+i+z, R11; MOVQ R11, 24+i+z; \
 ADCQ 32+i+z, R12; MOVQ R12, 32+i+z; \
 ADCQ 40+i+z, R13; MOVQ R13, 40+i+z; \
 ADCQ 48+i+z, R14; MOVQ R14, 48+i+z; \
 ADCQ $0, DX; MOVQ DX, 56+i+z;

// integerMulLeg multiplies the 7-limb (448-bit) values x and y and
// stores the full 14-limb (896-bit) product in z[0..104] using plain
// MULQ schoolbook multiplication: the first row (y[0]*x) is written
// directly to z, and the six remaining rows are accumulated by
// maddLegacy at increasing offsets.
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerMulLeg(z,x,y) \
 MOVQ 0+y, R15; \
 MOVQ 0+x, AX; MULQ R15; MOVQ AX, 0+z; ;;;;;;;;;;;; MOVQ DX, R8; \
 MOVQ 8+x, AX; MULQ R15; ADDQ AX, R8; ADCQ $0, DX; MOVQ DX, R9; MOVQ R8, 8+z; \
 MOVQ 16+x, AX; MULQ R15; ADDQ AX, R9; ADCQ $0, DX; MOVQ DX, R10; MOVQ R9, 16+z; \
 MOVQ 24+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; MOVQ R10, 24+z; \
 MOVQ 32+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; MOVQ R11, 32+z; \
 MOVQ 40+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; MOVQ R12, 40+z; \
 MOVQ 48+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX,56+z; MOVQ R13, 48+z; \
 maddLegacy(z,x,y, 8) \
 maddLegacy(z,x,y,16) \
 maddLegacy(z,x,y,24) \
 maddLegacy(z,x,y,32) \
 maddLegacy(z,x,y,40) \
 maddLegacy(z,x,y,48)

// integerSqrLeg squares the 7-limb (448-bit) value x and stores the
// 14-limb (896-bit) result in z[0..104]. Each stanza handles one limb
// x[i]: its square x[i]^2 contributes the diagonal term, and the cross
// products x[i]*x[j] (j>i) are generated once but effectively doubled
// by working with CX ≈ 2*x[i]. R15 tracks the bit shifted out when
// doubling CX; the sequence "MOVQ R15, AX; NEGQ AX; ANDQ i+x, AX"
// builds a mask (all-ones iff that carry was set) and re-injects x[i]
// to compensate for the lost top bit.
// NOTE(review): the exact interleaving of the doubling carry with the
// accumulator chain is intricate; the description above reflects the
// apparent structure — verify against the upstream reference before
// altering any instruction order.
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerSqrLeg(z,x) \
 XORL R15, R15; \
 MOVQ 0+x, CX; \
 MOVQ CX, AX; MULQ CX; MOVQ AX, 0+z; MOVQ DX, R8; \
 ADDQ CX, CX; ADCQ $0, R15; \
 MOVQ 8+x, AX; MULQ CX; ADDQ AX, R8; ADCQ $0, DX; MOVQ DX, R9; MOVQ R8, 8+z; \
 MOVQ 16+x, AX; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; MOVQ DX, R10; \
 MOVQ 24+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
 MOVQ 32+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
 MOVQ 40+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
 MOVQ 48+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
 \
 MOVQ 8+x, CX; \
 MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
 ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; MOVQ R9,16+z; \
 MOVQ R15, AX; NEGQ AX; ANDQ 8+x, AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
 ADDQ 8+x, CX; ADCQ $0, R15; \
 MOVQ 16+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 24+z; \
 MOVQ 24+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX, R8; \
 MOVQ 32+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; \
 MOVQ 40+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
 MOVQ 48+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R9; \
 \
 MOVQ 16+x, CX; \
 MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
 ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 32+z; \
 MOVQ R15, AX; NEGQ AX; ANDQ 16+x,AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
 ADDQ 16+x, CX; ADCQ $0, R15; \
 MOVQ 24+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 40+z; \
 MOVQ 32+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
 MOVQ 40+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; \
 MOVQ 48+x, AX; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; ADDQ R8, R9; ADCQ $0, DX; MOVQ DX,R10; \
 \
 MOVQ 24+x, CX; \
 MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
 ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 48+z; \
 MOVQ R15, AX; NEGQ AX; ANDQ 24+x,AX; ADDQ AX, DX; ADCQ $0, R9; MOVQ DX, R8; \
 ADDQ 24+x, CX; ADCQ $0, R15; \
 MOVQ 32+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; MOVQ R14, 56+z; \
 MOVQ 40+x, AX; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; ADDQ R8, R9; ADCQ $0, DX; MOVQ DX, R8; \
 MOVQ 48+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX,R11; \
 \
 MOVQ 32+x, CX; \
 MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
 ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R9; ADCQ $0, DX; MOVQ R9, 64+z; \
 MOVQ R15, AX; NEGQ AX; ANDQ 32+x,AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
 ADDQ 32+x, CX; ADCQ $0, R15; \
 MOVQ 40+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 72+z; \
 MOVQ 48+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX,R12; \
 \
 XORL R13, R13; \
 XORL R14, R14; \
 MOVQ 40+x, CX; \
 MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
 ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 80+z; \
 MOVQ R15, AX; NEGQ AX; ANDQ 40+x,AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
 ADDQ 40+x, CX; ADCQ $0, R15; \
 MOVQ 48+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 88+z; \
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8, R13; ADCQ $0,R14; \
 \
 XORL R9, R9; \
 MOVQ 48+x, CX; \
 MOVQ CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
 ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 96+z; \
 MOVQ R15, AX; NEGQ AX; ANDQ 48+x,AX; ADDQ AX, DX; ADCQ $0, R9; MOVQ DX, R8; \
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8,R14; ADCQ $0, R9; MOVQ R14, 104+z;


// integerSqrAdx squares the 7-limb (448-bit) value x and stores the
// 14-limb (896-bit) result in z[0..104]. Same doubling scheme as
// integerSqrLeg — DX holds ≈ 2*x[i], R15 tracks the bit shifted out of
// the doubling, and the NEGQ/ANDQ mask re-injects x[i] when that carry
// was set — but implemented with MULXQ plus the ADX dual carry chains
// (ADCXQ on CF, ADOXQ on OF). XORL before each stanza clears both
// flags so the chains start clean. Runs of ';' are empty statements
// used only to align columns.
// NOTE(review): instruction order is load-bearing for the carry
// chains; verify against the upstream reference before reordering.
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerSqrAdx(z,x) \
 XORL R15, R15; \
 MOVQ 0+x, DX; \
 ;;;;;;;;;;;;;; MULXQ DX, AX, R8; MOVQ AX, 0+z; \
 ADDQ DX, DX; ADCQ $0, R15; CLC; \
 MULXQ 8+x, AX, R9; ADCXQ AX, R8; MOVQ R8, 8+z; \
 MULXQ 16+x, AX, R10; ADCXQ AX, R9; MOVQ $0, R8;\
 MULXQ 24+x, AX, R11; ADCXQ AX, R10; \
 MULXQ 32+x, AX, R12; ADCXQ AX, R11; \
 MULXQ 40+x, AX, R13; ADCXQ AX, R12; \
 MULXQ 48+x, AX, R14; ADCXQ AX, R13; \
 ;;;;;;;;;;;;;;;;;;;; ADCXQ R8, R14; \
 \
 MOVQ 8+x, DX; \
 MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
 MULXQ AX, AX, CX; \
 MOVQ R15, R8; NEGQ R8; ANDQ 8+x, R8; \
 ADDQ AX, R9; MOVQ R9, 16+z; \
 ADCQ CX, R8; \
 ADCQ $0, R11; \
 ADDQ 8+x, DX; \
 ADCQ $0, R15; \
 XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
 MULXQ 16+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 24+z; \
 MULXQ 24+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; MOVQ $0, R10; \
 MULXQ 32+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; \
 MULXQ 40+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; \
 MULXQ 48+x, AX, CX; ADCXQ AX, R14; ADOXQ CX, R9; \
 ;;;;;;;;;;;;;;;;;;; ADCXQ R10, R9; \
 \
 MOVQ 16+x, DX; \
 MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
 MULXQ AX, AX, CX; \
 MOVQ R15, R8; NEGQ R8; ANDQ 16+x, R8; \
 ADDQ AX, R11; MOVQ R11, 32+z; \
 ADCQ CX, R8; \
 ADCQ $0, R13; \
 ADDQ 16+x, DX; \
 ADCQ $0, R15; \
 XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
 MULXQ 24+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 40+z; \
 MULXQ 32+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; MOVQ $0, R12; \
 MULXQ 40+x, AX, CX; ADCXQ AX, R14; ADOXQ CX, R9; \
 MULXQ 48+x, AX, CX; ADCXQ AX, R9; ADOXQ CX, R10; \
 ;;;;;;;;;;;;;;;;;;; ADCXQ R11,R10; \
 \
 MOVQ 24+x, DX; \
 MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
 MULXQ AX, AX, CX; \
 MOVQ R15, R8; NEGQ R8; ANDQ 24+x, R8; \
 ADDQ AX, R13; MOVQ R13, 48+z; \
 ADCQ CX, R8; \
 ADCQ $0, R9; \
 ADDQ 24+x, DX; \
 ADCQ $0, R15; \
 XORL R13, R13; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R14; \
 MULXQ 32+x, AX, CX; ADCXQ AX, R14; ADOXQ CX, R9; MOVQ R14, 56+z; \
 MULXQ 40+x, AX, CX; ADCXQ AX, R9; ADOXQ CX, R10; MOVQ $0, R14; \
 MULXQ 48+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; \
 ;;;;;;;;;;;;;;;;;;; ADCXQ R12,R11; \
 \
 MOVQ 32+x, DX; \
 MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
 MULXQ AX, AX, CX; \
 MOVQ R15, R8; NEGQ R8; ANDQ 32+x, R8; \
 ADDQ AX, R9; MOVQ R9, 64+z; \
 ADCQ CX, R8; \
 ADCQ $0, R11; \
 ADDQ 32+x, DX; \
 ADCQ $0, R15; \
 XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
 MULXQ 40+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 72+z; \
 MULXQ 48+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; \
 ;;;;;;;;;;;;;;;;;;; ADCXQ R13,R12; \
 \
 MOVQ 40+x, DX; \
 MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
 MULXQ AX, AX, CX; \
 MOVQ R15, R8; NEGQ R8; ANDQ 40+x, R8; \
 ADDQ AX, R11; MOVQ R11, 80+z; \
 ADCQ CX, R8; \
 ADCQ $0, R13; \
 ADDQ 40+x, DX; \
 ADCQ $0, R15; \
 XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
 MULXQ 48+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 88+z; \
 ;;;;;;;;;;;;;;;;;;; ADCXQ R14,R13; \
 \
 MOVQ 48+x, DX; \
 MOVQ DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ $0, R15; \
 MULXQ AX, AX, CX; \
 MOVQ R15, R8; NEGQ R8; ANDQ 48+x, R8; \
 XORL R10, R10; ;;;;;;;;;;;;;; ADOXQ CX, R14; \
 ;;;;;;;;;;;;;; ADCXQ AX, R13; ;;;;;;;;;;;;;; MOVQ R13, 96+z; \
 ;;;;;;;;;;;;;; ADCXQ R8, R14; MOVQ R14, 104+z;

// reduceFromDoubleLeg reduces the 14-limb (896-bit) value x — e.g. a
// multiplication result — to a 7-limb z with z congruent to x modulo p
// and z < 2^448, where p = 2^448 - 2^224 - 1 (the Goldilocks prime).
// The high limbs C7..C13 are folded into the low half in stages; the
// inline comments track which carry words live in which registers at
// each step. The final carry R14 is folded twice, each time at bit 0
// and at bit 224 (AX = R14<<32 added into limb 3), because the first
// fold can itself carry.
// Uses: AX, R8-R15, FLAGS
// Instr: x86_64
#define reduceFromDoubleLeg(z,x) \
 /* ( ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
 /* (r14, r13, r12, r11, r10,r9,r8,r15) */ \
 MOVQ 80+x,AX; MOVQ AX,R10; \
 MOVQ $0xFFFFFFFF00000000, R8; \
 ANDQ R8,R10; \
 \
 MOVQ $0,R14; \
 MOVQ 104+x,R13; SHLQ $1,R13,R14; \
 MOVQ 96+x,R12; SHLQ $1,R12,R13; \
 MOVQ 88+x,R11; SHLQ $1,R11,R12; \
 MOVQ 72+x, R9; SHLQ $1,R10,R11; \
 MOVQ 64+x, R8; SHLQ $1,R10; \
 MOVQ $0xFFFFFFFF,R15; ANDQ R15,AX; ORQ AX,R10; \
 MOVQ 56+x,R15; \
 \
 ADDQ 0+x,R15; MOVQ R15, 0+z; MOVQ 56+x,R15; \
 ADCQ 8+x, R8; MOVQ R8, 8+z; MOVQ 64+x, R8; \
 ADCQ 16+x, R9; MOVQ R9,16+z; MOVQ 72+x, R9; \
 ADCQ 24+x,R10; MOVQ R10,24+z; MOVQ 80+x,R10; \
 ADCQ 32+x,R11; MOVQ R11,32+z; MOVQ 88+x,R11; \
 ADCQ 40+x,R12; MOVQ R12,40+z; MOVQ 96+x,R12; \
 ADCQ 48+x,R13; MOVQ R13,48+z; MOVQ 104+x,R13; \
 ADCQ $0,R14; \
 /* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
 /* ( r9, r8, r15, r13, r12, r11, r10) */ \
 MOVQ R10, AX; \
 SHRQ $32,R11,R10; \
 SHRQ $32,R12,R11; \
 SHRQ $32,R13,R12; \
 SHRQ $32,R15,R13; \
 SHRQ $32, R8,R15; \
 SHRQ $32, R9, R8; \
 SHRQ $32, AX, R9; \
 \
 ADDQ 0+z,R10; \
 ADCQ 8+z,R11; \
 ADCQ 16+z,R12; \
 ADCQ 24+z,R13; \
 ADCQ 32+z,R15; \
 ADCQ 40+z, R8; \
 ADCQ 48+z, R9; \
 ADCQ $0,R14; \
 /* ( c7) + (c6,...,c0) */ \
 /* (r14) */ \
 MOVQ R14, AX; SHLQ $32, AX; \
 ADDQ R14,R10; MOVQ $0,R14; \
 ADCQ $0,R11; \
 ADCQ $0,R12; \
 ADCQ AX,R13; \
 ADCQ $0,R15; \
 ADCQ $0, R8; \
 ADCQ $0, R9; \
 ADCQ $0,R14; \
 /* ( c7) + (c6,...,c0) */ \
 /* (r14) */ \
 MOVQ R14, AX; SHLQ $32,AX; \
 ADDQ R14,R10; MOVQ R10, 0+z; \
 ADCQ $0,R11; MOVQ R11, 8+z; \
 ADCQ $0,R12; MOVQ R12,16+z; \
 ADCQ AX,R13; MOVQ R13,24+z; \
 ADCQ $0,R15; MOVQ R15,32+z; \
 ADCQ $0, R8; MOVQ R8,40+z; \
 ADCQ $0, R9; MOVQ R9,48+z;

// reduceFromDoubleAdx reduces the 14-limb (896-bit) value x to a
// 7-limb z with z congruent to x modulo p and z < 2^448, where
// p = 2^448 - 2^224 - 1. Same folding stages as reduceFromDoubleLeg
// (see the inline register-layout comments), but the carry chains use
// ADCXQ; XORL or CLC clears CF before each chain starts. The final
// carry R14 is again folded twice at bit 0 and bit 224.
// Uses: AX, R8-R15, FLAGS
// Instr: x86_64, adx
#define reduceFromDoubleAdx(z,x) \
 /* ( ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
 /* (r14, r13, r12, r11, r10,r9,r8,r15) */ \
 MOVQ 80+x,AX; MOVQ AX,R10; \
 MOVQ $0xFFFFFFFF00000000, R8; \
 ANDQ R8,R10; \
 \
 MOVQ $0,R14; \
 MOVQ 104+x,R13; SHLQ $1,R13,R14; \
 MOVQ 96+x,R12; SHLQ $1,R12,R13; \
 MOVQ 88+x,R11; SHLQ $1,R11,R12; \
 MOVQ 72+x, R9; SHLQ $1,R10,R11; \
 MOVQ 64+x, R8; SHLQ $1,R10; \
 MOVQ $0xFFFFFFFF,R15; ANDQ R15,AX; ORQ AX,R10; \
 MOVQ 56+x,R15; \
 \
 XORL AX,AX; \
 ADCXQ 0+x,R15; MOVQ R15, 0+z; MOVQ 56+x,R15; \
 ADCXQ 8+x, R8; MOVQ R8, 8+z; MOVQ 64+x, R8; \
 ADCXQ 16+x, R9; MOVQ R9,16+z; MOVQ 72+x, R9; \
 ADCXQ 24+x,R10; MOVQ R10,24+z; MOVQ 80+x,R10; \
 ADCXQ 32+x,R11; MOVQ R11,32+z; MOVQ 88+x,R11; \
 ADCXQ 40+x,R12; MOVQ R12,40+z; MOVQ 96+x,R12; \
 ADCXQ 48+x,R13; MOVQ R13,48+z; MOVQ 104+x,R13; \
 ADCXQ AX,R14; \
 /* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
 /* ( r9, r8, r15, r13, r12, r11, r10) */ \
 MOVQ R10, AX; \
 SHRQ $32,R11,R10; \
 SHRQ $32,R12,R11; \
 SHRQ $32,R13,R12; \
 SHRQ $32,R15,R13; \
 SHRQ $32, R8,R15; \
 SHRQ $32, R9, R8; \
 SHRQ $32, AX, R9; \
 \
 XORL AX,AX; \
 ADCXQ 0+z,R10; \
 ADCXQ 8+z,R11; \
 ADCXQ 16+z,R12; \
 ADCXQ 24+z,R13; \
 ADCXQ 32+z,R15; \
 ADCXQ 40+z, R8; \
 ADCXQ 48+z, R9; \
 ADCXQ AX,R14; \
 /* ( c7) + (c6,...,c0) */ \
 /* (r14) */ \
 MOVQ R14, AX; SHLQ $32, AX; \
 CLC; \
 ADCXQ R14,R10; MOVQ $0,R14; \
 ADCXQ R14,R11; \
 ADCXQ R14,R12; \
 ADCXQ AX,R13; \
 ADCXQ R14,R15; \
 ADCXQ R14, R8; \
 ADCXQ R14, R9; \
 ADCXQ R14,R14; \
 /* ( c7) + (c6,...,c0) */ \
 /* (r14) */ \
 MOVQ R14, AX; SHLQ $32, AX; \
 CLC; \
 ADCXQ R14,R10; MOVQ R10, 0+z; MOVQ $0,R14; \
 ADCXQ R14,R11; MOVQ R11, 8+z; \
 ADCXQ R14,R12; MOVQ R12,16+z; \
 ADCXQ AX,R13; MOVQ R13,24+z; \
 ADCXQ R14,R15; MOVQ R15,32+z; \
 ADCXQ R14, R8; MOVQ R8,40+z; \
 ADCXQ R14, R9; MOVQ R9,48+z;

// addSub calculates two operations over 7-limb (448-bit) values:
// x,y = x+y, x-y. The sum uses additionLeg's double carry fold and the
// difference uses subtraction's double borrow fold (each at bit 0 and
// bit 224, matching p = 2^448 - 2^224 - 1). While the sum is written
// back into x, the original limbs of x are exchanged into R8-R14
// through AX so the subtraction can still read them.
// Uses: AX, DX, R8-R14, FLAGS
#define addSub(x,y) \
 MOVQ 0+x, R8; ADDQ 0+y, R8; \
 MOVQ 8+x, R9; ADCQ 8+y, R9; \
 MOVQ 16+x, R10; ADCQ 16+y, R10; \
 MOVQ 24+x, R11; ADCQ 24+y, R11; \
 MOVQ 32+x, R12; ADCQ 32+y, R12; \
 MOVQ 40+x, R13; ADCQ 40+y, R13; \
 MOVQ 48+x, R14; ADCQ 48+y, R14; \
 MOVQ $0, AX; ADCQ $0, AX; \
 MOVQ AX, DX; \
 SHLQ $32, DX; \
 ADDQ AX, R8; MOVQ $0, AX; \
 ADCQ $0, R9; \
 ADCQ $0, R10; \
 ADCQ DX, R11; \
 ADCQ $0, R12; \
 ADCQ $0, R13; \
 ADCQ $0, R14; \
 ADCQ $0, AX; \
 MOVQ AX, DX; \
 SHLQ $32, DX; \
 ADDQ AX, R8; MOVQ 0+x,AX; MOVQ R8, 0+x; MOVQ AX, R8; \
 ADCQ $0, R9; MOVQ 8+x,AX; MOVQ R9, 8+x; MOVQ AX, R9; \
 ADCQ $0, R10; MOVQ 16+x,AX; MOVQ R10, 16+x; MOVQ AX, R10; \
 ADCQ DX, R11; MOVQ 24+x,AX; MOVQ R11, 24+x; MOVQ AX, R11; \
 ADCQ $0, R12; MOVQ 32+x,AX; MOVQ R12, 32+x; MOVQ AX, R12; \
 ADCQ $0, R13; MOVQ 40+x,AX; MOVQ R13, 40+x; MOVQ AX, R13; \
 ADCQ $0, R14; MOVQ 48+x,AX; MOVQ R14, 48+x; MOVQ AX, R14; \
 SUBQ 0+y, R8; \
 SBBQ 8+y, R9; \
 SBBQ 16+y, R10; \
 SBBQ 24+y, R11; \
 SBBQ 32+y, R12; \
 SBBQ 40+y, R13; \
 SBBQ 48+y, R14; \
 MOVQ $0, AX; SETCS AX; \
 MOVQ AX, DX; \
 SHLQ $32, DX; \
 SUBQ AX, R8; MOVQ $0, AX; \
 SBBQ $0, R9; \
 SBBQ $0, R10; \
 SBBQ DX, R11; \
 SBBQ $0, R12; \
 SBBQ $0, R13; \
 SBBQ $0, R14; \
 SETCS AX; \
 MOVQ AX, DX; \
 SHLQ $32, DX; \
 SUBQ AX, R8; MOVQ R8, 0+y; \
 SBBQ $0, R9; MOVQ R9, 8+y; \
 SBBQ $0, R10; MOVQ R10, 16+y; \
 SBBQ DX, R11; MOVQ R11, 24+y; \
 SBBQ $0, R12; MOVQ R12, 32+y; \
 SBBQ $0, R13; MOVQ R13, 40+y; \
 SBBQ $0, R14; MOVQ R14, 48+y;