1// This code was imported from https://github.com/armfazh/rfc7748_precomputed
2
// CHECK_BMI2ADX selects between two implementations at run time:
// it runs bmi2adx when the byte ·hasBmi2Adx is non-zero,
// otherwise it falls back to the legacy code.
// Both paths end in RET, so nothing placed after the macro is executed.
//   label:   label name defined inside the expansion (start of the legacy
//            path); it must not clash with other labels at the expansion site.
//   legacy:  instruction sequence used when ·hasBmi2Adx is zero.
//   bmi2adx: instruction sequence used when ·hasBmi2Adx is non-zero.
// Uses: FLAGS, plus whatever legacy/bmi2adx use.
#define CHECK_BMI2ADX(label, legacy, bmi2adx) \
 CMPB ·hasBmi2Adx(SB), $0 /* feature flag == 0 ? */ \
 JE label /* yes: jump to the legacy path */ \
 bmi2adx \
 RET \
 label: \
 legacy \
 RET
13
// cselect conditionally overwrites x with y (four 64-bit limbs each).
//   b = 1: x <- y
//   b = 0: x keeps its value
//   any other b: undefined.
// The decision is taken once with TESTQ and applied with CMOV to every limb,
// so the macro contains no branch on b.
// x and y are memory operands; the limb offsets 0/8/16/24 are prefixed textually.
// Uses: AX, DX, FLAGS
// Instr: x86_64, cmov
#define cselect(x,y,b) \
    TESTQ b, b;         /* ZF=0 iff b != 0; MOVQ/CMOV below leave FLAGS alone */ \
    /* limb 0 */ \
    MOVQ 0+y, DX; \
    MOVQ 0+x, AX; \
    CMOVQNE DX, AX;     /* AX = b ? y[0] : x[0] */ \
    MOVQ AX, 0+x; \
    /* limb 1 */ \
    MOVQ 8+y, DX; \
    MOVQ 8+x, AX; \
    CMOVQNE DX, AX; \
    MOVQ AX, 8+x; \
    /* limb 2 */ \
    MOVQ 16+y, DX; \
    MOVQ 16+x, AX; \
    CMOVQNE DX, AX; \
    MOVQ AX, 16+x; \
    /* limb 3 */ \
    MOVQ 24+y, DX; \
    MOVQ 24+x, AX; \
    CMOVQNE DX, AX; \
    MOVQ AX, 24+x;
26
// cswap conditionally exchanges x and y (four 64-bit limbs each).
//   b = 1: x,y <- y,x
//   b = 0: x and y keep their values
//   any other b: undefined.
// Both limbs are always loaded and always stored back; only the CMOVs
// depend on b, so the macro contains no branch on b.
// x and y are memory operands; the limb offsets 0/8/16/24 are prefixed textually.
// Uses: AX, DX, R8, FLAGS
// Instr: x86_64, cmov
#define cswap(x,y,b) \
    TESTQ b, b;         /* ZF=0 iff b != 0; MOVQ/CMOV below leave FLAGS alone */ \
    /* limb 0 */ \
    MOVQ 0+x, AX; \
    MOVQ 0+y, DX; \
    MOVQ AX, R8;        /* R8 keeps the old x limb */ \
    CMOVQNE DX, AX;     /* AX = b ? y : x */ \
    CMOVQNE R8, DX;     /* DX = b ? x : y */ \
    MOVQ AX, 0+x; \
    MOVQ DX, 0+y; \
    /* limb 1 */ \
    MOVQ 8+x, AX; \
    MOVQ 8+y, DX; \
    MOVQ AX, R8; \
    CMOVQNE DX, AX; \
    CMOVQNE R8, DX; \
    MOVQ AX, 8+x; \
    MOVQ DX, 8+y; \
    /* limb 2 */ \
    MOVQ 16+x, AX; \
    MOVQ 16+y, DX; \
    MOVQ AX, R8; \
    CMOVQNE DX, AX; \
    CMOVQNE R8, DX; \
    MOVQ AX, 16+x; \
    MOVQ DX, 16+y; \
    /* limb 3 */ \
    MOVQ 24+x, AX; \
    MOVQ 24+y, DX; \
    MOVQ AX, R8; \
    CMOVQNE DX, AX; \
    CMOVQNE R8, DX; \
    MOVQ AX, 24+x; \
    MOVQ DX, 24+y;
39
// additionLeg computes z = x + y on four 64-bit limbs.
// A carry out of the top limb is folded back into the low limb as +38
// (38 stands in for 2^256; presumably the modulus is 2^255-19 - confirm).
// The fold is done twice because the first fold can itself carry out.
// All limbs of x and y are read before anything is written to z.
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov
#define additionLeg(z,x,y) \
    MOVL $0, DX; \
    MOVL $38, AX; \
    /* R11:R10:R9:R8 = x */ \
    MOVQ 0+x, R8; \
    MOVQ 8+x, R9; \
    MOVQ 16+x, R10; \
    MOVQ 24+x, R11; \
    /* R11:R10:R9:R8 += y, carry-out left in CF */ \
    ADDQ 0+y, R8; \
    ADCQ 8+y, R9; \
    ADCQ 16+y, R10; \
    ADCQ 24+y, R11; \
    /* first fold: add 38 if the sum carried out */ \
    CMOVQCS AX, DX; \
    ADDQ DX, R8; \
    ADCQ $0, R9; \
    ADCQ $0, R10; \
    ADCQ $0, R11; \
    /* second fold; MOVL (not XORL) so CF from the chain above survives */ \
    MOVL $0, DX; \
    CMOVQCS AX, DX; \
    ADDQ DX, R8; \
    /* write the result */ \
    MOVQ R8, 0+z; \
    MOVQ R9, 8+z; \
    MOVQ R10, 16+z; \
    MOVQ R11, 24+z;
58
// additionAdx computes z = x + y on four 64-bit limbs using ADCX for the
// carry chains. A carry out of the top limb is folded back into the low
// limb as +38 (38 stands in for 2^256; presumably the modulus is
// 2^255-19 - confirm); the fold is applied twice because the first fold
// can itself carry out.
// All limbs of x and y are read before anything is written to z.
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov, adx
#define additionAdx(z,x,y) \
    XORL DX, DX;        /* DX = 0 and CF = 0: start of the ADCX chain */ \
    MOVL $38, AX; \
    /* R11:R10:R9:R8 = x */ \
    MOVQ 0+x, R8; \
    MOVQ 8+x, R9; \
    MOVQ 16+x, R10; \
    MOVQ 24+x, R11; \
    /* R11:R10:R9:R8 += y, carry-out left in CF */ \
    ADCXQ 0+y, R8; \
    ADCXQ 8+y, R9; \
    ADCXQ 16+y, R10; \
    ADCXQ 24+y, R11; \
    /* first fold: DX = 38 if the sum carried out, else 0 */ \
    CMOVQCS AX, DX; \
    XORL AX, AX;        /* AX = 0 and CF = 0 for the next chain */ \
    ADCXQ DX, R8; \
    ADCXQ AX, R9; \
    ADCXQ AX, R10; \
    ADCXQ AX, R11; \
    /* second fold; MOVL keeps CF from the chain above */ \
    MOVL $38, DX; \
    CMOVQCS DX, AX; \
    ADDQ AX, R8; \
    /* write the result */ \
    MOVQ R8, 0+z; \
    MOVQ R9, 8+z; \
    MOVQ R10, 16+z; \
    MOVQ R11, 24+z;
78
// subtraction computes z = x - y on four 64-bit limbs.
// A borrow out of the top limb is compensated by subtracting 38 from the
// low limb (38 stands in for 2^256; presumably the modulus is 2^255-19 -
// confirm). The compensation is applied twice because the first one can
// itself borrow.
// All limbs of x and y are read before anything is written to z.
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov
#define subtraction(z,x,y) \
    MOVL $38, AX; \
    /* R11:R10:R9:R8 = x */ \
    MOVQ 0+x, R8; \
    MOVQ 8+x, R9; \
    MOVQ 16+x, R10; \
    MOVQ 24+x, R11; \
    /* R11:R10:R9:R8 -= y, borrow left in CF */ \
    SUBQ 0+y, R8; \
    SBBQ 8+y, R9; \
    SBBQ 16+y, R10; \
    SBBQ 24+y, R11; \
    /* first compensation; MOVL (not XORL) so the borrow in CF survives */ \
    MOVL $0, DX; \
    CMOVQCS AX, DX; \
    SUBQ DX, R8; \
    SBBQ $0, R9; \
    SBBQ $0, R10; \
    SBBQ $0, R11; \
    /* second compensation */ \
    MOVL $0, DX; \
    CMOVQCS AX, DX; \
    SUBQ DX, R8; \
    /* write the result */ \
    MOVQ R8, 0+z; \
    MOVQ R9, 8+z; \
    MOVQ R10, 16+z; \
    MOVQ R11, 24+z;
97
// integerMulAdx multiplies x and y and stores in z
// x and y are read as four 64-bit limbs (offsets 0..24); the full product
// is written to z as eight limbs (offsets 0..56), with no modular reduction.
// The product is built one row per limb of y: DX holds y[i], MULXQ yields
// the low half in AX and the high half in the named R register, and the
// partial products are summed with two independent carry chains
// (ADCXQ uses CF only, ADOXQ uses OF only).
// Words of z are stored while later rows still read x and y.
// NOTE(review): z therefore presumably must not overlap x or y - confirm against callers.
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerMulAdx(z,x,y) \
 MOVL $0,R15; \
 /* row 0: x * y[0]; XORL clears CF and OF before the chain */ \
 MOVQ 0+y, DX; XORL AX, AX; \
 MULXQ 0+x, AX, R8; MOVQ AX, 0+z; \
 MULXQ 8+x, AX, R9; ADCXQ AX, R8; \
 MULXQ 16+x, AX, R10; ADCXQ AX, R9; \
 MULXQ 24+x, AX, R11; ADCXQ AX, R10; \
 /* MOVL (not XORL) zeroes AX without disturbing the pending carry */ \
 MOVL $0, AX;;;;;;;;; ADCXQ AX, R11; \
 /* row 1: x * y[1] added to R11:R10:R9:R8, result in R15:R14:R13:R12 */ \
 MOVQ 8+y, DX; XORL AX, AX; \
 MULXQ 0+x, AX, R12; ADCXQ R8, AX; MOVQ AX, 8+z; \
 MULXQ 8+x, AX, R13; ADCXQ R9, R12; ADOXQ AX, R12; \
 MULXQ 16+x, AX, R14; ADCXQ R10, R13; ADOXQ AX, R13; \
 MULXQ 24+x, AX, R15; ADCXQ R11, R14; ADOXQ AX, R14; \
 /* fold both pending carries (CF and OF) into the top word */ \
 MOVL $0, AX;;;;;;;;; ADCXQ AX, R15; ADOXQ AX, R15; \
 /* row 2: x * y[2] added to R15:R14:R13:R12, result in R11:R10:R9:R8 */ \
 MOVQ 16+y, DX; XORL AX, AX; \
 MULXQ 0+x, AX, R8; ADCXQ R12, AX; MOVQ AX, 16+z; \
 MULXQ 8+x, AX, R9; ADCXQ R13, R8; ADOXQ AX, R8; \
 MULXQ 16+x, AX, R10; ADCXQ R14, R9; ADOXQ AX, R9; \
 MULXQ 24+x, AX, R11; ADCXQ R15, R10; ADOXQ AX, R10; \
 MOVL $0, AX;;;;;;;;; ADCXQ AX, R11; ADOXQ AX, R11; \
 /* row 3: x * y[3] added to R11:R10:R9:R8; the upper words go straight to z */ \
 MOVQ 24+y, DX; XORL AX, AX; \
 MULXQ 0+x, AX, R12; ADCXQ R8, AX; MOVQ AX, 24+z; \
 MULXQ 8+x, AX, R13; ADCXQ R9, R12; ADOXQ AX, R12; MOVQ R12, 32+z; \
 MULXQ 16+x, AX, R14; ADCXQ R10, R13; ADOXQ AX, R13; MOVQ R13, 40+z; \
 MULXQ 24+x, AX, R15; ADCXQ R11, R14; ADOXQ AX, R14; MOVQ R14, 48+z; \
 MOVL $0, AX;;;;;;;;; ADCXQ AX, R15; ADOXQ AX, R15; MOVQ R15, 56+z;
127
// integerMulLeg multiplies x and y and stores in z
// x and y are read as four 64-bit limbs (offsets 0..24); the full product
// is written to z as eight limbs (offsets 0..56), with no modular reduction.
// The product is built one row per limb of y using MULQ (DX:AX = AX * R8).
// Between rows, R15 carries the next output word that is not yet final,
// and the higher pending words are parked in z and read back by the next row.
// NOTE(review): because z is used as scratch while x and y are still being
// read, z presumably must not overlap x or y - confirm against callers.
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerMulLeg(z,x,y) \
 /* row 0: x * y[0] */ \
 MOVQ 0+y, R8; \
 MOVQ 0+x, AX; MULQ R8; MOVQ AX, 0+z; MOVQ DX, R15; \
 MOVQ 8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
 MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
 MOVQ 24+x, AX; MULQ R8; \
 /* add each low half to the high half one position below it */ \
 ADDQ R13, R15; \
 ADCQ R14, R10; MOVQ R10, 16+z; \
 ADCQ AX, R11; MOVQ R11, 24+z; \
 ADCQ $0, DX; MOVQ DX, 32+z; \
 /* row 1: x * y[1] */ \
 MOVQ 8+y, R8; \
 MOVQ 0+x, AX; MULQ R8; MOVQ AX, R12; MOVQ DX, R9; \
 MOVQ 8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
 MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
 MOVQ 24+x, AX; MULQ R8; \
 ADDQ R12, R15; MOVQ R15, 8+z; \
 ADCQ R13, R9; \
 ADCQ R14, R10; \
 ADCQ AX, R11; \
 ADCQ $0, DX; \
 /* add the words parked in z by the previous row */ \
 ADCQ 16+z, R9; MOVQ R9, R15; \
 ADCQ 24+z, R10; MOVQ R10, 24+z; \
 ADCQ 32+z, R11; MOVQ R11, 32+z; \
 ADCQ $0, DX; MOVQ DX, 40+z; \
 /* row 2: x * y[2] */ \
 MOVQ 16+y, R8; \
 MOVQ 0+x, AX; MULQ R8; MOVQ AX, R12; MOVQ DX, R9; \
 MOVQ 8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
 MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
 MOVQ 24+x, AX; MULQ R8; \
 ADDQ R12, R15; MOVQ R15, 16+z; \
 ADCQ R13, R9; \
 ADCQ R14, R10; \
 ADCQ AX, R11; \
 ADCQ $0, DX; \
 ADCQ 24+z, R9; MOVQ R9, R15; \
 ADCQ 32+z, R10; MOVQ R10, 32+z; \
 ADCQ 40+z, R11; MOVQ R11, 40+z; \
 ADCQ $0, DX; MOVQ DX, 48+z; \
 /* row 3: x * y[3]; every remaining word is final and goes to z */ \
 MOVQ 24+y, R8; \
 MOVQ 0+x, AX; MULQ R8; MOVQ AX, R12; MOVQ DX, R9; \
 MOVQ 8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
 MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
 MOVQ 24+x, AX; MULQ R8; \
 ADDQ R12, R15; MOVQ R15, 24+z; \
 ADCQ R13, R9; \
 ADCQ R14, R10; \
 ADCQ AX, R11; \
 ADCQ $0, DX; \
 ADCQ 32+z, R9; MOVQ R9, 32+z; \
 ADCQ 40+z, R10; MOVQ R10, 40+z; \
 ADCQ 48+z, R11; MOVQ R11, 48+z; \
 ADCQ $0, DX; MOVQ DX, 56+z;
183
// integerSqrLeg squares x and stores in z
// x is read as four 64-bit limbs A[0..3] (offsets 0..24); the full square
// is written to z as eight limbs (offsets 0..56), with no modular reduction.
// Method: sum the six cross products A[i]*A[j] (i<j), double that sum with
// a chain of shifts, then add the four squares A[i]^2.
// Words 8..56 of z are stored before x is read again for the squares.
// NOTE(review): z therefore presumably must not overlap x - confirm against callers.
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerSqrLeg(z,x) \
 MOVQ 0+x, R8; \
 MOVQ 8+x, AX; MULQ R8; MOVQ AX, R9; MOVQ DX, R10; /* A[0]*A[1] */ \
 MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; /* A[0]*A[2] */ \
 MOVQ 24+x, AX; MULQ R8; MOVQ AX, R15; MOVQ DX, R12; /* A[0]*A[3] */ \
 MOVQ 24+x, R8; \
 MOVQ 8+x, AX; MULQ R8; MOVQ AX, CX; MOVQ DX, R13; /* A[3]*A[1] */ \
 MOVQ 16+x, AX; MULQ R8; /* A[3]*A[2] */ \
 \
 /* add the low halves into the running sum; MOVL zeroes R15 without touching CF */ \
 ADDQ R14, R10;\
 ADCQ R15, R11; MOVL $0, R15;\
 ADCQ CX, R12;\
 ADCQ AX, R13;\
 ADCQ $0, DX; MOVQ DX, R14;\
 MOVQ 8+x, AX; MULQ 16+x; /* A[1]*A[2] */ \
 \
 ADDQ AX, R11;\
 ADCQ DX, R12;\
 ADCQ $0, R13;\
 ADCQ $0, R14;\
 ADCQ $0, R15;\
 \
 /* double the cross-product sum: shift left by one bit, top word first */ \
 SHLQ $1, R14, R15; MOVQ R15, 56+z;\
 SHLQ $1, R13, R14; MOVQ R14, 48+z;\
 SHLQ $1, R12, R13; MOVQ R13, 40+z;\
 SHLQ $1, R11, R12; MOVQ R12, 32+z;\
 SHLQ $1, R10, R11; MOVQ R11, 24+z;\
 SHLQ $1, R9, R10; MOVQ R10, 16+z;\
 SHLQ $1, R9; MOVQ R9, 8+z;\
 \
 /* the four squares A[i]^2, low half then high half */ \
 MOVQ 0+x,AX; MULQ AX; MOVQ AX, 0+z; MOVQ DX, R9;\
 MOVQ 8+x,AX; MULQ AX; MOVQ AX, R10; MOVQ DX, R11;\
 MOVQ 16+x,AX; MULQ AX; MOVQ AX, R12; MOVQ DX, R13;\
 MOVQ 24+x,AX; MULQ AX; MOVQ AX, R14; MOVQ DX, R15;\
 \
 /* add the doubled cross products (already in z) to the squares */ \
 ADDQ 8+z, R9; MOVQ R9, 8+z;\
 ADCQ 16+z, R10; MOVQ R10, 16+z;\
 ADCQ 24+z, R11; MOVQ R11, 24+z;\
 ADCQ 32+z, R12; MOVQ R12, 32+z;\
 ADCQ 40+z, R13; MOVQ R13, 40+z;\
 ADCQ 48+z, R14; MOVQ R14, 48+z;\
 ADCQ 56+z, R15; MOVQ R15, 56+z;
229
// integerSqrAdx squares x and stores in z
// x is read as four 64-bit limbs A[0..3] (offsets 0..24); the full square
// is written to z as eight limbs (offsets 0..56), with no modular reduction.
// Method: sum the cross products A[i]*A[j] (i<j) in R8-R14, double that sum,
// then add the four squares A[i]^2. ADCXQ (CF only) and ADOXQ (OF only)
// let two carry chains run interleaved.
// Words of z are stored while limbs of x are still being read for the squares.
// NOTE(review): z therefore presumably must not overlap x - confirm against callers.
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerSqrAdx(z,x) \
 MOVQ 0+x, DX; /* A[0] */ \
 MULXQ 8+x, R8, R14; /* A[1]*A[0] */ XORL R15, R15; \
 MULXQ 16+x, R9, R10; /* A[2]*A[0] */ ADCXQ R14, R9; \
 MULXQ 24+x, AX, CX; /* A[3]*A[0] */ ADCXQ AX, R10; \
 MOVQ 24+x, DX; /* A[3] */ \
 MULXQ 8+x, R11, R12; /* A[1]*A[3] */ ADCXQ CX, R11; \
 MULXQ 16+x, AX, R13; /* A[2]*A[3] */ ADCXQ AX, R12; \
 MOVQ 8+x, DX; /* A[1] */ ADCXQ R15, R13; \
 MULXQ 16+x, AX, CX; /* A[2]*A[1] */ MOVL $0, R14; \
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADCXQ R15, R14; \
 /* clear CF and OF; OF chain adds A[2]*A[1], CF chain doubles each word */ \
 XORL R15, R15; \
 ADOXQ AX, R10; ADCXQ R8, R8; \
 ADOXQ CX, R11; ADCXQ R9, R9; \
 ADOXQ R15, R12; ADCXQ R10, R10; \
 ADOXQ R15, R13; ADCXQ R11, R11; \
 ADOXQ R15, R14; ADCXQ R12, R12; \
 ;;;;;;;;;;;;;;; ADCXQ R13, R13; \
 ;;;;;;;;;;;;;;; ADCXQ R14, R14; \
 /* add the squares: AX = low half, CX = high half of DX*DX */ \
 MOVQ 0+x, DX; MULXQ DX, AX, CX; /* A[0]^2 */ \
 ;;;;;;;;;;;;;;; MOVQ AX, 0+z; \
 ADDQ CX, R8; MOVQ R8, 8+z; \
 MOVQ 8+x, DX; MULXQ DX, AX, CX; /* A[1]^2 */ \
 ADCQ AX, R9; MOVQ R9, 16+z; \
 ADCQ CX, R10; MOVQ R10, 24+z; \
 MOVQ 16+x, DX; MULXQ DX, AX, CX; /* A[2]^2 */ \
 ADCQ AX, R11; MOVQ R11, 32+z; \
 ADCQ CX, R12; MOVQ R12, 40+z; \
 MOVQ 24+x, DX; MULXQ DX, AX, CX; /* A[3]^2 */ \
 ADCQ AX, R13; MOVQ R13, 48+z; \
 ADCQ CX, R14; MOVQ R14, 56+z;
264
// reduceFromDoubleLeg finds z congruent to x modulo p such that 0<=z<2^256
// x is read as eight 64-bit limbs C[0..7] (offsets 0..56); z receives four
// limbs (offsets 0..24). The upper half C[4..7] is multiplied by 38 and
// added to the lower half C[0..3]; the word that overflows is multiplied
// by 38 and added again, and one last carry is folded in as +38.
// The result fits in 256 bits; the macro does not compare it against p.
// NOTE(review): presumably p = 2^255-19, which makes 2^256 = 38 (mod p) - confirm.
// Uses: AX, DX, R8-R13, FLAGS
// Instr: x86_64
#define reduceFromDoubleLeg(z,x) \
 /* 2*C = 38 = 2^256 (mod p) */ \
 MOVL $38, AX; MULQ 32+x; MOVQ AX, R8; MOVQ DX, R9; /* C*C[4] */ \
 MOVL $38, AX; MULQ 40+x; MOVQ AX, R12; MOVQ DX, R10; /* C*C[5] */ \
 MOVL $38, AX; MULQ 48+x; MOVQ AX, R13; MOVQ DX, R11; /* C*C[6] */ \
 MOVL $38, AX; MULQ 56+x; /* C*C[7] */ \
 /* DX:R11:R10:R9:R8 = 38 * C[4..7] */ \
 ADDQ R12, R9; \
 ADCQ R13, R10; \
 ADCQ AX, R11; \
 ADCQ $0, DX; \
 /* add the lower half C[0..3]; DX collects the overflow word */ \
 ADDQ 0+x, R8; \
 ADCQ 8+x, R9; \
 ADCQ 16+x, R10; \
 ADCQ 24+x, R11; \
 ADCQ $0, DX; \
 MOVL $38, AX; \
 IMULQ AX, DX; /* 38 * overflow word, CF=0, OF=0 */ \
 ADDQ DX, R8; \
 ADCQ $0, R9; MOVQ R9, 8+z; \
 ADCQ $0, R10; MOVQ R10, 16+z; \
 ADCQ $0, R11; MOVQ R11, 24+z; \
 /* last fold: +38 if the chain above carried out; MOVL keeps CF */ \
 MOVL $0, DX; \
 CMOVQCS AX, DX; \
 ADDQ DX, R8; MOVQ R8, 0+z;
292
// reduceFromDoubleAdx finds z congruent to x modulo p such that 0<=z<2^256
// x is read as eight 64-bit limbs C[0..7] (offsets 0..56); z receives four
// limbs (offsets 0..24). The upper half C[4..7] is multiplied by 38 (CF
// chain) while the lower half C[0..3] is added in (OF chain); the word that
// overflows (R12) is multiplied by 38 and added again, and one last carry
// is folded in as +38.
// The result fits in 256 bits; the macro does not compare it against p.
// NOTE(review): presumably p = 2^255-19, which makes 2^256 = 38 (mod p) - confirm.
// Uses: AX, DX, R8-R13, FLAGS
// Instr: x86_64, bmi2, adx
#define reduceFromDoubleAdx(z,x) \
 MOVL $38, DX; /* 2*C = 38 = 2^256 (mod p) */ \
 MULXQ 32+x, R8, R10; /* C*C[4] */ XORL AX, AX; ADOXQ 0+x, R8; \
 MULXQ 40+x, R9, R11; /* C*C[5] */ ADCXQ R10, R9; ADOXQ 8+x, R9; \
 MULXQ 48+x, R10, R13; /* C*C[6] */ ADCXQ R11, R10; ADOXQ 16+x, R10; \
 MULXQ 56+x, R11, R12; /* C*C[7] */ ADCXQ R13, R11; ADOXQ 24+x, R11; \
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADCXQ AX, R12; ADOXQ AX, R12; \
 IMULQ DX, R12; /* 38 * overflow word, CF=0, OF=0 (the ADCXQ chain below starts from CF=0) */ \
 ADCXQ R12, R8; \
 ADCXQ AX, R9; MOVQ R9, 8+z; \
 ADCXQ AX, R10; MOVQ R10, 16+z; \
 ADCXQ AX, R11; MOVQ R11, 24+z; \
 /* last fold: +38 (still in DX) if the chain above carried out; MOVL keeps CF */ \
 MOVL $0, R12; \
 CMOVQCS DX, R12; \
 ADDQ R12, R8; MOVQ R8, 0+z;
311
// addSub computes the sum and the difference of x and y in place:
//   x, y <- x+y, x-y
// on four 64-bit limbs each. A carry out of the sum is folded back as +38
// and a borrow out of the difference as -38 (38 stands in for 2^256;
// presumably the modulus is 2^255-19 - confirm); each correction is applied
// twice because the first one can itself carry or borrow.
// Nothing is written back until both results are complete.
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, cmov
#define addSub(x,y) \
    XORL DX, DX; \
    MOVL $38, AX; \
    /* R11:R10:R9:R8 = x (becomes the sum) */ \
    MOVQ 0+x, R8; \
    MOVQ 8+x, R9; \
    MOVQ 16+x, R10; \
    MOVQ 24+x, R11; \
    /* R15:R14:R13:R12 = x (becomes the difference) */ \
    MOVQ R8, R12; \
    MOVQ R9, R13; \
    MOVQ R10, R14; \
    MOVQ R11, R15; \
    /* sum: R11:R10:R9:R8 += y, carry-out left in CF */ \
    ADDQ 0+y, R8; \
    ADCQ 8+y, R9; \
    ADCQ 16+y, R10; \
    ADCQ 24+y, R11; \
    /* first fold of the carry: +38 */ \
    CMOVQCS AX, DX; \
    XORL AX, AX; \
    ADDQ DX, R8; \
    ADCQ $0, R9; \
    ADCQ $0, R10; \
    ADCQ $0, R11; \
    /* second fold; MOVL keeps CF from the chain above */ \
    MOVL $38, DX; \
    CMOVQCS DX, AX; \
    ADDQ AX, R8; \
    /* difference: R15:R14:R13:R12 -= y, borrow left in CF */ \
    MOVL $38, AX; \
    SUBQ 0+y, R12; \
    SBBQ 8+y, R13; \
    SBBQ 16+y, R14; \
    SBBQ 24+y, R15; \
    /* first compensation of the borrow: -38 */ \
    MOVL $0, DX; \
    CMOVQCS AX, DX; \
    SUBQ DX, R12; \
    SBBQ $0, R13; \
    SBBQ $0, R14; \
    SBBQ $0, R15; \
    /* second compensation */ \
    MOVL $0, DX; \
    CMOVQCS AX, DX; \
    SUBQ DX, R12; \
    /* write back: x <- sum, y <- difference */ \
    MOVQ R8, 0+x; \
    MOVQ R9, 8+x; \
    MOVQ R10, 16+x; \
    MOVQ R11, 24+x; \
    MOVQ R12, 0+y; \
    MOVQ R13, 8+y; \
    MOVQ R14, 16+y; \
    MOVQ R15, 24+y;
View as plain text