...

Text file src/github.com/cloudflare/circl/ecc/p384/arith_amd64.s

Documentation: github.com/cloudflare/circl/ecc/p384

     1// +build amd64,!noasm
     2
     3#include "textflag.h"
     4
     // storeBlock writes six 64-bit values to memory: a0 goes to byte offset 0
     // of r, a1 to offset 8, and so on up to a5 at offset 40.
     // The offsets are pasted in front of r as text ("8+r"), so r must be a
     // memory operand that accepts a leading displacement, e.g. 0(DI) or 48(SP).
     // Only MOVQ is used, so the flags are left untouched.
     5#define storeBlock(a0,a1,a2,a3,a4,a5, r) \
     6	MOVQ a0,  0+r \
     7	MOVQ a1,  8+r \
     8	MOVQ a2, 16+r \
     9	MOVQ a3, 24+r \
    10	MOVQ a4, 32+r \
    11	MOVQ a5, 40+r
    12
    // loadBlock reads six consecutive 64-bit words from memory: the word at
    // byte offset 0 of r goes to a0, offset 8 to a1, and so on up to offset 40
    // into a5. It is the mirror image of storeBlock.
    // The offsets are pasted in front of r as text ("8+r"), so r must be a
    // memory operand that accepts a leading displacement, e.g. 0(DI).
    // Only MOVQ is used, so the flags are left untouched.
    13#define loadBlock(r, a0,a1,a2,a3,a4,a5) \
    14	MOVQ  0+r, a0 \
    15	MOVQ  8+r, a1 \
    16	MOVQ 16+r, a2 \
    17	MOVQ 24+r, a3 \
    18	MOVQ 32+r, a4 \
    19	MOVQ 40+r, a5
    20
    // fp384Carry conditionally subtracts the six-word constant ·p from the
    // seven-word value a6:a5:a4:a3:a2:a1:a0 (a0 is the lowest word).
    //
    // It copies a into b, computes b = a - p with the borrow propagated through
    // all seven words, and then, if that subtraction did not borrow, overwrites
    // a0..a5 with b0..b5. The selection is done with conditional moves, so
    // there is no jump on the comparison result.
    //
    //   a0..a5  in/out: low six words, replaced by a - p when a >= p
    //   a6      in only: top (carry) word; it is read with MOVQ and never
    //           written, so it may be a register or a memory operand
    //   b0..b6  scratch registers, clobbered
    //
    // Flags are clobbered.
    21#define fp384Carry(a0,a1,a2,a3,a4,a5,a6, b0,b1,b2,b3,b4,b5,b6) \
    22	\ // b = a (work on a copy so that a survives the trial subtraction)
    23	MOVQ a0, b0 \
    24	MOVQ a1, b1 \
    25	MOVQ a2, b2 \
    26	MOVQ a3, b3 \
    27	MOVQ a4, b4 \
    28	MOVQ a5, b5 \
    29	MOVQ a6, b6 \
    30	\ // b = b - p, borrowing through all seven words
    31	SUBQ ·p+0(SB), b0 \
    32	SBBQ ·p+8(SB), b1 \
    33	SBBQ ·p+16(SB), b2 \
    34	SBBQ ·p+24(SB), b3 \
    35	SBBQ ·p+32(SB), b4 \
    36	SBBQ ·p+40(SB), b5 \
    37	SBBQ $0, b6 \
    38	\
    39	\ // Carry set: the subtraction borrowed (a < p), so keep a.
    40	\ // Carry clear: a >= p, so replace a0..a5 with b0..b5.
    41	CMOVQCC b0, a0 \
    42	CMOVQCC b1, a1 \
    43	CMOVQCC b2, a2 \
    44	CMOVQCC b3, a3 \
    45	CMOVQCC b4, a4 \
    46	CMOVQCC b5, a5
    47
    // mul computes the full 12-word product of the six-word value
    // a5:a4:a3:a2:a1:a0 and the six-word value stored at rb, and writes the
    // result to the 96 bytes starting at stack (lowest word at offset 0).
    // This is the variant that uses MULQ only (no MULX).
    //
    //   a0..a5  operands loaded into AX with MOVQ; each one is reloaded before
    //           every MULQ because MULQ overwrites AX with the low product word
    //   rb      memory operand; words are read at offsets 0..40 via "N+rb"
    //   stack   memory operand; words are written at offsets 0..88 via "N+stack"
    //
    // The product is built one row at a time. Row i computes the seven-word
    // value a_i * b in R14:R13:R12:R11:R10:R9:R8, adds the six words already
    // stored at word positions i..i+5 of stack (for i > 0), and stores the
    // seven-word sum back at word positions i..i+6.
    //
    // Clobbers AX, DX, R8-R14 and the flags.
    48#define mul(a0,a1,a2,a3,a4,a5, rb, stack) \
    49	\ // Row 0: R14:R13:R12:R11:R10:R9:R8 = a0 * b
    50	MOVQ a0, AX \
    51	MULQ 0+rb \
    52	MOVQ AX, R8 \
    53	MOVQ DX, R9 \
    54	MOVQ a0, AX \
    55	MULQ 8+rb \
    56	ADDQ AX, R9 \
    57	ADCQ $0, DX \
    58	MOVQ DX, R10 \
    59	MOVQ a0, AX \
    60	MULQ 16+rb \
    61	ADDQ AX, R10 \
    62	ADCQ $0, DX \
    63	MOVQ DX, R11 \
    64	MOVQ a0, AX \
    65	MULQ 24+rb \
    66	ADDQ AX, R11 \
    67	ADCQ $0, DX \
    68	MOVQ DX, R12 \
    69	MOVQ a0, AX \
    70	MULQ 32+rb \
    71	ADDQ AX, R12 \
    72	ADCQ $0, DX \
    73	MOVQ DX, R13 \
    74	MOVQ a0, AX \
    75	MULQ 40+rb \
    76	ADDQ AX, R13 \
    77	ADCQ $0, DX \
    78	MOVQ DX, R14 \
    79	\ // Row 0 is the initial value of words 0-6 of the result.
    80	storeBlock(R8,R9,R10,R11,R12,R13, 0+stack) \
    81	MOVQ R14, 48+stack \
    82	\
    83	\ // Row 1: R14:R13:R12:R11:R10:R9:R8 = a1 * b
    84	MOVQ a1, AX \
    85	MULQ 0+rb \
    86	MOVQ AX, R8 \
    87	MOVQ DX, R9 \
    88	MOVQ a1, AX \
    89	MULQ 8+rb \
    90	ADDQ AX, R9 \
    91	ADCQ $0, DX \
    92	MOVQ DX, R10 \
    93	MOVQ a1, AX \
    94	MULQ 16+rb \
    95	ADDQ AX, R10 \
    96	ADCQ $0, DX \
    97	MOVQ DX, R11 \
    98	MOVQ a1, AX \
    99	MULQ 24+rb \
   100	ADDQ AX, R11 \
   101	ADCQ $0, DX \
   102	MOVQ DX, R12 \
   103	MOVQ a1, AX \
   104	MULQ 32+rb \
   105	ADDQ AX, R12 \
   106	ADCQ $0, DX \
   107	MOVQ DX, R13 \
   108	MOVQ a1, AX \
   109	MULQ 40+rb \
   110	ADDQ AX, R13 \
   111	ADCQ $0, DX \
   112	MOVQ DX, R14 \
   113	\ // Add result words 1-6; the carry out goes into R14. Store as words 1-7.
   114	ADDQ 8+stack, R8 \
   115	ADCQ 16+stack, R9 \
   116	ADCQ 24+stack, R10 \
   117	ADCQ 32+stack, R11 \
   118	ADCQ 40+stack, R12 \
   119	ADCQ 48+stack, R13 \
   120	ADCQ $0, R14 \
   121	storeBlock(R8,R9,R10,R11,R12,R13, 8+stack) \
   122	MOVQ R14, 56+stack \
   123	\
   124	\ // Row 2: R14:R13:R12:R11:R10:R9:R8 = a2 * b
   125	MOVQ a2, AX \
   126	MULQ 0+rb \
   127	MOVQ AX, R8 \
   128	MOVQ DX, R9 \
   129	MOVQ a2, AX \
   130	MULQ 8+rb \
   131	ADDQ AX, R9 \
   132	ADCQ $0, DX \
   133	MOVQ DX, R10 \
   134	MOVQ a2, AX \
   135	MULQ 16+rb \
   136	ADDQ AX, R10 \
   137	ADCQ $0, DX \
   138	MOVQ DX, R11 \
   139	MOVQ a2, AX \
   140	MULQ 24+rb \
   141	ADDQ AX, R11 \
   142	ADCQ $0, DX \
   143	MOVQ DX, R12 \
   144	MOVQ a2, AX \
   145	MULQ 32+rb \
   146	ADDQ AX, R12 \
   147	ADCQ $0, DX \
   148	MOVQ DX, R13 \
   149	MOVQ a2, AX \
   150	MULQ 40+rb \
   151	ADDQ AX, R13 \
   152	ADCQ $0, DX \
   153	MOVQ DX, R14 \
   154	\ // Add result words 2-7; the carry out goes into R14. Store as words 2-8.
   155	ADDQ 16+stack, R8 \
   156	ADCQ 24+stack, R9 \
   157	ADCQ 32+stack, R10 \
   158	ADCQ 40+stack, R11 \
   159	ADCQ 48+stack, R12 \
   160	ADCQ 56+stack, R13 \
   161	ADCQ $0, R14 \
   162	storeBlock(R8,R9,R10,R11,R12,R13, 16+stack) \
   163	MOVQ R14, 64+stack \
   164	\
   165	\ // Row 3: R14:R13:R12:R11:R10:R9:R8 = a3 * b
   166	MOVQ a3, AX \
   167	MULQ 0+rb \
   168	MOVQ AX, R8 \
   169	MOVQ DX, R9 \
   170	MOVQ a3, AX \
   171	MULQ 8+rb \
   172	ADDQ AX, R9 \
   173	ADCQ $0, DX \
   174	MOVQ DX, R10 \
   175	MOVQ a3, AX \
   176	MULQ 16+rb \
   177	ADDQ AX, R10 \
   178	ADCQ $0, DX \
   179	MOVQ DX, R11 \
   180	MOVQ a3, AX \
   181	MULQ 24+rb \
   182	ADDQ AX, R11 \
   183	ADCQ $0, DX \
   184	MOVQ DX, R12 \
   185	MOVQ a3, AX \
   186	MULQ 32+rb \
   187	ADDQ AX, R12 \
   188	ADCQ $0, DX \
   189	MOVQ DX, R13 \
   190	MOVQ a3, AX \
   191	MULQ 40+rb \
   192	ADDQ AX, R13 \
   193	ADCQ $0, DX \
   194	MOVQ DX, R14 \
   195	\ // Add result words 3-8; the carry out goes into R14. Store as words 3-9.
   196	ADDQ 24+stack, R8 \
   197	ADCQ 32+stack, R9 \
   198	ADCQ 40+stack, R10 \
   199	ADCQ 48+stack, R11 \
   200	ADCQ 56+stack, R12 \
   201	ADCQ 64+stack, R13 \
   202	ADCQ $0, R14 \
   203	storeBlock(R8,R9,R10,R11,R12,R13, 24+stack) \
   204	MOVQ R14, 72+stack \
   205	\
   206	\ // Row 4: R14:R13:R12:R11:R10:R9:R8 = a4 * b
   207	MOVQ a4, AX \
   208	MULQ 0+rb \
   209	MOVQ AX, R8 \
   210	MOVQ DX, R9 \
   211	MOVQ a4, AX \
   212	MULQ 8+rb \
   213	ADDQ AX, R9 \
   214	ADCQ $0, DX \
   215	MOVQ DX, R10 \
   216	MOVQ a4, AX \
   217	MULQ 16+rb \
   218	ADDQ AX, R10 \
   219	ADCQ $0, DX \
   220	MOVQ DX, R11 \
   221	MOVQ a4, AX \
   222	MULQ 24+rb \
   223	ADDQ AX, R11 \
   224	ADCQ $0, DX \
   225	MOVQ DX, R12 \
   226	MOVQ a4, AX \
   227	MULQ 32+rb \
   228	ADDQ AX, R12 \
   229	ADCQ $0, DX \
   230	MOVQ DX, R13 \
   231	MOVQ a4, AX \
   232	MULQ 40+rb \
   233	ADDQ AX, R13 \
   234	ADCQ $0, DX \
   235	MOVQ DX, R14 \
   236	\ // Add result words 4-9; the carry out goes into R14. Store as words 4-10.
   237	ADDQ 32+stack, R8 \
   238	ADCQ 40+stack, R9 \
   239	ADCQ 48+stack, R10 \
   240	ADCQ 56+stack, R11 \
   241	ADCQ 64+stack, R12 \
   242	ADCQ 72+stack, R13 \
   243	ADCQ $0, R14 \
   244	storeBlock(R8,R9,R10,R11,R12,R13, 32+stack) \
   245	MOVQ R14, 80+stack \
   246	\
   247	\ // Row 5: R14:R13:R12:R11:R10:R9:R8 = a5 * b
   248	MOVQ a5, AX \
   249	MULQ 0+rb \
   250	MOVQ AX, R8 \
   251	MOVQ DX, R9 \
   252	MOVQ a5, AX \
   253	MULQ 8+rb \
   254	ADDQ AX, R9 \
   255	ADCQ $0, DX \
   256	MOVQ DX, R10 \
   257	MOVQ a5, AX \
   258	MULQ 16+rb \
   259	ADDQ AX, R10 \
   260	ADCQ $0, DX \
   261	MOVQ DX, R11 \
   262	MOVQ a5, AX \
   263	MULQ 24+rb \
   264	ADDQ AX, R11 \
   265	ADCQ $0, DX \
   266	MOVQ DX, R12 \
   267	MOVQ a5, AX \
   268	MULQ 32+rb \
   269	ADDQ AX, R12 \
   270	ADCQ $0, DX \
   271	MOVQ DX, R13 \
   272	MOVQ a5, AX \
   273	MULQ 40+rb \
   274	ADDQ AX, R13 \
   275	ADCQ $0, DX \
   276	MOVQ DX, R14 \
   277	\ // Add result words 5-10; the carry out goes into R14. Store as words 5-11.
   278	ADDQ 40+stack, R8 \
   279	ADCQ 48+stack, R9 \
   280	ADCQ 56+stack, R10 \
   281	ADCQ 64+stack, R11 \
   282	ADCQ 72+stack, R12 \
   283	ADCQ 80+stack, R13 \
   284	ADCQ $0, R14 \
   285	storeBlock(R8,R9,R10,R11,R12,R13, 40+stack) \
   286	MOVQ R14, 88+stack
   287
   // fp384Reduce reduces the 12-word value T stored at stack[0..95] to six
   // words, following the shape of a Montgomery reduction (MULQ-only variant):
   //
   //   1. m = (T * pp) mod 2^384, using only the low six words of T
   //   2. compute the 12-word product m * p
   //   3. add T to m * p and keep the high six words plus the carry out
   //   4. subtract p once if the result is >= p (fp384Carry)
   //
   // Layout of the memory operand stack (240 bytes are used in total):
   //
   //   stack[  0.. 95]  T, the input; it is read but not modified
   //   stack[ 96..143]  m
   //   stack[144..239]  m * p
   //
   // The result is left in R14,SI,AX,BX,CX,DX (lowest word first).
   // Clobbers AX, BX, CX, DX, SI, DI, R8-R15 and the flags.
   //
   // NOTE(review): the word of pp at offset 8 is never loaded. In its place T
   // is added one word higher, which is what a multiplication by 1 would give,
   // so presumably that word of pp equals 1; confirm against the definition
   // of pp.
   288#define fp384Reduce(stack) \
   289	\ // m = (T * P') mod R, store m in R8:R9:R10:R11:R12:R13
   290	MOVQ ·pp+0(SB), AX \
   291	MULQ 0+stack \
   292	MOVQ AX, R8 ; MOVQ R8, 96+stack\ // park m[0]; R8 is reused as scratch below
   293	MOVQ DX, R9 \
   294	MOVQ ·pp+0(SB), AX \
   295	MULQ 8+stack \
   296	ADDQ AX, R9 \
   297	ADCQ $0, DX \
   298	MOVQ DX, R10 \
   299	MOVQ ·pp+0(SB), AX \
   300	MULQ 16+stack \
   301	ADDQ AX, R10 \
   302	ADCQ $0, DX \
   303	MOVQ DX, R11 \
   304	MOVQ ·pp+0(SB), AX \
   305	MULQ 24+stack \
   306	ADDQ AX, R11 \
   307	ADCQ $0, DX \
   308	MOVQ DX, R12 \
   309	MOVQ ·pp+0(SB), AX \
   310	MULQ 32+stack \
   311	ADDQ AX, R12 \
   312	ADCQ $0, DX \
   313	MOVQ DX, R13 \
   314	MOVQ ·pp+0(SB), AX \
   315	MULQ 40+stack \
   316	ADDQ AX, R13 \
   317	\ // Add T[0..4] one word higher (takes the place of the pp word at offset 8).
   318	ADDQ 0+stack, R9 \
   319	ADCQ 8+stack, R10 \
   320	ADCQ 16+stack, R11 \
   321	ADCQ 24+stack, R12 \
   322	ADCQ 32+stack, R13 \
   323	\ // pp word at offset 16 times T[0..3], accumulated from word 2 of m.
   324	MOVQ ·pp+16(SB), AX \
   325	MULQ 0+stack \
   326	MOVQ AX, R14 \
   327	MOVQ DX, R8 \
   328	MOVQ ·pp+16(SB), AX \
   329	MULQ 8+stack \
   330	ADDQ AX, R8 \
   331	ADCQ $0, DX \
   332	MOVQ DX, BX \
   333	MOVQ ·pp+16(SB), AX \
   334	MULQ 16+stack \
   335	ADDQ AX, BX \
   336	ADCQ $0, DX \
   337	MOVQ DX, CX \
   338	MOVQ ·pp+16(SB), AX \
   339	MULQ 24+stack \
   340	ADDQ AX, CX \
   341	\
   342	ADDQ R14, R10 \
   343	ADCQ R8, R11 \
   344	ADCQ BX, R12 \
   345	ADCQ CX, R13 \
   346	\ // pp word at offset 24 times T[0..2], accumulated from word 3 of m.
   347	MOVQ ·pp+24(SB), AX \
   348	MULQ 0+stack \
   349	MOVQ AX, R14 \
   350	MOVQ DX, R8 \
   351	MOVQ ·pp+24(SB), AX \
   352	MULQ 8+stack \
   353	ADDQ AX, R8 \
   354	ADCQ $0, DX \
   355	MOVQ DX, BX \
   356	MOVQ ·pp+24(SB), AX \
   357	MULQ 16+stack \
   358	ADDQ AX, BX \
   359	\
   360	ADDQ R14, R11 \
   361	ADCQ R8, R12 \
   362	ADCQ BX, R13 \
   363	\ // pp word at offset 32 times T[0..1], accumulated from word 4 of m.
   364	MOVQ ·pp+32(SB), AX \
   365	MULQ 0+stack \
   366	MOVQ AX, R14 \
   367	MOVQ DX, R8 \
   368	MOVQ ·pp+32(SB), AX \
   369	MULQ 8+stack \
   370	ADDQ AX, R8 \
   371	\
   372	ADDQ R14, R12 \
   373	ADCQ R8, R13 \
   374	\ // pp word at offset 40 times T[0]; only the low half reaches word 5 of m.
   375	MOVQ ·pp+40(SB), AX \
   376	MULQ 0+stack \
   377	ADDQ AX, R13 \
   378	\ // Bring back m[0], which was parked at 96+stack at the top.
   379	MOVQ 96+stack, R8 \
   380	\
   381	storeBlock(R8,R9,R10,R11,R12,R13, 96+stack) \
   382	\
   383	\ // m * P
   384	mul(·p+0(SB),·p+8(SB),·p+16(SB),·p+24(SB),·p+32(SB),·p+40(SB), 96+stack, 144+stack) \
   385	\
   386	\ // Add the 768-bit T to m*P; R15 receives the carry out of the top word.
   387	MOVQ $0, R15 \
   388	loadBlock(144+stack, R8,R9,R10,R11,R12,R13) \
   389	loadBlock(192+stack, R14,SI,AX,BX,CX,DX) \
   390	\
   391	ADDQ 0+stack, R8 \
   392	ADCQ 8+stack, R9 \
   393	ADCQ 16+stack, R10 \
   394	ADCQ 24+stack, R11 \
   395	ADCQ 32+stack, R12 \
   396	ADCQ 40+stack, R13 \
   397	ADCQ 48+stack, R14 \
   398	ADCQ 56+stack, SI \
   399	ADCQ 64+stack, AX \
   400	ADCQ 72+stack, BX \
   401	ADCQ 80+stack, CX \
   402	ADCQ 88+stack, DX \
   403	ADCQ $0, R15 \
   404	\ // Keep the high six words (plus R15); R8-R13 and DI become scratch here.
   405	fp384Carry(R14,SI,AX,BX,CX,DX,R15, R8,R9,R10,R11,R12,R13,DI)
   406
   // mulBMI2 computes the full 12-word product of the six-word value
   // a5:a4:a3:a2:a1:a0 and the six-word value stored at rb, using MULX.
   //
   // Unlike mul, only the LOW six words of the product are written to memory,
   // at stack[0..47]. The HIGH six words are left in registers, lowest word
   // first: R14, R8, R9, R10, R11, R12. The caller must store them itself if
   // it needs them in memory.
   //
   //   a0..a5  operands loaded into DX (the implicit MULX multiplicand)
   //   rb      memory operand; words are read at offsets 0..40 via "N+rb"
   //   stack   memory operand; words are written at offsets 0..40 via "N+stack"
   //
   // R8-R14 form a sliding window over the running sum. For each row the six
   // partial products are added in two passes: first those from the words at
   // offsets 0, 16, 32 and then those at offsets 8, 24, 40. Within one pass
   // the low and high halves of consecutive products land on adjacent words,
   // so each pass is a single ADDQ/ADCQ carry chain. MULX does not modify the
   // flags, which is what allows it to sit in the middle of such a chain.
   //
   // Once the lowest live word of a row is final it is stored, and its
   // register is zeroed with a MOV so that it can serve as the new top word.
   // MOV is used instead of XOR because the carry flag from the preceding ADDQ
   // must survive until the next ADCQ.
   //
   // Clobbers AX, BX, DX, R8-R14 and the flags.
   407#define mulBMI2(a0,a1,a2,a3,a4,a5, rb, stack) \
   408	MOVQ a0, DX \
   409	MULXQ 0+rb, R8, R9; MOVQ R8, 0+stack; MOVQ $0, R8 \ // word 0 is final
   410	MULXQ 8+rb, AX, R10 \
   411	ADDQ AX, R9 \
   412	MULXQ 16+rb, AX, R11 \
   413	ADCQ AX, R10 \
   414	MULXQ 24+rb, AX, R12 \
   415	ADCQ AX, R11 \
   416	MULXQ 32+rb, AX, R13 \
   417	ADCQ AX, R12 \
   418	MULXQ 40+rb, AX, R14 \
   419	ADCQ AX, R13 \
   420	ADCQ $0, R14 \
   421	\ // Row 1: add a1 * b starting at word 1 (window R9..R14,R8; new top R9).
   422	MOVQ a1, DX \
   423	MULXQ 0+rb, AX, BX \
   424	ADDQ AX, R9; MOVQ R9, 8+stack; MOVL $0, R9 \ // word 1 is final
   425	ADCQ BX, R10 \
   426	MULXQ 16+rb, AX, BX \
   427	ADCQ AX, R11 \
   428	ADCQ BX, R12 \
   429	MULXQ 32+rb, AX, BX \
   430	ADCQ AX, R13 \
   431	ADCQ BX, R14 \
   432	ADCQ $0,  R8 \
   433	MULXQ 8+rb, AX, BX \
   434	ADDQ AX, R10 \
   435	ADCQ BX, R11 \
   436	MULXQ 24+rb, AX, BX \
   437	ADCQ AX, R12 \
   438	ADCQ BX, R13 \
   439	MULXQ 40+rb, AX, BX \
   440	ADCQ AX, R14 \
   441	ADCQ BX, R8 \
   442	ADCQ $0, R9 \
   443	\ // Row 2: add a2 * b starting at word 2 (window R10..R14,R8,R9; new top R10).
   444	MOVQ a2, DX \
   445	MULXQ 0+rb, AX, BX \
   446	ADDQ AX, R10; MOVQ R10, 16+stack; MOVL $0, R10 \ // word 2 is final
   447	ADCQ BX, R11 \
   448	MULXQ 16+rb, AX, BX \
   449	ADCQ AX, R12 \
   450	ADCQ BX, R13 \
   451	MULXQ 32+rb, AX, BX \
   452	ADCQ AX, R14 \
   453	ADCQ BX, R8 \
   454	ADCQ $0, R9 \
   455	MULXQ 8+rb, AX, BX \
   456	ADDQ AX, R11 \
   457	ADCQ BX, R12 \
   458	MULXQ 24+rb, AX, BX \
   459	ADCQ AX, R13 \
   460	ADCQ BX, R14 \
   461	MULXQ 40+rb, AX, BX \
   462	ADCQ AX, R8 \
   463	ADCQ BX, R9 \
   464	ADCQ $0, R10 \
   465	\ // Row 3: add a3 * b starting at word 3 (window R11..R14,R8..R10; new top R11).
   466	MOVQ a3, DX \
   467	MULXQ 0+rb, AX, BX \
   468	ADDQ AX, R11; MOVQ R11, 24+stack; MOVL $0, R11 \ // word 3 is final
   469	ADCQ BX, R12 \
   470	MULXQ 16+rb, AX, BX \
   471	ADCQ AX, R13 \
   472	ADCQ BX, R14 \
   473	MULXQ 32+rb, AX, BX \
   474	ADCQ AX, R8 \
   475	ADCQ BX, R9 \
   476	ADCQ $0, R10 \
   477	MULXQ 8+rb, AX, BX \
   478	ADDQ AX, R12 \
   479	ADCQ BX, R13 \
   480	MULXQ 24+rb, AX, BX \
   481	ADCQ AX, R14 \
   482	ADCQ BX, R8 \
   483	MULXQ 40+rb, AX, BX \
   484	ADCQ AX, R9 \
   485	ADCQ BX, R10 \
   486	ADCQ $0, R11 \
   487	\ // Row 4: add a4 * b starting at word 4 (window R12..R14,R8..R11; new top R12).
   488	MOVQ a4, DX \
   489	MULXQ 0+rb, AX, BX \
   490	ADDQ AX, R12; MOVQ R12, 32+stack; MOVL $0, R12 \ // word 4 is final
   491	ADCQ BX, R13 \
   492	MULXQ 16+rb, AX, BX \
   493	ADCQ AX, R14 \
   494	ADCQ BX, R8 \
   495	MULXQ 32+rb, AX, BX \
   496	ADCQ AX, R9 \
   497	ADCQ BX, R10 \
   498	ADCQ $0, R11 \
   499	MULXQ 8+rb, AX, BX \
   500	ADDQ AX, R13 \
   501	ADCQ BX, R14 \
   502	MULXQ 24+rb, AX, BX \
   503	ADCQ AX, R8 \
   504	ADCQ BX, R9 \
   505	MULXQ 40+rb, AX, BX \
   506	ADCQ AX, R10 \
   507	ADCQ BX, R11 \
   508	ADCQ $0, R12 \
   509	\ // Row 5: add a5 * b starting at word 5; words 6-11 stay in R14,R8..R12.
   510	MOVQ a5, DX \
   511	MULXQ 0+rb, AX, BX \
   512	ADDQ AX, R13; MOVQ R13, 40+stack \ // word 5 is final; R13 is free afterwards
   513	ADCQ BX, R14 \
   514	MULXQ 16+rb, AX, BX \
   515	ADCQ AX, R8 \
   516	ADCQ BX, R9 \
   517	MULXQ 32+rb, AX, BX \
   518	ADCQ AX, R10 \
   519	ADCQ BX, R11 \
   520	ADCQ $0, R12 \
   521	MULXQ 8+rb, AX, BX \
   522	ADDQ AX, R14 \
   523	ADCQ BX, R8 \
   524	MULXQ 24+rb, AX, BX \
   525	ADCQ AX, R9 \
   526	ADCQ BX, R10 \
   527	MULXQ 40+rb, AX, BX \
   528	ADCQ AX, R11 \
   529	ADCQ BX, R12
   530
   // fp384ReduceBMI2 is the MULX variant of fp384Reduce. It reduces the
   // 12-word value T stored at stack[0..95] to six words, following the shape
   // of a Montgomery reduction:
   //
   //   1. m = (T * pp) mod 2^384, using only the low six words of T
   //   2. compute the 12-word product m * p with mulBMI2
   //   3. add T to m * p and keep the high six words plus the carry out
   //   4. subtract p once if the result is >= p (fp384Carry)
   //
   // Layout of the memory operand stack:
   //
   //   stack[  0.. 95]  T, the input; word 0 is overwritten at the end, where
   //                    it is reused to hold the carry out of the final add
   //   stack[ 96..143]  m
   //   stack[144..191]  low six words of m * p (the high six words stay in
   //                    R14,R8,R9,R10,R11,R12 as left there by mulBMI2)
   //
   // The result is left in R14,R8,R9,R10,R11,R12 (lowest word first).
   // Clobbers AX, BX, CX, DX, SI, DI, R8-R14 and the flags.
   //
   // NOTE(review): the word of pp at offset 8 is never loaded. In its place T
   // is added one word higher, which is what a multiplication by 1 would give,
   // so presumably that word of pp equals 1; confirm against the definition
   // of pp.
   531#define fp384ReduceBMI2(stack) \
   532	\ // m = (T * P') mod R, store m in R8:R9:R10:R11:R12:R13
   533	MOVQ ·pp+0(SB), DX \
   534	MULXQ 0+stack, R8, R9 \
   535	MULXQ 8+stack, AX, R10 \
   536	ADDQ AX, R9 \
   537	MULXQ 16+stack, AX, R11 \
   538	ADCQ AX, R10 \
   539	MULXQ 24+stack, AX, R12 \
   540	ADCQ AX, R11 \
   541	MULXQ 32+stack, AX, R13 \
   542	ADCQ AX, R12 \
   543	MULXQ 40+stack, AX, BX \ // the high half (BX) lies above word 5 and is dropped
   544	ADCQ AX, R13 \
   545	\ // Add T[0..4] one word higher (takes the place of the pp word at offset 8).
   546	ADDQ 0+stack, R9 \
   547	ADCQ 8+stack, R10 \
   548	ADCQ 16+stack, R11 \
   549	ADCQ 24+stack, R12 \
   550	ADCQ 32+stack, R13 \
   551	\ // pp word at offset 16 times T[0..3], accumulated from word 2 of m.
   552	MOVQ ·pp+16(SB), DX \
   553	MULXQ 0+stack, AX, BX \
   554	ADDQ AX, R10 \
   555	ADCQ BX, R11 \
   556	MULXQ 16+stack, AX, BX \
   557	ADCQ AX, R12 \
   558	ADCQ BX, R13 \
   559	MULXQ 8+stack, AX, BX \
   560	ADDQ AX, R11 \
   561	ADCQ BX, R12 \
   562	MULXQ 24+stack, AX, BX \
   563	ADCQ AX, R13 \
   564	\ // pp word at offset 24 times T[0..2], accumulated from word 3 of m.
   565	MOVQ ·pp+24(SB), DX \
   566	MULXQ 0+stack, AX, BX \
   567	ADDQ AX, R11 \
   568	ADCQ BX, R12 \
   569	MULXQ 16+stack, AX, BX \
   570	ADCQ AX, R13 \
   571	MULXQ 8+stack, AX, BX \
   572	ADDQ AX, R12 \
   573	ADCQ BX, R13 \
   574	\ // pp word at offset 32 times T[0..1], accumulated from word 4 of m.
   575	MOVQ ·pp+32(SB), DX \
   576	MULXQ 0+stack, AX, BX \
   577	ADDQ AX, R12 \
   578	ADCQ BX, R13 \
   579	MULXQ 8+stack, AX, BX \
   580	ADDQ AX, R13 \
   581	\ // pp word at offset 40 times T[0]; only the low half reaches word 5 of m.
   582	MOVQ ·pp+40(SB), DX \
   583	MULXQ 0+stack, AX, BX \
   584	ADDQ AX, R13 \
   585	\
   586	storeBlock(R8,R9,R10,R11,R12,R13, 96+stack) \
   587	\
   588	\ // m * P
   589	mulBMI2(·p+0(SB),·p+8(SB),·p+16(SB),·p+24(SB),·p+32(SB),·p+40(SB), 96+stack, 144+stack) \
   590	\
   591	\ // Add the 768-bit T to m*P; the high six words of m*P are still in registers.
   592	loadBlock(144+stack, AX,R13,BX,CX,DX,DI) \
   593	\
   594	ADDQ 0+stack,  AX \
   595	ADCQ 8+stack, R13 \
   596	ADCQ 16+stack, BX \
   597	ADCQ 24+stack, CX \
   598	ADCQ 32+stack, DX \
   599	ADCQ 40+stack, DI \
   600	ADCQ 48+stack, R14 \
   601	ADCQ 56+stack, R8 \
   602	ADCQ 64+stack, R9 \
   603	ADCQ 72+stack, R10 \
   604	ADCQ 80+stack, R11 \
   605	ADCQ 88+stack, R12 \
   606	MOVQ $0, 0+stack \ // MOVQ leaves the carry flag intact for the ADCQ below
   607	ADCQ $0, 0+stack \ // word 0 of stack now holds the carry out (0 or 1)
   608	\ // Keep the high six words; AX,R13,BX,CX,DX,DI and SI become scratch here.
   609	fp384Carry(R14,R8,R9,R10,R11,R12, 0+stack, AX,R13,BX,CX,DX,DI,SI)
   610
   // fp384Neg stores p - a into c, reduced once more by p so that a == 0
   // yields 0 instead of p.
   //
   // Arguments (16 bytes, no results):
   //   c+0(FP)  pointer to the six-word destination
   //   a+8(FP)  pointer to the six-word operand
   //
   // NOTE(review): the borrow out of p - a is not inspected, so this
   // presumably relies on a <= p; confirm that callers only pass reduced
   // values.
   611TEXT ·fp384Neg(SB), NOSPLIT, $0-16
   612	MOVQ ·p+0(SB), R8
   613	MOVQ ·p+8(SB), R9
   614	MOVQ ·p+16(SB), R10
   615	MOVQ ·p+24(SB), R11
   616	MOVQ ·p+32(SB), R12
   617	MOVQ ·p+40(SB), R13
   618
   619	MOVQ a+8(FP), DI
   620	SUBQ 0(DI), R8 // R13:...:R8 = p - a
   621	SBBQ 8(DI), R9
   622	SBBQ 16(DI), R10
   623	SBBQ 24(DI), R11
   624	SBBQ 32(DI), R12
   625	SBBQ 40(DI), R13
   626
   627	MOVQ $0, R15 // seventh (carry) word for fp384Carry is zero
   628	fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI)
   629
   630	MOVQ c+0(FP), DI // DI was used as scratch by fp384Carry
   631	storeBlock(R8,R9,R10,R11,R12,R13, 0(DI))
   632	RET
   633
   // fp384Add stores a + b into c, subtracting p once if the seven-word sum
   // (including the carry out of the top word) is >= p.
   //
   // Arguments (24 bytes, no results):
   //   c+0(FP)   pointer to the six-word destination
   //   a+8(FP)   pointer to the first six-word operand
   //   b+16(FP)  pointer to the second six-word operand
   634TEXT ·fp384Add(SB), NOSPLIT, $0-24
   635	MOVQ a+8(FP), DI
   636	MOVQ b+16(FP), SI
   637
   638	loadBlock(0(DI), R8,R9,R10,R11,R12,R13)
   639	MOVQ $0, R15
   640
   641	ADDQ  0(SI), R8 // R15:R13:...:R8 = a + b
   642	ADCQ  8(SI), R9
   643	ADCQ 16(SI), R10
   644	ADCQ 24(SI), R11
   645	ADCQ 32(SI), R12
   646	ADCQ 40(SI), R13
   647	ADCQ $0, R15 // R15 = carry out of the 384-bit addition
   648
   649	fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI)
   650
   651	MOVQ c+0(FP), DI // DI was used as scratch by fp384Carry
   652	storeBlock(R8,R9,R10,R11,R12,R13, 0(DI))
   653	RET
   654
   // fp384Sub stores a - b into c. It is computed as (p - b) + a so that the
   // intermediate value does not go negative, followed by one conditional
   // subtraction of p.
   //
   // Arguments (24 bytes, no results):
   //   c+0(FP)   pointer to the six-word destination
   //   a+8(FP)   pointer to the minuend
   //   b+16(FP)  pointer to the subtrahend
   //
   // NOTE(review): the borrow out of p - b is not inspected, so this
   // presumably relies on b <= p; confirm that callers only pass reduced
   // values.
   655TEXT ·fp384Sub(SB), NOSPLIT, $0-24
   656	MOVQ ·p+0(SB), R8
   657	MOVQ ·p+8(SB), R9
   658	MOVQ ·p+16(SB), R10
   659	MOVQ ·p+24(SB), R11
   660	MOVQ ·p+32(SB), R12
   661	MOVQ ·p+40(SB), R13
   662
   663	MOVQ b+16(FP), DI
   664	SUBQ 0(DI), R8 // R13:...:R8 = p - b
   665	SBBQ 8(DI), R9
   666	SBBQ 16(DI), R10
   667	SBBQ 24(DI), R11
   668	SBBQ 32(DI), R12
   669	SBBQ 40(DI), R13
   670
   671	MOVQ $0, R15
   672	MOVQ a+8(FP), DI
   673	ADDQ 0(DI), R8 // R15:R13:...:R8 = (p - b) + a
   674	ADCQ 8(DI), R9
   675	ADCQ 16(DI), R10
   676	ADCQ 24(DI), R11
   677	ADCQ 32(DI), R12
   678	ADCQ 40(DI), R13
   679	ADCQ $0, R15 // R15 = carry out of the 384-bit addition
   680
   681	fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI)
   682
   683	MOVQ c+0(FP), DI // DI was used as scratch by fp384Carry
   684	storeBlock(R8,R9,R10,R11,R12,R13, 0(DI))
   685	RET
   686
   // fp384Mul multiplies a by b into a 12-word product T on the stack, reduces
   // T with fp384ReduceBMI2 or fp384Reduce, and stores the six-word result in
   // c. The MULX path is taken when the byte ·hasBMI2 is non-zero.
   //
   // Arguments (24 bytes, no results):
   //   c+0(FP)   pointer to the six-word destination
   //   a+8(FP)   pointer to the first six-word operand
   //   b+16(FP)  pointer to the second six-word operand
   //
   // The 240-byte frame is laid out as:
   //   0(SP)..95(SP)     T = a * b
   //   96(SP)..143(SP)   m (written by the reduce macro)
   //   144(SP)..239(SP)  m * p (written by the reduce macro)
   //
   // NOTE(review): given the reduction used, the result is presumably the
   // Montgomery product a * b * 2^-384 mod p; confirm against the Go-side
   // definitions of p and pp.
   687TEXT ·fp384Mul(SB), NOSPLIT, $240-24
   688	MOVQ a+8(FP), DI
   689	MOVQ b+16(FP), SI
   690
   691	// Jump to a slightly different implementation if MULX isn't supported.
   692	CMPB ·hasBMI2(SB), $0
   693	JE   nobmi2Mul
   694
   695	// T = a * b
   696	mulBMI2(0(DI),8(DI),16(DI),24(DI),32(DI),40(DI), 0(SI), 0(SP))
   697	storeBlock(R14,R8,R9,R10,R11,R12, 48(SP)) // mulBMI2 leaves the high six words in registers
   698
   699	// Reduce T.
   700	fp384ReduceBMI2(0(SP))
   701
   702	MOVQ c+0(FP), DI
   703	storeBlock(R14,R8,R9,R10,R11,R12, 0(DI)) // result registers of fp384ReduceBMI2
   704	JMP end
   705
   706nobmi2Mul:
   707	// T = a * b
   708	mul(0(DI),8(DI),16(DI),24(DI),32(DI),40(DI), 0(SI), 0(SP))
   709
   710	// Reduce T.
   711	fp384Reduce(0(SP))
   712
   713	MOVQ c+0(FP), DI
   714	storeBlock(R14,SI,AX,BX,CX,DX, 0(DI)) // result registers of fp384Reduce
   715
   716end:
   717	RET
   718
   // fp384Cmov conditionally copies the six words at y over the six words at
   // x: if b != 0 then x = y, otherwise x keeps its value.
   //
   // Arguments (24 bytes; assumes the Go prototype declares no results):
   //   x+0(FP)   pointer to the six-word destination
   //   y+8(FP)   pointer to the six-word source
   //   b+16(FP)  selector, tested as a full 64-bit word
   //
   // Every word of x and y is loaded and every word of x is written back
   // whatever b is; the choice is made by CMOVQNE rather than by a jump.
   // TESTQ sets the flags once up front, and the MOVQs in between do not
   // modify them, so every CMOVQNE sees the same condition.
   //
   // The argument size is declared explicitly, like every other TEXT in this
   // file, so that the frame can be checked against the Go declaration.
   // Clobbers AX, BX, DX, SI, DI and the flags.
   719TEXT ·fp384Cmov(SB), NOSPLIT, $0-24
   720    MOVQ x+0(FP), DI
   721    MOVQ y+8(FP), SI
   722    MOVQ b+16(FP), BX
   723    TESTQ BX, BX
   724    MOVQ  0(DI), AX; MOVQ  0(SI), DX; CMOVQNE DX, AX; MOVQ AX,  0(DI);
   725    MOVQ  8(DI), AX; MOVQ  8(SI), DX; CMOVQNE DX, AX; MOVQ AX,  8(DI);
   726    MOVQ 16(DI), AX; MOVQ 16(SI), DX; CMOVQNE DX, AX; MOVQ AX, 16(DI);
   727    MOVQ 24(DI), AX; MOVQ 24(SI), DX; CMOVQNE DX, AX; MOVQ AX, 24(DI);
   728    MOVQ 32(DI), AX; MOVQ 32(SI), DX; CMOVQNE DX, AX; MOVQ AX, 32(DI);
   729    MOVQ 40(DI), AX; MOVQ 40(SI), DX; CMOVQNE DX, AX; MOVQ AX, 40(DI);
   730    RET

View as plain text