
Text file src/github.com/cloudflare/circl/math/fp448/fp_amd64.h

Documentation: github.com/cloudflare/circl/math/fp448

// This code was imported from https://github.com/armfazh/rfc7748_precomputed

// CHECK_BMI2ADX triggers bmi2adx if supported,
// otherwise it falls back to legacy code.
#define CHECK_BMI2ADX(label, legacy, bmi2adx) \
    CMPB ·hasBmi2Adx(SB), $0  \
    JE label                  \
    bmi2adx                   \
    RET                       \
    label:                    \
    legacy                    \
    RET
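
// In Go terms the macro performs the runtime dispatch sketched below,
// where hasBmi2Adx is set once at startup from CPUID feature bits
// (a minimal sketch; mul, mulLeg, and mulAdx are hypothetical stand-ins
// for an exported operation and its two code paths):
//
//    func mul(z, x, y *Elt) {
//        if hasBmi2Adx {
//            mulAdx(z, x, y) // MULX/ADCX/ADOX path
//        } else {
//            mulLeg(z, x, y) // MULQ/ADCQ-only path
//        }
//    }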

// cselect is a conditional move
// if b=1: it copies y into x;
// if b=0: x remains with the same value;
// if b is not 0 or 1: undefined.
// Uses: AX, DX, FLAGS
// Instr: x86_64, cmov
#define cselect(x,y,b) \
    TESTQ b, b \
    MOVQ  0+x, AX; MOVQ  0+y, DX; CMOVQNE DX, AX; MOVQ AX,  0+x; \
    MOVQ  8+x, AX; MOVQ  8+y, DX; CMOVQNE DX, AX; MOVQ AX,  8+x; \
    MOVQ 16+x, AX; MOVQ 16+y, DX; CMOVQNE DX, AX; MOVQ AX, 16+x; \
    MOVQ 24+x, AX; MOVQ 24+y, DX; CMOVQNE DX, AX; MOVQ AX, 24+x; \
    MOVQ 32+x, AX; MOVQ 32+y, DX; CMOVQNE DX, AX; MOVQ AX, 32+x; \
    MOVQ 40+x, AX; MOVQ 40+y, DX; CMOVQNE DX, AX; MOVQ AX, 40+x; \
    MOVQ 48+x, AX; MOVQ 48+y, DX; CMOVQNE DX, AX; MOVQ AX, 48+x;
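
// The CMOV-based move has branch-free semantics; an equivalent way to
// write it in Go is the mask form below (a sketch, assuming x and y are
// little-endian 7-limb arrays; the assembly uses CMOV, not masking):
//
//    func cselect(x, y *[7]uint64, b uint64) {
//        mask := -b // 0 if b == 0, all ones if b == 1
//        for i := range x {
//            x[i] = (x[i] &^ mask) | (y[i] & mask)
//        }
//    }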

// cswap is a conditional swap
// if b=1: x,y <- y,x;
// if b=0: x,y remain with the same values;
// if b is not 0 or 1: undefined.
// Uses: AX, DX, R8, FLAGS
// Instr: x86_64, cmov
#define cswap(x,y,b) \
    TESTQ b, b \
    MOVQ  0+x, AX; MOVQ AX, R8; MOVQ  0+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX,  0+x; MOVQ DX,  0+y; \
    MOVQ  8+x, AX; MOVQ AX, R8; MOVQ  8+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX,  8+x; MOVQ DX,  8+y; \
    MOVQ 16+x, AX; MOVQ AX, R8; MOVQ 16+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 16+x; MOVQ DX, 16+y; \
    MOVQ 24+x, AX; MOVQ AX, R8; MOVQ 24+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 24+x; MOVQ DX, 24+y; \
    MOVQ 32+x, AX; MOVQ AX, R8; MOVQ 32+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 32+x; MOVQ DX, 32+y; \
    MOVQ 40+x, AX; MOVQ AX, R8; MOVQ 40+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 40+x; MOVQ DX, 40+y; \
    MOVQ 48+x, AX; MOVQ AX, R8; MOVQ 48+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 48+x; MOVQ DX, 48+y;
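
// cswap can likewise be written branch-free in Go with an XOR mask
// (a sketch; the assembly uses CMOV instead of masking):
//
//    func cswap(x, y *[7]uint64, b uint64) {
//        mask := -b
//        for i := range x {
//            t := mask & (x[i] ^ y[i])
//            x[i] ^= t
//            y[i] ^= t
//        }
//    }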

// additionLeg adds x and y and stores in z
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64
#define additionLeg(z,x,y) \
    MOVQ  0+x,  R8;  ADDQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCQ 24+y, R11; \
    MOVQ 32+x, R12;  ADCQ 32+y, R12; \
    MOVQ 40+x, R13;  ADCQ 40+y, R13; \
    MOVQ 48+x, R14;  ADCQ 48+y, R14; \
    MOVQ   $0,  AX;  ADCQ   $0,  AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    ADDQ AX,  R8; MOVQ  $0, AX; \
    ADCQ $0,  R9; \
    ADCQ $0, R10; \
    ADCQ DX, R11; \
    ADCQ $0, R12; \
    ADCQ $0, R13; \
    ADCQ $0, R14; \
    ADCQ $0,  AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    ADDQ AX,  R8;  MOVQ  R8,  0+z; \
    ADCQ $0,  R9;  MOVQ  R9,  8+z; \
    ADCQ $0, R10;  MOVQ R10, 16+z; \
    ADCQ DX, R11;  MOVQ R11, 24+z; \
    ADCQ $0, R12;  MOVQ R12, 32+z; \
    ADCQ $0, R13;  MOVQ R13, 40+z; \
    ADCQ $0, R14;  MOVQ R14, 48+z;
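
// The carry out of bit 448 is folded back in using
// p = 2^448 - 2^224 - 1, i.e. 2^448 = 2^224 + 1 (mod p): the carry c is
// added at bit 0 and at bit 224 (bit 32 of limb 3, hence the DX = c<<32
// above), and the pass runs twice because the first fold may itself
// carry. A minimal Go sketch of one fold pass, using math/bits:
//
//    func fold(r *[7]uint64, c uint64) uint64 {
//        d := c << 32 // c placed at bit 224 = 3*64 + 32
//        var cc uint64
//        r[0], cc = bits.Add64(r[0], c, 0)
//        r[1], cc = bits.Add64(r[1], 0, cc)
//        r[2], cc = bits.Add64(r[2], 0, cc)
//        r[3], cc = bits.Add64(r[3], d, cc)
//        r[4], cc = bits.Add64(r[4], 0, cc)
//        r[5], cc = bits.Add64(r[5], 0, cc)
//        r[6], cc = bits.Add64(r[6], 0, cc)
//        return cc // fed into the second pass
//    }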


// additionAdx adds x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, adx
#define additionAdx(z,x,y) \
    MOVL $32, R15; \
    XORL DX, DX; \
    MOVQ  0+x,  R8;  ADCXQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCXQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCXQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCXQ 24+y, R11; \
    MOVQ 32+x, R12;  ADCXQ 32+y, R12; \
    MOVQ 40+x, R13;  ADCXQ 40+y, R13; \
    MOVQ 48+x, R14;  ADCXQ 48+y, R14; \
    ;;;;;;;;;;;;;;;  ADCXQ   DX,  DX; \
    XORL AX, AX; \
    ADCXQ DX,  R8; SHLXQ R15, DX, DX; \
    ADCXQ AX,  R9; \
    ADCXQ AX, R10; \
    ADCXQ DX, R11; \
    ADCXQ AX, R12; \
    ADCXQ AX, R13; \
    ADCXQ AX, R14; \
    ADCXQ AX,  AX; \
    XORL  DX,  DX; \
    ADCXQ AX,  R8;  MOVQ  R8,  0+z; SHLXQ R15, AX, AX; \
    ADCXQ DX,  R9;  MOVQ  R9,  8+z; \
    ADCXQ DX, R10;  MOVQ R10, 16+z; \
    ADCXQ AX, R11;  MOVQ R11, 24+z; \
    ADCXQ DX, R12;  MOVQ R12, 32+z; \
    ADCXQ DX, R13;  MOVQ R13, 40+z; \
    ADCXQ DX, R14;  MOVQ R14, 48+z;

// subtraction subtracts y from x and stores in z
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64
#define subtraction(z,x,y) \
    MOVQ  0+x,  R8;  SUBQ  0+y,  R8; \
    MOVQ  8+x,  R9;  SBBQ  8+y,  R9; \
    MOVQ 16+x, R10;  SBBQ 16+y, R10; \
    MOVQ 24+x, R11;  SBBQ 24+y, R11; \
    MOVQ 32+x, R12;  SBBQ 32+y, R12; \
    MOVQ 40+x, R13;  SBBQ 40+y, R13; \
    MOVQ 48+x, R14;  SBBQ 48+y, R14; \
    MOVQ   $0,  AX;  SETCS AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    SUBQ AX,  R8; MOVQ  $0, AX; \
    SBBQ $0,  R9; \
    SBBQ $0, R10; \
    SBBQ DX, R11; \
    SBBQ $0, R12; \
    SBBQ $0, R13; \
    SBBQ $0, R14; \
    SETCS AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    SUBQ AX,  R8;  MOVQ  R8,  0+z; \
    SBBQ $0,  R9;  MOVQ  R9,  8+z; \
    SBBQ $0, R10;  MOVQ R10, 16+z; \
    SBBQ DX, R11;  MOVQ R11, 24+z; \
    SBBQ $0, R12;  MOVQ R12, 32+z; \
    SBBQ $0, R13;  MOVQ R13, 40+z; \
    SBBQ $0, R14;  MOVQ R14, 48+z;
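
// A borrow b out of the top means the register value is
// x - y + b*2^448; since 2^448 = 2^224 + 1 (mod p), subtracting
// b*(2^224 + 1) restores the congruence, mirroring the addition fold.
// A Go sketch of one borrow-fold pass (math/bits again):
//
//    func foldSub(r *[7]uint64, b uint64) uint64 {
//        d := b << 32 // b at bit 224
//        var bb uint64
//        r[0], bb = bits.Sub64(r[0], b, 0)
//        r[1], bb = bits.Sub64(r[1], 0, bb)
//        r[2], bb = bits.Sub64(r[2], 0, bb)
//        r[3], bb = bits.Sub64(r[3], d, bb)
//        r[4], bb = bits.Sub64(r[4], 0, bb)
//        r[5], bb = bits.Sub64(r[5], 0, bb)
//        r[6], bb = bits.Sub64(r[6], 0, bb)
//        return bb // fed into the second pass
//    }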

// maddBmi2Adx multiplies x by the limb of y at offset i
// and accumulates the result in z
// Uses: AX, DX, R8, FLAGS
// Instr: x86_64, bmi2, adx
#define maddBmi2Adx(z,x,y,i,r0,r1,r2,r3,r4,r5,r6) \
    MOVQ   i+y, DX; XORL AX, AX; \
    MULXQ  0+x, AX, R8;  ADOXQ AX, r0;  ADCXQ R8, r1; MOVQ r0,i+z; \
    MULXQ  8+x, AX, r0;  ADOXQ AX, r1;  ADCXQ r0, r2; MOVQ $0, R8; \
    MULXQ 16+x, AX, r0;  ADOXQ AX, r2;  ADCXQ r0, r3; \
    MULXQ 24+x, AX, r0;  ADOXQ AX, r3;  ADCXQ r0, r4; \
    MULXQ 32+x, AX, r0;  ADOXQ AX, r4;  ADCXQ r0, r5; \
    MULXQ 40+x, AX, r0;  ADOXQ AX, r5;  ADCXQ r0, r6; \
    MULXQ 48+x, AX, r0;  ADOXQ AX, r6;  ADCXQ R8, r0; \
    ;;;;;;;;;;;;;;;;;;;  ADOXQ R8, r0;
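
// ADCX adds with carry through CF only and ADOX through OF only, while
// MULX leaves flags untouched, so the low and high halves of the
// products ride two independent carry chains in the same sequence. One
// row in Go terms (hypothetical maddRow; two carry variables stand in
// for the two flag chains, using math/bits):
//
//    // z[0:8] += x * yi; returns the leftover carry
//    func maddRow(z *[8]uint64, x *[7]uint64, yi uint64) uint64 {
//        var cA, cB uint64
//        for i := 0; i < 7; i++ {
//            hi, lo := bits.Mul64(x[i], yi)          // MULXQ
//            z[i], cA = bits.Add64(z[i], lo, cA)     // ADOXQ chain
//            z[i+1], cB = bits.Add64(z[i+1], hi, cB) // ADCXQ chain
//        }
//        var c uint64
//        z[7], c = bits.Add64(z[7], cA, 0) // flush both chains
//        return c + cB
//    }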

// integerMulAdx multiplies x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerMulAdx(z,x,y) \
    MOVL    $0,R15; \
    MOVQ   0+y, DX;  XORL AX, AX;  MOVQ $0, R8; \
    MULXQ  0+x, AX,  R9;  MOVQ  AX, 0+z; \
    MULXQ  8+x, AX, R10;  ADCXQ AX,  R9; \
    MULXQ 16+x, AX, R11;  ADCXQ AX, R10; \
    MULXQ 24+x, AX, R12;  ADCXQ AX, R11; \
    MULXQ 32+x, AX, R13;  ADCXQ AX, R12; \
    MULXQ 40+x, AX, R14;  ADCXQ AX, R13; \
    MULXQ 48+x, AX, R15;  ADCXQ AX, R14; \
    ;;;;;;;;;;;;;;;;;;;;  ADCXQ R8, R15; \
    maddBmi2Adx(z,x,y, 8, R9,R10,R11,R12,R13,R14,R15) \
    maddBmi2Adx(z,x,y,16,R10,R11,R12,R13,R14,R15, R9) \
    maddBmi2Adx(z,x,y,24,R11,R12,R13,R14,R15, R9,R10) \
    maddBmi2Adx(z,x,y,32,R12,R13,R14,R15, R9,R10,R11) \
    maddBmi2Adx(z,x,y,40,R13,R14,R15, R9,R10,R11,R12) \
    maddBmi2Adx(z,x,y,48,R14,R15, R9,R10,R11,R12,R13) \
    MOVQ R15,  56+z; \
    MOVQ  R9,  64+z; \
    MOVQ R10,  72+z; \
    MOVQ R11,  80+z; \
    MOVQ R12,  88+z; \
    MOVQ R13,  96+z; \
    MOVQ R14, 104+z;
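
// The first block computes z[0:8] = x*y[0]; each maddBmi2Adx row then
// accumulates x*y[i] into an eight-limb window that slides by one limb
// per row, the register assignment rotating instead of the data moving.
// The overall shape, reusing the maddRow sketch above (zz assumed
// zero-initialized):
//
//    func integerMul(zz *[14]uint64, x, y *[7]uint64) {
//        var w [8]uint64
//        maddRow(&w, x, y[0]) // row 0 initializes zz[0:8]
//        copy(zz[0:8], w[:])
//        for i := 1; i < 7; i++ {
//            copy(w[:], zz[i:i+8]) // zz[i+7] is still zero here
//            maddRow(&w, x, y[i])
//            copy(zz[i:i+8], w[:])
//        }
//    }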

// maddLegacy multiplies x by the limb of y at offset i
// and accumulates the result in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define maddLegacy(z,x,y,i) \
    MOVQ  i+y, R15; \
    MOVQ  0+x, AX; MULQ R15; MOVQ AX,  R8; ;;;;;;;;;;;; MOVQ DX,  R9; \
    MOVQ  8+x, AX; MULQ R15; ADDQ AX,  R9; ADCQ $0, DX; MOVQ DX, R10; \
    MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
    MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
    MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
    MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
    MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
    ADDQ  0+i+z,  R8; MOVQ  R8,  0+i+z; \
    ADCQ  8+i+z,  R9; MOVQ  R9,  8+i+z; \
    ADCQ 16+i+z, R10; MOVQ R10, 16+i+z; \
    ADCQ 24+i+z, R11; MOVQ R11, 24+i+z; \
    ADCQ 32+i+z, R12; MOVQ R12, 32+i+z; \
    ADCQ 40+i+z, R13; MOVQ R13, 40+i+z; \
    ADCQ 48+i+z, R14; MOVQ R14, 48+i+z; \
    ADCQ     $0,  DX; MOVQ  DX, 56+i+z;

// integerMulLeg multiplies x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerMulLeg(z,x,y) \
    MOVQ  0+y, R15; \
    MOVQ  0+x, AX; MULQ R15; MOVQ AX, 0+z; ;;;;;;;;;;;; MOVQ DX,  R8; \
    MOVQ  8+x, AX; MULQ R15; ADDQ AX,  R8; ADCQ $0, DX; MOVQ DX,  R9; MOVQ  R8,  8+z; \
    MOVQ 16+x, AX; MULQ R15; ADDQ AX,  R9; ADCQ $0, DX; MOVQ DX, R10; MOVQ  R9, 16+z; \
    MOVQ 24+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; MOVQ R10, 24+z; \
    MOVQ 32+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; MOVQ R11, 32+z; \
    MOVQ 40+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; MOVQ R12, 40+z; \
    MOVQ 48+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX,56+z; MOVQ R13, 48+z; \
    maddLegacy(z,x,y, 8) \
    maddLegacy(z,x,y,16) \
    maddLegacy(z,x,y,24) \
    maddLegacy(z,x,y,32) \
    maddLegacy(z,x,y,40) \
    maddLegacy(z,x,y,48)
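
// With plain MULQ there is a single carry flag, so each row threads one
// carry through the partial products; maddLegacy computes the row first
// and adds the old window in a second pass. A fused per-row sketch
// (hypothetical mulRowLeg; the top limb is written fresh, matching the
// MOVQ DX, 56+i+z store):
//
//    func mulRowLeg(z *[8]uint64, x *[7]uint64, yi uint64) {
//        var c uint64
//        for i := 0; i < 7; i++ {
//            hi, lo := bits.Mul64(x[i], yi) // MULQ
//            var cc uint64
//            lo, cc = bits.Add64(lo, c, 0) // previous high half
//            hi += cc                      // hi <= 2^64-2, never wraps
//            z[i], cc = bits.Add64(z[i], lo, 0)
//            c = hi + cc // exact carry word, never wraps
//        }
//        z[7] = c
//    }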

// integerSqrLeg squares x and stores in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerSqrLeg(z,x) \
    XORL R15, R15; \
    MOVQ  0+x, CX; \
    MOVQ   CX, AX; MULQ CX; MOVQ AX, 0+z; MOVQ DX, R8; \
    ADDQ   CX, CX; ADCQ $0, R15; \
    MOVQ  8+x, AX; MULQ CX; ADDQ AX,  R8; ADCQ $0, DX; MOVQ DX,  R9; MOVQ R8, 8+z; \
    MOVQ 16+x, AX; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; MOVQ DX, R10; \
    MOVQ 24+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
    \
    MOVQ  8+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ  AX, R9; ADCQ $0, DX; MOVQ R9,16+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 8+x, AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
    ADDQ  8+x, CX; ADCQ $0, R15; \
    MOVQ 16+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 24+z; \
    MOVQ 24+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R9; \
    \
    MOVQ 16+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 32+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 16+x,AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
    ADDQ 16+x, CX; ADCQ $0, R15; \
    MOVQ 24+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 40+z; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; ADDQ R8,  R9; ADCQ $0, DX; MOVQ DX,R10; \
    \
    MOVQ 24+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 48+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 24+x,AX; ADDQ AX, DX; ADCQ $0,  R9; MOVQ DX, R8; \
    ADDQ 24+x, CX; ADCQ $0, R15; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; MOVQ R14, 56+z; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; ADDQ R8,  R9; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX,R11; \
    \
    MOVQ 32+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; MOVQ R9, 64+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 32+x,AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
    ADDQ 32+x, CX; ADCQ $0, R15; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 72+z; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX,R12; \
    \
    XORL R13, R13; \
    XORL R14, R14; \
    MOVQ 40+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 80+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 40+x,AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
    ADDQ 40+x, CX; ADCQ $0, R15; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 88+z; \
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8, R13; ADCQ $0,R14; \
    \
    XORL   R9, R9; \
    MOVQ 48+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 96+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 48+x,AX; ADDQ AX, DX; ADCQ $0, R9; MOVQ DX, R8; \
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8,R14; ADCQ $0, R9; MOVQ R14, 104+z;
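
// Squaring exploits (sum x[i]*2^(64i))^2 = sum x[i]^2 * 2^(128i)
// + 2*sum_{i<j} x[i]*x[j]*2^(64(i+j)): only the upper-triangular cross
// products are computed, against a doubled limb (the ADDQ CX, CX above;
// R15 catches the shifted-out top bit, and the NEGQ/ANDQ mask
// compensates when that bit was set). The identity in Go, with a
// hypothetical add128 helper (zz must be zeroed on entry):
//
//    func integerSqr(zz *[14]uint64, x *[7]uint64) {
//        for i := 0; i < 7; i++ {
//            hi, lo := bits.Mul64(x[i], x[i])
//            add128(zz, 2*i, lo, hi) // diagonal term x[i]^2
//            for j := i + 1; j < 7; j++ {
//                hi, lo := bits.Mul64(x[i], x[j])
//                add128(zz, i+j, lo, hi) // cross terms count twice
//                add128(zz, i+j, lo, hi)
//            }
//        }
//    }
//
//    // add128 adds hi:lo into zz at limb k, rippling the carry upward
//    func add128(zz *[14]uint64, k int, lo, hi uint64) {
//        var c uint64
//        zz[k], c = bits.Add64(zz[k], lo, 0)
//        zz[k+1], c = bits.Add64(zz[k+1], hi, c)
//        for j := k + 2; j < 14 && c != 0; j++ {
//            zz[j], c = bits.Add64(zz[j], 0, c)
//        }
//    }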


// integerSqrAdx squares x and stores in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerSqrAdx(z,x) \
    XORL R15, R15; \
    MOVQ  0+x, DX; \
    ;;;;;;;;;;;;;; MULXQ DX, AX, R8; MOVQ AX, 0+z; \
    ADDQ   DX, DX; ADCQ $0, R15; CLC; \
    MULXQ  8+x, AX,  R9; ADCXQ AX,  R8; MOVQ R8, 8+z; \
    MULXQ 16+x, AX, R10; ADCXQ AX,  R9; MOVQ $0, R8;\
    MULXQ 24+x, AX, R11; ADCXQ AX, R10; \
    MULXQ 32+x, AX, R12; ADCXQ AX, R11; \
    MULXQ 40+x, AX, R13; ADCXQ AX, R12; \
    MULXQ 48+x, AX, R14; ADCXQ AX, R13; \
    ;;;;;;;;;;;;;;;;;;;; ADCXQ R8, R14; \
    \
    MOVQ  8+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 8+x, R8; \
    ADDQ AX,  R9; MOVQ R9, 16+z; \
    ADCQ CX,  R8; \
    ADCQ $0, R11; \
    ADDQ  8+x,  DX; \
    ADCQ   $0, R15; \
    XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
    MULXQ 16+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 24+z; \
    MULXQ 24+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; MOVQ  $0, R10; \
    MULXQ 32+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R14; ADOXQ CX,  R9; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R10, R9; \
    \
    MOVQ 16+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 16+x, R8; \
    ADDQ AX, R11; MOVQ R11, 32+z; \
    ADCQ CX,  R8; \
    ADCQ $0, R13; \
    ADDQ 16+x,  DX; \
    ADCQ   $0, R15; \
    XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
    MULXQ 24+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 40+z; \
    MULXQ 32+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; MOVQ  $0, R12; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R14; ADOXQ CX,  R9; \
    MULXQ 48+x, AX, CX; ADCXQ AX,  R9; ADOXQ CX, R10; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R11,R10; \
    \
    MOVQ 24+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 24+x, R8; \
    ADDQ AX, R13; MOVQ R13, 48+z; \
    ADCQ CX,  R8; \
    ADCQ $0,  R9; \
    ADDQ 24+x,  DX; \
    ADCQ   $0, R15; \
    XORL R13, R13; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R14; \
    MULXQ 32+x, AX, CX; ADCXQ AX, R14; ADOXQ CX,  R9; MOVQ R14, 56+z; \
    MULXQ 40+x, AX, CX; ADCXQ AX,  R9; ADOXQ CX, R10; MOVQ  $0, R14; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R12,R11; \
    \
    MOVQ 32+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 32+x, R8; \
    ADDQ AX,  R9; MOVQ R9, 64+z; \
    ADCQ CX,  R8; \
    ADCQ $0, R11; \
    ADDQ 32+x,  DX; \
    ADCQ   $0, R15; \
    XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 72+z; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R13,R12; \
    \
    MOVQ 40+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 40+x, R8; \
    ADDQ AX, R11; MOVQ R11, 80+z; \
    ADCQ CX,  R8; \
    ADCQ $0, R13; \
    ADDQ 40+x,  DX; \
    ADCQ   $0, R15; \
    XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 88+z; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R14,R13; \
    \
    MOVQ 48+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 48+x, R8; \
    XORL R10, R10; ;;;;;;;;;;;;;; ADOXQ CX, R14; \
    ;;;;;;;;;;;;;; ADCXQ AX, R13; ;;;;;;;;;;;;;; MOVQ R13, 96+z; \
    ;;;;;;;;;;;;;; ADCXQ R8, R14; MOVQ R14, 104+z;

// reduceFromDoubleLeg finds z congruent to x modulo p such that z < 2^448
// and stores it in z
// Uses: AX, R8-R15, FLAGS
// Instr: x86_64
#define reduceFromDoubleLeg(z,x) \
    /* (   ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
    /* (r14, r13, r12, r11,     r10,r9,r8,r15) */ \
    MOVQ 80+x,AX; MOVQ AX,R10; \
    MOVQ $0xFFFFFFFF00000000, R8; \
    ANDQ R8,R10; \
    \
    MOVQ $0,R14; \
    MOVQ 104+x,R13; SHLQ $1,R13,R14; \
    MOVQ  96+x,R12; SHLQ $1,R12,R13; \
    MOVQ  88+x,R11; SHLQ $1,R11,R12; \
    MOVQ  72+x, R9; SHLQ $1,R10,R11; \
    MOVQ  64+x, R8; SHLQ $1,R10; \
    MOVQ $0xFFFFFFFF,R15; ANDQ R15,AX; ORQ AX,R10; \
    MOVQ  56+x,R15; \
    \
    ADDQ  0+x,R15; MOVQ R15, 0+z; MOVQ  56+x,R15; \
    ADCQ  8+x, R8; MOVQ  R8, 8+z; MOVQ  64+x, R8; \
    ADCQ 16+x, R9; MOVQ  R9,16+z; MOVQ  72+x, R9; \
    ADCQ 24+x,R10; MOVQ R10,24+z; MOVQ  80+x,R10; \
    ADCQ 32+x,R11; MOVQ R11,32+z; MOVQ  88+x,R11; \
    ADCQ 40+x,R12; MOVQ R12,40+z; MOVQ  96+x,R12; \
    ADCQ 48+x,R13; MOVQ R13,48+z; MOVQ 104+x,R13; \
    ADCQ   $0,R14; \
    /* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
    /* (   r9,  r8, r15,  r13,   r12,   r11,   r10) */ \
    MOVQ R10, AX; \
    SHRQ $32,R11,R10; \
    SHRQ $32,R12,R11; \
    SHRQ $32,R13,R12; \
    SHRQ $32,R15,R13; \
    SHRQ $32, R8,R15; \
    SHRQ $32, R9, R8; \
    SHRQ $32, AX, R9; \
    \
    ADDQ  0+z,R10; \
    ADCQ  8+z,R11; \
    ADCQ 16+z,R12; \
    ADCQ 24+z,R13; \
    ADCQ 32+z,R15; \
    ADCQ 40+z, R8; \
    ADCQ 48+z, R9; \
    ADCQ   $0,R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    ADDQ R14,R10; MOVQ  $0,R14; \
    ADCQ  $0,R11; \
    ADCQ  $0,R12; \
    ADCQ  AX,R13; \
    ADCQ  $0,R15; \
    ADCQ  $0, R8; \
    ADCQ  $0, R9; \
    ADCQ  $0,R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32,AX; \
    ADDQ R14,R10; MOVQ R10, 0+z; \
    ADCQ  $0,R11; MOVQ R11, 8+z; \
    ADCQ  $0,R12; MOVQ R12,16+z; \
    ADCQ  AX,R13; MOVQ R13,24+z; \
    ADCQ  $0,R15; MOVQ R15,32+z; \
    ADCQ  $0, R8; MOVQ  R8,40+z; \
    ADCQ  $0, R9; MOVQ  R9,48+z;
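
// Reduction uses 2^448 = 2^224 + 1 (mod p): the upper seven limbs are
// folded back as hi + (hi << 224), the few bits spilling above 2^448
// are folded once more, and the final two passes are the single-carry
// fold shown after additionLeg. A first-fold sketch in Go
// (little-endian limbs, t zeroed on entry; the macro's exact 32-bit
// word scheduling is not mirrored):
//
//    func foldOnce(t *[11]uint64, c *[14]uint64) {
//        copy(t[:7], c[:7])
//        var cr uint64
//        for i := 0; i < 7; i++ { // += hi (the "+1" term)
//            t[i], cr = bits.Add64(t[i], c[7+i], cr)
//        }
//        t[7] = cr
//        cr = 0
//        for i := 0; i < 7; i++ { // += hi<<224, low 32-bit halves
//            t[3+i], cr = bits.Add64(t[3+i], c[7+i]<<32, cr)
//        }
//        t[10] += cr
//        cr = 0
//        for i := 0; i < 7; i++ { // += hi<<224, high 32-bit halves
//            t[4+i], cr = bits.Add64(t[4+i], c[7+i]>>32, cr)
//        }
//    }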

// reduceFromDoubleAdx finds z congruent to x modulo p such that z < 2^448
// and stores it in z
// Uses: AX, R8-R15, FLAGS
// Instr: x86_64, adx
#define reduceFromDoubleAdx(z,x) \
    /* (   ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
    /* (r14, r13, r12, r11,     r10,r9,r8,r15) */ \
    MOVQ 80+x,AX; MOVQ AX,R10; \
    MOVQ $0xFFFFFFFF00000000, R8; \
    ANDQ R8,R10; \
    \
    MOVQ $0,R14; \
    MOVQ 104+x,R13; SHLQ $1,R13,R14; \
    MOVQ  96+x,R12; SHLQ $1,R12,R13; \
    MOVQ  88+x,R11; SHLQ $1,R11,R12; \
    MOVQ  72+x, R9; SHLQ $1,R10,R11; \
    MOVQ  64+x, R8; SHLQ $1,R10; \
    MOVQ $0xFFFFFFFF,R15; ANDQ R15,AX; ORQ AX,R10; \
    MOVQ  56+x,R15; \
    \
    XORL AX,AX; \
    ADCXQ  0+x,R15; MOVQ R15, 0+z; MOVQ  56+x,R15; \
    ADCXQ  8+x, R8; MOVQ  R8, 8+z; MOVQ  64+x, R8; \
    ADCXQ 16+x, R9; MOVQ  R9,16+z; MOVQ  72+x, R9; \
    ADCXQ 24+x,R10; MOVQ R10,24+z; MOVQ  80+x,R10; \
    ADCXQ 32+x,R11; MOVQ R11,32+z; MOVQ  88+x,R11; \
    ADCXQ 40+x,R12; MOVQ R12,40+z; MOVQ  96+x,R12; \
    ADCXQ 48+x,R13; MOVQ R13,48+z; MOVQ 104+x,R13; \
    ADCXQ   AX,R14; \
    /* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
    /* (   r9,  r8, r15,  r13,   r12,   r11,   r10) */ \
    MOVQ R10, AX; \
    SHRQ $32,R11,R10; \
    SHRQ $32,R12,R11; \
    SHRQ $32,R13,R12; \
    SHRQ $32,R15,R13; \
    SHRQ $32, R8,R15; \
    SHRQ $32, R9, R8; \
    SHRQ $32, AX, R9; \
    \
    XORL AX,AX; \
    ADCXQ  0+z,R10; \
    ADCXQ  8+z,R11; \
    ADCXQ 16+z,R12; \
    ADCXQ 24+z,R13; \
    ADCXQ 32+z,R15; \
    ADCXQ 40+z, R8; \
    ADCXQ 48+z, R9; \
    ADCXQ   AX,R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    CLC; \
    ADCXQ R14,R10; MOVQ $0,R14; \
    ADCXQ R14,R11; \
    ADCXQ R14,R12; \
    ADCXQ  AX,R13; \
    ADCXQ R14,R15; \
    ADCXQ R14, R8; \
    ADCXQ R14, R9; \
    ADCXQ R14,R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    CLC; \
    ADCXQ R14,R10; MOVQ R10, 0+z; MOVQ $0,R14; \
    ADCXQ R14,R11; MOVQ R11, 8+z; \
    ADCXQ R14,R12; MOVQ R12,16+z; \
    ADCXQ  AX,R13; MOVQ R13,24+z; \
    ADCXQ R14,R15; MOVQ R15,32+z; \
    ADCXQ R14, R8; MOVQ  R8,40+z; \
    ADCXQ R14, R9; MOVQ  R9,48+z;

// addSub calculates two operations: x,y = x+y,x-y
// Uses: AX, DX, R8-R14, FLAGS
#define addSub(x,y) \
    MOVQ  0+x,  R8;  ADDQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCQ 24+y, R11; \
    MOVQ 32+x, R12;  ADCQ 32+y, R12; \
    MOVQ 40+x, R13;  ADCQ 40+y, R13; \
    MOVQ 48+x, R14;  ADCQ 48+y, R14; \
    MOVQ   $0,  AX;  ADCQ   $0,  AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    ADDQ AX,  R8; MOVQ  $0, AX; \
    ADCQ $0,  R9; \
    ADCQ $0, R10; \
    ADCQ DX, R11; \
    ADCQ $0, R12; \
    ADCQ $0, R13; \
    ADCQ $0, R14; \
    ADCQ $0,  AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    ADDQ AX,  R8;  MOVQ  0+x,AX; MOVQ  R8,  0+x; MOVQ AX,  R8; \
    ADCQ $0,  R9;  MOVQ  8+x,AX; MOVQ  R9,  8+x; MOVQ AX,  R9; \
    ADCQ $0, R10;  MOVQ 16+x,AX; MOVQ R10, 16+x; MOVQ AX, R10; \
    ADCQ DX, R11;  MOVQ 24+x,AX; MOVQ R11, 24+x; MOVQ AX, R11; \
    ADCQ $0, R12;  MOVQ 32+x,AX; MOVQ R12, 32+x; MOVQ AX, R12; \
    ADCQ $0, R13;  MOVQ 40+x,AX; MOVQ R13, 40+x; MOVQ AX, R13; \
    ADCQ $0, R14;  MOVQ 48+x,AX; MOVQ R14, 48+x; MOVQ AX, R14; \
    SUBQ  0+y,  R8; \
    SBBQ  8+y,  R9; \
    SBBQ 16+y, R10; \
    SBBQ 24+y, R11; \
    SBBQ 32+y, R12; \
    SBBQ 40+y, R13; \
    SBBQ 48+y, R14; \
    MOVQ   $0,  AX;  SETCS AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    SUBQ AX,  R8; MOVQ  $0, AX; \
    SBBQ $0,  R9; \
    SBBQ $0, R10; \
    SBBQ DX, R11; \
    SBBQ $0, R12; \
    SBBQ $0, R13; \
    SBBQ $0, R14; \
    SETCS AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    SUBQ AX,  R8;  MOVQ  R8,  0+y; \
    SBBQ $0,  R9;  MOVQ  R9,  8+y; \
    SBBQ $0, R10;  MOVQ R10, 16+y; \
    SBBQ DX, R11;  MOVQ R11, 24+y; \
    SBBQ $0, R12;  MOVQ R12, 32+y; \
    SBBQ $0, R13;  MOVQ R13, 40+y; \
    SBBQ $0, R14;  MOVQ R14, 48+y;
