// File: curve_amd64.s
//
// Documentation: github.com/cloudflare/circl/dh/x25519

     1// +build amd64
     2
     3#include "textflag.h"
     4
     5// Depends on circl/math/fp25519 package
     6#include "../../math/fp25519/fp_amd64.h"
     7#include "curve_amd64.h"
     8
     9// CTE_A24 is (A+2)/4 from Curve25519
    10#define CTE_A24 121666
    11
    12#define Size 32
    13
    14// multiplyA24Leg multiplies x times CTE_A24 and stores in z
    15// Uses: AX, DX, R8-R13, FLAGS
    16// Instr: x86_64, cmov
    17#define multiplyA24Leg(z,x) \
    18    MOVL $CTE_A24, AX; MULQ  0+x; MOVQ AX,  R8; MOVQ DX,  R9; \
    19    MOVL $CTE_A24, AX; MULQ  8+x; MOVQ AX, R12; MOVQ DX, R10; \
    20    MOVL $CTE_A24, AX; MULQ 16+x; MOVQ AX, R13; MOVQ DX, R11; \
    21    MOVL $CTE_A24, AX; MULQ 24+x; \
    22    ADDQ R12,  R9; \
    23    ADCQ R13, R10; \
    24    ADCQ  AX, R11; \
    25    ADCQ  $0,  DX; \
    26    MOVL $38,  AX; /* 2*C = 38 = 2^256 MOD 2^255-19*/ \
    27    IMULQ AX, DX; \
    28    ADDQ DX, R8; \
    29    ADCQ $0,  R9;  MOVQ  R9,  8+z; \
    30    ADCQ $0, R10;  MOVQ R10, 16+z; \
    31    ADCQ $0, R11;  MOVQ R11, 24+z; \
    32    MOVQ $0, DX; \
    33    CMOVQCS AX, DX; \
    34    ADDQ DX, R8;  MOVQ  R8,   0+z;
    35
    36// multiplyA24Adx multiplies x times CTE_A24 and stores in z
    37// Uses: AX, DX, R8-R12, FLAGS
    38// Instr: x86_64, cmov, bmi2
    39#define multiplyA24Adx(z,x) \
    40    MOVQ  $CTE_A24, DX; \
    41    MULXQ  0+x,  R8, R10; \
    42    MULXQ  8+x,  R9, R11;  ADDQ R10,  R9; \
    43    MULXQ 16+x, R10,  AX;  ADCQ R11, R10; \
    44    MULXQ 24+x, R11, R12;  ADCQ  AX, R11; \
    45    ;;;;;;;;;;;;;;;;;;;;;  ADCQ  $0, R12; \
    46    MOVL $38,  DX; /* 2*C = 38 = 2^256 MOD 2^255-19*/ \
    47    IMULQ DX, R12; \
    48    ADDQ R12, R8; \
    49    ADCQ $0,  R9;  MOVQ  R9,  8+z; \
    50    ADCQ $0, R10;  MOVQ R10, 16+z; \
    51    ADCQ $0, R11;  MOVQ R11, 24+z; \
    52    MOVQ $0, R12; \
    53    CMOVQCS DX, R12; \
    54    ADDQ R12, R8;  MOVQ  R8,  0+z;
    55
    56#define mulA24Legacy \
    57    multiplyA24Leg(0(DI),0(SI))
    58#define mulA24Bmi2Adx \
    59    multiplyA24Adx(0(DI),0(SI))
    60
    61// func mulA24Amd64(z, x *fp255.Elt)
    62TEXT ·mulA24Amd64(SB),NOSPLIT,$0-16
    63    MOVQ z+0(FP), DI
    64    MOVQ x+8(FP), SI
    65    CHECK_BMI2ADX(LMA24, mulA24Legacy, mulA24Bmi2Adx)
    66
    67
    68// func ladderStepAmd64(w *[5]fp255.Elt, b uint)
    69// ladderStepAmd64 calculates a point addition and doubling as follows:
    70// (x2,z2) = 2*(x2,z2) and (x3,z3) = (x2,z2)+(x3,z3) using as a difference (x1,-).
    71//  work  = (x1,x2,z2,x3,z3) are five fp255.Elt of 32 bytes.
    72//  stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
    73//          (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
    74TEXT ·ladderStepAmd64(SB),NOSPLIT,$192-16
    75    // Parameters
    76    #define regWork DI
    77    #define regMove SI
    78    #define x1 0*Size(regWork)
    79    #define x2 1*Size(regWork)
    80    #define z2 2*Size(regWork)
    81    #define x3 3*Size(regWork)
    82    #define z3 4*Size(regWork)
    83    // Local variables
    84    #define t0 0*Size(SP)
    85    #define t1 1*Size(SP)
    86    #define b0 2*Size(SP)
    87    #define b1 4*Size(SP)
    88    MOVQ w+0(FP), regWork
    89    MOVQ b+8(FP), regMove
    90    CHECK_BMI2ADX(LLADSTEP, ladderStepLeg, ladderStepBmi2Adx)
    91    #undef regWork
    92    #undef regMove
    93    #undef x1
    94    #undef x2
    95    #undef z2
    96    #undef x3
    97    #undef z3
    98    #undef t0
    99    #undef t1
   100    #undef b0
   101    #undef b1
   102
   103// func diffAddAmd64(w *[5]fp255.Elt, b uint)
   104// diffAddAmd64 calculates a differential point addition using a precomputed point.
   105// (x1,z1) = (x1,z1)+(mu) using a difference point (x2,z2)
   106//    w    = (mu,x1,z1,x2,z2) are five fp.Elt, and
   107//   stack = (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
   108TEXT ·diffAddAmd64(SB),NOSPLIT,$128-16
   109    // Parameters
   110    #define regWork DI
   111    #define regSwap SI
   112    #define ui 0*Size(regWork)
   113    #define x1 1*Size(regWork)
   114    #define z1 2*Size(regWork)
   115    #define x2 3*Size(regWork)
   116    #define z2 4*Size(regWork)
   117    // Local variables
   118    #define b0 0*Size(SP)
   119    #define b1 2*Size(SP)
   120    MOVQ w+0(FP), regWork
   121    MOVQ b+8(FP), regSwap
   122    cswap(x1,x2,regSwap)
   123    cswap(z1,z2,regSwap)
   124    CHECK_BMI2ADX(LDIFADD, difAddLeg, difAddBmi2Adx)
   125    #undef regWork
   126    #undef regSwap
   127    #undef ui
   128    #undef x1
   129    #undef z1
   130    #undef x2
   131    #undef z2
   132    #undef b0
   133    #undef b1
   134
   135// func doubleAmd64(x, z *fp255.Elt)
   136// doubleAmd64 calculates a point doubling (x1,z1) = 2*(x1,z1).
   137//  stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
   138//          (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
   139TEXT ·doubleAmd64(SB),NOSPLIT,$192-16
   140    // Parameters
   141    #define x1 0(DI)
   142    #define z1 0(SI)
   143    // Local variables
   144    #define t0 0*Size(SP)
   145    #define t1 1*Size(SP)
   146    #define b0 2*Size(SP)
   147    #define b1 4*Size(SP)
   148    MOVQ x+0(FP), DI
   149    MOVQ z+8(FP), SI
   150    CHECK_BMI2ADX(LDOUB,doubleLeg,doubleBmi2Adx)
   151    #undef x1
   152    #undef z1
   153    #undef t0
   154    #undef t1
   155    #undef b0
   156    #undef b1
View as plain text