...

Source file src/github.com/decred/dcrd/dcrec/secp256k1/v4/modnscalar.go

Documentation: github.com/decred/dcrd/dcrec/secp256k1/v4

     1  // Copyright (c) 2020-2023 The Decred developers
     2  // Use of this source code is governed by an ISC
     3  // license that can be found in the LICENSE file.
     4  
     5  package secp256k1
     6  
     7  import (
     8  	"encoding/hex"
     9  	"math/big"
    10  )
    11  
    12  // References:
    13  //   [SECG]: Recommended Elliptic Curve Domain Parameters
    14  //     https://www.secg.org/sec2-v2.pdf
    15  //
    16  //   [HAC]: Handbook of Applied Cryptography Menezes, van Oorschot, Vanstone.
    17  //     http://cacr.uwaterloo.ca/hac/
    18  
    19  // Many elliptic curve operations require working with scalars in a finite field
    20  // characterized by the order of the group underlying the secp256k1 curve.
    21  // Given this precision is larger than the biggest available native type,
    22  // obviously some form of bignum math is needed.  This code implements
    23  // specialized fixed-precision field arithmetic rather than relying on an
    24  // arbitrary-precision arithmetic package such as math/big for dealing with the
    25  // math modulo the group order since the size is known.  As a result, rather
    26  // large performance gains are achieved by taking advantage of many
    27  // optimizations not available to arbitrary-precision arithmetic and generic
    28  // modular arithmetic algorithms.
    29  //
    30  // There are various ways to internally represent each element.  For example,
    31  // the most obvious representation would be to use an array of 4 uint64s (64
    32  // bits * 4 = 256 bits).  However, that representation suffers from the fact
    33  // that there is no native Go type large enough to handle the intermediate
    34  // results while adding or multiplying two 64-bit numbers.
    35  //
    36  // Given the above, this implementation represents the field elements as 8
    37  // uint32s with each word (array entry) treated as base 2^32.  This was chosen
    38  // because most systems at the current time are 64-bit (or at least have 64-bit
    39  // registers available for specialized purposes such as MMX) so the intermediate
    40  // results can typically be done using a native register (and using uint64s to
    41  // avoid the need for additional half-word arithmetic).
    42  
    43  const (
    44  	// These fields provide convenient access to each of the words of the
    45  	// secp256k1 curve group order N to improve code readability.
    46  	//
    47  	// The group order of the curve per [SECG] is:
    48  	// 0xffffffff ffffffff ffffffff fffffffe baaedce6 af48a03b bfd25e8c d0364141
    49  	//
    50  	// nolint: dupword
    51  	orderWordZero  uint32 = 0xd0364141
    52  	orderWordOne   uint32 = 0xbfd25e8c
    53  	orderWordTwo   uint32 = 0xaf48a03b
    54  	orderWordThree uint32 = 0xbaaedce6
    55  	orderWordFour  uint32 = 0xfffffffe
    56  	orderWordFive  uint32 = 0xffffffff
    57  	orderWordSix   uint32 = 0xffffffff
    58  	orderWordSeven uint32 = 0xffffffff
    59  
    60  	// These fields provide convenient access to each of the words of the two's
    61  	// complement of the secp256k1 curve group order N to improve code
    62  	// readability.
    63  	//
    64  	// The two's complement of the group order is:
    65  	// 0x00000000 00000000 00000000 00000001 45512319 50b75fc4 402da173 2fc9bebf
    66  	orderComplementWordZero  uint32 = (^orderWordZero) + 1
    67  	orderComplementWordOne   uint32 = ^orderWordOne
    68  	orderComplementWordTwo   uint32 = ^orderWordTwo
    69  	orderComplementWordThree uint32 = ^orderWordThree
    70  	//orderComplementWordFour  uint32 = ^orderWordFour  // unused
    71  	//orderComplementWordFive  uint32 = ^orderWordFive  // unused
    72  	//orderComplementWordSix   uint32 = ^orderWordSix   // unused
    73  	//orderComplementWordSeven uint32 = ^orderWordSeven // unused
    74  
    75  	// These fields provide convenient access to each of the words of the
    76  	// secp256k1 curve group order N / 2 to improve code readability and avoid
    77  	// the need to recalculate them.
    78  	//
    79  	// The half order of the secp256k1 curve group is:
    80  	// 0x7fffffff ffffffff ffffffff ffffffff 5d576e73 57a4501d dfe92f46 681b20a0
    81  	//
    82  	// nolint: dupword
    83  	halfOrderWordZero  uint32 = 0x681b20a0
    84  	halfOrderWordOne   uint32 = 0xdfe92f46
    85  	halfOrderWordTwo   uint32 = 0x57a4501d
    86  	halfOrderWordThree uint32 = 0x5d576e73
    87  	halfOrderWordFour  uint32 = 0xffffffff
    88  	halfOrderWordFive  uint32 = 0xffffffff
    89  	halfOrderWordSix   uint32 = 0xffffffff
    90  	halfOrderWordSeven uint32 = 0x7fffffff
    91  
    92  	// uint32Mask is simply a mask with all bits set for a uint32 and is used to
    93  	// improve the readability of the code.
    94  	uint32Mask = 0xffffffff
    95  )
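
The word constants above are easy to misread, so a quick sanity check can be
useful.  The following editor-added sketch (not part of the upstream file, and
assuming placement in this package since the constants are unexported)
reassembles the eight 32-bit order words with math/big and compares the result
against the group order from [SECG]:

	// checkOrderWords reassembles the unexported order words into a big.Int
	// and reports whether they match the documented group order N.
	func checkOrderWords() bool {
		words := []uint32{orderWordSeven, orderWordSix, orderWordFive,
			orderWordFour, orderWordThree, orderWordTwo, orderWordOne,
			orderWordZero}
		n := new(big.Int)
		for _, w := range words {
			n.Lsh(n, 32)                               // make room for the next base-2^32 digit
			n.Or(n, new(big.Int).SetUint64(uint64(w))) // append the word
		}
		want, _ := new(big.Int).SetString(
			"fffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141", 16)
		return n.Cmp(want) == 0 // true
	}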
    96  
    97  var (
    98  	// zero32 is an array of 32 bytes used for the purposes of zeroing and is
    99  	// defined here to avoid extra allocations.
   100  	zero32 = [32]byte{}
   101  )
   102  
   103  // ModNScalar implements optimized 256-bit constant-time fixed-precision
   104  // arithmetic over the secp256k1 group order. This means all arithmetic is
   105  // performed modulo:
   106  //
   107  //	0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141
   108  //
   109  // It only implements the arithmetic needed for elliptic curve operations;
   110  // however, the operations that are not implemented can typically be worked
   111  // around if absolutely needed.  For example, subtraction can be performed by
   112  // adding the negation.
   113  //
   114  // Should it be absolutely necessary, conversion to the standard library
   115  // math/big.Int can be accomplished by using the Bytes method, slicing the
   116  // resulting fixed-size array, and feeding it to big.Int.SetBytes.  However,
   117  // that should typically be avoided when possible as conversion to big.Ints
   118  // requires allocations, is not constant time, and is slower when working modulo
   119  // the group order.
   120  type ModNScalar struct {
   121  	// The scalar is represented as 8 32-bit integers in base 2^32.
   122  	//
   123  	// The following depicts the internal representation:
   124  	// 	 ---------------------------------------------------------
   125  	// 	|       n[7]     |      n[6]      | ... |      n[0]      |
   126  	// 	| 32 bits        | 32 bits        | ... | 32 bits        |
   127  	// 	| Mult: 2^(32*7) | Mult: 2^(32*6) | ... | Mult: 2^(32*0) |
   128  	// 	 ---------------------------------------------------------
   129  	//
   130  	// For example, consider the number 2^87 + 2^42 + 1.  It would be
   131  	// represented as:
   132  	// 	n[0] = 1
   133  	// 	n[1] = 2^10
   134  	// 	n[2] = 2^23
   135  	// 	n[3..7] = 0
   136  	//
   137  	// The full 256-bit value is then calculated by looping i from 7..0 and
   138  	// doing sum(n[i] * 2^(32i)) like so:
   139  	// 	n[7] * 2^(32*7) = 0    * 2^224 = 0
   140  	// 	n[6] * 2^(32*6) = 0    * 2^192 = 0
   141  	// 	...
   142  	// 	n[2] * 2^(32*2) = 2^23 * 2^64  = 2^87
   143  	// 	n[1] * 2^(32*1) = 2^10 * 2^32  = 2^42
   144  	// 	n[0] * 2^(32*0) = 1    * 2^0   = 1
   145  	// 	Sum: 0 + 0 + ... + 2^87 + 2^42 + 1 = 2^87 + 2^42 + 1
   146  	n [8]uint32
   147  }
   148  
   149  // String returns the scalar as a human-readable hex string.
   150  //
   151  // This is NOT constant time.
   152  func (s ModNScalar) String() string {
   153  	b := s.Bytes()
   154  	return hex.EncodeToString(b[:])
   155  }
   156  
   157  // Set sets the scalar equal to a copy of the passed one in constant time.
   158  //
   159  // The scalar is returned to support chaining.  This enables syntax like:
   160  // s := new(ModNScalar).Set(s2).Add(s3) so that s = s2 + s3 where s2 and s3
   161  // are not modified.
   162  func (s *ModNScalar) Set(val *ModNScalar) *ModNScalar {
   163  	*s = *val
   164  	return s
   165  }
   166  
   167  // Zero sets the scalar to zero in constant time.  A newly created scalar is
   168  // already set to zero.  This function can be useful to clear an existing scalar
   169  // for reuse.
   170  func (s *ModNScalar) Zero() {
   171  	s.n[0] = 0
   172  	s.n[1] = 0
   173  	s.n[2] = 0
   174  	s.n[3] = 0
   175  	s.n[4] = 0
   176  	s.n[5] = 0
   177  	s.n[6] = 0
   178  	s.n[7] = 0
   179  }
   180  
   181  // IsZeroBit returns 1 when the scalar is equal to zero or 0 otherwise in
   182  // constant time.
   183  //
   184  // Note that a bool is not used here because it is not possible in Go to convert
   185  // from a bool to a numeric value in constant time and many constant-time
   186  // operations require a numeric value.  See IsZero for the version that returns
   187  // a bool.
   188  func (s *ModNScalar) IsZeroBit() uint32 {
   189  	// The scalar can only be zero if no bits are set in any of the words.
   190  	bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7]
   191  	return constantTimeEq(bits, 0)
   192  }
   193  
   194  // IsZero returns whether or not the scalar is equal to zero in constant time.
   195  func (s *ModNScalar) IsZero() bool {
   196  	// The scalar can only be zero if no bits are set in any of the words.
   197  	bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7]
   198  	return bits == 0
   199  }
   200  
   201  // SetInt sets the scalar to the passed integer in constant time.  This is a
   202  // convenience function since it is fairly common to perform some arithmetic
   203  // with small native integers.
   204  //
   205  // The scalar is returned to support chaining.  This enables syntax like:
   206  // s := new(ModNScalar).SetInt(2).Mul(s2) so that s = 2 * s2.
   207  func (s *ModNScalar) SetInt(ui uint32) *ModNScalar {
   208  	s.Zero()
   209  	s.n[0] = ui
   210  	return s
   211  }
   212  
   213  // constantTimeEq returns 1 if a == b or 0 otherwise in constant time.
   214  func constantTimeEq(a, b uint32) uint32 {
   215  	return uint32((uint64(a^b) - 1) >> 63)
   216  }
   217  
   218  // constantTimeNotEq returns 1 if a != b or 0 otherwise in constant time.
   219  func constantTimeNotEq(a, b uint32) uint32 {
   220  	return ^uint32((uint64(a^b)-1)>>63) & 1
   221  }
   222  
   223  // constantTimeLess returns 1 if a < b or 0 otherwise in constant time.
   224  func constantTimeLess(a, b uint32) uint32 {
   225  	return uint32((uint64(a) - uint64(b)) >> 63)
   226  }
   227  
   228  // constantTimeLessOrEq returns 1 if a <= b or 0 otherwise in constant time.
   229  func constantTimeLessOrEq(a, b uint32) uint32 {
   230  	return uint32((uint64(a) - uint64(b) - 1) >> 63)
   231  }
   232  
   233  // constantTimeGreater returns 1 if a > b or 0 otherwise in constant time.
   234  func constantTimeGreater(a, b uint32) uint32 {
   235  	return constantTimeLess(b, a)
   236  }
   237  
   238  // constantTimeGreaterOrEq returns 1 if a >= b or 0 otherwise in constant time.
   239  func constantTimeGreaterOrEq(a, b uint32) uint32 {
   240  	return constantTimeLessOrEq(b, a)
   241  }
   242  
   243  // constantTimeMin returns min(a,b) in constant time.
   244  func constantTimeMin(a, b uint32) uint32 {
   245  	return b ^ ((a ^ b) & -constantTimeLess(a, b))
   246  }
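
These helpers work by pushing the condition of interest into bit 63 of a 64-bit
intermediate value.  For example, in constantTimeEq, a^b is 0 exactly when
a == b, and subtracting 1 from 0 wraps around to 2^64-1 with bit 63 set, while
any nonzero a^b is below 2^32, so the subtraction can never reach bit 63.  A
small editor-added sketch exercising the helpers (it assumes placement in this
package since they are unexported):

	// demoConstantTimeHelpers spot checks the branchless comparison helpers.
	func demoConstantTimeHelpers() bool {
		ok := constantTimeEq(5, 5) == 1 && constantTimeEq(5, 6) == 0
		ok = ok && constantTimeLess(3, 7) == 1 && constantTimeLess(7, 3) == 0
		ok = ok && constantTimeLessOrEq(7, 7) == 1 && constantTimeGreater(8, 7) == 1
		ok = ok && constantTimeGreaterOrEq(7, 8) == 0 && constantTimeMin(9, 4) == 4
		return ok // true
	}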
   247  
   248  // overflows determines if the current scalar is greater than or equal to the
   249  // group order in constant time and returns 1 if it is or 0 otherwise.
   250  func (s *ModNScalar) overflows() uint32 {
   251  	// The intuition here is that the scalar is greater than the group order if
   252  // one of the higher individual words is greater than the corresponding word of
   253  	// the group order and all higher words in the scalar are equal to their
   254  	// corresponding word of the group order.  Since this type is modulo the
   255  	// group order, being equal is also an overflow back to 0.
   256  	//
   257  	// Note that the words 5, 6, and 7 are all the max uint32 value, so there is
   258  // no need to test if those individual words of the scalar exceed them,
   259  	// hence, only equality is checked for them.
   260  	highWordsEqual := constantTimeEq(s.n[7], orderWordSeven)
   261  	highWordsEqual &= constantTimeEq(s.n[6], orderWordSix)
   262  	highWordsEqual &= constantTimeEq(s.n[5], orderWordFive)
   263  	overflow := highWordsEqual & constantTimeGreater(s.n[4], orderWordFour)
   264  	highWordsEqual &= constantTimeEq(s.n[4], orderWordFour)
   265  	overflow |= highWordsEqual & constantTimeGreater(s.n[3], orderWordThree)
   266  	highWordsEqual &= constantTimeEq(s.n[3], orderWordThree)
   267  	overflow |= highWordsEqual & constantTimeGreater(s.n[2], orderWordTwo)
   268  	highWordsEqual &= constantTimeEq(s.n[2], orderWordTwo)
   269  	overflow |= highWordsEqual & constantTimeGreater(s.n[1], orderWordOne)
   270  	highWordsEqual &= constantTimeEq(s.n[1], orderWordOne)
   271  	overflow |= highWordsEqual & constantTimeGreaterOrEq(s.n[0], orderWordZero)
   272  
   273  	return overflow
   274  }
   275  
   276  // reduce256 reduces the current scalar modulo the group order in accordance
   277  // with the overflows parameter in constant time.  The overflows parameter
   278  // specifies whether or not the scalar is known to be greater than or equal to
   279  // the group order and MUST either be 1 in the case it is or 0 in the case it
   280  // is not for a correct result.
   281  func (s *ModNScalar) reduce256(overflows uint32) {
   282  	// Notice that since s < 2^256 < 2N (where N is the group order), the max
   283  	// possible number of reductions required is one.  Therefore, in the case a
   284  	// reduction is needed, it can be performed with a single subtraction of N.
   285  	// Also, recall that subtraction is equivalent to addition by the two's
   286  	// complement while ignoring the carry.
   287  	//
   288  	// When s >= N, the overflows parameter will be 1.  Conversely, it will be 0
   289  	// when s < N.  Thus multiplying by the overflows parameter will either
   290  	// result in 0 or the multiplicand itself.
   291  	//
   292  	// Combining the above along with the fact that s + 0 = s, the following is
   293  	// a constant time implementation that works by either adding 0 or the two's
   294  	// complement of N as needed.
   295  	//
   296  	// The final result will be in the range 0 <= s < N as expected.
   297  	overflows64 := uint64(overflows)
   298  	c := uint64(s.n[0]) + overflows64*uint64(orderComplementWordZero)
   299  	s.n[0] = uint32(c & uint32Mask)
   300  	c = (c >> 32) + uint64(s.n[1]) + overflows64*uint64(orderComplementWordOne)
   301  	s.n[1] = uint32(c & uint32Mask)
   302  	c = (c >> 32) + uint64(s.n[2]) + overflows64*uint64(orderComplementWordTwo)
   303  	s.n[2] = uint32(c & uint32Mask)
   304  	c = (c >> 32) + uint64(s.n[3]) + overflows64*uint64(orderComplementWordThree)
   305  	s.n[3] = uint32(c & uint32Mask)
   306  	c = (c >> 32) + uint64(s.n[4]) + overflows64 // * 1
   307  	s.n[4] = uint32(c & uint32Mask)
   308  	c = (c >> 32) + uint64(s.n[5]) // + overflows64 * 0
   309  	s.n[5] = uint32(c & uint32Mask)
   310  	c = (c >> 32) + uint64(s.n[6]) // + overflows64 * 0
   311  	s.n[6] = uint32(c & uint32Mask)
   312  	c = (c >> 32) + uint64(s.n[7]) // + overflows64 * 0
   313  	s.n[7] = uint32(c & uint32Mask)
   314  }
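
The correctness of this approach rests on the identity that, working modulo
2^256, adding the two's complement of N is the same as subtracting N.  The
following editor-added sketch verifies that identity with math/big for a value
just over N (all names below are illustrative only):

	// checkComplementIdentity reports whether subtracting N directly and
	// adding its two's complement (mod 2^256) give the same result.
	func checkComplementIdentity() bool {
		two256 := new(big.Int).Lsh(big.NewInt(1), 256)
		n, _ := new(big.Int).SetString(
			"fffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141", 16)
		x := new(big.Int).Add(n, big.NewInt(1)) // N+1, so a reduction is needed

		// x - N computed directly.
		direct := new(big.Int).Mod(new(big.Int).Sub(x, n), two256)

		// x + (2^256 - N) with any carry out of bit 255 discarded.
		complement := new(big.Int).Sub(two256, n)
		viaAdd := new(big.Int).Mod(new(big.Int).Add(x, complement), two256)

		return direct.Cmp(viaAdd) == 0 // true
	}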
   315  
   316  // SetBytes interprets the provided array as a 256-bit big-endian unsigned
   317  // integer, reduces it modulo the group order, sets the scalar to the result,
   318  // and returns either 1 if it was reduced (aka it overflowed) or 0 otherwise in
   319  // constant time.
   320  //
   321  // Note that a bool is not used here because it is not possible in Go to convert
   322  // from a bool to a numeric value in constant time and many constant-time
   323  // operations require a numeric value.
   324  func (s *ModNScalar) SetBytes(b *[32]byte) uint32 {
   325  	// Pack the 256 total bits across the 8 uint32 words.  This could be done
   326  	// with a for loop, but benchmarks show this unrolled version is about 2
   327  	// times faster than the variant that uses a loop.
   328  	s.n[0] = uint32(b[31]) | uint32(b[30])<<8 | uint32(b[29])<<16 | uint32(b[28])<<24
   329  	s.n[1] = uint32(b[27]) | uint32(b[26])<<8 | uint32(b[25])<<16 | uint32(b[24])<<24
   330  	s.n[2] = uint32(b[23]) | uint32(b[22])<<8 | uint32(b[21])<<16 | uint32(b[20])<<24
   331  	s.n[3] = uint32(b[19]) | uint32(b[18])<<8 | uint32(b[17])<<16 | uint32(b[16])<<24
   332  	s.n[4] = uint32(b[15]) | uint32(b[14])<<8 | uint32(b[13])<<16 | uint32(b[12])<<24
   333  	s.n[5] = uint32(b[11]) | uint32(b[10])<<8 | uint32(b[9])<<16 | uint32(b[8])<<24
   334  	s.n[6] = uint32(b[7]) | uint32(b[6])<<8 | uint32(b[5])<<16 | uint32(b[4])<<24
   335  	s.n[7] = uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
   336  
   337  	// The value might be >= N, so reduce it as required and return whether or
   338  	// not it was reduced.
   339  	needsReduce := s.overflows()
   340  	s.reduce256(needsReduce)
   341  	return needsReduce
   342  }
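
As an editor-added illustration of the overflow semantics (relying only on the
encoding/hex import this file already has): setting the scalar to the group
order itself reports an overflow of 1 and leaves the scalar at zero, since
N mod N = 0.

	// demoSetBytesOverflow returns the overflow flag and whether the scalar
	// ended up zero after setting it to N.
	func demoSetBytesOverflow() (uint32, bool) {
		nBytes, _ := hex.DecodeString(
			"fffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141")
		var b [32]byte
		copy(b[:], nBytes)

		var s ModNScalar
		overflow := s.SetBytes(&b)  // 1, since the value is >= N
		return overflow, s.IsZero() // 1, true
	}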
   343  
   344  // zeroArray32 zeroes the provided 32-byte buffer.
   345  func zeroArray32(b *[32]byte) {
   346  	copy(b[:], zero32[:])
   347  }
   348  
   349  // SetByteSlice interprets the provided slice as a 256-bit big-endian unsigned
   350  // integer (meaning it is truncated to the first 32 bytes), reduces it modulo
   351  // the group order, sets the scalar to the result, and returns whether or not
   352  // the resulting truncated 256-bit integer overflowed in constant time.
   353  //
   354  // Note that since a slice with more than 32 bytes is truncated, it is
   355  // possible that the truncated value is less than the order of the curve and
   356  // hence it will not be reported as having overflowed in that case.  It is up to
   357  // the caller to decide whether it needs to provide numbers of the appropriate
   358  // size or it is acceptable to use this function with the described truncation
   359  // and overflow behavior.
   360  func (s *ModNScalar) SetByteSlice(b []byte) bool {
   361  	var b32 [32]byte
   362  	b = b[:constantTimeMin(uint32(len(b)), 32)]
   363  	copy(b32[:], b32[:32-len(b)])
   364  	copy(b32[32-len(b):], b)
   365  	result := s.SetBytes(&b32)
   366  	zeroArray32(&b32)
   367  	return result != 0
   368  }
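
A short editor-added example of the big-endian interpretation: a slice shorter
than 32 bytes is effectively left-padded with zeros, so an 8-byte input only
occupies the two lowest words.  The direct access to the n array below is only
possible inside this package and is used purely for illustration.

	// demoSetByteSlice reports whether an 8-byte input lands in the two low
	// words as expected and does not overflow.
	func demoSetByteSlice() bool {
		var s ModNScalar
		overflowed := s.SetByteSlice([]byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08})

		var expected ModNScalar
		expected.SetInt(0x05060708) // low 32 bits
		expected.n[1] = 0x01020304  // next 32 bits

		return !overflowed && s.Equals(&expected) // true
	}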
   369  
   370  // PutBytesUnchecked unpacks the scalar to a 32-byte big-endian value directly
   371  // into the passed byte slice in constant time.  The target slice must have at
   372  // least 32 bytes available or it will panic.
   373  //
   374  // There is a similar function, PutBytes, which unpacks the scalar into a
   375  // 32-byte array directly.  This version is provided since it can be useful to
   376  // write directly into part of a larger buffer without needing a separate
   377  // allocation.
   378  //
   379  // Preconditions:
   380  //   - The target slice MUST have at least 32 bytes available
   381  func (s *ModNScalar) PutBytesUnchecked(b []byte) {
   382  	// Unpack the 256 total bits from the 8 uint32 words.  This could be done
   383  	// with a for loop, but benchmarks show this unrolled version is about 2
   384  	// times faster than the variant which uses a loop.
   385  	b[31] = byte(s.n[0])
   386  	b[30] = byte(s.n[0] >> 8)
   387  	b[29] = byte(s.n[0] >> 16)
   388  	b[28] = byte(s.n[0] >> 24)
   389  	b[27] = byte(s.n[1])
   390  	b[26] = byte(s.n[1] >> 8)
   391  	b[25] = byte(s.n[1] >> 16)
   392  	b[24] = byte(s.n[1] >> 24)
   393  	b[23] = byte(s.n[2])
   394  	b[22] = byte(s.n[2] >> 8)
   395  	b[21] = byte(s.n[2] >> 16)
   396  	b[20] = byte(s.n[2] >> 24)
   397  	b[19] = byte(s.n[3])
   398  	b[18] = byte(s.n[3] >> 8)
   399  	b[17] = byte(s.n[3] >> 16)
   400  	b[16] = byte(s.n[3] >> 24)
   401  	b[15] = byte(s.n[4])
   402  	b[14] = byte(s.n[4] >> 8)
   403  	b[13] = byte(s.n[4] >> 16)
   404  	b[12] = byte(s.n[4] >> 24)
   405  	b[11] = byte(s.n[5])
   406  	b[10] = byte(s.n[5] >> 8)
   407  	b[9] = byte(s.n[5] >> 16)
   408  	b[8] = byte(s.n[5] >> 24)
   409  	b[7] = byte(s.n[6])
   410  	b[6] = byte(s.n[6] >> 8)
   411  	b[5] = byte(s.n[6] >> 16)
   412  	b[4] = byte(s.n[6] >> 24)
   413  	b[3] = byte(s.n[7])
   414  	b[2] = byte(s.n[7] >> 8)
   415  	b[1] = byte(s.n[7] >> 16)
   416  	b[0] = byte(s.n[7] >> 24)
   417  }
   418  
   419  // PutBytes unpacks the scalar to a 32-byte big-endian value using the passed
   420  // byte array in constant time.
   421  //
   422  // There is a similar function, PutBytesUnchecked, which unpacks the scalar into
   423  // a slice that must have at least 32 bytes available.  This version is provided
   424  // since it can be useful to write directly into an array that is type checked.
   425  //
   426  // Alternatively, there is also Bytes, which unpacks the scalar into a new array
   427  // and returns that which can sometimes be more ergonomic in applications that
   428  // aren't concerned about an additional copy.
   429  func (s *ModNScalar) PutBytes(b *[32]byte) {
   430  	s.PutBytesUnchecked(b[:])
   431  }
   432  
   433  // Bytes unpacks the scalar to a 32-byte big-endian value in constant time.
   434  //
   435  // See PutBytes and PutBytesUnchecked for variants that allow an array or slice
   436  // to be passed which can be useful to cut down on the number of allocations
   437  // by allowing the caller to reuse a buffer or write directly into part of a
   438  // larger buffer.
   439  func (s *ModNScalar) Bytes() [32]byte {
   440  	var b [32]byte
   441  	s.PutBytesUnchecked(b[:])
   442  	return b
   443  }
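
A brief editor-added sketch of the three unpacking variants; out below is a
hypothetical caller-provided buffer (for instance, part of a larger 64-byte
signature encoding) and must have at least 32 bytes available.

	// demoUnpackVariants shows the serialization options side by side.
	func demoUnpackVariants(s *ModNScalar, out []byte) {
		arr := s.Bytes() // returns a new 32-byte array

		var reuse [32]byte
		s.PutBytes(&reuse) // fills a caller-owned, type-checked array

		// Writes directly into part of a larger caller-provided buffer, which
		// MUST have at least 32 bytes available.
		s.PutBytesUnchecked(out[:32])

		_, _ = arr, reuse
	}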
   444  
   445  // IsOdd returns whether or not the scalar is an odd number in constant time.
   446  func (s *ModNScalar) IsOdd() bool {
   447  	// Only odd numbers have the bottom bit set.
   448  	return s.n[0]&1 == 1
   449  }
   450  
   451  // Equals returns whether or not the two scalars are the same in constant time.
   452  func (s *ModNScalar) Equals(val *ModNScalar) bool {
   453  	// Xor only sets bits when they are different, so the two scalars can only
   454  	// be the same if no bits are set after xoring each word.
   455  	bits := (s.n[0] ^ val.n[0]) | (s.n[1] ^ val.n[1]) | (s.n[2] ^ val.n[2]) |
   456  		(s.n[3] ^ val.n[3]) | (s.n[4] ^ val.n[4]) | (s.n[5] ^ val.n[5]) |
   457  		(s.n[6] ^ val.n[6]) | (s.n[7] ^ val.n[7])
   458  
   459  	return bits == 0
   460  }
   461  
   462  // Add2 adds the passed two scalars together modulo the group order in constant
   463  // time and stores the result in s.
   464  //
   465  // The scalar is returned to support chaining.  This enables syntax like:
   466  // s3.Add2(s, s2).AddInt(1) so that s3 = s + s2 + 1.
   467  func (s *ModNScalar) Add2(val1, val2 *ModNScalar) *ModNScalar {
   468  	c := uint64(val1.n[0]) + uint64(val2.n[0])
   469  	s.n[0] = uint32(c & uint32Mask)
   470  	c = (c >> 32) + uint64(val1.n[1]) + uint64(val2.n[1])
   471  	s.n[1] = uint32(c & uint32Mask)
   472  	c = (c >> 32) + uint64(val1.n[2]) + uint64(val2.n[2])
   473  	s.n[2] = uint32(c & uint32Mask)
   474  	c = (c >> 32) + uint64(val1.n[3]) + uint64(val2.n[3])
   475  	s.n[3] = uint32(c & uint32Mask)
   476  	c = (c >> 32) + uint64(val1.n[4]) + uint64(val2.n[4])
   477  	s.n[4] = uint32(c & uint32Mask)
   478  	c = (c >> 32) + uint64(val1.n[5]) + uint64(val2.n[5])
   479  	s.n[5] = uint32(c & uint32Mask)
   480  	c = (c >> 32) + uint64(val1.n[6]) + uint64(val2.n[6])
   481  	s.n[6] = uint32(c & uint32Mask)
   482  	c = (c >> 32) + uint64(val1.n[7]) + uint64(val2.n[7])
   483  	s.n[7] = uint32(c & uint32Mask)
   484  
   485  	// The result is now 256 bits, but it might still be >= N, so use the
   486  	// existing normal reduce method for 256-bit values.
   487  	s.reduce256(uint32(c>>32) + s.overflows())
   488  	return s
   489  }
   490  
   491  // Add adds the passed scalar to the existing one modulo the group order in
   492  // constant time and stores the result in s.
   493  //
   494  // The scalar is returned to support chaining.  This enables syntax like:
   495  // s.Add(s2).AddInt(1) so that s = s + s2 + 1.
   496  func (s *ModNScalar) Add(val *ModNScalar) *ModNScalar {
   497  	return s.Add2(s, val)
   498  }
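
A non-constant-time cross-check against math/big makes a handy test-only sanity
check for the carry handling above.  This editor-added sketch assumes placement
in this package, where curveParams.N (defined elsewhere in the package) holds
the group order.

	// checkAddAgainstBigInt reports whether Add2 matches big.Int addition mod N.
	func checkAddAgainstBigInt(a, b *ModNScalar) bool {
		var sum ModNScalar
		sum.Add2(a, b)

		aBytes, bBytes := a.Bytes(), b.Bytes()
		want := new(big.Int).Add(new(big.Int).SetBytes(aBytes[:]),
			new(big.Int).SetBytes(bBytes[:]))
		want.Mod(want, curveParams.N)

		sumBytes := sum.Bytes()
		return new(big.Int).SetBytes(sumBytes[:]).Cmp(want) == 0 // true
	}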
   499  
   500  // accumulator96 provides a 96-bit accumulator for use in the intermediate
   501  // calculations requiring more than 64 bits.
   502  type accumulator96 struct {
   503  	n [3]uint32
   504  }
   505  
   506  // Add adds the passed unsigned 64-bit value to the accumulator.
   507  func (a *accumulator96) Add(v uint64) {
   508  	low := uint32(v & uint32Mask)
   509  	hi := uint32(v >> 32)
   510  	a.n[0] += low
   511  	hi += constantTimeLess(a.n[0], low) // Carry if overflow in n[0].
   512  	a.n[1] += hi
   513  	a.n[2] += constantTimeLess(a.n[1], hi) // Carry if overflow in n[1].
   514  }
   515  
   516  // Rsh32 right shifts the accumulator by 32 bits.
   517  func (a *accumulator96) Rsh32() {
   518  	a.n[0] = a.n[1]
   519  	a.n[1] = a.n[2]
   520  	a.n[2] = 0
   521  }
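
The accumulator is needed because a single column of the schoolbook product can
sum up to eight 64-bit products, each as large as (2^32-1)^2, which exceeds 64
bits but fits comfortably in 96.  An editor-added sketch (in-package, since
accumulator96 is unexported) checking that no bits are lost in the worst case:

	// checkAccumulator96 sums eight maximal 64-bit products and compares the
	// 96-bit result against math/big.
	func checkAccumulator96() bool {
		const maxWord uint64 = 0xffffffff
		maxProduct := maxWord * maxWord // largest possible single term

		var acc accumulator96
		want := new(big.Int)
		for i := 0; i < 8; i++ {
			acc.Add(maxProduct)
			want.Add(want, new(big.Int).SetUint64(maxProduct))
		}

		got := new(big.Int).SetUint64(uint64(acc.n[2]))
		got.Lsh(got, 32)
		got.Or(got, new(big.Int).SetUint64(uint64(acc.n[1])))
		got.Lsh(got, 32)
		got.Or(got, new(big.Int).SetUint64(uint64(acc.n[0])))
		return got.Cmp(want) == 0 // true: nothing overflowed
	}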
   522  
   523  // reduce385 reduces the 385-bit intermediate result in the passed terms modulo
   524  // the group order in constant time and stores the result in s.
   525  func (s *ModNScalar) reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12 uint64) {
   526  	// At this point, the intermediate result in the passed terms has been
   527  	// reduced to fit within 385 bits, so reduce it again using the same method
   528  	// described in reduce512.  As before, the intermediate result will end up
   529  	// being reduced by another 127 bits to 258 bits, thus 9 32-bit terms are
   530  	// needed for this iteration.  The reduced terms are assigned back to t0
   531  	// through t8.
   532  	//
   533  	// Note that several of the intermediate calculations require adding 64-bit
   534  	// products together which would overflow a uint64, so a 96-bit accumulator
   535  	// is used instead until the value is reduced enough to use native uint64s.
   536  
   537  	// Terms for 2^(32*0).
   538  	var acc accumulator96
   539  	acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
   540  	acc.Add(t8 * uint64(orderComplementWordZero))
   541  	t0 = uint64(acc.n[0])
   542  	acc.Rsh32()
   543  
   544  	// Terms for 2^(32*1).
   545  	acc.Add(t1)
   546  	acc.Add(t8 * uint64(orderComplementWordOne))
   547  	acc.Add(t9 * uint64(orderComplementWordZero))
   548  	t1 = uint64(acc.n[0])
   549  	acc.Rsh32()
   550  
   551  	// Terms for 2^(32*2).
   552  	acc.Add(t2)
   553  	acc.Add(t8 * uint64(orderComplementWordTwo))
   554  	acc.Add(t9 * uint64(orderComplementWordOne))
   555  	acc.Add(t10 * uint64(orderComplementWordZero))
   556  	t2 = uint64(acc.n[0])
   557  	acc.Rsh32()
   558  
   559  	// Terms for 2^(32*3).
   560  	acc.Add(t3)
   561  	acc.Add(t8 * uint64(orderComplementWordThree))
   562  	acc.Add(t9 * uint64(orderComplementWordTwo))
   563  	acc.Add(t10 * uint64(orderComplementWordOne))
   564  	acc.Add(t11 * uint64(orderComplementWordZero))
   565  	t3 = uint64(acc.n[0])
   566  	acc.Rsh32()
   567  
   568  	// Terms for 2^(32*4).
   569  	acc.Add(t4)
   570  	acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
   571  	acc.Add(t9 * uint64(orderComplementWordThree))
   572  	acc.Add(t10 * uint64(orderComplementWordTwo))
   573  	acc.Add(t11 * uint64(orderComplementWordOne))
   574  	acc.Add(t12 * uint64(orderComplementWordZero))
   575  	t4 = uint64(acc.n[0])
   576  	acc.Rsh32()
   577  
   578  	// Terms for 2^(32*5).
   579  	acc.Add(t5)
   580  	// acc.Add(t8 * uint64(orderComplementWordFive)) // 0
   581  	acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
   582  	acc.Add(t10 * uint64(orderComplementWordThree))
   583  	acc.Add(t11 * uint64(orderComplementWordTwo))
   584  	acc.Add(t12 * uint64(orderComplementWordOne))
   585  	t5 = uint64(acc.n[0])
   586  	acc.Rsh32()
   587  
   588  	// Terms for 2^(32*6).
   589  	acc.Add(t6)
   590  	// acc.Add(t8 * uint64(orderComplementWordSix)) // 0
   591  	// acc.Add(t9 * uint64(orderComplementWordFive)) // 0
   592  	acc.Add(t10) // * uint64(orderComplementWordFour) // * 1
   593  	acc.Add(t11 * uint64(orderComplementWordThree))
   594  	acc.Add(t12 * uint64(orderComplementWordTwo))
   595  	t6 = uint64(acc.n[0])
   596  	acc.Rsh32()
   597  
   598  	// Terms for 2^(32*7).
   599  	acc.Add(t7)
   600  	// acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
   601  	// acc.Add(t9 * uint64(orderComplementWordSix)) // 0
   602  	// acc.Add(t10 * uint64(orderComplementWordFive)) // 0
   603  	acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
   604  	acc.Add(t12 * uint64(orderComplementWordThree))
   605  	t7 = uint64(acc.n[0])
   606  	acc.Rsh32()
   607  
   608  	// Terms for 2^(32*8).
   609  	// acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
   610  	// acc.Add(t10 * uint64(orderComplementWordSix)) // 0
   611  	// acc.Add(t11 * uint64(orderComplementWordFive)) // 0
   612  	acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
   613  	t8 = uint64(acc.n[0])
   614  	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.
   615  
   616  	// NOTE: All of the remaining multiplications for this iteration result in 0
   617  	// as they all involve multiplying by combinations of the fifth, sixth, and
   618  	// seventh words of the two's complement of N, which are 0, so skip them.
   619  
   620  	// At this point, the result is reduced to fit within 258 bits, so reduce it
   621  	// again using a slightly modified version of the same method.  The maximum
   622  	// value in t8 is 2 at this point and therefore multiplying it by each word
   623  	// of the two's complement of N and adding it to a 32-bit term will result
   624  	// in a maximum requirement of 33 bits, so it is safe to use native uint64s
   625  	// here for the intermediate term carry propagation.
   626  	//
   627  	// Also, since the maximum value in t8 is 2, this ends up reducing by
   628  	// another 2 bits to 256 bits.
   629  	c := t0 + t8*uint64(orderComplementWordZero)
   630  	s.n[0] = uint32(c & uint32Mask)
   631  	c = (c >> 32) + t1 + t8*uint64(orderComplementWordOne)
   632  	s.n[1] = uint32(c & uint32Mask)
   633  	c = (c >> 32) + t2 + t8*uint64(orderComplementWordTwo)
   634  	s.n[2] = uint32(c & uint32Mask)
   635  	c = (c >> 32) + t3 + t8*uint64(orderComplementWordThree)
   636  	s.n[3] = uint32(c & uint32Mask)
   637  	c = (c >> 32) + t4 + t8 // * uint64(orderComplementWordFour) == * 1
   638  	s.n[4] = uint32(c & uint32Mask)
   639  	c = (c >> 32) + t5 // + t8*uint64(orderComplementWordFive) == 0
   640  	s.n[5] = uint32(c & uint32Mask)
   641  	c = (c >> 32) + t6 // + t8*uint64(orderComplementWordSix) == 0
   642  	s.n[6] = uint32(c & uint32Mask)
   643  	c = (c >> 32) + t7 // + t8*uint64(orderComplementWordSeven) == 0
   644  	s.n[7] = uint32(c & uint32Mask)
   645  
   646  	// The result is now 256 bits, but it might still be >= N, so use the
   647  	// existing normal reduce method for 256-bit values.
   648  	s.reduce256(uint32(c>>32) + s.overflows())
   649  }
   650  
   651  // reduce512 reduces the 512-bit intermediate result in the passed terms modulo
   652  // the group order in constant time and stores the result in s.
   653  func (s *ModNScalar) reduce512(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15 uint64) {
   654  	// At this point, the intermediate result in the passed terms is grouped
   655  	// into the respective bases.
   656  	//
   657  	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
   658  	// when the modulus is of the special form m = b^t - c, where log_2(c) < t,
   659  	// highly efficient reduction can be achieved per the provided algorithm.
   660  	//
   661  	// The secp256k1 group order fits this criteria since it is:
   662  	//   2^256 - 432420386565659656852420866394968145599
   663  	//
   664  	// Technically the max possible value here is (N-1)^2 since the two scalars
   665  	// being multiplied are always mod N.  Nevertheless, it is safer to consider
   666  // it to be (2^256-1)^2 = 2^512 - 2^257 + 1 since it is the product of two
   667  	// 256-bit values.
   668  	//
   669  	// The algorithm is to reduce the result modulo the prime by subtracting
   670  // multiples of the group order N.  However, in order to simplify carry
   671  // propagation, this instead adds multiples of the two's complement of N to
   672  // achieve the same result.
   673  	//
   674  	// Since the two's complement of N has 127 leading zero bits, this will end
   675  	// up reducing the intermediate result from 512 bits to 385 bits, resulting
   676  	// in 13 32-bit terms.  The reduced terms are assigned back to t0 through
   677  	// t12.
   678  	//
   679  	// Note that several of the intermediate calculations require adding 64-bit
   680  	// products together which would overflow a uint64, so a 96-bit accumulator
   681  	// is used instead.
   682  
   683  	// Terms for 2^(32*0).
   684  	var acc accumulator96
   685  	acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
   686  	acc.Add(t8 * uint64(orderComplementWordZero))
   687  	t0 = uint64(acc.n[0])
   688  	acc.Rsh32()
   689  
   690  	// Terms for 2^(32*1).
   691  	acc.Add(t1)
   692  	acc.Add(t8 * uint64(orderComplementWordOne))
   693  	acc.Add(t9 * uint64(orderComplementWordZero))
   694  	t1 = uint64(acc.n[0])
   695  	acc.Rsh32()
   696  
   697  	// Terms for 2^(32*2).
   698  	acc.Add(t2)
   699  	acc.Add(t8 * uint64(orderComplementWordTwo))
   700  	acc.Add(t9 * uint64(orderComplementWordOne))
   701  	acc.Add(t10 * uint64(orderComplementWordZero))
   702  	t2 = uint64(acc.n[0])
   703  	acc.Rsh32()
   704  
   705  	// Terms for 2^(32*3).
   706  	acc.Add(t3)
   707  	acc.Add(t8 * uint64(orderComplementWordThree))
   708  	acc.Add(t9 * uint64(orderComplementWordTwo))
   709  	acc.Add(t10 * uint64(orderComplementWordOne))
   710  	acc.Add(t11 * uint64(orderComplementWordZero))
   711  	t3 = uint64(acc.n[0])
   712  	acc.Rsh32()
   713  
   714  	// Terms for 2^(32*4).
   715  	acc.Add(t4)
   716  	acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
   717  	acc.Add(t9 * uint64(orderComplementWordThree))
   718  	acc.Add(t10 * uint64(orderComplementWordTwo))
   719  	acc.Add(t11 * uint64(orderComplementWordOne))
   720  	acc.Add(t12 * uint64(orderComplementWordZero))
   721  	t4 = uint64(acc.n[0])
   722  	acc.Rsh32()
   723  
   724  	// Terms for 2^(32*5).
   725  	acc.Add(t5)
   726  	// acc.Add(t8 * uint64(orderComplementWordFive)) // 0
   727  	acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
   728  	acc.Add(t10 * uint64(orderComplementWordThree))
   729  	acc.Add(t11 * uint64(orderComplementWordTwo))
   730  	acc.Add(t12 * uint64(orderComplementWordOne))
   731  	acc.Add(t13 * uint64(orderComplementWordZero))
   732  	t5 = uint64(acc.n[0])
   733  	acc.Rsh32()
   734  
   735  	// Terms for 2^(32*6).
   736  	acc.Add(t6)
   737  	// acc.Add(t8 * uint64(orderComplementWordSix)) // 0
   738  	// acc.Add(t9 * uint64(orderComplementWordFive)) // 0
   739  	acc.Add(t10) // * uint64(orderComplementWordFour)) // * 1
   740  	acc.Add(t11 * uint64(orderComplementWordThree))
   741  	acc.Add(t12 * uint64(orderComplementWordTwo))
   742  	acc.Add(t13 * uint64(orderComplementWordOne))
   743  	acc.Add(t14 * uint64(orderComplementWordZero))
   744  	t6 = uint64(acc.n[0])
   745  	acc.Rsh32()
   746  
   747  	// Terms for 2^(32*7).
   748  	acc.Add(t7)
   749  	// acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
   750  	// acc.Add(t9 * uint64(orderComplementWordSix)) // 0
   751  	// acc.Add(t10 * uint64(orderComplementWordFive)) // 0
   752  	acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
   753  	acc.Add(t12 * uint64(orderComplementWordThree))
   754  	acc.Add(t13 * uint64(orderComplementWordTwo))
   755  	acc.Add(t14 * uint64(orderComplementWordOne))
   756  	acc.Add(t15 * uint64(orderComplementWordZero))
   757  	t7 = uint64(acc.n[0])
   758  	acc.Rsh32()
   759  
   760  	// Terms for 2^(32*8).
   761  	// acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
   762  	// acc.Add(t10 * uint64(orderComplementWordSix)) // 0
   763  	// acc.Add(t11 * uint64(orderComplementWordFive)) // 0
   764  	acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
   765  	acc.Add(t13 * uint64(orderComplementWordThree))
   766  	acc.Add(t14 * uint64(orderComplementWordTwo))
   767  	acc.Add(t15 * uint64(orderComplementWordOne))
   768  	t8 = uint64(acc.n[0])
   769  	acc.Rsh32()
   770  
   771  	// Terms for 2^(32*9).
   772  	// acc.Add(t10 * uint64(orderComplementWordSeven)) // 0
   773  	// acc.Add(t11 * uint64(orderComplementWordSix)) // 0
   774  	// acc.Add(t12 * uint64(orderComplementWordFive)) // 0
   775  	acc.Add(t13) // * uint64(orderComplementWordFour) // * 1
   776  	acc.Add(t14 * uint64(orderComplementWordThree))
   777  	acc.Add(t15 * uint64(orderComplementWordTwo))
   778  	t9 = uint64(acc.n[0])
   779  	acc.Rsh32()
   780  
   781  	// Terms for 2^(32*10).
   782  	// acc.Add(t11 * uint64(orderComplementWordSeven)) // 0
   783  	// acc.Add(t12 * uint64(orderComplementWordSix)) // 0
   784  	// acc.Add(t13 * uint64(orderComplementWordFive)) // 0
   785  	acc.Add(t14) // * uint64(orderComplementWordFour) // * 1
   786  	acc.Add(t15 * uint64(orderComplementWordThree))
   787  	t10 = uint64(acc.n[0])
   788  	acc.Rsh32()
   789  
   790  	// Terms for 2^(32*11).
   791  	// acc.Add(t12 * uint64(orderComplementWordSeven)) // 0
   792  	// acc.Add(t13 * uint64(orderComplementWordSix)) // 0
   793  	// acc.Add(t14 * uint64(orderComplementWordFive)) // 0
   794  	acc.Add(t15) // * uint64(orderComplementWordFour) // * 1
   795  	t11 = uint64(acc.n[0])
   796  	acc.Rsh32()
   797  
   798  	// NOTE: All of the remaining multiplications for this iteration result in 0
   799  	// as they all involve multiplying by combinations of the fifth, sixth, and
   800  	// seventh words of the two's complement of N, which are 0, so skip them.
   801  
   802  	// Terms for 2^(32*12).
   803  	t12 = uint64(acc.n[0])
   804  	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.
   805  
   806  	// At this point, the result is reduced to fit within 385 bits, so reduce it
   807  	// again using the same method accordingly.
   808  	s.reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12)
   809  }
   810  
   811  // Mul2 multiplies the passed two scalars together modulo the group order in
   812  // constant time and stores the result in s.
   813  //
   814  // The scalar is returned to support chaining.  This enables syntax like:
   815  // s3.Mul2(s, s2).AddInt(1) so that s3 = (s * s2) + 1.
   816  func (s *ModNScalar) Mul2(val, val2 *ModNScalar) *ModNScalar {
   817  	// This could be done with for loops and an array to store the intermediate
   818  	// terms, but this unrolled version is significantly faster.
   819  
   820  	// The overall strategy employed here is:
   821  	// 1) Calculate the 512-bit product of the two scalars using the standard
   822  	//    pencil-and-paper method.
   823  	// 2) Reduce the result modulo the prime by effectively subtracting
   824  	//    multiples of the group order N (actually performed by adding multiples
   825  	//    of the two's complement of N to avoid implementing subtraction).
   826  	// 3) Repeat step 2 noting that each iteration reduces the required number
   827  	//    of bits by 127 because the two's complement of N has 127 leading zero
   828  	//    bits.
   829  	// 4) Once reduced to 256 bits, call the existing reduce method to perform
   830  	//    a final reduction as needed.
   831  	//
   832  	// Note that several of the intermediate calculations require adding 64-bit
   833  	// products together which would overflow a uint64, so a 96-bit accumulator
   834  	// is used instead.
   835  
   836  	// Terms for 2^(32*0).
   837  	var acc accumulator96
   838  	acc.Add(uint64(val.n[0]) * uint64(val2.n[0]))
   839  	t0 := uint64(acc.n[0])
   840  	acc.Rsh32()
   841  
   842  	// Terms for 2^(32*1).
   843  	acc.Add(uint64(val.n[0]) * uint64(val2.n[1]))
   844  	acc.Add(uint64(val.n[1]) * uint64(val2.n[0]))
   845  	t1 := uint64(acc.n[0])
   846  	acc.Rsh32()
   847  
   848  	// Terms for 2^(32*2).
   849  	acc.Add(uint64(val.n[0]) * uint64(val2.n[2]))
   850  	acc.Add(uint64(val.n[1]) * uint64(val2.n[1]))
   851  	acc.Add(uint64(val.n[2]) * uint64(val2.n[0]))
   852  	t2 := uint64(acc.n[0])
   853  	acc.Rsh32()
   854  
   855  	// Terms for 2^(32*3).
   856  	acc.Add(uint64(val.n[0]) * uint64(val2.n[3]))
   857  	acc.Add(uint64(val.n[1]) * uint64(val2.n[2]))
   858  	acc.Add(uint64(val.n[2]) * uint64(val2.n[1]))
   859  	acc.Add(uint64(val.n[3]) * uint64(val2.n[0]))
   860  	t3 := uint64(acc.n[0])
   861  	acc.Rsh32()
   862  
   863  	// Terms for 2^(32*4).
   864  	acc.Add(uint64(val.n[0]) * uint64(val2.n[4]))
   865  	acc.Add(uint64(val.n[1]) * uint64(val2.n[3]))
   866  	acc.Add(uint64(val.n[2]) * uint64(val2.n[2]))
   867  	acc.Add(uint64(val.n[3]) * uint64(val2.n[1]))
   868  	acc.Add(uint64(val.n[4]) * uint64(val2.n[0]))
   869  	t4 := uint64(acc.n[0])
   870  	acc.Rsh32()
   871  
   872  	// Terms for 2^(32*5).
   873  	acc.Add(uint64(val.n[0]) * uint64(val2.n[5]))
   874  	acc.Add(uint64(val.n[1]) * uint64(val2.n[4]))
   875  	acc.Add(uint64(val.n[2]) * uint64(val2.n[3]))
   876  	acc.Add(uint64(val.n[3]) * uint64(val2.n[2]))
   877  	acc.Add(uint64(val.n[4]) * uint64(val2.n[1]))
   878  	acc.Add(uint64(val.n[5]) * uint64(val2.n[0]))
   879  	t5 := uint64(acc.n[0])
   880  	acc.Rsh32()
   881  
   882  	// Terms for 2^(32*6).
   883  	acc.Add(uint64(val.n[0]) * uint64(val2.n[6]))
   884  	acc.Add(uint64(val.n[1]) * uint64(val2.n[5]))
   885  	acc.Add(uint64(val.n[2]) * uint64(val2.n[4]))
   886  	acc.Add(uint64(val.n[3]) * uint64(val2.n[3]))
   887  	acc.Add(uint64(val.n[4]) * uint64(val2.n[2]))
   888  	acc.Add(uint64(val.n[5]) * uint64(val2.n[1]))
   889  	acc.Add(uint64(val.n[6]) * uint64(val2.n[0]))
   890  	t6 := uint64(acc.n[0])
   891  	acc.Rsh32()
   892  
   893  	// Terms for 2^(32*7).
   894  	acc.Add(uint64(val.n[0]) * uint64(val2.n[7]))
   895  	acc.Add(uint64(val.n[1]) * uint64(val2.n[6]))
   896  	acc.Add(uint64(val.n[2]) * uint64(val2.n[5]))
   897  	acc.Add(uint64(val.n[3]) * uint64(val2.n[4]))
   898  	acc.Add(uint64(val.n[4]) * uint64(val2.n[3]))
   899  	acc.Add(uint64(val.n[5]) * uint64(val2.n[2]))
   900  	acc.Add(uint64(val.n[6]) * uint64(val2.n[1]))
   901  	acc.Add(uint64(val.n[7]) * uint64(val2.n[0]))
   902  	t7 := uint64(acc.n[0])
   903  	acc.Rsh32()
   904  
   905  	// Terms for 2^(32*8).
   906  	acc.Add(uint64(val.n[1]) * uint64(val2.n[7]))
   907  	acc.Add(uint64(val.n[2]) * uint64(val2.n[6]))
   908  	acc.Add(uint64(val.n[3]) * uint64(val2.n[5]))
   909  	acc.Add(uint64(val.n[4]) * uint64(val2.n[4]))
   910  	acc.Add(uint64(val.n[5]) * uint64(val2.n[3]))
   911  	acc.Add(uint64(val.n[6]) * uint64(val2.n[2]))
   912  	acc.Add(uint64(val.n[7]) * uint64(val2.n[1]))
   913  	t8 := uint64(acc.n[0])
   914  	acc.Rsh32()
   915  
   916  	// Terms for 2^(32*9).
   917  	acc.Add(uint64(val.n[2]) * uint64(val2.n[7]))
   918  	acc.Add(uint64(val.n[3]) * uint64(val2.n[6]))
   919  	acc.Add(uint64(val.n[4]) * uint64(val2.n[5]))
   920  	acc.Add(uint64(val.n[5]) * uint64(val2.n[4]))
   921  	acc.Add(uint64(val.n[6]) * uint64(val2.n[3]))
   922  	acc.Add(uint64(val.n[7]) * uint64(val2.n[2]))
   923  	t9 := uint64(acc.n[0])
   924  	acc.Rsh32()
   925  
   926  	// Terms for 2^(32*10).
   927  	acc.Add(uint64(val.n[3]) * uint64(val2.n[7]))
   928  	acc.Add(uint64(val.n[4]) * uint64(val2.n[6]))
   929  	acc.Add(uint64(val.n[5]) * uint64(val2.n[5]))
   930  	acc.Add(uint64(val.n[6]) * uint64(val2.n[4]))
   931  	acc.Add(uint64(val.n[7]) * uint64(val2.n[3]))
   932  	t10 := uint64(acc.n[0])
   933  	acc.Rsh32()
   934  
   935  	// Terms for 2^(32*11).
   936  	acc.Add(uint64(val.n[4]) * uint64(val2.n[7]))
   937  	acc.Add(uint64(val.n[5]) * uint64(val2.n[6]))
   938  	acc.Add(uint64(val.n[6]) * uint64(val2.n[5]))
   939  	acc.Add(uint64(val.n[7]) * uint64(val2.n[4]))
   940  	t11 := uint64(acc.n[0])
   941  	acc.Rsh32()
   942  
   943  	// Terms for 2^(32*12).
   944  	acc.Add(uint64(val.n[5]) * uint64(val2.n[7]))
   945  	acc.Add(uint64(val.n[6]) * uint64(val2.n[6]))
   946  	acc.Add(uint64(val.n[7]) * uint64(val2.n[5]))
   947  	t12 := uint64(acc.n[0])
   948  	acc.Rsh32()
   949  
   950  	// Terms for 2^(32*13).
   951  	acc.Add(uint64(val.n[6]) * uint64(val2.n[7]))
   952  	acc.Add(uint64(val.n[7]) * uint64(val2.n[6]))
   953  	t13 := uint64(acc.n[0])
   954  	acc.Rsh32()
   955  
   956  	// Terms for 2^(32*14).
   957  	acc.Add(uint64(val.n[7]) * uint64(val2.n[7]))
   958  	t14 := uint64(acc.n[0])
   959  	acc.Rsh32()
   960  
   961  	// What's left is for 2^(32*15).
   962  	t15 := uint64(acc.n[0])
   963  	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.
   964  
   965  	// At this point, all of the terms are grouped into their respective base
   966  	// and occupy up to 512 bits.  Reduce the result accordingly.
   967  	s.reduce512(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14,
   968  		t15)
   969  	return s
   970  }
   971  
   972  // Mul multiplies the passed scalar with the existing one modulo the group order
   973  // in constant time and stores the result in s.
   974  //
   975  // The scalar is returned to support chaining.  This enables syntax like:
   976  // s.Mul(s2).AddInt(1) so that s = (s * s2) + 1.
   977  func (s *ModNScalar) Mul(val *ModNScalar) *ModNScalar {
   978  	return s.Mul2(s, val)
   979  }
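
As with addition, a non-constant-time cross-check against math/big is a useful
editor-added sanity test for the full multiply-and-reduce pipeline (again
assuming in-package placement so curveParams.N is visible):

	// checkMulAgainstBigInt reports whether Mul2 matches big.Int multiplication
	// mod N.
	func checkMulAgainstBigInt(a, b *ModNScalar) bool {
		var prod ModNScalar
		prod.Mul2(a, b)

		aBytes, bBytes := a.Bytes(), b.Bytes()
		want := new(big.Int).Mul(new(big.Int).SetBytes(aBytes[:]),
			new(big.Int).SetBytes(bBytes[:]))
		want.Mod(want, curveParams.N)

		prodBytes := prod.Bytes()
		return new(big.Int).SetBytes(prodBytes[:]).Cmp(want) == 0 // true
	}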
   980  
   981  // SquareVal squares the passed scalar modulo the group order in constant time
   982  // and stores the result in s.
   983  //
   984  // The scalar is returned to support chaining.  This enables syntax like:
   985  // s3.SquareVal(s).Mul(s) so that s3 = s^2 * s = s^3.
   986  func (s *ModNScalar) SquareVal(val *ModNScalar) *ModNScalar {
   987  	// This could technically be optimized slightly to take advantage of the
   988  	// fact that many of the intermediate calculations in squaring are just
   989  	// doubling, however, benchmarking has shown that due to the need to use a
   990  	// 96-bit accumulator, any savings are essentially offset by that and
   991  	// consequently there is no real difference in performance over just
   992  	// multiplying the value by itself to justify the extra code for now.  This
   993  	// can be revisited in the future if it becomes a bottleneck in practice.
   994  
   995  	return s.Mul2(val, val)
   996  }
   997  
   998  // Square squares the scalar modulo the group order in constant time.  The
   999  // existing scalar is modified.
  1000  //
  1001  // The scalar is returned to support chaining.  This enables syntax like:
  1002  // s.Square().Mul(s2) so that s = s^2 * s2.
  1003  func (s *ModNScalar) Square() *ModNScalar {
  1004  	return s.SquareVal(s)
  1005  }
  1006  
  1007  // NegateVal negates the passed scalar modulo the group order and stores the
  1008  // result in s in constant time.
  1009  //
  1010  // The scalar is returned to support chaining.  This enables syntax like:
  1011  // s.NegateVal(s2).AddInt(1) so that s = -s2 + 1.
  1012  func (s *ModNScalar) NegateVal(val *ModNScalar) *ModNScalar {
  1013  	// Since the scalar is already in the range 0 <= val < N, where N is the
  1014  	// group order, negation modulo the group order is just the group order
  1015  	// minus the value.  This implies that the result will always be in the
  1016  	// desired range with the sole exception of 0 because N - 0 = N itself.
  1017  	//
  1018  // Therefore, to avoid the need to reduce the result in every other case while
  1019  // still achieving constant time, this creates a mask that is all 0s when the
  1020  // scalar being negated is 0 and all 1s otherwise, and bitwise ands that mask
  1021  // with each word.
  1022  	//
  1023  	// Finally, to simplify the carry propagation, this adds the two's
  1024  	// complement of the scalar to N in order to achieve the same result.
  1025  	bits := val.n[0] | val.n[1] | val.n[2] | val.n[3] | val.n[4] | val.n[5] |
  1026  		val.n[6] | val.n[7]
  1027  	mask := uint64(uint32Mask * constantTimeNotEq(bits, 0))
  1028  	c := uint64(orderWordZero) + (uint64(^val.n[0]) + 1)
  1029  	s.n[0] = uint32(c & mask)
  1030  	c = (c >> 32) + uint64(orderWordOne) + uint64(^val.n[1])
  1031  	s.n[1] = uint32(c & mask)
  1032  	c = (c >> 32) + uint64(orderWordTwo) + uint64(^val.n[2])
  1033  	s.n[2] = uint32(c & mask)
  1034  	c = (c >> 32) + uint64(orderWordThree) + uint64(^val.n[3])
  1035  	s.n[3] = uint32(c & mask)
  1036  	c = (c >> 32) + uint64(orderWordFour) + uint64(^val.n[4])
  1037  	s.n[4] = uint32(c & mask)
  1038  	c = (c >> 32) + uint64(orderWordFive) + uint64(^val.n[5])
  1039  	s.n[5] = uint32(c & mask)
  1040  	c = (c >> 32) + uint64(orderWordSix) + uint64(^val.n[6])
  1041  	s.n[6] = uint32(c & mask)
  1042  	c = (c >> 32) + uint64(orderWordSeven) + uint64(^val.n[7])
  1043  	s.n[7] = uint32(c & mask)
  1044  	return s
  1045  }
  1046  
  1047  // Negate negates the scalar modulo the group order in constant time.  The
  1048  // existing scalar is modified.
  1049  //
  1050  // The scalar is returned to support chaining.  This enables syntax like:
  1051  // s.Negate().AddInt(1) so that s = -s + 1.
  1052  func (s *ModNScalar) Negate() *ModNScalar {
  1053  	return s.NegateVal(s)
  1054  }
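
A defining property of negation modulo N is that a scalar plus its negation is
zero, which makes for a compact editor-added check using only the exported API:

	// checkNegate reports whether s + (-s) reduces to zero, which must hold
	// for every scalar, including zero itself.
	func checkNegate(s *ModNScalar) bool {
		var neg ModNScalar
		neg.NegateVal(s)
		return new(ModNScalar).Add2(s, &neg).IsZero() // true
	}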
  1055  
  1056  // InverseValNonConst finds the modular multiplicative inverse of the passed
  1057  // scalar and stores the result in s in *non-constant* time.
  1058  //
  1059  // The scalar is returned to support chaining.  This enables syntax like:
  1060  // s3.InverseValNonConst(s1).Mul(s2) so that s3 = s1^-1 * s2.
  1061  func (s *ModNScalar) InverseValNonConst(val *ModNScalar) *ModNScalar {
  1062  	// This is making use of big integers for now.  Ideally it will be replaced
  1063  	// with an implementation that does not depend on big integers.
  1064  	valBytes := val.Bytes()
  1065  	bigVal := new(big.Int).SetBytes(valBytes[:])
  1066  	bigVal.ModInverse(bigVal, curveParams.N)
  1067  	s.SetByteSlice(bigVal.Bytes())
  1068  	return s
  1069  }
  1070  
  1071  // InverseNonConst finds the modular multiplicative inverse of the scalar in
  1072  // *non-constant* time.  The existing scalar is modified.
  1073  //
  1074  // The scalar is returned to support chaining.  This enables syntax like:
  1075  // s.InverseNonConst().Mul(s2) so that s = s^-1 * s2.
  1076  func (s *ModNScalar) InverseNonConst() *ModNScalar {
  1077  	return s.InverseValNonConst(s)
  1078  }
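
The inverse satisfies s * s^-1 = 1 for every nonzero scalar, so a simple
editor-added check (non-constant time, like the inversion itself) is:

	// checkInverse reports whether multiplying a nonzero scalar by its inverse
	// yields one.
	func checkInverse(s *ModNScalar) bool {
		if s.IsZero() {
			return false // zero has no multiplicative inverse
		}
		inv := new(ModNScalar).InverseValNonConst(s)
		one := new(ModNScalar).SetInt(1)
		return new(ModNScalar).Mul2(s, inv).Equals(one) // true
	}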
  1079  
  1080  // IsOverHalfOrder returns whether or not the scalar exceeds the group order
  1081  // divided by 2 in constant time.
  1082  func (s *ModNScalar) IsOverHalfOrder() bool {
  1083  	// The intuition here is that the scalar is greater than half of the group
  1084  	// order if one of the higher individual words is greater than the
  1085  	// corresponding word of the half group order and all higher words in the
  1086  	// scalar are equal to their corresponding word of the half group order.
  1087  	//
  1088  	// Note that the words 4, 5, and 6 are all the max uint32 value, so there is
  1089  // no need to test if those individual words of the scalar exceed them,
  1090  	// hence, only equality is checked for them.
  1091  	result := constantTimeGreater(s.n[7], halfOrderWordSeven)
  1092  	highWordsEqual := constantTimeEq(s.n[7], halfOrderWordSeven)
  1093  	highWordsEqual &= constantTimeEq(s.n[6], halfOrderWordSix)
  1094  	highWordsEqual &= constantTimeEq(s.n[5], halfOrderWordFive)
  1095  	highWordsEqual &= constantTimeEq(s.n[4], halfOrderWordFour)
  1096  	result |= highWordsEqual & constantTimeGreater(s.n[3], halfOrderWordThree)
  1097  	highWordsEqual &= constantTimeEq(s.n[3], halfOrderWordThree)
  1098  	result |= highWordsEqual & constantTimeGreater(s.n[2], halfOrderWordTwo)
  1099  	highWordsEqual &= constantTimeEq(s.n[2], halfOrderWordTwo)
  1100  	result |= highWordsEqual & constantTimeGreater(s.n[1], halfOrderWordOne)
  1101  	highWordsEqual &= constantTimeEq(s.n[1], halfOrderWordOne)
  1102  	result |= highWordsEqual & constantTimeGreater(s.n[0], halfOrderWordZero)
  1103  
  1104  	return result != 0
  1105  }
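
A common use of this check is enforcing a canonical "low S" form for ECDSA
signatures, where an s value above N/2 is replaced by its negation so that
every signature has a single accepted encoding.  A hedged, editor-added sketch
of that pattern (the function name is illustrative, not part of this package):

	// enforceLowS replaces sigS with its negation when it exceeds N/2, so the
	// numerically smaller of the two equivalent values is always used.
	func enforceLowS(sigS *ModNScalar) {
		if sigS.IsOverHalfOrder() {
			sigS.Negate()
		}
	}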
  1106  
