1 // Copyright (c) 2020-2023 The Decred developers 2 // Use of this source code is governed by an ISC 3 // license that can be found in the LICENSE file. 4 5 package secp256k1 6 7 import ( 8 "encoding/hex" 9 "math/big" 10 ) 11 12 // References: 13 // [SECG]: Recommended Elliptic Curve Domain Parameters 14 // https://www.secg.org/sec2-v2.pdf 15 // 16 // [HAC]: Handbook of Applied Cryptography Menezes, van Oorschot, Vanstone. 17 // http://cacr.uwaterloo.ca/hac/ 18 19 // Many elliptic curve operations require working with scalars in a finite field 20 // characterized by the order of the group underlying the secp256k1 curve. 21 // Given this precision is larger than the biggest available native type, 22 // obviously some form of bignum math is needed. This code implements 23 // specialized fixed-precision field arithmetic rather than relying on an 24 // arbitrary-precision arithmetic package such as math/big for dealing with the 25 // math modulo the group order since the size is known. As a result, rather 26 // large performance gains are achieved by taking advantage of many 27 // optimizations not available to arbitrary-precision arithmetic and generic 28 // modular arithmetic algorithms. 29 // 30 // There are various ways to internally represent each element. For example, 31 // the most obvious representation would be to use an array of 4 uint64s (64 32 // bits * 4 = 256 bits). However, that representation suffers from the fact 33 // that there is no native Go type large enough to handle the intermediate 34 // results while adding or multiplying two 64-bit numbers. 35 // 36 // Given the above, this implementation represents the field elements as 8 37 // uint32s with each word (array entry) treated as base 2^32. 
// because most systems at the current time are 64-bit (or at least have 64-bit
// registers available for specialized purposes such as MMX) so the intermediate
// results can typically be done using a native register (and using uint64s to
// avoid the need for additional half-word arithmetic).

const (
	// These fields provide convenient access to each of the words of the
	// secp256k1 curve group order N to improve code readability.
	//
	// The group order of the curve per [SECG] is:
	// 0xffffffff ffffffff ffffffff fffffffe baaedce6 af48a03b bfd25e8c d0364141
	//
	// nolint: dupword
	orderWordZero  uint32 = 0xd0364141
	orderWordOne   uint32 = 0xbfd25e8c
	orderWordTwo   uint32 = 0xaf48a03b
	orderWordThree uint32 = 0xbaaedce6
	orderWordFour  uint32 = 0xfffffffe
	orderWordFive  uint32 = 0xffffffff
	orderWordSix   uint32 = 0xffffffff
	orderWordSeven uint32 = 0xffffffff

	// These fields provide convenient access to each of the words of the two's
	// complement of the secp256k1 curve group order N to improve code
	// readability.
	//
	// The two's complement of the group order is:
	// 0x00000000 00000000 00000000 00000001 45512319 50b75fc4 402da173 2fc9bebf
	//
	// Note that only the low word needs the +1 of the two's complement since
	// the +1 cannot carry past word zero here (orderWordZero is nonzero).
	orderComplementWordZero  uint32 = (^orderWordZero) + 1
	orderComplementWordOne   uint32 = ^orderWordOne
	orderComplementWordTwo   uint32 = ^orderWordTwo
	orderComplementWordThree uint32 = ^orderWordThree
	//orderComplementWordFour  uint32 = ^orderWordFour  // unused
	//orderComplementWordFive  uint32 = ^orderWordFive  // unused
	//orderComplementWordSix   uint32 = ^orderWordSix   // unused
	//orderComplementWordSeven uint32 = ^orderWordSeven // unused

	// These fields provide convenient access to each of the words of the
	// secp256k1 curve group order N / 2 to improve code readability and avoid
	// the need to recalculate them.
	//
	// The half order of the secp256k1 curve group is:
	// 0x7fffffff ffffffff ffffffff ffffffff 5d576e73 57a4501d dfe92f46 681b20a0
	//
	// nolint: dupword
	halfOrderWordZero  uint32 = 0x681b20a0
	halfOrderWordOne   uint32 = 0xdfe92f46
	halfOrderWordTwo   uint32 = 0x57a4501d
	halfOrderWordThree uint32 = 0x5d576e73
	halfOrderWordFour  uint32 = 0xffffffff
	halfOrderWordFive  uint32 = 0xffffffff
	halfOrderWordSix   uint32 = 0xffffffff
	halfOrderWordSeven uint32 = 0x7fffffff

	// uint32Mask is simply a mask with all bits set for a uint32 and is used to
	// improve the readability of the code.
	uint32Mask = 0xffffffff
)

var (
	// zero32 is an array of 32 bytes used for the purposes of zeroing and is
	// defined here to avoid extra allocations.
	zero32 = [32]byte{}
)

// ModNScalar implements optimized 256-bit constant-time fixed-precision
// arithmetic over the secp256k1 group order.  This means all arithmetic is
// performed modulo:
//
//	0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141
//
// It only implements the arithmetic needed for elliptic curve operations,
// however, the operations that are not implemented can typically be worked
// around if absolutely needed.  For example, subtraction can be performed by
// adding the negation.
//
// Should it be absolutely necessary, conversion to the standard library
// math/big.Int can be accomplished by using the Bytes method, slicing the
// resulting fixed-size array, and feeding it to big.Int.SetBytes.  However,
// that should typically be avoided when possible as conversion to big.Ints
// requires allocations, is not constant time, and is slower when working modulo
// the group order.
type ModNScalar struct {
	// The scalar is represented as 8 32-bit integers in base 2^32.
	//
	// The following depicts the internal representation:
	// 	 ---------------------------------------------------------
	// 	|      n[7]      |      n[6]      | ... |      n[0]      |
	// 	| 32 bits        | 32 bits        | ... | 32 bits        |
	// 	| Mult: 2^(32*7) | Mult: 2^(32*6) | ... | Mult: 2^(32*0) |
	// 	 ---------------------------------------------------------
	//
	// For example, consider the number 2^87 + 2^42 + 1.  It would be
	// represented as:
	// 	n[0] = 1
	// 	n[1] = 2^10
	// 	n[2] = 2^23
	// 	n[3..7] = 0
	//
	// The full 256-bit value is then calculated by looping i from 7..0 and
	// doing sum(n[i] * 2^(32i)) like so:
	// 	n[7] * 2^(32*7) = 0    * 2^224 = 0
	// 	n[6] * 2^(32*6) = 0    * 2^192 = 0
	// 	...
	// 	n[2] * 2^(32*2) = 2^23 * 2^64  = 2^87
	// 	n[1] * 2^(32*1) = 2^10 * 2^32  = 2^42
	// 	n[0] * 2^(32*0) = 1    * 2^0   = 1
	// 	Sum: 0 + 0 + ... + 2^87 + 2^42 + 1 = 2^87 + 2^42 + 1
	n [8]uint32
}

// String returns the scalar as a human-readable hex string.
//
// This is NOT constant time.
func (s ModNScalar) String() string {
	b := s.Bytes()
	return hex.EncodeToString(b[:])
}

// Set sets the scalar equal to a copy of the passed one in constant time.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s := new(ModNScalar).Set(s2).Add(1) so that s = s2 + 1 where s2 is not
// modified.
func (s *ModNScalar) Set(val *ModNScalar) *ModNScalar {
	*s = *val
	return s
}

// Zero sets the scalar to zero in constant time.  A newly created scalar is
// already set to zero.  This function can be useful to clear an existing scalar
// for reuse.
func (s *ModNScalar) Zero() {
	s.n[0] = 0
	s.n[1] = 0
	s.n[2] = 0
	s.n[3] = 0
	s.n[4] = 0
	s.n[5] = 0
	s.n[6] = 0
	s.n[7] = 0
}

// IsZeroBit returns 1 when the scalar is equal to zero or 0 otherwise in
// constant time.
183 // 184 // Note that a bool is not used here because it is not possible in Go to convert 185 // from a bool to numeric value in constant time and many constant-time 186 // operations require a numeric value. See IsZero for the version that returns 187 // a bool. 188 func (s *ModNScalar) IsZeroBit() uint32 { 189 // The scalar can only be zero if no bits are set in any of the words. 190 bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7] 191 return constantTimeEq(bits, 0) 192 } 193 194 // IsZero returns whether or not the scalar is equal to zero in constant time. 195 func (s *ModNScalar) IsZero() bool { 196 // The scalar can only be zero if no bits are set in any of the words. 197 bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7] 198 return bits == 0 199 } 200 201 // SetInt sets the scalar to the passed integer in constant time. This is a 202 // convenience function since it is fairly common to perform some arithmetic 203 // with small native integers. 204 // 205 // The scalar is returned to support chaining. This enables syntax like: 206 // s := new(ModNScalar).SetInt(2).Mul(s2) so that s = 2 * s2. 207 func (s *ModNScalar) SetInt(ui uint32) *ModNScalar { 208 s.Zero() 209 s.n[0] = ui 210 return s 211 } 212 213 // constantTimeEq returns 1 if a == b or 0 otherwise in constant time. 214 func constantTimeEq(a, b uint32) uint32 { 215 return uint32((uint64(a^b) - 1) >> 63) 216 } 217 218 // constantTimeNotEq returns 1 if a != b or 0 otherwise in constant time. 219 func constantTimeNotEq(a, b uint32) uint32 { 220 return ^uint32((uint64(a^b)-1)>>63) & 1 221 } 222 223 // constantTimeLess returns 1 if a < b or 0 otherwise in constant time. 224 func constantTimeLess(a, b uint32) uint32 { 225 return uint32((uint64(a) - uint64(b)) >> 63) 226 } 227 228 // constantTimeLessOrEq returns 1 if a <= b or 0 otherwise in constant time. 
func constantTimeLessOrEq(a, b uint32) uint32 {
	// a <= b iff a - b - 1 is negative, which (with 32-bit inputs widened to
	// 64 bits) shows up as the top bit of the unsigned difference.
	return uint32((uint64(a) - uint64(b) - 1) >> 63)
}

// constantTimeGreater returns 1 if a > b or 0 otherwise in constant time.
func constantTimeGreater(a, b uint32) uint32 {
	return constantTimeLess(b, a)
}

// constantTimeGreaterOrEq returns 1 if a >= b or 0 otherwise in constant time.
func constantTimeGreaterOrEq(a, b uint32) uint32 {
	return constantTimeLessOrEq(b, a)
}

// constantTimeMin returns min(a,b) in constant time.
func constantTimeMin(a, b uint32) uint32 {
	// -constantTimeLess(a, b) is all ones when a < b and all zeros otherwise,
	// so the xor-mask selects a in the former case and b in the latter.
	return b ^ ((a ^ b) & -constantTimeLess(a, b))
}

// overflows determines if the current scalar is greater than or equal to the
// group order in constant time and returns 1 if it is or 0 otherwise.
func (s *ModNScalar) overflows() uint32 {
	// The intuition here is that the scalar is greater than the group order if
	// one of the higher individual words is greater than the corresponding word
	// of the group order and all higher words in the scalar are equal to their
	// corresponding word of the group order.  Since this type is modulo the
	// group order, being equal is also an overflow back to 0.
	//
	// Note that the words 5, 6, and 7 are all the max uint32 value, so there is
	// no need to test if those individual words of the scalar exceed them,
	// hence, only equality is checked for them.
	highWordsEqual := constantTimeEq(s.n[7], orderWordSeven)
	highWordsEqual &= constantTimeEq(s.n[6], orderWordSix)
	highWordsEqual &= constantTimeEq(s.n[5], orderWordFive)
	overflow := highWordsEqual & constantTimeGreater(s.n[4], orderWordFour)
	highWordsEqual &= constantTimeEq(s.n[4], orderWordFour)
	overflow |= highWordsEqual & constantTimeGreater(s.n[3], orderWordThree)
	highWordsEqual &= constantTimeEq(s.n[3], orderWordThree)
	overflow |= highWordsEqual & constantTimeGreater(s.n[2], orderWordTwo)
	highWordsEqual &= constantTimeEq(s.n[2], orderWordTwo)
	overflow |= highWordsEqual & constantTimeGreater(s.n[1], orderWordOne)
	highWordsEqual &= constantTimeEq(s.n[1], orderWordOne)
	// Note >= (not >) for the final word since equality to N is also an
	// overflow back to 0.
	overflow |= highWordsEqual & constantTimeGreaterOrEq(s.n[0], orderWordZero)

	return overflow
}

// reduce256 reduces the current scalar modulo the group order in accordance
// with the overflows parameter in constant time.  The overflows parameter
// specifies whether or not the scalar is known to be greater than the group
// order and MUST either be 1 in the case it is or 0 in the case it is not for a
// correct result.
func (s *ModNScalar) reduce256(overflows uint32) {
	// Notice that since s < 2^256 < 2N (where N is the group order), the max
	// possible number of reductions required is one.  Therefore, in the case a
	// reduction is needed, it can be performed with a single subtraction of N.
	// Also, recall that subtraction is equivalent to addition by the two's
	// complement while ignoring the carry.
	//
	// When s >= N, the overflows parameter will be 1.  Conversely, it will be 0
	// when s < N.  Thus multiplying by the overflows parameter will either
	// result in 0 or the multiplicand itself.
	//
	// Combining the above along with the fact that s + 0 = s, the following is
	// a constant time implementation that works by either adding 0 or the two's
	// complement of N as needed.
	//
	// The final result will be in the range 0 <= s < N as expected.
	overflows64 := uint64(overflows)
	c := uint64(s.n[0]) + overflows64*uint64(orderComplementWordZero)
	s.n[0] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[1]) + overflows64*uint64(orderComplementWordOne)
	s.n[1] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[2]) + overflows64*uint64(orderComplementWordTwo)
	s.n[2] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[3]) + overflows64*uint64(orderComplementWordThree)
	s.n[3] = uint32(c & uint32Mask)
	// Word four of the two's complement of N is 1 and words five through seven
	// are 0, hence the simplified terms below.
	c = (c >> 32) + uint64(s.n[4]) + overflows64 // * 1
	s.n[4] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[5]) // + overflows64 * 0
	s.n[5] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[6]) // + overflows64 * 0
	s.n[6] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[7]) // + overflows64 * 0
	s.n[7] = uint32(c & uint32Mask)
}

// SetBytes interprets the provided array as a 256-bit big-endian unsigned
// integer, reduces it modulo the group order, sets the scalar to the result,
// and returns either 1 if it was reduced (aka it overflowed) or 0 otherwise in
// constant time.
//
// Note that a bool is not used here because it is not possible in Go to convert
// from a bool to numeric value in constant time and many constant-time
// operations require a numeric value.
func (s *ModNScalar) SetBytes(b *[32]byte) uint32 {
	// Pack the 256 total bits across the 8 uint32 words.  This could be done
	// with a for loop, but benchmarks show this unrolled version is about 2
	// times faster than the variant that uses a loop.
	s.n[0] = uint32(b[31]) | uint32(b[30])<<8 | uint32(b[29])<<16 | uint32(b[28])<<24
	s.n[1] = uint32(b[27]) | uint32(b[26])<<8 | uint32(b[25])<<16 | uint32(b[24])<<24
	s.n[2] = uint32(b[23]) | uint32(b[22])<<8 | uint32(b[21])<<16 | uint32(b[20])<<24
	s.n[3] = uint32(b[19]) | uint32(b[18])<<8 | uint32(b[17])<<16 | uint32(b[16])<<24
	s.n[4] = uint32(b[15]) | uint32(b[14])<<8 | uint32(b[13])<<16 | uint32(b[12])<<24
	s.n[5] = uint32(b[11]) | uint32(b[10])<<8 | uint32(b[9])<<16 | uint32(b[8])<<24
	s.n[6] = uint32(b[7]) | uint32(b[6])<<8 | uint32(b[5])<<16 | uint32(b[4])<<24
	s.n[7] = uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24

	// The value might be >= N, so reduce it as required and return whether or
	// not it was reduced.
	needsReduce := s.overflows()
	s.reduce256(needsReduce)
	return needsReduce
}

// zeroArray32 zeroes the provided 32-byte buffer.
func zeroArray32(b *[32]byte) {
	copy(b[:], zero32[:])
}

// SetByteSlice interprets the provided slice as a 256-bit big-endian unsigned
// integer (meaning it is truncated to the first 32 bytes), reduces it modulo
// the group order, sets the scalar to the result, and returns whether or not
// the resulting truncated 256-bit integer overflowed in constant time.
//
// Note that since passing a slice with more than 32 bytes is truncated, it is
// possible that the truncated value is less than the order of the curve and
// hence it will not be reported as having overflowed in that case.  It is up to
// the caller to decide whether it needs to provide numbers of the appropriate
// size or it is acceptable to use this function with the described truncation
// and overflow behavior.
func (s *ModNScalar) SetByteSlice(b []byte) bool {
	var b32 [32]byte
	// Truncate to at most 32 bytes using a constant-time length computation.
	b = b[:constantTimeMin(uint32(len(b)), 32)]
	// The leading 32-len(b) bytes of b32 are already zero from the array's
	// zero value; copy the input right-aligned so shorter inputs are
	// interpreted as big-endian values zero-padded on the left.
	copy(b32[:], b32[:32-len(b)])
	copy(b32[32-len(b):], b)
	result := s.SetBytes(&b32)
	// Zero the temporary copy to avoid leaving a copy of potentially secret
	// material on the stack.
	zeroArray32(&b32)
	return result != 0
}

// PutBytesUnchecked unpacks the scalar to a 32-byte big-endian value directly
// into the passed byte slice in constant time.  The target slice must have at
// least 32 bytes available or it will panic.
//
// There is a similar function, PutBytes, which unpacks the scalar into a
// 32-byte array directly.  This version is provided since it can be useful to
// write directly into part of a larger buffer without needing a separate
// allocation.
//
// Preconditions:
//   - The target slice MUST have at least 32 bytes available
func (s *ModNScalar) PutBytesUnchecked(b []byte) {
	// Unpack the 256 total bits from the 8 uint32 words.  This could be done
	// with a for loop, but benchmarks show this unrolled version is about 2
	// times faster than the variant which uses a loop.
	b[31] = byte(s.n[0])
	b[30] = byte(s.n[0] >> 8)
	b[29] = byte(s.n[0] >> 16)
	b[28] = byte(s.n[0] >> 24)
	b[27] = byte(s.n[1])
	b[26] = byte(s.n[1] >> 8)
	b[25] = byte(s.n[1] >> 16)
	b[24] = byte(s.n[1] >> 24)
	b[23] = byte(s.n[2])
	b[22] = byte(s.n[2] >> 8)
	b[21] = byte(s.n[2] >> 16)
	b[20] = byte(s.n[2] >> 24)
	b[19] = byte(s.n[3])
	b[18] = byte(s.n[3] >> 8)
	b[17] = byte(s.n[3] >> 16)
	b[16] = byte(s.n[3] >> 24)
	b[15] = byte(s.n[4])
	b[14] = byte(s.n[4] >> 8)
	b[13] = byte(s.n[4] >> 16)
	b[12] = byte(s.n[4] >> 24)
	b[11] = byte(s.n[5])
	b[10] = byte(s.n[5] >> 8)
	b[9] = byte(s.n[5] >> 16)
	b[8] = byte(s.n[5] >> 24)
	b[7] = byte(s.n[6])
	b[6] = byte(s.n[6] >> 8)
	b[5] = byte(s.n[6] >> 16)
	b[4] = byte(s.n[6] >> 24)
	b[3] = byte(s.n[7])
	b[2] = byte(s.n[7] >> 8)
	b[1] = byte(s.n[7] >> 16)
	b[0] = byte(s.n[7] >> 24)
}

// PutBytes unpacks the scalar to a 32-byte big-endian value using the passed
// byte array in constant time.
//
// There is a similar function, PutBytesUnchecked, which unpacks the scalar into
// a slice that must have at least 32 bytes available.  This version is provided
// since it can be useful to write directly into an array that is type checked.
//
// Alternatively, there is also Bytes, which unpacks the scalar into a new array
// and returns that which can sometimes be more ergonomic in applications that
// aren't concerned about an additional copy.
func (s *ModNScalar) PutBytes(b *[32]byte) {
	s.PutBytesUnchecked(b[:])
}

// Bytes unpacks the scalar to a 32-byte big-endian value in constant time.
//
// See PutBytes and PutBytesUnchecked for variants that allow an array or slice
// to be passed which can be useful to cut down on the number of allocations
// by allowing the caller to reuse a buffer or write directly into part of a
// larger buffer.
func (s *ModNScalar) Bytes() [32]byte {
	var b [32]byte
	s.PutBytesUnchecked(b[:])
	return b
}

// IsOdd returns whether or not the scalar is an odd number in constant time.
func (s *ModNScalar) IsOdd() bool {
	// Only odd numbers have the bottom bit set.
	return s.n[0]&1 == 1
}

// Equals returns whether or not the two scalars are the same in constant time.
func (s *ModNScalar) Equals(val *ModNScalar) bool {
	// Xor only sets bits when they are different, so the two scalars can only
	// be the same if no bits are set after xoring each word.
	bits := (s.n[0] ^ val.n[0]) | (s.n[1] ^ val.n[1]) | (s.n[2] ^ val.n[2]) |
		(s.n[3] ^ val.n[3]) | (s.n[4] ^ val.n[4]) | (s.n[5] ^ val.n[5]) |
		(s.n[6] ^ val.n[6]) | (s.n[7] ^ val.n[7])

	return bits == 0
}

// Add2 adds the passed two scalars together modulo the group order in constant
// time and stores the result in s.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s3.Add2(s, s2).AddInt(1) so that s3 = s + s2 + 1.
func (s *ModNScalar) Add2(val1, val2 *ModNScalar) *ModNScalar {
	// Schoolbook addition one word at a time with the carry propagated in the
	// upper half of the 64-bit intermediate c.
	c := uint64(val1.n[0]) + uint64(val2.n[0])
	s.n[0] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[1]) + uint64(val2.n[1])
	s.n[1] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[2]) + uint64(val2.n[2])
	s.n[2] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[3]) + uint64(val2.n[3])
	s.n[3] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[4]) + uint64(val2.n[4])
	s.n[4] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[5]) + uint64(val2.n[5])
	s.n[5] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[6]) + uint64(val2.n[6])
	s.n[6] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[7]) + uint64(val2.n[7])
	s.n[7] = uint32(c & uint32Mask)

	// The result is now 256 bits, but it might still be >= N, so use the
	// existing normal reduce method for 256-bit values.  Note the final carry
	// out (c >> 32) and the overflow check are combined; both inputs are < N,
	// so at most one reduction is ever required.
	s.reduce256(uint32(c>>32) + s.overflows())
	return s
}

// Add adds the passed scalar to the existing one modulo the group order in
// constant time and stores the result in s.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s.Add(s2).AddInt(1) so that s = s + s2 + 1.
func (s *ModNScalar) Add(val *ModNScalar) *ModNScalar {
	return s.Add2(s, val)
}

// accumulator96 provides a 96-bit accumulator for use in the intermediate
// calculations requiring more than 64-bits.
type accumulator96 struct {
	// n holds the accumulator value as 3 base-2^32 words, least significant
	// word first.
	n [3]uint32
}

// Add adds the passed unsigned 64-bit value to the accumulator.
func (a *accumulator96) Add(v uint64) {
	low := uint32(v & uint32Mask)
	hi := uint32(v >> 32)
	a.n[0] += low
	hi += constantTimeLess(a.n[0], low) // Carry if overflow in n[0].
	a.n[1] += hi
	a.n[2] += constantTimeLess(a.n[1], hi) // Carry if overflow in n[1].
}

// Rsh32 right shifts the accumulator by 32 bits.
func (a *accumulator96) Rsh32() {
	// Drop the least significant word and shift the remaining words down.
	a.n[0] = a.n[1]
	a.n[1] = a.n[2]
	a.n[2] = 0
}

// reduce385 reduces the 385-bit intermediate result in the passed terms modulo
// the group order in constant time and stores the result in s.
func (s *ModNScalar) reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12 uint64) {
	// At this point, the intermediate result in the passed terms has been
	// reduced to fit within 385 bits, so reduce it again using the same method
	// described in reduce512.  As before, the intermediate result will end up
	// being reduced by another 127 bits to 258 bits, thus 9 32-bit terms are
	// needed for this iteration.  The reduced terms are assigned back to t0
	// through t8.
	//
	// Note that several of the intermediate calculations require adding 64-bit
	// products together which would overflow a uint64, so a 96-bit accumulator
	// is used instead until the value is reduced enough to use native uint64s.

	// Terms for 2^(32*0).
	var acc accumulator96
	acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
	acc.Add(t8 * uint64(orderComplementWordZero))
	t0 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*1).
	acc.Add(t1)
	acc.Add(t8 * uint64(orderComplementWordOne))
	acc.Add(t9 * uint64(orderComplementWordZero))
	t1 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*2).
	acc.Add(t2)
	acc.Add(t8 * uint64(orderComplementWordTwo))
	acc.Add(t9 * uint64(orderComplementWordOne))
	acc.Add(t10 * uint64(orderComplementWordZero))
	t2 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*3).
	acc.Add(t3)
	acc.Add(t8 * uint64(orderComplementWordThree))
	acc.Add(t9 * uint64(orderComplementWordTwo))
	acc.Add(t10 * uint64(orderComplementWordOne))
	acc.Add(t11 * uint64(orderComplementWordZero))
	t3 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*4).  Note word four of the two's complement of N is 1,
	// so multiplications by it are elided.
	acc.Add(t4)
	acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t9 * uint64(orderComplementWordThree))
	acc.Add(t10 * uint64(orderComplementWordTwo))
	acc.Add(t11 * uint64(orderComplementWordOne))
	acc.Add(t12 * uint64(orderComplementWordZero))
	t4 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*5).  Words five through seven of the two's complement of
	// N are 0, so those products are skipped.
	acc.Add(t5)
	// acc.Add(t8 * uint64(orderComplementWordFive)) // 0
	acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t10 * uint64(orderComplementWordThree))
	acc.Add(t11 * uint64(orderComplementWordTwo))
	acc.Add(t12 * uint64(orderComplementWordOne))
	t5 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*6).
	acc.Add(t6)
	// acc.Add(t8 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t9 * uint64(orderComplementWordFive)) // 0
	acc.Add(t10) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t11 * uint64(orderComplementWordThree))
	acc.Add(t12 * uint64(orderComplementWordTwo))
	t6 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*7).
	acc.Add(t7)
	// acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t9 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t10 * uint64(orderComplementWordFive)) // 0
	acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t12 * uint64(orderComplementWordThree))
	t7 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*8).
	// acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t10 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t11 * uint64(orderComplementWordFive)) // 0
	acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
	t8 = uint64(acc.n[0])
	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.

	// NOTE: All of the remaining multiplications for this iteration result in 0
	// as they all involve multiplying by combinations of the fifth, sixth, and
	// seventh words of the two's complement of N, which are 0, so skip them.

	// At this point, the result is reduced to fit within 258 bits, so reduce it
	// again using a slightly modified version of the same method.  The maximum
	// value in t8 is 2 at this point and therefore multiplying it by each word
	// of the two's complement of N and adding it to a 32-bit term will result
	// in a maximum requirement of 33 bits, so it is safe to use native uint64s
	// here for the intermediate term carry propagation.
	//
	// Also, since the maximum value in t8 is 2, this ends up reducing by
	// another 2 bits to 256 bits.
	c := t0 + t8*uint64(orderComplementWordZero)
	s.n[0] = uint32(c & uint32Mask)
	c = (c >> 32) + t1 + t8*uint64(orderComplementWordOne)
	s.n[1] = uint32(c & uint32Mask)
	c = (c >> 32) + t2 + t8*uint64(orderComplementWordTwo)
	s.n[2] = uint32(c & uint32Mask)
	c = (c >> 32) + t3 + t8*uint64(orderComplementWordThree)
	s.n[3] = uint32(c & uint32Mask)
	c = (c >> 32) + t4 + t8 // * uint64(orderComplementWordFour) == * 1
	s.n[4] = uint32(c & uint32Mask)
	c = (c >> 32) + t5 // + t8*uint64(orderComplementWordFive) == 0
	s.n[5] = uint32(c & uint32Mask)
	c = (c >> 32) + t6 // + t8*uint64(orderComplementWordSix) == 0
	s.n[6] = uint32(c & uint32Mask)
	c = (c >> 32) + t7 // + t8*uint64(orderComplementWordSeven) == 0
	s.n[7] = uint32(c & uint32Mask)

	// The result is now 256 bits, but it might still be >= N, so use the
	// existing normal reduce method for 256-bit values.
	s.reduce256(uint32(c>>32) + s.overflows())
}

// reduce512 reduces the 512-bit intermediate result in the passed terms modulo
// the group order down to 385 bits in constant time and stores the result in s.
653 func (s *ModNScalar) reduce512(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15 uint64) { 654 // At this point, the intermediate result in the passed terms is grouped 655 // into the respective bases. 656 // 657 // Per [HAC] section 14.3.4: Reduction method of moduli of special form, 658 // when the modulus is of the special form m = b^t - c, where log_2(c) < t, 659 // highly efficient reduction can be achieved per the provided algorithm. 660 // 661 // The secp256k1 group order fits this criteria since it is: 662 // 2^256 - 432420386565659656852420866394968145599 663 // 664 // Technically the max possible value here is (N-1)^2 since the two scalars 665 // being multiplied are always mod N. Nevertheless, it is safer to consider 666 // it to be (2^256-1)^2 = 2^512 - 2^256 + 1 since it is the product of two 667 // 256-bit values. 668 // 669 // The algorithm is to reduce the result modulo the prime by subtracting 670 // multiples of the group order N. However, in order simplify carry 671 // propagation, this adds with the two's complement of N to achieve the same 672 // result. 673 // 674 // Since the two's complement of N has 127 leading zero bits, this will end 675 // up reducing the intermediate result from 512 bits to 385 bits, resulting 676 // in 13 32-bit terms. The reduced terms are assigned back to t0 through 677 // t12. 678 // 679 // Note that several of the intermediate calculations require adding 64-bit 680 // products together which would overflow a uint64, so a 96-bit accumulator 681 // is used instead. 682 683 // Terms for 2^(32*0). 684 var acc accumulator96 685 acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0. 686 acc.Add(t8 * uint64(orderComplementWordZero)) 687 t0 = uint64(acc.n[0]) 688 acc.Rsh32() 689 690 // Terms for 2^(32*1). 
691 acc.Add(t1) 692 acc.Add(t8 * uint64(orderComplementWordOne)) 693 acc.Add(t9 * uint64(orderComplementWordZero)) 694 t1 = uint64(acc.n[0]) 695 acc.Rsh32() 696 697 // Terms for 2^(32*2). 698 acc.Add(t2) 699 acc.Add(t8 * uint64(orderComplementWordTwo)) 700 acc.Add(t9 * uint64(orderComplementWordOne)) 701 acc.Add(t10 * uint64(orderComplementWordZero)) 702 t2 = uint64(acc.n[0]) 703 acc.Rsh32() 704 705 // Terms for 2^(32*3). 706 acc.Add(t3) 707 acc.Add(t8 * uint64(orderComplementWordThree)) 708 acc.Add(t9 * uint64(orderComplementWordTwo)) 709 acc.Add(t10 * uint64(orderComplementWordOne)) 710 acc.Add(t11 * uint64(orderComplementWordZero)) 711 t3 = uint64(acc.n[0]) 712 acc.Rsh32() 713 714 // Terms for 2^(32*4). 715 acc.Add(t4) 716 acc.Add(t8) // * uint64(orderComplementWordFour) // * 1 717 acc.Add(t9 * uint64(orderComplementWordThree)) 718 acc.Add(t10 * uint64(orderComplementWordTwo)) 719 acc.Add(t11 * uint64(orderComplementWordOne)) 720 acc.Add(t12 * uint64(orderComplementWordZero)) 721 t4 = uint64(acc.n[0]) 722 acc.Rsh32() 723 724 // Terms for 2^(32*5). 725 acc.Add(t5) 726 // acc.Add(t8 * uint64(orderComplementWordFive)) // 0 727 acc.Add(t9) // * uint64(orderComplementWordFour) // * 1 728 acc.Add(t10 * uint64(orderComplementWordThree)) 729 acc.Add(t11 * uint64(orderComplementWordTwo)) 730 acc.Add(t12 * uint64(orderComplementWordOne)) 731 acc.Add(t13 * uint64(orderComplementWordZero)) 732 t5 = uint64(acc.n[0]) 733 acc.Rsh32() 734 735 // Terms for 2^(32*6). 736 acc.Add(t6) 737 // acc.Add(t8 * uint64(orderComplementWordSix)) // 0 738 // acc.Add(t9 * uint64(orderComplementWordFive)) // 0 739 acc.Add(t10) // * uint64(orderComplementWordFour)) // * 1 740 acc.Add(t11 * uint64(orderComplementWordThree)) 741 acc.Add(t12 * uint64(orderComplementWordTwo)) 742 acc.Add(t13 * uint64(orderComplementWordOne)) 743 acc.Add(t14 * uint64(orderComplementWordZero)) 744 t6 = uint64(acc.n[0]) 745 acc.Rsh32() 746 747 // Terms for 2^(32*7). 
748 acc.Add(t7) 749 // acc.Add(t8 * uint64(orderComplementWordSeven)) // 0 750 // acc.Add(t9 * uint64(orderComplementWordSix)) // 0 751 // acc.Add(t10 * uint64(orderComplementWordFive)) // 0 752 acc.Add(t11) // * uint64(orderComplementWordFour) // * 1 753 acc.Add(t12 * uint64(orderComplementWordThree)) 754 acc.Add(t13 * uint64(orderComplementWordTwo)) 755 acc.Add(t14 * uint64(orderComplementWordOne)) 756 acc.Add(t15 * uint64(orderComplementWordZero)) 757 t7 = uint64(acc.n[0]) 758 acc.Rsh32() 759 760 // Terms for 2^(32*8). 761 // acc.Add(t9 * uint64(orderComplementWordSeven)) // 0 762 // acc.Add(t10 * uint64(orderComplementWordSix)) // 0 763 // acc.Add(t11 * uint64(orderComplementWordFive)) // 0 764 acc.Add(t12) // * uint64(orderComplementWordFour) // * 1 765 acc.Add(t13 * uint64(orderComplementWordThree)) 766 acc.Add(t14 * uint64(orderComplementWordTwo)) 767 acc.Add(t15 * uint64(orderComplementWordOne)) 768 t8 = uint64(acc.n[0]) 769 acc.Rsh32() 770 771 // Terms for 2^(32*9). 772 // acc.Add(t10 * uint64(orderComplementWordSeven)) // 0 773 // acc.Add(t11 * uint64(orderComplementWordSix)) // 0 774 // acc.Add(t12 * uint64(orderComplementWordFive)) // 0 775 acc.Add(t13) // * uint64(orderComplementWordFour) // * 1 776 acc.Add(t14 * uint64(orderComplementWordThree)) 777 acc.Add(t15 * uint64(orderComplementWordTwo)) 778 t9 = uint64(acc.n[0]) 779 acc.Rsh32() 780 781 // Terms for 2^(32*10). 782 // acc.Add(t11 * uint64(orderComplementWordSeven)) // 0 783 // acc.Add(t12 * uint64(orderComplementWordSix)) // 0 784 // acc.Add(t13 * uint64(orderComplementWordFive)) // 0 785 acc.Add(t14) // * uint64(orderComplementWordFour) // * 1 786 acc.Add(t15 * uint64(orderComplementWordThree)) 787 t10 = uint64(acc.n[0]) 788 acc.Rsh32() 789 790 // Terms for 2^(32*11). 
	// acc.Add(t12 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t13 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t14 * uint64(orderComplementWordFive)) // 0
	acc.Add(t15) // * uint64(orderComplementWordFour) // * 1
	t11 = uint64(acc.n[0])
	acc.Rsh32()

	// NOTE: All of the remaining multiplications for this iteration result in 0
	// as they all involve multiplying by combinations of the fifth, sixth, and
	// seventh words of the two's complement of N, which are 0, so skip them.

	// Terms for 2^(32*12).
	t12 = uint64(acc.n[0])
	// acc.Rsh32() // No need since not used after this. Guaranteed to be 0.

	// At this point, the result is reduced to fit within 385 bits, so reduce it
	// again using the same method accordingly.
	s.reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12)
}

// Mul2 multiplies the passed two scalars together modulo the group order in
// constant time and stores the result in s.
//
// The scalar is returned to support chaining. This enables syntax like:
// s3.Mul2(s, s2).AddInt(1) so that s3 = (s * s2) + 1.
func (s *ModNScalar) Mul2(val, val2 *ModNScalar) *ModNScalar {
	// This could be done with for loops and an array to store the intermediate
	// terms, but this unrolled version is significantly faster.

	// The overall strategy employed here is:
	// 1) Calculate the 512-bit product of the two scalars using the standard
	//    pencil-and-paper method.
	// 2) Reduce the result modulo the prime by effectively subtracting
	//    multiples of the group order N (actually performed by adding multiples
	//    of the two's complement of N to avoid implementing subtraction).
	// 3) Repeat step 2 noting that each iteration reduces the required number
	//    of bits by 127 because the two's complement of N has 127 leading zero
	//    bits.
	// 4) Once reduced to 256 bits, call the existing reduce method to perform
	//    a final reduction as needed.
	//
	// Note that several of the intermediate calculations require adding 64-bit
	// products together which would overflow a uint64, so a 96-bit accumulator
	// is used instead.
	//
	// The values t0 through t15 collect the base-2^32 digits of the full
	// 512-bit product: each group below sums every cross product
	// val.n[i]*val2.n[j] with i+j equal to the group's power. Since the
	// accumulator only ever exposes its low 32 bits via acc.n[0] before being
	// shifted right 32 bits, the carry into the next digit is propagated
	// automatically.

	// Terms for 2^(32*0).
	var acc accumulator96
	acc.Add(uint64(val.n[0]) * uint64(val2.n[0]))
	t0 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*1).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[0]))
	t1 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*2).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[0]))
	t2 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*3).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[0]))
	t3 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*4).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[0]))
	t4 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*5).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[0]))
	t5 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*6).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[0]))
	t6 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*7).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[0]))
	t7 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*8). Note that the number of terms per group decreases
	// from here on because the two word indexes must sum to the target power
	// and neither index can exceed 7.
	acc.Add(uint64(val.n[1]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[1]))
	t8 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*9).
	acc.Add(uint64(val.n[2]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[2]))
	t9 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*10).
	acc.Add(uint64(val.n[3]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[3]))
	t10 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*11).
	acc.Add(uint64(val.n[4]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[4]))
	t11 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*12).
	acc.Add(uint64(val.n[5]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[5]))
	t12 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*13).
	acc.Add(uint64(val.n[6]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[6]))
	t13 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*14).
	acc.Add(uint64(val.n[7]) * uint64(val2.n[7]))
	t14 := uint64(acc.n[0])
	acc.Rsh32()

	// What's left is for 2^(32*15).
	t15 := uint64(acc.n[0])
	// acc.Rsh32() // No need since not used after this. Guaranteed to be 0.

	// At this point, all of the terms are grouped into their respective base
	// and occupy up to 512 bits. Reduce the result accordingly.
	s.reduce512(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14,
		t15)
	return s
}

// Mul multiplies the passed scalar with the existing one modulo the group order
// in constant time and stores the result in s.
//
// The scalar is returned to support chaining. This enables syntax like:
// s.Mul(s2).AddInt(1) so that s = (s * s2) + 1.
func (s *ModNScalar) Mul(val *ModNScalar) *ModNScalar {
	return s.Mul2(s, val)
}

// SquareVal squares the passed scalar modulo the group order in constant time
// and stores the result in s.
//
// The scalar is returned to support chaining. This enables syntax like:
// s3.SquareVal(s).Mul(s) so that s3 = s^2 * s = s^3.
func (s *ModNScalar) SquareVal(val *ModNScalar) *ModNScalar {
	// This could technically be optimized slightly to take advantage of the
	// fact that many of the intermediate calculations in squaring are just
	// doubling, however, benchmarking has shown that due to the need to use a
	// 96-bit accumulator, any savings are essentially offset by that and
	// consequently there is no real difference in performance over just
	// multiplying the value by itself to justify the extra code for now. This
	// can be revisited in the future if it becomes a bottleneck in practice.

	return s.Mul2(val, val)
}

// Square squares the scalar modulo the group order in constant time. The
// existing scalar is modified.
//
// The scalar is returned to support chaining. This enables syntax like:
// s.Square().Mul(s2) so that s = s^2 * s2.
func (s *ModNScalar) Square() *ModNScalar {
	return s.SquareVal(s)
}

// NegateVal negates the passed scalar modulo the group order and stores the
// result in s in constant time.
//
// The scalar is returned to support chaining. This enables syntax like:
// s.NegateVal(s2).AddInt(1) so that s = -s2 + 1.
func (s *ModNScalar) NegateVal(val *ModNScalar) *ModNScalar {
	// Since the scalar is already in the range 0 <= val < N, where N is the
	// group order, negation modulo the group order is just the group order
	// minus the value. This implies that the result will always be in the
	// desired range with the sole exception of 0 because N - 0 = N itself.
	//
	// Therefore, in order to avoid the need to reduce the result for every
	// other case in order to achieve constant time, this creates a mask that is
	// all 0s in the case of the scalar being negated is 0 and all 1s otherwise
	// and bitwise ands that mask with each word.
	//
	// Finally, to simplify the carry propagation, this adds the two's
	// complement of the scalar to N in order to achieve the same result.
	//
	// Note that the +1 of the two's complement is only added into the lowest
	// word; the higher words use the one's complement (^val.n[i]) and rely on
	// the carry (c >> 32) from the previous word to propagate it.
	bits := val.n[0] | val.n[1] | val.n[2] | val.n[3] | val.n[4] | val.n[5] |
		val.n[6] | val.n[7]
	mask := uint64(uint32Mask * constantTimeNotEq(bits, 0))
	c := uint64(orderWordZero) + (uint64(^val.n[0]) + 1)
	s.n[0] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordOne) + uint64(^val.n[1])
	s.n[1] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordTwo) + uint64(^val.n[2])
	s.n[2] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordThree) + uint64(^val.n[3])
	s.n[3] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordFour) + uint64(^val.n[4])
	s.n[4] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordFive) + uint64(^val.n[5])
	s.n[5] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordSix) + uint64(^val.n[6])
	s.n[6] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordSeven) + uint64(^val.n[7])
	s.n[7] = uint32(c & mask)
	return s
}

// Negate negates the scalar modulo the group order in constant time. The
// existing scalar is modified.
//
// The scalar is returned to support chaining. This enables syntax like:
// s.Negate().AddInt(1) so that s = -s + 1.
func (s *ModNScalar) Negate() *ModNScalar {
	return s.NegateVal(s)
}

// InverseValNonConst finds the modular multiplicative inverse of the passed
// scalar and stores result in s in *non-constant* time.
//
// The scalar is returned to support chaining. This enables syntax like:
// s3.InverseVal(s1).Mul(s2) so that s3 = s1^-1 * s2.
func (s *ModNScalar) InverseValNonConst(val *ModNScalar) *ModNScalar {
	// This is making use of big integers for now. Ideally it will be replaced
	// with an implementation that does not depend on big integers.
	//
	// NOTE: math/big arithmetic is not constant time, which is why this method
	// is explicitly named NonConst and must not be used where timing side
	// channels matter.
	valBytes := val.Bytes()
	bigVal := new(big.Int).SetBytes(valBytes[:])
	bigVal.ModInverse(bigVal, curveParams.N)
	s.SetByteSlice(bigVal.Bytes())
	return s
}

// InverseNonConst finds the modular multiplicative inverse of the scalar in
// *non-constant* time. The existing scalar is modified.
//
// The scalar is returned to support chaining. This enables syntax like:
// s.Inverse().Mul(s2) so that s = s^-1 * s2.
func (s *ModNScalar) InverseNonConst() *ModNScalar {
	return s.InverseValNonConst(s)
}

// IsOverHalfOrder returns whether or not the scalar exceeds the group order
// divided by 2 in constant time.
func (s *ModNScalar) IsOverHalfOrder() bool {
	// The intuition here is that the scalar is greater than half of the group
	// order if one of the higher individual words is greater than the
	// corresponding word of the half group order and all higher words in the
	// scalar are equal to their corresponding word of the half group order.
	//
	// Note that the words 4, 5, and 6 are all the max uint32 value, so there is
	// no need to test if those individual words of the scalar exceeds them,
	// hence, only equality is checked for them.
	//
	// Each step below ORs in "this word is greater while all higher words are
	// equal" and then folds this word's equality into the running equality
	// accumulator before moving to the next lower word.
	result := constantTimeGreater(s.n[7], halfOrderWordSeven)
	highWordsEqual := constantTimeEq(s.n[7], halfOrderWordSeven)
	highWordsEqual &= constantTimeEq(s.n[6], halfOrderWordSix)
	highWordsEqual &= constantTimeEq(s.n[5], halfOrderWordFive)
	highWordsEqual &= constantTimeEq(s.n[4], halfOrderWordFour)
	result |= highWordsEqual & constantTimeGreater(s.n[3], halfOrderWordThree)
	highWordsEqual &= constantTimeEq(s.n[3], halfOrderWordThree)
	result |= highWordsEqual & constantTimeGreater(s.n[2], halfOrderWordTwo)
	highWordsEqual &= constantTimeEq(s.n[2], halfOrderWordTwo)
	result |= highWordsEqual & constantTimeGreater(s.n[1], halfOrderWordOne)
	highWordsEqual &= constantTimeEq(s.n[1], halfOrderWordOne)
	result |= highWordsEqual & constantTimeGreater(s.n[0], halfOrderWordZero)

	return result != 0
}