...

Source file src/github.com/zeebo/xxh3/accum_generic.go

Documentation: github.com/zeebo/xxh3

     1  package xxh3
     2  
     3  // avx512Switch is the size at which the avx512 code is used.
     4  // Bigger blocks benefit more.
     5  const avx512Switch = 1 << 10
     6  
     7  func accumScalar(accs *[8]u64, p, secret ptr, l u64) {
     8  	if secret != key {
     9  		accumScalarSeed(accs, p, secret, l)
    10  		return
    11  	}
    12  	for l > _block {
    13  		k := secret
    14  
    15  		// accs
    16  		for i := 0; i < 16; i++ {
    17  			dv0 := readU64(p, 8*0)
    18  			dk0 := dv0 ^ readU64(k, 8*0)
    19  			accs[1] += dv0
    20  			accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
    21  
    22  			dv1 := readU64(p, 8*1)
    23  			dk1 := dv1 ^ readU64(k, 8*1)
    24  			accs[0] += dv1
    25  			accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
    26  
    27  			dv2 := readU64(p, 8*2)
    28  			dk2 := dv2 ^ readU64(k, 8*2)
    29  			accs[3] += dv2
    30  			accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
    31  
    32  			dv3 := readU64(p, 8*3)
    33  			dk3 := dv3 ^ readU64(k, 8*3)
    34  			accs[2] += dv3
    35  			accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
    36  
    37  			dv4 := readU64(p, 8*4)
    38  			dk4 := dv4 ^ readU64(k, 8*4)
    39  			accs[5] += dv4
    40  			accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
    41  
    42  			dv5 := readU64(p, 8*5)
    43  			dk5 := dv5 ^ readU64(k, 8*5)
    44  			accs[4] += dv5
    45  			accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
    46  
    47  			dv6 := readU64(p, 8*6)
    48  			dk6 := dv6 ^ readU64(k, 8*6)
    49  			accs[7] += dv6
    50  			accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
    51  
    52  			dv7 := readU64(p, 8*7)
    53  			dk7 := dv7 ^ readU64(k, 8*7)
    54  			accs[6] += dv7
    55  			accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
    56  
    57  			l -= _stripe
    58  			if l > 0 {
    59  				p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
    60  			}
    61  		}
    62  
    63  		// scramble accs
    64  		accs[0] ^= accs[0] >> 47
    65  		accs[0] ^= key64_128
    66  		accs[0] *= prime32_1
    67  
    68  		accs[1] ^= accs[1] >> 47
    69  		accs[1] ^= key64_136
    70  		accs[1] *= prime32_1
    71  
    72  		accs[2] ^= accs[2] >> 47
    73  		accs[2] ^= key64_144
    74  		accs[2] *= prime32_1
    75  
    76  		accs[3] ^= accs[3] >> 47
    77  		accs[3] ^= key64_152
    78  		accs[3] *= prime32_1
    79  
    80  		accs[4] ^= accs[4] >> 47
    81  		accs[4] ^= key64_160
    82  		accs[4] *= prime32_1
    83  
    84  		accs[5] ^= accs[5] >> 47
    85  		accs[5] ^= key64_168
    86  		accs[5] *= prime32_1
    87  
    88  		accs[6] ^= accs[6] >> 47
    89  		accs[6] ^= key64_176
    90  		accs[6] *= prime32_1
    91  
    92  		accs[7] ^= accs[7] >> 47
    93  		accs[7] ^= key64_184
    94  		accs[7] *= prime32_1
    95  	}
    96  
    97  	if l > 0 {
    98  		t, k := (l-1)/_stripe, secret
    99  
   100  		for i := u64(0); i < t; i++ {
   101  			dv0 := readU64(p, 8*0)
   102  			dk0 := dv0 ^ readU64(k, 8*0)
   103  			accs[1] += dv0
   104  			accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
   105  
   106  			dv1 := readU64(p, 8*1)
   107  			dk1 := dv1 ^ readU64(k, 8*1)
   108  			accs[0] += dv1
   109  			accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
   110  
   111  			dv2 := readU64(p, 8*2)
   112  			dk2 := dv2 ^ readU64(k, 8*2)
   113  			accs[3] += dv2
   114  			accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
   115  
   116  			dv3 := readU64(p, 8*3)
   117  			dk3 := dv3 ^ readU64(k, 8*3)
   118  			accs[2] += dv3
   119  			accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
   120  
   121  			dv4 := readU64(p, 8*4)
   122  			dk4 := dv4 ^ readU64(k, 8*4)
   123  			accs[5] += dv4
   124  			accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
   125  
   126  			dv5 := readU64(p, 8*5)
   127  			dk5 := dv5 ^ readU64(k, 8*5)
   128  			accs[4] += dv5
   129  			accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
   130  
   131  			dv6 := readU64(p, 8*6)
   132  			dk6 := dv6 ^ readU64(k, 8*6)
   133  			accs[7] += dv6
   134  			accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
   135  
   136  			dv7 := readU64(p, 8*7)
   137  			dk7 := dv7 ^ readU64(k, 8*7)
   138  			accs[6] += dv7
   139  			accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
   140  
   141  			l -= _stripe
   142  			if l > 0 {
   143  				p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
   144  			}
   145  		}
   146  
   147  		if l > 0 {
   148  			p = ptr(ui(p) - uintptr(_stripe-l))
   149  
   150  			dv0 := readU64(p, 8*0)
   151  			dk0 := dv0 ^ key64_121
   152  			accs[1] += dv0
   153  			accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
   154  
   155  			dv1 := readU64(p, 8*1)
   156  			dk1 := dv1 ^ key64_129
   157  			accs[0] += dv1
   158  			accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
   159  
   160  			dv2 := readU64(p, 8*2)
   161  			dk2 := dv2 ^ key64_137
   162  			accs[3] += dv2
   163  			accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
   164  
   165  			dv3 := readU64(p, 8*3)
   166  			dk3 := dv3 ^ key64_145
   167  			accs[2] += dv3
   168  			accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
   169  
   170  			dv4 := readU64(p, 8*4)
   171  			dk4 := dv4 ^ key64_153
   172  			accs[5] += dv4
   173  			accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
   174  
   175  			dv5 := readU64(p, 8*5)
   176  			dk5 := dv5 ^ key64_161
   177  			accs[4] += dv5
   178  			accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
   179  
   180  			dv6 := readU64(p, 8*6)
   181  			dk6 := dv6 ^ key64_169
   182  			accs[7] += dv6
   183  			accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
   184  
   185  			dv7 := readU64(p, 8*7)
   186  			dk7 := dv7 ^ key64_177
   187  			accs[6] += dv7
   188  			accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
   189  		}
   190  	}
   191  }
   192  
   193  func accumBlockScalar(accs *[8]u64, p, secret ptr) {
   194  	if secret != key {
   195  		accumBlockScalarSeed(accs, p, secret)
   196  		return
   197  	}
   198  	// accs
   199  	for i := 0; i < 16; i++ {
   200  		dv0 := readU64(p, 8*0)
   201  		dk0 := dv0 ^ readU64(secret, 8*0)
   202  		accs[1] += dv0
   203  		accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
   204  
   205  		dv1 := readU64(p, 8*1)
   206  		dk1 := dv1 ^ readU64(secret, 8*1)
   207  		accs[0] += dv1
   208  		accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
   209  
   210  		dv2 := readU64(p, 8*2)
   211  		dk2 := dv2 ^ readU64(secret, 8*2)
   212  		accs[3] += dv2
   213  		accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
   214  
   215  		dv3 := readU64(p, 8*3)
   216  		dk3 := dv3 ^ readU64(secret, 8*3)
   217  		accs[2] += dv3
   218  		accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
   219  
   220  		dv4 := readU64(p, 8*4)
   221  		dk4 := dv4 ^ readU64(secret, 8*4)
   222  		accs[5] += dv4
   223  		accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
   224  
   225  		dv5 := readU64(p, 8*5)
   226  		dk5 := dv5 ^ readU64(secret, 8*5)
   227  		accs[4] += dv5
   228  		accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
   229  
   230  		dv6 := readU64(p, 8*6)
   231  		dk6 := dv6 ^ readU64(secret, 8*6)
   232  		accs[7] += dv6
   233  		accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
   234  
   235  		dv7 := readU64(p, 8*7)
   236  		dk7 := dv7 ^ readU64(secret, 8*7)
   237  		accs[6] += dv7
   238  		accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
   239  
   240  		p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
   241  	}
   242  
   243  	// scramble accs
   244  	accs[0] ^= accs[0] >> 47
   245  	accs[0] ^= key64_128
   246  	accs[0] *= prime32_1
   247  
   248  	accs[1] ^= accs[1] >> 47
   249  	accs[1] ^= key64_136
   250  	accs[1] *= prime32_1
   251  
   252  	accs[2] ^= accs[2] >> 47
   253  	accs[2] ^= key64_144
   254  	accs[2] *= prime32_1
   255  
   256  	accs[3] ^= accs[3] >> 47
   257  	accs[3] ^= key64_152
   258  	accs[3] *= prime32_1
   259  
   260  	accs[4] ^= accs[4] >> 47
   261  	accs[4] ^= key64_160
   262  	accs[4] *= prime32_1
   263  
   264  	accs[5] ^= accs[5] >> 47
   265  	accs[5] ^= key64_168
   266  	accs[5] *= prime32_1
   267  
   268  	accs[6] ^= accs[6] >> 47
   269  	accs[6] ^= key64_176
   270  	accs[6] *= prime32_1
   271  
   272  	accs[7] ^= accs[7] >> 47
   273  	accs[7] ^= key64_184
   274  	accs[7] *= prime32_1
   275  }
   276  
   277  // accumScalarSeed should be used with custom key.
   278  func accumScalarSeed(accs *[8]u64, p, secret ptr, l u64) {
   279  	for l > _block {
   280  		k := secret
   281  
   282  		// accs
   283  		for i := 0; i < 16; i++ {
   284  			dv0 := readU64(p, 8*0)
   285  			dk0 := dv0 ^ readU64(k, 8*0)
   286  			accs[1] += dv0
   287  			accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
   288  
   289  			dv1 := readU64(p, 8*1)
   290  			dk1 := dv1 ^ readU64(k, 8*1)
   291  			accs[0] += dv1
   292  			accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
   293  
   294  			dv2 := readU64(p, 8*2)
   295  			dk2 := dv2 ^ readU64(k, 8*2)
   296  			accs[3] += dv2
   297  			accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
   298  
   299  			dv3 := readU64(p, 8*3)
   300  			dk3 := dv3 ^ readU64(k, 8*3)
   301  			accs[2] += dv3
   302  			accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
   303  
   304  			dv4 := readU64(p, 8*4)
   305  			dk4 := dv4 ^ readU64(k, 8*4)
   306  			accs[5] += dv4
   307  			accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
   308  
   309  			dv5 := readU64(p, 8*5)
   310  			dk5 := dv5 ^ readU64(k, 8*5)
   311  			accs[4] += dv5
   312  			accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
   313  
   314  			dv6 := readU64(p, 8*6)
   315  			dk6 := dv6 ^ readU64(k, 8*6)
   316  			accs[7] += dv6
   317  			accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
   318  
   319  			dv7 := readU64(p, 8*7)
   320  			dk7 := dv7 ^ readU64(k, 8*7)
   321  			accs[6] += dv7
   322  			accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
   323  
   324  			l -= _stripe
   325  			if l > 0 {
   326  				p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
   327  			}
   328  		}
   329  
   330  		// scramble accs
   331  		accs[0] ^= accs[0] >> 47
   332  		accs[0] ^= readU64(secret, 128)
   333  		accs[0] *= prime32_1
   334  
   335  		accs[1] ^= accs[1] >> 47
   336  		accs[1] ^= readU64(secret, 136)
   337  		accs[1] *= prime32_1
   338  
   339  		accs[2] ^= accs[2] >> 47
   340  		accs[2] ^= readU64(secret, 144)
   341  		accs[2] *= prime32_1
   342  
   343  		accs[3] ^= accs[3] >> 47
   344  		accs[3] ^= readU64(secret, 152)
   345  		accs[3] *= prime32_1
   346  
   347  		accs[4] ^= accs[4] >> 47
   348  		accs[4] ^= readU64(secret, 160)
   349  		accs[4] *= prime32_1
   350  
   351  		accs[5] ^= accs[5] >> 47
   352  		accs[5] ^= readU64(secret, 168)
   353  		accs[5] *= prime32_1
   354  
   355  		accs[6] ^= accs[6] >> 47
   356  		accs[6] ^= readU64(secret, 176)
   357  		accs[6] *= prime32_1
   358  
   359  		accs[7] ^= accs[7] >> 47
   360  		accs[7] ^= readU64(secret, 184)
   361  		accs[7] *= prime32_1
   362  	}
   363  
   364  	if l > 0 {
   365  		t, k := (l-1)/_stripe, secret
   366  
   367  		for i := u64(0); i < t; i++ {
   368  			dv0 := readU64(p, 8*0)
   369  			dk0 := dv0 ^ readU64(k, 8*0)
   370  			accs[1] += dv0
   371  			accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
   372  
   373  			dv1 := readU64(p, 8*1)
   374  			dk1 := dv1 ^ readU64(k, 8*1)
   375  			accs[0] += dv1
   376  			accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
   377  
   378  			dv2 := readU64(p, 8*2)
   379  			dk2 := dv2 ^ readU64(k, 8*2)
   380  			accs[3] += dv2
   381  			accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
   382  
   383  			dv3 := readU64(p, 8*3)
   384  			dk3 := dv3 ^ readU64(k, 8*3)
   385  			accs[2] += dv3
   386  			accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
   387  
   388  			dv4 := readU64(p, 8*4)
   389  			dk4 := dv4 ^ readU64(k, 8*4)
   390  			accs[5] += dv4
   391  			accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
   392  
   393  			dv5 := readU64(p, 8*5)
   394  			dk5 := dv5 ^ readU64(k, 8*5)
   395  			accs[4] += dv5
   396  			accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
   397  
   398  			dv6 := readU64(p, 8*6)
   399  			dk6 := dv6 ^ readU64(k, 8*6)
   400  			accs[7] += dv6
   401  			accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
   402  
   403  			dv7 := readU64(p, 8*7)
   404  			dk7 := dv7 ^ readU64(k, 8*7)
   405  			accs[6] += dv7
   406  			accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
   407  
   408  			l -= _stripe
   409  			if l > 0 {
   410  				p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
   411  			}
   412  		}
   413  
   414  		if l > 0 {
   415  			p = ptr(ui(p) - uintptr(_stripe-l))
   416  
   417  			dv0 := readU64(p, 8*0)
   418  			dk0 := dv0 ^ readU64(secret, 121)
   419  			accs[1] += dv0
   420  			accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
   421  
   422  			dv1 := readU64(p, 8*1)
   423  			dk1 := dv1 ^ readU64(secret, 129)
   424  			accs[0] += dv1
   425  			accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
   426  
   427  			dv2 := readU64(p, 8*2)
   428  			dk2 := dv2 ^ readU64(secret, 137)
   429  			accs[3] += dv2
   430  			accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
   431  
   432  			dv3 := readU64(p, 8*3)
   433  			dk3 := dv3 ^ readU64(secret, 145)
   434  			accs[2] += dv3
   435  			accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
   436  
   437  			dv4 := readU64(p, 8*4)
   438  			dk4 := dv4 ^ readU64(secret, 153)
   439  			accs[5] += dv4
   440  			accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
   441  
   442  			dv5 := readU64(p, 8*5)
   443  			dk5 := dv5 ^ readU64(secret, 161)
   444  			accs[4] += dv5
   445  			accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
   446  
   447  			dv6 := readU64(p, 8*6)
   448  			dk6 := dv6 ^ readU64(secret, 169)
   449  			accs[7] += dv6
   450  			accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
   451  
   452  			dv7 := readU64(p, 8*7)
   453  			dk7 := dv7 ^ readU64(secret, 177)
   454  			accs[6] += dv7
   455  			accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
   456  		}
   457  	}
   458  }
   459  
   460  // accumBlockScalarSeed should be used with custom key.
   461  func accumBlockScalarSeed(accs *[8]u64, p, secret ptr) {
   462  	// accs
   463  	{
   464  		secret := secret
   465  		for i := 0; i < 16; i++ {
   466  			dv0 := readU64(p, 8*0)
   467  			dk0 := dv0 ^ readU64(secret, 8*0)
   468  			accs[1] += dv0
   469  			accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
   470  
   471  			dv1 := readU64(p, 8*1)
   472  			dk1 := dv1 ^ readU64(secret, 8*1)
   473  			accs[0] += dv1
   474  			accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
   475  
   476  			dv2 := readU64(p, 8*2)
   477  			dk2 := dv2 ^ readU64(secret, 8*2)
   478  			accs[3] += dv2
   479  			accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
   480  
   481  			dv3 := readU64(p, 8*3)
   482  			dk3 := dv3 ^ readU64(secret, 8*3)
   483  			accs[2] += dv3
   484  			accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
   485  
   486  			dv4 := readU64(p, 8*4)
   487  			dk4 := dv4 ^ readU64(secret, 8*4)
   488  			accs[5] += dv4
   489  			accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
   490  
   491  			dv5 := readU64(p, 8*5)
   492  			dk5 := dv5 ^ readU64(secret, 8*5)
   493  			accs[4] += dv5
   494  			accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
   495  
   496  			dv6 := readU64(p, 8*6)
   497  			dk6 := dv6 ^ readU64(secret, 8*6)
   498  			accs[7] += dv6
   499  			accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
   500  
   501  			dv7 := readU64(p, 8*7)
   502  			dk7 := dv7 ^ readU64(secret, 8*7)
   503  			accs[6] += dv7
   504  			accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
   505  
   506  			p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
   507  		}
   508  	}
   509  
   510  	// scramble accs
   511  	accs[0] ^= accs[0] >> 47
   512  	accs[0] ^= readU64(secret, 128)
   513  	accs[0] *= prime32_1
   514  
   515  	accs[1] ^= accs[1] >> 47
   516  	accs[1] ^= readU64(secret, 136)
   517  	accs[1] *= prime32_1
   518  
   519  	accs[2] ^= accs[2] >> 47
   520  	accs[2] ^= readU64(secret, 144)
   521  	accs[2] *= prime32_1
   522  
   523  	accs[3] ^= accs[3] >> 47
   524  	accs[3] ^= readU64(secret, 152)
   525  	accs[3] *= prime32_1
   526  
   527  	accs[4] ^= accs[4] >> 47
   528  	accs[4] ^= readU64(secret, 160)
   529  	accs[4] *= prime32_1
   530  
   531  	accs[5] ^= accs[5] >> 47
   532  	accs[5] ^= readU64(secret, 168)
   533  	accs[5] *= prime32_1
   534  
   535  	accs[6] ^= accs[6] >> 47
   536  	accs[6] ^= readU64(secret, 176)
   537  	accs[6] *= prime32_1
   538  
   539  	accs[7] ^= accs[7] >> 47
   540  	accs[7] ^= readU64(secret, 184)
   541  	accs[7] *= prime32_1
   542  }
   543  

View as plain text