...

Text file src/github.com/cloudflare/circl/simd/keccakf1600/f1600x2_arm64.s

Documentation: github.com/cloudflare/circl/simd/keccakf1600

     1// +build arm64,go1.16
     2
     3// Taken from https://github.com/bwesterb/armed-keccak
     4
     5#include "textflag.h"
     6
     7// func f1600x2ARM(state *uint64, rc *[24]uint64, turbo bool)
     8TEXT ·f1600x2ARM(SB), NOSPLIT, $0-17    // in-place Keccak-f[1600] rounds on the 400 bytes at state; no stack frame, 17 bytes of arguments (8+8+1)
     9    MOVD state+0(FP), R0    // R0 = state pointer (advanced by the post-increment loads below)
    10    MOVD rc+8(FP), R1       // R1 = pointer to the next 8-byte round constant
    11    MOVD R0, R2             // R2 = copy of the state pointer, used to rewind R0 before storing
    12    MOVD $24, R3            // R3 = rounds left to run (24 unless turbo is set)
    13
    14    VLD1.P 64(R0), [ V0.B16,  V1.B16,  V2.B16,  V3.B16]    // each of these loads 64 bytes into four registers, then R0 += 64
    15    VLD1.P 64(R0), [ V4.B16,  V5.B16,  V6.B16,  V7.B16]    // every register carries two 64-bit lanes (.D2 below) -- presumably the same Keccak lane of two states; confirm against caller
    16    VLD1.P 64(R0), [ V8.B16,  V9.B16, V10.B16, V11.B16]
    17    VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
    18    VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
    19    VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
    20    VLD1.P (R0),   [V24.B16]                               // final 16 bytes: V0..V24 now hold 25 x 16 = 400 bytes
    21
    22    MOVBU turbo+16(FP), R4    // R4 = turbo (single byte, zero-extended)
    23    CBZ R4, loop              // turbo == false: run all 24 rounds
    24
    25    SUB  $12, R3, R3    // turbo: only 12 rounds remain
    26    ADD  $96, R1, R1    // turbo: skip 12 round constants (12 * 8 bytes), i.e. start at rc[12]
    27
    28loop:    // one round per iteration; R3 counts down, R1 walks the round constants
    29    // Execute theta but without xorring into the state yet.
    30    VEOR3 V10.B16, V5.B16, V0.B16, V25.B16    // V25 = V0 ^ V5 ^ V10
    31    VEOR3 V11.B16, V6.B16, V1.B16, V26.B16    // V26 = V1 ^ V6 ^ V11
    32    VEOR3 V12.B16, V7.B16, V2.B16, V27.B16    // V27 = V2 ^ V7 ^ V12
    33    VEOR3 V13.B16, V8.B16, V3.B16, V28.B16    // V28 = V3 ^ V8 ^ V13
    34    VEOR3 V14.B16, V9.B16, V4.B16, V29.B16    // V29 = V4 ^ V9 ^ V14
    35
    36    VEOR3 V20.B16, V15.B16, V25.B16, V25.B16    // V25 ^= V15 ^ V20: parity of V0, V5, V10, V15, V20
    37    VEOR3 V21.B16, V16.B16, V26.B16, V26.B16    // V26 ^= V16 ^ V21
    38    VEOR3 V22.B16, V17.B16, V27.B16, V27.B16    // V27 ^= V17 ^ V22
    39    VEOR3 V23.B16, V18.B16, V28.B16, V28.B16    // V28 ^= V18 ^ V23
    40    VEOR3 V24.B16, V19.B16, V29.B16, V29.B16    // V29 ^= V19 ^ V24
    41
    42    // Xor parities from step theta into the state at the same time as
    43    // executing rho and pi.
    44    VRAX1 V26.D2, V29.D2, V30.D2    // V30 = V29 ^ rotl64(V26, 1); fresh register because V26 and V29 are both read again below
    45    VRAX1 V29.D2, V27.D2, V29.D2    // V29 = V27 ^ rotl64(V29, 1); from here on each parity is overwritten only after its last use
    46    VRAX1 V27.D2, V25.D2, V27.D2    // V27 = V25 ^ rotl64(V27, 1)
    47    VRAX1 V25.D2, V28.D2, V25.D2    // V25 = V28 ^ rotl64(V25, 1)
    48    VRAX1 V28.D2, V26.D2, V28.D2    // V28 = V26 ^ rotl64(V28, 1)
    49
    50    VEOR V30.B16, V0.B16, V0.B16    // V0 ^= V30; V0 is the only register that is neither rotated nor moved
    51    VMOV V1.B16, V31.B16            // save V1: the next instruction overwrites it, the last VXAR still reads it (via V31)
    52
    53    VXAR $20, V27.D2,  V6.D2,  V1.D2     // V1  = rotr64(V6  ^ V27, 20), applied to each 64-bit lane
    54    VXAR $44, V25.D2,  V9.D2,  V6.D2     // V6  = rotr64(V9  ^ V25, 44); each VXAR overwrites the register the previous one just read, so order matters
    55    VXAR $3 , V28.D2, V22.D2,  V9.D2     // V9  = rotr64(V22 ^ V28, 3)
    56    VXAR $25, V25.D2, V14.D2, V22.D2     // V22 = rotr64(V14 ^ V25, 25)
    57    VXAR $46, V30.D2, V20.D2, V14.D2     // V14 = rotr64(V20 ^ V30, 46)
    58    VXAR $2 , V28.D2,  V2.D2, V20.D2     // V20 = rotr64(V2  ^ V28, 2)
    59    VXAR $21, V28.D2, V12.D2,  V2.D2     // V2  = rotr64(V12 ^ V28, 21)
    60    VXAR $39, V29.D2, V13.D2, V12.D2     // V12 = rotr64(V13 ^ V29, 39)
    61    VXAR $56, V25.D2, V19.D2, V13.D2     // V13 = rotr64(V19 ^ V25, 56)
    62    VXAR $8 , V29.D2, V23.D2, V19.D2     // V19 = rotr64(V23 ^ V29, 8)
    63    VXAR $23, V30.D2, V15.D2, V23.D2     // V23 = rotr64(V15 ^ V30, 23)
    64    VXAR $37, V25.D2,  V4.D2, V15.D2     // V15 = rotr64(V4  ^ V25, 37)
    65    VXAR $50, V25.D2, V24.D2,  V4.D2     // V4  = rotr64(V24 ^ V25, 50)
    66    VXAR $62, V27.D2, V21.D2, V24.D2     // V24 = rotr64(V21 ^ V27, 62)
    67    VXAR $9 , V29.D2,  V8.D2, V21.D2     // V21 = rotr64(V8  ^ V29, 9)
    68    VXAR $19, V27.D2, V16.D2,  V8.D2     // V8  = rotr64(V16 ^ V27, 19)
    69    VXAR $28, V30.D2,  V5.D2, V16.D2     // V16 = rotr64(V5  ^ V30, 28)
    70    VXAR $36, V29.D2,  V3.D2,  V5.D2     // V5  = rotr64(V3  ^ V29, 36)
    71    VXAR $43, V29.D2, V18.D2,  V3.D2     // V3  = rotr64(V18 ^ V29, 43)
    72    VXAR $49, V28.D2, V17.D2, V18.D2     // V18 = rotr64(V17 ^ V28, 49)
    73    VXAR $54, V27.D2, V11.D2, V17.D2     // V17 = rotr64(V11 ^ V27, 54)
    74    VXAR $58, V28.D2,  V7.D2, V11.D2     // V11 = rotr64(V7  ^ V28, 58)
    75    VXAR $61, V30.D2, V10.D2,  V7.D2     // V7  = rotr64(V10 ^ V30, 61)
    76    VXAR $63, V27.D2, V31.D2, V10.D2     // V10 = rotr64(V31 ^ V27, 63), where V31 is the V1 saved above
    77
    78    // Chi
    79    VBCAX V1.B16, V2.B16, V0.B16, V25.B16    // V25 = V0 ^ (~V1 & V2); parked in a temporary because V0 is still an input below
    80    VBCAX V2.B16, V3.B16, V1.B16, V26.B16    // V26 = V1 ^ (~V2 & V3); parked in a temporary because V1 is still an input below
    81    VBCAX V3.B16, V4.B16, V2.B16,  V2.B16    // V2 ^= ~V3 & V4
    82    VBCAX V4.B16, V0.B16, V3.B16,  V3.B16    // V3 ^= ~V4 & V0 (original V0)
    83    VBCAX V0.B16, V1.B16, V4.B16,  V4.B16    // V4 ^= ~V0 & V1 (original V0, V1)
    84    VMOV V25.B16, V0.B16                     // commit the two deferred results
    85    VMOV V26.B16, V1.B16
    86
    87    VBCAX V6.B16, V7.B16, V5.B16, V25.B16    // same five-register pattern for V5..V9
    88    VBCAX V7.B16, V8.B16, V6.B16, V26.B16
    89    VBCAX V8.B16, V9.B16, V7.B16,  V7.B16
    90    VBCAX V9.B16, V5.B16, V8.B16,  V8.B16
    91    VBCAX V5.B16, V6.B16, V9.B16,  V9.B16
    92    VMOV V25.B16, V5.B16
    93    VMOV V26.B16, V6.B16
    94
    95    VBCAX V11.B16, V12.B16, V10.B16, V25.B16    // same pattern for V10..V14
    96    VBCAX V12.B16, V13.B16, V11.B16, V26.B16
    97    VBCAX V13.B16, V14.B16, V12.B16, V12.B16
    98    VBCAX V14.B16, V10.B16, V13.B16, V13.B16
    99    VBCAX V10.B16, V11.B16, V14.B16, V14.B16
   100    VMOV V25.B16, V10.B16
   101    VMOV V26.B16, V11.B16
   102
   103    VBCAX V16.B16, V17.B16, V15.B16, V25.B16    // same pattern for V15..V19
   104    VBCAX V17.B16, V18.B16, V16.B16, V26.B16
   105    VBCAX V18.B16, V19.B16, V17.B16, V17.B16
   106    VBCAX V19.B16, V15.B16, V18.B16, V18.B16
   107    VBCAX V15.B16, V16.B16, V19.B16, V19.B16
   108    VMOV V25.B16, V15.B16
   109    VMOV V26.B16, V16.B16
   110
   111    VBCAX V21.B16, V22.B16, V20.B16, V25.B16    // same pattern for V20..V24
   112    VBCAX V22.B16, V23.B16, V21.B16, V26.B16
   113    VBCAX V23.B16, V24.B16, V22.B16, V22.B16
   114    VBCAX V24.B16, V20.B16, V23.B16, V23.B16
   115    VBCAX V20.B16, V21.B16, V24.B16, V24.B16
   116    VMOV V25.B16, V20.B16
   117    VMOV V26.B16, V21.B16
   118
   119    // Iota
   120    VLD1R.P 8(R1), [V25.D2]         // load one 64-bit round constant into both lanes of V25, then R1 += 8
   121    VEOR V25.B16, V0.B16, V0.B16    // V0 ^= round constant
   122
   123    SUBS $1, R3, R3    // one round done; the flags SUBS sets are not consulted by CBNZ
   124    CBNZ R3, loop      // run another round while R3 != 0
   125
   126    MOVD R2, R0    // rewind R0 to the start of the state (the loads advanced it)
   127
   128    VST1.P [ V0.B16,  V1.B16,  V2.B16,  V3.B16], 64(R0)    // write the 400 bytes back in the same layout they were loaded in
   129    VST1.P [ V4.B16,  V5.B16,  V6.B16,  V7.B16], 64(R0)
   130    VST1.P [ V8.B16,  V9.B16, V10.B16, V11.B16], 64(R0)
   131    VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0)
   132    VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0)
   133    VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0)
   134    VST1.P [V24.B16], (R0)
   135
   136    RET

View as plain text