// +build arm64,go1.16

// Taken from https://github.com/bwesterb/armed-keccak

#include "textflag.h"

// func f1600x2ARM(state *uint64, rc *[24]uint64, turbo bool)
//
// f1600x2ARM applies Keccak-f[1600] rounds in place to the 400 bytes at
// state, using the ARMv8.2 SHA3 instructions (EOR3, RAX1, XAR, BCAX).
// NOTE(review): the CPU must implement the SHA3 extension; nothing in this
// routine checks for it, so the caller presumably gates on a feature test.
//
// Arguments:
//   state  400 bytes, loaded as 25 vectors of 16 bytes (V0..V24). Every
//          vector is processed as two independent 64-bit lanes (.D2), so
//          two permutations run side by side. Presumably uint64 2*i and
//          2*i+1 hold lane i of the first and second state - confirm
//          against the Go caller.
//   rc     Round-constant table, 8 bytes per round; one entry is consumed
//          per round and broadcast to both 64-bit lanes.
//   turbo  If non-zero, only 12 rounds are executed and the first 12
//          entries of rc are skipped; otherwise all 24 rounds are run.
//
// Register use:
//   R0       load/store cursor over state
//   R1       pointer to the next round constant
//   R2       saved copy of the state base address
//   R3       rounds remaining
//   R4       turbo flag
//   V0-V24   Keccak lanes 0..24
//   V25-V29  theta column parities C[0..4], later reused for D[] values
//            and as chi/iota scratch
//   V30      theta D[0]
//   V31      copy of lane 1 for the rho/pi chain
//
// Clobbers R0-R4, V0-V31 and the condition flags. No stack is used
// (frame size 0; 17 bytes of arguments: 8 + 8 + 1).
TEXT ·f1600x2ARM(SB), NOSPLIT, $0-17
    MOVD state+0(FP), R0
    MOVD rc+8(FP), R1
    MOVD R0, R2                 // keep the base address for the store-back
    MOVD $24, R3                // full permutation: 24 rounds

    // Load the 25 lanes (6 * 64 bytes + 16 bytes = 400 bytes).
    VLD1.P 64(R0), [ V0.B16,  V1.B16,  V2.B16,  V3.B16]
    VLD1.P 64(R0), [ V4.B16,  V5.B16,  V6.B16,  V7.B16]
    VLD1.P 64(R0), [ V8.B16,  V9.B16, V10.B16, V11.B16]
    VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
    VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
    VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
    VLD1.P (R0), [V24.B16]

    MOVBU turbo+16(FP), R4
    CBZ   R4, loop

    // Turbo: run 24 - 12 = 12 rounds and start at rc[12]
    // (12 entries * 8 bytes = 96 bytes into the table).
    SUB $12, R3, R3
    ADD $96, R1, R1

loop:
    // Execute theta but without xorring into the state yet.
    // First pass: C[x] = lane[x] ^ lane[x+5] ^ lane[x+10].
    VEOR3 V10.B16, V5.B16, V0.B16, V25.B16
    VEOR3 V11.B16, V6.B16, V1.B16, V26.B16
    VEOR3 V12.B16, V7.B16, V2.B16, V27.B16
    VEOR3 V13.B16, V8.B16, V3.B16, V28.B16
    VEOR3 V14.B16, V9.B16, V4.B16, V29.B16

    // Second pass: C[x] ^= lane[x+15] ^ lane[x+20].
    VEOR3 V20.B16, V15.B16, V25.B16, V25.B16
    VEOR3 V21.B16, V16.B16, V26.B16, V26.B16
    VEOR3 V22.B16, V17.B16, V27.B16, V27.B16
    VEOR3 V23.B16, V18.B16, V28.B16, V28.B16
    VEOR3 V24.B16, V19.B16, V29.B16, V29.B16

    // D[x] = C[x-1] ^ rol(C[x+1], 1). The order matters: each RAX1
    // overwrites a parity register only after its last use as an input.
    // Result: V30 = D[0], V27 = D[1], V28 = D[2], V29 = D[3], V25 = D[4].
    VRAX1 V26.D2, V29.D2, V30.D2
    VRAX1 V29.D2, V27.D2, V29.D2
    VRAX1 V27.D2, V25.D2, V27.D2
    VRAX1 V25.D2, V28.D2, V25.D2
    VRAX1 V28.D2, V26.D2, V28.D2

    // Xor parities from step theta into the state at the same time as
    // executing rho and pi.
    //
    // Lane 0 is neither rotated nor moved, so it only receives D[0].
    // Lane 1 is copied to V31 because V1 is the first destination of the
    // chain below but lane 1 is not consumed until its last instruction.
    VEOR V30.B16, V0.B16, V0.B16
    VMOV V1.B16, V31.B16

    // Each XAR computes dst = ror(src ^ D, n). Every instruction writes the
    // register that the previous instruction has just read, so the chain
    // needs no scratch register other than V31.
    VXAR $20, V27.D2, V6.D2,  V1.D2
    VXAR $44, V25.D2, V9.D2,  V6.D2
    VXAR $3,  V28.D2, V22.D2, V9.D2
    VXAR $25, V25.D2, V14.D2, V22.D2
    VXAR $46, V30.D2, V20.D2, V14.D2
    VXAR $2,  V28.D2, V2.D2,  V20.D2
    VXAR $21, V28.D2, V12.D2, V2.D2
    VXAR $39, V29.D2, V13.D2, V12.D2
    VXAR $56, V25.D2, V19.D2, V13.D2
    VXAR $8,  V29.D2, V23.D2, V19.D2
    VXAR $23, V30.D2, V15.D2, V23.D2
    VXAR $37, V25.D2, V4.D2,  V15.D2
    VXAR $50, V25.D2, V24.D2, V4.D2
    VXAR $62, V27.D2, V21.D2, V24.D2
    VXAR $9,  V29.D2, V8.D2,  V21.D2
    VXAR $19, V27.D2, V16.D2, V8.D2
    VXAR $28, V30.D2, V5.D2,  V16.D2
    VXAR $36, V29.D2, V3.D2,  V5.D2
    VXAR $43, V29.D2, V18.D2, V3.D2
    VXAR $49, V28.D2, V17.D2, V18.D2
    VXAR $54, V27.D2, V11.D2, V17.D2
    VXAR $58, V28.D2, V7.D2,  V11.D2
    VXAR $61, V30.D2, V10.D2, V7.D2
    VXAR $63, V27.D2, V31.D2, V10.D2

    // Chi: within each row of five lanes,
    //   a[x] = a[x] ^ (~a[x+1] & a[x+2])   (indices mod 5).
    // The first two results of a row go to V25/V26 because the original
    // a[0] and a[1] are still needed as inputs for a[3] and a[4].
    VBCAX V1.B16, V2.B16, V0.B16, V25.B16
    VBCAX V2.B16, V3.B16, V1.B16, V26.B16
    VBCAX V3.B16, V4.B16, V2.B16, V2.B16
    VBCAX V4.B16, V0.B16, V3.B16, V3.B16
    VBCAX V0.B16, V1.B16, V4.B16, V4.B16
    VMOV  V25.B16, V0.B16
    VMOV  V26.B16, V1.B16

    VBCAX V6.B16, V7.B16, V5.B16, V25.B16
    VBCAX V7.B16, V8.B16, V6.B16, V26.B16
    VBCAX V8.B16, V9.B16, V7.B16, V7.B16
    VBCAX V9.B16, V5.B16, V8.B16, V8.B16
    VBCAX V5.B16, V6.B16, V9.B16, V9.B16
    VMOV  V25.B16, V5.B16
    VMOV  V26.B16, V6.B16

    VBCAX V11.B16, V12.B16, V10.B16, V25.B16
    VBCAX V12.B16, V13.B16, V11.B16, V26.B16
    VBCAX V13.B16, V14.B16, V12.B16, V12.B16
    VBCAX V14.B16, V10.B16, V13.B16, V13.B16
    VBCAX V10.B16, V11.B16, V14.B16, V14.B16
    VMOV  V25.B16, V10.B16
    VMOV  V26.B16, V11.B16

    VBCAX V16.B16, V17.B16, V15.B16, V25.B16
    VBCAX V17.B16, V18.B16, V16.B16, V26.B16
    VBCAX V18.B16, V19.B16, V17.B16, V17.B16
    VBCAX V19.B16, V15.B16, V18.B16, V18.B16
    VBCAX V15.B16, V16.B16, V19.B16, V19.B16
    VMOV  V25.B16, V15.B16
    VMOV  V26.B16, V16.B16

    VBCAX V21.B16, V22.B16, V20.B16, V25.B16
    VBCAX V22.B16, V23.B16, V21.B16, V26.B16
    VBCAX V23.B16, V24.B16, V22.B16, V22.B16
    VBCAX V24.B16, V20.B16, V23.B16, V23.B16
    VBCAX V20.B16, V21.B16, V24.B16, V24.B16
    VMOV  V25.B16, V20.B16
    VMOV  V26.B16, V21.B16

    // Iota: load the next 8-byte round constant into both 64-bit lanes,
    // advance R1 by 8, and xor the constant into lane 0.
    VLD1R.P 8(R1), [V25.D2]
    VEOR    V25.B16, V0.B16, V0.B16

    // One round done. CBNZ tests R3 directly; the flags set by SUBS are
    // not read anywhere in this routine.
    SUBS $1, R3, R3
    CBNZ R3, loop

    // Write the 25 lanes back over the input.
    MOVD R2, R0

    VST1.P [ V0.B16,  V1.B16,  V2.B16,  V3.B16], 64(R0)
    VST1.P [ V4.B16,  V5.B16,  V6.B16,  V7.B16], 64(R0)
    VST1.P [ V8.B16,  V9.B16, V10.B16, V11.B16], 64(R0)
    VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0)
    VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0)
    VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0)
    VST1.P [V24.B16], (R0)

    RET