1
2
3
4
5
6
7 package main
8
9 import (
10 "bytes"
11 "io/ioutil"
12 "log"
13 "strings"
14 "text/template"
15 )
16
const (
	// copyright is the exact license header that the source template must
	// start with. main checks for it and strips it from the template.
	copyright = "// Copyright 2016 The Go Authors. All rights reserved.\n" +
		"// Use of this source code is governed by a BSD-style\n" +
		"// license that can be found in the LICENSE file.\n"

	// doNotEdit is the first line written to the generated output.
	doNotEdit = "// generated by go run gen.go; DO NOT EDIT\n"

	// dashDashDash marks where the template proper begins. Everything in the
	// template file before the first occurrence is copied through verbatim.
	dashDashDash = "// --------"
)
27
28 func main() {
29 tmpl, err := ioutil.ReadFile("gen_acc_amd64.s.tmpl")
30 if err != nil {
31 log.Fatalf("ReadFile: %v", err)
32 }
33 if !bytes.HasPrefix(tmpl, []byte(copyright)) {
34 log.Fatal("source template did not start with the copyright header")
35 }
36 tmpl = tmpl[len(copyright):]
37
38 preamble := []byte(nil)
39 if i := bytes.Index(tmpl, []byte(dashDashDash)); i < 0 {
40 log.Fatalf("source template did not contain %q", dashDashDash)
41 } else {
42 preamble, tmpl = tmpl[:i], tmpl[i:]
43 }
44
45 t, err := template.New("").Parse(string(tmpl))
46 if err != nil {
47 log.Fatalf("Parse: %v", err)
48 }
49
50 out := bytes.NewBuffer(nil)
51 out.WriteString(doNotEdit)
52 out.Write(preamble)
53
54 for i, v := range instances {
55 if i != 0 {
56 out.WriteString("\n")
57 }
58 if strings.Contains(v.LoadArgs, "{{.ShortName}}") {
59 v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1)
60 }
61 if err := t.Execute(out, v); err != nil {
62 log.Fatalf("Execute(%q): %v", v.ShortName, err)
63 }
64 }
65
66 if err := ioutil.WriteFile("acc_amd64.s", out.Bytes(), 0666); err != nil {
67 log.Fatalf("WriteFile: %v", err)
68 }
69 }
70
71 var instances = []struct {
72 LongName string
73 ShortName string
74 FrameSize string
75 ArgsSize string
76 Args string
77 DstElemSize1 int
78 DstElemSize4 int
79 XMM3 string
80 XMM4 string
81 XMM5 string
82 XMM6 string
83 XMM8 string
84 XMM9 string
85 XMM10 string
86 LoadArgs string
87 Setup string
88 LoadXMMRegs string
89 Add string
90 ClampAndScale string
91 ConvertToInt32 string
92 Store4 string
93 Store1 string
94 }{{
95 LongName: "fixedAccumulateOpOver",
96 ShortName: "fxAccOpOver",
97 FrameSize: fxFrameSize,
98 ArgsSize: twoArgArgsSize,
99 Args: "dst []uint8, src []uint32",
100 DstElemSize1: 1 * sizeOfUint8,
101 DstElemSize4: 4 * sizeOfUint8,
102 XMM3: fxXMM3,
103 XMM4: fxXMM4,
104 XMM5: fxXMM5,
105 XMM6: opOverXMM6,
106 XMM8: opOverXMM8,
107 XMM9: opOverXMM9,
108 XMM10: opOverXMM10,
109 LoadArgs: twoArgLoadArgs,
110 Setup: fxSetup,
111 LoadXMMRegs: fxLoadXMMRegs + "\n" + opOverLoadXMMRegs,
112 Add: fxAdd,
113 ClampAndScale: fxClampAndScale,
114 ConvertToInt32: fxConvertToInt32,
115 Store4: opOverStore4,
116 Store1: opOverStore1,
117 }, {
118 LongName: "fixedAccumulateOpSrc",
119 ShortName: "fxAccOpSrc",
120 FrameSize: fxFrameSize,
121 ArgsSize: twoArgArgsSize,
122 Args: "dst []uint8, src []uint32",
123 DstElemSize1: 1 * sizeOfUint8,
124 DstElemSize4: 4 * sizeOfUint8,
125 XMM3: fxXMM3,
126 XMM4: fxXMM4,
127 XMM5: fxXMM5,
128 XMM6: opSrcXMM6,
129 XMM8: opSrcXMM8,
130 XMM9: opSrcXMM9,
131 XMM10: opSrcXMM10,
132 LoadArgs: twoArgLoadArgs,
133 Setup: fxSetup,
134 LoadXMMRegs: fxLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
135 Add: fxAdd,
136 ClampAndScale: fxClampAndScale,
137 ConvertToInt32: fxConvertToInt32,
138 Store4: opSrcStore4,
139 Store1: opSrcStore1,
140 }, {
141 LongName: "fixedAccumulateMask",
142 ShortName: "fxAccMask",
143 FrameSize: fxFrameSize,
144 ArgsSize: oneArgArgsSize,
145 Args: "buf []uint32",
146 DstElemSize1: 1 * sizeOfUint32,
147 DstElemSize4: 4 * sizeOfUint32,
148 XMM3: fxXMM3,
149 XMM4: fxXMM4,
150 XMM5: fxXMM5,
151 XMM6: maskXMM6,
152 XMM8: maskXMM8,
153 XMM9: maskXMM9,
154 XMM10: maskXMM10,
155 LoadArgs: oneArgLoadArgs,
156 Setup: fxSetup,
157 LoadXMMRegs: fxLoadXMMRegs + "\n" + maskLoadXMMRegs,
158 Add: fxAdd,
159 ClampAndScale: fxClampAndScale,
160 ConvertToInt32: fxConvertToInt32,
161 Store4: maskStore4,
162 Store1: maskStore1,
163 }, {
164 LongName: "floatingAccumulateOpOver",
165 ShortName: "flAccOpOver",
166 FrameSize: flFrameSize,
167 ArgsSize: twoArgArgsSize,
168 Args: "dst []uint8, src []float32",
169 DstElemSize1: 1 * sizeOfUint8,
170 DstElemSize4: 4 * sizeOfUint8,
171 XMM3: flXMM3,
172 XMM4: flXMM4,
173 XMM5: flXMM5,
174 XMM6: opOverXMM6,
175 XMM8: opOverXMM8,
176 XMM9: opOverXMM9,
177 XMM10: opOverXMM10,
178 LoadArgs: twoArgLoadArgs,
179 Setup: flSetup,
180 LoadXMMRegs: flLoadXMMRegs + "\n" + opOverLoadXMMRegs,
181 Add: flAdd,
182 ClampAndScale: flClampAndScale,
183 ConvertToInt32: flConvertToInt32,
184 Store4: opOverStore4,
185 Store1: opOverStore1,
186 }, {
187 LongName: "floatingAccumulateOpSrc",
188 ShortName: "flAccOpSrc",
189 FrameSize: flFrameSize,
190 ArgsSize: twoArgArgsSize,
191 Args: "dst []uint8, src []float32",
192 DstElemSize1: 1 * sizeOfUint8,
193 DstElemSize4: 4 * sizeOfUint8,
194 XMM3: flXMM3,
195 XMM4: flXMM4,
196 XMM5: flXMM5,
197 XMM6: opSrcXMM6,
198 XMM8: opSrcXMM8,
199 XMM9: opSrcXMM9,
200 XMM10: opSrcXMM10,
201 LoadArgs: twoArgLoadArgs,
202 Setup: flSetup,
203 LoadXMMRegs: flLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
204 Add: flAdd,
205 ClampAndScale: flClampAndScale,
206 ConvertToInt32: flConvertToInt32,
207 Store4: opSrcStore4,
208 Store1: opSrcStore1,
209 }, {
210 LongName: "floatingAccumulateMask",
211 ShortName: "flAccMask",
212 FrameSize: flFrameSize,
213 ArgsSize: twoArgArgsSize,
214 Args: "dst []uint32, src []float32",
215 DstElemSize1: 1 * sizeOfUint32,
216 DstElemSize4: 4 * sizeOfUint32,
217 XMM3: flXMM3,
218 XMM4: flXMM4,
219 XMM5: flXMM5,
220 XMM6: maskXMM6,
221 XMM8: maskXMM8,
222 XMM9: maskXMM9,
223 XMM10: maskXMM10,
224 LoadArgs: twoArgLoadArgs,
225 Setup: flSetup,
226 LoadXMMRegs: flLoadXMMRegs + "\n" + maskLoadXMMRegs,
227 Add: flAdd,
228 ClampAndScale: flClampAndScale,
229 ConvertToInt32: flConvertToInt32,
230 Store4: maskStore4,
231 Store1: maskStore1,
232 }}
233
// The constants below are the building blocks that the instances table
// combines. The fx prefix is used by the fixedAccumulate* instances and
// the fl prefix by the floatingAccumulate* instances; the opOver, opSrc
// and mask prefixes select how the result is stored. The raw-string
// constants are emitted verbatim into the generated assembly, so any
// change to their text changes the output file.
const (
	// Frame sizes, as decimal strings. The fl variant needs 8 bytes for the
	// two 4-byte MXCSR slots (mxcsrOrig and mxcsrNew) used by flSetup.
	fxFrameSize = `0`
	flFrameSize = `8`

	// Argument sizes, as decimal strings: 24 bytes per slice argument (in
	// twoArgLoadArgs the second slice starts at offset 24).
	oneArgArgsSize = `24`
	twoArgArgsSize = `48`

	// Element sizes, in bytes.
	sizeOfUint8  = 1
	sizeOfUint32 = 4

	// Names of the constants that the fx/fl LoadXMMRegs fragments load into
	// X3, X4 and X5. A `-` marks a register that the fragment does not load.
	fxXMM3 = `-`
	flXMM3 = `flSignMask`

	fxXMM4 = `-`
	flXMM4 = `flOne`

	fxXMM5 = `fxAlmost65536`
	flXMM5 = `flAlmost65536`

	// Argument loading. Both forms fill DI, BX, SI and R10. The one-argument
	// form loads the same slice's base and length twice. The two-argument
	// form also jumps to the <ShortName>End label when len(dst) < len(src);
	// main substitutes the {{.ShortName}} placeholder.
	oneArgLoadArgs = `
MOVQ buf_base+0(FP), DI
MOVQ buf_len+8(FP), BX
MOVQ buf_base+0(FP), SI
MOVQ buf_len+8(FP), R10
`
	twoArgLoadArgs = `
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R10
// Sanity check that len(dst) >= len(src).
CMPQ BX, R10
JLT {{.ShortName}}End
`

	// One-time setup. The fx variant needs none. The fl variant saves MXCSR
	// and prepares a copy with bits 13 and 14 set, for use by
	// flConvertToInt32.
	fxSetup = ``
	flSetup = `
// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
// "Round To Zero".
STMXCSR mxcsrOrig-8(SP)
MOVL mxcsrOrig-8(SP), AX
ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP)
`

	// Loads of the fx/fl-specific constants into XMM registers.
	fxLoadXMMRegs = `
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
MOVOU fxAlmost65536<>(SB), X5
`
	flLoadXMMRegs = `
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
MOVOU flSignMask<>(SB), X3
MOVOU flOne<>(SB), X4
MOVOU flAlmost65536<>(SB), X5
`

	// The packed add mnemonic: 32-bit integer for fx, single-precision
	// float for fl.
	fxAdd = `PADDD`
	flAdd = `ADDPS`

	// Clamp-and-scale step: reads X1 and leaves the result in X2.
	fxClampAndScale = `
// y = abs(x)
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
PABSD X1, X2
PSRLL $2, X2
PMINUD X5, X2
`
	flClampAndScale = `
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X3, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X5, X2
`

	// Conversion of X2 to int32. Nothing to do for fx. The fl variant
	// converts under the MXCSR value prepared by flSetup and then restores
	// the original.
	fxConvertToInt32 = `
// z = convertToInt32(y)
// No-op.
`
	flConvertToInt32 = `
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
`

	// Four-element stores of X2 to (DI).
	opOverStore4 = `
// Blend over the dst's prior value. SIMD for i in 0..3:
//
// dstA := uint32(dst[i]) * 0x101
// maskA := z@i
// outA := dstA*(0xffff-maskA)/0xffff + maskA
// dst[i] = uint8(outA >> 8)
//
// First, set X0 to dstA*(0xffff-maskA).
MOVL (DI), X0
PSHUFB X8, X0
MOVOU X9, X11
PSUBL X2, X11
PMULLD X11, X0
// We implement uint32 division by 0xffff as multiplication by a magic
// constant (0x80008001) and then a shift by a magic constant (47).
// See TestDivideByFFFF for a justification.
//
// That multiplication widens from uint32 to uint64, so we have to
// duplicate and shift our four uint32s from one XMM register (X0) to
// two XMM registers (X0 and X11).
//
// Move the second and fourth uint32s in X0 to be the first and third
// uint32s in X11.
MOVOU X0, X11
PSRLQ $32, X11
// Multiply by magic, shift by magic.
PMULULQ X10, X0
PMULULQ X10, X11
PSRLQ $47, X0
PSRLQ $47, X11
// Merge the two registers back to one, X11, and add maskA.
PSLLQ $32, X11
XORPS X0, X11
PADDD X11, X2
// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
PSHUFB X6, X2
MOVL X2, (DI)
`
	opSrcStore4 = `
// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
// copy(dst[:4], low4BytesOf(z))
PSHUFB X6, X2
MOVL X2, (DI)
`
	maskStore4 = `
// copy(dst[:4], z)
MOVOU X2, (DI)
`

	// Single-element stores of the low element of X2 to (DI).
	opOverStore1 = `
// Blend over the dst's prior value.
//
// dstA := uint32(dst[0]) * 0x101
// maskA := z
// outA := dstA*(0xffff-maskA)/0xffff + maskA
// dst[0] = uint8(outA >> 8)
MOVBLZX (DI), R12
IMULL $0x101, R12
MOVL X2, R13
MOVL $0xffff, AX
SUBL R13, AX
MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX.
MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant...
MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX.
SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15).
ADDL DX, R13
SHRL $8, R13
MOVB R13, (DI)
`
	opSrcStore1 = `
// dst[0] = uint8(z>>8)
MOVL X2, BX
SHRL $8, BX
MOVB BX, (DI)
`
	maskStore1 = `
// dst[0] = uint32(z)
MOVL X2, (DI)
`

	// Names of the constants that the op-specific LoadXMMRegs fragments
	// load into X6, X8, X9 and X10. A `-` marks a register that the
	// fragment does not load.
	opOverXMM6 = `gather`
	opSrcXMM6  = `gather`
	maskXMM6   = `-`

	opOverXMM8 = `scatterAndMulBy0x101`
	opSrcXMM8  = `-`
	maskXMM8   = `-`

	opOverXMM9 = `fxAlmost65536`
	opSrcXMM9  = `-`
	maskXMM9   = `-`

	opOverXMM10 = `inverseFFFF`
	opSrcXMM10  = `-`
	maskXMM10   = `-`

	// Loads of the op-specific constants into XMM registers. The mask
	// variant needs none.
	opOverLoadXMMRegs = `
// gather := XMM(see above) // PSHUFB shuffle mask.
// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
MOVOU gather<>(SB), X6
MOVOU scatterAndMulBy0x101<>(SB), X8
MOVOU fxAlmost65536<>(SB), X9
MOVOU inverseFFFF<>(SB), X10
`
	opSrcLoadXMMRegs = `
// gather := XMM(see above) // PSHUFB shuffle mask.
MOVOU gather<>(SB), X6
`
	maskLoadXMMRegs = ``
)
437
View as plain text