gen.go

Documentation: golang.org/x/image/vector

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  
     7  package main
     8  
     9  import (
    10  	"bytes"
    11  	"io/ioutil"
    12  	"log"
    13  	"strings"
    14  	"text/template"
    15  )
    16  
    17  const (
    18  	copyright = "" +
    19  		"// Copyright 2016 The Go Authors. All rights reserved.\n" +
    20  		"// Use of this source code is governed by a BSD-style\n" +
    21  		"// license that can be found in the LICENSE file.\n"
    22  
    23  	doNotEdit = "// generated by go run gen.go; DO NOT EDIT\n"
    24  
    25  	dashDashDash = "// --------"
    26  )
    27  
    28  func main() {
    29  	tmpl, err := ioutil.ReadFile("gen_acc_amd64.s.tmpl")
    30  	if err != nil {
    31  		log.Fatalf("ReadFile: %v", err)
    32  	}
    33  	if !bytes.HasPrefix(tmpl, []byte(copyright)) {
    34  		log.Fatal("source template did not start with the copyright header")
    35  	}
    36  	tmpl = tmpl[len(copyright):]
    37  
    38  	preamble := []byte(nil)
    39  	if i := bytes.Index(tmpl, []byte(dashDashDash)); i < 0 {
    40  		log.Fatalf("source template did not contain %q", dashDashDash)
    41  	} else {
    42  		preamble, tmpl = tmpl[:i], tmpl[i:]
    43  	}
    44  
    45  	t, err := template.New("").Parse(string(tmpl))
    46  	if err != nil {
    47  		log.Fatalf("Parse: %v", err)
    48  	}
    49  
    50  	out := bytes.NewBuffer(nil)
    51  	out.WriteString(doNotEdit)
    52  	out.Write(preamble)
    53  
    54  	for i, v := range instances {
    55  		if i != 0 {
    56  			out.WriteString("\n")
    57  		}
    58  		if strings.Contains(v.LoadArgs, "{{.ShortName}}") {
    59  			v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1)
    60  		}
    61  		if err := t.Execute(out, v); err != nil {
    62  			log.Fatalf("Execute(%q): %v", v.ShortName, err)
    63  		}
    64  	}
    65  
    66  	if err := ioutil.WriteFile("acc_amd64.s", out.Bytes(), 0666); err != nil {
    67  		log.Fatalf("WriteFile: %v", err)
    68  	}
    69  }
    70  
    71  var instances = []struct {
    72  	LongName       string
    73  	ShortName      string
    74  	FrameSize      string
    75  	ArgsSize       string
    76  	Args           string
    77  	DstElemSize1   int
    78  	DstElemSize4   int
    79  	XMM3           string
    80  	XMM4           string
    81  	XMM5           string
    82  	XMM6           string
    83  	XMM8           string
    84  	XMM9           string
    85  	XMM10          string
    86  	LoadArgs       string
    87  	Setup          string
    88  	LoadXMMRegs    string
    89  	Add            string
    90  	ClampAndScale  string
    91  	ConvertToInt32 string
    92  	Store4         string
    93  	Store1         string
    94  }{{
    95  	LongName:       "fixedAccumulateOpOver",
    96  	ShortName:      "fxAccOpOver",
    97  	FrameSize:      fxFrameSize,
    98  	ArgsSize:       twoArgArgsSize,
    99  	Args:           "dst []uint8, src []uint32",
   100  	DstElemSize1:   1 * sizeOfUint8,
   101  	DstElemSize4:   4 * sizeOfUint8,
   102  	XMM3:           fxXMM3,
   103  	XMM4:           fxXMM4,
   104  	XMM5:           fxXMM5,
   105  	XMM6:           opOverXMM6,
   106  	XMM8:           opOverXMM8,
   107  	XMM9:           opOverXMM9,
   108  	XMM10:          opOverXMM10,
   109  	LoadArgs:       twoArgLoadArgs,
   110  	Setup:          fxSetup,
   111  	LoadXMMRegs:    fxLoadXMMRegs + "\n" + opOverLoadXMMRegs,
   112  	Add:            fxAdd,
   113  	ClampAndScale:  fxClampAndScale,
   114  	ConvertToInt32: fxConvertToInt32,
   115  	Store4:         opOverStore4,
   116  	Store1:         opOverStore1,
   117  }, {
   118  	LongName:       "fixedAccumulateOpSrc",
   119  	ShortName:      "fxAccOpSrc",
   120  	FrameSize:      fxFrameSize,
   121  	ArgsSize:       twoArgArgsSize,
   122  	Args:           "dst []uint8, src []uint32",
   123  	DstElemSize1:   1 * sizeOfUint8,
   124  	DstElemSize4:   4 * sizeOfUint8,
   125  	XMM3:           fxXMM3,
   126  	XMM4:           fxXMM4,
   127  	XMM5:           fxXMM5,
   128  	XMM6:           opSrcXMM6,
   129  	XMM8:           opSrcXMM8,
   130  	XMM9:           opSrcXMM9,
   131  	XMM10:          opSrcXMM10,
   132  	LoadArgs:       twoArgLoadArgs,
   133  	Setup:          fxSetup,
   134  	LoadXMMRegs:    fxLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
   135  	Add:            fxAdd,
   136  	ClampAndScale:  fxClampAndScale,
   137  	ConvertToInt32: fxConvertToInt32,
   138  	Store4:         opSrcStore4,
   139  	Store1:         opSrcStore1,
   140  }, {
   141  	LongName:       "fixedAccumulateMask",
   142  	ShortName:      "fxAccMask",
   143  	FrameSize:      fxFrameSize,
   144  	ArgsSize:       oneArgArgsSize,
   145  	Args:           "buf []uint32",
   146  	DstElemSize1:   1 * sizeOfUint32,
   147  	DstElemSize4:   4 * sizeOfUint32,
   148  	XMM3:           fxXMM3,
   149  	XMM4:           fxXMM4,
   150  	XMM5:           fxXMM5,
   151  	XMM6:           maskXMM6,
   152  	XMM8:           maskXMM8,
   153  	XMM9:           maskXMM9,
   154  	XMM10:          maskXMM10,
   155  	LoadArgs:       oneArgLoadArgs,
   156  	Setup:          fxSetup,
   157  	LoadXMMRegs:    fxLoadXMMRegs + "\n" + maskLoadXMMRegs,
   158  	Add:            fxAdd,
   159  	ClampAndScale:  fxClampAndScale,
   160  	ConvertToInt32: fxConvertToInt32,
   161  	Store4:         maskStore4,
   162  	Store1:         maskStore1,
   163  }, {
   164  	LongName:       "floatingAccumulateOpOver",
   165  	ShortName:      "flAccOpOver",
   166  	FrameSize:      flFrameSize,
   167  	ArgsSize:       twoArgArgsSize,
   168  	Args:           "dst []uint8, src []float32",
   169  	DstElemSize1:   1 * sizeOfUint8,
   170  	DstElemSize4:   4 * sizeOfUint8,
   171  	XMM3:           flXMM3,
   172  	XMM4:           flXMM4,
   173  	XMM5:           flXMM5,
   174  	XMM6:           opOverXMM6,
   175  	XMM8:           opOverXMM8,
   176  	XMM9:           opOverXMM9,
   177  	XMM10:          opOverXMM10,
   178  	LoadArgs:       twoArgLoadArgs,
   179  	Setup:          flSetup,
   180  	LoadXMMRegs:    flLoadXMMRegs + "\n" + opOverLoadXMMRegs,
   181  	Add:            flAdd,
   182  	ClampAndScale:  flClampAndScale,
   183  	ConvertToInt32: flConvertToInt32,
   184  	Store4:         opOverStore4,
   185  	Store1:         opOverStore1,
   186  }, {
   187  	LongName:       "floatingAccumulateOpSrc",
   188  	ShortName:      "flAccOpSrc",
   189  	FrameSize:      flFrameSize,
   190  	ArgsSize:       twoArgArgsSize,
   191  	Args:           "dst []uint8, src []float32",
   192  	DstElemSize1:   1 * sizeOfUint8,
   193  	DstElemSize4:   4 * sizeOfUint8,
   194  	XMM3:           flXMM3,
   195  	XMM4:           flXMM4,
   196  	XMM5:           flXMM5,
   197  	XMM6:           opSrcXMM6,
   198  	XMM8:           opSrcXMM8,
   199  	XMM9:           opSrcXMM9,
   200  	XMM10:          opSrcXMM10,
   201  	LoadArgs:       twoArgLoadArgs,
   202  	Setup:          flSetup,
   203  	LoadXMMRegs:    flLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
   204  	Add:            flAdd,
   205  	ClampAndScale:  flClampAndScale,
   206  	ConvertToInt32: flConvertToInt32,
   207  	Store4:         opSrcStore4,
   208  	Store1:         opSrcStore1,
   209  }, {
   210  	LongName:       "floatingAccumulateMask",
   211  	ShortName:      "flAccMask",
   212  	FrameSize:      flFrameSize,
   213  	ArgsSize:       twoArgArgsSize,
   214  	Args:           "dst []uint32, src []float32",
   215  	DstElemSize1:   1 * sizeOfUint32,
   216  	DstElemSize4:   4 * sizeOfUint32,
   217  	XMM3:           flXMM3,
   218  	XMM4:           flXMM4,
   219  	XMM5:           flXMM5,
   220  	XMM6:           maskXMM6,
   221  	XMM8:           maskXMM8,
   222  	XMM9:           maskXMM9,
   223  	XMM10:          maskXMM10,
   224  	LoadArgs:       twoArgLoadArgs,
   225  	Setup:          flSetup,
   226  	LoadXMMRegs:    flLoadXMMRegs + "\n" + maskLoadXMMRegs,
   227  	Add:            flAdd,
   228  	ClampAndScale:  flClampAndScale,
   229  	ConvertToInt32: flConvertToInt32,
   230  	Store4:         maskStore4,
   231  	Store1:         maskStore1,
   232  }}
   233  
   234  const (
   235  	fxFrameSize = `0`
   236  	flFrameSize = `8`
   237  
   238  	oneArgArgsSize = `24`
   239  	twoArgArgsSize = `48`
   240  
   241  	sizeOfUint8  = 1
   242  	sizeOfUint32 = 4
   243  
   244  	fxXMM3 = `-`
   245  	flXMM3 = `flSignMask`
   246  
   247  	fxXMM4 = `-`
   248  	flXMM4 = `flOne`
   249  
   250  	fxXMM5 = `fxAlmost65536`
   251  	flXMM5 = `flAlmost65536`
   252  
   253  	oneArgLoadArgs = `
   254  		MOVQ buf_base+0(FP), DI
   255  		MOVQ buf_len+8(FP), BX
   256  		MOVQ buf_base+0(FP), SI
   257  		MOVQ buf_len+8(FP), R10
   258  		`
   259  	twoArgLoadArgs = `
   260  		MOVQ dst_base+0(FP), DI
   261  		MOVQ dst_len+8(FP), BX
   262  		MOVQ src_base+24(FP), SI
   263  		MOVQ src_len+32(FP), R10
   264  		// Sanity check that len(dst) >= len(src).
   265  		CMPQ BX, R10
   266  		JLT  {{.ShortName}}End
   267  		`
   268  
   269  	fxSetup = ``
   270  	flSetup = `
   271  		// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
   272  		// "Round To Zero".
   273  		STMXCSR mxcsrOrig-8(SP)
   274  		MOVL    mxcsrOrig-8(SP), AX
   275  		ORL     $0x6000, AX
   276  		MOVL    AX, mxcsrNew-4(SP)
   277  		`
   278  
   279  	fxLoadXMMRegs = `
   280  		// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
   281  		MOVOU fxAlmost65536<>(SB), X5
   282  		`
   283  	flLoadXMMRegs = `
   284  		// flSignMask    := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
   285  		// flOne         := XMM(0x3f800000 repeated four times) // 1 as a float32.
   286  		// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
   287  		MOVOU flSignMask<>(SB), X3
   288  		MOVOU flOne<>(SB), X4
   289  		MOVOU flAlmost65536<>(SB), X5
   290  		`
   291  
   292  	fxAdd = `PADDD`
   293  	flAdd = `ADDPS`
   294  
   295  	fxClampAndScale = `
   296  		// y = abs(x)
   297  		// y >>= 2 // Shift by 2*ϕ - 16.
   298  		// y = min(y, fxAlmost65536)
   299  		PABSD  X1, X2
   300  		PSRLL  $2, X2
   301  		PMINUD X5, X2
   302  		`
   303  	flClampAndScale = `
   304  		// y = x & flSignMask
   305  		// y = min(y, flOne)
   306  		// y = mul(y, flAlmost65536)
   307  		MOVOU X3, X2
   308  		ANDPS X1, X2
   309  		MINPS X4, X2
   310  		MULPS X5, X2
   311  		`
   312  
   313  	fxConvertToInt32 = `
   314  		// z = convertToInt32(y)
   315  		// No-op.
   316  		`
   317  	flConvertToInt32 = `
   318  		// z = convertToInt32(y)
   319  		LDMXCSR  mxcsrNew-4(SP)
   320  		CVTPS2PL X2, X2
   321  		LDMXCSR  mxcsrOrig-8(SP)
   322  		`
   323  
   324  	opOverStore4 = `
   325  		// Blend over the dst's prior value. SIMD for i in 0..3:
   326  		//
   327  		// dstA := uint32(dst[i]) * 0x101
   328  		// maskA := z@i
   329  		// outA := dstA*(0xffff-maskA)/0xffff + maskA
   330  		// dst[i] = uint8(outA >> 8)
   331  		//
   332  		// First, set X0 to dstA*(0xfff-maskA).
   333  		MOVL   (DI), X0
   334  		PSHUFB X8, X0
   335  		MOVOU  X9, X11
   336  		PSUBL  X2, X11
   337  		PMULLD X11, X0
   338  		// We implement uint32 division by 0xffff as multiplication by a magic
   339  		// constant (0x800080001) and then a shift by a magic constant (47).
   340  		// See TestDivideByFFFF for a justification.
   341  		//
   342  		// That multiplication widens from uint32 to uint64, so we have to
   343  		// duplicate and shift our four uint32s from one XMM register (X0) to
   344  		// two XMM registers (X0 and X11).
   345  		//
   346  		// Move the second and fourth uint32s in X0 to be the first and third
   347  		// uint32s in X11.
   348  		MOVOU X0, X11
   349  		PSRLQ $32, X11
   350  		// Multiply by magic, shift by magic.
   351  		PMULULQ X10, X0
   352  		PMULULQ X10, X11
   353  		PSRLQ   $47, X0
   354  		PSRLQ   $47, X11
   355  		// Merge the two registers back to one, X11, and add maskA.
   356  		PSLLQ $32, X11
   357  		XORPS X0, X11
   358  		PADDD X11, X2
   359  		// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
   360  		PSHUFB X6, X2
   361  		MOVL   X2, (DI)
   362  		`
   363  	opSrcStore4 = `
   364  		// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
   365  		// copy(dst[:4], low4BytesOf(z))
   366  		PSHUFB X6, X2
   367  		MOVL   X2, (DI)
   368  		`
   369  	maskStore4 = `
   370  		// copy(dst[:4], z)
   371  		MOVOU X2, (DI)
   372  		`
   373  
   374  	opOverStore1 = `
   375  		// Blend over the dst's prior value.
   376  		//
   377  		// dstA := uint32(dst[0]) * 0x101
   378  		// maskA := z
   379  		// outA := dstA*(0xffff-maskA)/0xffff + maskA
   380  		// dst[0] = uint8(outA >> 8)
   381  		MOVBLZX (DI), R12
   382  		IMULL   $0x101, R12
   383  		MOVL    X2, R13
   384  		MOVL    $0xffff, AX
   385  		SUBL    R13, AX
   386  		MULL    R12             // MULL's implicit arg is AX, and the result is stored in DX:AX.
   387  		MOVL    $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant...
   388  		MULL    BX              // MULL's implicit arg is AX, and the result is stored in DX:AX.
   389  		SHRL    $15, DX         // ...and then shift by another magic constant (47 - 32 = 15).
   390  		ADDL    DX, R13
   391  		SHRL    $8, R13
   392  		MOVB    R13, (DI)
   393  		`
   394  	opSrcStore1 = `
   395  		// dst[0] = uint8(z>>8)
   396  		MOVL X2, BX
   397  		SHRL $8, BX
   398  		MOVB BX, (DI)
   399  		`
   400  	maskStore1 = `
   401  		// dst[0] = uint32(z)
   402  		MOVL X2, (DI)
   403  		`
   404  
   405  	opOverXMM6 = `gather`
   406  	opSrcXMM6  = `gather`
   407  	maskXMM6   = `-`
   408  
   409  	opOverXMM8 = `scatterAndMulBy0x101`
   410  	opSrcXMM8  = `-`
   411  	maskXMM8   = `-`
   412  
   413  	opOverXMM9 = `fxAlmost65536`
   414  	opSrcXMM9  = `-`
   415  	maskXMM9   = `-`
   416  
   417  	opOverXMM10 = `inverseFFFF`
   418  	opSrcXMM10  = `-`
   419  	maskXMM10   = `-`
   420  
   421  	opOverLoadXMMRegs = `
   422  		// gather               := XMM(see above)                      // PSHUFB shuffle mask.
   423  		// scatterAndMulBy0x101 := XMM(see above)                      // PSHUFB shuffle mask.
   424  		// fxAlmost65536        := XMM(0x0000ffff repeated four times) // 0xffff.
   425  		// inverseFFFF          := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
   426  		MOVOU gather<>(SB), X6
   427  		MOVOU scatterAndMulBy0x101<>(SB), X8
   428  		MOVOU fxAlmost65536<>(SB), X9
   429  		MOVOU inverseFFFF<>(SB), X10
   430  		`
   431  	opSrcLoadXMMRegs = `
   432  		// gather := XMM(see above) // PSHUFB shuffle mask.
   433  		MOVOU gather<>(SB), X6
   434  		`
   435  	maskLoadXMMRegs = ``
   436  )
   437
View as plain text