// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// Naming: fl is short for floating point math. fx is short for fixed point
// math.
//
// Every constant below is a file-local symbol (the <> suffix) of 16 bytes,
// the width of one XMM register, declared read-only and pointer-free. Each
// one is written as two 8-byte DATA halves, and each half packs two 32-bit
// lanes, so the same 32-bit value appears in all four lanes unless noted.

// flAlmost65536 is four copies of the float32 bit pattern 0x477fffff, which
// is 65535.99609375: the largest float32 that is strictly less than 65536.
DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff
DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff

// flOne is four copies of the float32 bit pattern 0x3f800000, which is 1.0.
DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000
DATA flOne<>+0x08(SB)/8, $0x3f8000003f800000

// flSignMask is four copies of 0x7fffffff: every bit set except the top
// (sign) bit of each 32-bit lane.
//
// NOTE(review): presumably ANDed with float32 lanes to clear their sign bits
// (absolute value). The use site is in the template fields, not in this
// file's visible text; confirm there.
DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff
DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff

// scatterAndMulBy0x101 is a PSHUFB mask that brings the low four bytes of an
// XMM register to the low byte of that register's four uint32 values. It
// duplicates those bytes, effectively multiplying each uint32 by 0x101.
//
// It transforms a little-endian 16-byte XMM value from
//	ijkl????????????
// to
//	ii00jj00kk00ll00
//
// Read in memory order the mask bytes are
//	00 00 80 80  01 01 80 80  02 02 80 80  03 03 80 80
// where a mask byte with its top bit set (0x80) makes PSHUFB write zero.
DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000
DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202

// gather is a PSHUFB mask that brings the second-lowest byte of the XMM
// register's four uint32 values to the low four bytes of that register.
//
// It transforms a little-endian 16-byte XMM value from
//	?i???j???k???l??
// to
//	ijkl000000000000
//
// Read in memory order the mask bytes are
//	01 05 09 0d  80 80 80 80  80 80 80 80  80 80 80 80
// that is, source bytes 1, 5, 9 and 13 followed by twelve zeroed bytes.
DATA gather<>+0x00(SB)/8, $0x808080800d090501
DATA gather<>+0x08(SB)/8, $0x8080808080808080

// fxAlmost65536 is four copies of the uint32 value 0x0000ffff (65535).
DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff
DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff

// inverseFFFF is four copies of the uint32 value 0x80008001, which equals
// (1<<47 + 32767) / 0xffff, i.e. 2^47 / 0xffff rounded up.
//
// NOTE(review): presumably a multiply-and-shift reciprocal for dividing by
// 0xffff (multiply, then shift right by 47). The use site is in the template
// fields, not in this file's visible text; confirm there.
DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001
DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001

// Symbol declarations: NOPTR marks the data as containing no Go pointers,
// RODATA places it in read-only memory, and $16 is the size in bytes.
GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16
GLOBL flOne<>(SB), (NOPTR+RODATA), $16
GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16
GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16
GLOBL gather<>(SB), (NOPTR+RODATA), $16
GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16
GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16

// func haveSSE4_1() bool
//
// haveSSE4_1 reports whether the CPU advertises SSE4.1 support. It executes
// CPUID with AX = 1 (the processor feature leaf) and returns bit 19 of CX,
// which is the SSE4.1 feature flag.
//
// Frame: no locals ($0) and one byte of arguments/results (the bool result),
// hence $0-1.
//
// Clobbers AX, BX, CX and DX (all four are written by CPUID) and the flags.
TEXT ·haveSSE4_1(SB), NOSPLIT, $0-1
	// Select CPUID leaf 1.
	MOVQ $1, AX
	CPUID

	// Move feature bit 19 down to bit 0 and discard every other bit, so
	// that the stored byte is exactly 0 or 1.
	SHRQ $19, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET

// ----------------------------------------------------------------------------

// func {{.LongName}}SIMD({{.Args}})
//
// This is a template for a family of SIMD accumulation routines. Each
// instantiation walks src, keeps a running (cumulative) sum of its elements,
// post-processes every partial sum with the ClampAndScale and ConvertToInt32
// template fields, and writes the result to dst through the Store4 / Store1
// template fields. The bulk of the input is handled four elements at a time;
// the remaining 0 to 3 elements are handled one at a time.
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
//
//	xmm0	scratch
//	xmm1	x
//	xmm2	y, z
//	xmm3	{{.XMM3}}
//	xmm4	{{.XMM4}}
//	xmm5	{{.XMM5}}
//	xmm6	{{.XMM6}}
//	xmm7	offset
//	xmm8	{{.XMM8}}
//	xmm9	{{.XMM9}}
//	xmm10	{{.XMM10}}
//
// General purpose registers, as used by the code visible in this template:
//
//	DI	dst pointer, advanced after every store
//	SI	src pointer, advanced by 4 bytes per element
//	R9	i, the index of the next element to process
//	R10	len(src) &^ 3, the limit of the four-at-a-time loop
//	R11	len(src), the limit of the one-at-a-time loop
//
// NOTE(review): the LoadArgs template field is assumed to leave len(src) in
// R10 and the dst / src base pointers in DI / SI; that field is not visible
// here, so confirm against the generator.
TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-{{.ArgsSize}}
	{{.LoadArgs}}

	// R10 = len(src) &^ 3
	// R11 = len(src)
	//
	// On entry to this step R10 holds len(src). Copy it before rounding
	// R10 down to a multiple of four (AND with -4 clears the low two bits).
	MOVQ R10, R11
	ANDQ $-4, R10

	{{.Setup}}

	{{.LoadXMMRegs}}

	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
	XORPS X7, X7

	// i := 0
	MOVQ $0, R9

{{.ShortName}}Loop4:
	// for i < (len(src) &^ 3)
	//
	// The comparison is unsigned (JAE), matching the fact that both the
	// index and the length are non-negative counts.
	CMPQ R9, R10
	JAE  {{.ShortName}}Loop1

	// x = XMM(s0, s1, s2, s3)
	//
	// Where s0 is src[i+0], s1 is src[i+1], etc. The load is unaligned
	// (MOVOU) because nothing here establishes 16-byte alignment of SI.
	MOVOU (SI), X1

	// scratch = XMM(0, s0, s1, s2)
	// x += scratch                                  // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
	//
	// PSLLO shifts the whole 128-bit register left by 4 bytes, i.e. by one
	// 32-bit lane, shifting in zeroes.
	MOVOU X1, X0
	PSLLO $4, X0
	{{.Add}} X0, X1

	// scratch = XMM(0, 0, 0, 0)
	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
	// x += scratch                                  // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
	//
	// The SHUFPS immediate 0x40 is the lane selector (0, 0, 0, 1): the two
	// low result lanes come from scratch, the two high ones from x.
	XORPS  X0, X0
	SHUFPS $0x40, X1, X0
	{{.Add}} X0, X1

	// x += offset
	//
	// x now holds the four running totals including everything summed in
	// earlier iterations.
	{{.Add}} X7, X1

	{{.ClampAndScale}}

	{{.ConvertToInt32}}

	{{.Store4}}

	// offset = XMM(x@3, x@3, x@3, x@3)
	//
	// The SHUFPS immediate 0xff selects lane 3 for all four result lanes,
	// broadcasting the last running total for use by the next iteration.
	//
	// NOTE(review): this reads X1 after the three template fields above,
	// so it assumes they leave X1 (x) unmodified; confirm against the
	// generator.
	MOVOU  X1, X7
	SHUFPS $0xff, X1, X7

	// i += 4
	// dst = dst[4:]
	// src = src[4:]
	ADDQ $4, R9
	ADDQ ${{.DstElemSize4}}, DI
	ADDQ $16, SI
	JMP  {{.ShortName}}Loop4

{{.ShortName}}Loop1:
	// for i < len(src)
	CMPQ R9, R11
	JAE  {{.ShortName}}End

	// x = src[i] + offset
	//
	// Only the low 32-bit lane of x and of offset is meaningful in this
	// loop: a single 4-byte element is loaded.
	MOVL (SI), X1
	{{.Add}} X7, X1

	{{.ClampAndScale}}

	{{.ConvertToInt32}}

	{{.Store1}}

	// offset = x
	MOVOU X1, X7

	// i += 1
	// dst = dst[1:]
	// src = src[1:]
	ADDQ $1, R9
	ADDQ ${{.DstElemSize1}}, DI
	ADDQ $4, SI
	JMP  {{.ShortName}}Loop1

{{.ShortName}}End:
	RET