1// generated by go run gen.go; DO NOT EDIT
2
3// +build !appengine
4// +build gc
5// +build !noasm
6
7#include "textflag.h"
8
9// fl is short for floating point math. fx is short for fixed point math.
10
11DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff
12DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff
13DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000
14DATA flOne<>+0x08(SB)/8, $0x3f8000003f800000
15DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff
16DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff
17
18// scatterAndMulBy0x101 is a PSHUFB mask that brings the low four bytes of an
19// XMM register to the low byte of that register's four uint32 values. It
20// duplicates those bytes, effectively multiplying each uint32 by 0x101.
21//
22// It transforms a little-endian 16-byte XMM value from
23// ijkl????????????
24// to
25// ii00jj00kk00ll00
26DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000
27DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202
28
29// gather is a PSHUFB mask that brings the second-lowest byte of the XMM
30// register's four uint32 values to the low four bytes of that register.
31//
32// It transforms a little-endian 16-byte XMM value from
33// ?i???j???k???l??
34// to
35// ijkl000000000000
36DATA gather<>+0x00(SB)/8, $0x808080800d090501
37DATA gather<>+0x08(SB)/8, $0x8080808080808080
38
39DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff
40DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff
41DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001
42DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001
43
44GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16
45GLOBL flOne<>(SB), (NOPTR+RODATA), $16
46GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16
47GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16
48GLOBL gather<>(SB), (NOPTR+RODATA), $16
49GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16
50GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16
51
52// func haveSSE4_1() bool
53TEXT ·haveSSE4_1(SB), NOSPLIT, $0
54 MOVQ $1, AX
55 CPUID
56 SHRQ $19, CX
57 ANDQ $1, CX
58 MOVB CX, ret+0(FP)
59 RET
60
61// ----------------------------------------------------------------------------
62
63// func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32)
64//
65// XMM registers. Variable names are per
66// https://github.com/google/font-rs/blob/master/src/accumulate.c
67//
68// xmm0 scratch
69// xmm1 x
70// xmm2 y, z
71// xmm3 -
72// xmm4 -
73// xmm5 fxAlmost65536
74// xmm6 gather
75// xmm7 offset
76// xmm8 scatterAndMulBy0x101
77// xmm9 fxAlmost65536
78// xmm10 inverseFFFF
79TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48
80
81 MOVQ dst_base+0(FP), DI
82 MOVQ dst_len+8(FP), BX
83 MOVQ src_base+24(FP), SI
84 MOVQ src_len+32(FP), R10
85
86 // Sanity check that len(dst) >= len(src).
87 CMPQ BX, R10
88 JLT fxAccOpOverEnd
89
90 // R10 = len(src) &^ 3
91 // R11 = len(src)
92 MOVQ R10, R11
93 ANDQ $-4, R10
94
95 // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
96 MOVOU fxAlmost65536<>(SB), X5
97
98 // gather := XMM(see above) // PSHUFB shuffle mask.
99 // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
100 // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
101 // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
102 MOVOU gather<>(SB), X6
103 MOVOU scatterAndMulBy0x101<>(SB), X8
104 MOVOU fxAlmost65536<>(SB), X9
105 MOVOU inverseFFFF<>(SB), X10
106
107 // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
108 XORPS X7, X7
109
110 // i := 0
111 MOVQ $0, R9
112
113fxAccOpOverLoop4:
114 // for i < (len(src) &^ 3)
115 CMPQ R9, R10
116 JAE fxAccOpOverLoop1
117
118 // x = XMM(s0, s1, s2, s3)
119 //
120 // Where s0 is src[i+0], s1 is src[i+1], etc.
121 MOVOU (SI), X1
122
123 // scratch = XMM(0, s0, s1, s2)
124 // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
125 MOVOU X1, X0
126 PSLLO $4, X0
127 PADDD X0, X1
128
129 // scratch = XMM(0, 0, 0, 0)
130 // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
131 // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
132 XORPS X0, X0
133 SHUFPS $0x40, X1, X0
134 PADDD X0, X1
135
136 // x += offset
137 PADDD X7, X1
138
139 // y = abs(x)
140 // y >>= 2 // Shift by 2*ϕ - 16.
141 // y = min(y, fxAlmost65536)
142 PABSD X1, X2
143 PSRLL $2, X2
144 PMINUD X5, X2
145
146 // z = convertToInt32(y)
147 // No-op.
148
149 // Blend over the dst's prior value. SIMD for i in 0..3:
150 //
151 // dstA := uint32(dst[i]) * 0x101
152 // maskA := z@i
153 // outA := dstA*(0xffff-maskA)/0xffff + maskA
154 // dst[i] = uint8(outA >> 8)
155 //
156 // First, set X0 to dstA*(0xfff-maskA).
157 MOVL (DI), X0
158 PSHUFB X8, X0
159 MOVOU X9, X11
160 PSUBL X2, X11
161 PMULLD X11, X0
162
163 // We implement uint32 division by 0xffff as multiplication by a magic
164 // constant (0x800080001) and then a shift by a magic constant (47).
165 // See TestDivideByFFFF for a justification.
166 //
167 // That multiplication widens from uint32 to uint64, so we have to
168 // duplicate and shift our four uint32s from one XMM register (X0) to
169 // two XMM registers (X0 and X11).
170 //
171 // Move the second and fourth uint32s in X0 to be the first and third
172 // uint32s in X11.
173 MOVOU X0, X11
174 PSRLQ $32, X11
175
176 // Multiply by magic, shift by magic.
177 PMULULQ X10, X0
178 PMULULQ X10, X11
179 PSRLQ $47, X0
180 PSRLQ $47, X11
181
182 // Merge the two registers back to one, X11, and add maskA.
183 PSLLQ $32, X11
184 XORPS X0, X11
185 PADDD X11, X2
186
187 // As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
188 PSHUFB X6, X2
189 MOVL X2, (DI)
190
191 // offset = XMM(x@3, x@3, x@3, x@3)
192 MOVOU X1, X7
193 SHUFPS $0xff, X1, X7
194
195 // i += 4
196 // dst = dst[4:]
197 // src = src[4:]
198 ADDQ $4, R9
199 ADDQ $4, DI
200 ADDQ $16, SI
201 JMP fxAccOpOverLoop4
202
203fxAccOpOverLoop1:
204 // for i < len(src)
205 CMPQ R9, R11
206 JAE fxAccOpOverEnd
207
208 // x = src[i] + offset
209 MOVL (SI), X1
210 PADDD X7, X1
211
212 // y = abs(x)
213 // y >>= 2 // Shift by 2*ϕ - 16.
214 // y = min(y, fxAlmost65536)
215 PABSD X1, X2
216 PSRLL $2, X2
217 PMINUD X5, X2
218
219 // z = convertToInt32(y)
220 // No-op.
221
222 // Blend over the dst's prior value.
223 //
224 // dstA := uint32(dst[0]) * 0x101
225 // maskA := z
226 // outA := dstA*(0xffff-maskA)/0xffff + maskA
227 // dst[0] = uint8(outA >> 8)
228 MOVBLZX (DI), R12
229 IMULL $0x101, R12
230 MOVL X2, R13
231 MOVL $0xffff, AX
232 SUBL R13, AX
233 MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX.
234 MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant...
235 MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX.
236 SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15).
237 ADDL DX, R13
238 SHRL $8, R13
239 MOVB R13, (DI)
240
241 // offset = x
242 MOVOU X1, X7
243
244 // i += 1
245 // dst = dst[1:]
246 // src = src[1:]
247 ADDQ $1, R9
248 ADDQ $1, DI
249 ADDQ $4, SI
250 JMP fxAccOpOverLoop1
251
252fxAccOpOverEnd:
253 RET
254
255// ----------------------------------------------------------------------------
256
257// func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)
258//
259// XMM registers. Variable names are per
260// https://github.com/google/font-rs/blob/master/src/accumulate.c
261//
262// xmm0 scratch
263// xmm1 x
264// xmm2 y, z
265// xmm3 -
266// xmm4 -
267// xmm5 fxAlmost65536
268// xmm6 gather
269// xmm7 offset
270// xmm8 -
271// xmm9 -
272// xmm10 -
273TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
274
275 MOVQ dst_base+0(FP), DI
276 MOVQ dst_len+8(FP), BX
277 MOVQ src_base+24(FP), SI
278 MOVQ src_len+32(FP), R10
279
280 // Sanity check that len(dst) >= len(src).
281 CMPQ BX, R10
282 JLT fxAccOpSrcEnd
283
284 // R10 = len(src) &^ 3
285 // R11 = len(src)
286 MOVQ R10, R11
287 ANDQ $-4, R10
288
289 // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
290 MOVOU fxAlmost65536<>(SB), X5
291
292 // gather := XMM(see above) // PSHUFB shuffle mask.
293 MOVOU gather<>(SB), X6
294
295 // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
296 XORPS X7, X7
297
298 // i := 0
299 MOVQ $0, R9
300
301fxAccOpSrcLoop4:
302 // for i < (len(src) &^ 3)
303 CMPQ R9, R10
304 JAE fxAccOpSrcLoop1
305
306 // x = XMM(s0, s1, s2, s3)
307 //
308 // Where s0 is src[i+0], s1 is src[i+1], etc.
309 MOVOU (SI), X1
310
311 // scratch = XMM(0, s0, s1, s2)
312 // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
313 MOVOU X1, X0
314 PSLLO $4, X0
315 PADDD X0, X1
316
317 // scratch = XMM(0, 0, 0, 0)
318 // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
319 // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
320 XORPS X0, X0
321 SHUFPS $0x40, X1, X0
322 PADDD X0, X1
323
324 // x += offset
325 PADDD X7, X1
326
327 // y = abs(x)
328 // y >>= 2 // Shift by 2*ϕ - 16.
329 // y = min(y, fxAlmost65536)
330 PABSD X1, X2
331 PSRLL $2, X2
332 PMINUD X5, X2
333
334 // z = convertToInt32(y)
335 // No-op.
336
337 // z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
338 // copy(dst[:4], low4BytesOf(z))
339 PSHUFB X6, X2
340 MOVL X2, (DI)
341
342 // offset = XMM(x@3, x@3, x@3, x@3)
343 MOVOU X1, X7
344 SHUFPS $0xff, X1, X7
345
346 // i += 4
347 // dst = dst[4:]
348 // src = src[4:]
349 ADDQ $4, R9
350 ADDQ $4, DI
351 ADDQ $16, SI
352 JMP fxAccOpSrcLoop4
353
354fxAccOpSrcLoop1:
355 // for i < len(src)
356 CMPQ R9, R11
357 JAE fxAccOpSrcEnd
358
359 // x = src[i] + offset
360 MOVL (SI), X1
361 PADDD X7, X1
362
363 // y = abs(x)
364 // y >>= 2 // Shift by 2*ϕ - 16.
365 // y = min(y, fxAlmost65536)
366 PABSD X1, X2
367 PSRLL $2, X2
368 PMINUD X5, X2
369
370 // z = convertToInt32(y)
371 // No-op.
372
373 // dst[0] = uint8(z>>8)
374 MOVL X2, BX
375 SHRL $8, BX
376 MOVB BX, (DI)
377
378 // offset = x
379 MOVOU X1, X7
380
381 // i += 1
382 // dst = dst[1:]
383 // src = src[1:]
384 ADDQ $1, R9
385 ADDQ $1, DI
386 ADDQ $4, SI
387 JMP fxAccOpSrcLoop1
388
389fxAccOpSrcEnd:
390 RET
391
392// ----------------------------------------------------------------------------
393
394// func fixedAccumulateMaskSIMD(buf []uint32)
395//
396// XMM registers. Variable names are per
397// https://github.com/google/font-rs/blob/master/src/accumulate.c
398//
399// xmm0 scratch
400// xmm1 x
401// xmm2 y, z
402// xmm3 -
403// xmm4 -
404// xmm5 fxAlmost65536
405// xmm6 -
406// xmm7 offset
407// xmm8 -
408// xmm9 -
409// xmm10 -
410TEXT ·fixedAccumulateMaskSIMD(SB), NOSPLIT, $0-24
411
412 MOVQ buf_base+0(FP), DI
413 MOVQ buf_len+8(FP), BX
414 MOVQ buf_base+0(FP), SI
415 MOVQ buf_len+8(FP), R10
416
417 // R10 = len(src) &^ 3
418 // R11 = len(src)
419 MOVQ R10, R11
420 ANDQ $-4, R10
421
422 // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
423 MOVOU fxAlmost65536<>(SB), X5
424
425 // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
426 XORPS X7, X7
427
428 // i := 0
429 MOVQ $0, R9
430
431fxAccMaskLoop4:
432 // for i < (len(src) &^ 3)
433 CMPQ R9, R10
434 JAE fxAccMaskLoop1
435
436 // x = XMM(s0, s1, s2, s3)
437 //
438 // Where s0 is src[i+0], s1 is src[i+1], etc.
439 MOVOU (SI), X1
440
441 // scratch = XMM(0, s0, s1, s2)
442 // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
443 MOVOU X1, X0
444 PSLLO $4, X0
445 PADDD X0, X1
446
447 // scratch = XMM(0, 0, 0, 0)
448 // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
449 // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
450 XORPS X0, X0
451 SHUFPS $0x40, X1, X0
452 PADDD X0, X1
453
454 // x += offset
455 PADDD X7, X1
456
457 // y = abs(x)
458 // y >>= 2 // Shift by 2*ϕ - 16.
459 // y = min(y, fxAlmost65536)
460 PABSD X1, X2
461 PSRLL $2, X2
462 PMINUD X5, X2
463
464 // z = convertToInt32(y)
465 // No-op.
466
467 // copy(dst[:4], z)
468 MOVOU X2, (DI)
469
470 // offset = XMM(x@3, x@3, x@3, x@3)
471 MOVOU X1, X7
472 SHUFPS $0xff, X1, X7
473
474 // i += 4
475 // dst = dst[4:]
476 // src = src[4:]
477 ADDQ $4, R9
478 ADDQ $16, DI
479 ADDQ $16, SI
480 JMP fxAccMaskLoop4
481
482fxAccMaskLoop1:
483 // for i < len(src)
484 CMPQ R9, R11
485 JAE fxAccMaskEnd
486
487 // x = src[i] + offset
488 MOVL (SI), X1
489 PADDD X7, X1
490
491 // y = abs(x)
492 // y >>= 2 // Shift by 2*ϕ - 16.
493 // y = min(y, fxAlmost65536)
494 PABSD X1, X2
495 PSRLL $2, X2
496 PMINUD X5, X2
497
498 // z = convertToInt32(y)
499 // No-op.
500
501 // dst[0] = uint32(z)
502 MOVL X2, (DI)
503
504 // offset = x
505 MOVOU X1, X7
506
507 // i += 1
508 // dst = dst[1:]
509 // src = src[1:]
510 ADDQ $1, R9
511 ADDQ $4, DI
512 ADDQ $4, SI
513 JMP fxAccMaskLoop1
514
515fxAccMaskEnd:
516 RET
517
518// ----------------------------------------------------------------------------
519
520// func floatingAccumulateOpOverSIMD(dst []uint8, src []float32)
521//
522// XMM registers. Variable names are per
523// https://github.com/google/font-rs/blob/master/src/accumulate.c
524//
525// xmm0 scratch
526// xmm1 x
527// xmm2 y, z
528// xmm3 flSignMask
529// xmm4 flOne
530// xmm5 flAlmost65536
531// xmm6 gather
532// xmm7 offset
533// xmm8 scatterAndMulBy0x101
534// xmm9 fxAlmost65536
535// xmm10 inverseFFFF
536TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
537
538 MOVQ dst_base+0(FP), DI
539 MOVQ dst_len+8(FP), BX
540 MOVQ src_base+24(FP), SI
541 MOVQ src_len+32(FP), R10
542
543 // Sanity check that len(dst) >= len(src).
544 CMPQ BX, R10
545 JLT flAccOpOverEnd
546
547 // R10 = len(src) &^ 3
548 // R11 = len(src)
549 MOVQ R10, R11
550 ANDQ $-4, R10
551
552 // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
553 // "Round To Zero".
554 STMXCSR mxcsrOrig-8(SP)
555 MOVL mxcsrOrig-8(SP), AX
556 ORL $0x6000, AX
557 MOVL AX, mxcsrNew-4(SP)
558
559 // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
560 // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
561 // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
562 MOVOU flSignMask<>(SB), X3
563 MOVOU flOne<>(SB), X4
564 MOVOU flAlmost65536<>(SB), X5
565
566 // gather := XMM(see above) // PSHUFB shuffle mask.
567 // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
568 // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
569 // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
570 MOVOU gather<>(SB), X6
571 MOVOU scatterAndMulBy0x101<>(SB), X8
572 MOVOU fxAlmost65536<>(SB), X9
573 MOVOU inverseFFFF<>(SB), X10
574
575 // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
576 XORPS X7, X7
577
578 // i := 0
579 MOVQ $0, R9
580
581flAccOpOverLoop4:
582 // for i < (len(src) &^ 3)
583 CMPQ R9, R10
584 JAE flAccOpOverLoop1
585
586 // x = XMM(s0, s1, s2, s3)
587 //
588 // Where s0 is src[i+0], s1 is src[i+1], etc.
589 MOVOU (SI), X1
590
591 // scratch = XMM(0, s0, s1, s2)
592 // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
593 MOVOU X1, X0
594 PSLLO $4, X0
595 ADDPS X0, X1
596
597 // scratch = XMM(0, 0, 0, 0)
598 // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
599 // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
600 XORPS X0, X0
601 SHUFPS $0x40, X1, X0
602 ADDPS X0, X1
603
604 // x += offset
605 ADDPS X7, X1
606
607 // y = x & flSignMask
608 // y = min(y, flOne)
609 // y = mul(y, flAlmost65536)
610 MOVOU X3, X2
611 ANDPS X1, X2
612 MINPS X4, X2
613 MULPS X5, X2
614
615 // z = convertToInt32(y)
616 LDMXCSR mxcsrNew-4(SP)
617 CVTPS2PL X2, X2
618 LDMXCSR mxcsrOrig-8(SP)
619
620 // Blend over the dst's prior value. SIMD for i in 0..3:
621 //
622 // dstA := uint32(dst[i]) * 0x101
623 // maskA := z@i
624 // outA := dstA*(0xffff-maskA)/0xffff + maskA
625 // dst[i] = uint8(outA >> 8)
626 //
627 // First, set X0 to dstA*(0xfff-maskA).
628 MOVL (DI), X0
629 PSHUFB X8, X0
630 MOVOU X9, X11
631 PSUBL X2, X11
632 PMULLD X11, X0
633
634 // We implement uint32 division by 0xffff as multiplication by a magic
635 // constant (0x800080001) and then a shift by a magic constant (47).
636 // See TestDivideByFFFF for a justification.
637 //
638 // That multiplication widens from uint32 to uint64, so we have to
639 // duplicate and shift our four uint32s from one XMM register (X0) to
640 // two XMM registers (X0 and X11).
641 //
642 // Move the second and fourth uint32s in X0 to be the first and third
643 // uint32s in X11.
644 MOVOU X0, X11
645 PSRLQ $32, X11
646
647 // Multiply by magic, shift by magic.
648 PMULULQ X10, X0
649 PMULULQ X10, X11
650 PSRLQ $47, X0
651 PSRLQ $47, X11
652
653 // Merge the two registers back to one, X11, and add maskA.
654 PSLLQ $32, X11
655 XORPS X0, X11
656 PADDD X11, X2
657
658 // As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
659 PSHUFB X6, X2
660 MOVL X2, (DI)
661
662 // offset = XMM(x@3, x@3, x@3, x@3)
663 MOVOU X1, X7
664 SHUFPS $0xff, X1, X7
665
666 // i += 4
667 // dst = dst[4:]
668 // src = src[4:]
669 ADDQ $4, R9
670 ADDQ $4, DI
671 ADDQ $16, SI
672 JMP flAccOpOverLoop4
673
674flAccOpOverLoop1:
675 // for i < len(src)
676 CMPQ R9, R11
677 JAE flAccOpOverEnd
678
679 // x = src[i] + offset
680 MOVL (SI), X1
681 ADDPS X7, X1
682
683 // y = x & flSignMask
684 // y = min(y, flOne)
685 // y = mul(y, flAlmost65536)
686 MOVOU X3, X2
687 ANDPS X1, X2
688 MINPS X4, X2
689 MULPS X5, X2
690
691 // z = convertToInt32(y)
692 LDMXCSR mxcsrNew-4(SP)
693 CVTPS2PL X2, X2
694 LDMXCSR mxcsrOrig-8(SP)
695
696 // Blend over the dst's prior value.
697 //
698 // dstA := uint32(dst[0]) * 0x101
699 // maskA := z
700 // outA := dstA*(0xffff-maskA)/0xffff + maskA
701 // dst[0] = uint8(outA >> 8)
702 MOVBLZX (DI), R12
703 IMULL $0x101, R12
704 MOVL X2, R13
705 MOVL $0xffff, AX
706 SUBL R13, AX
707 MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX.
708 MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant...
709 MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX.
710 SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15).
711 ADDL DX, R13
712 SHRL $8, R13
713 MOVB R13, (DI)
714
715 // offset = x
716 MOVOU X1, X7
717
718 // i += 1
719 // dst = dst[1:]
720 // src = src[1:]
721 ADDQ $1, R9
722 ADDQ $1, DI
723 ADDQ $4, SI
724 JMP flAccOpOverLoop1
725
726flAccOpOverEnd:
727 RET
728
729// ----------------------------------------------------------------------------
730
731// func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
732//
733// XMM registers. Variable names are per
734// https://github.com/google/font-rs/blob/master/src/accumulate.c
735//
736// xmm0 scratch
737// xmm1 x
738// xmm2 y, z
739// xmm3 flSignMask
740// xmm4 flOne
741// xmm5 flAlmost65536
742// xmm6 gather
743// xmm7 offset
744// xmm8 -
745// xmm9 -
746// xmm10 -
747TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
748
749 MOVQ dst_base+0(FP), DI
750 MOVQ dst_len+8(FP), BX
751 MOVQ src_base+24(FP), SI
752 MOVQ src_len+32(FP), R10
753
754 // Sanity check that len(dst) >= len(src).
755 CMPQ BX, R10
756 JLT flAccOpSrcEnd
757
758 // R10 = len(src) &^ 3
759 // R11 = len(src)
760 MOVQ R10, R11
761 ANDQ $-4, R10
762
763 // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
764 // "Round To Zero".
765 STMXCSR mxcsrOrig-8(SP)
766 MOVL mxcsrOrig-8(SP), AX
767 ORL $0x6000, AX
768 MOVL AX, mxcsrNew-4(SP)
769
770 // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
771 // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
772 // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
773 MOVOU flSignMask<>(SB), X3
774 MOVOU flOne<>(SB), X4
775 MOVOU flAlmost65536<>(SB), X5
776
777 // gather := XMM(see above) // PSHUFB shuffle mask.
778 MOVOU gather<>(SB), X6
779
780 // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
781 XORPS X7, X7
782
783 // i := 0
784 MOVQ $0, R9
785
786flAccOpSrcLoop4:
787 // for i < (len(src) &^ 3)
788 CMPQ R9, R10
789 JAE flAccOpSrcLoop1
790
791 // x = XMM(s0, s1, s2, s3)
792 //
793 // Where s0 is src[i+0], s1 is src[i+1], etc.
794 MOVOU (SI), X1
795
796 // scratch = XMM(0, s0, s1, s2)
797 // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
798 MOVOU X1, X0
799 PSLLO $4, X0
800 ADDPS X0, X1
801
802 // scratch = XMM(0, 0, 0, 0)
803 // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
804 // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
805 XORPS X0, X0
806 SHUFPS $0x40, X1, X0
807 ADDPS X0, X1
808
809 // x += offset
810 ADDPS X7, X1
811
812 // y = x & flSignMask
813 // y = min(y, flOne)
814 // y = mul(y, flAlmost65536)
815 MOVOU X3, X2
816 ANDPS X1, X2
817 MINPS X4, X2
818 MULPS X5, X2
819
820 // z = convertToInt32(y)
821 LDMXCSR mxcsrNew-4(SP)
822 CVTPS2PL X2, X2
823 LDMXCSR mxcsrOrig-8(SP)
824
825 // z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
826 // copy(dst[:4], low4BytesOf(z))
827 PSHUFB X6, X2
828 MOVL X2, (DI)
829
830 // offset = XMM(x@3, x@3, x@3, x@3)
831 MOVOU X1, X7
832 SHUFPS $0xff, X1, X7
833
834 // i += 4
835 // dst = dst[4:]
836 // src = src[4:]
837 ADDQ $4, R9
838 ADDQ $4, DI
839 ADDQ $16, SI
840 JMP flAccOpSrcLoop4
841
842flAccOpSrcLoop1:
843 // for i < len(src)
844 CMPQ R9, R11
845 JAE flAccOpSrcEnd
846
847 // x = src[i] + offset
848 MOVL (SI), X1
849 ADDPS X7, X1
850
851 // y = x & flSignMask
852 // y = min(y, flOne)
853 // y = mul(y, flAlmost65536)
854 MOVOU X3, X2
855 ANDPS X1, X2
856 MINPS X4, X2
857 MULPS X5, X2
858
859 // z = convertToInt32(y)
860 LDMXCSR mxcsrNew-4(SP)
861 CVTPS2PL X2, X2
862 LDMXCSR mxcsrOrig-8(SP)
863
864 // dst[0] = uint8(z>>8)
865 MOVL X2, BX
866 SHRL $8, BX
867 MOVB BX, (DI)
868
869 // offset = x
870 MOVOU X1, X7
871
872 // i += 1
873 // dst = dst[1:]
874 // src = src[1:]
875 ADDQ $1, R9
876 ADDQ $1, DI
877 ADDQ $4, SI
878 JMP flAccOpSrcLoop1
879
880flAccOpSrcEnd:
881 RET
882
883// ----------------------------------------------------------------------------
884
885// func floatingAccumulateMaskSIMD(dst []uint32, src []float32)
886//
887// XMM registers. Variable names are per
888// https://github.com/google/font-rs/blob/master/src/accumulate.c
889//
890// xmm0 scratch
891// xmm1 x
892// xmm2 y, z
893// xmm3 flSignMask
894// xmm4 flOne
895// xmm5 flAlmost65536
896// xmm6 -
897// xmm7 offset
898// xmm8 -
899// xmm9 -
900// xmm10 -
901TEXT ·floatingAccumulateMaskSIMD(SB), NOSPLIT, $8-48
902
903 MOVQ dst_base+0(FP), DI
904 MOVQ dst_len+8(FP), BX
905 MOVQ src_base+24(FP), SI
906 MOVQ src_len+32(FP), R10
907
908 // Sanity check that len(dst) >= len(src).
909 CMPQ BX, R10
910 JLT flAccMaskEnd
911
912 // R10 = len(src) &^ 3
913 // R11 = len(src)
914 MOVQ R10, R11
915 ANDQ $-4, R10
916
917 // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
918 // "Round To Zero".
919 STMXCSR mxcsrOrig-8(SP)
920 MOVL mxcsrOrig-8(SP), AX
921 ORL $0x6000, AX
922 MOVL AX, mxcsrNew-4(SP)
923
924 // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
925 // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
926 // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
927 MOVOU flSignMask<>(SB), X3
928 MOVOU flOne<>(SB), X4
929 MOVOU flAlmost65536<>(SB), X5
930
931 // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
932 XORPS X7, X7
933
934 // i := 0
935 MOVQ $0, R9
936
937flAccMaskLoop4:
938 // for i < (len(src) &^ 3)
939 CMPQ R9, R10
940 JAE flAccMaskLoop1
941
942 // x = XMM(s0, s1, s2, s3)
943 //
944 // Where s0 is src[i+0], s1 is src[i+1], etc.
945 MOVOU (SI), X1
946
947 // scratch = XMM(0, s0, s1, s2)
948 // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
949 MOVOU X1, X0
950 PSLLO $4, X0
951 ADDPS X0, X1
952
953 // scratch = XMM(0, 0, 0, 0)
954 // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
955 // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
956 XORPS X0, X0
957 SHUFPS $0x40, X1, X0
958 ADDPS X0, X1
959
960 // x += offset
961 ADDPS X7, X1
962
963 // y = x & flSignMask
964 // y = min(y, flOne)
965 // y = mul(y, flAlmost65536)
966 MOVOU X3, X2
967 ANDPS X1, X2
968 MINPS X4, X2
969 MULPS X5, X2
970
971 // z = convertToInt32(y)
972 LDMXCSR mxcsrNew-4(SP)
973 CVTPS2PL X2, X2
974 LDMXCSR mxcsrOrig-8(SP)
975
976 // copy(dst[:4], z)
977 MOVOU X2, (DI)
978
979 // offset = XMM(x@3, x@3, x@3, x@3)
980 MOVOU X1, X7
981 SHUFPS $0xff, X1, X7
982
983 // i += 4
984 // dst = dst[4:]
985 // src = src[4:]
986 ADDQ $4, R9
987 ADDQ $16, DI
988 ADDQ $16, SI
989 JMP flAccMaskLoop4
990
991flAccMaskLoop1:
992 // for i < len(src)
993 CMPQ R9, R11
994 JAE flAccMaskEnd
995
996 // x = src[i] + offset
997 MOVL (SI), X1
998 ADDPS X7, X1
999
1000 // y = x & flSignMask
1001 // y = min(y, flOne)
1002 // y = mul(y, flAlmost65536)
1003 MOVOU X3, X2
1004 ANDPS X1, X2
1005 MINPS X4, X2
1006 MULPS X5, X2
1007
1008 // z = convertToInt32(y)
1009 LDMXCSR mxcsrNew-4(SP)
1010 CVTPS2PL X2, X2
1011 LDMXCSR mxcsrOrig-8(SP)
1012
1013 // dst[0] = uint32(z)
1014 MOVL X2, (DI)
1015
1016 // offset = x
1017 MOVOU X1, X7
1018
1019 // i += 1
1020 // dst = dst[1:]
1021 // src = src[1:]
1022 ADDQ $1, R9
1023 ADDQ $4, DI
1024 ADDQ $4, SI
1025 JMP flAccMaskLoop1
1026
1027flAccMaskEnd:
1028 RET