1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17#include <stdint.h>
18#include <string.h>
19
20#include "arm_neon.h"
21
// Unpacks 32 zero-bit values: a 0-bit encoding carries no data, so all 32
// outputs are zero and no input words are consumed.
//
// in:  packed input stream (not advanced; 0 words consumed).
// out: destination for 32 unpacked uint32_t values.
// Returns `in` unchanged.
//
// `static` added to match every sibling unpacker: a plain C99/C11 `inline`
// function with no external definition elsewhere can fail to link when the
// compiler chooses not to inline a call site.
static inline const uint32_t* unpack0_32_neon(const uint32_t* in, uint32_t* out) {
  memset(out, 0, 32 * sizeof(uint32_t));
  return in;
}
29
// Unpacks 32 one-bit values from a single input word: output lane i is bit i
// of in[0] (LSB-first).  Four lanes at a time are shifted into place, loaded
// into a NEON register, and masked down to the low bit.
//
// in:  packed input stream; 1 word consumed.
// out: destination for 32 unpacked uint32_t values.
// Returns `in` advanced past the consumed word.
inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) {
  const uint32x4_t lsb_mask = vdupq_n_u32(0x1);
  uint32_t lanes[4];

  // Eight groups of 4 bits each cover all 32 bits of in[0].
  for (uint32_t base = 0; base < 32; base += 4) {
    lanes[0] = in[0] >> (base + 0);
    lanes[1] = in[0] >> (base + 1);
    lanes[2] = in[0] >> (base + 2);
    lanes[3] = in[0] >> (base + 3);
    vst1q_u32(out, vandq_u32(vld1q_u32(lanes), lsb_mask));
    out += 4;
  }

  return in + 1;
}
130
// Unpacks 32 two-bit values packed LSB-first: 16 values per input word,
// 2 words consumed.  Output lane i is bits [2i, 2i+1] of the stream.
//
// in:  packed input stream; 2 words consumed.
// out: destination for 32 unpacked uint32_t values.
// Returns `in` advanced past the consumed words.
inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) {
  const uint32x4_t two_bit_mask = vdupq_n_u32(0x3);
  uint32_t lanes[4];

  // Each word yields four groups of 4 values (8 bits of source per group).
  for (int word = 0; word < 2; word++) {
    for (uint32_t base = 0; base < 32; base += 8) {
      lanes[0] = in[word] >> (base + 0);
      lanes[1] = in[word] >> (base + 2);
      lanes[2] = in[word] >> (base + 4);
      lanes[3] = in[word] >> (base + 6);
      vst1q_u32(out, vandq_u32(vld1q_u32(lanes), two_bit_mask));
      out += 4;
    }
  }

  return in + 2;
}
228
// Unpacks 32 three-bit values packed LSB-first into 3 input words (96 bits).
// Each group of 4 outputs is formed by right-shifting the source word per
// lane, loading the 4 lanes into a NEON register, and masking to 3 bits.
// A value straddling a 32-bit word boundary is first reassembled from the
// top bits of the current word and the bottom bits of the next word.
// Returns `in` advanced by the 3 consumed words.
inline static const uint32_t* unpack3_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7;  // low 3 bits
  uint32_t ind[4];      // scratch: the 4 shifted lanes of each group
  uint32_t shifts_1st[4] = {0, 3, 6, 9};
  uint32_t shifts_2nd[4] = {12, 15, 18, 21};
  uint32_t shifts_3rd[4] = {24, 27, 0, 1};
  uint32_t shifts_4th[4] = {4, 7, 10, 13};
  uint32_t shifts_5th[4] = {16, 19, 22, 25};
  uint32_t shifts_6th[4] = {28, 0, 2, 5};
  uint32_t shifts_7th[4] = {8, 11, 14, 17};
  uint32_t shifts_8th[4] = {20, 23, 26, 29};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = in[0] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[0] >> shifts_2nd[0];
  ind[1] = in[0] >> shifts_2nd[1];
  ind[2] = in[0] >> shifts_2nd[2];
  ind[3] = in[0] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[0] >> shifts_3rd[0];
  ind[1] = in[0] >> shifts_3rd[1];
  // value #10 occupies bits 30..32: 2 low bits from in[0], 1 high bit from in[1]
  ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_3rd[2];
  ind[3] = in[1] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[1] >> shifts_4th[0];
  ind[1] = in[1] >> shifts_4th[1];
  ind[2] = in[1] >> shifts_4th[2];
  ind[3] = in[1] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[1] >> shifts_5th[0];
  ind[1] = in[1] >> shifts_5th[1];
  ind[2] = in[1] >> shifts_5th[2];
  ind[3] = in[1] >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[1] >> shifts_6th[0];
  // value #21 occupies bits 63..65: 1 low bit from in[1], 2 high bits from in[2]
  ind[1] = (in[1] >> 31 | in[2] << 1) >> shifts_6th[1];
  ind[2] = in[2] >> shifts_6th[2];
  ind[3] = in[2] >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[2] >> shifts_7th[0];
  ind[1] = in[2] >> shifts_7th[1];
  ind[2] = in[2] >> shifts_7th[2];
  ind[3] = in[2] >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[2] >> shifts_8th[0];
  ind[1] = in[2] >> shifts_8th[1];
  ind[2] = in[2] >> shifts_8th[2];
  ind[3] = in[2] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 3;

  return in;
}
329
// Unpacks 32 four-bit values packed LSB-first: 8 values per input word,
// 4 words consumed.  Output lane i is bits [4i, 4i+3] of the stream.
//
// in:  packed input stream; 4 words consumed.
// out: destination for 32 unpacked uint32_t values.
// Returns `in` advanced past the consumed words.
inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) {
  const uint32x4_t nibble_mask = vdupq_n_u32(0xf);
  uint32_t lanes[4];

  // Each word yields two groups of 4 values (16 bits of source per group).
  for (int word = 0; word < 4; word++) {
    for (uint32_t base = 0; base < 32; base += 16) {
      lanes[0] = in[word] >> (base + 0);
      lanes[1] = in[word] >> (base + 4);
      lanes[2] = in[word] >> (base + 8);
      lanes[3] = in[word] >> (base + 12);
      vst1q_u32(out, vandq_u32(vld1q_u32(lanes), nibble_mask));
      out += 4;
    }
  }

  return in + 4;
}
424
// Unpacks 32 five-bit values packed LSB-first into 5 input words (160 bits).
// Each group of 4 outputs is formed by right-shifting the source word per
// lane, loading the 4 lanes into a NEON register, and masking to 5 bits.
// A value straddling a 32-bit word boundary is first reassembled from the
// top bits of the current word and the bottom bits of the next word.
// Returns `in` advanced by the 5 consumed words.
inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1f;  // low 5 bits
  uint32_t ind[4];       // scratch: the 4 shifted lanes of each group
  uint32_t shifts_1st[4] = {0, 5, 10, 15};
  uint32_t shifts_2nd[4] = {20, 25, 0, 3};
  uint32_t shifts_3rd[4] = {8, 13, 18, 23};
  uint32_t shifts_4th[4] = {0, 1, 6, 11};
  uint32_t shifts_5th[4] = {16, 21, 26, 0};
  uint32_t shifts_6th[4] = {4, 9, 14, 19};
  uint32_t shifts_7th[4] = {24, 0, 2, 7};
  uint32_t shifts_8th[4] = {12, 17, 22, 27};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = in[0] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[0] >> shifts_2nd[0];
  ind[1] = in[0] >> shifts_2nd[1];
  // value #6 occupies bits 30..34: 2 low bits from in[0], 3 high bits from in[1]
  ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_2nd[2];
  ind[3] = in[1] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[1] >> shifts_3rd[0];
  ind[1] = in[1] >> shifts_3rd[1];
  ind[2] = in[1] >> shifts_3rd[2];
  ind[3] = in[1] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  // value #12 occupies bits 60..64: 4 low bits from in[1], 1 high bit from in[2]
  ind[0] = (in[1] >> 28 | in[2] << 4) >> shifts_4th[0];
  ind[1] = in[2] >> shifts_4th[1];
  ind[2] = in[2] >> shifts_4th[2];
  ind[3] = in[2] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[2] >> shifts_5th[0];
  ind[1] = in[2] >> shifts_5th[1];
  ind[2] = in[2] >> shifts_5th[2];
  // value #19 occupies bits 95..99: 1 low bit from in[2], 4 high bits from in[3]
  ind[3] = (in[2] >> 31 | in[3] << 1) >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[3] >> shifts_6th[0];
  ind[1] = in[3] >> shifts_6th[1];
  ind[2] = in[3] >> shifts_6th[2];
  ind[3] = in[3] >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[3] >> shifts_7th[0];
  // value #25 occupies bits 125..129: 3 low bits from in[3], 2 high bits from in[4]
  ind[1] = (in[3] >> 29 | in[4] << 3) >> shifts_7th[1];
  ind[2] = in[4] >> shifts_7th[2];
  ind[3] = in[4] >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[4] >> shifts_8th[0];
  ind[1] = in[4] >> shifts_8th[1];
  ind[2] = in[4] >> shifts_8th[2];
  ind[3] = in[4] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 5;

  return in;
}
525
// Unpacks 32 six-bit values packed LSB-first into 6 input words (192 bits).
// The layout repeats every 96 bits (16 values / 3 words), so the same four
// shift tables serve words 0-2 and words 3-5.  A value straddling a 32-bit
// word boundary is reassembled from the top bits of the current word and
// the bottom bits of the next word before masking.
// Returns `in` advanced by the 6 consumed words.
inline static const uint32_t* unpack6_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3f;  // low 6 bits
  uint32_t ind[4];       // scratch: the 4 shifted lanes of each group
  uint32_t shifts_1st[4] = {0, 6, 12, 18};
  uint32_t shifts_2nd[4] = {24, 0, 4, 10};
  uint32_t shifts_3rd[4] = {16, 22, 0, 2};
  uint32_t shifts_4th[4] = {8, 14, 20, 26};

  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = in[0] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[0] >> shifts_2nd[0];
  // value #5 occupies bits 30..35: 2 low bits from in[0], 4 high bits from in[1]
  ind[1] = (in[0] >> 30 | in[1] << 2) >> shifts_2nd[1];
  ind[2] = in[1] >> shifts_2nd[2];
  ind[3] = in[1] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[1] >> shifts_3rd[0];
  ind[1] = in[1] >> shifts_3rd[1];
  // value #10 occupies bits 60..65: 4 low bits from in[1], 2 high bits from in[2]
  ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_3rd[2];
  ind[3] = in[2] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[2] >> shifts_4th[0];
  ind[1] = in[2] >> shifts_4th[1];
  ind[2] = in[2] >> shifts_4th[2];
  ind[3] = in[2] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs (same pattern as the 1st group, words 3-5)
  ind[0] = in[3] >> shifts_1st[0];
  ind[1] = in[3] >> shifts_1st[1];
  ind[2] = in[3] >> shifts_1st[2];
  ind[3] = in[3] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[3] >> shifts_2nd[0];
  // value #21 straddles in[3]/in[4] exactly like value #5 straddles in[0]/in[1]
  ind[1] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[1];
  ind[2] = in[4] >> shifts_2nd[2];
  ind[3] = in[4] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[4] >> shifts_3rd[0];
  ind[1] = in[4] >> shifts_3rd[1];
  // value #26 straddles in[4]/in[5] exactly like value #10 straddles in[1]/in[2]
  ind[2] = (in[4] >> 28 | in[5] << 4) >> shifts_3rd[2];
  ind[3] = in[5] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[5] >> shifts_4th[0];
  ind[1] = in[5] >> shifts_4th[1];
  ind[2] = in[5] >> shifts_4th[2];
  ind[3] = in[5] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 6;

  return in;
}
623
// Unpacks 32 seven-bit values packed LSB-first into 7 input words (224 bits).
// Each group of 4 outputs is formed by right-shifting the source word per
// lane, loading the 4 lanes into a NEON register, and masking to 7 bits.
// A value straddling a 32-bit word boundary is first reassembled from the
// top bits of the current word and the bottom bits of the next word.
// Returns `in` advanced by the 7 consumed words.
inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7f;  // low 7 bits
  uint32_t ind[4];       // scratch: the 4 shifted lanes of each group
  uint32_t shifts_1st[4] = {0, 7, 14, 21};
  uint32_t shifts_2nd[4] = {0, 3, 10, 17};
  uint32_t shifts_3rd[4] = {24, 0, 6, 13};
  uint32_t shifts_4th[4] = {20, 0, 2, 9};
  uint32_t shifts_5th[4] = {16, 23, 0, 5};
  uint32_t shifts_6th[4] = {12, 19, 0, 1};
  uint32_t shifts_7th[4] = {8, 15, 22, 0};
  uint32_t shifts_8th[4] = {4, 11, 18, 25};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  ind[3] = in[0] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  // value #4 occupies bits 28..34: 4 low bits from in[0], 3 high bits from in[1]
  ind[0] = (in[0] >> 28 | in[1] << 4) >> shifts_2nd[0];
  ind[1] = in[1] >> shifts_2nd[1];
  ind[2] = in[1] >> shifts_2nd[2];
  ind[3] = in[1] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[1] >> shifts_3rd[0];
  // value #9 occupies bits 63..69: 1 low bit from in[1], 6 high bits from in[2]
  ind[1] = (in[1] >> 31 | in[2] << 1) >> shifts_3rd[1];
  ind[2] = in[2] >> shifts_3rd[2];
  ind[3] = in[2] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[2] >> shifts_4th[0];
  // value #13 occupies bits 91..97: 5 low bits from in[2], 2 high bits from in[3]
  ind[1] = (in[2] >> 27 | in[3] << 5) >> shifts_4th[1];
  ind[2] = in[3] >> shifts_4th[2];
  ind[3] = in[3] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[3] >> shifts_5th[0];
  ind[1] = in[3] >> shifts_5th[1];
  // value #18 occupies bits 126..132: 2 low bits from in[3], 5 high bits from in[4]
  ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_5th[2];
  ind[3] = in[4] >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[4] >> shifts_6th[0];
  ind[1] = in[4] >> shifts_6th[1];
  // value #22 occupies bits 154..160: 6 low bits from in[4], 1 high bit from in[5]
  ind[2] = (in[4] >> 26 | in[5] << 6) >> shifts_6th[2];
  ind[3] = in[5] >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[5] >> shifts_7th[0];
  ind[1] = in[5] >> shifts_7th[1];
  ind[2] = in[5] >> shifts_7th[2];
  // value #27 occupies bits 189..195: 3 low bits from in[5], 4 high bits from in[6]
  ind[3] = (in[5] >> 29 | in[6] << 3) >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[6] >> shifts_8th[0];
  ind[1] = in[6] >> shifts_8th[1];
  ind[2] = in[6] >> shifts_8th[2];
  ind[3] = in[6] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 7;

  return in;
}
724
// Unpacks 32 eight-bit values packed LSB-first: 4 values (one per byte) per
// input word, 8 words consumed.  Output lane i is byte (i % 4) of word
// in[i / 4].
//
// in:  packed input stream; 8 words consumed.
// out: destination for 32 unpacked uint32_t values.
// Returns `in` advanced past the consumed words.
inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) {
  const uint32x4_t byte_mask = vdupq_n_u32(0xff);
  uint32_t lanes[4];

  // One group of 4 outputs per input word.
  for (int word = 0; word < 8; word++) {
    lanes[0] = in[word];
    lanes[1] = in[word] >> 8;
    lanes[2] = in[word] >> 16;
    lanes[3] = in[word] >> 24;
    vst1q_u32(out, vandq_u32(vld1q_u32(lanes), byte_mask));
    out += 4;
  }

  return in + 8;
}
818
// Unpacks 32 nine-bit values packed LSB-first into 9 input words (288 bits).
// Each group of 4 outputs is formed by right-shifting the source word per
// lane, loading the 4 lanes into a NEON register, and masking to 9 bits.
// A value straddling a 32-bit word boundary is first reassembled from the
// top bits of the current word and the bottom bits of the next word.
// Returns `in` advanced by the 9 consumed words.
inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1ff;  // low 9 bits
  uint32_t ind[4];        // scratch: the 4 shifted lanes of each group
  uint32_t shifts_1st[4] = {0, 9, 18, 0};
  uint32_t shifts_2nd[4] = {4, 13, 22, 0};
  uint32_t shifts_3rd[4] = {8, 17, 0, 3};
  uint32_t shifts_4th[4] = {12, 21, 0, 7};
  uint32_t shifts_5th[4] = {16, 0, 2, 11};
  uint32_t shifts_6th[4] = {20, 0, 6, 15};
  uint32_t shifts_7th[4] = {0, 1, 10, 19};
  uint32_t shifts_8th[4] = {0, 5, 14, 23};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  // value #3 occupies bits 27..35: 5 low bits from in[0], 4 high bits from in[1]
  ind[3] = (in[0] >> 27 | in[1] << 5) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[1] >> shifts_2nd[0];
  ind[1] = in[1] >> shifts_2nd[1];
  ind[2] = in[1] >> shifts_2nd[2];
  // value #7 occupies bits 63..71: 1 low bit from in[1], 8 high bits from in[2]
  ind[3] = (in[1] >> 31 | in[2] << 1) >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[2] >> shifts_3rd[0];
  ind[1] = in[2] >> shifts_3rd[1];
  // value #10 occupies bits 90..98: 6 low bits from in[2], 3 high bits from in[3]
  ind[2] = (in[2] >> 26 | in[3] << 6) >> shifts_3rd[2];
  ind[3] = in[3] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[3] >> shifts_4th[0];
  ind[1] = in[3] >> shifts_4th[1];
  // value #14 occupies bits 126..134: 2 low bits from in[3], 7 high bits from in[4]
  ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_4th[2];
  ind[3] = in[4] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[4] >> shifts_5th[0];
  // value #17 occupies bits 153..161: 7 low bits from in[4], 2 high bits from in[5]
  ind[1] = (in[4] >> 25 | in[5] << 7) >> shifts_5th[1];
  ind[2] = in[5] >> shifts_5th[2];
  ind[3] = in[5] >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[5] >> shifts_6th[0];
  // value #21 occupies bits 189..197: 3 low bits from in[5], 6 high bits from in[6]
  ind[1] = (in[5] >> 29 | in[6] << 3) >> shifts_6th[1];
  ind[2] = in[6] >> shifts_6th[2];
  ind[3] = in[6] >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  // value #24 occupies bits 216..224: 8 low bits from in[6], 1 high bit from in[7]
  ind[0] = (in[6] >> 24 | in[7] << 8) >> shifts_7th[0];
  ind[1] = in[7] >> shifts_7th[1];
  ind[2] = in[7] >> shifts_7th[2];
  ind[3] = in[7] >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  // value #28 occupies bits 252..260: 4 low bits from in[7], 5 high bits from in[8]
  ind[0] = (in[7] >> 28 | in[8] << 4) >> shifts_8th[0];
  ind[1] = in[8] >> shifts_8th[1];
  ind[2] = in[8] >> shifts_8th[2];
  ind[3] = in[8] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 9;

  return in;
}
919
// Unpacks 32 ten-bit values packed LSB-first into 10 input words (320 bits).
// The layout repeats every 160 bits (16 values / 5 words), so the same four
// shift tables serve words 0-4 and words 5-9.  A value straddling a 32-bit
// word boundary is reassembled from the top bits of the current word and
// the bottom bits of the next word before masking.
// Returns `in` advanced by the 10 consumed words.
inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3ff;  // low 10 bits
  uint32_t ind[4];        // scratch: the 4 shifted lanes of each group
  uint32_t shifts_1st[4] = {0, 10, 20, 0};
  uint32_t shifts_2nd[4] = {8, 18, 0, 6};
  uint32_t shifts_3rd[4] = {16, 0, 4, 14};
  uint32_t shifts_4th[4] = {0, 2, 12, 22};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = in[0] >> shifts_1st[2];
  // value #3 occupies bits 30..39: 2 low bits from in[0], 8 high bits from in[1]
  ind[3] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[1] >> shifts_2nd[0];
  ind[1] = in[1] >> shifts_2nd[1];
  // value #6 occupies bits 60..69: 4 low bits from in[1], 6 high bits from in[2]
  ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[2];
  ind[3] = in[2] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[2] >> shifts_3rd[0];
  // value #9 occupies bits 90..99: 6 low bits from in[2], 4 high bits from in[3]
  ind[1] = (in[2] >> 26 | in[3] << 6) >> shifts_3rd[1];
  ind[2] = in[3] >> shifts_3rd[2];
  ind[3] = in[3] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  // value #12 occupies bits 120..129: 8 low bits from in[3], 2 high bits from in[4]
  ind[0] = (in[3] >> 24 | in[4] << 8) >> shifts_4th[0];
  ind[1] = in[4] >> shifts_4th[1];
  ind[2] = in[4] >> shifts_4th[2];
  ind[3] = in[4] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs (same pattern as the 1st group, words 5-9)
  ind[0] = in[5] >> shifts_1st[0];
  ind[1] = in[5] >> shifts_1st[1];
  ind[2] = in[5] >> shifts_1st[2];
  ind[3] = (in[5] >> 30 | in[6] << 2) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[6] >> shifts_2nd[0];
  ind[1] = in[6] >> shifts_2nd[1];
  ind[2] = (in[6] >> 28 | in[7] << 4) >> shifts_2nd[2];
  ind[3] = in[7] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[7] >> shifts_3rd[0];
  ind[1] = (in[7] >> 26 | in[8] << 6) >> shifts_3rd[1];
  ind[2] = in[8] >> shifts_3rd[2];
  ind[3] = in[8] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[8] >> 24 | in[9] << 8) >> shifts_4th[0];
  ind[1] = in[9] >> shifts_4th[1];
  ind[2] = in[9] >> shifts_4th[2];
  ind[3] = in[9] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 10;

  return in;
}
1016
// Unpacks 32 little-endian bit-packed 11-bit values from `in` into
// out[0..31] (32 * 11 bits == exactly 11 input words).  Four values are
// extracted per step into scratch lanes, then masked with one NEON AND.
// Returns `in` advanced past the 11 consumed words.
inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7ff;  // keeps the low 11 bits of each lane
  uint32_t ind[4];        // scratch lanes loaded into a NEON register below
  // Per-lane right-shift: the bit offset of the value inside its source word.
  // A 0 entry marks a lane whose value starts at bit 0 or spans two adjacent
  // words and is stitched together inline (inA >> r | inB << s) before masking.
  uint32_t shifts_1st[4] = {0, 11, 0, 1};
  uint32_t shifts_2nd[4] = {12, 0, 2, 13};
  uint32_t shifts_3rd[4] = {0, 3, 14, 0};
  uint32_t shifts_4th[4] = {4, 15, 0, 5};
  uint32_t shifts_5th[4] = {16, 0, 6, 17};
  uint32_t shifts_6th[4] = {0, 7, 18, 0};
  uint32_t shifts_7th[4] = {8, 19, 0, 9};
  uint32_t shifts_8th[4] = {20, 0, 10, 21};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = (in[0] >> 22 | in[1] << 10) >> shifts_1st[2];  // value spans words 0 and 1
  ind[3] = in[1] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[1] >> shifts_2nd[0];
  ind[1] = (in[1] >> 23 | in[2] << 9) >> shifts_2nd[1];
  ind[2] = in[2] >> shifts_2nd[2];
  ind[3] = in[2] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[2] >> 24 | in[3] << 8) >> shifts_3rd[0];
  ind[1] = in[3] >> shifts_3rd[1];
  ind[2] = in[3] >> shifts_3rd[2];
  ind[3] = (in[3] >> 25 | in[4] << 7) >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[4] >> shifts_4th[0];
  ind[1] = in[4] >> shifts_4th[1];
  ind[2] = (in[4] >> 26 | in[5] << 6) >> shifts_4th[2];
  ind[3] = in[5] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[5] >> shifts_5th[0];
  ind[1] = (in[5] >> 27 | in[6] << 5) >> shifts_5th[1];
  ind[2] = in[6] >> shifts_5th[2];
  ind[3] = in[6] >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[6] >> 28 | in[7] << 4) >> shifts_6th[0];
  ind[1] = in[7] >> shifts_6th[1];
  ind[2] = in[7] >> shifts_6th[2];
  ind[3] = (in[7] >> 29 | in[8] << 3) >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[8] >> shifts_7th[0];
  ind[1] = in[8] >> shifts_7th[1];
  ind[2] = (in[8] >> 30 | in[9] << 2) >> shifts_7th[2];
  ind[3] = in[9] >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[9] >> shifts_8th[0];
  ind[1] = (in[9] >> 31 | in[10] << 1) >> shifts_8th[1];
  ind[2] = in[10] >> shifts_8th[2];
  ind[3] = in[10] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 11;

  return in;
}
1117
// Unpacks 32 little-endian bit-packed 12-bit values from `in` into
// out[0..31] (32 * 12 bits == exactly 12 input words).  The bit layout
// repeats every 3 words / 8 outputs, so only two shift tables are needed and
// each table is reused four times at successive word offsets.  Returns `in`
// advanced past the 12 consumed words.
inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xfff;  // keeps the low 12 bits of each lane
  uint32_t ind[4];        // scratch lanes loaded into a NEON register below
  // Per-lane right-shift: the bit offset of the value inside its source word.
  // A 0 entry marks a lane whose value starts at bit 0 or spans two adjacent
  // words and is stitched together inline before masking.
  uint32_t shifts_1st[4] = {0, 12, 0, 4};
  uint32_t shifts_2nd[4] = {16, 0, 8, 20};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = (in[0] >> 24 | in[1] << 8) >> shifts_1st[2];  // value spans words 0 and 1
  ind[3] = in[1] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[1] >> shifts_2nd[0];
  ind[1] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[1];
  ind[2] = in[2] >> shifts_2nd[2];
  ind[3] = in[2] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[3] >> shifts_1st[0];
  ind[1] = in[3] >> shifts_1st[1];
  ind[2] = (in[3] >> 24 | in[4] << 8) >> shifts_1st[2];
  ind[3] = in[4] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[4] >> shifts_2nd[0];
  ind[1] = (in[4] >> 28 | in[5] << 4) >> shifts_2nd[1];
  ind[2] = in[5] >> shifts_2nd[2];
  ind[3] = in[5] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[6] >> shifts_1st[0];
  ind[1] = in[6] >> shifts_1st[1];
  ind[2] = (in[6] >> 24 | in[7] << 8) >> shifts_1st[2];
  ind[3] = in[7] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[7] >> shifts_2nd[0];
  ind[1] = (in[7] >> 28 | in[8] << 4) >> shifts_2nd[1];
  ind[2] = in[8] >> shifts_2nd[2];
  ind[3] = in[8] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[9] >> shifts_1st[0];
  ind[1] = in[9] >> shifts_1st[1];
  ind[2] = (in[9] >> 24 | in[10] << 8) >> shifts_1st[2];
  ind[3] = in[10] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[10] >> shifts_2nd[0];
  ind[1] = (in[10] >> 28 | in[11] << 4) >> shifts_2nd[1];
  ind[2] = in[11] >> shifts_2nd[2];
  ind[3] = in[11] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 12;

  return in;
}
1212
// Unpacks 32 little-endian bit-packed 13-bit values from `in` into
// out[0..31] (32 * 13 bits == exactly 13 input words).  Four values are
// extracted per step into scratch lanes, then masked with one NEON AND.
// Returns `in` advanced past the 13 consumed words.
inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1fff;  // keeps the low 13 bits of each lane
  uint32_t ind[4];         // scratch lanes loaded into a NEON register below
  // Per-lane right-shift: the bit offset of the value inside its source word.
  // A 0 entry marks a lane whose value starts at bit 0 or spans two adjacent
  // words and is stitched together inline before masking.
  uint32_t shifts_1st[4] = {0, 13, 0, 7};
  uint32_t shifts_2nd[4] = {0, 1, 14, 0};
  uint32_t shifts_3rd[4] = {8, 0, 2, 15};
  uint32_t shifts_4th[4] = {0, 9, 0, 3};
  uint32_t shifts_5th[4] = {16, 0, 10, 0};
  uint32_t shifts_6th[4] = {4, 17, 0, 11};
  uint32_t shifts_7th[4] = {0, 5, 18, 0};
  uint32_t shifts_8th[4] = {12, 0, 6, 19};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = (in[0] >> 26 | in[1] << 6) >> shifts_1st[2];  // value spans words 0 and 1
  ind[3] = in[1] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[1] >> 20 | in[2] << 12) >> shifts_2nd[0];
  ind[1] = in[2] >> shifts_2nd[1];
  ind[2] = in[2] >> shifts_2nd[2];
  ind[3] = (in[2] >> 27 | in[3] << 5) >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[3] >> shifts_3rd[0];
  ind[1] = (in[3] >> 21 | in[4] << 11) >> shifts_3rd[1];
  ind[2] = in[4] >> shifts_3rd[2];
  ind[3] = in[4] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[4] >> 28 | in[5] << 4) >> shifts_4th[0];
  ind[1] = in[5] >> shifts_4th[1];
  ind[2] = (in[5] >> 22 | in[6] << 10) >> shifts_4th[2];
  ind[3] = in[6] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[6] >> shifts_5th[0];
  ind[1] = (in[6] >> 29 | in[7] << 3) >> shifts_5th[1];
  ind[2] = in[7] >> shifts_5th[2];
  ind[3] = (in[7] >> 23 | in[8] << 9) >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[8] >> shifts_6th[0];
  ind[1] = in[8] >> shifts_6th[1];
  ind[2] = (in[8] >> 30 | in[9] << 2) >> shifts_6th[2];
  ind[3] = in[9] >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[9] >> 24 | in[10] << 8) >> shifts_7th[0];
  ind[1] = in[10] >> shifts_7th[1];
  ind[2] = in[10] >> shifts_7th[2];
  ind[3] = (in[10] >> 31 | in[11] << 1) >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[11] >> shifts_8th[0];
  ind[1] = (in[11] >> 25 | in[12] << 7) >> shifts_8th[1];
  ind[2] = in[12] >> shifts_8th[2];
  ind[3] = in[12] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 13;

  return in;
}
1313
// Unpacks 32 little-endian bit-packed 14-bit values from `in` into
// out[0..31] (32 * 14 bits == exactly 14 input words).  The bit layout
// repeats after 7 words / 16 outputs, so the four shift tables are reused for
// the 5th-8th groups at a +7 word offset.  Returns `in` advanced past the 14
// consumed words.
inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3fff;  // keeps the low 14 bits of each lane
  uint32_t ind[4];         // scratch lanes loaded into a NEON register below
  // Per-lane right-shift: the bit offset of the value inside its source word.
  // A 0 entry marks a lane whose value starts at bit 0 or spans two adjacent
  // words and is stitched together inline before masking.
  uint32_t shifts_1st[4] = {0, 14, 0, 10};
  uint32_t shifts_2nd[4] = {0, 6, 0, 2};
  uint32_t shifts_3rd[4] = {16, 0, 12, 0};
  uint32_t shifts_4th[4] = {8, 0, 4, 18};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = (in[0] >> 28 | in[1] << 4) >> shifts_1st[2];  // value spans words 0 and 1
  ind[3] = in[1] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[1] >> 24 | in[2] << 8) >> shifts_2nd[0];
  ind[1] = in[2] >> shifts_2nd[1];
  ind[2] = (in[2] >> 20 | in[3] << 12) >> shifts_2nd[2];
  ind[3] = in[3] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[3] >> shifts_3rd[0];
  ind[1] = (in[3] >> 30 | in[4] << 2) >> shifts_3rd[1];
  ind[2] = in[4] >> shifts_3rd[2];
  ind[3] = (in[4] >> 26 | in[5] << 6) >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[5] >> shifts_4th[0];
  ind[1] = (in[5] >> 22 | in[6] << 10) >> shifts_4th[1];
  ind[2] = in[6] >> shifts_4th[2];
  ind[3] = in[6] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[7] >> shifts_1st[0];
  ind[1] = in[7] >> shifts_1st[1];
  ind[2] = (in[7] >> 28 | in[8] << 4) >> shifts_1st[2];
  ind[3] = in[8] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[8] >> 24 | in[9] << 8) >> shifts_2nd[0];
  ind[1] = in[9] >> shifts_2nd[1];
  ind[2] = (in[9] >> 20 | in[10] << 12) >> shifts_2nd[2];
  ind[3] = in[10] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[10] >> shifts_3rd[0];
  ind[1] = (in[10] >> 30 | in[11] << 2) >> shifts_3rd[1];
  ind[2] = in[11] >> shifts_3rd[2];
  ind[3] = (in[11] >> 26 | in[12] << 6) >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[12] >> shifts_4th[0];
  ind[1] = (in[12] >> 22 | in[13] << 10) >> shifts_4th[1];
  ind[2] = in[13] >> shifts_4th[2];
  ind[3] = in[13] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 14;

  return in;
}
1410
// Unpacks 32 little-endian bit-packed 15-bit values from `in` into
// out[0..31] (32 * 15 bits == exactly 15 input words).  Four values are
// extracted per step into scratch lanes, then masked with one NEON AND.
// Returns `in` advanced past the 15 consumed words.
inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7fff;  // keeps the low 15 bits of each lane
  uint32_t ind[4];         // scratch lanes loaded into a NEON register below
  // Per-lane right-shift: the bit offset of the value inside its source word.
  // A 0 entry marks a lane whose value starts at bit 0 or spans two adjacent
  // words and is stitched together inline before masking.
  uint32_t shifts_1st[4] = {0, 15, 0, 13};
  uint32_t shifts_2nd[4] = {0, 11, 0, 9};
  uint32_t shifts_3rd[4] = {0, 7, 0, 5};
  uint32_t shifts_4th[4] = {0, 3, 0, 1};
  uint32_t shifts_5th[4] = {16, 0, 14, 0};
  uint32_t shifts_6th[4] = {12, 0, 10, 0};
  uint32_t shifts_7th[4] = {8, 0, 6, 0};
  uint32_t shifts_8th[4] = {4, 0, 2, 17};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = in[0] >> shifts_1st[1];
  ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[2];  // value spans words 0 and 1
  ind[3] = in[1] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[0];
  ind[1] = in[2] >> shifts_2nd[1];
  ind[2] = (in[2] >> 26 | in[3] << 6) >> shifts_2nd[2];
  ind[3] = in[3] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[3] >> 24 | in[4] << 8) >> shifts_3rd[0];
  ind[1] = in[4] >> shifts_3rd[1];
  ind[2] = (in[4] >> 22 | in[5] << 10) >> shifts_3rd[2];
  ind[3] = in[5] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[5] >> 20 | in[6] << 12) >> shifts_4th[0];
  ind[1] = in[6] >> shifts_4th[1];
  ind[2] = (in[6] >> 18 | in[7] << 14) >> shifts_4th[2];
  ind[3] = in[7] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[7] >> shifts_5th[0];
  ind[1] = (in[7] >> 31 | in[8] << 1) >> shifts_5th[1];
  ind[2] = in[8] >> shifts_5th[2];
  ind[3] = (in[8] >> 29 | in[9] << 3) >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[9] >> shifts_6th[0];
  ind[1] = (in[9] >> 27 | in[10] << 5) >> shifts_6th[1];
  ind[2] = in[10] >> shifts_6th[2];
  ind[3] = (in[10] >> 25 | in[11] << 7) >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[11] >> shifts_7th[0];
  ind[1] = (in[11] >> 23 | in[12] << 9) >> shifts_7th[1];
  ind[2] = in[12] >> shifts_7th[2];
  ind[3] = (in[12] >> 21 | in[13] << 11) >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[13] >> shifts_8th[0];
  ind[1] = (in[13] >> 19 | in[14] << 13) >> shifts_8th[1];
  ind[2] = in[14] >> shifts_8th[2];
  ind[3] = in[14] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 15;

  return in;
}
1511
// Unpacks 32 little-endian bit-packed 16-bit values from `in` into
// out[0..31] (32 * 16 bits == exactly 16 input words).  Each input word
// carries exactly two halfword values with no word-boundary straddles, so a
// simple loop over word pairs suffices.  Returns `in` advanced past the 16
// consumed words.
inline static const uint32_t* unpack16_32_neon(const uint32_t* in, uint32_t* out) {
  const uint32x4_t low_half = vdupq_n_u32(0xffff);  // keeps the low 16 bits
  uint32_t lanes[4];                                // scratch for one NEON store

  // Each pair of input words yields four outputs: the low then high half of
  // word 0, followed by the low then high half of word 1.
  for (int pair = 0; pair < 8; pair++) {
    const uint32_t w0 = in[2 * pair];
    const uint32_t w1 = in[2 * pair + 1];
    lanes[0] = w0;
    lanes[1] = w0 >> 16;
    lanes[2] = w1;
    lanes[3] = w1 >> 16;
    vst1q_u32(out, vandq_u32(vld1q_u32(lanes), low_half));
    out += 4;
  }

  return in + 16;
}
1605
// Unpacks 32 little-endian bit-packed 17-bit values from `in` into
// out[0..31] (32 * 17 bits == exactly 17 input words).  Four values are
// extracted per step into scratch lanes, then masked with one NEON AND.
// Returns `in` advanced past the 17 consumed words.
inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1ffff;  // keeps the low 17 bits of each lane
  uint32_t ind[4];          // scratch lanes loaded into a NEON register below
  // Per-lane right-shift: the bit offset of the value inside its source word.
  // A 0 entry marks a lane whose value starts at bit 0 or spans two adjacent
  // words and is stitched together inline before masking.  With 17-bit values
  // every other value straddles a word boundary.
  uint32_t shifts_1st[4] = {0, 0, 2, 0};
  uint32_t shifts_2nd[4] = {4, 0, 6, 0};
  uint32_t shifts_3rd[4] = {8, 0, 10, 0};
  uint32_t shifts_4th[4] = {12, 0, 14, 0};
  uint32_t shifts_5th[4] = {0, 1, 0, 3};
  uint32_t shifts_6th[4] = {0, 5, 0, 7};
  uint32_t shifts_7th[4] = {0, 9, 0, 11};
  uint32_t shifts_8th[4] = {0, 13, 0, 15};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 17 | in[1] << 15) >> shifts_1st[1];  // value spans words 0 and 1
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = (in[1] >> 19 | in[2] << 13) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[2] >> shifts_2nd[0];
  ind[1] = (in[2] >> 21 | in[3] << 11) >> shifts_2nd[1];
  ind[2] = in[3] >> shifts_2nd[2];
  ind[3] = (in[3] >> 23 | in[4] << 9) >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[4] >> shifts_3rd[0];
  ind[1] = (in[4] >> 25 | in[5] << 7) >> shifts_3rd[1];
  ind[2] = in[5] >> shifts_3rd[2];
  ind[3] = (in[5] >> 27 | in[6] << 5) >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[6] >> shifts_4th[0];
  ind[1] = (in[6] >> 29 | in[7] << 3) >> shifts_4th[1];
  ind[2] = in[7] >> shifts_4th[2];
  ind[3] = (in[7] >> 31 | in[8] << 1) >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[8] >> 16 | in[9] << 16) >> shifts_5th[0];
  ind[1] = in[9] >> shifts_5th[1];
  ind[2] = (in[9] >> 18 | in[10] << 14) >> shifts_5th[2];
  ind[3] = in[10] >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[10] >> 20 | in[11] << 12) >> shifts_6th[0];
  ind[1] = in[11] >> shifts_6th[1];
  ind[2] = (in[11] >> 22 | in[12] << 10) >> shifts_6th[2];
  ind[3] = in[12] >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[12] >> 24 | in[13] << 8) >> shifts_7th[0];
  ind[1] = in[13] >> shifts_7th[1];
  ind[2] = (in[13] >> 26 | in[14] << 6) >> shifts_7th[2];
  ind[3] = in[14] >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[14] >> 28 | in[15] << 4) >> shifts_8th[0];
  ind[1] = in[15] >> shifts_8th[1];
  ind[2] = (in[15] >> 30 | in[16] << 2) >> shifts_8th[2];
  ind[3] = in[16] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 17;

  return in;
}
1706
// Unpacks 32 little-endian bit-packed 18-bit values from `in` into
// out[0..31] (32 * 18 bits == exactly 18 input words).  The bit layout
// repeats after 9 words / 16 outputs, so the four shift tables are reused for
// the 5th-8th groups at a +9 word offset.  Returns `in` advanced past the 18
// consumed words.
inline static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3ffff;  // keeps the low 18 bits of each lane
  uint32_t ind[4];          // scratch lanes loaded into a NEON register below
  // Per-lane right-shift: the bit offset of the value inside its source word.
  // A 0 entry marks a lane whose value starts at bit 0 or spans two adjacent
  // words and is stitched together inline before masking.
  uint32_t shifts_1st[4] = {0, 0, 4, 0};
  uint32_t shifts_2nd[4] = {8, 0, 12, 0};
  uint32_t shifts_3rd[4] = {0, 2, 0, 6};
  uint32_t shifts_4th[4] = {0, 10, 0, 14};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 18 | in[1] << 14) >> shifts_1st[1];  // value spans words 0 and 1
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = (in[1] >> 22 | in[2] << 10) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[2] >> shifts_2nd[0];
  ind[1] = (in[2] >> 26 | in[3] << 6) >> shifts_2nd[1];
  ind[2] = in[3] >> shifts_2nd[2];
  ind[3] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[4] >> 16 | in[5] << 16) >> shifts_3rd[0];
  ind[1] = in[5] >> shifts_3rd[1];
  ind[2] = (in[5] >> 20 | in[6] << 12) >> shifts_3rd[2];
  ind[3] = in[6] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[6] >> 24 | in[7] << 8) >> shifts_4th[0];
  ind[1] = in[7] >> shifts_4th[1];
  ind[2] = (in[7] >> 28 | in[8] << 4) >> shifts_4th[2];
  ind[3] = in[8] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[9] >> shifts_1st[0];
  ind[1] = (in[9] >> 18 | in[10] << 14) >> shifts_1st[1];
  ind[2] = in[10] >> shifts_1st[2];
  ind[3] = (in[10] >> 22 | in[11] << 10) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[11] >> shifts_2nd[0];
  ind[1] = (in[11] >> 26 | in[12] << 6) >> shifts_2nd[1];
  ind[2] = in[12] >> shifts_2nd[2];
  ind[3] = (in[12] >> 30 | in[13] << 2) >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[13] >> 16 | in[14] << 16) >> shifts_3rd[0];
  ind[1] = in[14] >> shifts_3rd[1];
  ind[2] = (in[14] >> 20 | in[15] << 12) >> shifts_3rd[2];
  ind[3] = in[15] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[15] >> 24 | in[16] << 8) >> shifts_4th[0];
  ind[1] = in[16] >> shifts_4th[1];
  ind[2] = (in[16] >> 28 | in[17] << 4) >> shifts_4th[2];
  ind[3] = in[17] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 18;

  return in;
}
1803
// Unpacks 32 little-endian bit-packed 19-bit values from `in` into
// out[0..31] (32 * 19 bits == exactly 19 input words).  Four values are
// extracted per step into scratch lanes, then masked with one NEON AND.
// Returns `in` advanced past the 19 consumed words.
inline static const uint32_t* unpack19_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7ffff;  // keeps the low 19 bits of each lane
  uint32_t ind[4];          // scratch lanes loaded into a NEON register below
  // Per-lane right-shift: the bit offset of the value inside its source word.
  // A 0 entry marks a lane whose value starts at bit 0 or spans two adjacent
  // words and is stitched together inline before masking.
  uint32_t shifts_1st[4] = {0, 0, 6, 0};
  uint32_t shifts_2nd[4] = {12, 0, 0, 5};
  uint32_t shifts_3rd[4] = {0, 11, 0, 0};
  uint32_t shifts_4th[4] = {4, 0, 10, 0};
  uint32_t shifts_5th[4] = {0, 3, 0, 9};
  uint32_t shifts_6th[4] = {0, 0, 2, 0};
  uint32_t shifts_7th[4] = {8, 0, 0, 1};
  uint32_t shifts_8th[4] = {0, 7, 0, 13};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 19 | in[1] << 13) >> shifts_1st[1];  // value spans words 0 and 1
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = (in[1] >> 25 | in[2] << 7) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[2] >> shifts_2nd[0];
  ind[1] = (in[2] >> 31 | in[3] << 1) >> shifts_2nd[1];
  ind[2] = (in[3] >> 18 | in[4] << 14) >> shifts_2nd[2];
  ind[3] = in[4] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[4] >> 24 | in[5] << 8) >> shifts_3rd[0];
  ind[1] = in[5] >> shifts_3rd[1];
  ind[2] = (in[5] >> 30 | in[6] << 2) >> shifts_3rd[2];
  ind[3] = (in[6] >> 17 | in[7] << 15) >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[7] >> shifts_4th[0];
  ind[1] = (in[7] >> 23 | in[8] << 9) >> shifts_4th[1];
  ind[2] = in[8] >> shifts_4th[2];
  ind[3] = (in[8] >> 29 | in[9] << 3) >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[9] >> 16 | in[10] << 16) >> shifts_5th[0];
  ind[1] = in[10] >> shifts_5th[1];
  ind[2] = (in[10] >> 22 | in[11] << 10) >> shifts_5th[2];
  ind[3] = in[11] >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[11] >> 28 | in[12] << 4) >> shifts_6th[0];
  ind[1] = (in[12] >> 15 | in[13] << 17) >> shifts_6th[1];
  ind[2] = in[13] >> shifts_6th[2];
  ind[3] = (in[13] >> 21 | in[14] << 11) >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[14] >> shifts_7th[0];
  ind[1] = (in[14] >> 27 | in[15] << 5) >> shifts_7th[1];
  ind[2] = (in[15] >> 14 | in[16] << 18) >> shifts_7th[2];
  ind[3] = in[16] >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[16] >> 20 | in[17] << 12) >> shifts_8th[0];
  ind[1] = in[17] >> shifts_8th[1];
  ind[2] = (in[17] >> 26 | in[18] << 6) >> shifts_8th[2];
  ind[3] = in[18] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 19;

  return in;
}
1904
// Unpacks 32 little-endian bit-packed 20-bit values from `in` into
// out[0..31] (32 * 20 bits == exactly 20 input words).  The bit layout
// repeats every 5 words / 8 outputs, so only two shift tables are needed and
// each is reused four times at successive word offsets.  Returns `in`
// advanced past the 20 consumed words.
inline static const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0xfffff;  // keeps the low 20 bits of each lane
  uint32_t ind[4];          // scratch lanes loaded into a NEON register below
  // Per-lane right-shift: the bit offset of the value inside its source word.
  // A 0 entry marks a lane whose value starts at bit 0 or spans two adjacent
  // words and is stitched together inline before masking.
  uint32_t shifts_1st[4] = {0, 0, 8, 0};
  uint32_t shifts_2nd[4] = {0, 4, 0, 12};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 20 | in[1] << 12) >> shifts_1st[1];  // value spans words 0 and 1
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = (in[1] >> 28 | in[2] << 4) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[2] >> 16 | in[3] << 16) >> shifts_2nd[0];
  ind[1] = in[3] >> shifts_2nd[1];
  ind[2] = (in[3] >> 24 | in[4] << 8) >> shifts_2nd[2];
  ind[3] = in[4] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[5] >> shifts_1st[0];
  ind[1] = (in[5] >> 20 | in[6] << 12) >> shifts_1st[1];
  ind[2] = in[6] >> shifts_1st[2];
  ind[3] = (in[6] >> 28 | in[7] << 4) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[7] >> 16 | in[8] << 16) >> shifts_2nd[0];
  ind[1] = in[8] >> shifts_2nd[1];
  ind[2] = (in[8] >> 24 | in[9] << 8) >> shifts_2nd[2];
  ind[3] = in[9] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = in[10] >> shifts_1st[0];
  ind[1] = (in[10] >> 20 | in[11] << 12) >> shifts_1st[1];
  ind[2] = in[11] >> shifts_1st[2];
  ind[3] = (in[11] >> 28 | in[12] << 4) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[12] >> 16 | in[13] << 16) >> shifts_2nd[0];
  ind[1] = in[13] >> shifts_2nd[1];
  ind[2] = (in[13] >> 24 | in[14] << 8) >> shifts_2nd[2];
  ind[3] = in[14] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[15] >> shifts_1st[0];
  ind[1] = (in[15] >> 20 | in[16] << 12) >> shifts_1st[1];
  ind[2] = in[16] >> shifts_1st[2];
  ind[3] = (in[16] >> 28 | in[17] << 4) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[17] >> 16 | in[18] << 16) >> shifts_2nd[0];
  ind[1] = in[18] >> shifts_2nd[1];
  ind[2] = (in[18] >> 24 | in[19] << 8) >> shifts_2nd[2];
  ind[3] = in[19] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 20;

  return in;
}
1999
// Unpack 32 values of 21 bits each from the first 21 words at `in` into
// out[0..31], 4 outputs per NEON store. A value that straddles a word
// boundary is reassembled with a shift/or pair; a value fully contained in
// one word is extracted via the matching non-zero entry of the shifts_*
// tables. Returns `in + 21`, the start of the next packed group.
inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1fffff;  // low 21 bits
  uint32_t ind[4];           // staging buffer for the next 4 lanes
  // Residual right-shifts for each 4-lane group; a non-zero entry selects a
  // value that starts mid-word but does not cross into the next word.
  uint32_t shifts_1st[4] = {0, 0, 10, 0};
  uint32_t shifts_2nd[4] = {0, 9, 0, 0};
  uint32_t shifts_3rd[4] = {8, 0, 0, 7};
  uint32_t shifts_4th[4] = {0, 0, 6, 0};
  uint32_t shifts_5th[4] = {0, 5, 0, 0};
  uint32_t shifts_6th[4] = {4, 0, 0, 3};
  uint32_t shifts_7th[4] = {0, 0, 2, 0};
  uint32_t shifts_8th[4] = {0, 1, 0, 11};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 21 | in[1] << 11) >> shifts_1st[1];
  ind[2] = in[1] >> shifts_1st[2];
  ind[3] = (in[1] >> 31 | in[2] << 1) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[2] >> 20 | in[3] << 12) >> shifts_2nd[0];
  ind[1] = in[3] >> shifts_2nd[1];
  ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[2];
  ind[3] = (in[4] >> 19 | in[5] << 13) >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = in[5] >> shifts_3rd[0];
  ind[1] = (in[5] >> 29 | in[6] << 3) >> shifts_3rd[1];
  ind[2] = (in[6] >> 18 | in[7] << 14) >> shifts_3rd[2];
  ind[3] = in[7] >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[7] >> 28 | in[8] << 4) >> shifts_4th[0];
  ind[1] = (in[8] >> 17 | in[9] << 15) >> shifts_4th[1];
  ind[2] = in[9] >> shifts_4th[2];
  ind[3] = (in[9] >> 27 | in[10] << 5) >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[10] >> 16 | in[11] << 16) >> shifts_5th[0];
  ind[1] = in[11] >> shifts_5th[1];
  ind[2] = (in[11] >> 26 | in[12] << 6) >> shifts_5th[2];
  ind[3] = (in[12] >> 15 | in[13] << 17) >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = in[13] >> shifts_6th[0];
  ind[1] = (in[13] >> 25 | in[14] << 7) >> shifts_6th[1];
  ind[2] = (in[14] >> 14 | in[15] << 18) >> shifts_6th[2];
  ind[3] = in[15] >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[15] >> 24 | in[16] << 8) >> shifts_7th[0];
  ind[1] = (in[16] >> 13 | in[17] << 19) >> shifts_7th[1];
  ind[2] = in[17] >> shifts_7th[2];
  ind[3] = (in[17] >> 23 | in[18] << 9) >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[18] >> 12 | in[19] << 20) >> shifts_8th[0];
  ind[1] = in[19] >> shifts_8th[1];
  ind[2] = (in[19] >> 22 | in[20] << 10) >> shifts_8th[2];
  ind[3] = in[20] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // 32 values * 21 bits = 21 consumed input words
  in += 21;

  return in;
}
2100
// Unpack 32 values of 22 bits each from the first 22 words at `in` into
// out[0..31], 4 outputs per NEON store. The 22-bit layout repeats exactly
// every 11 input words / 16 outputs, so both halves share one loop body.
// Returns `in + 22`, the start of the next packed group.
inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out) {
  const uint32x4_t mask_v = vdupq_n_u32(0x3fffff);  // low 22 bits
  uint32_t lanes[4];                                // staging for 4 lanes

  for (int half = 0; half < 2; half++, in += 11, out += 16) {
    // outputs 0-3 of this half
    lanes[0] = in[0];
    lanes[1] = (in[0] >> 22) | (in[1] << 10);
    lanes[2] = (in[1] >> 12) | (in[2] << 20);
    lanes[3] = in[2] >> 2;
    vst1q_u32(out, vandq_u32(vld1q_u32(lanes), mask_v));

    // outputs 4-7
    lanes[0] = (in[2] >> 24) | (in[3] << 8);
    lanes[1] = (in[3] >> 14) | (in[4] << 18);
    lanes[2] = in[4] >> 4;
    lanes[3] = (in[4] >> 26) | (in[5] << 6);
    vst1q_u32(out + 4, vandq_u32(vld1q_u32(lanes), mask_v));

    // outputs 8-11
    lanes[0] = (in[5] >> 16) | (in[6] << 16);
    lanes[1] = in[6] >> 6;
    lanes[2] = (in[6] >> 28) | (in[7] << 4);
    lanes[3] = (in[7] >> 18) | (in[8] << 14);
    vst1q_u32(out + 8, vandq_u32(vld1q_u32(lanes), mask_v));

    // outputs 12-15
    lanes[0] = in[8] >> 8;
    lanes[1] = (in[8] >> 30) | (in[9] << 2);
    lanes[2] = (in[9] >> 20) | (in[10] << 12);
    lanes[3] = in[10] >> 10;
    vst1q_u32(out + 12, vandq_u32(vld1q_u32(lanes), mask_v));
  }

  // loop advanced `in` by 2 * 11 = 22 consumed words
  return in;
}
2197
// Unpack 32 values of 23 bits each from the first 23 words at `in` into
// out[0..31], 4 outputs per NEON store. A value that straddles a word
// boundary is reassembled with a shift/or pair; a value fully contained in
// one word is extracted via the matching non-zero entry of the shifts_*
// tables. Returns `in + 23`, the start of the next packed group.
inline static const uint32_t* unpack23_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7fffff;  // low 23 bits
  uint32_t ind[4];           // staging buffer for the next 4 lanes
  // Residual right-shifts for each 4-lane group; a non-zero entry selects a
  // value that starts mid-word but does not cross into the next word.
  uint32_t shifts_1st[4] = {0, 0, 0, 5};
  uint32_t shifts_2nd[4] = {0, 0, 0, 1};
  uint32_t shifts_3rd[4] = {0, 0, 6, 0};
  uint32_t shifts_4th[4] = {0, 0, 2, 0};
  uint32_t shifts_5th[4] = {0, 7, 0, 0};
  uint32_t shifts_6th[4] = {0, 3, 0, 0};
  uint32_t shifts_7th[4] = {8, 0, 0, 0};
  uint32_t shifts_8th[4] = {4, 0, 0, 9};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 23 | in[1] << 9) >> shifts_1st[1];
  ind[2] = (in[1] >> 14 | in[2] << 18) >> shifts_1st[2];
  ind[3] = in[2] >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[2] >> 28 | in[3] << 4) >> shifts_2nd[0];
  ind[1] = (in[3] >> 19 | in[4] << 13) >> shifts_2nd[1];
  ind[2] = (in[4] >> 10 | in[5] << 22) >> shifts_2nd[2];
  ind[3] = in[5] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[5] >> 24 | in[6] << 8) >> shifts_3rd[0];
  ind[1] = (in[6] >> 15 | in[7] << 17) >> shifts_3rd[1];
  ind[2] = in[7] >> shifts_3rd[2];
  ind[3] = (in[7] >> 29 | in[8] << 3) >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[8] >> 20 | in[9] << 12) >> shifts_4th[0];
  ind[1] = (in[9] >> 11 | in[10] << 21) >> shifts_4th[1];
  ind[2] = in[10] >> shifts_4th[2];
  ind[3] = (in[10] >> 25 | in[11] << 7) >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[11] >> 16 | in[12] << 16) >> shifts_5th[0];
  ind[1] = in[12] >> shifts_5th[1];
  ind[2] = (in[12] >> 30 | in[13] << 2) >> shifts_5th[2];
  ind[3] = (in[13] >> 21 | in[14] << 11) >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[14] >> 12 | in[15] << 20) >> shifts_6th[0];
  ind[1] = in[15] >> shifts_6th[1];
  ind[2] = (in[15] >> 26 | in[16] << 6) >> shifts_6th[2];
  ind[3] = (in[16] >> 17 | in[17] << 15) >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = in[17] >> shifts_7th[0];
  ind[1] = (in[17] >> 31 | in[18] << 1) >> shifts_7th[1];
  ind[2] = (in[18] >> 22 | in[19] << 10) >> shifts_7th[2];
  ind[3] = (in[19] >> 13 | in[20] << 19) >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = in[20] >> shifts_8th[0];
  ind[1] = (in[20] >> 27 | in[21] << 5) >> shifts_8th[1];
  ind[2] = (in[21] >> 18 | in[22] << 14) >> shifts_8th[2];
  ind[3] = in[22] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // 32 values * 23 bits = 23 consumed input words
  in += 23;

  return in;
}
2298
// Unpack 32 values of 24 bits each from the first 24 words at `in` into
// out[0..31], 4 outputs per NEON store. With a 24-bit width the layout
// repeats every 3 input words / 4 outputs, so all 8 groups share one loop.
// Returns `in + 24`, the start of the next packed group.
inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out) {
  const uint32x4_t mask_v = vdupq_n_u32(0xffffff);  // low 24 bits
  uint32_t lanes[4];                                // staging for 4 lanes

  for (int group = 0; group < 8; group++, in += 3, out += 4) {
    lanes[0] = in[0];
    lanes[1] = (in[0] >> 24) | (in[1] << 8);
    lanes[2] = (in[1] >> 16) | (in[2] << 16);
    lanes[3] = in[2] >> 8;
    vst1q_u32(out, vandq_u32(vld1q_u32(lanes), mask_v));
  }

  // loop advanced `in` by 8 * 3 = 24 consumed words
  return in;
}
2392
// Unpack 32 values of 25 bits each from the first 25 words at `in` into
// out[0..31], 4 outputs per NEON store. A value that straddles a word
// boundary is reassembled with a shift/or pair; a value fully contained in
// one word is extracted via the matching non-zero entry of the shifts_*
// tables. Returns `in + 25`, the start of the next packed group.
inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1ffffff;  // low 25 bits
  uint32_t ind[4];            // staging buffer for the next 4 lanes
  // Residual right-shifts for each 4-lane group; a non-zero entry selects a
  // value that starts mid-word but does not cross into the next word.
  uint32_t shifts_1st[4] = {0, 0, 0, 0};
  uint32_t shifts_2nd[4] = {4, 0, 0, 0};
  uint32_t shifts_3rd[4] = {0, 1, 0, 0};
  uint32_t shifts_4th[4] = {0, 5, 0, 0};
  uint32_t shifts_5th[4] = {0, 0, 2, 0};
  uint32_t shifts_6th[4] = {0, 0, 6, 0};
  uint32_t shifts_7th[4] = {0, 0, 0, 3};
  uint32_t shifts_8th[4] = {0, 0, 0, 7};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 25 | in[1] << 7) >> shifts_1st[1];
  ind[2] = (in[1] >> 18 | in[2] << 14) >> shifts_1st[2];
  ind[3] = (in[2] >> 11 | in[3] << 21) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = in[3] >> shifts_2nd[0];
  ind[1] = (in[3] >> 29 | in[4] << 3) >> shifts_2nd[1];
  ind[2] = (in[4] >> 22 | in[5] << 10) >> shifts_2nd[2];
  ind[3] = (in[5] >> 15 | in[6] << 17) >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[6] >> 8 | in[7] << 24) >> shifts_3rd[0];
  ind[1] = in[7] >> shifts_3rd[1];
  ind[2] = (in[7] >> 26 | in[8] << 6) >> shifts_3rd[2];
  ind[3] = (in[8] >> 19 | in[9] << 13) >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[9] >> 12 | in[10] << 20) >> shifts_4th[0];
  ind[1] = in[10] >> shifts_4th[1];
  ind[2] = (in[10] >> 30 | in[11] << 2) >> shifts_4th[2];
  ind[3] = (in[11] >> 23 | in[12] << 9) >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[12] >> 16 | in[13] << 16) >> shifts_5th[0];
  ind[1] = (in[13] >> 9 | in[14] << 23) >> shifts_5th[1];
  ind[2] = in[14] >> shifts_5th[2];
  ind[3] = (in[14] >> 27 | in[15] << 5) >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[15] >> 20 | in[16] << 12) >> shifts_6th[0];
  ind[1] = (in[16] >> 13 | in[17] << 19) >> shifts_6th[1];
  ind[2] = in[17] >> shifts_6th[2];
  ind[3] = (in[17] >> 31 | in[18] << 1) >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[18] >> 24 | in[19] << 8) >> shifts_7th[0];
  ind[1] = (in[19] >> 17 | in[20] << 15) >> shifts_7th[1];
  ind[2] = (in[20] >> 10 | in[21] << 22) >> shifts_7th[2];
  ind[3] = in[21] >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[21] >> 28 | in[22] << 4) >> shifts_8th[0];
  ind[1] = (in[22] >> 21 | in[23] << 11) >> shifts_8th[1];
  ind[2] = (in[23] >> 14 | in[24] << 18) >> shifts_8th[2];
  ind[3] = in[24] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // 32 values * 25 bits = 25 consumed input words
  in += 25;

  return in;
}
2493
// Unpack 32 values of 26 bits each from the first 26 words at `in` into
// out[0..31], 4 outputs per NEON store. The 26-bit layout repeats exactly
// every 13 input words / 16 outputs, so both halves share one loop body.
// Returns `in + 26`, the start of the next packed group.
inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out) {
  const uint32x4_t mask_v = vdupq_n_u32(0x3ffffff);  // low 26 bits
  uint32_t lanes[4];                                 // staging for 4 lanes

  for (int half = 0; half < 2; half++, in += 13, out += 16) {
    // outputs 0-3 of this half
    lanes[0] = in[0];
    lanes[1] = (in[0] >> 26) | (in[1] << 6);
    lanes[2] = (in[1] >> 20) | (in[2] << 12);
    lanes[3] = (in[2] >> 14) | (in[3] << 18);
    vst1q_u32(out, vandq_u32(vld1q_u32(lanes), mask_v));

    // outputs 4-7
    lanes[0] = (in[3] >> 8) | (in[4] << 24);
    lanes[1] = in[4] >> 2;
    lanes[2] = (in[4] >> 28) | (in[5] << 4);
    lanes[3] = (in[5] >> 22) | (in[6] << 10);
    vst1q_u32(out + 4, vandq_u32(vld1q_u32(lanes), mask_v));

    // outputs 8-11
    lanes[0] = (in[6] >> 16) | (in[7] << 16);
    lanes[1] = (in[7] >> 10) | (in[8] << 22);
    lanes[2] = in[8] >> 4;
    lanes[3] = (in[8] >> 30) | (in[9] << 2);
    vst1q_u32(out + 8, vandq_u32(vld1q_u32(lanes), mask_v));

    // outputs 12-15
    lanes[0] = (in[9] >> 24) | (in[10] << 8);
    lanes[1] = (in[10] >> 18) | (in[11] << 14);
    lanes[2] = (in[11] >> 12) | (in[12] << 20);
    lanes[3] = in[12] >> 6;
    vst1q_u32(out + 12, vandq_u32(vld1q_u32(lanes), mask_v));
  }

  // loop advanced `in` by 2 * 13 = 26 consumed words
  return in;
}
2590
// Unpack 32 values of 27 bits each from the first 27 words at `in` into
// out[0..31], 4 outputs per NEON store. A value that straddles a word
// boundary is reassembled with a shift/or pair; a value fully contained in
// one word is extracted via the matching non-zero entry of the shifts_*
// tables. Returns `in + 27`, the start of the next packed group.
inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7ffffff;  // low 27 bits
  uint32_t ind[4];            // staging buffer for the next 4 lanes
  // Residual right-shifts for each 4-lane group; a non-zero entry selects a
  // value that starts mid-word but does not cross into the next word.
  uint32_t shifts_1st[4] = {0, 0, 0, 0};
  uint32_t shifts_2nd[4] = {0, 0, 2, 0};
  uint32_t shifts_3rd[4] = {0, 0, 0, 0};
  uint32_t shifts_4th[4] = {4, 0, 0, 0};
  uint32_t shifts_5th[4] = {0, 0, 0, 1};
  uint32_t shifts_6th[4] = {0, 0, 0, 0};
  uint32_t shifts_7th[4] = {0, 3, 0, 0};
  uint32_t shifts_8th[4] = {0, 0, 0, 5};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 27 | in[1] << 5) >> shifts_1st[1];
  ind[2] = (in[1] >> 22 | in[2] << 10) >> shifts_1st[2];
  ind[3] = (in[2] >> 17 | in[3] << 15) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[3] >> 12 | in[4] << 20) >> shifts_2nd[0];
  ind[1] = (in[4] >> 7 | in[5] << 25) >> shifts_2nd[1];
  ind[2] = in[5] >> shifts_2nd[2];
  ind[3] = (in[5] >> 29 | in[6] << 3) >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[6] >> 24 | in[7] << 8) >> shifts_3rd[0];
  ind[1] = (in[7] >> 19 | in[8] << 13) >> shifts_3rd[1];
  ind[2] = (in[8] >> 14 | in[9] << 18) >> shifts_3rd[2];
  ind[3] = (in[9] >> 9 | in[10] << 23) >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = in[10] >> shifts_4th[0];
  ind[1] = (in[10] >> 31 | in[11] << 1) >> shifts_4th[1];
  ind[2] = (in[11] >> 26 | in[12] << 6) >> shifts_4th[2];
  ind[3] = (in[12] >> 21 | in[13] << 11) >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[13] >> 16 | in[14] << 16) >> shifts_5th[0];
  ind[1] = (in[14] >> 11 | in[15] << 21) >> shifts_5th[1];
  ind[2] = (in[15] >> 6 | in[16] << 26) >> shifts_5th[2];
  ind[3] = in[16] >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[16] >> 28 | in[17] << 4) >> shifts_6th[0];
  ind[1] = (in[17] >> 23 | in[18] << 9) >> shifts_6th[1];
  ind[2] = (in[18] >> 18 | in[19] << 14) >> shifts_6th[2];
  ind[3] = (in[19] >> 13 | in[20] << 19) >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[20] >> 8 | in[21] << 24) >> shifts_7th[0];
  ind[1] = in[21] >> shifts_7th[1];
  ind[2] = (in[21] >> 30 | in[22] << 2) >> shifts_7th[2];
  ind[3] = (in[22] >> 25 | in[23] << 7) >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[23] >> 20 | in[24] << 12) >> shifts_8th[0];
  ind[1] = (in[24] >> 15 | in[25] << 17) >> shifts_8th[1];
  ind[2] = (in[25] >> 10 | in[26] << 22) >> shifts_8th[2];
  ind[3] = in[26] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // 32 values * 27 bits = 27 consumed input words
  in += 27;

  return in;
}
2691
// Unpack 32 values of 28 bits each from the first 28 words at `in` into
// out[0..31], 4 outputs per NEON store. The 28-bit layout repeats exactly
// every 7 input words / 8 outputs, so the four repetitions share one loop.
// Returns `in + 28`, the start of the next packed group.
inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out) {
  const uint32x4_t mask_v = vdupq_n_u32(0xfffffff);  // low 28 bits
  uint32_t lanes[4];                                 // staging for 4 lanes

  for (int rep = 0; rep < 4; rep++, in += 7, out += 8) {
    // outputs 0-3 of this repetition
    lanes[0] = in[0];
    lanes[1] = (in[0] >> 28) | (in[1] << 4);
    lanes[2] = (in[1] >> 24) | (in[2] << 8);
    lanes[3] = (in[2] >> 20) | (in[3] << 12);
    vst1q_u32(out, vandq_u32(vld1q_u32(lanes), mask_v));

    // outputs 4-7
    lanes[0] = (in[3] >> 16) | (in[4] << 16);
    lanes[1] = (in[4] >> 12) | (in[5] << 20);
    lanes[2] = (in[5] >> 8) | (in[6] << 24);
    lanes[3] = in[6] >> 4;
    vst1q_u32(out + 4, vandq_u32(vld1q_u32(lanes), mask_v));
  }

  // loop advanced `in` by 4 * 7 = 28 consumed words
  return in;
}
2786
// Unpack 32 values of 29 bits each from the first 29 words at `in` into
// out[0..31], 4 outputs per NEON store. A value that straddles a word
// boundary is reassembled with a shift/or pair; a value fully contained in
// one word is extracted via the matching non-zero entry of the shifts_*
// tables. Returns `in + 29`, the start of the next packed group.
inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x1fffffff;  // low 29 bits
  uint32_t ind[4];             // staging buffer for the next 4 lanes
  // Residual right-shifts for each 4-lane group; a non-zero entry selects a
  // value that starts mid-word but does not cross into the next word.
  uint32_t shifts_1st[4] = {0, 0, 0, 0};
  uint32_t shifts_2nd[4] = {0, 0, 0, 0};
  uint32_t shifts_3rd[4] = {0, 0, 2, 0};
  uint32_t shifts_4th[4] = {0, 0, 0, 0};
  uint32_t shifts_5th[4] = {0, 0, 0, 0};
  uint32_t shifts_6th[4] = {0, 1, 0, 0};
  uint32_t shifts_7th[4] = {0, 0, 0, 0};
  uint32_t shifts_8th[4] = {0, 0, 0, 3};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 29 | in[1] << 3) >> shifts_1st[1];
  ind[2] = (in[1] >> 26 | in[2] << 6) >> shifts_1st[2];
  ind[3] = (in[2] >> 23 | in[3] << 9) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[3] >> 20 | in[4] << 12) >> shifts_2nd[0];
  ind[1] = (in[4] >> 17 | in[5] << 15) >> shifts_2nd[1];
  ind[2] = (in[5] >> 14 | in[6] << 18) >> shifts_2nd[2];
  ind[3] = (in[6] >> 11 | in[7] << 21) >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[7] >> 8 | in[8] << 24) >> shifts_3rd[0];
  ind[1] = (in[8] >> 5 | in[9] << 27) >> shifts_3rd[1];
  ind[2] = in[9] >> shifts_3rd[2];
  ind[3] = (in[9] >> 31 | in[10] << 1) >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[10] >> 28 | in[11] << 4) >> shifts_4th[0];
  ind[1] = (in[11] >> 25 | in[12] << 7) >> shifts_4th[1];
  ind[2] = (in[12] >> 22 | in[13] << 10) >> shifts_4th[2];
  ind[3] = (in[13] >> 19 | in[14] << 13) >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[14] >> 16 | in[15] << 16) >> shifts_5th[0];
  ind[1] = (in[15] >> 13 | in[16] << 19) >> shifts_5th[1];
  ind[2] = (in[16] >> 10 | in[17] << 22) >> shifts_5th[2];
  ind[3] = (in[17] >> 7 | in[18] << 25) >> shifts_5th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[18] >> 4 | in[19] << 28) >> shifts_6th[0];
  ind[1] = in[19] >> shifts_6th[1];
  ind[2] = (in[19] >> 30 | in[20] << 2) >> shifts_6th[2];
  ind[3] = (in[20] >> 27 | in[21] << 5) >> shifts_6th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[21] >> 24 | in[22] << 8) >> shifts_7th[0];
  ind[1] = (in[22] >> 21 | in[23] << 11) >> shifts_7th[1];
  ind[2] = (in[23] >> 18 | in[24] << 14) >> shifts_7th[2];
  ind[3] = (in[24] >> 15 | in[25] << 17) >> shifts_7th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[25] >> 12 | in[26] << 20) >> shifts_8th[0];
  ind[1] = (in[26] >> 9 | in[27] << 23) >> shifts_8th[1];
  ind[2] = (in[27] >> 6 | in[28] << 26) >> shifts_8th[2];
  ind[3] = in[28] >> shifts_8th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // 32 values * 29 bits = 29 consumed input words
  in += 29;

  return in;
}
2887
// Unpacks 32 values, stored 30 bits each (LSB-first bit order), from the
// first 30 words of `in` into 32 full uint32_t slots at `out`.
// Returns `in` advanced past the 30 consumed words.
//
// The 30 input words form two identical 15-word halves of 16 values each,
// so the 5th..8th stores below mirror the 1st..4th with every input index
// offset by 15.
inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x3fffffff;  // low 30 bits
  uint32_t ind[4];
  // Residual right-shifts applied after each value has been assembled near
  // bit 0.  Values straddling a word boundary are already aligned by the
  // OR of two shifted words (hence the zeros); only the last value of each
  // 15-word half sits wholly in the top 30 bits of a word and needs the
  // extra shift by 2.
  uint32_t shifts_1st[4] = {0, 0, 0, 0};
  uint32_t shifts_2nd[4] = {0, 0, 0, 0};
  uint32_t shifts_3rd[4] = {0, 0, 0, 0};
  uint32_t shifts_4th[4] = {0, 0, 0, 2};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[1];
  ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_1st[2];
  ind[3] = (in[2] >> 26 | in[3] << 6) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[3] >> 24 | in[4] << 8) >> shifts_2nd[0];
  ind[1] = (in[4] >> 22 | in[5] << 10) >> shifts_2nd[1];
  ind[2] = (in[5] >> 20 | in[6] << 12) >> shifts_2nd[2];
  ind[3] = (in[6] >> 18 | in[7] << 14) >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[7] >> 16 | in[8] << 16) >> shifts_3rd[0];
  ind[1] = (in[8] >> 14 | in[9] << 18) >> shifts_3rd[1];
  ind[2] = (in[9] >> 12 | in[10] << 20) >> shifts_3rd[2];
  ind[3] = (in[10] >> 10 | in[11] << 22) >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[11] >> 8 | in[12] << 24) >> shifts_4th[0];
  ind[1] = (in[12] >> 6 | in[13] << 26) >> shifts_4th[1];
  ind[2] = (in[13] >> 4 | in[14] << 28) >> shifts_4th[2];
  ind[3] = in[14] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs (second half: same pattern, input offset +15)
  ind[0] = in[15] >> shifts_1st[0];
  ind[1] = (in[15] >> 30 | in[16] << 2) >> shifts_1st[1];
  ind[2] = (in[16] >> 28 | in[17] << 4) >> shifts_1st[2];
  ind[3] = (in[17] >> 26 | in[18] << 6) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[18] >> 24 | in[19] << 8) >> shifts_2nd[0];
  ind[1] = (in[19] >> 22 | in[20] << 10) >> shifts_2nd[1];
  ind[2] = (in[20] >> 20 | in[21] << 12) >> shifts_2nd[2];
  ind[3] = (in[21] >> 18 | in[22] << 14) >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[22] >> 16 | in[23] << 16) >> shifts_3rd[0];
  ind[1] = (in[23] >> 14 | in[24] << 18) >> shifts_3rd[1];
  ind[2] = (in[24] >> 12 | in[25] << 20) >> shifts_3rd[2];
  ind[3] = (in[25] >> 10 | in[26] << 22) >> shifts_3rd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[26] >> 8 | in[27] << 24) >> shifts_4th[0];
  ind[1] = (in[27] >> 6 | in[28] << 26) >> shifts_4th[1];
  ind[2] = (in[28] >> 4 | in[29] << 28) >> shifts_4th[2];
  ind[3] = in[29] >> shifts_4th[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 30;

  return in;
}
2984
// Unpacks 32 values, stored 31 bits each (LSB-first bit order), from the
// first 31 words of `in` into 32 full uint32_t slots at `out`.
// Returns `in` advanced past the 31 consumed words.
inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out) {
  uint32_t mask = 0x7fffffff;  // low 31 bits
  uint32_t ind[4];
  // Residual right-shifts applied after each value has been assembled near
  // bit 0.  Every value except the last straddles a word boundary and is
  // already aligned by the OR of two shifted words (hence the zeros); only
  // the 32nd value occupies the top 31 bits of in[30] and needs the extra
  // shift by 1.
  uint32_t shifts_1st[4] = {0, 0, 0, 0};
  uint32_t shifts_2nd[4] = {0, 0, 0, 1};
  uint32x4_t reg_shift, reg_masks;
  uint32x4_t results;

  reg_masks = vdupq_n_u32(mask);

  // shift the first 4 outs
  ind[0] = in[0] >> shifts_1st[0];
  ind[1] = (in[0] >> 31 | in[1] << 1) >> shifts_1st[1];
  ind[2] = (in[1] >> 30 | in[2] << 2) >> shifts_1st[2];
  ind[3] = (in[2] >> 29 | in[3] << 3) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 2nd 4 outs
  ind[0] = (in[3] >> 28 | in[4] << 4) >> shifts_1st[0];
  ind[1] = (in[4] >> 27 | in[5] << 5) >> shifts_1st[1];
  ind[2] = (in[5] >> 26 | in[6] << 6) >> shifts_1st[2];
  ind[3] = (in[6] >> 25 | in[7] << 7) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 3rd 4 outs
  ind[0] = (in[7] >> 24 | in[8] << 8) >> shifts_1st[0];
  ind[1] = (in[8] >> 23 | in[9] << 9) >> shifts_1st[1];
  ind[2] = (in[9] >> 22 | in[10] << 10) >> shifts_1st[2];
  ind[3] = (in[10] >> 21 | in[11] << 11) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 4th 4 outs
  ind[0] = (in[11] >> 20 | in[12] << 12) >> shifts_1st[0];
  ind[1] = (in[12] >> 19 | in[13] << 13) >> shifts_1st[1];
  ind[2] = (in[13] >> 18 | in[14] << 14) >> shifts_1st[2];
  ind[3] = (in[14] >> 17 | in[15] << 15) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 5th 4 outs
  ind[0] = (in[15] >> 16 | in[16] << 16) >> shifts_1st[0];
  ind[1] = (in[16] >> 15 | in[17] << 17) >> shifts_1st[1];
  ind[2] = (in[17] >> 14 | in[18] << 18) >> shifts_1st[2];
  ind[3] = (in[18] >> 13 | in[19] << 19) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 6th 4 outs
  ind[0] = (in[19] >> 12 | in[20] << 20) >> shifts_1st[0];
  ind[1] = (in[20] >> 11 | in[21] << 21) >> shifts_1st[1];
  ind[2] = (in[21] >> 10 | in[22] << 22) >> shifts_1st[2];
  ind[3] = (in[22] >> 9 | in[23] << 23) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 7th 4 outs
  ind[0] = (in[23] >> 8 | in[24] << 24) >> shifts_1st[0];
  ind[1] = (in[24] >> 7 | in[25] << 25) >> shifts_1st[1];
  ind[2] = (in[25] >> 6 | in[26] << 26) >> shifts_1st[2];
  ind[3] = (in[26] >> 5 | in[27] << 27) >> shifts_1st[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  // shift the 8th 4 outs
  ind[0] = (in[27] >> 4 | in[28] << 28) >> shifts_2nd[0];
  ind[1] = (in[28] >> 3 | in[29] << 29) >> shifts_2nd[1];
  ind[2] = (in[29] >> 2 | in[30] << 30) >> shifts_2nd[2];
  ind[3] = in[30] >> shifts_2nd[3];
  reg_shift = vld1q_u32(ind);
  results = vandq_u32(reg_shift, reg_masks);
  vst1q_u32(out, results);
  out += 4;

  in += 31;

  return in;
}
3079
// Copies 32 already-full-width (32-bit) values from `in` to `out` and
// returns `in` advanced past the 32 consumed words.
//
// Declared `inline static` to match every other unpackN_32_neon kernel in
// this file: a bare `inline` definition in C11 provides no external
// definition (C11 6.7.4p7), so a non-inlined call would fail to link.
// The hand-rolled copy loop is replaced by memcpy (<string.h> is already
// included at the top of the file); the regions cannot overlap for a
// valid unpack call, so memcpy is the idiomatic choice.
inline static const uint32_t* unpack32_32_neon(const uint32_t* in, uint32_t* out) {
  memcpy(out, in, 32 * sizeof(uint32_t));
  return in + 32;
}
3088
// Unpacks `batch_size` values (rounded down to a whole multiple of 32) of
// `num_bits` bits each from `in` into 32-bit slots at `out`, dispatching
// to the NEON kernel specialized for that bit width.
// Returns the number of values actually unpacked (the rounded batch size);
// the caller handles any remainder of fewer than 32 values.
int unpack32_neon(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
  // Each kernel consumes exactly one group of 32 values per call.
  batch_size = batch_size / 32 * 32;
  uint32_t* dst = out;
  uint32_t* const dst_end = out + batch_size;

  switch (num_bits) {
    case 0:
      while (dst != dst_end) { in = unpack0_32_neon(in, dst); dst += 32; }
      break;
    case 1:
      while (dst != dst_end) { in = unpack1_32_neon(in, dst); dst += 32; }
      break;
    case 2:
      while (dst != dst_end) { in = unpack2_32_neon(in, dst); dst += 32; }
      break;
    case 3:
      while (dst != dst_end) { in = unpack3_32_neon(in, dst); dst += 32; }
      break;
    case 4:
      while (dst != dst_end) { in = unpack4_32_neon(in, dst); dst += 32; }
      break;
    case 5:
      while (dst != dst_end) { in = unpack5_32_neon(in, dst); dst += 32; }
      break;
    case 6:
      while (dst != dst_end) { in = unpack6_32_neon(in, dst); dst += 32; }
      break;
    case 7:
      while (dst != dst_end) { in = unpack7_32_neon(in, dst); dst += 32; }
      break;
    case 8:
      while (dst != dst_end) { in = unpack8_32_neon(in, dst); dst += 32; }
      break;
    case 9:
      while (dst != dst_end) { in = unpack9_32_neon(in, dst); dst += 32; }
      break;
    case 10:
      while (dst != dst_end) { in = unpack10_32_neon(in, dst); dst += 32; }
      break;
    case 11:
      while (dst != dst_end) { in = unpack11_32_neon(in, dst); dst += 32; }
      break;
    case 12:
      while (dst != dst_end) { in = unpack12_32_neon(in, dst); dst += 32; }
      break;
    case 13:
      while (dst != dst_end) { in = unpack13_32_neon(in, dst); dst += 32; }
      break;
    case 14:
      while (dst != dst_end) { in = unpack14_32_neon(in, dst); dst += 32; }
      break;
    case 15:
      while (dst != dst_end) { in = unpack15_32_neon(in, dst); dst += 32; }
      break;
    case 16:
      while (dst != dst_end) { in = unpack16_32_neon(in, dst); dst += 32; }
      break;
    case 17:
      while (dst != dst_end) { in = unpack17_32_neon(in, dst); dst += 32; }
      break;
    case 18:
      while (dst != dst_end) { in = unpack18_32_neon(in, dst); dst += 32; }
      break;
    case 19:
      while (dst != dst_end) { in = unpack19_32_neon(in, dst); dst += 32; }
      break;
    case 20:
      while (dst != dst_end) { in = unpack20_32_neon(in, dst); dst += 32; }
      break;
    case 21:
      while (dst != dst_end) { in = unpack21_32_neon(in, dst); dst += 32; }
      break;
    case 22:
      while (dst != dst_end) { in = unpack22_32_neon(in, dst); dst += 32; }
      break;
    case 23:
      while (dst != dst_end) { in = unpack23_32_neon(in, dst); dst += 32; }
      break;
    case 24:
      while (dst != dst_end) { in = unpack24_32_neon(in, dst); dst += 32; }
      break;
    case 25:
      while (dst != dst_end) { in = unpack25_32_neon(in, dst); dst += 32; }
      break;
    case 26:
      while (dst != dst_end) { in = unpack26_32_neon(in, dst); dst += 32; }
      break;
    case 27:
      while (dst != dst_end) { in = unpack27_32_neon(in, dst); dst += 32; }
      break;
    case 28:
      while (dst != dst_end) { in = unpack28_32_neon(in, dst); dst += 32; }
      break;
    case 29:
      while (dst != dst_end) { in = unpack29_32_neon(in, dst); dst += 32; }
      break;
    case 30:
      while (dst != dst_end) { in = unpack30_32_neon(in, dst); dst += 32; }
      break;
    case 31:
      while (dst != dst_end) { in = unpack31_32_neon(in, dst); dst += 32; }
      break;
    case 32:
      while (dst != dst_end) { in = unpack32_32_neon(in, dst); dst += 32; }
      break;
    default:
      // num_bits outside [0, 32]: nothing is unpacked (matches original
      // fall-through-the-switch behavior).
      break;
  }

  return batch_size;
}
View as plain text