1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 /**
19 * @brief Functions to generate block size descriptor and decimation tables.
20 */
21
22 #include "astcenc_internal.h"
23
24 /**
25 * @brief Decode the properties of an encoded 2D block mode.
26 *
27 * @param block_mode The encoded block mode.
28 * @param[out] x_weights The number of weights in the X dimension.
29 * @param[out] y_weights The number of weights in the Y dimension.
30 * @param[out] is_dual_plane True if this block mode has two weight planes.
31 * @param[out] quant_mode The quantization level for the weights.
32 * @param[out] weight_bits The storage bit count for the weights.
33 *
34 * @return Returns true if a valid mode, false otherwise.
35 */
decode_block_mode_2d(unsigned int block_mode,unsigned int & x_weights,unsigned int & y_weights,bool & is_dual_plane,unsigned int & quant_mode,unsigned int & weight_bits)36 static bool decode_block_mode_2d(
37 unsigned int block_mode,
38 unsigned int& x_weights,
39 unsigned int& y_weights,
40 bool& is_dual_plane,
41 unsigned int& quant_mode,
42 unsigned int& weight_bits
43 ) {
44 unsigned int base_quant_mode = (block_mode >> 4) & 1;
45 unsigned int H = (block_mode >> 9) & 1;
46 unsigned int D = (block_mode >> 10) & 1;
47 unsigned int A = (block_mode >> 5) & 0x3;
48
49 x_weights = 0;
50 y_weights = 0;
51
52 if ((block_mode & 3) != 0)
53 {
54 base_quant_mode |= (block_mode & 3) << 1;
55 unsigned int B = (block_mode >> 7) & 3;
56 switch ((block_mode >> 2) & 3)
57 {
58 case 0:
59 x_weights = B + 4;
60 y_weights = A + 2;
61 break;
62 case 1:
63 x_weights = B + 8;
64 y_weights = A + 2;
65 break;
66 case 2:
67 x_weights = A + 2;
68 y_weights = B + 8;
69 break;
70 case 3:
71 B &= 1;
72 if (block_mode & 0x100)
73 {
74 x_weights = B + 2;
75 y_weights = A + 2;
76 }
77 else
78 {
79 x_weights = A + 2;
80 y_weights = B + 6;
81 }
82 break;
83 }
84 }
85 else
86 {
87 base_quant_mode |= ((block_mode >> 2) & 3) << 1;
88 if (((block_mode >> 2) & 3) == 0)
89 {
90 return false;
91 }
92
93 unsigned int B = (block_mode >> 9) & 3;
94 switch ((block_mode >> 7) & 3)
95 {
96 case 0:
97 x_weights = 12;
98 y_weights = A + 2;
99 break;
100 case 1:
101 x_weights = A + 2;
102 y_weights = 12;
103 break;
104 case 2:
105 x_weights = A + 6;
106 y_weights = B + 6;
107 D = 0;
108 H = 0;
109 break;
110 case 3:
111 switch ((block_mode >> 5) & 3)
112 {
113 case 0:
114 x_weights = 6;
115 y_weights = 10;
116 break;
117 case 1:
118 x_weights = 10;
119 y_weights = 6;
120 break;
121 case 2:
122 case 3:
123 return false;
124 }
125 break;
126 }
127 }
128
129 unsigned int weight_count = x_weights * y_weights * (D + 1);
130 quant_mode = (base_quant_mode - 2) + 6 * H;
131 is_dual_plane = D != 0;
132
133 weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode));
134 return (weight_count <= BLOCK_MAX_WEIGHTS &&
135 weight_bits >= BLOCK_MIN_WEIGHT_BITS &&
136 weight_bits <= BLOCK_MAX_WEIGHT_BITS);
137 }
138
139 /**
140 * @brief Decode the properties of an encoded 3D block mode.
141 *
142 * @param block_mode The encoded block mode.
143 * @param[out] x_weights The number of weights in the X dimension.
144 * @param[out] y_weights The number of weights in the Y dimension.
145 * @param[out] z_weights The number of weights in the Z dimension.
146 * @param[out] is_dual_plane True if this block mode has two weight planes.
147 * @param[out] quant_mode The quantization level for the weights.
148 * @param[out] weight_bits The storage bit count for the weights.
149 *
150 * @return Returns true if a valid mode, false otherwise.
151 */
decode_block_mode_3d(unsigned int block_mode,unsigned int & x_weights,unsigned int & y_weights,unsigned int & z_weights,bool & is_dual_plane,unsigned int & quant_mode,unsigned int & weight_bits)152 static bool decode_block_mode_3d(
153 unsigned int block_mode,
154 unsigned int& x_weights,
155 unsigned int& y_weights,
156 unsigned int& z_weights,
157 bool& is_dual_plane,
158 unsigned int& quant_mode,
159 unsigned int& weight_bits
160 ) {
161 unsigned int base_quant_mode = (block_mode >> 4) & 1;
162 unsigned int H = (block_mode >> 9) & 1;
163 unsigned int D = (block_mode >> 10) & 1;
164 unsigned int A = (block_mode >> 5) & 0x3;
165
166 x_weights = 0;
167 y_weights = 0;
168 z_weights = 0;
169
170 if ((block_mode & 3) != 0)
171 {
172 base_quant_mode |= (block_mode & 3) << 1;
173 unsigned int B = (block_mode >> 7) & 3;
174 unsigned int C = (block_mode >> 2) & 0x3;
175 x_weights = A + 2;
176 y_weights = B + 2;
177 z_weights = C + 2;
178 }
179 else
180 {
181 base_quant_mode |= ((block_mode >> 2) & 3) << 1;
182 if (((block_mode >> 2) & 3) == 0)
183 {
184 return false;
185 }
186
187 int B = (block_mode >> 9) & 3;
188 if (((block_mode >> 7) & 3) != 3)
189 {
190 D = 0;
191 H = 0;
192 }
193 switch ((block_mode >> 7) & 3)
194 {
195 case 0:
196 x_weights = 6;
197 y_weights = B + 2;
198 z_weights = A + 2;
199 break;
200 case 1:
201 x_weights = A + 2;
202 y_weights = 6;
203 z_weights = B + 2;
204 break;
205 case 2:
206 x_weights = A + 2;
207 y_weights = B + 2;
208 z_weights = 6;
209 break;
210 case 3:
211 x_weights = 2;
212 y_weights = 2;
213 z_weights = 2;
214 switch ((block_mode >> 5) & 3)
215 {
216 case 0:
217 x_weights = 6;
218 break;
219 case 1:
220 y_weights = 6;
221 break;
222 case 2:
223 z_weights = 6;
224 break;
225 case 3:
226 return false;
227 }
228 break;
229 }
230 }
231
232 unsigned int weight_count = x_weights * y_weights * z_weights * (D + 1);
233 quant_mode = (base_quant_mode - 2) + 6 * H;
234 is_dual_plane = D != 0;
235
236 weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode));
237 return (weight_count <= BLOCK_MAX_WEIGHTS &&
238 weight_bits >= BLOCK_MIN_WEIGHT_BITS &&
239 weight_bits <= BLOCK_MAX_WEIGHT_BITS);
240 }
241
242 /**
243 * @brief Create a 2D decimation entry for a block-size and weight-decimation pair.
244 *
245 * @param x_texels The number of texels in the X dimension.
246 * @param y_texels The number of texels in the Y dimension.
247 * @param x_weights The number of weights in the X dimension.
248 * @param y_weights The number of weights in the Y dimension.
249 * @param[out] di The decimation info structure to populate.
250 * @param[out] wb The decimation table init scratch working buffers.
251 */
init_decimation_info_2d(unsigned int x_texels,unsigned int y_texels,unsigned int x_weights,unsigned int y_weights,decimation_info & di,dt_init_working_buffers & wb)252 static void init_decimation_info_2d(
253 unsigned int x_texels,
254 unsigned int y_texels,
255 unsigned int x_weights,
256 unsigned int y_weights,
257 decimation_info& di,
258 dt_init_working_buffers& wb
259 ) {
260 unsigned int texels_per_block = x_texels * y_texels;
261 unsigned int weights_per_block = x_weights * y_weights;
262
263 uint8_t max_texel_count_of_weight = 0;
264
265 promise(weights_per_block > 0);
266 promise(texels_per_block > 0);
267 promise(x_texels > 0);
268 promise(y_texels > 0);
269
270 for (unsigned int i = 0; i < weights_per_block; i++)
271 {
272 wb.texel_count_of_weight[i] = 0;
273 }
274
275 for (unsigned int i = 0; i < texels_per_block; i++)
276 {
277 wb.weight_count_of_texel[i] = 0;
278 }
279
280 for (unsigned int y = 0; y < y_texels; y++)
281 {
282 for (unsigned int x = 0; x < x_texels; x++)
283 {
284 unsigned int texel = y * x_texels + x;
285
286 unsigned int x_weight = (((1024 + x_texels / 2) / (x_texels - 1)) * x * (x_weights - 1) + 32) >> 6;
287 unsigned int y_weight = (((1024 + y_texels / 2) / (y_texels - 1)) * y * (y_weights - 1) + 32) >> 6;
288
289 unsigned int x_weight_frac = x_weight & 0xF;
290 unsigned int y_weight_frac = y_weight & 0xF;
291 unsigned int x_weight_int = x_weight >> 4;
292 unsigned int y_weight_int = y_weight >> 4;
293
294 unsigned int qweight[4];
295 qweight[0] = x_weight_int + y_weight_int * x_weights;
296 qweight[1] = qweight[0] + 1;
297 qweight[2] = qweight[0] + x_weights;
298 qweight[3] = qweight[2] + 1;
299
300 // Truncated-precision bilinear interpolation
301 unsigned int prod = x_weight_frac * y_weight_frac;
302
303 unsigned int weight[4];
304 weight[3] = (prod + 8) >> 4;
305 weight[1] = x_weight_frac - weight[3];
306 weight[2] = y_weight_frac - weight[3];
307 weight[0] = 16 - x_weight_frac - y_weight_frac + weight[3];
308
309 for (unsigned int i = 0; i < 4; i++)
310 {
311 if (weight[i] != 0)
312 {
313 wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]);
314 wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]);
315 wb.weight_count_of_texel[texel]++;
316 wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel);
317 wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]);
318 wb.texel_count_of_weight[qweight[i]]++;
319 max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]);
320 }
321 }
322 }
323 }
324
325 uint8_t max_texel_weight_count = 0;
326 for (unsigned int i = 0; i < texels_per_block; i++)
327 {
328 di.texel_weight_count[i] = wb.weight_count_of_texel[i];
329 max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]);
330
331 for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++)
332 {
333 di.texel_weights_int_4t[j][i] = wb.weights_of_texel[i][j];
334 di.texel_weights_float_4t[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
335 di.texel_weights_4t[j][i] = wb.grid_weights_of_texel[i][j];
336 }
337
338 // Init all 4 entries so we can rely on zeros for vectorization
339 for (unsigned int j = wb.weight_count_of_texel[i]; j < 4; j++)
340 {
341 di.texel_weights_int_4t[j][i] = 0;
342 di.texel_weights_float_4t[j][i] = 0.0f;
343 di.texel_weights_4t[j][i] = 0;
344 }
345 }
346
347 di.max_texel_weight_count = max_texel_weight_count;
348
349 for (unsigned int i = 0; i < weights_per_block; i++)
350 {
351 unsigned int texel_count_wt = wb.texel_count_of_weight[i];
352 di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt);
353
354 for (unsigned int j = 0; j < texel_count_wt; j++)
355 {
356 uint8_t texel = wb.texels_of_weight[i][j];
357
358 // Create transposed versions of these for better vectorization
359 di.weight_texel[j][i] = texel;
360 di.weights_flt[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
361
362 // perform a layer of array unrolling. An aspect of this unrolling is that
363 // one of the texel-weight indexes is an identity-mapped index; we will use this
364 // fact to reorder the indexes so that the first one is the identity index.
365 int swap_idx = -1;
366 for (unsigned int k = 0; k < 4; k++)
367 {
368 uint8_t dttw = di.texel_weights_4t[k][texel];
369 float dttwf = di.texel_weights_float_4t[k][texel];
370 if (dttw == i && dttwf != 0.0f)
371 {
372 swap_idx = k;
373 }
374 di.texel_weights_texel[i][j][k] = dttw;
375 di.texel_weights_float_texel[i][j][k] = dttwf;
376 }
377
378 if (swap_idx != 0)
379 {
380 uint8_t vi = di.texel_weights_texel[i][j][0];
381 float vf = di.texel_weights_float_texel[i][j][0];
382 di.texel_weights_texel[i][j][0] = di.texel_weights_texel[i][j][swap_idx];
383 di.texel_weights_float_texel[i][j][0] = di.texel_weights_float_texel[i][j][swap_idx];
384 di.texel_weights_texel[i][j][swap_idx] = vi;
385 di.texel_weights_float_texel[i][j][swap_idx] = vf;
386 }
387 }
388
389 // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
390 // Match last texel in active lane in SIMD group, for better gathers
391 uint8_t last_texel = di.weight_texel[texel_count_wt - 1][i];
392 for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++)
393 {
394 di.weight_texel[j][i] = last_texel;
395 di.weights_flt[j][i] = 0.0f;
396 }
397 }
398
399 // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
400 unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
401 for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++)
402 {
403 di.texel_weight_count[i] = 0;
404
405 for (unsigned int j = 0; j < 4; j++)
406 {
407 di.texel_weights_float_4t[j][i] = 0;
408 di.texel_weights_4t[j][i] = 0;
409 di.texel_weights_int_4t[j][i] = 0;
410 }
411 }
412
413 // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
414 // Match last texel in active lane in SIMD group, for better gathers
415 unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
416 uint8_t last_texel = di.weight_texel[last_texel_count_wt - 1][weights_per_block - 1];
417
418 unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
419 for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
420 {
421 di.weight_texel_count[i] = 0;
422
423 for (unsigned int j = 0; j < max_texel_count_of_weight; j++)
424 {
425 di.weight_texel[j][i] = last_texel;
426 di.weights_flt[j][i] = 0.0f;
427 }
428 }
429
430 di.texel_count = static_cast<uint8_t>(texels_per_block);
431 di.weight_count = static_cast<uint8_t>(weights_per_block);
432 di.weight_x = static_cast<uint8_t>(x_weights);
433 di.weight_y = static_cast<uint8_t>(y_weights);
434 di.weight_z = 1;
435 }
436
437 /**
438 * @brief Create a 3D decimation entry for a block-size and weight-decimation pair.
439 *
440 * @param x_texels The number of texels in the X dimension.
441 * @param y_texels The number of texels in the Y dimension.
442 * @param z_texels The number of texels in the Z dimension.
443 * @param x_weights The number of weights in the X dimension.
444 * @param y_weights The number of weights in the Y dimension.
445 * @param z_weights The number of weights in the Z dimension.
446 * @param[out] di The decimation info structure to populate.
447 @param[out] wb The decimation table init scratch working buffers.
448 */
init_decimation_info_3d(unsigned int x_texels,unsigned int y_texels,unsigned int z_texels,unsigned int x_weights,unsigned int y_weights,unsigned int z_weights,decimation_info & di,dt_init_working_buffers & wb)449 static void init_decimation_info_3d(
450 unsigned int x_texels,
451 unsigned int y_texels,
452 unsigned int z_texels,
453 unsigned int x_weights,
454 unsigned int y_weights,
455 unsigned int z_weights,
456 decimation_info& di,
457 dt_init_working_buffers& wb
458 ) {
459 unsigned int texels_per_block = x_texels * y_texels * z_texels;
460 unsigned int weights_per_block = x_weights * y_weights * z_weights;
461
462 uint8_t max_texel_count_of_weight = 0;
463
464 promise(weights_per_block > 0);
465 promise(texels_per_block > 0);
466
467 for (unsigned int i = 0; i < weights_per_block; i++)
468 {
469 wb.texel_count_of_weight[i] = 0;
470 }
471
472 for (unsigned int i = 0; i < texels_per_block; i++)
473 {
474 wb.weight_count_of_texel[i] = 0;
475 }
476
477 for (unsigned int z = 0; z < z_texels; z++)
478 {
479 for (unsigned int y = 0; y < y_texels; y++)
480 {
481 for (unsigned int x = 0; x < x_texels; x++)
482 {
483 int texel = (z * y_texels + y) * x_texels + x;
484
485 int x_weight = (((1024 + x_texels / 2) / (x_texels - 1)) * x * (x_weights - 1) + 32) >> 6;
486 int y_weight = (((1024 + y_texels / 2) / (y_texels - 1)) * y * (y_weights - 1) + 32) >> 6;
487 int z_weight = (((1024 + z_texels / 2) / (z_texels - 1)) * z * (z_weights - 1) + 32) >> 6;
488
489 int x_weight_frac = x_weight & 0xF;
490 int y_weight_frac = y_weight & 0xF;
491 int z_weight_frac = z_weight & 0xF;
492 int x_weight_int = x_weight >> 4;
493 int y_weight_int = y_weight >> 4;
494 int z_weight_int = z_weight >> 4;
495 int qweight[4];
496 int weight[4];
497 qweight[0] = (z_weight_int * y_weights + y_weight_int) * x_weights + x_weight_int;
498 qweight[3] = ((z_weight_int + 1) * y_weights + (y_weight_int + 1)) * x_weights + (x_weight_int + 1);
499
500 // simplex interpolation
501 int fs = x_weight_frac;
502 int ft = y_weight_frac;
503 int fp = z_weight_frac;
504
505 int cas = ((fs > ft) << 2) + ((ft > fp) << 1) + ((fs > fp));
506 int N = x_weights;
507 int NM = x_weights * y_weights;
508
509 int s1, s2, w0, w1, w2, w3;
510 switch (cas)
511 {
512 case 7:
513 s1 = 1;
514 s2 = N;
515 w0 = 16 - fs;
516 w1 = fs - ft;
517 w2 = ft - fp;
518 w3 = fp;
519 break;
520 case 3:
521 s1 = N;
522 s2 = 1;
523 w0 = 16 - ft;
524 w1 = ft - fs;
525 w2 = fs - fp;
526 w3 = fp;
527 break;
528 case 5:
529 s1 = 1;
530 s2 = NM;
531 w0 = 16 - fs;
532 w1 = fs - fp;
533 w2 = fp - ft;
534 w3 = ft;
535 break;
536 case 4:
537 s1 = NM;
538 s2 = 1;
539 w0 = 16 - fp;
540 w1 = fp - fs;
541 w2 = fs - ft;
542 w3 = ft;
543 break;
544 case 2:
545 s1 = N;
546 s2 = NM;
547 w0 = 16 - ft;
548 w1 = ft - fp;
549 w2 = fp - fs;
550 w3 = fs;
551 break;
552 case 0:
553 s1 = NM;
554 s2 = N;
555 w0 = 16 - fp;
556 w1 = fp - ft;
557 w2 = ft - fs;
558 w3 = fs;
559 break;
560 default:
561 s1 = NM;
562 s2 = N;
563 w0 = 16 - fp;
564 w1 = fp - ft;
565 w2 = ft - fs;
566 w3 = fs;
567 break;
568 }
569
570 qweight[1] = qweight[0] + s1;
571 qweight[2] = qweight[1] + s2;
572 weight[0] = w0;
573 weight[1] = w1;
574 weight[2] = w2;
575 weight[3] = w3;
576
577 for (unsigned int i = 0; i < 4; i++)
578 {
579 if (weight[i] != 0)
580 {
581 wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]);
582 wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]);
583 wb.weight_count_of_texel[texel]++;
584 wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel);
585 wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]);
586 wb.texel_count_of_weight[qweight[i]]++;
587 max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]);
588 }
589 }
590 }
591 }
592 }
593
594 uint8_t max_texel_weight_count = 0;
595 for (unsigned int i = 0; i < texels_per_block; i++)
596 {
597 di.texel_weight_count[i] = wb.weight_count_of_texel[i];
598 max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]);
599
600 // Init all 4 entries so we can rely on zeros for vectorization
601 for (unsigned int j = 0; j < 4; j++)
602 {
603 di.texel_weights_int_4t[j][i] = 0;
604 di.texel_weights_float_4t[j][i] = 0.0f;
605 di.texel_weights_4t[j][i] = 0;
606 }
607
608 for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++)
609 {
610 di.texel_weights_int_4t[j][i] = wb.weights_of_texel[i][j];
611 di.texel_weights_float_4t[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM);
612 di.texel_weights_4t[j][i] = wb.grid_weights_of_texel[i][j];
613 }
614 }
615
616 di.max_texel_weight_count = max_texel_weight_count;
617
618 for (unsigned int i = 0; i < weights_per_block; i++)
619 {
620 unsigned int texel_count_wt = wb.texel_count_of_weight[i];
621 di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt);
622
623 for (unsigned int j = 0; j < texel_count_wt; j++)
624 {
625 unsigned int texel = wb.texels_of_weight[i][j];
626
627 // Create transposed versions of these for better vectorization
628 di.weight_texel[j][i] = static_cast<uint8_t>(texel);
629 di.weights_flt[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
630
631 // perform a layer of array unrolling. An aspect of this unrolling is that
632 // one of the texel-weight indexes is an identity-mapped index; we will use this
633 // fact to reorder the indexes so that the first one is the identity index.
634 int swap_idx = -1;
635 for (unsigned int k = 0; k < 4; k++)
636 {
637 uint8_t dttw = di.texel_weights_4t[k][texel];
638 float dttwf = di.texel_weights_float_4t[k][texel];
639 if (dttw == i && dttwf != 0.0f)
640 {
641 swap_idx = k;
642 }
643 di.texel_weights_texel[i][j][k] = dttw;
644 di.texel_weights_float_texel[i][j][k] = dttwf;
645 }
646
647 if (swap_idx != 0)
648 {
649 uint8_t vi = di.texel_weights_texel[i][j][0];
650 float vf = di.texel_weights_float_texel[i][j][0];
651 di.texel_weights_texel[i][j][0] = di.texel_weights_texel[i][j][swap_idx];
652 di.texel_weights_float_texel[i][j][0] = di.texel_weights_float_texel[i][j][swap_idx];
653 di.texel_weights_texel[i][j][swap_idx] = vi;
654 di.texel_weights_float_texel[i][j][swap_idx] = vf;
655 }
656 }
657
658 // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
659 // Match last texel in active lane in SIMD group, for better gathers
660 uint8_t last_texel = di.weight_texel[texel_count_wt - 1][i];
661 for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++)
662 {
663 di.weight_texel[j][i] = last_texel;
664 di.weights_flt[j][i] = 0.0f;
665 }
666 }
667
668 // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
669 unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block);
670 for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++)
671 {
672 di.texel_weight_count[i] = 0;
673
674 for (unsigned int j = 0; j < 4; j++)
675 {
676 di.texel_weights_float_4t[j][i] = 0;
677 di.texel_weights_4t[j][i] = 0;
678 di.texel_weights_int_4t[j][i] = 0;
679 }
680 }
681
682 // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails
683 // Match last texel in active lane in SIMD group, for better gathers
684 int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1];
685 uint8_t last_texel = di.weight_texel[last_texel_count_wt - 1][weights_per_block - 1];
686
687 unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block);
688 for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++)
689 {
690 di.weight_texel_count[i] = 0;
691
692 for (int j = 0; j < max_texel_count_of_weight; j++)
693 {
694 di.weight_texel[j][i] = last_texel;
695 di.weights_flt[j][i] = 0.0f;
696 }
697 }
698
699 di.texel_count = static_cast<uint8_t>(texels_per_block);
700 di.weight_count = static_cast<uint8_t>(weights_per_block);
701 di.weight_x = static_cast<uint8_t>(x_weights);
702 di.weight_y = static_cast<uint8_t>(y_weights);
703 di.weight_z = static_cast<uint8_t>(z_weights);
704 }
705
706 /**
707 * @brief Assign the texels to use for kmeans clustering.
708 *
709 * The max limit is @c BLOCK_MAX_KMEANS_TEXELS; above this a random selection is used.
710 * The @c bsd.texel_count is an input and must be populated beforehand.
711 *
712 * @param[in,out] bsd The block size descriptor to populate.
713 */
assign_kmeans_texels(block_size_descriptor & bsd)714 static void assign_kmeans_texels(
715 block_size_descriptor& bsd
716 ) {
717 // Use all texels for kmeans on a small block
718 if (bsd.texel_count <= BLOCK_MAX_KMEANS_TEXELS)
719 {
720 for (uint8_t i = 0; i < bsd.texel_count; i++)
721 {
722 bsd.kmeans_texels[i] = i;
723 }
724
725 return;
726 }
727
728 // Select a random subset of BLOCK_MAX_KMEANS_TEXELS for kmeans on a large block
729 uint64_t rng_state[2];
730 astc::rand_init(rng_state);
731
732 // Initialize array used for tracking used indices
733 bool seen[BLOCK_MAX_TEXELS];
734 for (uint8_t i = 0; i < bsd.texel_count; i++)
735 {
736 seen[i] = false;
737 }
738
739 // Assign 64 random indices, retrying if we see repeats
740 unsigned int arr_elements_set = 0;
741 while (arr_elements_set < BLOCK_MAX_KMEANS_TEXELS)
742 {
743 uint8_t texel = static_cast<uint8_t>(astc::rand(rng_state));
744 texel = texel % bsd.texel_count;
745 if (!seen[texel])
746 {
747 bsd.kmeans_texels[arr_elements_set++] = texel;
748 seen[texel] = true;
749 }
750 }
751 }
752
753 /**
754 * @brief Allocate a single 2D decimation table entry.
755 *
756 * @param x_texels The number of texels in the X dimension.
757 * @param y_texels The number of texels in the Y dimension.
758 * @param x_weights The number of weights in the X dimension.
759 * @param y_weights The number of weights in the Y dimension.
760 * @param bsd The block size descriptor we are populating.
761 * @param wb The decimation table init scratch working buffers.
762 * @param index The packed array index to populate.
763 */
construct_dt_entry_2d(unsigned int x_texels,unsigned int y_texels,unsigned int x_weights,unsigned int y_weights,block_size_descriptor & bsd,dt_init_working_buffers & wb,unsigned int index)764 static void construct_dt_entry_2d(
765 unsigned int x_texels,
766 unsigned int y_texels,
767 unsigned int x_weights,
768 unsigned int y_weights,
769 block_size_descriptor& bsd,
770 dt_init_working_buffers& wb,
771 unsigned int index
772 ) {
773 unsigned int weight_count = x_weights * y_weights;
774 assert(weight_count <= BLOCK_MAX_WEIGHTS);
775
776 bool try_2planes = (2 * weight_count) <= BLOCK_MAX_WEIGHTS;
777
778 decimation_info& di = bsd.decimation_tables[index];
779 init_decimation_info_2d(x_texels, y_texels, x_weights, y_weights, di, wb);
780
781 int maxprec_1plane = -1;
782 int maxprec_2planes = -1;
783 for (int i = 0; i < 12; i++)
784 {
785 unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i));
786 if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS)
787 {
788 maxprec_1plane = i;
789 }
790
791 if (try_2planes)
792 {
793 unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast<quant_method>(i));
794 if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS)
795 {
796 maxprec_2planes = i;
797 }
798 }
799 }
800
801 // At least one of the two should be valid ...
802 assert(maxprec_1plane >= 0 || maxprec_2planes >= 0);
803 bsd.decimation_modes[index].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
804 bsd.decimation_modes[index].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
805 bsd.decimation_modes[index].refprec_1_plane = 0;
806 bsd.decimation_modes[index].refprec_2_planes = 0;
807 }
808
809 /**
810 * @brief Allocate block modes and decimation tables for a single 2D block size.
811 *
812 * @param x_texels The number of texels in the X dimension.
813 * @param y_texels The number of texels in the Y dimension.
814 * @param can_omit_modes Can we discard modes that astcenc won't use, even if legal?
815 * @param mode_cutoff Percentile cutoff in range [0,1]. Low values more likely to be used.
816 * @param[out] bsd The block size descriptor to populate.
817 */
construct_block_size_descriptor_2d(unsigned int x_texels,unsigned int y_texels,bool can_omit_modes,float mode_cutoff,block_size_descriptor & bsd)818 static void construct_block_size_descriptor_2d(
819 unsigned int x_texels,
820 unsigned int y_texels,
821 bool can_omit_modes,
822 float mode_cutoff,
823 block_size_descriptor& bsd
824 ) {
825 // Store a remap table for storing packed decimation modes.
826 // Indexing uses [Y * 16 + X] and max size for each axis is 12.
827 static const unsigned int MAX_DMI = 12 * 16 + 12;
828 int decimation_mode_index[MAX_DMI];
829
830 dt_init_working_buffers* wb = new dt_init_working_buffers;
831
832 bsd.xdim = static_cast<uint8_t>(x_texels);
833 bsd.ydim = static_cast<uint8_t>(y_texels);
834 bsd.zdim = 1;
835 bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels);
836
837 for (unsigned int i = 0; i < MAX_DMI; i++)
838 {
839 decimation_mode_index[i] = -1;
840 }
841
842 // Gather all the decimation grids that can be used with the current block
843 #if !defined(ASTCENC_DECOMPRESS_ONLY)
844 const float *percentiles = get_2d_percentile_table(x_texels, y_texels);
845 float always_cutoff = 0.0f;
846 #else
847 // Unused in decompress-only builds
848 (void)can_omit_modes;
849 (void)mode_cutoff;
850 #endif
851
852 // Construct the list of block formats referencing the decimation tables
853 unsigned int packed_bm_idx = 0;
854 unsigned int packed_dm_idx = 0;
855
856 // Trackers
857 unsigned int bm_counts[4] { 0 };
858 unsigned int dm_counts[4] { 0 };
859
860 // Clear the list to a known-bad value
861 for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
862 {
863 bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE;
864 }
865
866 // Iterate four times to build a usefully ordered list:
867 // - Pass 0 - keep selected single plane "always" block modes
868 // - Pass 1 - keep selected single plane "non-always" block modes
869 // - Pass 2 - keep select dual plane block modes
870 // - Pass 3 - keep everything else that's legal
871 unsigned int limit = can_omit_modes ? 3 : 4;
872 for (unsigned int j = 0; j < limit; j ++)
873 {
874 for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
875 {
876 // Skip modes we've already included in a previous pass
877 if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE)
878 {
879 continue;
880 }
881
882 // Decode parameters
883 unsigned int x_weights;
884 unsigned int y_weights;
885 bool is_dual_plane;
886 unsigned int quant_mode;
887 unsigned int weight_bits;
888 bool valid = decode_block_mode_2d(i, x_weights, y_weights, is_dual_plane, quant_mode, weight_bits);
889
890 // Always skip invalid encodings for the current block size
891 if (!valid || (x_weights > x_texels) || (y_weights > y_texels))
892 {
893 continue;
894 }
895
896 // Selectively skip dual plane encodings
897 if (((j <= 1) && is_dual_plane) || (j == 2 && !is_dual_plane))
898 {
899 continue;
900 }
901
902 // Always skip encodings we can't physically encode based on
903 // generic encoding bit availability
904 if (is_dual_plane)
905 {
906 // This is the only check we need as only support 1 partition
907 if ((109 - weight_bits) <= 0)
908 {
909 continue;
910 }
911 }
912 else
913 {
914 // This is conservative - fewer bits may be available for > 1 partition
915 if ((111 - weight_bits) <= 0)
916 {
917 continue;
918 }
919 }
920
921 // Selectively skip encodings based on percentile
922 bool percentile_hit = false;
923 #if !defined(ASTCENC_DECOMPRESS_ONLY)
924 if (j == 0)
925 {
926 percentile_hit = percentiles[i] <= always_cutoff;
927 }
928 else
929 {
930 percentile_hit = percentiles[i] <= mode_cutoff;
931 }
932 #endif
933
934 if (j != 3 && !percentile_hit)
935 {
936 continue;
937 }
938
939 // Allocate and initialize the decimation table entry if we've not used it yet
940 int decimation_mode = decimation_mode_index[y_weights * 16 + x_weights];
941 if (decimation_mode < 0)
942 {
943 construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd, *wb, packed_dm_idx);
944 decimation_mode_index[y_weights * 16 + x_weights] = packed_dm_idx;
945 decimation_mode = packed_dm_idx;
946
947 dm_counts[j]++;
948 packed_dm_idx++;
949 }
950
951 auto& bm = bsd.block_modes[packed_bm_idx];
952
953 bm.decimation_mode = static_cast<uint8_t>(decimation_mode);
954 bm.quant_mode = static_cast<uint8_t>(quant_mode);
955 bm.is_dual_plane = static_cast<uint8_t>(is_dual_plane);
956 bm.weight_bits = static_cast<uint8_t>(weight_bits);
957 bm.mode_index = static_cast<uint16_t>(i);
958
959 auto& dm = bsd.decimation_modes[decimation_mode];
960
961 if (is_dual_plane)
962 {
963 dm.set_ref_2_plane(bm.get_weight_quant_mode());
964 }
965 else
966 {
967 dm.set_ref_1_plane(bm.get_weight_quant_mode());
968 }
969
970 bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_bm_idx);
971
972 packed_bm_idx++;
973 bm_counts[j]++;
974 }
975 }
976
977 bsd.block_mode_count_1plane_always = bm_counts[0];
978 bsd.block_mode_count_1plane_selected = bm_counts[0] + bm_counts[1];
979 bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1] + bm_counts[2];
980 bsd.block_mode_count_all = bm_counts[0] + bm_counts[1] + bm_counts[2] + bm_counts[3];
981
982 bsd.decimation_mode_count_always = dm_counts[0];
983 bsd.decimation_mode_count_selected = dm_counts[0] + dm_counts[1] + dm_counts[2];
984 bsd.decimation_mode_count_all = dm_counts[0] + dm_counts[1] + dm_counts[2] + dm_counts[3];
985
986 #if !defined(ASTCENC_DECOMPRESS_ONLY)
987 assert(bsd.block_mode_count_1plane_always > 0);
988 assert(bsd.decimation_mode_count_always > 0);
989
990 delete[] percentiles;
991 #endif
992
993 // Ensure the end of the array contains valid data (should never get read)
994 for (unsigned int i = bsd.decimation_mode_count_all; i < WEIGHTS_MAX_DECIMATION_MODES; i++)
995 {
996 bsd.decimation_modes[i].maxprec_1plane = -1;
997 bsd.decimation_modes[i].maxprec_2planes = -1;
998 bsd.decimation_modes[i].refprec_1_plane = 0;
999 bsd.decimation_modes[i].refprec_2_planes = 0;
1000 }
1001
1002 // Determine the texels to use for kmeans clustering.
1003 assign_kmeans_texels(bsd);
1004
1005 delete wb;
1006 }
1007
1008 /**
 * @brief Allocate block modes and decimation tables for a single 3D block size.
1010 *
1011 * TODO: This function doesn't include all of the heuristics that we use for 2D block sizes such as
1012 * the percentile mode cutoffs. If 3D becomes more widely used we should look at this.
1013 *
1014 * @param x_texels The number of texels in the X dimension.
1015 * @param y_texels The number of texels in the Y dimension.
1016 * @param z_texels The number of texels in the Z dimension.
1017 * @param[out] bsd The block size descriptor to populate.
1018 */
construct_block_size_descriptor_3d(unsigned int x_texels,unsigned int y_texels,unsigned int z_texels,block_size_descriptor & bsd)1019 static void construct_block_size_descriptor_3d(
1020 unsigned int x_texels,
1021 unsigned int y_texels,
1022 unsigned int z_texels,
1023 block_size_descriptor& bsd
1024 ) {
1025 // Store a remap table for storing packed decimation modes.
1026 // Indexing uses [Z * 64 + Y * 8 + X] and max size for each axis is 6.
1027 static constexpr unsigned int MAX_DMI = 6 * 64 + 6 * 8 + 6;
1028 int decimation_mode_index[MAX_DMI];
1029 unsigned int decimation_mode_count = 0;
1030
1031 dt_init_working_buffers* wb = new dt_init_working_buffers;
1032
1033 bsd.xdim = static_cast<uint8_t>(x_texels);
1034 bsd.ydim = static_cast<uint8_t>(y_texels);
1035 bsd.zdim = static_cast<uint8_t>(z_texels);
1036 bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels * z_texels);
1037
1038 for (unsigned int i = 0; i < MAX_DMI; i++)
1039 {
1040 decimation_mode_index[i] = -1;
1041 }
1042
1043 // gather all the infill-modes that can be used with the current block size
1044 for (unsigned int x_weights = 2; x_weights <= x_texels; x_weights++)
1045 {
1046 for (unsigned int y_weights = 2; y_weights <= y_texels; y_weights++)
1047 {
1048 for (unsigned int z_weights = 2; z_weights <= z_texels; z_weights++)
1049 {
1050 unsigned int weight_count = x_weights * y_weights * z_weights;
1051 if (weight_count > BLOCK_MAX_WEIGHTS)
1052 {
1053 continue;
1054 }
1055
1056 decimation_info& di = bsd.decimation_tables[decimation_mode_count];
1057 decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights] = decimation_mode_count;
1058 init_decimation_info_3d(x_texels, y_texels, z_texels, x_weights, y_weights, z_weights, di, *wb);
1059
1060 int maxprec_1plane = -1;
1061 int maxprec_2planes = -1;
1062 for (unsigned int i = 0; i < 12; i++)
1063 {
1064 unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i));
1065 if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS)
1066 {
1067 maxprec_1plane = i;
1068 }
1069
1070 unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast<quant_method>(i));
1071 if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS)
1072 {
1073 maxprec_2planes = i;
1074 }
1075 }
1076
1077 if ((2 * weight_count) > BLOCK_MAX_WEIGHTS)
1078 {
1079 maxprec_2planes = -1;
1080 }
1081
1082 bsd.decimation_modes[decimation_mode_count].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
1083 bsd.decimation_modes[decimation_mode_count].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
1084 bsd.decimation_modes[decimation_mode_count].refprec_1_plane = maxprec_1plane == -1 ? 0 : 0xFFFF;
1085 bsd.decimation_modes[decimation_mode_count].refprec_2_planes = maxprec_2planes == -1 ? 0 : 0xFFFF;
1086 decimation_mode_count++;
1087 }
1088 }
1089 }
1090
1091 // Ensure the end of the array contains valid data (should never get read)
1092 for (unsigned int i = decimation_mode_count; i < WEIGHTS_MAX_DECIMATION_MODES; i++)
1093 {
1094 bsd.decimation_modes[i].maxprec_1plane = -1;
1095 bsd.decimation_modes[i].maxprec_2planes = -1;
1096 bsd.decimation_modes[i].refprec_1_plane = 0;
1097 bsd.decimation_modes[i].refprec_2_planes = 0;
1098 }
1099
1100 bsd.decimation_mode_count_always = 0; // Skipped for 3D modes
1101 bsd.decimation_mode_count_selected = decimation_mode_count;
1102 bsd.decimation_mode_count_all = decimation_mode_count;
1103
1104 // Construct the list of block formats referencing the decimation tables
1105
1106 // Clear the list to a known-bad value
1107 for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
1108 {
1109 bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE;
1110 }
1111
1112 unsigned int packed_idx = 0;
1113 unsigned int bm_counts[2] { 0 };
1114
1115 // Iterate two times to build a usefully ordered list:
1116 // - Pass 0 - keep valid single plane block modes
1117 // - Pass 1 - keep valid dual plane block modes
1118 for (unsigned int j = 0; j < 2; j++)
1119 {
1120 for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++)
1121 {
1122 // Skip modes we've already included in a previous pass
1123 if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE)
1124 {
1125 continue;
1126 }
1127
1128 unsigned int x_weights;
1129 unsigned int y_weights;
1130 unsigned int z_weights;
1131 bool is_dual_plane;
1132 unsigned int quant_mode;
1133 unsigned int weight_bits;
1134
1135 bool valid = decode_block_mode_3d(i, x_weights, y_weights, z_weights, is_dual_plane, quant_mode, weight_bits);
1136 // Skip invalid encodings
1137 if (!valid || x_weights > x_texels || y_weights > y_texels || z_weights > z_texels)
1138 {
1139 continue;
1140 }
1141
1142 // Skip encodings in the wrong iteration
1143 if ((j == 0 && is_dual_plane) || (j == 1 && !is_dual_plane))
1144 {
1145 continue;
1146 }
1147
1148 // Always skip encodings we can't physically encode based on bit availability
1149 if (is_dual_plane)
1150 {
1151 // This is the only check we need as only support 1 partition
1152 if ((109 - weight_bits) <= 0)
1153 {
1154 continue;
1155 }
1156 }
1157 else
1158 {
1159 // This is conservative - fewer bits may be available for > 1 partition
1160 if ((111 - weight_bits) <= 0)
1161 {
1162 continue;
1163 }
1164 }
1165
1166 int decimation_mode = decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights];
1167 bsd.block_modes[packed_idx].decimation_mode = static_cast<uint8_t>(decimation_mode);
1168 bsd.block_modes[packed_idx].quant_mode = static_cast<uint8_t>(quant_mode);
1169 bsd.block_modes[packed_idx].weight_bits = static_cast<uint8_t>(weight_bits);
1170 bsd.block_modes[packed_idx].is_dual_plane = static_cast<uint8_t>(is_dual_plane);
1171 bsd.block_modes[packed_idx].mode_index = static_cast<uint16_t>(i);
1172
1173 bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_idx);
1174 bm_counts[j]++;
1175 packed_idx++;
1176 }
1177 }
1178
1179 bsd.block_mode_count_1plane_always = 0; // Skipped for 3D modes
1180 bsd.block_mode_count_1plane_selected = bm_counts[0];
1181 bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1];
1182 bsd.block_mode_count_all = bm_counts[0] + bm_counts[1];
1183
1184 // Determine the texels to use for kmeans clustering.
1185 assign_kmeans_texels(bsd);
1186
1187 delete wb;
1188 }
1189
1190 /* See header for documentation. */
init_block_size_descriptor(unsigned int x_texels,unsigned int y_texels,unsigned int z_texels,bool can_omit_modes,unsigned int partition_count_cutoff,float mode_cutoff,block_size_descriptor & bsd)1191 void init_block_size_descriptor(
1192 unsigned int x_texels,
1193 unsigned int y_texels,
1194 unsigned int z_texels,
1195 bool can_omit_modes,
1196 unsigned int partition_count_cutoff,
1197 float mode_cutoff,
1198 block_size_descriptor& bsd
1199 ) {
1200 if (z_texels > 1)
1201 {
1202 construct_block_size_descriptor_3d(x_texels, y_texels, z_texels, bsd);
1203 }
1204 else
1205 {
1206 construct_block_size_descriptor_2d(x_texels, y_texels, can_omit_modes, mode_cutoff, bsd);
1207 }
1208
1209 init_partition_tables(bsd, can_omit_modes, partition_count_cutoff);
1210 }
1211