1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 /**
19 * @brief Functions for the library entrypoint.
20 */
21
22 #include <array>
23 #include <cstring>
24 #include <new>
25
26 #include "astcenc.h"
27 #include "astcenc_internal_entry.h"
28 #include "astcenc_diagnostic_trace.h"
29
/**
 * @brief Record of the quality tuning parameter values.
 *
 * See the @c astcenc_config structure for detailed parameter documentation.
 *
 * Note that the mse_overshoot entries are scaling factors relative to the base MSE to hit db_limit.
 * A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use lower ratios
 * for the more thorough search presets because the underlying db_limit is so much higher.
 *
 * The member order is significant: the static preset tables in this file are populated using
 * positional aggregate initialization.
 */
struct astcenc_preset_config
{
    // Quality level of this preset. astcenc_config_init() compares the requested quality against
    // this value to select a preset, or a neighbouring pair of presets to interpolate between.
    float quality;

    // Search limits. Each is transferred (or interpolated and rounded) into the astcenc_config
    // field of the same name by astcenc_config_init().
    unsigned int tune_partition_count_limit;
    unsigned int tune_2partition_index_limit;
    unsigned int tune_3partition_index_limit;
    unsigned int tune_4partition_index_limit;
    unsigned int tune_block_mode_limit;
    unsigned int tune_refinement_limit;
    unsigned int tune_candidate_limit;
    unsigned int tune_2partitioning_candidate_limit;
    unsigned int tune_3partitioning_candidate_limit;
    unsigned int tune_4partitioning_candidate_limit;

    // Base values used to derive astcenc_config::tune_db_limit from the block texel count:
    //     max(a_base - 35 * log10(texels), b_base - 19 * log10(texels))
    float tune_db_limit_a_base;
    float tune_db_limit_b_base;

    // Floating point tuning factors. Each is transferred (or interpolated) into the
    // astcenc_config field of the same name by astcenc_config_init().
    float tune_mode0_mse_overshoot;
    float tune_refinement_mse_overshoot;
    float tune_2_partition_early_out_limit_factor;
    float tune_3_partition_early_out_limit_factor;
    float tune_2_plane_early_out_limit_correlation;
};
60
61
/**
 * @brief The static quality presets that are built-in for high bandwidth
 * presets (x < 25 texels per block).
 *
 * Each row is the preset quality level followed by the remaining @c astcenc_preset_config members
 * in declaration order: the ten unsigned search limits, the two db_limit bases, the two MSE
 * overshoot factors, the 2 and 3 partition early-out factors, and the 2 plane early-out
 * correlation.
 *
 * Rows are listed in ascending quality order; astcenc_config_init() scans the table from the
 * front and stops at the first row whose quality is not below the requested quality.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
    {
        ASTCENC_PRE_FASTEST,
        2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
    }, {
        ASTCENC_PRE_FAST,
        3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.90f
    }, {
        ASTCENC_PRE_MEDIUM,
        4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 2.5f, 1.1f, 1.05f, 0.95f
    }, {
        ASTCENC_PRE_THOROUGH,
        4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.35f, 1.15f, 0.97f
    }, {
        ASTCENC_PRE_VERYTHOROUGH,
        4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
    }, {
        ASTCENC_PRE_EXHAUSTIVE,
        4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
    }
}};
87
/**
 * @brief The static quality presets that are built-in for medium bandwidth
 * presets (25 <= x < 64 texels per block).
 *
 * Each row is the preset quality level followed by the remaining @c astcenc_preset_config members
 * in declaration order: the ten unsigned search limits, the two db_limit bases, the two MSE
 * overshoot factors, the 2 and 3 partition early-out factors, and the 2 plane early-out
 * correlation.
 *
 * Rows are listed in ascending quality order; astcenc_config_init() scans the table from the
 * front and stops at the first row whose quality is not below the requested quality.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
    {
        ASTCENC_PRE_FASTEST,
        2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f
    }, {
        ASTCENC_PRE_FAST,
        3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
    }, {
        ASTCENC_PRE_MEDIUM,
        4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 3.0f, 1.1f, 1.05f, 0.90f
    }, {
        ASTCENC_PRE_THOROUGH,
        4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.4f, 1.2f, 0.95f
    }, {
        ASTCENC_PRE_VERYTHOROUGH,
        4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
    }, {
        ASTCENC_PRE_EXHAUSTIVE,
        4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
    }
}};
113
/**
 * @brief The static quality presets that are built-in for low bandwidth
 * presets (64 <= x texels per block).
 *
 * Each row is the preset quality level followed by the remaining @c astcenc_preset_config members
 * in declaration order: the ten unsigned search limits, the two db_limit bases, the two MSE
 * overshoot factors, the 2 and 3 partition early-out factors, and the 2 plane early-out
 * correlation.
 *
 * Rows are listed in ascending quality order; astcenc_config_init() scans the table from the
 * front and stops at the first row whose quality is not below the requested quality.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
    {
        ASTCENC_PRE_FASTEST,
        2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f
    }, {
        ASTCENC_PRE_FAST,
        2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
    }, {
        ASTCENC_PRE_MEDIUM,
        3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 3.5f, 1.1f, 1.05f, 0.90f
    }, {
        ASTCENC_PRE_THOROUGH,
        4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.3f, 1.2f, 0.97f
    }, {
        ASTCENC_PRE_VERYTHOROUGH,
        4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
    }, {
        ASTCENC_PRE_EXHAUSTIVE,
        4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
    }
}};
139
140 /**
141 * @brief Validate CPU floating point meets assumptions made in the codec.
142 *
143 * The codec is written with the assumption that a float threaded through the @c if32 union will be
144 * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the
145 * case in an IEEE-754 compliant system, however not every system or compilation mode is actually
146 * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled.
147 *
148 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
149 */
validate_cpu_float()150 static astcenc_error validate_cpu_float()
151 {
152 if32 p;
153 volatile float xprec_testval = 2.51f;
154 p.f = xprec_testval + 12582912.0f;
155 float q = p.f - 12582912.0f;
156
157 if (q != 3.0f)
158 {
159 return ASTCENC_ERR_BAD_CPU_FLOAT;
160 }
161
162 return ASTCENC_SUCCESS;
163 }
164
165 /**
166 * @brief Validate CPU ISA support meets the requirements of this build of the library.
167 *
168 * Each library build is statically compiled for a particular set of CPU ISA features, such as the
169 * SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU
170 * actually supports everything this build needs.
171 *
172 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
173 */
validate_cpu_isa()174 static astcenc_error validate_cpu_isa()
175 {
176 #if ASTCENC_SSE >= 41
177 if (!cpu_supports_sse41())
178 {
179 return ASTCENC_ERR_BAD_CPU_ISA;
180 }
181 #endif
182
183 #if ASTCENC_POPCNT >= 1
184 if (!cpu_supports_popcnt())
185 {
186 return ASTCENC_ERR_BAD_CPU_ISA;
187 }
188 #endif
189
190 #if ASTCENC_F16C >= 1
191 if (!cpu_supports_f16c())
192 {
193 return ASTCENC_ERR_BAD_CPU_ISA;
194 }
195 #endif
196
197 #if ASTCENC_AVX >= 2
198 if (!cpu_supports_avx2())
199 {
200 return ASTCENC_ERR_BAD_CPU_ISA;
201 }
202 #endif
203
204 return ASTCENC_SUCCESS;
205 }
206
207 /**
208 * @brief Validate config profile.
209 *
210 * @param profile The profile to check.
211 *
212 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
213 */
validate_profile(astcenc_profile profile)214 static astcenc_error validate_profile(
215 astcenc_profile profile
216 ) {
217 // Values in this enum are from an external user, so not guaranteed to be
218 // bounded to the enum values
219 switch (static_cast<int>(profile))
220 {
221 case ASTCENC_PRF_LDR_SRGB:
222 case ASTCENC_PRF_LDR:
223 case ASTCENC_PRF_HDR_RGB_LDR_A:
224 case ASTCENC_PRF_HDR:
225 return ASTCENC_SUCCESS;
226 default:
227 return ASTCENC_ERR_BAD_PROFILE;
228 }
229 }
230
231 /**
232 * @brief Validate block size.
233 *
234 * @param block_x The block x dimensions.
235 * @param block_y The block y dimensions.
236 * @param block_z The block z dimensions.
237 *
238 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
239 */
validate_block_size(unsigned int block_x,unsigned int block_y,unsigned int block_z)240 static astcenc_error validate_block_size(
241 unsigned int block_x,
242 unsigned int block_y,
243 unsigned int block_z
244 ) {
245 // Test if this is a legal block size at all
246 bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
247 ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)));
248 if (!is_legal)
249 {
250 return ASTCENC_ERR_BAD_BLOCK_SIZE;
251 }
252
253 // Test if this build has sufficient capacity for this block size
254 bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS;
255 if (!have_capacity)
256 {
257 return ASTCENC_ERR_NOT_IMPLEMENTED;
258 }
259
260 return ASTCENC_SUCCESS;
261 }
262
263 /**
264 * @brief Validate flags.
265 *
266 * @param flags The flags to check.
267 *
268 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
269 */
validate_flags(unsigned int flags)270 static astcenc_error validate_flags(
271 unsigned int flags
272 ) {
273 // Flags field must not contain any unknown flag bits
274 unsigned int exMask = ~ASTCENC_ALL_FLAGS;
275 if (popcount(flags & exMask) != 0)
276 {
277 return ASTCENC_ERR_BAD_FLAGS;
278 }
279
280 // Flags field must only contain at most a single map type
281 exMask = ASTCENC_FLG_MAP_MASK
282 | ASTCENC_FLG_MAP_NORMAL
283 | ASTCENC_FLG_MAP_RGBM;
284 if (popcount(flags & exMask) > 1)
285 {
286 return ASTCENC_ERR_BAD_FLAGS;
287 }
288
289 return ASTCENC_SUCCESS;
290 }
291
292 #if !defined(ASTCENC_DECOMPRESS_ONLY)
293
294 /**
295 * @brief Validate single channel compression swizzle.
296 *
297 * @param swizzle The swizzle to check.
298 *
299 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
300 */
validate_compression_swz(astcenc_swz swizzle)301 static astcenc_error validate_compression_swz(
302 astcenc_swz swizzle
303 ) {
304 // Not all enum values are handled; SWZ_Z is invalid for compression
305 switch (static_cast<int>(swizzle))
306 {
307 case ASTCENC_SWZ_R:
308 case ASTCENC_SWZ_G:
309 case ASTCENC_SWZ_B:
310 case ASTCENC_SWZ_A:
311 case ASTCENC_SWZ_0:
312 case ASTCENC_SWZ_1:
313 return ASTCENC_SUCCESS;
314 default:
315 return ASTCENC_ERR_BAD_SWIZZLE;
316 }
317 }
318
319 /**
320 * @brief Validate overall compression swizzle.
321 *
322 * @param swizzle The swizzle to check.
323 *
324 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
325 */
validate_compression_swizzle(const astcenc_swizzle & swizzle)326 static astcenc_error validate_compression_swizzle(
327 const astcenc_swizzle& swizzle
328 ) {
329 if (validate_compression_swz(swizzle.r) ||
330 validate_compression_swz(swizzle.g) ||
331 validate_compression_swz(swizzle.b) ||
332 validate_compression_swz(swizzle.a))
333 {
334 return ASTCENC_ERR_BAD_SWIZZLE;
335 }
336
337 return ASTCENC_SUCCESS;
338 }
339 #endif
340
341 /**
342 * @brief Validate single channel decompression swizzle.
343 *
344 * @param swizzle The swizzle to check.
345 *
346 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
347 */
validate_decompression_swz(astcenc_swz swizzle)348 static astcenc_error validate_decompression_swz(
349 astcenc_swz swizzle
350 ) {
351 // Values in this enum are from an external user, so not guaranteed to be
352 // bounded to the enum values
353 switch (static_cast<int>(swizzle))
354 {
355 case ASTCENC_SWZ_R:
356 case ASTCENC_SWZ_G:
357 case ASTCENC_SWZ_B:
358 case ASTCENC_SWZ_A:
359 case ASTCENC_SWZ_0:
360 case ASTCENC_SWZ_1:
361 case ASTCENC_SWZ_Z:
362 return ASTCENC_SUCCESS;
363 default:
364 return ASTCENC_ERR_BAD_SWIZZLE;
365 }
366 }
367
368 /**
369 * @brief Validate overall decompression swizzle.
370 *
371 * @param swizzle The swizzle to check.
372 *
373 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
374 */
validate_decompression_swizzle(const astcenc_swizzle & swizzle)375 static astcenc_error validate_decompression_swizzle(
376 const astcenc_swizzle& swizzle
377 ) {
378 if (validate_decompression_swz(swizzle.r) ||
379 validate_decompression_swz(swizzle.g) ||
380 validate_decompression_swz(swizzle.b) ||
381 validate_decompression_swz(swizzle.a))
382 {
383 return ASTCENC_ERR_BAD_SWIZZLE;
384 }
385
386 return ASTCENC_SUCCESS;
387 }
388
389 /**
390 * Validate that an incoming configuration is in-spec.
391 *
392 * This function can respond in two ways:
393 *
394 * * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown
395 * for out-of-range inputs in this case.
396 * * Numerical inputs and logic inputs are are logically invalid and which make no sense
397 * algorithmically will return an error.
398 *
399 * @param[in,out] config The input compressor configuration.
400 *
401 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
402 */
validate_config(astcenc_config & config)403 static astcenc_error validate_config(
404 astcenc_config &config
405 ) {
406 astcenc_error status;
407
408 status = validate_profile(config.profile);
409 if (status != ASTCENC_SUCCESS)
410 {
411 return status;
412 }
413
414 status = validate_flags(config.flags);
415 if (status != ASTCENC_SUCCESS)
416 {
417 return status;
418 }
419
420 status = validate_block_size(config.block_x, config.block_y, config.block_z);
421 if (status != ASTCENC_SUCCESS)
422 {
423 return status;
424 }
425
426 #if defined(ASTCENC_DECOMPRESS_ONLY)
427 // Decompress-only builds only support decompress-only contexts
428 if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
429 {
430 return ASTCENC_ERR_BAD_PARAM;
431 }
432 #endif
433
434 config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
435
436 config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
437 config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
438 config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
439 config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
440 config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
441 config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
442 config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
443 config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
444 config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
445 config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
446 config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
447 config.tune_mode0_mse_overshoot = astc::max(config.tune_mode0_mse_overshoot, 1.0f);
448 config.tune_refinement_mse_overshoot = astc::max(config.tune_refinement_mse_overshoot, 1.0f);
449 config.tune_2_partition_early_out_limit_factor = astc::max(config.tune_2_partition_early_out_limit_factor, 0.0f);
450 config.tune_3_partition_early_out_limit_factor = astc::max(config.tune_3_partition_early_out_limit_factor, 0.0f);
451 config.tune_2_plane_early_out_limit_correlation = astc::max(config.tune_2_plane_early_out_limit_correlation, 0.0f);
452
453 // Specifying a zero weight color component is not allowed; force to small value
454 float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
455 astc::max(config.cw_b_weight, config.cw_a_weight));
456 if (max_weight > 0.0f)
457 {
458 max_weight /= 1000.0f;
459 config.cw_r_weight = astc::max(config.cw_r_weight, max_weight);
460 config.cw_g_weight = astc::max(config.cw_g_weight, max_weight);
461 config.cw_b_weight = astc::max(config.cw_b_weight, max_weight);
462 config.cw_a_weight = astc::max(config.cw_a_weight, max_weight);
463 }
464 // If all color components error weights are zero then return an error
465 else
466 {
467 return ASTCENC_ERR_BAD_PARAM;
468 }
469
470 return ASTCENC_SUCCESS;
471 }
472
473 /* See header for documentation. */
astcenc_config_init(astcenc_profile profile,unsigned int block_x,unsigned int block_y,unsigned int block_z,float quality,unsigned int flags,astcenc_config * configp)474 astcenc_error astcenc_config_init(
475 astcenc_profile profile,
476 unsigned int block_x,
477 unsigned int block_y,
478 unsigned int block_z,
479 float quality,
480 unsigned int flags,
481 astcenc_config* configp
482 ) {
483 astcenc_error status;
484
485 // Check basic library compatibility options here so they are checked early. Note, these checks
486 // are repeated in context_alloc for cases where callers use a manually defined config struct
487 status = validate_cpu_isa();
488 if (status != ASTCENC_SUCCESS)
489 {
490 return status;
491 }
492
493 status = validate_cpu_float();
494 if (status != ASTCENC_SUCCESS)
495 {
496 return status;
497 }
498
499 // Zero init all config fields; although most of will be over written
500 astcenc_config& config = *configp;
501 std::memset(&config, 0, sizeof(config));
502
503 // Process the block size
504 block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1
505 status = validate_block_size(block_x, block_y, block_z);
506 if (status != ASTCENC_SUCCESS)
507 {
508 return status;
509 }
510
511 config.block_x = block_x;
512 config.block_y = block_y;
513 config.block_z = block_z;
514
515 float texels = static_cast<float>(block_x * block_y * block_z);
516 float ltexels = logf(texels) / logf(10.0f);
517
518 // Process the performance quality level or preset; note that this must be done before we
519 // process any additional settings, such as color profile and flags, which may replace some of
520 // these settings with more use case tuned values
521 if (quality < ASTCENC_PRE_FASTEST ||
522 quality > ASTCENC_PRE_EXHAUSTIVE)
523 {
524 return ASTCENC_ERR_BAD_QUALITY;
525 }
526
527 static const std::array<astcenc_preset_config, 6>* preset_configs;
528 int texels_int = block_x * block_y * block_z;
529 if (texels_int < 25)
530 {
531 preset_configs = &preset_configs_high;
532 }
533 else if (texels_int < 64)
534 {
535 preset_configs = &preset_configs_mid;
536 }
537 else
538 {
539 preset_configs = &preset_configs_low;
540 }
541
542 // Determine which preset to use, or which pair to interpolate
543 size_t start;
544 size_t end;
545 for (end = 0; end < preset_configs->size(); end++)
546 {
547 if ((*preset_configs)[end].quality >= quality)
548 {
549 break;
550 }
551 }
552
553 start = end == 0 ? 0 : end - 1;
554
555 // Start and end node are the same - so just transfer the values.
556 if (start == end)
557 {
558 config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
559 config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
560 config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
561 config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
562 config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
563 config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
564 config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES);
565 config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
566 config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
567 config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
568 config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
569 (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
570
571 config.tune_mode0_mse_overshoot = (*preset_configs)[start].tune_mode0_mse_overshoot;
572 config.tune_refinement_mse_overshoot = (*preset_configs)[start].tune_refinement_mse_overshoot;
573
574 config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor;
575 config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor;
576 config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation;
577 }
578 // Start and end node are not the same - so interpolate between them
579 else
580 {
581 auto& node_a = (*preset_configs)[start];
582 auto& node_b = (*preset_configs)[end];
583
584 float wt_range = node_b.quality - node_a.quality;
585 assert(wt_range > 0);
586
587 // Compute interpolation factors
588 float wt_node_a = (node_b.quality - quality) / wt_range;
589 float wt_node_b = (quality - node_a.quality) / wt_range;
590
591 #define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b))
592 #define LERPI(param) astc::flt2int_rtn(\
593 (static_cast<float>(node_a.param) * wt_node_a) + \
594 (static_cast<float>(node_b.param) * wt_node_b))
595 #define LERPUI(param) static_cast<unsigned int>(LERPI(param))
596
597 config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
598 config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
599 config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
600 config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
601 config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
602 config.tune_refinement_limit = LERPI(tune_refinement_limit);
603 config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit),
604 TUNE_MAX_TRIAL_CANDIDATES);
605 config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit),
606 BLOCK_MAX_PARTITIONINGS);
607 config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit),
608 BLOCK_MAX_PARTITIONINGS);
609 config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit),
610 BLOCK_MAX_PARTITIONINGS);
611 config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
612 LERP(tune_db_limit_b_base) - 19 * ltexels);
613
614 config.tune_mode0_mse_overshoot = LERP(tune_mode0_mse_overshoot);
615 config.tune_refinement_mse_overshoot = LERP(tune_refinement_mse_overshoot);
616
617 config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor);
618 config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor);
619 config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation);
620 #undef LERP
621 #undef LERPI
622 #undef LERPUI
623 }
624
625 // Set heuristics to the defaults for each color profile
626 config.cw_r_weight = 1.0f;
627 config.cw_g_weight = 1.0f;
628 config.cw_b_weight = 1.0f;
629 config.cw_a_weight = 1.0f;
630
631 config.a_scale_radius = 0;
632
633 config.rgbm_m_scale = 0.0f;
634
635 config.profile = profile;
636
637 // Values in this enum are from an external user, so not guaranteed to be
638 // bounded to the enum values
639 switch (static_cast<int>(profile))
640 {
641 case ASTCENC_PRF_LDR:
642 case ASTCENC_PRF_LDR_SRGB:
643 break;
644 case ASTCENC_PRF_HDR_RGB_LDR_A:
645 case ASTCENC_PRF_HDR:
646 config.tune_db_limit = 999.0f;
647 break;
648 default:
649 return ASTCENC_ERR_BAD_PROFILE;
650 }
651
652 // Flags field must not contain any unknown flag bits
653 status = validate_flags(flags);
654 if (status != ASTCENC_SUCCESS)
655 {
656 return status;
657 }
658
659 if (flags & ASTCENC_FLG_MAP_NORMAL)
660 {
661 // Normal map encoding uses L+A blocks, so allow one more partitioning
662 // than normal. We need need fewer bits for endpoints, so more likely
663 // to be able to use more partitions than an RGB/RGBA block
664 config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u);
665
666 config.cw_g_weight = 0.0f;
667 config.cw_b_weight = 0.0f;
668 config.tune_2_partition_early_out_limit_factor *= 1.5f;
669 config.tune_3_partition_early_out_limit_factor *= 1.5f;
670 config.tune_2_plane_early_out_limit_correlation = 0.99f;
671
672 // Normals are prone to blocking artifacts on smooth curves
673 // so force compressor to try harder here ...
674 config.tune_db_limit *= 1.03f;
675 }
676 else if (flags & ASTCENC_FLG_MAP_MASK)
677 {
678 // Masks are prone to blocking artifacts on mask edges
679 // so force compressor to try harder here ...
680 config.tune_db_limit *= 1.03f;
681 }
682 else if (flags & ASTCENC_FLG_MAP_RGBM)
683 {
684 config.rgbm_m_scale = 5.0f;
685 config.cw_a_weight = 2.0f * config.rgbm_m_scale;
686 }
687 else // (This is color data)
688 {
689 // This is a very basic perceptual metric for RGB color data, which weights error
690 // significance by the perceptual luminance contribution of each color channel. For
691 // luminance the usual weights to compute luminance from a linear RGB value are as
692 // follows:
693 //
694 // l = r * 0.3 + g * 0.59 + b * 0.11
695 //
696 // ... but we scale these up to keep a better balance between color and alpha. Note
697 // that if the content is using alpha we'd recommend using the -a option to weight
698 // the color contribution by the alpha transparency.
699 if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
700 {
701 config.cw_r_weight = 0.30f * 2.25f;
702 config.cw_g_weight = 0.59f * 2.25f;
703 config.cw_b_weight = 0.11f * 2.25f;
704 }
705 }
706 config.flags = flags;
707
708 return ASTCENC_SUCCESS;
709 }
710
711 /* See header for documentation. */
astcenc_context_alloc(const astcenc_config * configp,unsigned int thread_count,astcenc_context ** context)712 astcenc_error astcenc_context_alloc(
713 const astcenc_config* configp,
714 unsigned int thread_count,
715 astcenc_context** context
716 ) {
717 astcenc_error status;
718 const astcenc_config& config = *configp;
719
720 status = validate_cpu_isa();
721 if (status != ASTCENC_SUCCESS)
722 {
723 return status;
724 }
725
726 status = validate_cpu_float();
727 if (status != ASTCENC_SUCCESS)
728 {
729 return status;
730 }
731
732 if (thread_count == 0)
733 {
734 return ASTCENC_ERR_BAD_PARAM;
735 }
736
737 #if defined(ASTCENC_DIAGNOSTICS)
738 // Force single threaded compressor use in diagnostic mode.
739 if (thread_count != 1)
740 {
741 return ASTCENC_ERR_BAD_PARAM;
742 }
743 #endif
744
745 astcenc_context* ctxo = new astcenc_context;
746 astcenc_contexti* ctx = &ctxo->context;
747 ctx->thread_count = thread_count;
748 ctx->config = config;
749 ctx->working_buffers = nullptr;
750
751 // These are allocated per-compress, as they depend on image size
752 ctx->input_alpha_averages = nullptr;
753
754 // Copy the config first and validate the copy (we may modify it)
755 status = validate_config(ctx->config);
756 if (status != ASTCENC_SUCCESS)
757 {
758 delete ctxo;
759 return status;
760 }
761
762 ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
763 bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
764 init_block_size_descriptor(config.block_x, config.block_y, config.block_z,
765 can_omit_modes,
766 config.tune_partition_count_limit,
767 static_cast<float>(config.tune_block_mode_limit) / 100.0f,
768 *ctx->bsd);
769
770 #if !defined(ASTCENC_DECOMPRESS_ONLY)
771 // Do setup only needed by compression
772 if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY))
773 {
774 // Turn a dB limit into a per-texel error for faster use later
775 if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
776 {
777 ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f;
778 }
779 else
780 {
781 ctx->config.tune_db_limit = 0.0f;
782 }
783
784 size_t worksize = sizeof(compression_working_buffers) * thread_count;
785 ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
786 static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0,
787 "compression_working_buffers size must be multiple of vector alignment");
788 if (!ctx->working_buffers)
789 {
790 aligned_free<block_size_descriptor>(ctx->bsd);
791 delete ctxo;
792 *context = nullptr;
793 return ASTCENC_ERR_OUT_OF_MEM;
794 }
795 }
796 #endif
797
798 #if defined(ASTCENC_DIAGNOSTICS)
799 ctx->trace_log = new TraceLog(ctx->config.trace_file_path);
800 if (!ctx->trace_log->m_file)
801 {
802 return ASTCENC_ERR_DTRACE_FAILURE;
803 }
804
805 trace_add_data("block_x", config.block_x);
806 trace_add_data("block_y", config.block_y);
807 trace_add_data("block_z", config.block_z);
808 #endif
809
810 *context = ctxo;
811
812 #if !defined(ASTCENC_DECOMPRESS_ONLY)
813 prepare_angular_tables();
814 #endif
815
816 return ASTCENC_SUCCESS;
817 }
818
819 /* See header dor documentation. */
astcenc_context_free(astcenc_context * ctxo)820 void astcenc_context_free(
821 astcenc_context* ctxo
822 ) {
823 if (ctxo)
824 {
825 astcenc_contexti* ctx = &ctxo->context;
826 aligned_free<compression_working_buffers>(ctx->working_buffers);
827 aligned_free<block_size_descriptor>(ctx->bsd);
828 #if defined(ASTCENC_DIAGNOSTICS)
829 delete ctx->trace_log;
830 #endif
831 delete ctxo;
832 }
833 }
834
835 #if !defined(ASTCENC_DECOMPRESS_ONLY)
836
837 /**
838 * @brief Compress an image, after any preflight has completed.
839 *
840 * @param[out] ctxo The compressor context.
841 * @param thread_index The thread index.
 * @param      image          The input image.
843 * @param swizzle The input swizzle.
844 * @param[out] buffer The output array for the compressed data.
845 */
compress_image(astcenc_context & ctxo,unsigned int thread_index,const astcenc_image & image,const astcenc_swizzle & swizzle,uint8_t * buffer)846 static void compress_image(
847 astcenc_context& ctxo,
848 unsigned int thread_index,
849 const astcenc_image& image,
850 const astcenc_swizzle& swizzle,
851 uint8_t* buffer
852 ) {
853 astcenc_contexti& ctx = ctxo.context;
854 const block_size_descriptor& bsd = *ctx.bsd;
855 astcenc_profile decode_mode = ctx.config.profile;
856
857 image_block blk;
858
859 int block_x = bsd.xdim;
860 int block_y = bsd.ydim;
861 int block_z = bsd.zdim;
862 blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
863
864 int dim_x = image.dim_x;
865 int dim_y = image.dim_y;
866 int dim_z = image.dim_z;
867
868 int xblocks = (dim_x + block_x - 1) / block_x;
869 int yblocks = (dim_y + block_y - 1) / block_y;
870 int zblocks = (dim_z + block_z - 1) / block_z;
871 int block_count = zblocks * yblocks * xblocks;
872
873 int row_blocks = xblocks;
874 int plane_blocks = xblocks * yblocks;
875
876 // Populate the block channel weights
877 blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
878 ctx.config.cw_g_weight,
879 ctx.config.cw_b_weight,
880 ctx.config.cw_a_weight);
881
882 // Use preallocated scratch buffer
883 auto& temp_buffers = ctx.working_buffers[thread_index];
884
885 // Only the first thread actually runs the initializer
886 ctxo.manage_compress.init(block_count);
887
888 // Determine if we can use an optimized load function
889 bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
890 (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A);
891
892 bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) ||
893 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);
894
895 bool use_fast_load = !needs_swz && !needs_hdr &&
896 block_z == 1 && image.data_type == ASTCENC_TYPE_U8;
897
898 auto load_func = load_image_block;
899 if (use_fast_load)
900 {
901 load_func = load_image_block_fast_ldr;
902 }
903
904 // All threads run this processing loop until there is no work remaining
905 while (true)
906 {
907 unsigned int count;
908 unsigned int base = ctxo.manage_compress.get_task_assignment(16, count);
909 if (!count)
910 {
911 break;
912 }
913
914 for (unsigned int i = base; i < base + count; i++)
915 {
916 // Decode i into x, y, z block indices
917 int z = i / plane_blocks;
918 unsigned int rem = i - (z * plane_blocks);
919 int y = rem / row_blocks;
920 int x = rem - (y * row_blocks);
921
922 // Test if we can apply some basic alpha-scale RDO
923 bool use_full_block = true;
924 if (ctx.config.a_scale_radius != 0 && block_z == 1)
925 {
926 int start_x = x * block_x;
927 int end_x = astc::min(dim_x, start_x + block_x);
928
929 int start_y = y * block_y;
930 int end_y = astc::min(dim_y, start_y + block_y);
931
932 // SATs accumulate error, so don't test exactly zero. Test for
933 // less than 1 alpha in the expanded block footprint that
934 // includes the alpha radius.
935 int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1);
936
937 int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1);
938
939 float footprint = static_cast<float>(x_footprint * y_footprint);
940 float threshold = 0.9f / (255.0f * footprint);
941
942 // Do we have any alpha values?
943 use_full_block = false;
944 for (int ay = start_y; ay < end_y; ay++)
945 {
946 for (int ax = start_x; ax < end_x; ax++)
947 {
948 float a_avg = ctx.input_alpha_averages[ay * dim_x + ax];
949 if (a_avg > threshold)
950 {
951 use_full_block = true;
952 ax = end_x;
953 ay = end_y;
954 }
955 }
956 }
957 }
958
959 // Fetch the full block for compression
960 if (use_full_block)
961 {
962 load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);
963
964 // Scale RGB error contribution by the maximum alpha in the block
965 // This encourages preserving alpha accuracy in regions with high
966 // transparency, and can buy up to 0.5 dB PSNR.
967 if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
968 {
969 float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
970 blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
971 ctx.config.cw_g_weight * alpha_scale,
972 ctx.config.cw_b_weight * alpha_scale,
973 ctx.config.cw_a_weight);
974 }
975 }
976 // Apply alpha scale RDO - substitute constant color block
977 else
978 {
979 blk.origin_texel = vfloat4::zero();
980 blk.data_min = vfloat4::zero();
981 blk.data_mean = vfloat4::zero();
982 blk.data_max = vfloat4::zero();
983 blk.grayscale = true;
984 }
985
986 int offset = ((z * yblocks + y) * xblocks + x) * 16;
987 uint8_t *bp = buffer + offset;
988 physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp);
989 compress_block(ctx, blk, *pcb, temp_buffers);
990 }
991
992 ctxo.manage_compress.complete_task_assignment(count);
993 }
994 }
995
996 /**
997 * @brief Compute regional averages in an image.
998 *
999 * This function can be called by multiple threads, but only after a single
1000 * thread calls the setup function @c init_compute_averages().
1001 *
1002 * Results are written back into @c img->input_alpha_averages.
1003 *
1004 * @param[out] ctx The context.
1005 * @param ag The average and variance arguments created during setup.
1006 */
compute_averages(astcenc_context & ctx,const avg_args & ag)1007 static void compute_averages(
1008 astcenc_context& ctx,
1009 const avg_args &ag
1010 ) {
1011 pixel_region_args arg = ag.arg;
1012 arg.work_memory = new vfloat4[ag.work_memory_size];
1013
1014 int size_x = ag.img_size_x;
1015 int size_y = ag.img_size_y;
1016 int size_z = ag.img_size_z;
1017
1018 int step_xy = ag.blk_size_xy;
1019 int step_z = ag.blk_size_z;
1020
1021 int y_tasks = (size_y + step_xy - 1) / step_xy;
1022
1023 // All threads run this processing loop until there is no work remaining
1024 while (true)
1025 {
1026 unsigned int count;
1027 unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
1028 if (!count)
1029 {
1030 break;
1031 }
1032
1033 for (unsigned int i = base; i < base + count; i++)
1034 {
1035 int z = (i / (y_tasks)) * step_z;
1036 int y = (i - (z * y_tasks)) * step_xy;
1037
1038 arg.size_z = astc::min(step_z, size_z - z);
1039 arg.offset_z = z;
1040
1041 arg.size_y = astc::min(step_xy, size_y - y);
1042 arg.offset_y = y;
1043
1044 for (int x = 0; x < size_x; x += step_xy)
1045 {
1046 arg.size_x = astc::min(step_xy, size_x - x);
1047 arg.offset_x = x;
1048 compute_pixel_region_variance(ctx.context, arg);
1049 }
1050 }
1051
1052 ctx.manage_avg.complete_task_assignment(count);
1053 }
1054
1055 delete[] arg.work_memory;
1056 }
1057
1058 #endif
1059
1060 /* See header for documentation. */
astcenc_compress_image(astcenc_context * ctxo,astcenc_image * imagep,const astcenc_swizzle * swizzle,uint8_t * data_out,size_t data_len,unsigned int thread_index)1061 astcenc_error astcenc_compress_image(
1062 astcenc_context* ctxo,
1063 astcenc_image* imagep,
1064 const astcenc_swizzle* swizzle,
1065 uint8_t* data_out,
1066 size_t data_len,
1067 unsigned int thread_index
1068 ) {
1069 #if defined(ASTCENC_DECOMPRESS_ONLY)
1070 (void)ctxo;
1071 (void)imagep;
1072 (void)swizzle;
1073 (void)data_out;
1074 (void)data_len;
1075 (void)thread_index;
1076 return ASTCENC_ERR_BAD_CONTEXT;
1077 #else
1078 astcenc_contexti* ctx = &ctxo->context;
1079 astcenc_error status;
1080 astcenc_image& image = *imagep;
1081
1082 if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1083 {
1084 return ASTCENC_ERR_BAD_CONTEXT;
1085 }
1086
1087 status = validate_compression_swizzle(*swizzle);
1088 if (status != ASTCENC_SUCCESS)
1089 {
1090 return status;
1091 }
1092
1093 if (thread_index >= ctx->thread_count)
1094 {
1095 return ASTCENC_ERR_BAD_PARAM;
1096 }
1097
1098 unsigned int block_x = ctx->config.block_x;
1099 unsigned int block_y = ctx->config.block_y;
1100 unsigned int block_z = ctx->config.block_z;
1101
1102 unsigned int xblocks = (image.dim_x + block_x - 1) / block_x;
1103 unsigned int yblocks = (image.dim_y + block_y - 1) / block_y;
1104 unsigned int zblocks = (image.dim_z + block_z - 1) / block_z;
1105
1106 // Check we have enough output space (16 bytes per block)
1107 size_t size_needed = xblocks * yblocks * zblocks * 16;
1108 if (data_len < size_needed)
1109 {
1110 return ASTCENC_ERR_OUT_OF_MEM;
1111 }
1112
1113 // If context thread count is one then implicitly reset
1114 if (ctx->thread_count == 1)
1115 {
1116 astcenc_compress_reset(ctxo);
1117 }
1118
1119 if (ctx->config.a_scale_radius != 0)
1120 {
1121 // First thread to enter will do setup, other threads will subsequently
1122 // enter the critical section but simply skip over the initialization
1123 auto init_avg = [ctx, &image, swizzle]() {
1124 // Perform memory allocations for the destination buffers
1125 size_t texel_count = image.dim_x * image.dim_y * image.dim_z;
1126 ctx->input_alpha_averages = new float[texel_count];
1127
1128 return init_compute_averages(
1129 image, ctx->config.a_scale_radius, *swizzle,
1130 ctx->avg_preprocess_args);
1131 };
1132
1133 // Only the first thread actually runs the initializer
1134 ctxo->manage_avg.init(init_avg);
1135
1136 // All threads will enter this function and dynamically grab work
1137 compute_averages(*ctxo, ctx->avg_preprocess_args);
1138 }
1139
1140 // Wait for compute_averages to complete before compressing
1141 ctxo->manage_avg.wait();
1142
1143 compress_image(*ctxo, thread_index, image, *swizzle, data_out);
1144
1145 // Wait for compress to complete before freeing memory
1146 ctxo->manage_compress.wait();
1147
1148 auto term_compress = [ctx]() {
1149 delete[] ctx->input_alpha_averages;
1150 ctx->input_alpha_averages = nullptr;
1151 };
1152
1153 // Only the first thread to arrive actually runs the term
1154 ctxo->manage_compress.term(term_compress);
1155
1156 return ASTCENC_SUCCESS;
1157 #endif
1158 }
1159
1160 /* See header for documentation. */
astcenc_compress_reset(astcenc_context * ctxo)1161 astcenc_error astcenc_compress_reset(
1162 astcenc_context* ctxo
1163 ) {
1164 #if defined(ASTCENC_DECOMPRESS_ONLY)
1165 (void)ctxo;
1166 return ASTCENC_ERR_BAD_CONTEXT;
1167 #else
1168 astcenc_contexti* ctx = &ctxo->context;
1169 if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1170 {
1171 return ASTCENC_ERR_BAD_CONTEXT;
1172 }
1173
1174 ctxo->manage_avg.reset();
1175 ctxo->manage_compress.reset();
1176 return ASTCENC_SUCCESS;
1177 #endif
1178 }
1179
1180 /* See header for documentation. */
astcenc_decompress_image(astcenc_context * ctxo,const uint8_t * data,size_t data_len,astcenc_image * image_outp,const astcenc_swizzle * swizzle,unsigned int thread_index)1181 astcenc_error astcenc_decompress_image(
1182 astcenc_context* ctxo,
1183 const uint8_t* data,
1184 size_t data_len,
1185 astcenc_image* image_outp,
1186 const astcenc_swizzle* swizzle,
1187 unsigned int thread_index
1188 ) {
1189 astcenc_error status;
1190 astcenc_image& image_out = *image_outp;
1191 astcenc_contexti* ctx = &ctxo->context;
1192
1193 // Today this doesn't matter (working set on stack) but might in future ...
1194 if (thread_index >= ctx->thread_count)
1195 {
1196 return ASTCENC_ERR_BAD_PARAM;
1197 }
1198
1199 status = validate_decompression_swizzle(*swizzle);
1200 if (status != ASTCENC_SUCCESS)
1201 {
1202 return status;
1203 }
1204
1205 unsigned int block_x = ctx->config.block_x;
1206 unsigned int block_y = ctx->config.block_y;
1207 unsigned int block_z = ctx->config.block_z;
1208
1209 unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
1210 unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
1211 unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
1212
1213 int row_blocks = xblocks;
1214 int plane_blocks = xblocks * yblocks;
1215
1216 // Check we have enough output space (16 bytes per block)
1217 size_t size_needed = xblocks * yblocks * zblocks * 16;
1218 if (data_len < size_needed)
1219 {
1220 return ASTCENC_ERR_OUT_OF_MEM;
1221 }
1222
1223 image_block blk;
1224 blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
1225
1226 // If context thread count is one then implicitly reset
1227 if (ctx->thread_count == 1)
1228 {
1229 astcenc_decompress_reset(ctxo);
1230 }
1231
1232 // Only the first thread actually runs the initializer
1233 ctxo->manage_decompress.init(zblocks * yblocks * xblocks);
1234
1235 // All threads run this processing loop until there is no work remaining
1236 while (true)
1237 {
1238 unsigned int count;
1239 unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
1240 if (!count)
1241 {
1242 break;
1243 }
1244
1245 for (unsigned int i = base; i < base + count; i++)
1246 {
1247 // Decode i into x, y, z block indices
1248 int z = i / plane_blocks;
1249 unsigned int rem = i - (z * plane_blocks);
1250 int y = rem / row_blocks;
1251 int x = rem - (y * row_blocks);
1252
1253 unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
1254 const uint8_t* bp = data + offset;
1255
1256 const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(bp);
1257 symbolic_compressed_block scb;
1258
1259 physical_to_symbolic(*ctx->bsd, pcb, scb);
1260
1261 decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
1262 x * block_x, y * block_y, z * block_z,
1263 scb, blk);
1264
1265 store_image_block(image_out, blk, *ctx->bsd,
1266 x * block_x, y * block_y, z * block_z, *swizzle);
1267 }
1268
1269 ctxo->manage_decompress.complete_task_assignment(count);
1270 }
1271
1272 return ASTCENC_SUCCESS;
1273 }
1274
1275 /* See header for documentation. */
astcenc_decompress_reset(astcenc_context * ctxo)1276 astcenc_error astcenc_decompress_reset(
1277 astcenc_context* ctxo
1278 ) {
1279 ctxo->manage_decompress.reset();
1280 return ASTCENC_SUCCESS;
1281 }
1282
1283 /* See header for documentation. */
astcenc_get_block_info(astcenc_context * ctxo,const uint8_t data[16],astcenc_block_info * info)1284 astcenc_error astcenc_get_block_info(
1285 astcenc_context* ctxo,
1286 const uint8_t data[16],
1287 astcenc_block_info* info
1288 ) {
1289 #if defined(ASTCENC_DECOMPRESS_ONLY)
1290 (void)ctxo;
1291 (void)data;
1292 (void)info;
1293 return ASTCENC_ERR_BAD_CONTEXT;
1294 #else
1295 astcenc_contexti* ctx = &ctxo->context;
1296
1297 // Decode the compressed data into a symbolic form
1298 const physical_compressed_block&pcb = *reinterpret_cast<const physical_compressed_block*>(data);
1299 symbolic_compressed_block scb;
1300 physical_to_symbolic(*ctx->bsd, pcb, scb);
1301
1302 // Fetch the appropriate partition and decimation tables
1303 block_size_descriptor& bsd = *ctx->bsd;
1304
1305 // Start from a clean slate
1306 memset(info, 0, sizeof(*info));
1307
1308 // Basic info we can always populate
1309 info->profile = ctx->config.profile;
1310
1311 info->block_x = ctx->config.block_x;
1312 info->block_y = ctx->config.block_y;
1313 info->block_z = ctx->config.block_z;
1314 info->texel_count = bsd.texel_count;
1315
1316 // Check for error blocks first
1317 info->is_error_block = scb.block_type == SYM_BTYPE_ERROR;
1318 if (info->is_error_block)
1319 {
1320 return ASTCENC_SUCCESS;
1321 }
1322
1323 // Check for constant color blocks second
1324 info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 ||
1325 scb.block_type == SYM_BTYPE_CONST_U16;
1326 if (info->is_constant_block)
1327 {
1328 return ASTCENC_SUCCESS;
1329 }
1330
1331 // Otherwise handle a full block ; known to be valid after conditions above have been checked
1332 int partition_count = scb.partition_count;
1333 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
1334
1335 const block_mode& bm = bsd.get_block_mode(scb.block_mode);
1336 const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
1337
1338 info->weight_x = di.weight_x;
1339 info->weight_y = di.weight_y;
1340 info->weight_z = di.weight_z;
1341
1342 info->is_dual_plane_block = bm.is_dual_plane != 0;
1343
1344 info->partition_count = scb.partition_count;
1345 info->partition_index = scb.partition_index;
1346 info->dual_plane_component = scb.plane2_component;
1347
1348 info->color_level_count = get_quant_level(scb.get_color_quant_mode());
1349 info->weight_level_count = get_quant_level(bm.get_weight_quant_mode());
1350
1351 // Unpack color endpoints for each active partition
1352 for (unsigned int i = 0; i < scb.partition_count; i++)
1353 {
1354 bool rgb_hdr;
1355 bool a_hdr;
1356 vint4 endpnt[2];
1357
1358 unpack_color_endpoints(ctx->config.profile,
1359 scb.color_formats[i],
1360 scb.get_color_quant_mode(),
1361 scb.color_values[i],
1362 rgb_hdr, a_hdr,
1363 endpnt[0], endpnt[1]);
1364
1365 // Store the color endpoint mode info
1366 info->color_endpoint_modes[i] = scb.color_formats[i];
1367 info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr;
1368
1369 // Store the unpacked and decoded color endpoint
1370 vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr);
1371 for (int j = 0; j < 2; j++)
1372 {
1373 vint4 color_lns = lns_to_sf16(endpnt[j]);
1374 vint4 color_unorm = unorm16_to_sf16(endpnt[j]);
1375 vint4 datai = select(color_unorm, color_lns, hdr_mask);
1376 store(float16_to_float(datai), info->color_endpoints[i][j]);
1377 }
1378 }
1379
1380 // Unpack weights for each texel
1381 int weight_plane1[BLOCK_MAX_TEXELS];
1382 int weight_plane2[BLOCK_MAX_TEXELS];
1383
1384 unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
1385 for (unsigned int i = 0; i < bsd.texel_count; i++)
1386 {
1387 info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1388 if (info->is_dual_plane_block)
1389 {
1390 info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1391 }
1392 }
1393
1394 // Unpack partition assignments for each texel
1395 for (unsigned int i = 0; i < bsd.texel_count; i++)
1396 {
1397 info->partition_assignment[i] = pi.partition_of_texel[i];
1398 }
1399
1400 return ASTCENC_SUCCESS;
1401 #endif
1402 }
1403
1404 /* See header for documentation. */
astcenc_get_error_string(astcenc_error status)1405 const char* astcenc_get_error_string(
1406 astcenc_error status
1407 ) {
1408 // Values in this enum are from an external user, so not guaranteed to be
1409 // bounded to the enum values
1410 switch (static_cast<int>(status))
1411 {
1412 case ASTCENC_SUCCESS:
1413 return "ASTCENC_SUCCESS";
1414 case ASTCENC_ERR_OUT_OF_MEM:
1415 return "ASTCENC_ERR_OUT_OF_MEM";
1416 case ASTCENC_ERR_BAD_CPU_FLOAT:
1417 return "ASTCENC_ERR_BAD_CPU_FLOAT";
1418 case ASTCENC_ERR_BAD_CPU_ISA:
1419 return "ASTCENC_ERR_BAD_CPU_ISA";
1420 case ASTCENC_ERR_BAD_PARAM:
1421 return "ASTCENC_ERR_BAD_PARAM";
1422 case ASTCENC_ERR_BAD_BLOCK_SIZE:
1423 return "ASTCENC_ERR_BAD_BLOCK_SIZE";
1424 case ASTCENC_ERR_BAD_PROFILE:
1425 return "ASTCENC_ERR_BAD_PROFILE";
1426 case ASTCENC_ERR_BAD_QUALITY:
1427 return "ASTCENC_ERR_BAD_QUALITY";
1428 case ASTCENC_ERR_BAD_FLAGS:
1429 return "ASTCENC_ERR_BAD_FLAGS";
1430 case ASTCENC_ERR_BAD_SWIZZLE:
1431 return "ASTCENC_ERR_BAD_SWIZZLE";
1432 case ASTCENC_ERR_BAD_CONTEXT:
1433 return "ASTCENC_ERR_BAD_CONTEXT";
1434 case ASTCENC_ERR_NOT_IMPLEMENTED:
1435 return "ASTCENC_ERR_NOT_IMPLEMENTED";
1436 #if defined(ASTCENC_DIAGNOSTICS)
1437 case ASTCENC_ERR_DTRACE_FAILURE:
1438 return "ASTCENC_ERR_DTRACE_FAILURE";
1439 #endif
1440 default:
1441 return nullptr;
1442 }
1443 }
1444