1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 /**
19  * @brief Functions for the library entrypoint.
20  */
21 
22 #include <array>
23 #include <cstring>
24 #include <new>
25 
26 #include "astcenc.h"
27 #include "astcenc_internal_entry.h"
28 #include "astcenc_diagnostic_trace.h"
29 
/**
 * @brief One row of a quality preset table.
 *
 * Each row holds the tuning values used for a single quality level. The config initializer
 * either copies a row directly, or interpolates between two neighboring rows. See the
 * @c astcenc_config structure for detailed documentation of the parameters these values seed.
 *
 * Note that the mse_overshoot entries are scaling factors relative to the base MSE needed to
 * hit db_limit. A 20% overshoot is harder to hit for a higher base db_limit, so we may actually
 * use lower ratios for the more thorough search presets because the underlying db_limit is so
 * much higher.
 */
struct astcenc_preset_config
{
	/** @brief The quality level that this row applies to. */
	float quality;

	/** @brief Seeds @c tune_partition_count_limit. */
	unsigned int tune_partition_count_limit;

	/** @brief Seeds @c tune_2partition_index_limit. */
	unsigned int tune_2partition_index_limit;

	/** @brief Seeds @c tune_3partition_index_limit. */
	unsigned int tune_3partition_index_limit;

	/** @brief Seeds @c tune_4partition_index_limit. */
	unsigned int tune_4partition_index_limit;

	/** @brief Seeds @c tune_block_mode_limit. */
	unsigned int tune_block_mode_limit;

	/** @brief Seeds @c tune_refinement_limit. */
	unsigned int tune_refinement_limit;

	/** @brief Seeds @c tune_candidate_limit. */
	unsigned int tune_candidate_limit;

	/** @brief Seeds @c tune_2partitioning_candidate_limit. */
	unsigned int tune_2partitioning_candidate_limit;

	/** @brief Seeds @c tune_3partitioning_candidate_limit. */
	unsigned int tune_3partitioning_candidate_limit;

	/** @brief Seeds @c tune_4partitioning_candidate_limit. */
	unsigned int tune_4partitioning_candidate_limit;

	/** @brief Base of the first @c tune_db_limit candidate, reduced by 35 * log10(texels). */
	float tune_db_limit_a_base;

	/** @brief Base of the second @c tune_db_limit candidate, reduced by 19 * log10(texels). */
	float tune_db_limit_b_base;

	/** @brief Seeds @c tune_mode0_mse_overshoot. */
	float tune_mode0_mse_overshoot;

	/** @brief Seeds @c tune_refinement_mse_overshoot. */
	float tune_refinement_mse_overshoot;

	/** @brief Seeds @c tune_2_partition_early_out_limit_factor. */
	float tune_2_partition_early_out_limit_factor;

	/** @brief Seeds @c tune_3_partition_early_out_limit_factor. */
	float tune_3_partition_early_out_limit_factor;

	/** @brief Seeds @c tune_2_plane_early_out_limit_correlation. */
	float tune_2_plane_early_out_limit_correlation;
};
60 
61 
62 /**
63  * @brief The static quality presets that are built-in for high bandwidth
64  * presets (x < 25 texels per block).
65  */
66 static const std::array<astcenc_preset_config, 6> preset_configs_high {{
67 	{
68 		ASTCENC_PRE_FASTEST,
69 		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
70 	}, {
71 		ASTCENC_PRE_FAST,
72 		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.90f
73 	}, {
74 		ASTCENC_PRE_MEDIUM,
75 		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 2.5f, 1.1f, 1.05f, 0.95f
76 	}, {
77 		ASTCENC_PRE_THOROUGH,
78 		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.35f, 1.15f, 0.97f
79 	}, {
80 		ASTCENC_PRE_VERYTHOROUGH,
81 		4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
82 	}, {
83 		ASTCENC_PRE_EXHAUSTIVE,
84 		4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
85 	}
86 }};
87 
88 /**
89  * @brief The static quality presets that are built-in for medium bandwidth
90  * presets (25 <= x < 64 texels per block).
91  */
92 static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
93 	{
94 		ASTCENC_PRE_FASTEST,
95 		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f
96 	}, {
97 		ASTCENC_PRE_FAST,
98 		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
99 	}, {
100 		ASTCENC_PRE_MEDIUM,
101 		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 3.0f, 1.1f, 1.05f, 0.90f
102 	}, {
103 		ASTCENC_PRE_THOROUGH,
104 		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.4f, 1.2f, 0.95f
105 	}, {
106 		ASTCENC_PRE_VERYTHOROUGH,
107 		4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
108 	}, {
109 		ASTCENC_PRE_EXHAUSTIVE,
110 		4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
111 	}
112 }};
113 
114 /**
115  * @brief The static quality presets that are built-in for low bandwidth
116  * presets (64 <= x texels per block).
117  */
118 static const std::array<astcenc_preset_config, 6> preset_configs_low {{
119 	{
120 		ASTCENC_PRE_FASTEST,
121 		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.80f
122 	}, {
123 		ASTCENC_PRE_FAST,
124 		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.85f
125 	}, {
126 		ASTCENC_PRE_MEDIUM,
127 		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 3.5f, 1.1f, 1.05f, 0.90f
128 	}, {
129 		ASTCENC_PRE_THOROUGH,
130 		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 10.0f, 1.3f, 1.2f, 0.97f
131 	}, {
132 		ASTCENC_PRE_VERYTHOROUGH,
133 		4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 10.0f, 1.6f, 1.4f, 0.98f
134 	}, {
135 		ASTCENC_PRE_EXHAUSTIVE,
136 		4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 10.0f, 2.0f, 2.0f, 0.99f
137 	}
138 }};
139 
140 /**
141  * @brief Validate CPU floating point meets assumptions made in the codec.
142  *
143  * The codec is written with the assumption that a float threaded through the @c if32 union will be
144  * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the
145  * case in an IEEE-754 compliant system, however not every system or compilation mode is actually
146  * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled.
147  *
148  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
149  */
validate_cpu_float()150 static astcenc_error validate_cpu_float()
151 {
152 	if32 p;
153 	volatile float xprec_testval = 2.51f;
154 	p.f = xprec_testval + 12582912.0f;
155 	float q = p.f - 12582912.0f;
156 
157 	if (q != 3.0f)
158 	{
159 		return ASTCENC_ERR_BAD_CPU_FLOAT;
160 	}
161 
162 	return ASTCENC_SUCCESS;
163 }
164 
165 /**
166  * @brief Validate CPU ISA support meets the requirements of this build of the library.
167  *
168  * Each library build is statically compiled for a particular set of CPU ISA features, such as the
169  * SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU
170  * actually supports everything this build needs.
171  *
172  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
173  */
validate_cpu_isa()174 static astcenc_error validate_cpu_isa()
175 {
176 	#if ASTCENC_SSE >= 41
177 		if (!cpu_supports_sse41())
178 		{
179 			return ASTCENC_ERR_BAD_CPU_ISA;
180 		}
181 	#endif
182 
183 	#if ASTCENC_POPCNT >= 1
184 		if (!cpu_supports_popcnt())
185 		{
186 			return ASTCENC_ERR_BAD_CPU_ISA;
187 		}
188 	#endif
189 
190 	#if ASTCENC_F16C >= 1
191 		if (!cpu_supports_f16c())
192 		{
193 			return ASTCENC_ERR_BAD_CPU_ISA;
194 		}
195 	#endif
196 
197 	#if ASTCENC_AVX >= 2
198 		if (!cpu_supports_avx2())
199 		{
200 			return ASTCENC_ERR_BAD_CPU_ISA;
201 		}
202 	#endif
203 
204 	return ASTCENC_SUCCESS;
205 }
206 
207 /**
208  * @brief Validate config profile.
209  *
210  * @param profile   The profile to check.
211  *
212  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
213  */
validate_profile(astcenc_profile profile)214 static astcenc_error validate_profile(
215 	astcenc_profile profile
216 ) {
217 	// Values in this enum are from an external user, so not guaranteed to be
218 	// bounded to the enum values
219 	switch (static_cast<int>(profile))
220 	{
221 	case ASTCENC_PRF_LDR_SRGB:
222 	case ASTCENC_PRF_LDR:
223 	case ASTCENC_PRF_HDR_RGB_LDR_A:
224 	case ASTCENC_PRF_HDR:
225 		return ASTCENC_SUCCESS;
226 	default:
227 		return ASTCENC_ERR_BAD_PROFILE;
228 	}
229 }
230 
231 /**
232  * @brief Validate block size.
233  *
234  * @param block_x   The block x dimensions.
235  * @param block_y   The block y dimensions.
236  * @param block_z   The block z dimensions.
237  *
238  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
239  */
validate_block_size(unsigned int block_x,unsigned int block_y,unsigned int block_z)240 static astcenc_error validate_block_size(
241 	unsigned int block_x,
242 	unsigned int block_y,
243 	unsigned int block_z
244 ) {
245 	// Test if this is a legal block size at all
246 	bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
247 	                 ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)));
248 	if (!is_legal)
249 	{
250 		return ASTCENC_ERR_BAD_BLOCK_SIZE;
251 	}
252 
253 	// Test if this build has sufficient capacity for this block size
254 	bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS;
255 	if (!have_capacity)
256 	{
257 		return ASTCENC_ERR_NOT_IMPLEMENTED;
258 	}
259 
260 	return ASTCENC_SUCCESS;
261 }
262 
263 /**
264  * @brief Validate flags.
265  *
266  * @param flags   The flags to check.
267  *
268  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
269  */
validate_flags(unsigned int flags)270 static astcenc_error validate_flags(
271 	unsigned int flags
272 ) {
273 	// Flags field must not contain any unknown flag bits
274 	unsigned int exMask = ~ASTCENC_ALL_FLAGS;
275 	if (popcount(flags & exMask) != 0)
276 	{
277 		return ASTCENC_ERR_BAD_FLAGS;
278 	}
279 
280 	// Flags field must only contain at most a single map type
281 	exMask = ASTCENC_FLG_MAP_MASK
282 	       | ASTCENC_FLG_MAP_NORMAL
283 	       | ASTCENC_FLG_MAP_RGBM;
284 	if (popcount(flags & exMask) > 1)
285 	{
286 		return ASTCENC_ERR_BAD_FLAGS;
287 	}
288 
289 	return ASTCENC_SUCCESS;
290 }
291 
292 #if !defined(ASTCENC_DECOMPRESS_ONLY)
293 
294 /**
295  * @brief Validate single channel compression swizzle.
296  *
297  * @param swizzle   The swizzle to check.
298  *
299  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
300  */
validate_compression_swz(astcenc_swz swizzle)301 static astcenc_error validate_compression_swz(
302 	astcenc_swz swizzle
303 ) {
304 	// Not all enum values are handled; SWZ_Z is invalid for compression
305 	switch (static_cast<int>(swizzle))
306 	{
307 	case ASTCENC_SWZ_R:
308 	case ASTCENC_SWZ_G:
309 	case ASTCENC_SWZ_B:
310 	case ASTCENC_SWZ_A:
311 	case ASTCENC_SWZ_0:
312 	case ASTCENC_SWZ_1:
313 		return ASTCENC_SUCCESS;
314 	default:
315 		return ASTCENC_ERR_BAD_SWIZZLE;
316 	}
317 }
318 
319 /**
320  * @brief Validate overall compression swizzle.
321  *
322  * @param swizzle   The swizzle to check.
323  *
324  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
325  */
validate_compression_swizzle(const astcenc_swizzle & swizzle)326 static astcenc_error validate_compression_swizzle(
327 	const astcenc_swizzle& swizzle
328 ) {
329 	if (validate_compression_swz(swizzle.r) ||
330 	    validate_compression_swz(swizzle.g) ||
331 	    validate_compression_swz(swizzle.b) ||
332 	    validate_compression_swz(swizzle.a))
333 	{
334 		return ASTCENC_ERR_BAD_SWIZZLE;
335 	}
336 
337 	return ASTCENC_SUCCESS;
338 }
339 #endif
340 
341 /**
342  * @brief Validate single channel decompression swizzle.
343  *
344  * @param swizzle   The swizzle to check.
345  *
346  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
347  */
validate_decompression_swz(astcenc_swz swizzle)348 static astcenc_error validate_decompression_swz(
349 	astcenc_swz swizzle
350 ) {
351 	// Values in this enum are from an external user, so not guaranteed to be
352 	// bounded to the enum values
353 	switch (static_cast<int>(swizzle))
354 	{
355 	case ASTCENC_SWZ_R:
356 	case ASTCENC_SWZ_G:
357 	case ASTCENC_SWZ_B:
358 	case ASTCENC_SWZ_A:
359 	case ASTCENC_SWZ_0:
360 	case ASTCENC_SWZ_1:
361 	case ASTCENC_SWZ_Z:
362 		return ASTCENC_SUCCESS;
363 	default:
364 		return ASTCENC_ERR_BAD_SWIZZLE;
365 	}
366 }
367 
368 /**
369  * @brief Validate overall decompression swizzle.
370  *
371  * @param swizzle   The swizzle to check.
372  *
373  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
374  */
validate_decompression_swizzle(const astcenc_swizzle & swizzle)375 static astcenc_error validate_decompression_swizzle(
376 	const astcenc_swizzle& swizzle
377 ) {
378 	if (validate_decompression_swz(swizzle.r) ||
379 	    validate_decompression_swz(swizzle.g) ||
380 	    validate_decompression_swz(swizzle.b) ||
381 	    validate_decompression_swz(swizzle.a))
382 	{
383 		return ASTCENC_ERR_BAD_SWIZZLE;
384 	}
385 
386 	return ASTCENC_SUCCESS;
387 }
388 
389 /**
390  * Validate that an incoming configuration is in-spec.
391  *
392  * This function can respond in two ways:
393  *
394  *   * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown
395  *     for out-of-range inputs in this case.
396  *   * Numerical inputs and logic inputs are are logically invalid and which make no sense
397  *     algorithmically will return an error.
398  *
399  * @param[in,out] config   The input compressor configuration.
400  *
401  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
402  */
validate_config(astcenc_config & config)403 static astcenc_error validate_config(
404 	astcenc_config &config
405 ) {
406 	astcenc_error status;
407 
408 	status = validate_profile(config.profile);
409 	if (status != ASTCENC_SUCCESS)
410 	{
411 		return status;
412 	}
413 
414 	status = validate_flags(config.flags);
415 	if (status != ASTCENC_SUCCESS)
416 	{
417 		return status;
418 	}
419 
420 	status = validate_block_size(config.block_x, config.block_y, config.block_z);
421 	if (status != ASTCENC_SUCCESS)
422 	{
423 		return status;
424 	}
425 
426 #if defined(ASTCENC_DECOMPRESS_ONLY)
427 	// Decompress-only builds only support decompress-only contexts
428 	if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
429 	{
430 		return ASTCENC_ERR_BAD_PARAM;
431 	}
432 #endif
433 
434 	config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
435 
436 	config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
437 	config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
438 	config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
439 	config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
440 	config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
441 	config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
442 	config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
443 	config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
444 	config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
445 	config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
446 	config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
447 	config.tune_mode0_mse_overshoot = astc::max(config.tune_mode0_mse_overshoot, 1.0f);
448 	config.tune_refinement_mse_overshoot = astc::max(config.tune_refinement_mse_overshoot, 1.0f);
449 	config.tune_2_partition_early_out_limit_factor = astc::max(config.tune_2_partition_early_out_limit_factor, 0.0f);
450 	config.tune_3_partition_early_out_limit_factor = astc::max(config.tune_3_partition_early_out_limit_factor, 0.0f);
451 	config.tune_2_plane_early_out_limit_correlation = astc::max(config.tune_2_plane_early_out_limit_correlation, 0.0f);
452 
453 	// Specifying a zero weight color component is not allowed; force to small value
454 	float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
455 	                             astc::max(config.cw_b_weight, config.cw_a_weight));
456 	if (max_weight > 0.0f)
457 	{
458 		max_weight /= 1000.0f;
459 		config.cw_r_weight = astc::max(config.cw_r_weight, max_weight);
460 		config.cw_g_weight = astc::max(config.cw_g_weight, max_weight);
461 		config.cw_b_weight = astc::max(config.cw_b_weight, max_weight);
462 		config.cw_a_weight = astc::max(config.cw_a_weight, max_weight);
463 	}
464 	// If all color components error weights are zero then return an error
465 	else
466 	{
467 		return ASTCENC_ERR_BAD_PARAM;
468 	}
469 
470 	return ASTCENC_SUCCESS;
471 }
472 
473 /* See header for documentation. */
astcenc_config_init(astcenc_profile profile,unsigned int block_x,unsigned int block_y,unsigned int block_z,float quality,unsigned int flags,astcenc_config * configp)474 astcenc_error astcenc_config_init(
475 	astcenc_profile profile,
476 	unsigned int block_x,
477 	unsigned int block_y,
478 	unsigned int block_z,
479 	float quality,
480 	unsigned int flags,
481 	astcenc_config* configp
482 ) {
483 	astcenc_error status;
484 
485 	// Check basic library compatibility options here so they are checked early. Note, these checks
486 	// are repeated in context_alloc for cases where callers use a manually defined config struct
487 	status = validate_cpu_isa();
488 	if (status != ASTCENC_SUCCESS)
489 	{
490 		return status;
491 	}
492 
493 	status = validate_cpu_float();
494 	if (status != ASTCENC_SUCCESS)
495 	{
496 		return status;
497 	}
498 
499 	// Zero init all config fields; although most of will be over written
500 	astcenc_config& config = *configp;
501 	std::memset(&config, 0, sizeof(config));
502 
503 	// Process the block size
504 	block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1
505 	status = validate_block_size(block_x, block_y, block_z);
506 	if (status != ASTCENC_SUCCESS)
507 	{
508 		return status;
509 	}
510 
511 	config.block_x = block_x;
512 	config.block_y = block_y;
513 	config.block_z = block_z;
514 
515 	float texels = static_cast<float>(block_x * block_y * block_z);
516 	float ltexels = logf(texels) / logf(10.0f);
517 
518 	// Process the performance quality level or preset; note that this must be done before we
519 	// process any additional settings, such as color profile and flags, which may replace some of
520 	// these settings with more use case tuned values
521 	if (quality < ASTCENC_PRE_FASTEST ||
522 	    quality > ASTCENC_PRE_EXHAUSTIVE)
523 	{
524 		return ASTCENC_ERR_BAD_QUALITY;
525 	}
526 
527 	static const std::array<astcenc_preset_config, 6>* preset_configs;
528 	int texels_int = block_x * block_y * block_z;
529 	if (texels_int < 25)
530 	{
531 		preset_configs = &preset_configs_high;
532 	}
533 	else if (texels_int < 64)
534 	{
535 		preset_configs = &preset_configs_mid;
536 	}
537 	else
538 	{
539 		preset_configs = &preset_configs_low;
540 	}
541 
542 	// Determine which preset to use, or which pair to interpolate
543 	size_t start;
544 	size_t end;
545 	for (end = 0; end < preset_configs->size(); end++)
546 	{
547 		if ((*preset_configs)[end].quality >= quality)
548 		{
549 			break;
550 		}
551 	}
552 
553 	start = end == 0 ? 0 : end - 1;
554 
555 	// Start and end node are the same - so just transfer the values.
556 	if (start == end)
557 	{
558 		config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
559 		config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
560 		config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
561 		config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
562 		config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
563 		config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
564 		config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES);
565 		config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
566 		config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
567 		config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
568 		config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
569 		                                 (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
570 
571 		config.tune_mode0_mse_overshoot = (*preset_configs)[start].tune_mode0_mse_overshoot;
572 		config.tune_refinement_mse_overshoot = (*preset_configs)[start].tune_refinement_mse_overshoot;
573 
574 		config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor;
575 		config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor;
576 		config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation;
577 	}
578 	// Start and end node are not the same - so interpolate between them
579 	else
580 	{
581 		auto& node_a = (*preset_configs)[start];
582 		auto& node_b = (*preset_configs)[end];
583 
584 		float wt_range = node_b.quality - node_a.quality;
585 		assert(wt_range > 0);
586 
587 		// Compute interpolation factors
588 		float wt_node_a = (node_b.quality - quality) / wt_range;
589 		float wt_node_b = (quality - node_a.quality) / wt_range;
590 
591 		#define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b))
592 		#define LERPI(param) astc::flt2int_rtn(\
593 		                         (static_cast<float>(node_a.param) * wt_node_a) + \
594 		                         (static_cast<float>(node_b.param) * wt_node_b))
595 		#define LERPUI(param) static_cast<unsigned int>(LERPI(param))
596 
597 		config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
598 		config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
599 		config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
600 		config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
601 		config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
602 		config.tune_refinement_limit = LERPI(tune_refinement_limit);
603 		config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit),
604 		                                        TUNE_MAX_TRIAL_CANDIDATES);
605 		config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit),
606 		                                                      BLOCK_MAX_PARTITIONINGS);
607 		config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit),
608 		                                                      BLOCK_MAX_PARTITIONINGS);
609 		config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit),
610 		                                                      BLOCK_MAX_PARTITIONINGS);
611 		config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
612 		                                 LERP(tune_db_limit_b_base) - 19 * ltexels);
613 
614 		config.tune_mode0_mse_overshoot = LERP(tune_mode0_mse_overshoot);
615 		config.tune_refinement_mse_overshoot = LERP(tune_refinement_mse_overshoot);
616 
617 		config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor);
618 		config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor);
619 		config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation);
620 		#undef LERP
621 		#undef LERPI
622 		#undef LERPUI
623 	}
624 
625 	// Set heuristics to the defaults for each color profile
626 	config.cw_r_weight = 1.0f;
627 	config.cw_g_weight = 1.0f;
628 	config.cw_b_weight = 1.0f;
629 	config.cw_a_weight = 1.0f;
630 
631 	config.a_scale_radius = 0;
632 
633 	config.rgbm_m_scale = 0.0f;
634 
635 	config.profile = profile;
636 
637 	// Values in this enum are from an external user, so not guaranteed to be
638 	// bounded to the enum values
639 	switch (static_cast<int>(profile))
640 	{
641 	case ASTCENC_PRF_LDR:
642 	case ASTCENC_PRF_LDR_SRGB:
643 		break;
644 	case ASTCENC_PRF_HDR_RGB_LDR_A:
645 	case ASTCENC_PRF_HDR:
646 		config.tune_db_limit = 999.0f;
647 		break;
648 	default:
649 		return ASTCENC_ERR_BAD_PROFILE;
650 	}
651 
652 	// Flags field must not contain any unknown flag bits
653 	status = validate_flags(flags);
654 	if (status != ASTCENC_SUCCESS)
655 	{
656 		return status;
657 	}
658 
659 	if (flags & ASTCENC_FLG_MAP_NORMAL)
660 	{
661 		// Normal map encoding uses L+A blocks, so allow one more partitioning
662 		// than normal. We need need fewer bits for endpoints, so more likely
663 		// to be able to use more partitions than an RGB/RGBA block
664 		config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u);
665 
666 		config.cw_g_weight = 0.0f;
667 		config.cw_b_weight = 0.0f;
668 		config.tune_2_partition_early_out_limit_factor *= 1.5f;
669 		config.tune_3_partition_early_out_limit_factor *= 1.5f;
670 		config.tune_2_plane_early_out_limit_correlation = 0.99f;
671 
672 		// Normals are prone to blocking artifacts on smooth curves
673 		// so force compressor to try harder here ...
674 		config.tune_db_limit *= 1.03f;
675 	}
676 	else if (flags & ASTCENC_FLG_MAP_MASK)
677 	{
678 		// Masks are prone to blocking artifacts on mask edges
679 		// so force compressor to try harder here ...
680 		config.tune_db_limit *= 1.03f;
681 	}
682 	else if (flags & ASTCENC_FLG_MAP_RGBM)
683 	{
684 		config.rgbm_m_scale = 5.0f;
685 		config.cw_a_weight = 2.0f * config.rgbm_m_scale;
686 	}
687 	else // (This is color data)
688 	{
689 		// This is a very basic perceptual metric for RGB color data, which weights error
690 		// significance by the perceptual luminance contribution of each color channel. For
691 		// luminance the usual weights to compute luminance from a linear RGB value are as
692 		// follows:
693 		//
694 		//     l = r * 0.3 + g * 0.59 + b * 0.11
695 		//
696 		// ... but we scale these up to keep a better balance between color and alpha. Note
697 		// that if the content is using alpha we'd recommend using the -a option to weight
698 		// the color contribution by the alpha transparency.
699 		if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
700 		{
701 			config.cw_r_weight = 0.30f * 2.25f;
702 			config.cw_g_weight = 0.59f * 2.25f;
703 			config.cw_b_weight = 0.11f * 2.25f;
704 		}
705 	}
706 	config.flags = flags;
707 
708 	return ASTCENC_SUCCESS;
709 }
710 
711 /* See header for documentation. */
astcenc_context_alloc(const astcenc_config * configp,unsigned int thread_count,astcenc_context ** context)712 astcenc_error astcenc_context_alloc(
713 	const astcenc_config* configp,
714 	unsigned int thread_count,
715 	astcenc_context** context
716 ) {
717 	astcenc_error status;
718 	const astcenc_config& config = *configp;
719 
720 	status = validate_cpu_isa();
721 	if (status != ASTCENC_SUCCESS)
722 	{
723 		return status;
724 	}
725 
726 	status = validate_cpu_float();
727 	if (status != ASTCENC_SUCCESS)
728 	{
729 		return status;
730 	}
731 
732 	if (thread_count == 0)
733 	{
734 		return ASTCENC_ERR_BAD_PARAM;
735 	}
736 
737 #if defined(ASTCENC_DIAGNOSTICS)
738 	// Force single threaded compressor use in diagnostic mode.
739 	if (thread_count != 1)
740 	{
741 		return ASTCENC_ERR_BAD_PARAM;
742 	}
743 #endif
744 
745 	astcenc_context* ctxo = new astcenc_context;
746 	astcenc_contexti* ctx = &ctxo->context;
747 	ctx->thread_count = thread_count;
748 	ctx->config = config;
749 	ctx->working_buffers = nullptr;
750 
751 	// These are allocated per-compress, as they depend on image size
752 	ctx->input_alpha_averages = nullptr;
753 
754 	// Copy the config first and validate the copy (we may modify it)
755 	status = validate_config(ctx->config);
756 	if (status != ASTCENC_SUCCESS)
757 	{
758 		delete ctxo;
759 		return status;
760 	}
761 
762 	ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
763 	bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
764 	init_block_size_descriptor(config.block_x, config.block_y, config.block_z,
765 	                           can_omit_modes,
766 	                           config.tune_partition_count_limit,
767 	                           static_cast<float>(config.tune_block_mode_limit) / 100.0f,
768 	                           *ctx->bsd);
769 
770 #if !defined(ASTCENC_DECOMPRESS_ONLY)
771 	// Do setup only needed by compression
772 	if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY))
773 	{
774 		// Turn a dB limit into a per-texel error for faster use later
775 		if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
776 		{
777 			ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f;
778 		}
779 		else
780 		{
781 			ctx->config.tune_db_limit = 0.0f;
782 		}
783 
784 		size_t worksize = sizeof(compression_working_buffers) * thread_count;
785 		ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
786 		static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0,
787 		              "compression_working_buffers size must be multiple of vector alignment");
788 		if (!ctx->working_buffers)
789 		{
790 			aligned_free<block_size_descriptor>(ctx->bsd);
791 			delete ctxo;
792 			*context = nullptr;
793 			return ASTCENC_ERR_OUT_OF_MEM;
794 		}
795 	}
796 #endif
797 
798 #if defined(ASTCENC_DIAGNOSTICS)
799 	ctx->trace_log = new TraceLog(ctx->config.trace_file_path);
800 	if (!ctx->trace_log->m_file)
801 	{
802 		return ASTCENC_ERR_DTRACE_FAILURE;
803 	}
804 
805 	trace_add_data("block_x", config.block_x);
806 	trace_add_data("block_y", config.block_y);
807 	trace_add_data("block_z", config.block_z);
808 #endif
809 
810 	*context = ctxo;
811 
812 #if !defined(ASTCENC_DECOMPRESS_ONLY)
813 	prepare_angular_tables();
814 #endif
815 
816 	return ASTCENC_SUCCESS;
817 }
818 
819 /* See header dor documentation. */
astcenc_context_free(astcenc_context * ctxo)820 void astcenc_context_free(
821 	astcenc_context* ctxo
822 ) {
823 	if (ctxo)
824 	{
825 		astcenc_contexti* ctx = &ctxo->context;
826 		aligned_free<compression_working_buffers>(ctx->working_buffers);
827 		aligned_free<block_size_descriptor>(ctx->bsd);
828 #if defined(ASTCENC_DIAGNOSTICS)
829 		delete ctx->trace_log;
830 #endif
831 		delete ctxo;
832 	}
833 }
834 
835 #if !defined(ASTCENC_DECOMPRESS_ONLY)
836 
837 /**
838  * @brief Compress an image, after any preflight has completed.
839  *
840  * @param[out] ctxo           The compressor context.
841  * @param      thread_index   The thread index.
 * @param      image          The input image.
843  * @param      swizzle        The input swizzle.
844  * @param[out] buffer         The output array for the compressed data.
845  */
compress_image(astcenc_context & ctxo,unsigned int thread_index,const astcenc_image & image,const astcenc_swizzle & swizzle,uint8_t * buffer)846 static void compress_image(
847 	astcenc_context& ctxo,
848 	unsigned int thread_index,
849 	const astcenc_image& image,
850 	const astcenc_swizzle& swizzle,
851 	uint8_t* buffer
852 ) {
853 	astcenc_contexti& ctx = ctxo.context;
854 	const block_size_descriptor& bsd = *ctx.bsd;
855 	astcenc_profile decode_mode = ctx.config.profile;
856 
857 	image_block blk;
858 
859 	int block_x = bsd.xdim;
860 	int block_y = bsd.ydim;
861 	int block_z = bsd.zdim;
862 	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
863 
864 	int dim_x = image.dim_x;
865 	int dim_y = image.dim_y;
866 	int dim_z = image.dim_z;
867 
868 	int xblocks = (dim_x + block_x - 1) / block_x;
869 	int yblocks = (dim_y + block_y - 1) / block_y;
870 	int zblocks = (dim_z + block_z - 1) / block_z;
871 	int block_count = zblocks * yblocks * xblocks;
872 
873 	int row_blocks = xblocks;
874 	int plane_blocks = xblocks * yblocks;
875 
876 	// Populate the block channel weights
877 	blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
878 	                             ctx.config.cw_g_weight,
879 	                             ctx.config.cw_b_weight,
880 	                             ctx.config.cw_a_weight);
881 
882 	// Use preallocated scratch buffer
883 	auto& temp_buffers = ctx.working_buffers[thread_index];
884 
885 	// Only the first thread actually runs the initializer
886 	ctxo.manage_compress.init(block_count);
887 
888 	// Determine if we can use an optimized load function
889 	bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
890 	                 (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A);
891 
892 	bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) ||
893 	                 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);
894 
895 	bool use_fast_load = !needs_swz && !needs_hdr &&
896 	                     block_z == 1 && image.data_type == ASTCENC_TYPE_U8;
897 
898 	auto load_func = load_image_block;
899 	if (use_fast_load)
900 	{
901 		load_func = load_image_block_fast_ldr;
902 	}
903 
904 	// All threads run this processing loop until there is no work remaining
905 	while (true)
906 	{
907 		unsigned int count;
908 		unsigned int base = ctxo.manage_compress.get_task_assignment(16, count);
909 		if (!count)
910 		{
911 			break;
912 		}
913 
914 		for (unsigned int i = base; i < base + count; i++)
915 		{
916 			// Decode i into x, y, z block indices
917 			int z = i / plane_blocks;
918 			unsigned int rem = i - (z * plane_blocks);
919 			int y = rem / row_blocks;
920 			int x = rem - (y * row_blocks);
921 
922 			// Test if we can apply some basic alpha-scale RDO
923 			bool use_full_block = true;
924 			if (ctx.config.a_scale_radius != 0 && block_z == 1)
925 			{
926 				int start_x = x * block_x;
927 				int end_x = astc::min(dim_x, start_x + block_x);
928 
929 				int start_y = y * block_y;
930 				int end_y = astc::min(dim_y, start_y + block_y);
931 
932 				// SATs accumulate error, so don't test exactly zero. Test for
933 				// less than 1 alpha in the expanded block footprint that
934 				// includes the alpha radius.
935 				int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1);
936 
937 				int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1);
938 
939 				float footprint = static_cast<float>(x_footprint * y_footprint);
940 				float threshold = 0.9f / (255.0f * footprint);
941 
942 				// Do we have any alpha values?
943 				use_full_block = false;
944 				for (int ay = start_y; ay < end_y; ay++)
945 				{
946 					for (int ax = start_x; ax < end_x; ax++)
947 					{
948 						float a_avg = ctx.input_alpha_averages[ay * dim_x + ax];
949 						if (a_avg > threshold)
950 						{
951 							use_full_block = true;
952 							ax = end_x;
953 							ay = end_y;
954 						}
955 					}
956 				}
957 			}
958 
959 			// Fetch the full block for compression
960 			if (use_full_block)
961 			{
962 				load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);
963 
964 				// Scale RGB error contribution by the maximum alpha in the block
965 				// This encourages preserving alpha accuracy in regions with high
966 				// transparency, and can buy up to 0.5 dB PSNR.
967 				if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
968 				{
969 					float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
970 					blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
971 					                             ctx.config.cw_g_weight * alpha_scale,
972 					                             ctx.config.cw_b_weight * alpha_scale,
973 					                             ctx.config.cw_a_weight);
974 				}
975 			}
976 			// Apply alpha scale RDO - substitute constant color block
977 			else
978 			{
979 				blk.origin_texel = vfloat4::zero();
980 				blk.data_min = vfloat4::zero();
981 				blk.data_mean = vfloat4::zero();
982 				blk.data_max = vfloat4::zero();
983 				blk.grayscale = true;
984 			}
985 
986 			int offset = ((z * yblocks + y) * xblocks + x) * 16;
987 			uint8_t *bp = buffer + offset;
988 			physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp);
989 			compress_block(ctx, blk, *pcb, temp_buffers);
990 		}
991 
992 		ctxo.manage_compress.complete_task_assignment(count);
993 	}
994 }
995 
996 /**
997  * @brief Compute regional averages in an image.
998  *
999  * This function can be called by multiple threads, but only after a single
1000  * thread calls the setup function @c init_compute_averages().
1001  *
1002  * Results are written back into @c img->input_alpha_averages.
1003  *
1004  * @param[out] ctx   The context.
1005  * @param      ag    The average and variance arguments created during setup.
1006  */
compute_averages(astcenc_context & ctx,const avg_args & ag)1007 static void compute_averages(
1008 	astcenc_context& ctx,
1009 	const avg_args &ag
1010 ) {
1011 	pixel_region_args arg = ag.arg;
1012 	arg.work_memory = new vfloat4[ag.work_memory_size];
1013 
1014 	int size_x = ag.img_size_x;
1015 	int size_y = ag.img_size_y;
1016 	int size_z = ag.img_size_z;
1017 
1018 	int step_xy = ag.blk_size_xy;
1019 	int step_z = ag.blk_size_z;
1020 
1021 	int y_tasks = (size_y + step_xy - 1) / step_xy;
1022 
1023 	// All threads run this processing loop until there is no work remaining
1024 	while (true)
1025 	{
1026 		unsigned int count;
1027 		unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
1028 		if (!count)
1029 		{
1030 			break;
1031 		}
1032 
1033 		for (unsigned int i = base; i < base + count; i++)
1034 		{
1035 			int z = (i / (y_tasks)) * step_z;
1036 			int y = (i - (z * y_tasks)) * step_xy;
1037 
1038 			arg.size_z = astc::min(step_z, size_z - z);
1039 			arg.offset_z = z;
1040 
1041 			arg.size_y = astc::min(step_xy, size_y - y);
1042 			arg.offset_y = y;
1043 
1044 			for (int x = 0; x < size_x; x += step_xy)
1045 			{
1046 				arg.size_x = astc::min(step_xy, size_x - x);
1047 				arg.offset_x = x;
1048 				compute_pixel_region_variance(ctx.context, arg);
1049 			}
1050 		}
1051 
1052 		ctx.manage_avg.complete_task_assignment(count);
1053 	}
1054 
1055 	delete[] arg.work_memory;
1056 }
1057 
1058 #endif
1059 
1060 /* See header for documentation. */
astcenc_compress_image(astcenc_context * ctxo,astcenc_image * imagep,const astcenc_swizzle * swizzle,uint8_t * data_out,size_t data_len,unsigned int thread_index)1061 astcenc_error astcenc_compress_image(
1062 	astcenc_context* ctxo,
1063 	astcenc_image* imagep,
1064 	const astcenc_swizzle* swizzle,
1065 	uint8_t* data_out,
1066 	size_t data_len,
1067 	unsigned int thread_index
1068 ) {
1069 #if defined(ASTCENC_DECOMPRESS_ONLY)
1070 	(void)ctxo;
1071 	(void)imagep;
1072 	(void)swizzle;
1073 	(void)data_out;
1074 	(void)data_len;
1075 	(void)thread_index;
1076 	return ASTCENC_ERR_BAD_CONTEXT;
1077 #else
1078 	astcenc_contexti* ctx = &ctxo->context;
1079 	astcenc_error status;
1080 	astcenc_image& image = *imagep;
1081 
1082 	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1083 	{
1084 		return ASTCENC_ERR_BAD_CONTEXT;
1085 	}
1086 
1087 	status = validate_compression_swizzle(*swizzle);
1088 	if (status != ASTCENC_SUCCESS)
1089 	{
1090 		return status;
1091 	}
1092 
1093 	if (thread_index >= ctx->thread_count)
1094 	{
1095 		return ASTCENC_ERR_BAD_PARAM;
1096 	}
1097 
1098 	unsigned int block_x = ctx->config.block_x;
1099 	unsigned int block_y = ctx->config.block_y;
1100 	unsigned int block_z = ctx->config.block_z;
1101 
1102 	unsigned int xblocks = (image.dim_x + block_x - 1) / block_x;
1103 	unsigned int yblocks = (image.dim_y + block_y - 1) / block_y;
1104 	unsigned int zblocks = (image.dim_z + block_z - 1) / block_z;
1105 
1106 	// Check we have enough output space (16 bytes per block)
1107 	size_t size_needed = xblocks * yblocks * zblocks * 16;
1108 	if (data_len < size_needed)
1109 	{
1110 		return ASTCENC_ERR_OUT_OF_MEM;
1111 	}
1112 
1113 	// If context thread count is one then implicitly reset
1114 	if (ctx->thread_count == 1)
1115 	{
1116 		astcenc_compress_reset(ctxo);
1117 	}
1118 
1119 	if (ctx->config.a_scale_radius != 0)
1120 	{
1121 		// First thread to enter will do setup, other threads will subsequently
1122 		// enter the critical section but simply skip over the initialization
1123 		auto init_avg = [ctx, &image, swizzle]() {
1124 			// Perform memory allocations for the destination buffers
1125 			size_t texel_count = image.dim_x * image.dim_y * image.dim_z;
1126 			ctx->input_alpha_averages = new float[texel_count];
1127 
1128 			return init_compute_averages(
1129 				image, ctx->config.a_scale_radius, *swizzle,
1130 				ctx->avg_preprocess_args);
1131 		};
1132 
1133 		// Only the first thread actually runs the initializer
1134 		ctxo->manage_avg.init(init_avg);
1135 
1136 		// All threads will enter this function and dynamically grab work
1137 		compute_averages(*ctxo, ctx->avg_preprocess_args);
1138 	}
1139 
1140 	// Wait for compute_averages to complete before compressing
1141 	ctxo->manage_avg.wait();
1142 
1143 	compress_image(*ctxo, thread_index, image, *swizzle, data_out);
1144 
1145 	// Wait for compress to complete before freeing memory
1146 	ctxo->manage_compress.wait();
1147 
1148 	auto term_compress = [ctx]() {
1149 		delete[] ctx->input_alpha_averages;
1150 		ctx->input_alpha_averages = nullptr;
1151 	};
1152 
1153 	// Only the first thread to arrive actually runs the term
1154 	ctxo->manage_compress.term(term_compress);
1155 
1156 	return ASTCENC_SUCCESS;
1157 #endif
1158 }
1159 
1160 /* See header for documentation. */
astcenc_compress_reset(astcenc_context * ctxo)1161 astcenc_error astcenc_compress_reset(
1162 	astcenc_context* ctxo
1163 ) {
1164 #if defined(ASTCENC_DECOMPRESS_ONLY)
1165 	(void)ctxo;
1166 	return ASTCENC_ERR_BAD_CONTEXT;
1167 #else
1168 	astcenc_contexti* ctx = &ctxo->context;
1169 	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1170 	{
1171 		return ASTCENC_ERR_BAD_CONTEXT;
1172 	}
1173 
1174 	ctxo->manage_avg.reset();
1175 	ctxo->manage_compress.reset();
1176 	return ASTCENC_SUCCESS;
1177 #endif
1178 }
1179 
1180 /* See header for documentation. */
astcenc_decompress_image(astcenc_context * ctxo,const uint8_t * data,size_t data_len,astcenc_image * image_outp,const astcenc_swizzle * swizzle,unsigned int thread_index)1181 astcenc_error astcenc_decompress_image(
1182 	astcenc_context* ctxo,
1183 	const uint8_t* data,
1184 	size_t data_len,
1185 	astcenc_image* image_outp,
1186 	const astcenc_swizzle* swizzle,
1187 	unsigned int thread_index
1188 ) {
1189 	astcenc_error status;
1190 	astcenc_image& image_out = *image_outp;
1191 	astcenc_contexti* ctx = &ctxo->context;
1192 
1193 	// Today this doesn't matter (working set on stack) but might in future ...
1194 	if (thread_index >= ctx->thread_count)
1195 	{
1196 		return ASTCENC_ERR_BAD_PARAM;
1197 	}
1198 
1199 	status = validate_decompression_swizzle(*swizzle);
1200 	if (status != ASTCENC_SUCCESS)
1201 	{
1202 		return status;
1203 	}
1204 
1205 	unsigned int block_x = ctx->config.block_x;
1206 	unsigned int block_y = ctx->config.block_y;
1207 	unsigned int block_z = ctx->config.block_z;
1208 
1209 	unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
1210 	unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
1211 	unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
1212 
1213 	int row_blocks = xblocks;
1214 	int plane_blocks = xblocks * yblocks;
1215 
1216 	// Check we have enough output space (16 bytes per block)
1217 	size_t size_needed = xblocks * yblocks * zblocks * 16;
1218 	if (data_len < size_needed)
1219 	{
1220 		return ASTCENC_ERR_OUT_OF_MEM;
1221 	}
1222 
1223 	image_block blk;
1224 	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
1225 
1226 	// If context thread count is one then implicitly reset
1227 	if (ctx->thread_count == 1)
1228 	{
1229 		astcenc_decompress_reset(ctxo);
1230 	}
1231 
1232 	// Only the first thread actually runs the initializer
1233 	ctxo->manage_decompress.init(zblocks * yblocks * xblocks);
1234 
1235 	// All threads run this processing loop until there is no work remaining
1236 	while (true)
1237 	{
1238 		unsigned int count;
1239 		unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
1240 		if (!count)
1241 		{
1242 			break;
1243 		}
1244 
1245 		for (unsigned int i = base; i < base + count; i++)
1246 		{
1247 			// Decode i into x, y, z block indices
1248 			int z = i / plane_blocks;
1249 			unsigned int rem = i - (z * plane_blocks);
1250 			int y = rem / row_blocks;
1251 			int x = rem - (y * row_blocks);
1252 
1253 			unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
1254 			const uint8_t* bp = data + offset;
1255 
1256 			const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(bp);
1257 			symbolic_compressed_block scb;
1258 
1259 			physical_to_symbolic(*ctx->bsd, pcb, scb);
1260 
1261 			decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
1262 			                          x * block_x, y * block_y, z * block_z,
1263 			                          scb, blk);
1264 
1265 			store_image_block(image_out, blk, *ctx->bsd,
1266 			                  x * block_x, y * block_y, z * block_z, *swizzle);
1267 		}
1268 
1269 		ctxo->manage_decompress.complete_task_assignment(count);
1270 	}
1271 
1272 	return ASTCENC_SUCCESS;
1273 }
1274 
1275 /* See header for documentation. */
astcenc_decompress_reset(astcenc_context * ctxo)1276 astcenc_error astcenc_decompress_reset(
1277 	astcenc_context* ctxo
1278 ) {
1279 	ctxo->manage_decompress.reset();
1280 	return ASTCENC_SUCCESS;
1281 }
1282 
1283 /* See header for documentation. */
astcenc_get_block_info(astcenc_context * ctxo,const uint8_t data[16],astcenc_block_info * info)1284 astcenc_error astcenc_get_block_info(
1285 	astcenc_context* ctxo,
1286 	const uint8_t data[16],
1287 	astcenc_block_info* info
1288 ) {
1289 #if defined(ASTCENC_DECOMPRESS_ONLY)
1290 	(void)ctxo;
1291 	(void)data;
1292 	(void)info;
1293 	return ASTCENC_ERR_BAD_CONTEXT;
1294 #else
1295 	astcenc_contexti* ctx = &ctxo->context;
1296 
1297 	// Decode the compressed data into a symbolic form
1298 	const physical_compressed_block&pcb = *reinterpret_cast<const physical_compressed_block*>(data);
1299 	symbolic_compressed_block scb;
1300 	physical_to_symbolic(*ctx->bsd, pcb, scb);
1301 
1302 	// Fetch the appropriate partition and decimation tables
1303 	block_size_descriptor& bsd = *ctx->bsd;
1304 
1305 	// Start from a clean slate
1306 	memset(info, 0, sizeof(*info));
1307 
1308 	// Basic info we can always populate
1309 	info->profile = ctx->config.profile;
1310 
1311 	info->block_x = ctx->config.block_x;
1312 	info->block_y = ctx->config.block_y;
1313 	info->block_z = ctx->config.block_z;
1314 	info->texel_count = bsd.texel_count;
1315 
1316 	// Check for error blocks first
1317 	info->is_error_block = scb.block_type == SYM_BTYPE_ERROR;
1318 	if (info->is_error_block)
1319 	{
1320 		return ASTCENC_SUCCESS;
1321 	}
1322 
1323 	// Check for constant color blocks second
1324 	info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 ||
1325 	                          scb.block_type == SYM_BTYPE_CONST_U16;
1326 	if (info->is_constant_block)
1327 	{
1328 		return ASTCENC_SUCCESS;
1329 	}
1330 
1331 	// Otherwise handle a full block ; known to be valid after conditions above have been checked
1332 	int partition_count = scb.partition_count;
1333 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
1334 
1335 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
1336 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
1337 
1338 	info->weight_x = di.weight_x;
1339 	info->weight_y = di.weight_y;
1340 	info->weight_z = di.weight_z;
1341 
1342 	info->is_dual_plane_block = bm.is_dual_plane != 0;
1343 
1344 	info->partition_count = scb.partition_count;
1345 	info->partition_index = scb.partition_index;
1346 	info->dual_plane_component = scb.plane2_component;
1347 
1348 	info->color_level_count = get_quant_level(scb.get_color_quant_mode());
1349 	info->weight_level_count = get_quant_level(bm.get_weight_quant_mode());
1350 
1351 	// Unpack color endpoints for each active partition
1352 	for (unsigned int i = 0; i < scb.partition_count; i++)
1353 	{
1354 		bool rgb_hdr;
1355 		bool a_hdr;
1356 		vint4 endpnt[2];
1357 
1358 		unpack_color_endpoints(ctx->config.profile,
1359 		                       scb.color_formats[i],
1360 		                       scb.get_color_quant_mode(),
1361 		                       scb.color_values[i],
1362 		                       rgb_hdr, a_hdr,
1363 		                       endpnt[0], endpnt[1]);
1364 
1365 		// Store the color endpoint mode info
1366 		info->color_endpoint_modes[i] = scb.color_formats[i];
1367 		info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr;
1368 
1369 		// Store the unpacked and decoded color endpoint
1370 		vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr);
1371 		for (int j = 0; j < 2; j++)
1372 		{
1373 			vint4 color_lns = lns_to_sf16(endpnt[j]);
1374 			vint4 color_unorm = unorm16_to_sf16(endpnt[j]);
1375 			vint4 datai = select(color_unorm, color_lns, hdr_mask);
1376 			store(float16_to_float(datai), info->color_endpoints[i][j]);
1377 		}
1378 	}
1379 
1380 	// Unpack weights for each texel
1381 	int weight_plane1[BLOCK_MAX_TEXELS];
1382 	int weight_plane2[BLOCK_MAX_TEXELS];
1383 
1384 	unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
1385 	for (unsigned int i = 0; i < bsd.texel_count; i++)
1386 	{
1387 		info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1388 		if (info->is_dual_plane_block)
1389 		{
1390 			info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1391 		}
1392 	}
1393 
1394 	// Unpack partition assignments for each texel
1395 	for (unsigned int i = 0; i < bsd.texel_count; i++)
1396 	{
1397 		info->partition_assignment[i] = pi.partition_of_texel[i];
1398 	}
1399 
1400 	return ASTCENC_SUCCESS;
1401 #endif
1402 }
1403 
1404 /* See header for documentation. */
astcenc_get_error_string(astcenc_error status)1405 const char* astcenc_get_error_string(
1406 	astcenc_error status
1407 ) {
1408 	// Values in this enum are from an external user, so not guaranteed to be
1409 	// bounded to the enum values
1410 	switch (static_cast<int>(status))
1411 	{
1412 	case ASTCENC_SUCCESS:
1413 		return "ASTCENC_SUCCESS";
1414 	case ASTCENC_ERR_OUT_OF_MEM:
1415 		return "ASTCENC_ERR_OUT_OF_MEM";
1416 	case ASTCENC_ERR_BAD_CPU_FLOAT:
1417 		return "ASTCENC_ERR_BAD_CPU_FLOAT";
1418 	case ASTCENC_ERR_BAD_CPU_ISA:
1419 		return "ASTCENC_ERR_BAD_CPU_ISA";
1420 	case ASTCENC_ERR_BAD_PARAM:
1421 		return "ASTCENC_ERR_BAD_PARAM";
1422 	case ASTCENC_ERR_BAD_BLOCK_SIZE:
1423 		return "ASTCENC_ERR_BAD_BLOCK_SIZE";
1424 	case ASTCENC_ERR_BAD_PROFILE:
1425 		return "ASTCENC_ERR_BAD_PROFILE";
1426 	case ASTCENC_ERR_BAD_QUALITY:
1427 		return "ASTCENC_ERR_BAD_QUALITY";
1428 	case ASTCENC_ERR_BAD_FLAGS:
1429 		return "ASTCENC_ERR_BAD_FLAGS";
1430 	case ASTCENC_ERR_BAD_SWIZZLE:
1431 		return "ASTCENC_ERR_BAD_SWIZZLE";
1432 	case ASTCENC_ERR_BAD_CONTEXT:
1433 		return "ASTCENC_ERR_BAD_CONTEXT";
1434 	case ASTCENC_ERR_NOT_IMPLEMENTED:
1435 		return "ASTCENC_ERR_NOT_IMPLEMENTED";
1436 #if defined(ASTCENC_DIAGNOSTICS)
1437 	case ASTCENC_ERR_DTRACE_FAILURE:
1438 		return "ASTCENC_ERR_DTRACE_FAILURE";
1439 #endif
1440 	default:
1441 		return nullptr;
1442 	}
1443 }
1444