1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2021 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 /**
19  * @brief Soft-float library for IEEE-754.
20  */
21 #if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
22 
23 #include "astcenc_mathlib.h"
24 
25 /*	sized soft-float types. These are mapped to the sized integer
26     types of C99, instead of C's floating-point types; this is because
27     the library needs to maintain exact, bit-level control on all
28     operations on these data types. */
29 typedef uint16_t sf16;
30 typedef uint32_t sf32;
31 
32 /******************************************
33   helper functions and their lookup tables
34  ******************************************/
35 /* count leading zeros functions. Only used when the input is nonzero. */
36 
37 #if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
38 #elif defined(__arm__) && defined(__ARMCC_VERSION)
39 #elif defined(__arm__) && defined(__GNUC__)
40 #else
41 	/* table used for the slow default versions. */
42 	static const uint8_t clz_table[256] =
43 	{
44 		8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
45 		3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
46 		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
47 		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
48 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
49 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51 		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
53 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
54 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
55 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
56 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
59 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
60 	};
61 #endif
62 
63 /*
64    32-bit count-leading-zeros function: use the Assembly instruction whenever possible. */
clz32(uint32_t inp)65 static uint32_t clz32(uint32_t inp)
66 {
67 	#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
68 		uint32_t bsr;
69 		__asm__("bsrl %1, %0": "=r"(bsr):"r"(inp | 1));
70 		return 31 - bsr;
71 	#else
72 		#if defined(__arm__) && defined(__ARMCC_VERSION)
73 			return __clz(inp);			/* armcc builtin */
74 		#else
75 			#if defined(__arm__) && defined(__GNUC__)
76 				uint32_t lz;
77 				__asm__("clz %0, %1": "=r"(lz):"r"(inp));
78 				return lz;
79 			#else
80 				/* slow default version */
81 				uint32_t summa = 24;
82 				if (inp >= UINT32_C(0x10000))
83 				{
84 					inp >>= 16;
85 					summa -= 16;
86 				}
87 				if (inp >= UINT32_C(0x100))
88 				{
89 					inp >>= 8;
90 					summa -= 8;
91 				}
92 				return summa + clz_table[inp];
93 			#endif
94 		#endif
95 	#endif
96 }
97 
98 /* the five rounding modes that IEEE-754r defines */
99 typedef enum
100 {
101 	SF_UP = 0,				/* round towards positive infinity */
102 	SF_DOWN = 1,			/* round towards negative infinity */
103 	SF_TOZERO = 2,			/* round towards zero */
104 	SF_NEARESTEVEN = 3,		/* round toward nearest value; if mid-between, round to even value */
105 	SF_NEARESTAWAY = 4		/* round toward nearest value; if mid-between, round away from zero */
106 } roundmode;
107 
108 
rtne_shift32(uint32_t inp,uint32_t shamt)109 static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)
110 {
111 	uint32_t vl1 = UINT32_C(1) << shamt;
112 	uint32_t inp2 = inp + (vl1 >> 1);	/* added 0.5 ULP */
113 	uint32_t msk = (inp | UINT32_C(1)) & vl1;	/* nonzero if odd. '| 1' forces it to 1 if the shamt is 0. */
114 	msk--;						/* negative if even, nonnegative if odd. */
115 	inp2 -= (msk >> 31);		/* subtract epsilon before shift if even. */
116 	inp2 >>= shamt;
117 	return inp2;
118 }
119 
rtna_shift32(uint32_t inp,uint32_t shamt)120 static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)
121 {
122 	uint32_t vl1 = (UINT32_C(1) << shamt) >> 1;
123 	inp += vl1;
124 	inp >>= shamt;
125 	return inp;
126 }
127 
rtup_shift32(uint32_t inp,uint32_t shamt)128 static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)
129 {
130 	uint32_t vl1 = UINT32_C(1) << shamt;
131 	inp += vl1;
132 	inp--;
133 	inp >>= shamt;
134 	return inp;
135 }
136 
137 /* convert from FP16 to FP32. */
sf16_to_sf32(sf16 inp)138 static sf32 sf16_to_sf32(sf16 inp)
139 {
140 	uint32_t inpx = inp;
141 
142 	/*
143 		This table contains, for every FP16 sign/exponent value combination,
144 		the difference between the input FP16 value and the value obtained
145 		by shifting the correct FP32 result right by 13 bits.
146 		This table allows us to handle every case except denormals and NaN
147 		with just 1 table lookup, 2 shifts and 1 add.
148 	*/
149 
150 	#define WITH_MSB(a) (UINT32_C(a) | (1u << 31))
151 	static const uint32_t tbl[64] =
152 	{
153 		WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
154 		         0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
155 		         0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
156 		         0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000),
157 		WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
158 		         0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
159 		         0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
160 		         0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000)
161 	};
162 
163 	uint32_t res = tbl[inpx >> 10];
164 	res += inpx;
165 
166 	/* Normal cases: MSB of 'res' not set. */
167 	if ((res & WITH_MSB(0)) == 0)
168 	{
169 		return res << 13;
170 	}
171 
172 	/* Infinity and Zero: 10 LSB of 'res' not set. */
173 	if ((res & 0x3FF) == 0)
174 	{
175 		return res << 13;
176 	}
177 
178 	/* NaN: the exponent field of 'inp' is non-zero. */
179 	if ((inpx & 0x7C00) != 0)
180 	{
181 		/* All NaNs are quietened. */
182 		return (res << 13) | 0x400000;
183 	}
184 
185 	/* Denormal cases */
186 	uint32_t sign = (inpx & 0x8000) << 16;
187 	uint32_t mskval = inpx & 0x7FFF;
188 	uint32_t leadingzeroes = clz32(mskval);
189 	mskval <<= leadingzeroes;
190 	return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;
191 }
192 
193 /* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */
sf32_to_sf16(sf32 inp,roundmode rmode)194 static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
195 {
196 	/* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */
197 	static const uint8_t tab[512] {
198 		0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
199 		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
200 		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
201 		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
202 		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
203 		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
204 		10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
205 		20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
206 		30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,
207 		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
208 		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
209 		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
210 		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
211 		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
212 		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
213 		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,
214 
215 		5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
216 		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
217 		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
218 		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
219 		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
220 		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
221 		15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
222 		25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
223 		35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,
224 		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
225 		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
226 		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
227 		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
228 		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
229 		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
230 		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
231 	};
232 
233 	/* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code
234 	   size. */
235 	static const uint32_t tabx[60] {
236 		UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
237 		UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
238 		UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
239 		UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),
240 		UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),
241 		UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
242 		UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
243 		UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
244 		UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
245 	};
246 
247 	uint32_t p;
248 	uint32_t idx = rmode + tab[inp >> 23];
249 	uint32_t vlx = tabx[idx];
250 	switch (idx)
251 	{
252 		/*
253 			Positive number which may be Infinity or NaN.
254 			We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
255 			(If we don't do this quieting, then a NaN  that is distinguished only by having
256 			its low-order bits set, would be turned into an INF. */
257 	case 50:
258 	case 51:
259 	case 52:
260 	case 53:
261 	case 54:
262 	case 55:
263 	case 56:
264 	case 57:
265 	case 58:
266 	case 59:
267 		/*
268 			the input value is 0x7F800000 or 0xFF800000 if it is INF.
269 			By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
270 			For NaNs, however, this operation will keep bit 23 with the value 1.
271 			We can then extract bit 23, and logical-OR bit 9 of the result with this
272 			bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
273 			of the mantissa is set.)
274 		*/
275 		p = (inp - 1) & UINT32_C(0x800000);	/* zero if INF, nonzero if NaN. */
276 		return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));
277 		/*
278 			positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
279 			If it is, then return 0, else return 1 (the smallest representable nonzero number)
280 		*/
281 	case 0:
282 		/*
283 			-inp will set the MSB if the input number is nonzero.
284 			Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
285 		*/
286 		return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);
287 
288 		/*
289 			negative, exponent = , round-mode == DOWN, need to check whether number is
290 			actually 0. If it is, return 0x8000 ( float -0.0 )
291 			Else return the smallest negative number ( 0x8001 ) */
292 	case 6:
293 		/*
294 			in this case 'vlx' is 0x80000000. By subtracting the input value from it,
295 			we obtain a value that is 0 if the input value is in fact zero and has
296 			the MSB set if it isn't. We then right-shift the value by 31 places to
297 			get a value that is 0 if the input is -0.0 and 1 otherwise.
298 		*/
299 		return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));
300 
301 		/*
302 			for all other cases involving underflow/overflow, we don't need to
303 			do actual tests; we just return 'vlx'.
304 		*/
305 	case 1:
306 	case 2:
307 	case 3:
308 	case 4:
309 	case 5:
310 	case 7:
311 	case 8:
312 	case 9:
313 	case 10:
314 	case 11:
315 	case 12:
316 	case 13:
317 	case 14:
318 	case 15:
319 	case 16:
320 	case 17:
321 	case 18:
322 	case 19:
323 	case 40:
324 	case 41:
325 	case 42:
326 	case 43:
327 	case 44:
328 	case 45:
329 	case 46:
330 	case 47:
331 	case 48:
332 	case 49:
333 		return static_cast<sf16>(vlx);
334 
335 		/*
336 			for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
337 			FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
338 			baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away
339 			from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero.
340 			for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
341 			except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */
342 
343 		/* normal number, all rounding modes except round-to-nearest-even: */
344 	case 30:
345 	case 31:
346 	case 32:
347 	case 34:
348 	case 35:
349 	case 36:
350 	case 37:
351 	case 39:
352 		return static_cast<sf16>((inp + vlx) >> 13);
353 
354 		/* normal number, round-to-nearest-even. */
355 	case 33:
356 	case 38:
357 		p = inp + vlx;
358 		p += (inp >> 13) & 1;
359 		return static_cast<sf16>(p >> 13);
360 
361 		/*
362 			the various denormal cases. These are not expected to be common, so their performance is a bit
363 			less important. For each of these cases, we need to extract an exponent and a mantissa
364 			(including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
365 			depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
366 			sign of the resulting denormal number.
367 		*/
368 	case 21:
369 	case 22:
370 	case 25:
371 	case 27:
372 		/* denormal, round towards zero. */
373 		p = 126 - ((inp >> 23) & 0xFF);
374 		return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx);
375 	case 20:
376 	case 26:
377 		/* denormal, round away from zero. */
378 		p = 126 - ((inp >> 23) & 0xFF);
379 		return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
380 	case 24:
381 	case 29:
382 		/* denormal, round to nearest-away */
383 		p = 126 - ((inp >> 23) & 0xFF);
384 		return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
385 	case 23:
386 	case 28:
387 		/* denormal, round to nearest-even. */
388 		p = 126 - ((inp >> 23) & 0xFF);
389 		return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
390 	}
391 
392 	return 0;
393 }
394 
395 /* convert from soft-float to native-float */
sf16_to_float(uint16_t p)396 float sf16_to_float(uint16_t p)
397 {
398 	if32 i;
399 	i.u = sf16_to_sf32(p);
400 	return i.f;
401 }
402 
403 /* convert from native-float to soft-float */
float_to_sf16(float p)404 uint16_t float_to_sf16(float p)
405 {
406 	if32 i;
407 	i.f = p;
408 	return sf32_to_sf16(i.u, SF_NEARESTEVEN);
409 }
410 
411 #endif
412