1 /* ------------------------------------------------------------------
2  * Copyright (C) 1998-2009 PacketVideo
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13  * express or implied.
14  * See the License for the specific language governing permissions
15  * and limitations under the License.
16  * -------------------------------------------------------------------
17  */
18 /*
19 
20 ------------------------------------------------------------------------------
21  REVISION HISTORY
22  Who:   Date: July/2001
23  Description:   1. Optimized BlockIDCT bitmap checking.
24                 2. Rearranged functions.
25                 3. Do column IDCT first, then row IDCT.
26                 4. Combine motion comp and IDCT, require
27                    two sets of row IDCTs one for INTRA
28                    and one for INTER.
29                 5. Add AAN IDCT
30 
31  Who:   Date: 8/16/01
32                 1. Increase the input precision to 8 bits, i.e. change RDCTBITS
33                    to 11, have to comment out all in-line assembly since 16 bit
34                     multiplication doesn't work. Try to use different precision with
35                     32 bit mult. but hasn't finished. Turns out that without in-line
36                     assembly the performance doesn't change much (only 1%).
37  Who:   Date: 9/04/05
38                 1. Replace AAN IDCT with Chen's IDCT to accommodate 16 bit data type.
39 
40 */
41 #include "mp4def.h"
42 #include "mp4enc_lib.h"
43 #include "mp4lib_int.h"
44 #include "dct.h"
45 
/*
 * ADD_CLIP: braced statement block that
 *   1. adds the value currently stored at *rec to tmp,
 *   2. if (UInt)tmp exceeds mask, replaces tmp by 0 when tmp is negative and
 *      by mask otherwise (assumes tmp is a 32-bit signed value and that >>
 *      on a negative value is an arithmetic shift -- TODO confirm),
 *   3. stores tmp through rec and advances rec by one element.
 * The identifiers tmp, rec and mask are not parameters; they must exist in
 * the scope where the macro is expanded.
 */
46 #define ADD_CLIP    { \
47             tmp = *rec + tmp; \
48         if((UInt)tmp > mask) tmp = mask&(~(tmp>>31)); \
49         *rec++ = tmp;   \
50         }
51 
/*
 * INTRA_CLIP: braced statement block that clamps tmp and stores it.
 * If (UInt)tmp exceeds mask, tmp becomes 0 when it is negative and mask
 * otherwise (assumes a 32-bit signed tmp and an arithmetic >> -- TODO
 * confirm); tmp is then written through rec and rec is advanced by one.
 * Unlike ADD_CLIP nothing is read from *rec first.
 * The identifiers tmp, rec and mask must exist in the expanding scope.
 */
52 #define INTRA_CLIP  { \
53         if((UInt)tmp > mask) tmp = mask&(~(tmp>>31)); \
54         *rec++ = tmp;   \
55         }
56 
57 
/*
 * CLIP_RESULT(x): clamps the lvalue x to 0..0xFF in place.  When (UInt)x is
 * above 0xFF, x becomes 0 if it is negative and 0xFF otherwise (assumes a
 * 32-bit signed x and an arithmetic >> -- TODO confirm).
 * NOTE: the expansion is a bare `if (...) {...}` statement, so it must not be
 * placed directly before an `else`; x is evaluated more than once.
 *
 * ADD_AND_CLIPn(x), n = 1..4: adds byte n-1 of pred_word (bits 8*(n-1) to
 * 8*n-1) to x and then applies CLIP_RESULT.  pred_word is taken from the
 * expanding scope.
 * NOTE: each of these expands to TWO statements; used as the body of an
 * unbraced if/else/loop only the `x += ...` part would be controlled.
 */
58 #define CLIP_RESULT(x)      if((UInt)(x) > 0xFF){(x) = 0xFF & (~((x)>>31));}
59 #define ADD_AND_CLIP1(x)    x += (pred_word&0xFF); CLIP_RESULT(x);
60 #define ADD_AND_CLIP2(x)    x += ((pred_word>>8)&0xFF); CLIP_RESULT(x);
61 #define ADD_AND_CLIP3(x)    x += ((pred_word>>16)&0xFF); CLIP_RESULT(x);
62 #define ADD_AND_CLIP4(x)    x += ((pred_word>>24)&0xFF); CLIP_RESULT(x);
63 
64 
/*
 * Column IDCT variant that performs no work: the block is left untouched.
 * The argument is only marked as used so that no unused-parameter warning
 * is raised.
 */
void idct_col0(Short *blk)
{
    OSCL_UNUSED_ARG(blk);
}
71 
/*
 * Column IDCT that reads only the first coefficient of the column: blk[0] is
 * scaled by 8 and the (Short-converted) result is written to all eight rows
 * of the column, i.e. blk[0], blk[8], ..., blk[56].
 *
 * blk  points at the top element of the column inside a block stored with a
 *      row stride of 8 elements.
 */
void idct_col1(Short *blk)
{
    /* Multiply rather than "<< 3": the coefficient may be negative and a
     * left shift of a negative value is undefined behaviour in C. */
    const Short scaled = (Short)(blk[0] * 8);
    int row;

    for (row = 0; row < 8; row++)
    {
        blk[8 * row] = scaled;
    }
}
78 
79 /* Ignoring overflows as idct function expects and uses overflows */
80 __attribute__((no_sanitize("signed-integer-overflow")))
idct_col2(Short * blk)81 void idct_col2(Short *blk)
82 {
83     int32 x0, x1, x3, x5, x7;//, x8;
84 
85     x1 = blk[8];
86     x0 = ((int32)blk[0] << 11) + 128;
87     /* both upper and lower*/
88 
89     x7 = W7 * x1;
90     x1 = W1 * x1;
91 
92     x3 = x7;
93     x5 = (181 * (x1 - x7) + 128) >> 8;
94     x7 = (181 * (x1 + x7) + 128) >> 8;
95 
96     blk[0] = (x0 + x1) >> 8;
97     blk[8] = (x0 + x7) >> 8;
98     blk[16] = (x0 + x5) >> 8;
99     blk[24] = (x0 + x3) >> 8;
100     blk[56] = (x0 - x1) >> 8;
101     blk[48] = (x0 - x7) >> 8;
102     blk[40] = (x0 - x5) >> 8;
103     blk[32] = (x0 - x3) >> 8;
104     return ;
105 }
106 
107 /* Ignoring overflows as idct function expects and uses overflows */
108 __attribute__((no_sanitize("signed-integer-overflow")))
idct_col3(Short * blk)109 void idct_col3(Short *blk)
110 {
111     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
112 
113     x2 = blk[16];
114     x1 = blk[8];
115     x0 = ((int32)blk[0] << 11) + 128;
116 
117     x4 = x0;
118     x6 = W6 * x2;
119     x2 = W2 * x2;
120     x8 = x0 - x2;
121     x0 += x2;
122     x2 = x8;
123     x8 = x4 - x6;
124     x4 += x6;
125     x6 = x8;
126 
127     x7 = W7 * x1;
128     x1 = W1 * x1;
129     x3 = x7;
130     x5 = (181 * (x1 - x7) + 128) >> 8;
131     x7 = (181 * (x1 + x7) + 128) >> 8;
132 
133     blk[0] = (x0 + x1) >> 8;
134     blk[8] = (x4 + x7) >> 8;
135     blk[16] = (x6 + x5) >> 8;
136     blk[24] = (x2 + x3) >> 8;
137     blk[56] = (x0 - x1) >> 8;
138     blk[48] = (x4 - x7) >> 8;
139     blk[40] = (x6 - x5) >> 8;
140     blk[32] = (x2 - x3) >> 8;
141     return ;
142 }
143 
144 /* Ignoring overflows as idct function expects and uses overflows */
145 __attribute__((no_sanitize("signed-integer-overflow")))
idct_col4(Short * blk)146 void idct_col4(Short *blk)
147 {
148     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
149     x2 = blk[16];
150     x1 = blk[8];
151     x3 = blk[24];
152     x0 = ((int32)blk[0] << 11) + 128;
153 
154     x4 = x0;
155     x6 = W6 * x2;
156     x2 = W2 * x2;
157     x8 = x0 - x2;
158     x0 += x2;
159     x2 = x8;
160     x8 = x4 - x6;
161     x4 += x6;
162     x6 = x8;
163 
164     x7 = W7 * x1;
165     x1 = W1 * x1;
166     x5 = W3 * x3;
167     x3 = -W5 * x3;
168     x8 = x1 - x5;
169     x1 += x5;
170     x5 = x8;
171     x8 = x7 - x3;
172     x3 += x7;
173     x7 = (181 * (x5 + x8) + 128) >> 8;
174     x5 = (181 * (x5 - x8) + 128) >> 8;
175 
176 
177     blk[0] = (x0 + x1) >> 8;
178     blk[8] = (x4 + x7) >> 8;
179     blk[16] = (x6 + x5) >> 8;
180     blk[24] = (x2 + x3) >> 8;
181     blk[56] = (x0 - x1) >> 8;
182     blk[48] = (x4 - x7) >> 8;
183     blk[40] = (x6 - x5) >> 8;
184     blk[32] = (x2 - x3) >> 8;
185     return ;
186 }
187 
188 #ifndef SMALL_DCT
189 /* Ignoring overflows as idct function expects and uses overflows */
190 __attribute__((no_sanitize("signed-integer-overflow")))
idct_col0x40(Short * blk)191 void idct_col0x40(Short *blk)
192 {
193     int32 x1, x3, x5, x7;//, x8;
194 
195     x1 = blk[8];
196     /* both upper and lower*/
197 
198     x7 = W7 * x1;
199     x1 = W1 * x1;
200 
201     x3 = x7;
202     x5 = (181 * (x1 - x7) + 128) >> 8;
203     x7 = (181 * (x1 + x7) + 128) >> 8;
204 
205     blk[0] = (128 + x1) >> 8;
206     blk[8] = (128 + x7) >> 8;
207     blk[16] = (128 + x5) >> 8;
208     blk[24] = (128 + x3) >> 8;
209     blk[56] = (128 - x1) >> 8;
210     blk[48] = (128 - x7) >> 8;
211     blk[40] = (128 - x5) >> 8;
212     blk[32] = (128 - x3) >> 8;
213 
214     return ;
215 }
216 
/*
 * Column IDCT that reads only blk[16] (row 2 of the column) and overwrites
 * all eight rows blk[0], blk[8], ..., blk[56].  The output is symmetric:
 * rows k and 7-k receive the same value.  The constant 128 is the rounding
 * term for the final ">> 8".
 *
 * blk  points at the top element of the column; row stride is 8 elements.
 */
void idct_col0x20(Short *blk)
{
    const int32 coef = blk[16];
    const int32 term6 = W6 * coef;
    const int32 term2 = W2 * coef;

    const Short rows_0_7 = (Short)((128 + term2) >> 8);
    const Short rows_1_6 = (Short)((128 + term6) >> 8);
    const Short rows_2_5 = (Short)((128 - term6) >> 8);
    const Short rows_3_4 = (Short)((128 - term2) >> 8);

    blk[0] = blk[56] = rows_0_7;
    blk[8] = blk[48] = rows_1_6;
    blk[16] = blk[40] = rows_2_5;
    blk[24] = blk[32] = rows_3_4;
}
240 
241 /* Ignoring overflows as idct function expects and uses overflows */
242 __attribute__((no_sanitize("signed-integer-overflow")))
idct_col0x10(Short * blk)243 void idct_col0x10(Short *blk)
244 {
245     int32 x1, x3, x5,  x7;
246 
247     x3 = blk[24];
248     x1 = W3 * x3;
249     x3 = W5 * x3;
250 
251     x7 = (181 * (x3 - x1) + 128) >> 8;
252     x5 = (-181 * (x1 + x3) + 128) >> 8;
253 
254 
255     blk[0] = (128 + x1) >> 8;
256     blk[8] = (128 + x7) >> 8;
257     blk[16] = (128 + x5) >> 8;
258     blk[24] = (128 - x3) >> 8;
259     blk[56] = (128 - x1) >> 8;
260     blk[48] = (128 - x7) >> 8;
261     blk[40] = (128 - x5) >> 8;
262     blk[32] = (128 + x3) >> 8;
263 
264     return ;
265 }
266 
267 #endif /* SMALL_DCT */
268 
269 /* Ignoring overflows as idct function expects and uses overflows */
270 __attribute__((no_sanitize("signed-integer-overflow")))
idct_col(Short * blk)271 void idct_col(Short *blk)
272 {
273     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
274 
275     x1 = (int32)blk[32] << 11;
276     x2 = blk[48];
277     x3 = blk[16];
278     x4 = blk[8];
279     x5 = blk[56];
280     x6 = blk[40];
281     x7 = blk[24];
282     x0 = ((int32)blk[0] << 11) + 128;
283 
284     /* first stage */
285     x8 = W7 * (x4 + x5);
286     x4 = x8 + (W1 - W7) * x4;
287     x5 = x8 - (W1 + W7) * x5;
288     x8 = W3 * (x6 + x7);
289     x6 = x8 - (W3 - W5) * x6;
290     x7 = x8 - (W3 + W5) * x7;
291 
292     /* second stage */
293     x8 = x0 + x1;
294     x0 -= x1;
295     x1 = W6 * (x3 + x2);
296     x2 = x1 - (W2 + W6) * x2;
297     x3 = x1 + (W2 - W6) * x3;
298     x1 = x4 + x6;
299     x4 -= x6;
300     x6 = x5 + x7;
301     x5 -= x7;
302 
303     /* third stage */
304     x7 = x8 + x3;
305     x8 -= x3;
306     x3 = x0 + x2;
307     x0 -= x2;
308     x2 = (181 * (x4 + x5) + 128) >> 8;
309     x4 = (181 * (x4 - x5) + 128) >> 8;
310 
311     /* fourth stage */
312     blk[0]    = (x7 + x1) >> 8;
313     blk[8] = (x3 + x2) >> 8;
314     blk[16] = (x0 + x4) >> 8;
315     blk[24] = (x8 + x6) >> 8;
316     blk[32] = (x8 - x6) >> 8;
317     blk[40] = (x0 - x4) >> 8;
318     blk[48] = (x3 - x2) >> 8;
319     blk[56] = (x7 - x1) >> 8;
320 
321     return ;
322 }
323 
324 /* This function should not be called at all ****/
idct_row0Inter(Short * srce,UChar * rec,Int lx)325 void idct_row0Inter(Short *srce, UChar *rec, Int lx)
326 {
327     OSCL_UNUSED_ARG(srce);
328 
329     OSCL_UNUSED_ARG(rec);
330 
331     OSCL_UNUSED_ARG(lx);
332 
333     return;
334 }
335 
/*
 * Row IDCT + add for blocks where only the first coefficient of each row is
 * read.  For each of the 8 rows, (blk[8*row] + 32) >> 6 is added to the
 * 8 bytes at rec + row*lx, each sum is clamped to 0..255 and stored back in
 * place.  The coefficient that was read is reset to 0.
 *
 * blk  8x8 coefficient block, row stride 8 elements.
 * rec  first of the 8 rows of bytes that are read, updated and written back.
 * lx   distance in bytes between consecutive rows of rec.
 *
 * Pixels are accessed byte by byte instead of through uint32 casts, which
 * avoids aliasing/alignment problems, does not depend on host byte order and
 * avoids shifting a signed value into the sign bit while packing.  Rows are
 * addressed by index so no pointer before the start of a buffer is formed.
 */
void idct_row1Inter(Short *blk, UChar *rec, Int lx)
{
    int row, col;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *dst = rec + row * lx;
        const int dc = (coef[0] + 32) >> 6;

        coef[0] = 0;

        for (col = 0; col < 8; col++)
        {
            int pix = dc + dst[col];
            CLIP_RESULT(pix);
            dst[col] = (UChar)pix;
        }
    }
}
382 
383 /* Ignoring overflows as idct function expects and uses overflows */
384 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row2Inter(Short * blk,UChar * rec,Int lx)385 void idct_row2Inter(Short *blk, UChar *rec, Int lx)
386 {
387     int32 x0, x1, x2, x4, x5;
388     int i = 8;
389     uint32 pred_word, dst_word;
390     int res, res2;
391 
392     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
393     rec -= lx;
394     blk -= 8;
395 
396     while (i--)
397     {
398         /* shortcut */
399         x4 = blk[9];
400         blk[9] = 0;
401         x0 = ((*(blk += 8)) << 8) + 8192;
402         *blk = 0;  /* for proper rounding in the fourth stage */
403 
404         /* first stage */
405         x5 = (W7 * x4 + 4) >> 3;
406         x4 = (W1 * x4 + 4) >> 3;
407 
408         /* third stage */
409         x2 = (181 * (x4 + x5) + 128) >> 8;
410         x1 = (181 * (x4 - x5) + 128) >> 8;
411 
412         /* fourth stage */
413         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
414         res = (x0 + x4) >> 14;
415         ADD_AND_CLIP1(res);
416         res2 = (x0 + x2) >> 14;
417         ADD_AND_CLIP2(res2);
418         dst_word = (res2 << 8) | res;
419         res = (x0 + x1) >> 14;
420         ADD_AND_CLIP3(res);
421         dst_word |= (res << 16);
422         res = (x0 + x5) >> 14;
423         ADD_AND_CLIP4(res);
424         dst_word |= (res << 24);
425         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
426 
427         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
428         res = (x0 - x5) >> 14;
429         ADD_AND_CLIP1(res);
430         res2 = (x0 - x1) >> 14;
431         ADD_AND_CLIP2(res2);
432         dst_word = (res2 << 8) | res;
433         res = (x0 - x2) >> 14;
434         ADD_AND_CLIP3(res);
435         dst_word |= (res << 16);
436         res = (x0 - x4) >> 14;
437         ADD_AND_CLIP4(res);
438         dst_word |= (res << 24);
439         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
440     }
441     return ;
442 }
443 
444 /* Ignoring overflows as idct function expects and uses overflows */
445 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row3Inter(Short * blk,UChar * rec,Int lx)446 void idct_row3Inter(Short *blk, UChar *rec, Int lx)
447 {
448     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
449     int i = 8;
450     uint32 pred_word, dst_word;
451     int res, res2;
452 
453     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
454     rec -= lx;
455     blk -= 8;
456 
457     while (i--)
458     {
459         x2 = blk[10];
460         blk[10] = 0;
461         x1 = blk[9];
462         blk[9] = 0;
463         x0 = ((*(blk += 8)) << 8) + 8192;
464         *blk = 0;  /* for proper rounding in the fourth stage */
465         /* both upper and lower*/
466         /* both x2orx6 and x0orx4 */
467 
468         x4 = x0;
469         x6 = (W6 * x2 + 4) >> 3;
470         x2 = (W2 * x2 + 4) >> 3;
471         x8 = x0 - x2;
472         x0 += x2;
473         x2 = x8;
474         x8 = x4 - x6;
475         x4 += x6;
476         x6 = x8;
477 
478         x7 = (W7 * x1 + 4) >> 3;
479         x1 = (W1 * x1 + 4) >> 3;
480         x3 = x7;
481         x5 = (181 * (x1 - x7) + 128) >> 8;
482         x7 = (181 * (x1 + x7) + 128) >> 8;
483 
484         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
485         res = (x0 + x1) >> 14;
486         ADD_AND_CLIP1(res);
487         res2 = (x4 + x7) >> 14;
488         ADD_AND_CLIP2(res2);
489         dst_word = (res2 << 8) | res;
490         res = (x6 + x5) >> 14;
491         ADD_AND_CLIP3(res);
492         dst_word |= (res << 16);
493         res = (x2 + x3) >> 14;
494         ADD_AND_CLIP4(res);
495         dst_word |= (res << 24);
496         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
497 
498         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
499         res = (x2 - x3) >> 14;
500         ADD_AND_CLIP1(res);
501         res2 = (x6 - x5) >> 14;
502         ADD_AND_CLIP2(res2);
503         dst_word = (res2 << 8) | res;
504         res = (x4 - x7) >> 14;
505         ADD_AND_CLIP3(res);
506         dst_word |= (res << 16);
507         res = (x0 - x1) >> 14;
508         ADD_AND_CLIP4(res);
509         dst_word |= (res << 24);
510         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
511     }
512 
513     return ;
514 }
515 
516 /* Ignoring overflows as idct function expects and uses overflows */
517 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row4Inter(Short * blk,UChar * rec,Int lx)518 void idct_row4Inter(Short *blk, UChar *rec, Int lx)
519 {
520     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
521     int i = 8;
522     uint32 pred_word, dst_word;
523     int res, res2;
524 
525     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
526     rec -= lx;
527     blk -= 8;
528 
529     while (i--)
530     {
531         x2 = blk[10];
532         blk[10] = 0;
533         x1 = blk[9];
534         blk[9] = 0;
535         x3 = blk[11];
536         blk[11] = 0;
537         x0 = ((*(blk += 8)) << 8) + 8192;
538         *blk = 0;   /* for proper rounding in the fourth stage */
539 
540         x4 = x0;
541         x6 = (W6 * x2 + 4) >> 3;
542         x2 = (W2 * x2 + 4) >> 3;
543         x8 = x0 - x2;
544         x0 += x2;
545         x2 = x8;
546         x8 = x4 - x6;
547         x4 += x6;
548         x6 = x8;
549 
550         x7 = (W7 * x1 + 4) >> 3;
551         x1 = (W1 * x1 + 4) >> 3;
552         x5 = (W3 * x3 + 4) >> 3;
553         x3 = (- W5 * x3 + 4) >> 3;
554         x8 = x1 - x5;
555         x1 += x5;
556         x5 = x8;
557         x8 = x7 - x3;
558         x3 += x7;
559         x7 = (181 * (x5 + x8) + 128) >> 8;
560         x5 = (181 * (x5 - x8) + 128) >> 8;
561 
562         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
563         res = (x0 + x1) >> 14;
564         ADD_AND_CLIP1(res);
565         res2 = (x4 + x7) >> 14;
566         ADD_AND_CLIP2(res2);
567         dst_word = (res2 << 8) | res;
568         res = (x6 + x5) >> 14;
569         ADD_AND_CLIP3(res);
570         dst_word |= (res << 16);
571         res = (x2 + x3) >> 14;
572         ADD_AND_CLIP4(res);
573         dst_word |= (res << 24);
574         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
575 
576         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
577         res = (x2 - x3) >> 14;
578         ADD_AND_CLIP1(res);
579         res2 = (x6 - x5) >> 14;
580         ADD_AND_CLIP2(res2);
581         dst_word = (res2 << 8) | res;
582         res = (x4 - x7) >> 14;
583         ADD_AND_CLIP3(res);
584         dst_word |= (res << 16);
585         res = (x0 - x1) >> 14;
586         ADD_AND_CLIP4(res);
587         dst_word |= (res << 24);
588         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
589     }
590     return ;
591 }
592 
593 #ifndef SMALL_DCT
594 /* Ignoring overflows as idct function expects and uses overflows */
595 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row0x40Inter(Short * blk,UChar * rec,Int lx)596 void idct_row0x40Inter(Short *blk, UChar *rec, Int lx)
597 {
598     int32 x1, x2, x4, x5;
599     int i = 8;
600     uint32 pred_word, dst_word;
601     int res, res2;
602 
603     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
604     rec -= lx;
605 
606     while (i--)
607     {
608         /* shortcut */
609         x4 = blk[1];
610         blk[1] = 0;
611         blk += 8;  /* for proper rounding in the fourth stage */
612 
613         /* first stage */
614         x5 = (W7 * x4 + 4) >> 3;
615         x4 = (W1 * x4 + 4) >> 3;
616 
617         /* third stage */
618         x2 = (181 * (x4 + x5) + 128) >> 8;
619         x1 = (181 * (x4 - x5) + 128) >> 8;
620 
621         /* fourth stage */
622         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
623         res = (8192 + x4) >> 14;
624         ADD_AND_CLIP1(res);
625         res2 = (8192 + x2) >> 14;
626         ADD_AND_CLIP2(res2);
627         dst_word = (res2 << 8) | res;
628         res = (8192 + x1) >> 14;
629         ADD_AND_CLIP3(res);
630         dst_word |= (res << 16);
631         res = (8192 + x5) >> 14;
632         ADD_AND_CLIP4(res);
633         dst_word |= (res << 24);
634         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
635 
636         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
637         res = (8192 - x5) >> 14;
638         ADD_AND_CLIP1(res);
639         res2 = (8192 - x1) >> 14;
640         ADD_AND_CLIP2(res2);
641         dst_word = (res2 << 8) | res;
642         res = (8192 - x2) >> 14;
643         ADD_AND_CLIP3(res);
644         dst_word |= (res << 16);
645         res = (8192 - x4) >> 14;
646         ADD_AND_CLIP4(res);
647         dst_word |= (res << 24);
648         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
649     }
650     return ;
651 }
652 
/*
 * Row IDCT + add for blocks where only coefficient 2 of each row is read.
 * For each of the 8 rows that coefficient is transformed into 8 values
 * (symmetric: positions k and 7-k are equal), each value >> 14 is added to
 * the byte at the same position in rec + row*lx, the sum is clamped to
 * 0..255 and stored back in place.  The coefficient that was read is reset
 * to 0.
 *
 * blk  8x8 coefficient block, row stride 8 elements.
 * rec  first of the 8 rows of bytes that are read, updated and written back.
 * lx   distance in bytes between consecutive rows of rec.
 *
 * Pixels are accessed byte by byte instead of through uint32 casts (no
 * aliasing/alignment/byte-order dependence, no signed shift into the sign
 * bit), and rows are addressed by index so no pointer before the start of
 * rec is formed.
 */
void idct_row0x20Inter(Short *blk, UChar *rec, Int lx)
{
    int32 x0, x2, x4, x6;
    int32 sum[8];
    int row, col;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *dst = rec + row * lx;

        x2 = coef[2];
        coef[2] = 0;

        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        /* 8192 rounds the final ">> 14" */
        x0 = 8192 + x2;
        x2 = 8192 - x2;
        x4 = 8192 + x6;
        x6 = 8192 - x6;

        sum[0] = x0;
        sum[1] = x4;
        sum[2] = x6;
        sum[3] = x2;
        sum[4] = x2;
        sum[5] = x6;
        sum[6] = x4;
        sum[7] = x0;

        for (col = 0; col < 8; col++)
        {
            int pix = (sum[col] >> 14) + dst[col];
            CLIP_RESULT(pix);
            dst[col] = (UChar)pix;
        }
    }
}
708 
709 /* Ignoring overflows as idct function expects and uses overflows */
710 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row0x10Inter(Short * blk,UChar * rec,Int lx)711 void idct_row0x10Inter(Short *blk, UChar *rec, Int lx)
712 {
713     int32 x1, x3, x5, x7;
714     int i = 8;
715     uint32 pred_word, dst_word;
716     int res, res2;
717 
718     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
719     rec -= lx;
720 
721     while (i--)
722     {
723         x3 = blk[3];
724         blk[3] = 0;
725         blk += 8;
726 
727         x1 = (W3 * x3 + 4) >> 3;
728         x3 = (-W5 * x3 + 4) >> 3;
729 
730         x7 = (-181 * (x3 + x1) + 128) >> 8;
731         x5 = (181 * (x3 - x1) + 128) >> 8;
732 
733         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
734         res = (8192 + x1) >> 14;
735         ADD_AND_CLIP1(res);
736         res2 = (8192 + x7) >> 14;
737         ADD_AND_CLIP2(res2);
738         dst_word = (res2 << 8) | res;
739         res = (8192 + x5) >> 14;
740         ADD_AND_CLIP3(res);
741         dst_word |= (res << 16);
742         res = (8192 + x3) >> 14;
743         ADD_AND_CLIP4(res);
744         dst_word |= (res << 24);
745         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
746 
747         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
748         res = (8192 - x3) >> 14;
749         ADD_AND_CLIP1(res);
750         res2 = (8192 - x5) >> 14;
751         ADD_AND_CLIP2(res2);
752         dst_word = (res2 << 8) | res;
753         res = (8192 - x7) >> 14;
754         ADD_AND_CLIP3(res);
755         dst_word |= (res << 16);
756         res = (8192 - x1) >> 14;
757         ADD_AND_CLIP4(res);
758         dst_word |= (res << 24);
759         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
760     }
761     return ;
762 }
763 
764 #endif /* SMALL_DCT */
765 
766 /* Ignoring overflows as idct function expects and uses overflows */
767 __attribute__((no_sanitize("signed-integer-overflow")))
idct_rowInter(Short * blk,UChar * rec,Int lx)768 void idct_rowInter(Short *blk, UChar *rec, Int lx)
769 {
770     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
771     int i = 8;
772     uint32 pred_word, dst_word;
773     int res, res2;
774 
775     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
776     rec -= lx;
777     blk -= 8;
778 
779     while (i--)
780     {
781         x1 = (int32)blk[12] << 8;
782         blk[12] = 0;
783         x2 = blk[14];
784         blk[14] = 0;
785         x3 = blk[10];
786         blk[10] = 0;
787         x4 = blk[9];
788         blk[9] = 0;
789         x5 = blk[15];
790         blk[15] = 0;
791         x6 = blk[13];
792         blk[13] = 0;
793         x7 = blk[11];
794         blk[11] = 0;
795         x0 = ((*(blk += 8)) << 8) + 8192;
796         *blk = 0;   /* for proper rounding in the fourth stage */
797 
798         /* first stage */
799         x8 = W7 * (x4 + x5) + 4;
800         x4 = (x8 + (W1 - W7) * x4) >> 3;
801         x5 = (x8 - (W1 + W7) * x5) >> 3;
802         x8 = W3 * (x6 + x7) + 4;
803         x6 = (x8 - (W3 - W5) * x6) >> 3;
804         x7 = (x8 - (W3 + W5) * x7) >> 3;
805 
806         /* second stage */
807         x8 = x0 + x1;
808         x0 -= x1;
809         x1 = W6 * (x3 + x2) + 4;
810         x2 = (x1 - (W2 + W6) * x2) >> 3;
811         x3 = (x1 + (W2 - W6) * x3) >> 3;
812         x1 = x4 + x6;
813         x4 -= x6;
814         x6 = x5 + x7;
815         x5 -= x7;
816 
817         /* third stage */
818         x7 = x8 + x3;
819         x8 -= x3;
820         x3 = x0 + x2;
821         x0 -= x2;
822         x2 = (181 * (x4 + x5) + 128) >> 8;
823         x4 = (181 * (x4 - x5) + 128) >> 8;
824 
825         /* fourth stage */
826         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
827 
828         res = (x7 + x1) >> 14;
829         ADD_AND_CLIP1(res);
830         res2 = (x3 + x2) >> 14;
831         ADD_AND_CLIP2(res2);
832         dst_word = (res2 << 8) | res;
833         res = (x0 + x4) >> 14;
834         ADD_AND_CLIP3(res);
835         dst_word |= (res << 16);
836         res = (x8 + x6) >> 14;
837         ADD_AND_CLIP4(res);
838         dst_word |= (res << 24);
839         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
840 
841         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
842 
843         res = (x8 - x6) >> 14;
844         ADD_AND_CLIP1(res);
845         res2 = (x0 - x4) >> 14;
846         ADD_AND_CLIP2(res2);
847         dst_word = (res2 << 8) | res;
848         res = (x3 - x2) >> 14;
849         ADD_AND_CLIP3(res);
850         dst_word |= (res << 16);
851         res = (x7 - x1) >> 14;
852         ADD_AND_CLIP4(res);
853         dst_word |= (res << 24);
854         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
855     }
856     return;
857 }
858 
/*
 * Row IDCT (intra) variant that does nothing.  All three arguments are only
 * marked as used to avoid unused-parameter warnings; neither the
 * coefficients nor the destination are touched.
 */
void idct_row0Intra(Short *srce, UChar *rec, Int lx)
{
    OSCL_UNUSED_ARG(srce);
    OSCL_UNUSED_ARG(rec);
    OSCL_UNUSED_ARG(lx);
}
869 
/*
 * Row IDCT for blocks where only the first coefficient of each row is read.
 * For each of the 8 rows, (blk[8*row] + 32) >> 6 is clamped to 0..255 and
 * that single value is written to all 8 bytes at rec + row*lx.  The
 * coefficient that was read is reset to 0.  Nothing is read from rec.
 *
 * blk  8x8 coefficient block, row stride 8 elements.
 * rec  first of the 8 destination rows.
 * lx   distance in bytes between consecutive rows of rec.
 *
 * The value is stored byte by byte rather than replicated into a word with
 * "tmp << 16" (which shifts a signed value into the sign bit) and written
 * through a uint32 cast (aliasing/alignment hazard).
 */
void idct_row1Intra(Short *blk, UChar *rec, Int lx)
{
    int row, col;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *dst = rec + row * lx;
        int32 level = (coef[0] + 32) >> 6;

        coef[0] = 0;
        CLIP_RESULT(level);

        for (col = 0; col < 8; col++)
        {
            dst[col] = (UChar)level;
        }
    }
}
890 
891 /* Ignoring overflows as idct function expects and uses overflows */
892 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row2Intra(Short * blk,UChar * rec,Int lx)893 void idct_row2Intra(Short *blk, UChar *rec, Int lx)
894 {
895     int32 x0, x1, x2, x4, x5;
896     int res, res2;
897     uint32 dst_word;
898     int i = 8;
899 
900     rec -= lx;
901     blk -= 8;
902     while (i--)
903     {
904         /* shortcut */
905         x4 = blk[9];
906         blk[9] = 0;
907         x0 = ((*(blk += 8)) << 8) + 8192;
908         *blk = 0;   /* for proper rounding in the fourth stage */
909 
910         /* first stage */
911         x5 = (W7 * x4 + 4) >> 3;
912         x4 = (W1 * x4 + 4) >> 3;
913 
914         /* third stage */
915         x2 = (181 * (x4 + x5) + 128) >> 8;
916         x1 = (181 * (x4 - x5) + 128) >> 8;
917 
918         /* fourth stage */
919         res = ((x0 + x4) >> 14);
920         CLIP_RESULT(res)
921         res2 = ((x0 + x2) >> 14);
922         CLIP_RESULT(res2)
923         dst_word = (res2 << 8) | res;
924         res = ((x0 + x1) >> 14);
925         CLIP_RESULT(res)
926         dst_word |= (res << 16);
927         res = ((x0 + x5) >> 14);
928         CLIP_RESULT(res)
929         dst_word |= (res << 24);
930         *((uint32*)(rec += lx)) = dst_word;
931 
932         res = ((x0 - x5) >> 14);
933         CLIP_RESULT(res)
934         res2 = ((x0 - x1) >> 14);
935         CLIP_RESULT(res2)
936         dst_word = (res2 << 8) | res;
937         res = ((x0 - x2) >> 14);
938         CLIP_RESULT(res)
939         dst_word |= (res << 16);
940         res = ((x0 - x4) >> 14);
941         CLIP_RESULT(res)
942         dst_word |= (res << 24);
943         *((uint32*)(rec + 4)) = dst_word;
944     }
945     return ;
946 }
947 
948 /* Ignoring overflows as idct function expects and uses overflows */
949 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row3Intra(Short * blk,UChar * rec,Int lx)950 void idct_row3Intra(Short *blk, UChar *rec, Int lx)
951 {
952     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
953     int res, res2;
954     uint32 dst_word;
955     int i = 8;
956 
957     rec -= lx;
958     blk -= 8;
959     while (i--)
960     {
961         x2 = blk[10];
962         blk[10] = 0;
963         x1 = blk[9];
964         blk[9] = 0;
965         x0 = ((*(blk += 8)) << 8) + 8192;
966         *blk = 0;/* for proper rounding in the fourth stage */
967         /* both upper and lower*/
968         /* both x2orx6 and x0orx4 */
969 
970         x4 = x0;
971         x6 = (W6 * x2 + 4) >> 3;
972         x2 = (W2 * x2 + 4) >> 3;
973         x8 = x0 - x2;
974         x0 += x2;
975         x2 = x8;
976         x8 = x4 - x6;
977         x4 += x6;
978         x6 = x8;
979 
980         x7 = (W7 * x1 + 4) >> 3;
981         x1 = (W1 * x1 + 4) >> 3;
982         x3 = x7;
983         x5 = (181 * (x1 - x7) + 128) >> 8;
984         x7 = (181 * (x1 + x7) + 128) >> 8;
985 
986         res = ((x0 + x1) >> 14);
987         CLIP_RESULT(res)
988         res2 = ((x4 + x7) >> 14);
989         CLIP_RESULT(res2)
990         dst_word = (res2 << 8) | res;
991         res = ((x6 + x5) >> 14);
992         CLIP_RESULT(res)
993         dst_word |= (res << 16);
994         res = ((x2 + x3) >> 14);
995         CLIP_RESULT(res)
996         dst_word |= (res << 24);
997         *((uint32*)(rec += lx)) = dst_word;
998 
999         res = ((x2 - x3) >> 14);
1000         CLIP_RESULT(res)
1001         res2 = ((x6 - x5) >> 14);
1002         CLIP_RESULT(res2)
1003         dst_word = (res2 << 8) | res;
1004         res = ((x4 - x7) >> 14);
1005         CLIP_RESULT(res)
1006         dst_word |= (res << 16);
1007         res = ((x0 - x1) >> 14);
1008         CLIP_RESULT(res)
1009         dst_word |= (res << 24);
1010         *((uint32*)(rec + 4)) = dst_word;
1011 
1012     }
1013     return ;
1014 }
1015 
1016 /* Ignoring overflows as idct function expects and uses overflows */
1017 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row4Intra(Short * blk,UChar * rec,Int lx)1018 void idct_row4Intra(Short *blk, UChar *rec, Int lx)
1019 {
1020     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1021     int res, res2;
1022     uint32 dst_word;
1023     int i = 8;
1024 
1025     rec -= lx;
1026     blk -= 8;
1027     while (i--)
1028     {
1029         x2 = blk[10];
1030         blk[10] = 0;
1031         x1 = blk[9];
1032         blk[9] = 0;
1033         x3 = blk[11];
1034         blk[11] = 0;
1035         x0 = ((*(blk += 8)) << 8) + 8192;
1036         *blk = 0; /* for proper rounding in the fourth stage */
1037 
1038         x4 = x0;
1039         x6 = (W6 * x2 + 4) >> 3;
1040         x2 = (W2 * x2 + 4) >> 3;
1041         x8 = x0 - x2;
1042         x0 += x2;
1043         x2 = x8;
1044         x8 = x4 - x6;
1045         x4 += x6;
1046         x6 = x8;
1047 
1048         x7 = (W7 * x1 + 4) >> 3;
1049         x1 = (W1 * x1 + 4) >> 3;
1050         x5 = (W3 * x3 + 4) >> 3;
1051         x3 = (- W5 * x3 + 4) >> 3;
1052         x8 = x1 - x5;
1053         x1 += x5;
1054         x5 = x8;
1055         x8 = x7 - x3;
1056         x3 += x7;
1057         x7 = (181 * (x5 + x8) + 128) >> 8;
1058         x5 = (181 * (x5 - x8) + 128) >> 8;
1059 
1060         res = ((x0 + x1) >> 14);
1061         CLIP_RESULT(res)
1062         res2 = ((x4 + x7) >> 14);
1063         CLIP_RESULT(res2)
1064         dst_word = (res2 << 8) | res;
1065         res = ((x6 + x5) >> 14);
1066         CLIP_RESULT(res)
1067         dst_word |= (res << 16);
1068         res = ((x2 + x3) >> 14);
1069         CLIP_RESULT(res)
1070         dst_word |= (res << 24);
1071         *((uint32*)(rec += lx)) = dst_word;
1072 
1073         res = ((x2 - x3) >> 14);
1074         CLIP_RESULT(res)
1075         res2 = ((x6 - x5) >> 14);
1076         CLIP_RESULT(res2)
1077         dst_word = (res2 << 8) | res;
1078         res = ((x4 - x7) >> 14);
1079         CLIP_RESULT(res)
1080         dst_word |= (res << 16);
1081         res = ((x0 - x1) >> 14);
1082         CLIP_RESULT(res)
1083         dst_word |= (res << 24);
1084         *((uint32*)(rec + 4)) = dst_word;
1085     }
1086 
1087     return ;
1088 }
1089 
1090 #ifndef SMALL_DCT
1091 /* Ignoring overflows as idct function expects and uses overflows */
1092 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row0x40Intra(Short * blk,UChar * rec,Int lx)1093 void idct_row0x40Intra(Short *blk, UChar *rec, Int lx)
1094 {
1095     int32  x1, x2, x4, x5;
1096     int res, res2;
1097     uint32 dst_word;
1098     int i = 8;
1099 
1100     rec -= lx;
1101 
1102     while (i--)
1103     {
1104         /* shortcut */
1105         x4 = blk[1];
1106         blk[1] = 0;
1107         blk += 8;
1108 
1109         /* first stage */
1110         x5 = (W7 * x4 + 4) >> 3;
1111         x4 = (W1 * x4 + 4) >> 3;
1112 
1113         /* third stage */
1114         x2 = (181 * (x4 + x5) + 128) >> 8;
1115         x1 = (181 * (x4 - x5) + 128) >> 8;
1116 
1117         /* fourth stage */
1118         res = ((8192 + x4) >> 14);
1119         CLIP_RESULT(res)
1120         res2 = ((8192 + x2) >> 14);
1121         CLIP_RESULT(res2)
1122         dst_word = (res2 << 8) | res;
1123         res = ((8192 + x1) >> 14);
1124         CLIP_RESULT(res)
1125         dst_word |= (res << 16);
1126         res = ((8192 + x5) >> 14);
1127         CLIP_RESULT(res)
1128         dst_word |= (res << 24);
1129         *((uint32*)(rec += lx)) = dst_word;
1130 
1131         res = ((8192 - x5) >> 14);
1132         CLIP_RESULT(res)
1133         res2 = ((8192 - x1) >> 14);
1134         CLIP_RESULT(res2)
1135         dst_word = (res2 << 8) | res;
1136         res = ((8192 - x2) >> 14);
1137         CLIP_RESULT(res)
1138         dst_word |= (res << 16);
1139         res = ((8192 - x4) >> 14);
1140         CLIP_RESULT(res)
1141         dst_word |= (res << 24);
1142         *((uint32*)(rec + 4)) = dst_word;
1143 
1144     }
1145     return ;
1146 }
1147 
/*
 * idct_row0x20Intra: row pass of the inverse DCT for an intra block in which
 * only coefficient 2 of each row is read; every other coefficient of the row
 * (including DC) is treated as zero, leaving just the rounding term 8192.
 *
 *   blk  coefficient block, 8 rows of 8 Shorts; coefficient 2 of each row is
 *        reset to 0 after it is read.
 *   rec  output pixels; row r is written at rec + r * lx with two uint32
 *        stores, so those addresses must be valid for uint32 access.
 *   lx   output row stride, in UChar elements.
 *
 * The second group of four pixels is the mirror image of the first.
 * Results go through CLIP_RESULT (defined elsewhere; presumably a clamp to
 * 0..255) and are packed with pixel 0 in the least-significant byte.
 */
void idct_row0x20Intra(Short *blk, UChar *rec, Int lx)
{
    int32 x0, x2, x4, x6;
    int res, res2;
    uint32 dst_word;
    int i;

    for (i = 0; i < 8; i++)
    {
        /* address each row directly: no pointer outside the buffers is formed */
        Short *coef = blk + 8 * i;
        UChar *dst = rec + i * lx;

        x2 = coef[2];
        coef[2] = 0;

        /* both upper and lower*/
        /* both x2orx6 and x0orx4 */
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x0 = 8192 + x2;
        x2 = 8192 - x2;
        x4 = 8192 + x6;
        x6 = 8192 - x6;

        /* pixels 0..3; cast before shifting so bit 31 is never reached
           through a signed int */
        res = ((x0) >> 14);
        CLIP_RESULT(res)
        res2 = ((x4) >> 14);
        CLIP_RESULT(res2)
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = ((x6) >> 14);
        CLIP_RESULT(res)
        dst_word |= ((uint32)res << 16);
        res = ((x2) >> 14);
        CLIP_RESULT(res)
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst) = dst_word;

        /* pixels 4..7 */
        res = ((x2) >> 14);
        CLIP_RESULT(res)
        res2 = ((x6) >> 14);
        CLIP_RESULT(res2)
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = ((x4) >> 14);
        CLIP_RESULT(res)
        dst_word |= ((uint32)res << 16);
        res = ((x0) >> 14);
        CLIP_RESULT(res)
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst + 4)) = dst_word;
    }
    return ;
}
1200 
1201 /* Ignoring overflows as idct function expects and uses overflows */
1202 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row0x10Intra(Short * blk,UChar * rec,Int lx)1203 void idct_row0x10Intra(Short *blk, UChar *rec, Int lx)
1204 {
1205     int32 x1, x3, x5, x7;
1206     int res, res2;
1207     uint32 dst_word;
1208     int i = 8;
1209 
1210     rec -= lx;
1211     while (i--)
1212     {
1213         x3 = blk[3];
1214         blk[3] = 0 ;
1215         blk += 8;
1216 
1217         x1 = (W3 * x3 + 4) >> 3;
1218         x3 = (W5 * x3 + 4) >> 3;
1219 
1220         x7 = (181 * (x3 - x1) + 128) >> 8;
1221         x5 = (-181 * (x1 + x3) + 128) >> 8;
1222 
1223         res = ((8192 + x1) >> 14);
1224         CLIP_RESULT(res)
1225         res2 = ((8192 + x7) >> 14);
1226         CLIP_RESULT(res2)
1227         dst_word = (res2 << 8) | res;
1228         res = ((8192 + x5) >> 14);
1229         CLIP_RESULT(res)
1230         dst_word |= (res << 16);
1231         res = ((8192 - x3) >> 14);
1232         CLIP_RESULT(res)
1233         dst_word |= (res << 24);
1234         *((uint32*)(rec += lx)) = dst_word;
1235 
1236         res = ((8192 + x3) >> 14);
1237         CLIP_RESULT(res)
1238         res2 = ((8192 - x5) >> 14);
1239         CLIP_RESULT(res2)
1240         dst_word = (res2 << 8) | res;
1241         res = ((8192 - x7) >> 14);
1242         CLIP_RESULT(res)
1243         dst_word |= (res << 16);
1244         res = ((8192 - x1) >> 14);
1245         CLIP_RESULT(res)
1246         dst_word |= (res << 24);
1247         *((uint32*)(rec + 4)) = dst_word;
1248 
1249     }
1250 
1251     return ;
1252 }
1253 
1254 #endif /* SMALL_DCT */
1255 /* Ignoring overflows as idct function expects and uses overflows */
1256 __attribute__((no_sanitize("signed-integer-overflow")))
idct_rowIntra(Short * blk,UChar * rec,Int lx)1257 void idct_rowIntra(Short *blk, UChar *rec, Int lx)
1258 {
1259     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1260     int i = 8;
1261     int res, res2;
1262     uint32 dst_word;
1263 
1264     blk -= 8;
1265     rec -= lx;
1266 
1267     while (i--)
1268     {
1269         x1 = (int32)blk[12] << 8;
1270         blk[12] = 0;
1271         x2 = blk[14];
1272         blk[14] = 0;
1273         x3 = blk[10];
1274         blk[10] = 0;
1275         x4 = blk[9];
1276         blk[9] = 0;
1277         x5 = blk[15];
1278         blk[15] = 0;
1279         x6 = blk[13];
1280         blk[13] = 0;
1281         x7 = blk[11];
1282         blk[11] = 0;
1283         x0 = ((*(blk += 8)) << 8) + 8192;
1284         *blk = 0;  /* for proper rounding in the fourth stage */
1285 
1286         /* first stage */
1287         x8 = W7 * (x4 + x5) + 4;
1288         x4 = (x8 + (W1 - W7) * x4) >> 3;
1289         x5 = (x8 - (W1 + W7) * x5) >> 3;
1290         x8 = W3 * (x6 + x7) + 4;
1291         x6 = (x8 - (W3 - W5) * x6) >> 3;
1292         x7 = (x8 - (W3 + W5) * x7) >> 3;
1293 
1294         /* second stage */
1295         x8 = x0 + x1;
1296         x0 -= x1;
1297         x1 = W6 * (x3 + x2) + 4;
1298         x2 = (x1 - (W2 + W6) * x2) >> 3;
1299         x3 = (x1 + (W2 - W6) * x3) >> 3;
1300         x1 = x4 + x6;
1301         x4 -= x6;
1302         x6 = x5 + x7;
1303         x5 -= x7;
1304 
1305         /* third stage */
1306         x7 = x8 + x3;
1307         x8 -= x3;
1308         x3 = x0 + x2;
1309         x0 -= x2;
1310         x2 = (181 * (x4 + x5) + 128) >> 8;
1311         x4 = (181 * (x4 - x5) + 128) >> 8;
1312 
1313         /* fourth stage */
1314         res = ((x7 + x1) >> 14);
1315         CLIP_RESULT(res)
1316         res2 = ((x3 + x2) >> 14);
1317         CLIP_RESULT(res2)
1318         dst_word = res | (res2 << 8);
1319         res = ((x0 + x4) >> 14);
1320         CLIP_RESULT(res)
1321         dst_word |= (res << 16);
1322         res = ((x8 + x6) >> 14);
1323         CLIP_RESULT(res)
1324         dst_word |= (res << 24);
1325         *((uint32*)(rec += lx)) = dst_word;
1326 
1327         res = ((x8 - x6) >> 14);
1328         CLIP_RESULT(res)
1329         res2 = ((x0 - x4) >> 14);
1330         CLIP_RESULT(res2)
1331         dst_word = res | (res2 << 8);
1332         res = ((x3 - x2) >> 14);
1333         CLIP_RESULT(res)
1334         dst_word |= (res << 16);
1335         res = ((x7 - x1) >> 14);
1336         CLIP_RESULT(res)
1337         dst_word |= (res << 24);
1338         *((uint32*)(rec + 4)) = dst_word;
1339     }
1340     return;
1341 }
1342 
1343 
1344 /* This function should not be called at all ****/
idct_row0zmv(Short * srce,UChar * rec,UChar * pred,Int lx)1345 void idct_row0zmv(Short *srce, UChar *rec, UChar *pred, Int lx)
1346 {
1347     OSCL_UNUSED_ARG(srce);
1348     OSCL_UNUSED_ARG(rec);
1349     OSCL_UNUSED_ARG(pred);
1350     OSCL_UNUSED_ARG(lx);
1351 
1352     return;
1353 }
1354 
/*
 * idct_row1zmv: row pass for a block in which only coefficient 0 of each row
 * is read. The rounded value (coef + 32) >> 6 is added to all eight
 * prediction pixels of the row and the sums are stored to rec.
 *
 *   blk   coefficient block, 8 rows of 8 Shorts; coefficient 0 of each row
 *         is reset to 0 after it is read.
 *   rec   output pixels; row r is written at rec + r * lx with two uint32
 *         stores.
 *   pred  prediction pixels; row r is read at pred + 16 * r with two uint32
 *         loads (row stride fixed at 16).
 *   lx    output row stride, in UChar elements.
 *
 * rec and pred rows must be valid for uint32 access. Bytes are extracted
 * from and packed into the words with pixel 0 in the least-significant byte.
 * Sums go through CLIP_RESULT (defined elsewhere; presumably a clamp to
 * 0..255).
 */
void idct_row1zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
{
    int tmp;
    int i;
    uint32 pred_word, dst_word;
    int res, res2;

    for (i = 0; i < 8; i++)
    {
        /* address each row directly: no pointer outside the buffers is formed */
        Short *coef = blk + 8 * i;
        UChar *src = pred + 16 * i;
        UChar *dst = rec + i * lx;

        tmp = (coef[0] + 32) >> 6;
        coef[0] = 0;

        pred_word = *((uint32*)src); /* read 4 bytes from pred */
        res = tmp + (pred_word & 0xFF);
        CLIP_RESULT(res);
        res2 = tmp + ((pred_word >> 8) & 0xFF);
        CLIP_RESULT(res2);
        /* cast before shifting so bit 31 is never reached through a signed int */
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = tmp + ((pred_word >> 16) & 0xFF);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = tmp + ((pred_word >> 24) & 0xFF);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst) = dst_word; /* save 4 bytes to dst */

        pred_word = *((uint32*)(src + 4)); /* read 4 bytes from pred */
        res = tmp + (pred_word & 0xFF);
        CLIP_RESULT(res);
        res2 = tmp + ((pred_word >> 8) & 0xFF);
        CLIP_RESULT(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = tmp + ((pred_word >> 16) & 0xFF);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 16);
        res = tmp + ((pred_word >> 24) & 0xFF);
        CLIP_RESULT(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst + 4)) = dst_word; /* save 4 bytes to dst */
    }
    return;
}
1402 
1403 /* Ignoring overflows as idct function expects and uses overflows */
1404 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row2zmv(Short * blk,UChar * rec,UChar * pred,Int lx)1405 void idct_row2zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1406 {
1407     int32 x0, x1, x2, x4, x5;
1408     int i = 8;
1409     uint32 pred_word, dst_word;
1410     int res, res2;
1411 
1412     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1413     rec -= lx;
1414     pred -= 16;
1415     blk -= 8;
1416 
1417     while (i--)
1418     {
1419         /* shortcut */
1420         x4 = blk[9];
1421         blk[9] = 0;
1422         x0 = ((*(blk += 8)) << 8) + 8192;
1423         *blk = 0;  /* for proper rounding in the fourth stage */
1424 
1425         /* first stage */
1426         x5 = (W7 * x4 + 4) >> 3;
1427         x4 = (W1 * x4 + 4) >> 3;
1428 
1429         /* third stage */
1430         x2 = (181 * (x4 + x5) + 128) >> 8;
1431         x1 = (181 * (x4 - x5) + 128) >> 8;
1432 
1433         /* fourth stage */
1434         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1435         res = (x0 + x4) >> 14;
1436         ADD_AND_CLIP1(res);
1437         res2 = (x0 + x2) >> 14;
1438         ADD_AND_CLIP2(res2);
1439         dst_word = (res2 << 8) | res;
1440         res = (x0 + x1) >> 14;
1441         ADD_AND_CLIP3(res);
1442         dst_word |= (res << 16);
1443         res = (x0 + x5) >> 14;
1444         ADD_AND_CLIP4(res);
1445         dst_word |= (res << 24);
1446         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1447 
1448         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1449         res = (x0 - x5) >> 14;
1450         ADD_AND_CLIP1(res);
1451         res2 = (x0 - x1) >> 14;
1452         ADD_AND_CLIP2(res2);
1453         dst_word = (res2 << 8) | res;
1454         res = (x0 - x2) >> 14;
1455         ADD_AND_CLIP3(res);
1456         dst_word |= (res << 16);
1457         res = (x0 - x4) >> 14;
1458         ADD_AND_CLIP4(res);
1459         dst_word |= (res << 24);
1460         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1461     }
1462     return ;
1463 }
1464 
1465 /* Ignoring overflows as idct function expects and uses overflows */
1466 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row3zmv(Short * blk,UChar * rec,UChar * pred,Int lx)1467 void idct_row3zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1468 {
1469     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1470     int i = 8;
1471     uint32 pred_word, dst_word;
1472     int res, res2;
1473 
1474     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1475     rec -= lx;
1476     pred -= 16;
1477     blk -= 8;
1478 
1479     while (i--)
1480     {
1481         x2 = blk[10];
1482         blk[10] = 0;
1483         x1 = blk[9];
1484         blk[9] = 0;
1485         x0 = ((*(blk += 8)) << 8) + 8192;
1486         *blk = 0;  /* for proper rounding in the fourth stage */
1487         /* both upper and lower*/
1488         /* both x2orx6 and x0orx4 */
1489 
1490         x4 = x0;
1491         x6 = (W6 * x2 + 4) >> 3;
1492         x2 = (W2 * x2 + 4) >> 3;
1493         x8 = x0 - x2;
1494         x0 += x2;
1495         x2 = x8;
1496         x8 = x4 - x6;
1497         x4 += x6;
1498         x6 = x8;
1499 
1500         x7 = (W7 * x1 + 4) >> 3;
1501         x1 = (W1 * x1 + 4) >> 3;
1502         x3 = x7;
1503         x5 = (181 * (x1 - x7) + 128) >> 8;
1504         x7 = (181 * (x1 + x7) + 128) >> 8;
1505 
1506         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1507         res = (x0 + x1) >> 14;
1508         ADD_AND_CLIP1(res);
1509         res2 = (x4 + x7) >> 14;
1510         ADD_AND_CLIP2(res2);
1511         dst_word = (res2 << 8) | res;
1512         res = (x6 + x5) >> 14;
1513         ADD_AND_CLIP3(res);
1514         dst_word |= (res << 16);
1515         res = (x2 + x3) >> 14;
1516         ADD_AND_CLIP4(res);
1517         dst_word |= (res << 24);
1518         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1519 
1520         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1521         res = (x2 - x3) >> 14;
1522         ADD_AND_CLIP1(res);
1523         res2 = (x6 - x5) >> 14;
1524         ADD_AND_CLIP2(res2);
1525         dst_word = (res2 << 8) | res;
1526         res = (x4 - x7) >> 14;
1527         ADD_AND_CLIP3(res);
1528         dst_word |= (res << 16);
1529         res = (x0 - x1) >> 14;
1530         ADD_AND_CLIP4(res);
1531         dst_word |= (res << 24);
1532         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1533     }
1534 
1535     return ;
1536 }
1537 
1538 /* Ignoring overflows as idct function expects and uses overflows */
1539 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row4zmv(Short * blk,UChar * rec,UChar * pred,Int lx)1540 void idct_row4zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1541 {
1542     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1543     int i = 8;
1544     uint32 pred_word, dst_word;
1545     int res, res2;
1546 
1547     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1548     rec -= lx;
1549     pred -= 16;
1550     blk -= 8;
1551 
1552     while (i--)
1553     {
1554         x2 = blk[10];
1555         blk[10] = 0;
1556         x1 = blk[9];
1557         blk[9] = 0;
1558         x3 = blk[11];
1559         blk[11] = 0;
1560         x0 = ((*(blk += 8)) << 8) + 8192;
1561         *blk = 0;   /* for proper rounding in the fourth stage */
1562 
1563         x4 = x0;
1564         x6 = (W6 * x2 + 4) >> 3;
1565         x2 = (W2 * x2 + 4) >> 3;
1566         x8 = x0 - x2;
1567         x0 += x2;
1568         x2 = x8;
1569         x8 = x4 - x6;
1570         x4 += x6;
1571         x6 = x8;
1572 
1573         x7 = (W7 * x1 + 4) >> 3;
1574         x1 = (W1 * x1 + 4) >> 3;
1575         x5 = (W3 * x3 + 4) >> 3;
1576         x3 = (- W5 * x3 + 4) >> 3;
1577         x8 = x1 - x5;
1578         x1 += x5;
1579         x5 = x8;
1580         x8 = x7 - x3;
1581         x3 += x7;
1582         x7 = (181 * (x5 + x8) + 128) >> 8;
1583         x5 = (181 * (x5 - x8) + 128) >> 8;
1584 
1585         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1586         res = (x0 + x1) >> 14;
1587         ADD_AND_CLIP1(res);
1588         res2 = (x4 + x7) >> 14;
1589         ADD_AND_CLIP2(res2);
1590         dst_word = (res2 << 8) | res;
1591         res = (x6 + x5) >> 14;
1592         ADD_AND_CLIP3(res);
1593         dst_word |= (res << 16);
1594         res = (x2 + x3) >> 14;
1595         ADD_AND_CLIP4(res);
1596         dst_word |= (res << 24);
1597         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1598 
1599         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1600         res = (x2 - x3) >> 14;
1601         ADD_AND_CLIP1(res);
1602         res2 = (x6 - x5) >> 14;
1603         ADD_AND_CLIP2(res2);
1604         dst_word = (res2 << 8) | res;
1605         res = (x4 - x7) >> 14;
1606         ADD_AND_CLIP3(res);
1607         dst_word |= (res << 16);
1608         res = (x0 - x1) >> 14;
1609         ADD_AND_CLIP4(res);
1610         dst_word |= (res << 24);
1611         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1612     }
1613     return ;
1614 }
1615 
1616 #ifndef SMALL_DCT
1617 /* Ignoring overflows as idct function expects and uses overflows */
1618 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row0x40zmv(Short * blk,UChar * rec,UChar * pred,Int lx)1619 void idct_row0x40zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1620 {
1621     int32 x1, x2, x4, x5;
1622     int i = 8;
1623     uint32 pred_word, dst_word;
1624     int res, res2;
1625 
1626     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1627     rec -= lx;
1628     pred -= 16;
1629 
1630     while (i--)
1631     {
1632         /* shortcut */
1633         x4 = blk[1];
1634         blk[1] = 0;
1635         blk += 8;  /* for proper rounding in the fourth stage */
1636 
1637         /* first stage */
1638         x5 = (W7 * x4 + 4) >> 3;
1639         x4 = (W1 * x4 + 4) >> 3;
1640 
1641         /* third stage */
1642         x2 = (181 * (x4 + x5) + 128) >> 8;
1643         x1 = (181 * (x4 - x5) + 128) >> 8;
1644 
1645         /* fourth stage */
1646         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1647         res = (8192 + x4) >> 14;
1648         ADD_AND_CLIP1(res);
1649         res2 = (8192 + x2) >> 14;
1650         ADD_AND_CLIP2(res2);
1651         dst_word = (res2 << 8) | res;
1652         res = (8192 + x1) >> 14;
1653         ADD_AND_CLIP3(res);
1654         dst_word |= (res << 16);
1655         res = (8192 + x5) >> 14;
1656         ADD_AND_CLIP4(res);
1657         dst_word |= (res << 24);
1658         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1659 
1660         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1661         res = (8192 - x5) >> 14;
1662         ADD_AND_CLIP1(res);
1663         res2 = (8192 - x1) >> 14;
1664         ADD_AND_CLIP2(res2);
1665         dst_word = (res2 << 8) | res;
1666         res = (8192 - x2) >> 14;
1667         ADD_AND_CLIP3(res);
1668         dst_word |= (res << 16);
1669         res = (8192 - x4) >> 14;
1670         ADD_AND_CLIP4(res);
1671         dst_word |= (res << 24);
1672         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1673     }
1674     return ;
1675 }
1676 
/*
 * idct_row0x20zmv: row pass of the inverse DCT combined with prediction add
 * for a block in which only coefficient 2 of each row is read; every other
 * coefficient of the row (including DC) is treated as zero, leaving just the
 * rounding term 8192.
 *
 *   blk   coefficient block, 8 rows of 8 Shorts; coefficient 2 of each row
 *         is reset to 0 after it is read.
 *   rec   output pixels; row r is written at rec + r * lx with two uint32
 *         stores.
 *   pred  prediction pixels; row r is read at pred + 16 * r with two uint32
 *         loads (row stride fixed at 16).
 *   lx    output row stride, in UChar elements.
 *
 * rec and pred rows must be valid for uint32 access. ADD_AND_CLIP1..4 are
 * defined elsewhere; presumably they add byte 0..3 of pred_word to their
 * argument and clamp to 0..255 -- verify against the macro definitions.
 */
void idct_row0x20zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
{
    int32 x0, x2, x4, x6;
    int i;
    uint32 pred_word, dst_word;
    int res, res2;

    for (i = 0; i < 8; i++)
    {
        /* address each row directly: no pointer outside the buffers is formed */
        Short *coef = blk + 8 * i;
        UChar *src = pred + 16 * i;
        UChar *dst = rec + i * lx;

        x2 = coef[2];
        coef[2] = 0;

        /* both upper and lower*/
        /* both x2orx6 and x0orx4 */
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        /* 8192 is the rounding term for the final >> 14 */
        x0 = 8192 + x2;
        x2 = 8192 - x2;
        x4 = 8192 + x6;
        x6 = 8192 - x6;

        /* cast before shifting so bit 31 is never reached through a signed int */
        pred_word = *((uint32*)src); /* read 4 bytes from pred */
        res = (x0) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x4) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x6) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x2) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst) = dst_word; /* save 4 bytes to dst */

        pred_word = *((uint32*)(src + 4)); /* read 4 bytes from pred */
        res = (x2) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x6) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x4) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst + 4)) = dst_word; /* save 4 bytes to dst */
    }

    return ;
}
1733 
1734 /* Ignoring overflows as idct function expects and uses overflows */
1735 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row0x10zmv(Short * blk,UChar * rec,UChar * pred,Int lx)1736 void idct_row0x10zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1737 {
1738     int32 x1, x3, x5, x7;
1739     int i = 8;
1740     uint32 pred_word, dst_word;
1741     int res, res2;
1742 
1743     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1744     rec -= lx;
1745     pred -= 16;
1746 
1747     while (i--)
1748     {
1749         x3 = blk[3];
1750         blk[3] = 0;
1751         blk += 8;
1752 
1753         x1 = (W3 * x3 + 4) >> 3;
1754         x3 = (-W5 * x3 + 4) >> 3;
1755 
1756         x7 = (-181 * (x3 + x1) + 128) >> 8;
1757         x5 = (181 * (x3 - x1) + 128) >> 8;
1758 
1759         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1760         res = (8192 + x1) >> 14;
1761         ADD_AND_CLIP1(res);
1762         res2 = (8192 + x7) >> 14;
1763         ADD_AND_CLIP2(res2);
1764         dst_word = (res2 << 8) | res;
1765         res = (8192 + x5) >> 14;
1766         ADD_AND_CLIP3(res);
1767         dst_word |= (res << 16);
1768         res = (8192 + x3) >> 14;
1769         ADD_AND_CLIP4(res);
1770         dst_word |= (res << 24);
1771         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1772 
1773         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1774         res = (8192 - x3) >> 14;
1775         ADD_AND_CLIP1(res);
1776         res2 = (8192 - x5) >> 14;
1777         ADD_AND_CLIP2(res2);
1778         dst_word = (res2 << 8) | res;
1779         res = (8192 - x7) >> 14;
1780         ADD_AND_CLIP3(res);
1781         dst_word |= (res << 16);
1782         res = (8192 - x1) >> 14;
1783         ADD_AND_CLIP4(res);
1784         dst_word |= (res << 24);
1785         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1786     }
1787     return ;
1788 }
1789 
1790 #endif /* SMALL_DCT */
1791 
1792 /* Ignoring overflows as idct function expects and uses overflows */
1793 __attribute__((no_sanitize("signed-integer-overflow")))
idct_rowzmv(Short * blk,UChar * rec,UChar * pred,Int lx)1794 void idct_rowzmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1795 {
1796     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1797     int i = 8;
1798     uint32 pred_word, dst_word;
1799     int res, res2;
1800 
1801     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1802     rec -= lx;
1803     pred -= 16;
1804     blk -= 8;
1805 
1806     while (i--)
1807     {
1808         x1 = (int32)blk[12] << 8;
1809         blk[12] = 0;
1810         x2 = blk[14];
1811         blk[14] = 0;
1812         x3 = blk[10];
1813         blk[10] = 0;
1814         x4 = blk[9];
1815         blk[9] = 0;
1816         x5 = blk[15];
1817         blk[15] = 0;
1818         x6 = blk[13];
1819         blk[13] = 0;
1820         x7 = blk[11];
1821         blk[11] = 0;
1822         x0 = ((*(blk += 8)) << 8) + 8192;
1823         *blk = 0;   /* for proper rounding in the fourth stage */
1824 
1825         /* first stage */
1826         x8 = W7 * (x4 + x5) + 4;
1827         x4 = (x8 + (W1 - W7) * x4) >> 3;
1828         x5 = (x8 - (W1 + W7) * x5) >> 3;
1829         x8 = W3 * (x6 + x7) + 4;
1830         x6 = (x8 - (W3 - W5) * x6) >> 3;
1831         x7 = (x8 - (W3 + W5) * x7) >> 3;
1832 
1833         /* second stage */
1834         x8 = x0 + x1;
1835         x0 -= x1;
1836         x1 = W6 * (x3 + x2) + 4;
1837         x2 = (x1 - (W2 + W6) * x2) >> 3;
1838         x3 = (x1 + (W2 - W6) * x3) >> 3;
1839         x1 = x4 + x6;
1840         x4 -= x6;
1841         x6 = x5 + x7;
1842         x5 -= x7;
1843 
1844         /* third stage */
1845         x7 = x8 + x3;
1846         x8 -= x3;
1847         x3 = x0 + x2;
1848         x0 -= x2;
1849         x2 = (181 * (x4 + x5) + 128) >> 8;
1850         x4 = (181 * (x4 - x5) + 128) >> 8;
1851 
1852         /* fourth stage */
1853         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1854 
1855         res = (x7 + x1) >> 14;
1856         ADD_AND_CLIP1(res);
1857         res2 = (x3 + x2) >> 14;
1858         ADD_AND_CLIP2(res2);
1859         dst_word = (res2 << 8) | res;
1860         res = (x0 + x4) >> 14;
1861         ADD_AND_CLIP3(res);
1862         dst_word |= (res << 16);
1863         res = (x8 + x6) >> 14;
1864         ADD_AND_CLIP4(res);
1865         dst_word |= (res << 24);
1866         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1867 
1868         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1869 
1870         res = (x8 - x6) >> 14;
1871         ADD_AND_CLIP1(res);
1872         res2 = (x0 - x4) >> 14;
1873         ADD_AND_CLIP2(res2);
1874         dst_word = (res2 << 8) | res;
1875         res = (x3 - x2) >> 14;
1876         ADD_AND_CLIP3(res);
1877         dst_word |= (res << 16);
1878         res = (x7 - x1) >> 14;
1879         ADD_AND_CLIP4(res);
1880         dst_word |= (res << 24);
1881         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1882     }
1883     return;
1884 }
1885 
1886 /*----------------------------------------------------------------------------
1887 ;  End Function: idctcol
1888 ----------------------------------------------------------------------------*/
1889 /* ======================================================================== */
1890 /*  Function : BlockIDCTMotionComp                                              */
1891 /*  Date     : 10/16/2000                                                   */
1892 /*  Purpose  : fast IDCT routine                                    */
1893 /*  In/out   :                                                              */
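/*      Short *block      dequantized coefficients (input)
        UChar *bitmapcol,
        UChar bitmaprow   non-zero coefficient bitmaps: bitmaprow == 0 marks an
                          all-zero block; bitmaprow == 0x80 together with
                          bitmapcol[0] == 0x80 marks a DC-only block
        Int dctMode       0 = all-zero block, 1 = DC-only block
        UChar *rec        reconstructed output block, row stride lx_intra >> 1
        UChar *pred       prediction block, row stride 16 (used when not intra)
        Int lx_intra      bit 0 = intra flag, remaining bits = output stride
        Output values are clipped to 0..255.                                */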
1897 /*  Modified :   7/31/01, add checking for all-zero and DC-only block.  */
1898 /*              do 8 columns at a time                                      */
1899 /*               8/2/01, do column first then row-IDCT.                 */
1900 /*               8/2/01, remove clipping (included in motion comp).     */
1901 /*               8/7/01, combine with motion comp.                      */
1902 /*               8/8/01, use AAN IDCT                                       */
1903 /*               9/4/05, use Chen's IDCT and 16 bit block                   */
1904 /* ======================================================================== */
BlockIDCTMotionComp(Short * block,UChar * bitmapcol,UChar bitmaprow,Int dctMode,UChar * rec,UChar * pred,Int lx_intra)1905 void BlockIDCTMotionComp(Short *block, UChar *bitmapcol, UChar bitmaprow,
1906                          Int dctMode, UChar *rec, UChar *pred, Int lx_intra)
1907 {
1908     Int i;
1909     Int tmp, tmp2;
1910     ULong tmp4;
1911     Int bmap;
1912     Short *ptr = block;
1913     UChar *endcol;
1914     UInt mask = 0xFF;
1915     Int lx = lx_intra >> 1;
1916     Int intra = (lx_intra & 1);
1917 
1918     /*  all-zero block */
1919     if (dctMode == 0 || bitmaprow == 0)
1920     {
1921         if (intra)
1922         {
1923             *((ULong*)rec) = *((ULong*)(rec + 4)) = 0;
1924             *((ULong*)(rec += lx)) = 0;
1925             *((ULong*)(rec + 4)) = 0;
1926             *((ULong*)(rec += lx)) = 0;
1927             *((ULong*)(rec + 4)) = 0;
1928             *((ULong*)(rec += lx)) = 0;
1929             *((ULong*)(rec + 4)) = 0;
1930             *((ULong*)(rec += lx)) = 0;
1931             *((ULong*)(rec + 4)) = 0;
1932             *((ULong*)(rec += lx)) = 0;
1933             *((ULong*)(rec + 4)) = 0;
1934             *((ULong*)(rec += lx)) = 0;
1935             *((ULong*)(rec + 4)) = 0;
1936             *((ULong*)(rec += lx)) = 0;
1937             *((ULong*)(rec + 4)) = 0;
1938             return ;
1939         }
1940         else /* copy from previous frame */
1941         {
1942             *((ULong*)rec) = *((ULong*)pred);
1943             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1944             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1945             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1946             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1947             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1948             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1949             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1950             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1951             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1952             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1953             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1954             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1955             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1956             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1957             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1958             return ;
1959         }
1960     }
1961 
1962     /* Test for DC only block */
1963     if (dctMode == 1 || (bitmaprow == 0x80 && bitmapcol[0] == 0x80))
1964     {
1965         i = ((block[0] << 3) + 32) >> 6;
1966         block[0] = 0;
1967         if (intra)
1968         {
1969             if ((UInt)i > mask) i = mask & (~(i >> 31));
1970 
1971             tmp = i | (i << 8);
1972             tmp |= (tmp << 16);
1973 
1974             *((ULong*)rec) = *((ULong*)(rec + 4)) = tmp;
1975             *((ULong*)(rec += lx)) = tmp;
1976             *((ULong*)(rec + 4)) = tmp;
1977             *((ULong*)(rec += lx)) = tmp;
1978             *((ULong*)(rec + 4)) = tmp;
1979             *((ULong*)(rec += lx)) = tmp;
1980             *((ULong*)(rec + 4)) = tmp;
1981             *((ULong*)(rec += lx)) = tmp;
1982             *((ULong*)(rec + 4)) = tmp;
1983             *((ULong*)(rec += lx)) = tmp;
1984             *((ULong*)(rec + 4)) = tmp;
1985             *((ULong*)(rec += lx)) = tmp;
1986             *((ULong*)(rec + 4)) = tmp;
1987             *((ULong*)(rec += lx)) = tmp;
1988             *((ULong*)(rec + 4)) = tmp;
1989 
1990             return ;
1991         }
1992         else
1993         {
1994             endcol = rec + (lx << 3);
1995             do
1996             {
1997                 tmp4 = *((ULong*)pred);
1998                 tmp2 = tmp4 & 0xFF;
1999                 tmp2 += i;
2000                 if ((UInt)tmp2 > mask) tmp2 = mask & (~(tmp2 >> 31));
2001                 tmp = (tmp4 >> 8) & 0xFF;
2002                 tmp += i;
2003                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
2004                 tmp2 |= (tmp << 8);
2005                 tmp = (tmp4 >> 16) & 0xFF;
2006                 tmp += i;
2007                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
2008                 tmp2 |= (tmp << 16);
2009                 tmp = (tmp4 >> 24) & 0xFF;
2010                 tmp += i;
2011                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
2012                 tmp2 |= (tmp << 24);
2013                 *((ULong*)rec) = tmp2;
2014 
2015                 tmp4 = *((ULong*)(pred + 4));
2016                 tmp2 = tmp4 & 0xFF;
2017                 tmp2 += i;
2018                 if ((UInt)tmp2 > mask) tmp2 = mask & (~(tmp2 >> 31));
2019                 tmp = (tmp4 >> 8) & 0xFF;
2020                 tmp += i;
2021                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
2022                 tmp2 |= (tmp << 8);
2023                 tmp = (tmp4 >> 16) & 0xFF;
2024                 tmp += i;
2025                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
2026                 tmp2 |= (tmp << 16);
2027                 tmp = (tmp4 >> 24) & 0xFF;
2028                 tmp += i;
2029                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
2030                 tmp2 |= (tmp << 24);
2031                 *((ULong*)(rec + 4)) = tmp2;
2032 
2033                 rec += lx;
2034                 pred += 16;
2035             }
2036             while (rec < endcol);
2037             return ;
2038         }
2039     }
2040 
2041     for (i = 0; i < dctMode; i++)
2042     {
2043         bmap = (Int)bitmapcol[i];
2044         if (bmap)
2045         {
2046             if ((bmap&0xf) == 0)
2047                 (*(idctcolVCA[bmap>>4]))(ptr);
2048             else
2049                 idct_col(ptr);
2050         }
2051         ptr++;
2052     }
2053 
2054     if ((bitmaprow&0xf) == 0)
2055     {
2056         if (intra)
2057             (*(idctrowVCAIntra[(Int)(bitmaprow>>4)]))(block, rec, lx);
2058         else
2059             (*(idctrowVCAzmv[(Int)(bitmaprow>>4)]))(block, rec, pred, lx);
2060     }
2061     else
2062     {
2063         if (intra)
2064             idct_rowIntra(block, rec, lx);
2065         else
2066             idct_rowzmv(block, rec, pred, lx);
2067     }
2068 }
2069