src/3rdparty/libjpeg/jidctint.c
changeset 30 5dc02b23752f
parent 0 1918ee327afb
equal deleted inserted replaced
29:b72c6db6890b 30:5dc02b23752f
     1 /*
     1 /*
     2  * jidctint.c
     2  * jidctint.c
     3  *
     3  *
     4  * Copyright (C) 1991-1998, Thomas G. Lane.
     4  * Copyright (C) 1991-1998, Thomas G. Lane.
       
     5  * Modification developed 2002-2009 by Guido Vollbeding.
     5  * This file is part of the Independent JPEG Group's software.
     6  * This file is part of the Independent JPEG Group's software.
     6  * For conditions of distribution and use, see the accompanying README file.
     7  * For conditions of distribution and use, see the accompanying README file.
     7  *
     8  *
     8  * This file contains a slow-but-accurate integer implementation of the
     9  * This file contains a slow-but-accurate integer implementation of the
     9  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
    10  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
    21  * The primary algorithm described there uses 11 multiplies and 29 adds.
    22  * The primary algorithm described there uses 11 multiplies and 29 adds.
    22  * We use their alternate method with 12 multiplies and 32 adds.
    23  * We use their alternate method with 12 multiplies and 32 adds.
    23  * The advantage of this method is that no data path contains more than one
    24  * The advantage of this method is that no data path contains more than one
    24  * multiplication; this allows a very simple and accurate implementation in
    25  * multiplication; this allows a very simple and accurate implementation in
    25  * scaled fixed-point arithmetic, with a minimal number of shifts.
    26  * scaled fixed-point arithmetic, with a minimal number of shifts.
       
    27  *
       
    28  * We also provide IDCT routines with various output sample block sizes for
       
    29  * direct resolution reduction or enlargement and for directly resolving the
       
    30  * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
       
    31  * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block.
       
    32  *
       
    33  * For N<8 we simply take the corresponding low-frequency coefficients of
       
    34  * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
       
    35  * to yield the downscaled outputs.
       
    36  * This can be seen as direct low-pass downsampling from the DCT domain
       
    37  * point of view rather than the usual spatial domain point of view,
       
    38  * yielding significant computational savings and results at least
       
    39  * as good as common bilinear (averaging) spatial downsampling.
       
    40  *
       
    41  * For N>8 we apply a partial NxN IDCT, treating the 8 input coefficients
       
    42  * as the lower frequencies and assuming the higher frequencies to be zero.
       
    43  * It turns out that the computational effort is similar to the 8x8 IDCT
       
    44  * regarding the output size.
       
    45  * Furthermore, the scaling and descaling is the same for all IDCT sizes.
       
    46  *
       
    47  * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
       
    48  * since there would be too many additional constants to pre-calculate.
    26  */
    49  */
    27 
    50 
    28 #define JPEG_INTERNALS
    51 #define JPEG_INTERNALS
    29 #include "jinclude.h"
    52 #include "jinclude.h"
    30 #include "jpeglib.h"
    53 #include "jpeglib.h"
    36 /*
    59 /*
    37  * This module is specialized to the case DCTSIZE = 8.
    60  * This module is specialized to the case DCTSIZE = 8.
    38  */
    61  */
    39 
    62 
    40 #if DCTSIZE != 8
    63 #if DCTSIZE != 8
    41   Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
    64   Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
    42 #endif
    65 #endif
    43 
    66 
    44 
    67 
    45 /*
    68 /*
    46  * The poop on this scaling stuff is as follows:
    69  * The poop on this scaling stuff is as follows:
   149 		 JCOEFPTR coef_block,
   172 		 JCOEFPTR coef_block,
   150 		 JSAMPARRAY output_buf, JDIMENSION output_col)
   173 		 JSAMPARRAY output_buf, JDIMENSION output_col)
   151 {
   174 {
   152   INT32 tmp0, tmp1, tmp2, tmp3;
   175   INT32 tmp0, tmp1, tmp2, tmp3;
   153   INT32 tmp10, tmp11, tmp12, tmp13;
   176   INT32 tmp10, tmp11, tmp12, tmp13;
       
   177   INT32 z1, z2, z3;
       
   178   JCOEFPTR inptr;
       
   179   ISLOW_MULT_TYPE * quantptr;
       
   180   int * wsptr;
       
   181   JSAMPROW outptr;
       
   182   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
   183   int ctr;
       
   184   int workspace[DCTSIZE2];	/* buffers data between passes */
       
   185   SHIFT_TEMPS
       
   186 
       
   187   /* Pass 1: process columns from input, store into work array. */
       
   188   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
       
   189   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
   190 
       
   191   inptr = coef_block;
       
   192   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
   193   wsptr = workspace;
       
   194   for (ctr = DCTSIZE; ctr > 0; ctr--) {
       
   195     /* Due to quantization, we will usually find that many of the input
       
   196      * coefficients are zero, especially the AC terms.  We can exploit this
       
   197      * by short-circuiting the IDCT calculation for any column in which all
       
   198      * the AC terms are zero.  In that case each output is equal to the
       
   199      * DC coefficient (with scale factor as needed).
       
   200      * With typical images and quantization tables, half or more of the
       
   201      * column DCT calculations can be simplified this way.
       
   202      */
       
   203 
       
   204     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
       
   205 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
       
   206 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
       
   207 	inptr[DCTSIZE*7] == 0) {
       
   208       /* AC terms all zero */
       
   209       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
       
   210 
       
   211       wsptr[DCTSIZE*0] = dcval;
       
   212       wsptr[DCTSIZE*1] = dcval;
       
   213       wsptr[DCTSIZE*2] = dcval;
       
   214       wsptr[DCTSIZE*3] = dcval;
       
   215       wsptr[DCTSIZE*4] = dcval;
       
   216       wsptr[DCTSIZE*5] = dcval;
       
   217       wsptr[DCTSIZE*6] = dcval;
       
   218       wsptr[DCTSIZE*7] = dcval;
       
   219 
       
   220       inptr++;			/* advance pointers to next column */
       
   221       quantptr++;
       
   222       wsptr++;
       
   223       continue;
       
   224     }
       
   225 
       
   226     /* Even part: reverse the even part of the forward DCT. */
       
   227     /* The rotator is sqrt(2)*c(-6). */
       
   228     
       
   229     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
   230     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
   231 
       
   232     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
       
   233     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
       
   234     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
       
   235 
       
   236     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
   237     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
   238     z2 <<= CONST_BITS;
       
   239     z3 <<= CONST_BITS;
       
   240     /* Add fudge factor here for final descale. */
       
   241     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
       
   242 
       
   243     tmp0 = z2 + z3;
       
   244     tmp1 = z2 - z3;
       
   245 
       
   246     tmp10 = tmp0 + tmp2;
       
   247     tmp13 = tmp0 - tmp2;
       
   248     tmp11 = tmp1 + tmp3;
       
   249     tmp12 = tmp1 - tmp3;
       
   250 
       
   251     /* Odd part per figure 8; the matrix is unitary and hence its
       
   252      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
       
   253      */
       
   254 
       
   255     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
   256     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
   257     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
   258     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
   259     
       
   260     z2 = tmp0 + tmp2;
       
   261     z3 = tmp1 + tmp3;
       
   262 
       
   263     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
       
   264     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
       
   265     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
       
   266     z2 += z1;
       
   267     z3 += z1;
       
   268 
       
   269     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
       
   270     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
       
   271     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
       
   272     tmp0 += z1 + z2;
       
   273     tmp3 += z1 + z3;
       
   274 
       
   275     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
       
   276     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
       
   277     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
       
   278     tmp1 += z1 + z3;
       
   279     tmp2 += z1 + z2;
       
   280 
       
   281     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
       
   282 
       
   283     wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
       
   284     wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
       
   285     wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
       
   286     wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
       
   287     wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
       
   288     wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
       
   289     wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
       
   290     wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
       
   291     
       
   292     inptr++;			/* advance pointers to next column */
       
   293     quantptr++;
       
   294     wsptr++;
       
   295   }
       
   296 
       
   297   /* Pass 2: process rows from work array, store into output array. */
       
   298   /* Note that we must descale the results by a factor of 8 == 2**3, */
       
   299   /* and also undo the PASS1_BITS scaling. */
       
   300 
       
   301   wsptr = workspace;
       
   302   for (ctr = 0; ctr < DCTSIZE; ctr++) {
       
   303     outptr = output_buf[ctr] + output_col;
       
   304     /* Rows of zeroes can be exploited in the same way as we did with columns.
       
   305      * However, the column calculation has created many nonzero AC terms, so
       
   306      * the simplification applies less often (typically 5% to 10% of the time).
       
   307      * On machines with very fast multiplication, it's possible that the
       
   308      * test takes more time than it's worth.  In that case this section
       
   309      * may be commented out.
       
   310      */
       
   311 
       
   312 #ifndef NO_ZERO_ROW_TEST
       
   313     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
       
   314 	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       
   315       /* AC terms all zero */
       
   316       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
       
   317 				  & RANGE_MASK];
       
   318 
       
   319       outptr[0] = dcval;
       
   320       outptr[1] = dcval;
       
   321       outptr[2] = dcval;
       
   322       outptr[3] = dcval;
       
   323       outptr[4] = dcval;
       
   324       outptr[5] = dcval;
       
   325       outptr[6] = dcval;
       
   326       outptr[7] = dcval;
       
   327 
       
   328       wsptr += DCTSIZE;		/* advance pointer to next row */
       
   329       continue;
       
   330     }
       
   331 #endif
       
   332 
       
   333     /* Even part: reverse the even part of the forward DCT. */
       
   334     /* The rotator is sqrt(2)*c(-6). */
       
   335     
       
   336     z2 = (INT32) wsptr[2];
       
   337     z3 = (INT32) wsptr[6];
       
   338 
       
   339     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
       
   340     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
       
   341     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
       
   342 
       
   343     /* Add fudge factor here for final descale. */
       
   344     z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
   345     z3 = (INT32) wsptr[4];
       
   346 
       
   347     tmp0 = (z2 + z3) << CONST_BITS;
       
   348     tmp1 = (z2 - z3) << CONST_BITS;
       
   349     
       
   350     tmp10 = tmp0 + tmp2;
       
   351     tmp13 = tmp0 - tmp2;
       
   352     tmp11 = tmp1 + tmp3;
       
   353     tmp12 = tmp1 - tmp3;
       
   354 
       
   355     /* Odd part per figure 8; the matrix is unitary and hence its
       
   356      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
       
   357      */
       
   358 
       
   359     tmp0 = (INT32) wsptr[7];
       
   360     tmp1 = (INT32) wsptr[5];
       
   361     tmp2 = (INT32) wsptr[3];
       
   362     tmp3 = (INT32) wsptr[1];
       
   363 
       
   364     z2 = tmp0 + tmp2;
       
   365     z3 = tmp1 + tmp3;
       
   366 
       
   367     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
       
   368     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
       
   369     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
       
   370     z2 += z1;
       
   371     z3 += z1;
       
   372 
       
   373     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
       
   374     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
       
   375     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
       
   376     tmp0 += z1 + z2;
       
   377     tmp3 += z1 + z3;
       
   378 
       
   379     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
       
   380     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
       
   381     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
       
   382     tmp1 += z1 + z3;
       
   383     tmp2 += z1 + z2;
       
   384 
       
   385     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
       
   386 
       
   387     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
       
   388 					      CONST_BITS+PASS1_BITS+3)
       
   389 			    & RANGE_MASK];
       
   390     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
       
   391 					      CONST_BITS+PASS1_BITS+3)
       
   392 			    & RANGE_MASK];
       
   393     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
       
   394 					      CONST_BITS+PASS1_BITS+3)
       
   395 			    & RANGE_MASK];
       
   396     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
       
   397 					      CONST_BITS+PASS1_BITS+3)
       
   398 			    & RANGE_MASK];
       
   399     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
       
   400 					      CONST_BITS+PASS1_BITS+3)
       
   401 			    & RANGE_MASK];
       
   402     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
       
   403 					      CONST_BITS+PASS1_BITS+3)
       
   404 			    & RANGE_MASK];
       
   405     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
       
   406 					      CONST_BITS+PASS1_BITS+3)
       
   407 			    & RANGE_MASK];
       
   408     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
       
   409 					      CONST_BITS+PASS1_BITS+3)
       
   410 			    & RANGE_MASK];
       
   411 
       
   412     wsptr += DCTSIZE;		/* advance pointer to next row */
       
   413   }
       
   414 }
       
   415 
       
   416 #ifdef IDCT_SCALING_SUPPORTED
       
   417 
       
   418 
       
   419 /*
       
   420  * Perform dequantization and inverse DCT on one block of coefficients,
       
   421  * producing a 7x7 output block.
       
   422  *
       
   423  * Optimized algorithm with 12 multiplications in the 1-D kernel.
       
   424  * cK represents sqrt(2) * cos(K*pi/14).
       
        *
        * Only rows 0..6 and columns 0..6 of coef_block are read; each of those
        * coefficients goes through DEQUANTIZE() together with the entry at the
        * same position in compptr->dct_table.  The result is written to rows
        * 0..6 of output_buf, at columns output_col .. output_col+6.
        *
        * The same 1-D kernel is applied twice: pass 1 works on columns and fills
        * the 7x7 workspace, pass 2 works on the workspace rows and produces the
        * final samples through the range_limit lookup table.
   425  */
       
   426 
       
   427 GLOBAL(void)
       
   428 jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
   429 	       JCOEFPTR coef_block,
       
   430 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
   431 {
       
   432   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
       
   433   INT32 z1, z2, z3;
       
   434   JCOEFPTR inptr;
       
   435   ISLOW_MULT_TYPE * quantptr;
       
   436   int * wsptr;
       
   437   JSAMPROW outptr;
       
   438   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
   439   int ctr;
       
   440   int workspace[7*7];	/* buffers data between passes */
       
   441   SHIFT_TEMPS
       
   442 
       
   443   /* Pass 1: process columns from input, store into work array. */
       
         /* Input column ctr is read with stride DCTSIZE; the results go to
          * workspace column ctr with stride 7.  All three pointers advance by
          * one element per iteration.
          */
   444 
       
   445   inptr = coef_block;
       
   446   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
   447   wsptr = workspace;
       
   448   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
       
   449     /* Even part */
       
   450 
       
   451     tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
   452     tmp13 <<= CONST_BITS;
       
   453     /* Add fudge factor here for final descale. */
       
   454     tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
       
   455 
       
   456     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
   457     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
   458     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
   459 
       
           /* Statement order matters from here on: tmp0, z2 and tmp13 are
            * overwritten while their previous values are still being consumed.
            */
   460     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
       
   461     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
       
   462     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
       
   463     tmp0 = z1 + z3;
       
   464     z2 -= tmp0;
       
   465     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
       
   466     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
       
   467     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
       
   468     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
       
   469 
       
   470     /* Odd part */
       
   471 
       
   472     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
   473     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
   474     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
   475 
       
   476     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
       
   477     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
       
   478     tmp0 = tmp1 - tmp2;
       
   479     tmp1 += tmp2;
       
   480     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
       
   481     tmp1 += tmp2;
       
   482     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
       
   483     tmp0 += z2;
       
   484     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
       
   485 
       
   486     /* Final output stage */
       
           /* Outputs k and 6-k are the sum and the difference of the same
            * even-part and odd-part terms; the middle output (index 3) is the
            * even-part term tmp13 alone.
            */
   487 
       
   488     wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
       
   489     wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
       
   490     wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
       
   491     wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
       
   492     wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
       
   493     wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
       
   494     wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
       
   495   }
       
   496 
       
   497   /* Pass 2: process 7 rows from work array, store into output array. */
       
         /* Same arithmetic as pass 1, but reading 7 consecutive workspace
          * entries per row.  Each result is shifted right by
          * CONST_BITS+PASS1_BITS+3, masked with RANGE_MASK and used as an index
          * into range_limit to obtain the output sample.
          */
   498 
       
   499   wsptr = workspace;
       
   500   for (ctr = 0; ctr < 7; ctr++) {
       
   501     outptr = output_buf[ctr] + output_col;
       
   502 
       
   503     /* Even part */
       
   504 
       
   505     /* Add fudge factor here for final descale. */
       
   506     tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
   507     tmp13 <<= CONST_BITS;
       
   508 
       
   509     z1 = (INT32) wsptr[2];
       
   510     z2 = (INT32) wsptr[4];
       
   511     z3 = (INT32) wsptr[6];
       
   512 
       
   513     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
       
   514     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
       
   515     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
       
   516     tmp0 = z1 + z3;
       
   517     z2 -= tmp0;
       
   518     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
       
   519     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
       
   520     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
       
   521     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
       
   522 
       
   523     /* Odd part */
       
   524 
       
   525     z1 = (INT32) wsptr[1];
       
   526     z2 = (INT32) wsptr[3];
       
   527     z3 = (INT32) wsptr[5];
       
   528 
       
   529     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
       
   530     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
       
   531     tmp0 = tmp1 - tmp2;
       
   532     tmp1 += tmp2;
       
   533     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
       
   534     tmp1 += tmp2;
       
   535     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
       
   536     tmp0 += z2;
       
   537     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
       
   538 
       
   539     /* Final output stage */
       
   540 
       
   541     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
       
   542 					      CONST_BITS+PASS1_BITS+3)
       
   543 			    & RANGE_MASK];
       
   544     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
       
   545 					      CONST_BITS+PASS1_BITS+3)
       
   546 			    & RANGE_MASK];
       
   547     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
       
   548 					      CONST_BITS+PASS1_BITS+3)
       
   549 			    & RANGE_MASK];
       
   550     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
       
   551 					      CONST_BITS+PASS1_BITS+3)
       
   552 			    & RANGE_MASK];
       
   553     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
       
   554 					      CONST_BITS+PASS1_BITS+3)
       
   555 			    & RANGE_MASK];
       
   556     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
       
   557 					      CONST_BITS+PASS1_BITS+3)
       
   558 			    & RANGE_MASK];
       
   559     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
       
   560 					      CONST_BITS+PASS1_BITS+3)
       
   561 			    & RANGE_MASK];
       
   562 
       
   563     wsptr += 7;		/* advance pointer to next row */
       
   564   }
       
   565 }
       
   566 
       
   567 
       
   568 /*
       
   569  * Perform dequantization and inverse DCT on one block of coefficients,
       
   570  * producing a reduced-size 6x6 output block.
       
   571  *
       
   572  * Optimized algorithm with 3 multiplications in the 1-D kernel.
       
   573  * cK represents sqrt(2) * cos(K*pi/12).
       
        *
        * Only rows 0..5 and columns 0..5 of coef_block are read; each of those
        * coefficients goes through DEQUANTIZE() together with the entry at the
        * same position in compptr->dct_table.  The result is written to rows
        * 0..5 of output_buf, at columns output_col .. output_col+5.
        *
        * Pass 1 works on columns and fills the 6x6 workspace; pass 2 works on
        * the workspace rows and produces the final samples through the
        * range_limit lookup table.
   574  */
       
   575 
       
   576 GLOBAL(void)
       
   577 jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
   578 	       JCOEFPTR coef_block,
       
   579 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
   580 {
       
   581   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
       
   582   INT32 z1, z2, z3;
       
   583   JCOEFPTR inptr;
       
   584   ISLOW_MULT_TYPE * quantptr;
       
   585   int * wsptr;
       
   586   JSAMPROW outptr;
       
   587   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
   588   int ctr;
       
   589   int workspace[6*6];	/* buffers data between passes */
       
   590   SHIFT_TEMPS
       
   591 
       
   592   /* Pass 1: process columns from input, store into work array. */
       
         /* Input column ctr is read with stride DCTSIZE; the results go to
          * workspace column ctr with stride 6.  All three pointers advance by
          * one element per iteration.
          */
   593 
       
   594   inptr = coef_block;
       
   595   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
   596   wsptr = workspace;
       
   597   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
       
   598     /* Even part */
       
   599 
       
   600     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
   601     tmp0 <<= CONST_BITS;
       
   602     /* Add fudge factor here for final descale. */
       
   603     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
       
   604     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
   605     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
       
   606     tmp1 = tmp0 + tmp10;
       
           /* tmp11 is descaled right away: it is only ever combined with tmp1,
            * which the odd part below scales by PASS1_BITS rather than CONST_BITS,
            * so outputs 1 and 4 are stored without a further shift.
            */
   607     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
       
           /* tmp10 and tmp0 are reused as scratch before taking their final values. */
   608     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
   609     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
       
   610     tmp10 = tmp1 + tmp0;
       
   611     tmp12 = tmp1 - tmp0;
       
   612 
       
   613     /* Odd part */
       
   614 
       
   615     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
   616     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
   617     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
   618     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
       
   619     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
       
   620     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
       
           /* No multiplication for this term, so it is scaled by PASS1_BITS directly. */
   621     tmp1 = (z1 - z2 - z3) << PASS1_BITS;
       
   622 
       
   623     /* Final output stage */
       
           /* Outputs k and 5-k are the sum and the difference of the same
            * even-part and odd-part terms.
            */
   624 
       
   625     wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
       
   626     wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
       
   627     wsptr[6*1] = (int) (tmp11 + tmp1);
       
   628     wsptr[6*4] = (int) (tmp11 - tmp1);
       
   629     wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
       
   630     wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
       
   631   }
       
   632 
       
   633   /* Pass 2: process 6 rows from work array, store into output array. */
       
         /* Same kernel as pass 1, reading 6 consecutive workspace entries per
          * row.  Each result is shifted right by CONST_BITS+PASS1_BITS+3, masked
          * with RANGE_MASK and used as an index into range_limit to obtain the
          * output sample.
          */
   634 
       
   635   wsptr = workspace;
       
   636   for (ctr = 0; ctr < 6; ctr++) {
       
   637     outptr = output_buf[ctr] + output_col;
       
   638 
       
   639     /* Even part */
       
   640 
       
   641     /* Add fudge factor here for final descale. */
       
   642     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
   643     tmp0 <<= CONST_BITS;
       
   644     tmp2 = (INT32) wsptr[4];
       
   645     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
       
   646     tmp1 = tmp0 + tmp10;
       
           /* Unlike pass 1 there is no early descale: tmp11 keeps the CONST_BITS
            * scale, matching tmp1 in the odd part below, and both are shifted in
            * the final output stage like every other term.
            */
   647     tmp11 = tmp0 - tmp10 - tmp10;
       
   648     tmp10 = (INT32) wsptr[2];
       
   649     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
       
   650     tmp10 = tmp1 + tmp0;
       
   651     tmp12 = tmp1 - tmp0;
       
   652 
       
   653     /* Odd part */
       
   654 
       
   655     z1 = (INT32) wsptr[1];
       
   656     z2 = (INT32) wsptr[3];
       
   657     z3 = (INT32) wsptr[5];
       
   658     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
       
   659     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
       
   660     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
       
   661     tmp1 = (z1 - z2 - z3) << CONST_BITS;
       
   662 
       
   663     /* Final output stage */
       
   664 
       
   665     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
       
   666 					      CONST_BITS+PASS1_BITS+3)
       
   667 			    & RANGE_MASK];
       
   668     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
       
   669 					      CONST_BITS+PASS1_BITS+3)
       
   670 			    & RANGE_MASK];
       
   671     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
       
   672 					      CONST_BITS+PASS1_BITS+3)
       
   673 			    & RANGE_MASK];
       
   674     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
       
   675 					      CONST_BITS+PASS1_BITS+3)
       
   676 			    & RANGE_MASK];
       
   677     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
       
   678 					      CONST_BITS+PASS1_BITS+3)
       
   679 			    & RANGE_MASK];
       
   680     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
       
   681 					      CONST_BITS+PASS1_BITS+3)
       
   682 			    & RANGE_MASK];
       
   683 
       
   684     wsptr += 6;		/* advance pointer to next row */
       
   685   }
       
   686 }
       
   687 
       
   688 
       
   689 /*
       
   690  * Perform dequantization and inverse DCT on one block of coefficients,
       
   691  * producing a reduced-size 5x5 output block.
       
   692  *
       
   693  * Optimized algorithm with 5 multiplications in the 1-D kernel.
       
   694  * cK represents sqrt(2) * cos(K*pi/10).
       
   695  */
       
   696 
       
   697 GLOBAL(void)
       
   698 jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
   699 	       JCOEFPTR coef_block,
       
   700 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
   701 {
       
   702   INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
       
   703   INT32 z1, z2, z3;
       
   704   JCOEFPTR inptr;
       
   705   ISLOW_MULT_TYPE * quantptr;
       
   706   int * wsptr;
       
   707   JSAMPROW outptr;
       
   708   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
   709   int ctr;
       
   710   int workspace[5*5];	/* buffers data between passes */
       
   711   SHIFT_TEMPS
       
   712 
       
   713   /* Pass 1: process columns from input, store into work array. */
       
   714 
       
   715   inptr = coef_block;
       
   716   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
   717   wsptr = workspace;
       
   718   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
       
   719     /* Even part */
       
   720 
       
   721     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
   722     tmp12 <<= CONST_BITS;
       
   723     /* Add fudge factor here for final descale. */
       
   724     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
       
   725     tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
   726     tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
   727     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
       
   728     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
       
   729     z3 = tmp12 + z2;
       
   730     tmp10 = z3 + z1;
       
   731     tmp11 = z3 - z1;
       
   732     tmp12 -= z2 << 2;
       
   733 
       
   734     /* Odd part */
       
   735 
       
   736     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
   737     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
   738 
       
   739     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
       
   740     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
       
   741     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
       
   742 
       
   743     /* Final output stage */
       
   744 
       
   745     wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
       
   746     wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
       
   747     wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
       
   748     wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
       
   749     wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
       
   750   }
       
   751 
       
   752   /* Pass 2: process 5 rows from work array, store into output array. */
       
   753 
       
   754   wsptr = workspace;
       
   755   for (ctr = 0; ctr < 5; ctr++) {
       
   756     outptr = output_buf[ctr] + output_col;
       
   757 
       
   758     /* Even part */
       
   759 
       
   760     /* Add fudge factor here for final descale. */
       
   761     tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
   762     tmp12 <<= CONST_BITS;
       
   763     tmp0 = (INT32) wsptr[2];
       
   764     tmp1 = (INT32) wsptr[4];
       
   765     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
       
   766     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
       
   767     z3 = tmp12 + z2;
       
   768     tmp10 = z3 + z1;
       
   769     tmp11 = z3 - z1;
       
   770     tmp12 -= z2 << 2;
       
   771 
       
   772     /* Odd part */
       
   773 
       
   774     z2 = (INT32) wsptr[1];
       
   775     z3 = (INT32) wsptr[3];
       
   776 
       
   777     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
       
   778     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
       
   779     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
       
   780 
       
   781     /* Final output stage */
       
   782 
       
   783     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
       
   784 					      CONST_BITS+PASS1_BITS+3)
       
   785 			    & RANGE_MASK];
       
   786     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
       
   787 					      CONST_BITS+PASS1_BITS+3)
       
   788 			    & RANGE_MASK];
       
   789     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
       
   790 					      CONST_BITS+PASS1_BITS+3)
       
   791 			    & RANGE_MASK];
       
   792     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
       
   793 					      CONST_BITS+PASS1_BITS+3)
       
   794 			    & RANGE_MASK];
       
   795     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
       
   796 					      CONST_BITS+PASS1_BITS+3)
       
   797 			    & RANGE_MASK];
       
   798 
       
   799     wsptr += 5;		/* advance pointer to next row */
       
   800   }
       
   801 }
       
   802 
       
   803 
       
   804 /*
       
   805  * Perform dequantization and inverse DCT on one block of coefficients,
       
   806  * producing a reduced-size 4x4 output block.
       
   807  *
       
   808  * Optimized algorithm with 3 multiplications in the 1-D kernel.
       
   809  * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
       
   810  */
       
   811 
       
   812 GLOBAL(void)
       
   813 jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
   814 	       JCOEFPTR coef_block,
       
   815 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
   816 {
       
   817   INT32 tmp0, tmp2, tmp10, tmp12;
       
   818   INT32 z1, z2, z3;
       
   819   JCOEFPTR inptr;
       
   820   ISLOW_MULT_TYPE * quantptr;
       
   821   int * wsptr;
       
   822   JSAMPROW outptr;
       
   823   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
   824   int ctr;
       
   825   int workspace[4*4];	/* buffers data between passes */
       
   826   SHIFT_TEMPS
       
   827 
       
   828   /* Pass 1: process columns from input, store into work array. */
       
   829 
       
   830   inptr = coef_block;
       
   831   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
   832   wsptr = workspace;
       
   833   for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
       
   834     /* Even part */
       
   835 
       
   836     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
   837     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
   838     
       
   839     tmp10 = (tmp0 + tmp2) << PASS1_BITS;
       
   840     tmp12 = (tmp0 - tmp2) << PASS1_BITS;
       
   841 
       
   842     /* Odd part */
       
   843     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
       
   844 
       
   845     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
   846     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
   847 
       
   848     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
       
   849     /* Add fudge factor here for final descale. */
       
   850     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
       
   851     tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
       
   852 		       CONST_BITS-PASS1_BITS);
       
   853     tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
       
   854 		       CONST_BITS-PASS1_BITS);
       
   855 
       
   856     /* Final output stage */
       
   857 
       
   858     wsptr[4*0] = (int) (tmp10 + tmp0);
       
   859     wsptr[4*3] = (int) (tmp10 - tmp0);
       
   860     wsptr[4*1] = (int) (tmp12 + tmp2);
       
   861     wsptr[4*2] = (int) (tmp12 - tmp2);
       
   862   }
       
   863 
       
   864   /* Pass 2: process 4 rows from work array, store into output array. */
       
   865 
       
   866   wsptr = workspace;
       
   867   for (ctr = 0; ctr < 4; ctr++) {
       
   868     outptr = output_buf[ctr] + output_col;
       
   869 
       
   870     /* Even part */
       
   871 
       
   872     /* Add fudge factor here for final descale. */
       
   873     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
   874     tmp2 = (INT32) wsptr[2];
       
   875 
       
   876     tmp10 = (tmp0 + tmp2) << CONST_BITS;
       
   877     tmp12 = (tmp0 - tmp2) << CONST_BITS;
       
   878 
       
   879     /* Odd part */
       
   880     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
       
   881 
       
   882     z2 = (INT32) wsptr[1];
       
   883     z3 = (INT32) wsptr[3];
       
   884 
       
   885     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
       
   886     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
       
   887     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
       
   888 
       
   889     /* Final output stage */
       
   890 
       
   891     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
       
   892 					      CONST_BITS+PASS1_BITS+3)
       
   893 			    & RANGE_MASK];
       
   894     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
       
   895 					      CONST_BITS+PASS1_BITS+3)
       
   896 			    & RANGE_MASK];
       
   897     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
       
   898 					      CONST_BITS+PASS1_BITS+3)
       
   899 			    & RANGE_MASK];
       
   900     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
       
   901 					      CONST_BITS+PASS1_BITS+3)
       
   902 			    & RANGE_MASK];
       
   903 
       
   904     wsptr += 4;		/* advance pointer to next row */
       
   905   }
       
   906 }
       
   907 
       
   908 
       
   909 /*
       
   910  * Perform dequantization and inverse DCT on one block of coefficients,
       
   911  * producing a reduced-size 3x3 output block.
       
   912  *
       
   913  * Optimized algorithm with 2 multiplications in the 1-D kernel.
       
   914  * cK represents sqrt(2) * cos(K*pi/6).
       
   915  */
       
   916 
       
   917 GLOBAL(void)
       
   918 jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
   919 	       JCOEFPTR coef_block,
       
   920 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
   921 {
       
   922   INT32 tmp0, tmp2, tmp10, tmp12;
       
   923   JCOEFPTR inptr;
       
   924   ISLOW_MULT_TYPE * quantptr;
       
   925   int * wsptr;
       
   926   JSAMPROW outptr;
       
   927   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
   928   int ctr;
       
   929   int workspace[3*3];	/* buffers data between passes */
       
   930   SHIFT_TEMPS
       
   931 
       
   932   /* Pass 1: process columns from input, store into work array. */
       
   933 
       
   934   inptr = coef_block;
       
   935   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
   936   wsptr = workspace;
       
   937   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
       
   938     /* Even part */
       
   939 
       
   940     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
   941     tmp0 <<= CONST_BITS;
       
   942     /* Add fudge factor here for final descale. */
       
   943     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
       
   944     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
   945     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
       
   946     tmp10 = tmp0 + tmp12;
       
   947     tmp2 = tmp0 - tmp12 - tmp12;
       
   948 
       
   949     /* Odd part */
       
   950 
       
   951     tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
   952     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
       
   953 
       
   954     /* Final output stage */
       
   955 
       
   956     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
       
   957     wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
       
   958     wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
       
   959   }
       
   960 
       
   961   /* Pass 2: process 3 rows from work array, store into output array. */
       
   962 
       
   963   wsptr = workspace;
       
   964   for (ctr = 0; ctr < 3; ctr++) {
       
   965     outptr = output_buf[ctr] + output_col;
       
   966 
       
   967     /* Even part */
       
   968 
       
   969     /* Add fudge factor here for final descale. */
       
   970     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
   971     tmp0 <<= CONST_BITS;
       
   972     tmp2 = (INT32) wsptr[2];
       
   973     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
       
   974     tmp10 = tmp0 + tmp12;
       
   975     tmp2 = tmp0 - tmp12 - tmp12;
       
   976 
       
   977     /* Odd part */
       
   978 
       
   979     tmp12 = (INT32) wsptr[1];
       
   980     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
       
   981 
       
   982     /* Final output stage */
       
   983 
       
   984     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
       
   985 					      CONST_BITS+PASS1_BITS+3)
       
   986 			    & RANGE_MASK];
       
   987     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
       
   988 					      CONST_BITS+PASS1_BITS+3)
       
   989 			    & RANGE_MASK];
       
   990     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
       
   991 					      CONST_BITS+PASS1_BITS+3)
       
   992 			    & RANGE_MASK];
       
   993 
       
   994     wsptr += 3;		/* advance pointer to next row */
       
   995   }
       
   996 }
       
   997 
       
   998 
       
   999 /*
       
  1000  * Perform dequantization and inverse DCT on one block of coefficients,
       
  1001  * producing a reduced-size 2x2 output block.
       
  1002  *
       
  1003  * Multiplication-less algorithm.
       
  1004  */
       
  1005 
       
  1006 GLOBAL(void)
       
  1007 jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  1008 	       JCOEFPTR coef_block,
       
  1009 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
  1010 {
       
  1011   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
       
  1012   ISLOW_MULT_TYPE * quantptr;
       
  1013   JSAMPROW outptr;
       
  1014   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  1015   SHIFT_TEMPS
       
  1016 
       
  1017   /* Pass 1: process columns from input. */
       
  1018 
       
  1019   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  1020 
       
  1021   /* Column 0 */
       
  1022   tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  1023   tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  1024   /* Add fudge factor here for final descale. */
       
  1025   tmp4 += ONE << 2;
       
  1026 
       
  1027   tmp0 = tmp4 + tmp5;
       
  1028   tmp2 = tmp4 - tmp5;
       
  1029 
       
  1030   /* Column 1 */
       
  1031   tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]);
       
  1032   tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]);
       
  1033 
       
  1034   tmp1 = tmp4 + tmp5;
       
  1035   tmp3 = tmp4 - tmp5;
       
  1036 
       
  1037   /* Pass 2: process 2 rows, store into output array. */
       
  1038 
       
  1039   /* Row 0 */
       
  1040   outptr = output_buf[0] + output_col;
       
  1041 
       
  1042   outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
       
  1043   outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
       
  1044 
       
  1045   /* Row 1 */
       
  1046   outptr = output_buf[1] + output_col;
       
  1047 
       
  1048   outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp2 + tmp3, 3) & RANGE_MASK];
       
  1049   outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2 - tmp3, 3) & RANGE_MASK];
       
  1050 }
       
  1051 
       
  1052 
       
  1053 /*
       
  1054  * Perform dequantization and inverse DCT on one block of coefficients,
       
  1055  * producing a reduced-size 1x1 output block.
       
  1056  *
       
  1057  * We hardly need an inverse DCT routine for this: just take the
       
  1058  * average pixel value, which is one-eighth of the DC coefficient.
       
  1059  */
       
  1060 
       
  1061 GLOBAL(void)
       
  1062 jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  1063 	       JCOEFPTR coef_block,
       
  1064 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
  1065 {
       
  1066   int dcval;
       
  1067   ISLOW_MULT_TYPE * quantptr;
       
  1068   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  1069   SHIFT_TEMPS
       
  1070 
       
  1071   /* 1x1 is trivial: just take the DC coefficient divided by 8. */
       
  1072   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  1073   dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
       
  1074   dcval = (int) DESCALE((INT32) dcval, 3);
       
  1075 
       
  1076   output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
       
  1077 }
       
  1078 
       
  1079 
       
  1080 /*
       
  1081  * Perform dequantization and inverse DCT on one block of coefficients,
       
  1082  * producing a 9x9 output block.
       
  1083  *
       
  1084  * Optimized algorithm with 10 multiplications in the 1-D kernel.
       
  1085  * cK represents sqrt(2) * cos(K*pi/18).
       
  1086  */
       
  1087 
       
  1088 GLOBAL(void)
       
  1089 jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  1090 	       JCOEFPTR coef_block,
       
  1091 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
  1092 {
       
  1093   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
       
  1094   INT32 z1, z2, z3, z4;
       
  1095   JCOEFPTR inptr;
       
  1096   ISLOW_MULT_TYPE * quantptr;
       
  1097   int * wsptr;
       
  1098   JSAMPROW outptr;
       
  1099   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  1100   int ctr;
       
  1101   int workspace[8*9];	/* buffers data between passes */
       
  1102   SHIFT_TEMPS
       
  1103 
       
  1104   /* Pass 1: process columns from input, store into work array. */
       
  1105 
       
  1106   inptr = coef_block;
       
  1107   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  1108   wsptr = workspace;
       
  1109   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  1110     /* Even part */
       
  1111 
       
  1112     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  1113     tmp0 <<= CONST_BITS;
       
  1114     /* Add fudge factor here for final descale. */
       
  1115     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  1116 
       
  1117     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  1118     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  1119     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  1120 
       
  1121     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
       
  1122     tmp1 = tmp0 + tmp3;
       
  1123     tmp2 = tmp0 - tmp3 - tmp3;
       
  1124 
       
  1125     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
       
  1126     tmp11 = tmp2 + tmp0;
       
  1127     tmp14 = tmp2 - tmp0 - tmp0;
       
  1128 
       
  1129     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
       
  1130     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
       
  1131     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
       
  1132 
       
  1133     tmp10 = tmp1 + tmp0 - tmp3;
       
  1134     tmp12 = tmp1 - tmp0 + tmp2;
       
  1135     tmp13 = tmp1 - tmp2 + tmp3;
       
  1136 
       
  1137     /* Odd part */
       
  1138 
       
  1139     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  1140     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  1141     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  1142     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
  1143 
       
  1144     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
       
  1145 
       
  1146     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
       
  1147     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
       
  1148     tmp0 = tmp2 + tmp3 - z2;
       
  1149     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
       
  1150     tmp2 += z2 - tmp1;
       
  1151     tmp3 += z2 + tmp1;
       
  1152     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
       
  1153 
       
  1154     /* Final output stage */
       
  1155 
       
  1156     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
       
  1157     wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
       
  1158     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
       
  1159     wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
       
  1160     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
       
  1161     wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
       
  1162     wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
       
  1163     wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
       
  1164     wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
       
  1165   }
       
  1166 
       
  1167   /* Pass 2: process 9 rows from work array, store into output array. */
       
  1168 
       
  1169   wsptr = workspace;
       
  1170   for (ctr = 0; ctr < 9; ctr++) {
       
  1171     outptr = output_buf[ctr] + output_col;
       
  1172 
       
  1173     /* Even part */
       
  1174 
       
  1175     /* Add fudge factor here for final descale. */
       
  1176     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  1177     tmp0 <<= CONST_BITS;
       
  1178 
       
  1179     z1 = (INT32) wsptr[2];
       
  1180     z2 = (INT32) wsptr[4];
       
  1181     z3 = (INT32) wsptr[6];
       
  1182 
       
  1183     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
       
  1184     tmp1 = tmp0 + tmp3;
       
  1185     tmp2 = tmp0 - tmp3 - tmp3;
       
  1186 
       
  1187     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
       
  1188     tmp11 = tmp2 + tmp0;
       
  1189     tmp14 = tmp2 - tmp0 - tmp0;
       
  1190 
       
  1191     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
       
  1192     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
       
  1193     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
       
  1194 
       
  1195     tmp10 = tmp1 + tmp0 - tmp3;
       
  1196     tmp12 = tmp1 - tmp0 + tmp2;
       
  1197     tmp13 = tmp1 - tmp2 + tmp3;
       
  1198 
       
  1199     /* Odd part */
       
  1200 
       
  1201     z1 = (INT32) wsptr[1];
       
  1202     z2 = (INT32) wsptr[3];
       
  1203     z3 = (INT32) wsptr[5];
       
  1204     z4 = (INT32) wsptr[7];
       
  1205 
       
  1206     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
       
  1207 
       
  1208     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
       
  1209     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
       
  1210     tmp0 = tmp2 + tmp3 - z2;
       
  1211     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
       
  1212     tmp2 += z2 - tmp1;
       
  1213     tmp3 += z2 + tmp1;
       
  1214     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
       
  1215 
       
  1216     /* Final output stage */
       
  1217 
       
  1218     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
       
  1219 					      CONST_BITS+PASS1_BITS+3)
       
  1220 			    & RANGE_MASK];
       
  1221     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
       
  1222 					      CONST_BITS+PASS1_BITS+3)
       
  1223 			    & RANGE_MASK];
       
  1224     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
       
  1225 					      CONST_BITS+PASS1_BITS+3)
       
  1226 			    & RANGE_MASK];
       
  1227     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
       
  1228 					      CONST_BITS+PASS1_BITS+3)
       
  1229 			    & RANGE_MASK];
       
  1230     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
       
  1231 					      CONST_BITS+PASS1_BITS+3)
       
  1232 			    & RANGE_MASK];
       
  1233     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
       
  1234 					      CONST_BITS+PASS1_BITS+3)
       
  1235 			    & RANGE_MASK];
       
  1236     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
       
  1237 					      CONST_BITS+PASS1_BITS+3)
       
  1238 			    & RANGE_MASK];
       
  1239     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
       
  1240 					      CONST_BITS+PASS1_BITS+3)
       
  1241 			    & RANGE_MASK];
       
  1242     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
       
  1243 					      CONST_BITS+PASS1_BITS+3)
       
  1244 			    & RANGE_MASK];
       
  1245 
       
  1246     wsptr += 8;		/* advance pointer to next row */
       
  1247   }
       
  1248 }
       
  1249 
       
  1250 
       
  1251 /*
       
  1252  * Perform dequantization and inverse DCT on one block of coefficients,
       
  1253  * producing a 10x10 output block.
       
  1254  *
       
  1255  * Optimized algorithm with 12 multiplications in the 1-D kernel.
       
  1256  * cK represents sqrt(2) * cos(K*pi/20).
       
  1257  */
       
  1258 
       
  1259 GLOBAL(void)
       
  1260 jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  1261 		 JCOEFPTR coef_block,
       
  1262 		 JSAMPARRAY output_buf, JDIMENSION output_col)
       
  1263 {
       
  1264   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
       
  1265   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
   154   INT32 z1, z2, z3, z4, z5;
  1266   INT32 z1, z2, z3, z4, z5;
   155   JCOEFPTR inptr;
  1267   JCOEFPTR inptr;
   156   ISLOW_MULT_TYPE * quantptr;
  1268   ISLOW_MULT_TYPE * quantptr;
   157   int * wsptr;
  1269   int * wsptr;
   158   JSAMPROW outptr;
  1270   JSAMPROW outptr;
   159   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  1271   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   160   int ctr;
  1272   int ctr;
   161   int workspace[DCTSIZE2];	/* buffers data between passes */
  1273   int workspace[8*10];	/* buffers data between passes */
       
  1274   SHIFT_TEMPS
       
  1275 
       
  1276   /* Pass 1: process columns from input, store into work array. */
       
  1277 
       
  1278   inptr = coef_block;
       
  1279   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  1280   wsptr = workspace;
       
  1281   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  1282     /* Even part */
       
  1283 
       
  1284     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  1285     z3 <<= CONST_BITS;
       
  1286     /* Add fudge factor here for final descale. */
       
  1287     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  1288     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  1289     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
       
  1290     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
       
  1291     tmp10 = z3 + z1;
       
  1292     tmp11 = z3 - z2;
       
  1293 
       
  1294     tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
       
  1295 			CONST_BITS-PASS1_BITS);
       
  1296 
       
  1297     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  1298     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  1299 
       
  1300     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
       
  1301     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
       
  1302     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
       
  1303 
       
  1304     tmp20 = tmp10 + tmp12;
       
  1305     tmp24 = tmp10 - tmp12;
       
  1306     tmp21 = tmp11 + tmp13;
       
  1307     tmp23 = tmp11 - tmp13;
       
  1308 
       
  1309     /* Odd part */
       
  1310 
       
  1311     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  1312     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  1313     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  1314     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
  1315 
       
  1316     tmp11 = z2 + z4;
       
  1317     tmp13 = z2 - z4;
       
  1318 
       
  1319     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
       
  1320     z5 = z3 << CONST_BITS;
       
  1321 
       
  1322     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
       
  1323     z4 = z5 + tmp12;
       
  1324 
       
  1325     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
       
  1326     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
       
  1327 
       
  1328     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
       
  1329     z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
       
  1330 
       
  1331     tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
       
  1332 
       
  1333     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
       
  1334     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
       
  1335 
       
  1336     /* Final output stage */
       
  1337 
       
  1338     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
       
  1339     wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
       
  1340     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
       
  1341     wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
       
  1342     wsptr[8*2] = (int) (tmp22 + tmp12);
       
  1343     wsptr[8*7] = (int) (tmp22 - tmp12);
       
  1344     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
       
  1345     wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
       
  1346     wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
       
  1347     wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
       
  1348   }
       
  1349 
       
  1350   /* Pass 2: process 10 rows from work array, store into output array. */
       
  1351 
       
  1352   wsptr = workspace;
       
  1353   for (ctr = 0; ctr < 10; ctr++) {
       
  1354     outptr = output_buf[ctr] + output_col;
       
  1355 
       
  1356     /* Even part */
       
  1357 
       
  1358     /* Add fudge factor here for final descale. */
       
  1359     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  1360     z3 <<= CONST_BITS;
       
  1361     z4 = (INT32) wsptr[4];
       
  1362     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
       
  1363     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
       
  1364     tmp10 = z3 + z1;
       
  1365     tmp11 = z3 - z2;
       
  1366 
       
  1367     tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
       
  1368 
       
  1369     z2 = (INT32) wsptr[2];
       
  1370     z3 = (INT32) wsptr[6];
       
  1371 
       
  1372     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
       
  1373     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
       
  1374     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
       
  1375 
       
  1376     tmp20 = tmp10 + tmp12;
       
  1377     tmp24 = tmp10 - tmp12;
       
  1378     tmp21 = tmp11 + tmp13;
       
  1379     tmp23 = tmp11 - tmp13;
       
  1380 
       
  1381     /* Odd part */
       
  1382 
       
  1383     z1 = (INT32) wsptr[1];
       
  1384     z2 = (INT32) wsptr[3];
       
  1385     z3 = (INT32) wsptr[5];
       
  1386     z3 <<= CONST_BITS;
       
  1387     z4 = (INT32) wsptr[7];
       
  1388 
       
  1389     tmp11 = z2 + z4;
       
  1390     tmp13 = z2 - z4;
       
  1391 
       
  1392     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
       
  1393 
       
  1394     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
       
  1395     z4 = z3 + tmp12;
       
  1396 
       
  1397     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
       
  1398     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
       
  1399 
       
  1400     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
       
  1401     z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
       
  1402 
       
  1403     tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
       
  1404 
       
  1405     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
       
  1406     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
       
  1407 
       
  1408     /* Final output stage */
       
  1409 
       
  1410     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
       
  1411 					      CONST_BITS+PASS1_BITS+3)
       
  1412 			    & RANGE_MASK];
       
  1413     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
       
  1414 					      CONST_BITS+PASS1_BITS+3)
       
  1415 			    & RANGE_MASK];
       
  1416     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
       
  1417 					      CONST_BITS+PASS1_BITS+3)
       
  1418 			    & RANGE_MASK];
       
  1419     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
       
  1420 					      CONST_BITS+PASS1_BITS+3)
       
  1421 			    & RANGE_MASK];
       
  1422     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
       
  1423 					      CONST_BITS+PASS1_BITS+3)
       
  1424 			    & RANGE_MASK];
       
  1425     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
       
  1426 					      CONST_BITS+PASS1_BITS+3)
       
  1427 			    & RANGE_MASK];
       
  1428     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
       
  1429 					      CONST_BITS+PASS1_BITS+3)
       
  1430 			    & RANGE_MASK];
       
  1431     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
       
  1432 					      CONST_BITS+PASS1_BITS+3)
       
  1433 			    & RANGE_MASK];
       
  1434     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
       
  1435 					      CONST_BITS+PASS1_BITS+3)
       
  1436 			    & RANGE_MASK];
       
  1437     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
       
  1438 					      CONST_BITS+PASS1_BITS+3)
       
  1439 			    & RANGE_MASK];
       
  1440 
       
  1441     wsptr += 8;		/* advance pointer to next row */
       
  1442   }
       
  1443 }
       
  1444 
       
  1445 
       
  1446 /*
       
  1447  * Perform dequantization and inverse DCT on one block of coefficients,
       
  1448  * producing a 11x11 output block.
       
  1449  *
       
  1450  * Optimized algorithm with 24 multiplications in the 1-D kernel.
       
  1451  * cK represents sqrt(2) * cos(K*pi/22).
       
  1452  */
       
  1453 
       
  1454 GLOBAL(void)
       
  1455 jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  1456 		 JCOEFPTR coef_block,
       
  1457 		 JSAMPARRAY output_buf, JDIMENSION output_col)
       
  1458 {
       
  1459   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
       
  1460   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
       
  1461   INT32 z1, z2, z3, z4;
       
  1462   JCOEFPTR inptr;
       
  1463   ISLOW_MULT_TYPE * quantptr;
       
  1464   int * wsptr;
       
  1465   JSAMPROW outptr;
       
  1466   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  1467   int ctr;
       
  1468   int workspace[8*11];	/* buffers data between passes */
       
  1469   SHIFT_TEMPS
       
  1470 
       
  1471   /* Pass 1: process columns from input, store into work array. */
       
  1472 
       
  1473   inptr = coef_block;
       
  1474   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  1475   wsptr = workspace;
       
  1476   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  1477     /* Even part */
       
  1478 
       
  1479     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  1480     tmp10 <<= CONST_BITS;
       
  1481     /* Add fudge factor here for final descale. */
       
  1482     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  1483 
       
  1484     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  1485     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  1486     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  1487 
       
  1488     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
       
  1489     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
       
  1490     z4 = z1 + z3;
       
  1491     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
       
  1492     z4 -= z2;
       
  1493     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
       
  1494     tmp21 = tmp20 + tmp23 + tmp25 -
       
  1495 	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
       
  1496     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
       
  1497     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
       
  1498     tmp24 += tmp25;
       
  1499     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
       
  1500     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
       
  1501 	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
       
  1502     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
       
  1503 
       
  1504     /* Odd part */
       
  1505 
       
  1506     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  1507     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  1508     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  1509     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
  1510 
       
  1511     tmp11 = z1 + z2;
       
  1512     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
       
  1513     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
       
  1514     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
       
  1515     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
       
  1516     tmp10 = tmp11 + tmp12 + tmp13 -
       
  1517 	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
       
  1518     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
       
  1519     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
       
  1520     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
       
  1521     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
       
  1522     tmp11 += z1;
       
  1523     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
       
  1524     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
       
  1525 	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
       
  1526 	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
       
  1527 
       
  1528     /* Final output stage */
       
  1529 
       
  1530     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
       
  1531     wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
       
  1532     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
       
  1533     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
       
  1534     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
       
  1535     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
       
  1536     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
       
  1537     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
       
  1538     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
       
  1539     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
       
  1540     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
       
  1541   }
       
  1542 
       
  1543   /* Pass 2: process 11 rows from work array, store into output array. */
       
  1544 
       
  1545   wsptr = workspace;
       
  1546   for (ctr = 0; ctr < 11; ctr++) {
       
  1547     outptr = output_buf[ctr] + output_col;
       
  1548 
       
  1549     /* Even part */
       
  1550 
       
  1551     /* Add fudge factor here for final descale. */
       
  1552     tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  1553     tmp10 <<= CONST_BITS;
       
  1554 
       
  1555     z1 = (INT32) wsptr[2];
       
  1556     z2 = (INT32) wsptr[4];
       
  1557     z3 = (INT32) wsptr[6];
       
  1558 
       
  1559     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
       
  1560     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
       
  1561     z4 = z1 + z3;
       
  1562     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
       
  1563     z4 -= z2;
       
  1564     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
       
  1565     tmp21 = tmp20 + tmp23 + tmp25 -
       
  1566 	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
       
  1567     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
       
  1568     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
       
  1569     tmp24 += tmp25;
       
  1570     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
       
  1571     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
       
  1572 	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
       
  1573     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
       
  1574 
       
  1575     /* Odd part */
       
  1576 
       
  1577     z1 = (INT32) wsptr[1];
       
  1578     z2 = (INT32) wsptr[3];
       
  1579     z3 = (INT32) wsptr[5];
       
  1580     z4 = (INT32) wsptr[7];
       
  1581 
       
  1582     tmp11 = z1 + z2;
       
  1583     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
       
  1584     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
       
  1585     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
       
  1586     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
       
  1587     tmp10 = tmp11 + tmp12 + tmp13 -
       
  1588 	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
       
  1589     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
       
  1590     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
       
  1591     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
       
  1592     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
       
  1593     tmp11 += z1;
       
  1594     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
       
  1595     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
       
  1596 	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
       
  1597 	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
       
  1598 
       
  1599     /* Final output stage */
       
  1600 
       
  1601     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
       
  1602 					       CONST_BITS+PASS1_BITS+3)
       
  1603 			     & RANGE_MASK];
       
  1604     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
       
  1605 					       CONST_BITS+PASS1_BITS+3)
       
  1606 			     & RANGE_MASK];
       
  1607     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
       
  1608 					       CONST_BITS+PASS1_BITS+3)
       
  1609 			     & RANGE_MASK];
       
  1610     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
       
  1611 					       CONST_BITS+PASS1_BITS+3)
       
  1612 			     & RANGE_MASK];
       
  1613     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
       
  1614 					       CONST_BITS+PASS1_BITS+3)
       
  1615 			     & RANGE_MASK];
       
  1616     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
       
  1617 					       CONST_BITS+PASS1_BITS+3)
       
  1618 			     & RANGE_MASK];
       
  1619     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
       
  1620 					       CONST_BITS+PASS1_BITS+3)
       
  1621 			     & RANGE_MASK];
       
  1622     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
       
  1623 					       CONST_BITS+PASS1_BITS+3)
       
  1624 			     & RANGE_MASK];
       
  1625     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
       
  1626 					       CONST_BITS+PASS1_BITS+3)
       
  1627 			     & RANGE_MASK];
       
  1628     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
       
  1629 					       CONST_BITS+PASS1_BITS+3)
       
  1630 			     & RANGE_MASK];
       
  1631     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
       
  1632 					       CONST_BITS+PASS1_BITS+3)
       
  1633 			     & RANGE_MASK];
       
  1634 
       
  1635     wsptr += 8;		/* advance pointer to next row */
       
  1636   }
       
  1637 }
       
  1638 
       
  1639 
       
  1640 /*
       
  1641  * Perform dequantization and inverse DCT on one block of coefficients,
       
  1642  * producing a 12x12 output block.
       
  1643  *
       
  1644  * Optimized algorithm with 15 multiplications in the 1-D kernel.
       
  1645  * cK represents sqrt(2) * cos(K*pi/24).
       
  1646  */
       
  1647 
       
  1648 GLOBAL(void)
       
  1649 jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  1650 		 JCOEFPTR coef_block,
       
  1651 		 JSAMPARRAY output_buf, JDIMENSION output_col)
       
  1652 {
       
  1653   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
       
  1654   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
       
  1655   INT32 z1, z2, z3, z4;
       
  1656   JCOEFPTR inptr;
       
  1657   ISLOW_MULT_TYPE * quantptr;
       
  1658   int * wsptr;
       
  1659   JSAMPROW outptr;
       
  1660   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  1661   int ctr;
       
  1662   int workspace[8*12];	/* buffers data between passes */
       
  1663   SHIFT_TEMPS
       
  1664 
       
  1665   /* Pass 1: process columns from input, store into work array. */
       
  1666 
       
  1667   inptr = coef_block;
       
  1668   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  1669   wsptr = workspace;
       
  1670   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  1671     /* Even part */
       
  1672 
       
  1673     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  1674     z3 <<= CONST_BITS;
       
  1675     /* Add fudge factor here for final descale. */
       
  1676     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  1677 
       
  1678     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  1679     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
       
  1680 
       
  1681     tmp10 = z3 + z4;
       
  1682     tmp11 = z3 - z4;
       
  1683 
       
  1684     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  1685     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
       
  1686     z1 <<= CONST_BITS;
       
  1687     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  1688     z2 <<= CONST_BITS;
       
  1689 
       
  1690     tmp12 = z1 - z2;
       
  1691 
       
  1692     tmp21 = z3 + tmp12;
       
  1693     tmp24 = z3 - tmp12;
       
  1694 
       
  1695     tmp12 = z4 + z2;
       
  1696 
       
  1697     tmp20 = tmp10 + tmp12;
       
  1698     tmp25 = tmp10 - tmp12;
       
  1699 
       
  1700     tmp12 = z4 - z1 - z2;
       
  1701 
       
  1702     tmp22 = tmp11 + tmp12;
       
  1703     tmp23 = tmp11 - tmp12;
       
  1704 
       
  1705     /* Odd part */
       
  1706 
       
  1707     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  1708     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  1709     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  1710     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
  1711 
       
  1712     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
       
  1713     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
       
  1714 
       
  1715     tmp10 = z1 + z3;
       
  1716     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
       
  1717     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
       
  1718     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
       
  1719     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
       
  1720     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
       
  1721     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
       
  1722     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
       
  1723 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
       
  1724 
       
  1725     z1 -= z4;
       
  1726     z2 -= z3;
       
  1727     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
       
  1728     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
       
  1729     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
       
  1730 
       
  1731     /* Final output stage */
       
  1732 
       
  1733     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
       
  1734     wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
       
  1735     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
       
  1736     wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
       
  1737     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
       
  1738     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
       
  1739     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
       
  1740     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
       
  1741     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
       
  1742     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
       
  1743     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
       
  1744     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
       
  1745   }
       
  1746 
       
  1747   /* Pass 2: process 12 rows from work array, store into output array. */
       
  1748 
       
  1749   wsptr = workspace;
       
  1750   for (ctr = 0; ctr < 12; ctr++) {
       
  1751     outptr = output_buf[ctr] + output_col;
       
  1752 
       
  1753     /* Even part */
       
  1754 
       
  1755     /* Add fudge factor here for final descale. */
       
  1756     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  1757     z3 <<= CONST_BITS;
       
  1758 
       
  1759     z4 = (INT32) wsptr[4];
       
  1760     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
       
  1761 
       
  1762     tmp10 = z3 + z4;
       
  1763     tmp11 = z3 - z4;
       
  1764 
       
  1765     z1 = (INT32) wsptr[2];
       
  1766     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
       
  1767     z1 <<= CONST_BITS;
       
  1768     z2 = (INT32) wsptr[6];
       
  1769     z2 <<= CONST_BITS;
       
  1770 
       
  1771     tmp12 = z1 - z2;
       
  1772 
       
  1773     tmp21 = z3 + tmp12;
       
  1774     tmp24 = z3 - tmp12;
       
  1775 
       
  1776     tmp12 = z4 + z2;
       
  1777 
       
  1778     tmp20 = tmp10 + tmp12;
       
  1779     tmp25 = tmp10 - tmp12;
       
  1780 
       
  1781     tmp12 = z4 - z1 - z2;
       
  1782 
       
  1783     tmp22 = tmp11 + tmp12;
       
  1784     tmp23 = tmp11 - tmp12;
       
  1785 
       
  1786     /* Odd part */
       
  1787 
       
  1788     z1 = (INT32) wsptr[1];
       
  1789     z2 = (INT32) wsptr[3];
       
  1790     z3 = (INT32) wsptr[5];
       
  1791     z4 = (INT32) wsptr[7];
       
  1792 
       
  1793     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
       
  1794     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
       
  1795 
       
  1796     tmp10 = z1 + z3;
       
  1797     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
       
  1798     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
       
  1799     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
       
  1800     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
       
  1801     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
       
  1802     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
       
  1803     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
       
  1804 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
       
  1805 
       
  1806     z1 -= z4;
       
  1807     z2 -= z3;
       
  1808     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
       
  1809     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
       
  1810     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
       
  1811 
       
  1812     /* Final output stage */
       
  1813 
       
  1814     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
       
  1815 					       CONST_BITS+PASS1_BITS+3)
       
  1816 			     & RANGE_MASK];
       
  1817     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
       
  1818 					       CONST_BITS+PASS1_BITS+3)
       
  1819 			     & RANGE_MASK];
       
  1820     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
       
  1821 					       CONST_BITS+PASS1_BITS+3)
       
  1822 			     & RANGE_MASK];
       
  1823     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
       
  1824 					       CONST_BITS+PASS1_BITS+3)
       
  1825 			     & RANGE_MASK];
       
  1826     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
       
  1827 					       CONST_BITS+PASS1_BITS+3)
       
  1828 			     & RANGE_MASK];
       
  1829     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
       
  1830 					       CONST_BITS+PASS1_BITS+3)
       
  1831 			     & RANGE_MASK];
       
  1832     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
       
  1833 					       CONST_BITS+PASS1_BITS+3)
       
  1834 			     & RANGE_MASK];
       
  1835     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
       
  1836 					       CONST_BITS+PASS1_BITS+3)
       
  1837 			     & RANGE_MASK];
       
  1838     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
       
  1839 					       CONST_BITS+PASS1_BITS+3)
       
  1840 			     & RANGE_MASK];
       
  1841     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
       
  1842 					       CONST_BITS+PASS1_BITS+3)
       
  1843 			     & RANGE_MASK];
       
  1844     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
       
  1845 					       CONST_BITS+PASS1_BITS+3)
       
  1846 			     & RANGE_MASK];
       
  1847     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
       
  1848 					       CONST_BITS+PASS1_BITS+3)
       
  1849 			     & RANGE_MASK];
       
  1850 
       
  1851     wsptr += 8;		/* advance pointer to next row */
       
  1852   }
       
  1853 }
       
  1854 
       
  1855 
       
  1856 /*
       
  1857  * Perform dequantization and inverse DCT on one block of coefficients,
       
  1858  * producing a 13x13 output block.
       
  1859  *
       
  1860  * Optimized algorithm with 29 multiplications in the 1-D kernel.
       
  1861  * cK represents sqrt(2) * cos(K*pi/26).
       
  1862  */
       
  1863 
       
  1864 GLOBAL(void)
       
  1865 jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  1866 		 JCOEFPTR coef_block,
       
  1867 		 JSAMPARRAY output_buf, JDIMENSION output_col)
       
  1868 {
       
  1869   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
       
  1870   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
       
  1871   INT32 z1, z2, z3, z4;
       
  1872   JCOEFPTR inptr;
       
  1873   ISLOW_MULT_TYPE * quantptr;
       
  1874   int * wsptr;
       
  1875   JSAMPROW outptr;
       
  1876   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  1877   int ctr;
       
  1878   int workspace[8*13];	/* buffers data between passes */
       
  1879   SHIFT_TEMPS
       
  1880 
       
  1881   /* Pass 1: process columns from input, store into work array. */
       
  1882 
       
  1883   inptr = coef_block;
       
  1884   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  1885   wsptr = workspace;
       
  1886   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  1887     /* Even part */
       
  1888 
       
  1889     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  1890     z1 <<= CONST_BITS;
       
  1891     /* Add fudge factor here for final descale. */
       
  1892     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  1893 
       
  1894     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  1895     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  1896     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  1897 
       
  1898     tmp10 = z3 + z4;
       
  1899     tmp11 = z3 - z4;
       
  1900 
       
  1901     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
       
  1902     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
       
  1903 
       
  1904     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
       
  1905     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
       
  1906 
       
  1907     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
       
  1908     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
       
  1909 
       
  1910     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
       
  1911     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
       
  1912 
       
  1913     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
       
  1914     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
       
  1915 
       
  1916     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
       
  1917     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
       
  1918 
       
  1919     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
       
  1920 
       
  1921     /* Odd part */
       
  1922 
       
  1923     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  1924     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  1925     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  1926     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
  1927 
       
  1928     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
       
  1929     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
       
  1930     tmp15 = z1 + z4;
       
  1931     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
       
  1932     tmp10 = tmp11 + tmp12 + tmp13 -
       
  1933 	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
       
  1934     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
       
  1935     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
       
  1936     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
       
  1937     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
       
  1938     tmp11 += tmp14;
       
  1939     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
       
  1940     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
       
  1941     tmp12 += tmp14;
       
  1942     tmp13 += tmp14;
       
  1943     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
       
  1944     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
       
  1945 	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
       
  1946     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
       
  1947     tmp14 += z1;
       
  1948     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
       
  1949 	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
       
  1950 
       
  1951     /* Final output stage */
       
  1952 
       
  1953     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
       
  1954     wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
       
  1955     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
       
  1956     wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
       
  1957     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
       
  1958     wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
       
  1959     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
       
  1960     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
       
  1961     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
       
  1962     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
       
  1963     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
       
  1964     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
       
  1965     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
       
  1966   }
       
  1967 
       
  1968   /* Pass 2: process 13 rows from work array, store into output array. */
       
  1969 
       
  1970   wsptr = workspace;
       
  1971   for (ctr = 0; ctr < 13; ctr++) {
       
  1972     outptr = output_buf[ctr] + output_col;
       
  1973 
       
  1974     /* Even part */
       
  1975 
       
  1976     /* Add fudge factor here for final descale. */
       
  1977     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  1978     z1 <<= CONST_BITS;
       
  1979 
       
  1980     z2 = (INT32) wsptr[2];
       
  1981     z3 = (INT32) wsptr[4];
       
  1982     z4 = (INT32) wsptr[6];
       
  1983 
       
  1984     tmp10 = z3 + z4;
       
  1985     tmp11 = z3 - z4;
       
  1986 
       
  1987     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
       
  1988     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
       
  1989 
       
  1990     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
       
  1991     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
       
  1992 
       
  1993     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
       
  1994     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
       
  1995 
       
  1996     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
       
  1997     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
       
  1998 
       
  1999     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
       
  2000     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
       
  2001 
       
  2002     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
       
  2003     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
       
  2004 
       
  2005     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
       
  2006 
       
  2007     /* Odd part */
       
  2008 
       
  2009     z1 = (INT32) wsptr[1];
       
  2010     z2 = (INT32) wsptr[3];
       
  2011     z3 = (INT32) wsptr[5];
       
  2012     z4 = (INT32) wsptr[7];
       
  2013 
       
  2014     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
       
  2015     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
       
  2016     tmp15 = z1 + z4;
       
  2017     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
       
  2018     tmp10 = tmp11 + tmp12 + tmp13 -
       
  2019 	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
       
  2020     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
       
  2021     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
       
  2022     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
       
  2023     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
       
  2024     tmp11 += tmp14;
       
  2025     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
       
  2026     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
       
  2027     tmp12 += tmp14;
       
  2028     tmp13 += tmp14;
       
  2029     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
       
  2030     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
       
  2031 	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
       
  2032     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
       
  2033     tmp14 += z1;
       
  2034     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
       
  2035 	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
       
  2036 
       
  2037     /* Final output stage */
       
  2038 
       
  2039     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
       
  2040 					       CONST_BITS+PASS1_BITS+3)
       
  2041 			     & RANGE_MASK];
       
  2042     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
       
  2043 					       CONST_BITS+PASS1_BITS+3)
       
  2044 			     & RANGE_MASK];
       
  2045     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
       
  2046 					       CONST_BITS+PASS1_BITS+3)
       
  2047 			     & RANGE_MASK];
       
  2048     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
       
  2049 					       CONST_BITS+PASS1_BITS+3)
       
  2050 			     & RANGE_MASK];
       
  2051     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
       
  2052 					       CONST_BITS+PASS1_BITS+3)
       
  2053 			     & RANGE_MASK];
       
  2054     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
       
  2055 					       CONST_BITS+PASS1_BITS+3)
       
  2056 			     & RANGE_MASK];
       
  2057     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
       
  2058 					       CONST_BITS+PASS1_BITS+3)
       
  2059 			     & RANGE_MASK];
       
  2060     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
       
  2061 					       CONST_BITS+PASS1_BITS+3)
       
  2062 			     & RANGE_MASK];
       
  2063     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
       
  2064 					       CONST_BITS+PASS1_BITS+3)
       
  2065 			     & RANGE_MASK];
       
  2066     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
       
  2067 					       CONST_BITS+PASS1_BITS+3)
       
  2068 			     & RANGE_MASK];
       
  2069     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
       
  2070 					       CONST_BITS+PASS1_BITS+3)
       
  2071 			     & RANGE_MASK];
       
  2072     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
       
  2073 					       CONST_BITS+PASS1_BITS+3)
       
  2074 			     & RANGE_MASK];
       
  2075     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
       
  2076 					       CONST_BITS+PASS1_BITS+3)
       
  2077 			     & RANGE_MASK];
       
  2078 
       
  2079     wsptr += 8;		/* advance pointer to next row */
       
  2080   }
       
  2081 }
       
  2082 
       
  2083 
       
  2084 /*
       
  2085  * Perform dequantization and inverse DCT on one block of coefficients,
       
  2086  * producing a 14x14 output block.
       
  2087  *
       
  2088  * Optimized algorithm with 20 multiplications in the 1-D kernel.
       
  2089  * cK represents sqrt(2) * cos(K*pi/28).
       
        *
       
        * Parameters:
       
        *   cinfo      - decompressor object; used here only as the argument of
       
        *                IDCT_range_limit() to obtain the output lookup table.
       
        *   compptr    - component info; compptr->dct_table is read as an array
       
        *                of ISLOW_MULT_TYPE, indexed the same way as coef_block.
       
        *   coef_block - input coefficients; for each of 8 columns the entries
       
        *                [DCTSIZE*0] .. [DCTSIZE*7] are read.
       
        *   output_buf - output rows; rows 0..13 are written.
       
        *   output_col - offset added to each row pointer; 14 consecutive
       
        *                samples are written starting there.
       
        *
       
        * Pass 1 turns each of the 8 input columns into 14 values (workspace is
       
        * laid out as 14 rows of 8 ints).  Pass 2 turns each 8-value workspace
       
        * row into 14 output samples.
       
  2090  */
       
  2091 
       
  2092 GLOBAL(void)
       
  2093 jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  2094 		 JCOEFPTR coef_block,
       
  2095 		 JSAMPARRAY output_buf, JDIMENSION output_col)
       
  2096 {
       
  2097   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
       
  2098   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
       
  2099   INT32 z1, z2, z3, z4;
       
  2100   JCOEFPTR inptr;
       
  2101   ISLOW_MULT_TYPE * quantptr;
       
  2102   int * wsptr;
       
  2103   JSAMPROW outptr;
       
  2104   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  2105   int ctr;
       
  2106   int workspace[8*14];	/* buffers data between passes */
       
  2107   SHIFT_TEMPS
       
  2108 
       
  2109   /* Pass 1: process columns from input, store into work array. */
       
  2110 
       
  2111   inptr = coef_block;
       
  2112   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  2113   wsptr = workspace;
       
         /* One iteration per input column; all three pointers step one column. */
       
  2114   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  2115     /* Even part */
       
  2116 
       
  2117     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  2118     z1 <<= CONST_BITS;
       
  2119     /* Add fudge factor here for final descale. */
       
           /* The constant is half of 1 << (CONST_BITS-PASS1_BITS), the amount the
       
            * RIGHT_SHIFT calls below shift by, so the descale presumably rounds.
       
            * z1 feeds tmp10, tmp11, tmp12 and tmp23, i.e. every output once.
       
            */
       
  2120     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  2121     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
           /* z4 must be overwritten last: z2 and z3 still need coefficient 4. */
       
  2122     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
       
  2123     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
       
  2124     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
       
  2125 
       
  2126     tmp10 = z1 + z2;
       
  2127     tmp11 = z1 + z3;
       
  2128     tmp12 = z1 - z4;
       
  2129 
       
           /* tmp23 is descaled right here, unlike the other even terms: its odd
       
            * partner tmp13 is built directly at PASS1_BITS scale further down,
       
            * and the two are combined without another shift.
       
            */
       
  2130     tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
       
  2131 			CONST_BITS-PASS1_BITS);
       
  2132 
       
  2133     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  2134     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  2135 
       
  2136     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
       
  2137 
       
  2138     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
       
  2139     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
       
  2140     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
       
  2141 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
       
  2142 
       
           /* tmp20..tmp26 are the even-part terms; tmp10..tmp16 are reused below
       
            * for the odd part.
       
            */
       
  2143     tmp20 = tmp10 + tmp13;
       
  2144     tmp26 = tmp10 - tmp13;
       
  2145     tmp21 = tmp11 + tmp14;
       
  2146     tmp25 = tmp11 - tmp14;
       
  2147     tmp22 = tmp12 + tmp15;
       
  2148     tmp24 = tmp12 - tmp15;
       
  2149 
       
  2150     /* Odd part */
       
  2151 
       
  2152     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  2153     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  2154     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  2155     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
           /* Coefficient 7 is only added or subtracted below, never multiplied;
       
            * tmp13 holds it shifted by CONST_BITS (presumably the scale of the
       
            * MULTIPLY results -- FIX is defined outside this chunk).
       
            */
       
  2156     tmp13 = z4 << CONST_BITS;
       
  2157 
       
  2158     tmp14 = z1 + z3;
       
  2159     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
       
  2160     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
       
  2161     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
       
  2162     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
       
  2163     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
       
           /* From here on z1 no longer holds coefficient 1; it accumulates
       
            * coef1 - coef3 and, two statements later, + coef7.
       
            */
       
  2164     z1    -= z2;
       
  2165     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
       
  2166     tmp16 += tmp15;
       
  2167     z1    += z4;
       
           /* z4 (coefficient 7) has now been consumed and becomes scratch. */
       
  2168     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
       
  2169     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
       
  2170     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
       
  2171     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
       
  2172     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
       
  2173     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
       
  2174 
       
           /* tmp13 is repurposed: coef1 - coef3 + coef7 - coef5, scaled by
       
            * PASS1_BITS only, so it is stored below without a RIGHT_SHIFT.
       
            */
       
  2175     tmp13 = (z1 - z3) << PASS1_BITS;
       
  2176 
       
  2177     /* Final output stage */
       
           /* Workspace rows k and 13-k are the sum and difference of even term
       
            * tmp2k and odd term tmp1k.
       
            */
       
  2178 
       
  2179     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
       
  2180     wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
       
  2181     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
       
  2182     wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
       
  2183     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
       
  2184     wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
       
  2185     wsptr[8*3]  = (int) (tmp23 + tmp13);
       
  2186     wsptr[8*10] = (int) (tmp23 - tmp13);
       
  2187     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
       
  2188     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
       
  2189     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
       
  2190     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
       
  2191     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
       
  2192     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
       
  2193   }
       
  2194 
       
  2195   /* Pass 2: process 14 rows from work array, store into output array. */
       
  2196 
       
  2197   wsptr = workspace;
       
  2198   for (ctr = 0; ctr < 14; ctr++) {
       
  2199     outptr = output_buf[ctr] + output_col;
       
  2200 
       
  2201     /* Even part */
       
  2202 
       
  2203     /* Add fudge factor here for final descale. */
       
           /* After the shift by CONST_BITS the added constant equals
       
            * 1 << (CONST_BITS+PASS1_BITS+2), half of the final shift amount's
       
            * power of two.
       
            */
       
  2204     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  2205     z1 <<= CONST_BITS;
       
  2206     z4 = (INT32) wsptr[4];
       
  2207     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
       
  2208     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
       
  2209     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
       
  2210 
       
  2211     tmp10 = z1 + z2;
       
  2212     tmp11 = z1 + z3;
       
  2213     tmp12 = z1 - z4;
       
  2214 
       
           /* Unlike pass 1, tmp23 is not pre-shifted here; every output of this
       
            * pass goes through the same RIGHT_SHIFT in the final stage.
       
            */
       
  2215     tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
       
  2216 
       
  2217     z1 = (INT32) wsptr[2];
       
  2218     z2 = (INT32) wsptr[6];
       
  2219 
       
  2220     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
       
  2221 
       
  2222     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
       
  2223     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
       
  2224     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
       
  2225 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
       
  2226 
       
  2227     tmp20 = tmp10 + tmp13;
       
  2228     tmp26 = tmp10 - tmp13;
       
  2229     tmp21 = tmp11 + tmp14;
       
  2230     tmp25 = tmp11 - tmp14;
       
  2231     tmp22 = tmp12 + tmp15;
       
  2232     tmp24 = tmp12 - tmp15;
       
  2233 
       
  2234     /* Odd part */
       
  2235 
       
  2236     z1 = (INT32) wsptr[1];
       
  2237     z2 = (INT32) wsptr[3];
       
  2238     z3 = (INT32) wsptr[5];
       
  2239     z4 = (INT32) wsptr[7];
       
           /* Here z4 itself keeps the shifted wsptr[7] for the rest of the row;
       
            * tmp13 serves as the scratch variable instead (pass 1 does the
       
            * opposite).
       
            */
       
  2240     z4 <<= CONST_BITS;
       
  2241 
       
  2242     tmp14 = z1 + z3;
       
  2243     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
       
  2244     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
       
  2245     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
       
  2246     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
       
  2247     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
       
           /* z1 becomes wsptr[1] - wsptr[3] from here on. */
       
  2248     z1    -= z2;
       
  2249     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
       
  2250     tmp16 += tmp15;
       
  2251     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
       
  2252     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
       
  2253     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
       
  2254     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
       
  2255     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
       
  2256     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
       
  2257 
       
           /* (wsptr[1] - wsptr[3] - wsptr[5] + wsptr[7]) scaled by CONST_BITS;
       
            * z4 already carries that shift.
       
            */
       
  2258     tmp13 = ((z1 - z3) << CONST_BITS) + z4;
       
  2259 
       
  2260     /* Final output stage */
       
           /* Samples k and 13-k are the sum and difference of tmp2k and tmp1k.
       
            * Each descaled value is masked with RANGE_MASK and used as an index
       
            * into range_limit (table contents are defined outside this chunk;
       
            * presumably it clamps to the valid sample range).
       
            */
       
  2261 
       
  2262     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
       
  2263 					       CONST_BITS+PASS1_BITS+3)
       
  2264 			     & RANGE_MASK];
       
  2265     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
       
  2266 					       CONST_BITS+PASS1_BITS+3)
       
  2267 			     & RANGE_MASK];
       
  2268     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
       
  2269 					       CONST_BITS+PASS1_BITS+3)
       
  2270 			     & RANGE_MASK];
       
  2271     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
       
  2272 					       CONST_BITS+PASS1_BITS+3)
       
  2273 			     & RANGE_MASK];
       
  2274     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
       
  2275 					       CONST_BITS+PASS1_BITS+3)
       
  2276 			     & RANGE_MASK];
       
  2277     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
       
  2278 					       CONST_BITS+PASS1_BITS+3)
       
  2279 			     & RANGE_MASK];
       
  2280     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
       
  2281 					       CONST_BITS+PASS1_BITS+3)
       
  2282 			     & RANGE_MASK];
       
  2283     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
       
  2284 					       CONST_BITS+PASS1_BITS+3)
       
  2285 			     & RANGE_MASK];
       
  2286     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
       
  2287 					       CONST_BITS+PASS1_BITS+3)
       
  2288 			     & RANGE_MASK];
       
  2289     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
       
  2290 					       CONST_BITS+PASS1_BITS+3)
       
  2291 			     & RANGE_MASK];
       
  2292     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
       
  2293 					       CONST_BITS+PASS1_BITS+3)
       
  2294 			     & RANGE_MASK];
       
  2295     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
       
  2296 					       CONST_BITS+PASS1_BITS+3)
       
  2297 			     & RANGE_MASK];
       
  2298     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
       
  2299 					       CONST_BITS+PASS1_BITS+3)
       
  2300 			     & RANGE_MASK];
       
  2301     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
       
  2302 					       CONST_BITS+PASS1_BITS+3)
       
  2303 			     & RANGE_MASK];
       
  2304 
       
  2305     wsptr += 8;		/* advance pointer to next row */
       
  2306   }
       
  2307 }
       
  2308 
       
  2309 
       
  2310 /*
       
  2311  * Perform dequantization and inverse DCT on one block of coefficients,
       
  2312  * producing a 15x15 output block.
       
  2313  *
       
  2314  * Optimized algorithm with 22 multiplications in the 1-D kernel.
       
  2315  * cK represents sqrt(2) * cos(K*pi/30).
       
        *
       
        * Parameters:
       
        *   cinfo      - decompressor object; used here only as the argument of
       
        *                IDCT_range_limit() to obtain the output lookup table.
       
        *   compptr    - component info; compptr->dct_table is read as an array
       
        *                of ISLOW_MULT_TYPE, indexed the same way as coef_block.
       
        *   coef_block - input coefficients; for each of 8 columns the entries
       
        *                [DCTSIZE*0] .. [DCTSIZE*7] are read.
       
        *   output_buf - output rows; rows 0..14 are written.
       
        *   output_col - offset added to each row pointer; 15 consecutive
       
        *                samples are written starting there.
       
        *
       
        * Pass 1 turns each of the 8 input columns into 15 values (workspace is
       
        * laid out as 15 rows of 8 ints).  Pass 2 turns each 8-value workspace
       
        * row into 15 output samples.  Because 15 is odd, the middle output
       
        * (index 7) comes from the even part alone (tmp27).
       
  2316  */
       
  2317 
       
  2318 GLOBAL(void)
       
  2319 jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  2320 		 JCOEFPTR coef_block,
       
  2321 		 JSAMPARRAY output_buf, JDIMENSION output_col)
       
  2322 {
       
  2323   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
       
  2324   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
       
  2325   INT32 z1, z2, z3, z4;
       
  2326   JCOEFPTR inptr;
       
  2327   ISLOW_MULT_TYPE * quantptr;
       
  2328   int * wsptr;
       
  2329   JSAMPROW outptr;
       
  2330   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  2331   int ctr;
       
  2332   int workspace[8*15];	/* buffers data between passes */
       
  2333   SHIFT_TEMPS
       
  2334 
       
  2335   /* Pass 1: process columns from input, store into work array. */
       
  2336 
       
  2337   inptr = coef_block;
       
  2338   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  2339   wsptr = workspace;
       
         /* One iteration per input column; all three pointers step one column. */
       
  2340   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  2341     /* Even part */
       
  2342 
       
  2343     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  2344     z1 <<= CONST_BITS;
       
  2345     /* Add fudge factor here for final descale. */
       
           /* The constant is half of 1 << (CONST_BITS-PASS1_BITS), the amount the
       
            * RIGHT_SHIFT calls below shift by, so the descale presumably rounds.
       
            * z1 feeds tmp12, tmp13 and the updated z1, i.e. every output once.
       
            */
       
  2346     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  2347 
       
  2348     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  2349     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  2350     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  2351 
       
  2352     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
       
  2353     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
       
  2354 
       
  2355     tmp12 = z1 - tmp10;
       
  2356     tmp13 = z1 + tmp11;
       
           /* z1 is updated only after tmp12/tmp13 captured its previous value;
       
            * the new z1 is the base of tmp22 and tmp27 below.
       
            */
       
  2357     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
       
  2358 
       
           /* Coefficient 6 is no longer needed: z4 becomes coef2 - coef4 and z3
       
            * becomes coef2 + coef4.  z2 stays coefficient 2 until it is replaced
       
            * by its own product three statements further down.
       
            */
       
  2359     z4 = z2 - z3;
       
  2360     z3 += z2;
       
  2361     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
       
  2362     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
       
  2363     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
       
  2364 
       
  2365     tmp20 = tmp13 + tmp10 + tmp11;
       
  2366     tmp23 = tmp12 - tmp10 + tmp11 + z2;
       
  2367 
       
           /* tmp10/tmp11 are recomputed for each pair of even terms. */
       
  2368     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
       
  2369     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
       
  2370 
       
  2371     tmp25 = tmp13 - tmp10 - tmp11;
       
  2372     tmp26 = tmp12 + tmp10 - tmp11 - z2;
       
  2373 
       
  2374     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
       
  2375     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
       
  2376 
       
  2377     tmp21 = tmp12 + tmp10 + tmp11;
       
  2378     tmp24 = tmp13 - tmp10 + tmp11;
       
           /* Double tmp11 in place; tmp27 then subtracts the doubled value twice. */
       
  2379     tmp11 += tmp11;
       
  2380     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
       
  2381     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
       
  2382 
       
  2383     /* Odd part */
       
  2384 
       
  2385     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  2386     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
           /* z4 briefly holds coefficient 5 (only needed as the product z3) and
       
            * is then reloaded with coefficient 7.
       
            */
       
  2387     z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  2388     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
       
  2389     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
  2390 
       
           /* tmp13 and tmp15 are scratch here (coef3 - coef7 and a shared
       
            * product); both are reassigned right after tmp11/tmp14 are formed.
       
            */
       
  2391     tmp13 = z2 - z4;
       
  2392     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
       
  2393     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
       
  2394     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
       
  2395 
       
  2396     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
       
  2397     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
       
           /* Coefficient 3 is fully consumed; z2 is reused as coef1 - coef7. */
       
  2398     z2 = z1 - z4;
       
  2399     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
       
  2400 
       
           /* tmp12 is a shared partial sum here; it gets its final value only
       
            * after tmp10 and tmp16 have used it.
       
            */
       
  2401     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
       
  2402     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
       
  2403     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
       
  2404     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
       
  2405     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
       
  2406     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
       
  2407 
       
  2408     /* Final output stage */
       
           /* Workspace rows k and 14-k are the sum and difference of even term
       
            * tmp2k and odd term tmp1k; row 7 is tmp27 alone.
       
            */
       
  2409 
       
  2410     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
       
  2411     wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
       
  2412     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
       
  2413     wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
       
  2414     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
       
  2415     wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
       
  2416     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
       
  2417     wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
       
  2418     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
       
  2419     wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
       
  2420     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
       
  2421     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
       
  2422     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
       
  2423     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
       
  2424     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
       
  2425   }
       
  2426 
       
  2427   /* Pass 2: process 15 rows from work array, store into output array. */
       
  2428 
       
  2429   wsptr = workspace;
       
  2430   for (ctr = 0; ctr < 15; ctr++) {
       
  2431     outptr = output_buf[ctr] + output_col;
       
  2432 
       
  2433     /* Even part */
       
  2434 
       
  2435     /* Add fudge factor here for final descale. */
       
           /* After the shift by CONST_BITS the added constant equals
       
            * 1 << (CONST_BITS+PASS1_BITS+2), half of the final shift amount's
       
            * power of two.
       
            */
       
  2436     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  2437     z1 <<= CONST_BITS;
       
  2438 
       
  2439     z2 = (INT32) wsptr[2];
       
  2440     z3 = (INT32) wsptr[4];
       
  2441     z4 = (INT32) wsptr[6];
       
  2442 
       
           /* From here the arithmetic mirrors pass 1 statement for statement,
       
            * with workspace values in place of dequantized coefficients.
       
            */
       
  2443     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
       
  2444     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
       
  2445 
       
  2446     tmp12 = z1 - tmp10;
       
  2447     tmp13 = z1 + tmp11;
       
  2448     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
       
  2449 
       
           /* z4 = wsptr[2] - wsptr[4], z3 = wsptr[2] + wsptr[4]. */
       
  2450     z4 = z2 - z3;
       
  2451     z3 += z2;
       
  2452     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
       
  2453     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
       
  2454     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
       
  2455 
       
  2456     tmp20 = tmp13 + tmp10 + tmp11;
       
  2457     tmp23 = tmp12 - tmp10 + tmp11 + z2;
       
  2458 
       
  2459     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
       
  2460     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
       
  2461 
       
  2462     tmp25 = tmp13 - tmp10 - tmp11;
       
  2463     tmp26 = tmp12 + tmp10 - tmp11 - z2;
       
  2464 
       
  2465     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
       
  2466     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
       
  2467 
       
  2468     tmp21 = tmp12 + tmp10 + tmp11;
       
  2469     tmp24 = tmp13 - tmp10 + tmp11;
       
           /* Double tmp11 in place; tmp27 then subtracts the doubled value twice. */
       
  2470     tmp11 += tmp11;
       
  2471     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
       
  2472     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
       
  2473 
       
  2474     /* Odd part */
       
  2475 
       
  2476     z1 = (INT32) wsptr[1];
       
  2477     z2 = (INT32) wsptr[3];
       
           /* z4 briefly holds wsptr[5] (only needed as the product z3) and is
       
            * then reloaded with wsptr[7].
       
            */
       
  2478     z4 = (INT32) wsptr[5];
       
  2479     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
       
  2480     z4 = (INT32) wsptr[7];
       
  2481 
       
  2482     tmp13 = z2 - z4;
       
  2483     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
       
  2484     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
       
  2485     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
       
  2486 
       
  2487     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
       
  2488     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
       
           /* wsptr[3] is fully consumed; z2 is reused as wsptr[1] - wsptr[7]. */
       
  2489     z2 = z1 - z4;
       
  2490     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
       
  2491 
       
  2492     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
       
  2493     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
       
  2494     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
       
  2495     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
       
  2496     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
       
  2497     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
       
  2498 
       
  2499     /* Final output stage */
       
           /* Samples k and 14-k are the sum and difference of tmp2k and tmp1k;
       
            * sample 7 is tmp27 alone.  Each descaled value is masked with
       
            * RANGE_MASK and used as an index into range_limit (table contents
       
            * are defined outside this chunk; presumably it clamps to the valid
       
            * sample range).
       
            */
       
  2500 
       
  2501     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
       
  2502 					       CONST_BITS+PASS1_BITS+3)
       
  2503 			     & RANGE_MASK];
       
  2504     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
       
  2505 					       CONST_BITS+PASS1_BITS+3)
       
  2506 			     & RANGE_MASK];
       
  2507     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
       
  2508 					       CONST_BITS+PASS1_BITS+3)
       
  2509 			     & RANGE_MASK];
       
  2510     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
       
  2511 					       CONST_BITS+PASS1_BITS+3)
       
  2512 			     & RANGE_MASK];
       
  2513     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
       
  2514 					       CONST_BITS+PASS1_BITS+3)
       
  2515 			     & RANGE_MASK];
       
  2516     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
       
  2517 					       CONST_BITS+PASS1_BITS+3)
       
  2518 			     & RANGE_MASK];
       
  2519     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
       
  2520 					       CONST_BITS+PASS1_BITS+3)
       
  2521 			     & RANGE_MASK];
       
  2522     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
       
  2523 					       CONST_BITS+PASS1_BITS+3)
       
  2524 			     & RANGE_MASK];
       
  2525     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
       
  2526 					       CONST_BITS+PASS1_BITS+3)
       
  2527 			     & RANGE_MASK];
       
  2528     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
       
  2529 					       CONST_BITS+PASS1_BITS+3)
       
  2530 			     & RANGE_MASK];
       
  2531     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
       
  2532 					       CONST_BITS+PASS1_BITS+3)
       
  2533 			     & RANGE_MASK];
       
  2534     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
       
  2535 					       CONST_BITS+PASS1_BITS+3)
       
  2536 			     & RANGE_MASK];
       
  2537     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
       
  2538 					       CONST_BITS+PASS1_BITS+3)
       
  2539 			     & RANGE_MASK];
       
  2540     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
       
  2541 					       CONST_BITS+PASS1_BITS+3)
       
  2542 			     & RANGE_MASK];
       
  2543     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
       
  2544 					       CONST_BITS+PASS1_BITS+3)
       
  2545 			     & RANGE_MASK];
       
  2546 
       
  2547     wsptr += 8;		/* advance pointer to next row */
       
  2548   }
       
  2549 }
       
  2550 
       
  2551 
       
  2552 /*
       
  2553  * Perform dequantization and inverse DCT on one block of coefficients,
       
  2554  * producing a 16x16 output block.
       
  2555  *
       
  2556  * Optimized algorithm with 28 multiplications in the 1-D kernel.
       
  2557  * cK represents sqrt(2) * cos(K*pi/32).
       
  2558  */
       
  2559 
       
  2560 GLOBAL(void)
       
  2561 jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  2562 		 JCOEFPTR coef_block,
       
  2563 		 JSAMPARRAY output_buf, JDIMENSION output_col)
       
  2564 {
       
  2565   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
       
  2566   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
       
  2567   INT32 z1, z2, z3, z4;
       
  2568   JCOEFPTR inptr;
       
  2569   ISLOW_MULT_TYPE * quantptr;
       
  2570   int * wsptr;
       
  2571   JSAMPROW outptr;
       
  2572   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  2573   int ctr;
       
  2574   int workspace[8*16];	/* buffers data between passes */
       
  2575   SHIFT_TEMPS
       
  2576 
       
  2577   /* Pass 1: process columns from input, store into work array. */
       
  2578 
       
  2579   inptr = coef_block;
       
  2580   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  2581   wsptr = workspace;
       
  2582   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  2583     /* Even part */
       
  2584 
       
  2585     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  2586     tmp0 <<= CONST_BITS;
       
  2587     /* Add fudge factor here for final descale. */
       
  2588     tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
       
  2589 
       
  2590     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  2591     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
       
  2592     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
       
  2593 
       
  2594     tmp10 = tmp0 + tmp1;
       
  2595     tmp11 = tmp0 - tmp1;
       
  2596     tmp12 = tmp0 + tmp2;
       
  2597     tmp13 = tmp0 - tmp2;
       
  2598 
       
  2599     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  2600     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  2601     z3 = z1 - z2;
       
  2602     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
       
  2603     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
       
  2604 
       
  2605     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
       
  2606     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
       
  2607     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
       
  2608     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
       
  2609 
       
  2610     tmp20 = tmp10 + tmp0;
       
  2611     tmp27 = tmp10 - tmp0;
       
  2612     tmp21 = tmp12 + tmp1;
       
  2613     tmp26 = tmp12 - tmp1;
       
  2614     tmp22 = tmp13 + tmp2;
       
  2615     tmp25 = tmp13 - tmp2;
       
  2616     tmp23 = tmp11 + tmp3;
       
  2617     tmp24 = tmp11 - tmp3;
       
  2618 
       
  2619     /* Odd part */
       
  2620 
       
  2621     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  2622     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  2623     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  2624     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
  2625 
       
  2626     tmp11 = z1 + z3;
       
  2627 
       
  2628     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
       
  2629     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
       
  2630     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
       
  2631     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
       
  2632     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
       
  2633     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
       
  2634     tmp0  = tmp1 + tmp2 + tmp3 -
       
  2635 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
       
  2636     tmp13 = tmp10 + tmp11 + tmp12 -
       
  2637 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
       
  2638     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
       
  2639     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
       
  2640     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
       
  2641     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
       
  2642     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
       
  2643     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
       
  2644     z2    += z4;
       
  2645     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
       
  2646     tmp1  += z1;
       
  2647     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
       
  2648     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
       
  2649     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
       
  2650     tmp12 += z2;
       
  2651     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
       
  2652     tmp2  += z2;
       
  2653     tmp3  += z2;
       
  2654     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
       
  2655     tmp10 += z2;
       
  2656     tmp11 += z2;
       
  2657 
       
  2658     /* Final output stage */
       
  2659 
       
  2660     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
       
  2661     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
       
  2662     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
       
  2663     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
       
  2664     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
       
  2665     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
       
  2666     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
       
  2667     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
       
  2668     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
       
  2669     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
       
  2670     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
       
  2671     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
       
  2672     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
       
  2673     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
       
  2674     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
       
  2675     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
       
  2676   }
       
  2677 
       
  2678   /* Pass 2: process 16 rows from work array, store into output array. */
       
  2679 
       
  2680   wsptr = workspace;
       
  2681   for (ctr = 0; ctr < 16; ctr++) {
       
  2682     outptr = output_buf[ctr] + output_col;
       
  2683 
       
  2684     /* Even part */
       
  2685 
       
  2686     /* Add fudge factor here for final descale. */
       
  2687     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  2688     tmp0 <<= CONST_BITS;
       
  2689 
       
  2690     z1 = (INT32) wsptr[4];
       
  2691     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
       
  2692     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
       
  2693 
       
  2694     tmp10 = tmp0 + tmp1;
       
  2695     tmp11 = tmp0 - tmp1;
       
  2696     tmp12 = tmp0 + tmp2;
       
  2697     tmp13 = tmp0 - tmp2;
       
  2698 
       
  2699     z1 = (INT32) wsptr[2];
       
  2700     z2 = (INT32) wsptr[6];
       
  2701     z3 = z1 - z2;
       
  2702     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
       
  2703     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
       
  2704 
       
  2705     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
       
  2706     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
       
  2707     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
       
  2708     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
       
  2709 
       
  2710     tmp20 = tmp10 + tmp0;
       
  2711     tmp27 = tmp10 - tmp0;
       
  2712     tmp21 = tmp12 + tmp1;
       
  2713     tmp26 = tmp12 - tmp1;
       
  2714     tmp22 = tmp13 + tmp2;
       
  2715     tmp25 = tmp13 - tmp2;
       
  2716     tmp23 = tmp11 + tmp3;
       
  2717     tmp24 = tmp11 - tmp3;
       
  2718 
       
  2719     /* Odd part */
       
  2720 
       
  2721     z1 = (INT32) wsptr[1];
       
  2722     z2 = (INT32) wsptr[3];
       
  2723     z3 = (INT32) wsptr[5];
       
  2724     z4 = (INT32) wsptr[7];
       
  2725 
       
  2726     tmp11 = z1 + z3;
       
  2727 
       
  2728     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
       
  2729     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
       
  2730     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
       
  2731     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
       
  2732     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
       
  2733     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
       
  2734     tmp0  = tmp1 + tmp2 + tmp3 -
       
  2735 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
       
  2736     tmp13 = tmp10 + tmp11 + tmp12 -
       
  2737 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
       
  2738     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
       
  2739     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
       
  2740     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
       
  2741     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
       
  2742     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
       
  2743     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
       
  2744     z2    += z4;
       
  2745     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
       
  2746     tmp1  += z1;
       
  2747     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
       
  2748     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
       
  2749     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
       
  2750     tmp12 += z2;
       
  2751     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
       
  2752     tmp2  += z2;
       
  2753     tmp3  += z2;
       
  2754     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
       
  2755     tmp10 += z2;
       
  2756     tmp11 += z2;
       
  2757 
       
  2758     /* Final output stage */
       
  2759 
       
  2760     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
       
  2761 					       CONST_BITS+PASS1_BITS+3)
       
  2762 			     & RANGE_MASK];
       
  2763     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
       
  2764 					       CONST_BITS+PASS1_BITS+3)
       
  2765 			     & RANGE_MASK];
       
  2766     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
       
  2767 					       CONST_BITS+PASS1_BITS+3)
       
  2768 			     & RANGE_MASK];
       
  2769     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
       
  2770 					       CONST_BITS+PASS1_BITS+3)
       
  2771 			     & RANGE_MASK];
       
  2772     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
       
  2773 					       CONST_BITS+PASS1_BITS+3)
       
  2774 			     & RANGE_MASK];
       
  2775     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
       
  2776 					       CONST_BITS+PASS1_BITS+3)
       
  2777 			     & RANGE_MASK];
       
  2778     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
       
  2779 					       CONST_BITS+PASS1_BITS+3)
       
  2780 			     & RANGE_MASK];
       
  2781     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
       
  2782 					       CONST_BITS+PASS1_BITS+3)
       
  2783 			     & RANGE_MASK];
       
  2784     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
       
  2785 					       CONST_BITS+PASS1_BITS+3)
       
  2786 			     & RANGE_MASK];
       
  2787     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
       
  2788 					       CONST_BITS+PASS1_BITS+3)
       
  2789 			     & RANGE_MASK];
       
  2790     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
       
  2791 					       CONST_BITS+PASS1_BITS+3)
       
  2792 			     & RANGE_MASK];
       
  2793     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
       
  2794 					       CONST_BITS+PASS1_BITS+3)
       
  2795 			     & RANGE_MASK];
       
  2796     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
       
  2797 					       CONST_BITS+PASS1_BITS+3)
       
  2798 			     & RANGE_MASK];
       
  2799     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
       
  2800 					       CONST_BITS+PASS1_BITS+3)
       
  2801 			     & RANGE_MASK];
       
  2802     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
       
  2803 					       CONST_BITS+PASS1_BITS+3)
       
  2804 			     & RANGE_MASK];
       
  2805     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
       
  2806 					       CONST_BITS+PASS1_BITS+3)
       
  2807 			     & RANGE_MASK];
       
  2808 
       
  2809     wsptr += 8;		/* advance pointer to next row */
       
  2810   }
       
  2811 }
       
  2812 
       
  2813 
       
  2814 /*
       
  2815  * Perform dequantization and inverse DCT on one block of coefficients,
       
  2816  * producing a 16x8 output block.
       
  2817  *
       
  2818  * 8-point IDCT in pass 1 (columns), 16-point in pass 2 (rows).
       
        *
        * Parameters, as used by the lines visible in this listing:
        *   cinfo      - handed to IDCT_range_limit() to obtain the range_limit
        *                lookup table through which every output sample is read.
        *   compptr    - NOTE(review): presumably supplies the dequantization
        *                table (compptr->dct_table, as in jpeg_idct_14x7 below);
        *                the assignment to quantptr is in lines not shown here.
        *   coef_block - NOTE(review): presumably the initial value of inptr;
        *                that assignment is also in lines not shown here.
        *   output_buf,
        *   output_col - for ctr = 0..7 the result row is written starting at
        *                output_buf[ctr] + output_col, 16 samples per row
        *                (outptr[0] .. outptr[15]).
        *
        * Data flow: pass 1 stores its results into workspace[8*8] with a
        * stride of DCTSIZE between the eight outputs of one column; pass 2
        * reads eight consecutive ints (wsptr[0..7]) per output row and then
        * advances wsptr by 8.
        *
        * Rounding: in the new revision both passes add their rounding term
        * up front and finish with a plain RIGHT_SHIFT.  Pass 1 adds
        * ONE << (CONST_BITS-PASS1_BITS-1) and shifts by CONST_BITS-PASS1_BITS;
        * pass 2 adds ONE << (PASS1_BITS+2) to wsptr[0] before scaling it by
        * 2**CONST_BITS and shifts by CONST_BITS+PASS1_BITS+3.
        *
        * NOTE(review): this text is a revision comparison, not plain source.
        * Lines are prefixed with old and/or new line numbers, old and new
        * versions of pass 1 are interleaved, and new-revision lines 2841-2878
        * are not shown.
  2819  */
       
  2820 
       
  2821 GLOBAL(void)
       
  2822 jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  2823 		JCOEFPTR coef_block,
       
  2824 		JSAMPARRAY output_buf, JDIMENSION output_col)
       
  2825 {
       
  2826   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
       
  2827   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
       
  2828   INT32 z1, z2, z3, z4;
       
  2829   JCOEFPTR inptr;
       
  2830   ISLOW_MULT_TYPE * quantptr;
       
  2831   int * wsptr;
       
  2832   JSAMPROW outptr;
       
  2833   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  2834   int ctr;
       
  2835   int workspace[8*8];	/* buffers data between passes */
   162   SHIFT_TEMPS
  2836   SHIFT_TEMPS
   163 
  2837 
   164   /* Pass 1: process columns from input, store into work array. */
  2838   /* Pass 1: process columns from input, store into work array. */
   165   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
  2839   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
   166   /* furthermore, we scale the results by 2**PASS1_BITS. */
  2840   /* furthermore, we scale the results by 2**PASS1_BITS. */
         /* NOTE(review): new-revision lines 2841-2878 are not shown in this
          * comparison listing; the pass 1 pointer setup and loop header
          * presumably live there. */
   205     
  2879     
   206     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
  2880     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   207     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
  2881     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   208     
  2882     
   209     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
  2883     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
   210     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
  2884     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
   211     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
  2885     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
   212     
  2886     
   213     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
  2887     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   214     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
  2888     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   215 
  2889     z2 <<= CONST_BITS;
   216     tmp0 = (z2 + z3) << CONST_BITS;
  2890     z3 <<= CONST_BITS;
   217     tmp1 = (z2 - z3) << CONST_BITS;
  2891     /* Add fudge factor here for final descale. */
       
  2892     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  2893 
       
  2894     tmp0 = z2 + z3;
       
  2895     tmp1 = z2 - z3;
   218     
  2896     
   219     tmp10 = tmp0 + tmp3;
  2897     tmp10 = tmp0 + tmp2;
   220     tmp13 = tmp0 - tmp3;
  2898     tmp13 = tmp0 - tmp2;
   221     tmp11 = tmp1 + tmp2;
  2899     tmp11 = tmp1 + tmp3;
   222     tmp12 = tmp1 - tmp2;
  2900     tmp12 = tmp1 - tmp3;
   223     
  2901     
   224     /* Odd part per figure 8; the matrix is unitary and hence its
  2902     /* Odd part per figure 8; the matrix is unitary and hence its
   225      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
  2903      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
   226      */
  2904      */
   227     
  2905     
   228     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
  2906     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   229     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
  2907     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   230     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
  2908     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   231     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
  2909     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   232     
  2910     
   233     z1 = tmp0 + tmp3;
  2911     z2 = tmp0 + tmp2;
   234     z2 = tmp1 + tmp2;
  2912     z3 = tmp1 + tmp3;
   235     z3 = tmp0 + tmp2;
  2913 
   236     z4 = tmp1 + tmp3;
  2914     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
   237     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
  2915     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
   238     
  2916     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
       
  2917     z2 += z1;
       
  2918     z3 += z1;
       
  2919 
       
  2920     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
   239     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
  2921     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
       
  2922     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
       
  2923     tmp0 += z1 + z2;
       
  2924     tmp3 += z1 + z3;
       
  2925 
       
  2926     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
   240     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
  2927     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
   241     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
  2928     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
   242     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
  2929     tmp1 += z1 + z3;
   243     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
  2930     tmp2 += z1 + z2;
   244     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
       
   245     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
       
   246     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
       
   247     
       
   248     z3 += z5;
       
   249     z4 += z5;
       
   250     
       
   251     tmp0 += z1 + z3;
       
   252     tmp1 += z2 + z4;
       
   253     tmp2 += z2 + z3;
       
   254     tmp3 += z1 + z4;
       
   255     
  2931     
   256     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
  2932     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
   257     
  2933     
           /* The new revision uses a plain RIGHT_SHIFT here; the rounding term
            * was already added to z2 in the even part above. */
   258     wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
  2934     wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
   259     wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
  2935     wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
   260     wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
  2936     wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
   261     wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
  2937     wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
   262     wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
  2938     wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
   263     wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
  2939     wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
   264     wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
  2940     wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
   265     wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
  2941     wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
   266     
  2942     
   267     inptr++;			/* advance pointers to next column */
  2943     inptr++;			/* advance pointers to next column */
   268     quantptr++;
  2944     quantptr++;
   269     wsptr++;
  2945     wsptr++;
   270   }
  2946   }
       
  2947 
       
  2948   /* Pass 2: process 8 rows from work array, store into output array.
       
  2949    * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
       
  2950    */
       
  2951   wsptr = workspace;
       
  2952   for (ctr = 0; ctr < 8; ctr++) {
       
  2953     outptr = output_buf[ctr] + output_col;
       
  2954 
       
  2955     /* Even part */
       
  2956 
       
  2957     /* Add fudge factor here for final descale. */
       
           /* After the shift by CONST_BITS below, the added term equals
            * 2**(CONST_BITS+PASS1_BITS+2), i.e. half of the divisor applied by
            * the final RIGHT_SHIFT(..., CONST_BITS+PASS1_BITS+3). */
  2958     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  2959     tmp0 <<= CONST_BITS;
       
  2960 
       
  2961     z1 = (INT32) wsptr[4];
       
  2962     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
       
  2963     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
       
  2964 
       
  2965     tmp10 = tmp0 + tmp1;
       
  2966     tmp11 = tmp0 - tmp1;
       
  2967     tmp12 = tmp0 + tmp2;
       
  2968     tmp13 = tmp0 - tmp2;
       
  2969 
       
  2970     z1 = (INT32) wsptr[2];
       
  2971     z2 = (INT32) wsptr[6];
       
  2972     z3 = z1 - z2;
       
  2973     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
       
  2974     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
       
  2975 
       
  2976     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
       
  2977     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
       
  2978     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
       
  2979     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
       
  2980 
       
  2981     tmp20 = tmp10 + tmp0;
       
  2982     tmp27 = tmp10 - tmp0;
       
  2983     tmp21 = tmp12 + tmp1;
       
  2984     tmp26 = tmp12 - tmp1;
       
  2985     tmp22 = tmp13 + tmp2;
       
  2986     tmp25 = tmp13 - tmp2;
       
  2987     tmp23 = tmp11 + tmp3;
       
  2988     tmp24 = tmp11 - tmp3;
       
  2989 
       
  2990     /* Odd part */
       
  2991 
       
  2992     z1 = (INT32) wsptr[1];
       
  2993     z2 = (INT32) wsptr[3];
       
  2994     z3 = (INT32) wsptr[5];
       
  2995     z4 = (INT32) wsptr[7];
       
  2996 
       
           /* tmp0..tmp3 and tmp10..tmp13 are reused for the odd part from here
            * on; the even-part results live on in tmp20..tmp27. */
  2997     tmp11 = z1 + z3;
       
  2998 
       
  2999     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
       
  3000     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
       
  3001     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
       
  3002     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
       
  3003     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
       
  3004     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
       
  3005     tmp0  = tmp1 + tmp2 + tmp3 -
       
  3006 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
       
  3007     tmp13 = tmp10 + tmp11 + tmp12 -
       
  3008 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
       
  3009     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
       
  3010     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
       
  3011     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
       
  3012     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
       
  3013     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
       
  3014     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
       
  3015     z2    += z4;
       
  3016     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
       
  3017     tmp1  += z1;
       
  3018     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
       
  3019     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
       
  3020     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
       
  3021     tmp12 += z2;
       
  3022     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
       
  3023     tmp2  += z2;
       
  3024     tmp3  += z2;
       
  3025     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
       
  3026     tmp10 += z2;
       
  3027     tmp11 += z2;
       
  3028 
       
  3029     /* Final output stage */
       
           /* Each even/odd pair yields two samples: the sum goes to column k and
            * the difference to column 15-k.  The shifted value is masked with
            * RANGE_MASK before it indexes range_limit. */
  3030 
       
  3031     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
       
  3032 					       CONST_BITS+PASS1_BITS+3)
       
  3033 			     & RANGE_MASK];
       
  3034     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
       
  3035 					       CONST_BITS+PASS1_BITS+3)
       
  3036 			     & RANGE_MASK];
       
  3037     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
       
  3038 					       CONST_BITS+PASS1_BITS+3)
       
  3039 			     & RANGE_MASK];
       
  3040     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
       
  3041 					       CONST_BITS+PASS1_BITS+3)
       
  3042 			     & RANGE_MASK];
       
  3043     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
       
  3044 					       CONST_BITS+PASS1_BITS+3)
       
  3045 			     & RANGE_MASK];
       
  3046     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
       
  3047 					       CONST_BITS+PASS1_BITS+3)
       
  3048 			     & RANGE_MASK];
       
  3049     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
       
  3050 					       CONST_BITS+PASS1_BITS+3)
       
  3051 			     & RANGE_MASK];
       
  3052     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
       
  3053 					       CONST_BITS+PASS1_BITS+3)
       
  3054 			     & RANGE_MASK];
       
  3055     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
       
  3056 					       CONST_BITS+PASS1_BITS+3)
       
  3057 			     & RANGE_MASK];
       
  3058     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
       
  3059 					       CONST_BITS+PASS1_BITS+3)
       
  3060 			     & RANGE_MASK];
       
  3061     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
       
  3062 					       CONST_BITS+PASS1_BITS+3)
       
  3063 			     & RANGE_MASK];
       
  3064     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
       
  3065 					       CONST_BITS+PASS1_BITS+3)
       
  3066 			     & RANGE_MASK];
       
  3067     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
       
  3068 					       CONST_BITS+PASS1_BITS+3)
       
  3069 			     & RANGE_MASK];
       
  3070     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
       
  3071 					       CONST_BITS+PASS1_BITS+3)
       
  3072 			     & RANGE_MASK];
       
  3073     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
       
  3074 					       CONST_BITS+PASS1_BITS+3)
       
  3075 			     & RANGE_MASK];
       
  3076     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
       
  3077 					       CONST_BITS+PASS1_BITS+3)
       
  3078 			     & RANGE_MASK];
       
  3079 
       
  3080     wsptr += 8;		/* advance pointer to next row */
       
  3081   }
       
  3082 }
       
  3083 
       
  3084 
       
  3085 /*
       
  3086  * Perform dequantization and inverse DCT on one block of coefficients,
       
  3087  * producing a 14x7 output block.
       
  3088  *
       
  3089  * 7-point IDCT in pass 1 (columns), 14-point in pass 2 (rows).
       
  3090  */
       
  3091 
       
  3092 GLOBAL(void)
       
  3093 jpeg_idct_14x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  3094 		JCOEFPTR coef_block,
       
  3095 		JSAMPARRAY output_buf, JDIMENSION output_col)
       
  3096 {
       
  3097   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
       
  3098   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
       
  3099   INT32 z1, z2, z3, z4;
       
  3100   JCOEFPTR inptr;
       
  3101   ISLOW_MULT_TYPE * quantptr;
       
  3102   int * wsptr;
       
  3103   JSAMPROW outptr;
       
  3104   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  3105   int ctr;
       
  3106   int workspace[8*7];	/* buffers data between passes */
       
  3107   SHIFT_TEMPS
       
  3108 
       
  3109   /* Pass 1: process columns from input, store into work array.
       
  3110    * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
       
  3111    */
       
  3112   inptr = coef_block;
       
  3113   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  3114   wsptr = workspace;
       
  3115   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  3116     /* Even part */
       
  3117 
       
  3118     tmp23 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  3119     tmp23 <<= CONST_BITS;
       
  3120     /* Add fudge factor here for final descale. */
       
  3121     tmp23 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  3122 
       
  3123     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  3124     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  3125     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  3126 
       
  3127     tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
       
  3128     tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
       
  3129     tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
       
  3130     tmp10 = z1 + z3;
       
  3131     z2 -= tmp10;
       
  3132     tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
       
  3133     tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
       
  3134     tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
       
  3135     tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
       
  3136 
       
  3137     /* Odd part */
       
  3138 
       
  3139     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  3140     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  3141     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  3142 
       
  3143     tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
       
  3144     tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
       
  3145     tmp10 = tmp11 - tmp12;
       
  3146     tmp11 += tmp12;
       
  3147     tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
       
  3148     tmp11 += tmp12;
       
  3149     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
       
  3150     tmp10 += z2;
       
  3151     tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
       
  3152 
       
  3153     /* Final output stage */
       
  3154 
       
  3155     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
       
  3156     wsptr[8*6] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
       
  3157     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
       
  3158     wsptr[8*5] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
       
  3159     wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
       
  3160     wsptr[8*4] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
       
  3161     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23, CONST_BITS-PASS1_BITS);
       
  3162   }
       
  3163 
       
  3164   /* Pass 2: process 7 rows from work array, store into output array.
       
  3165    * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
       
  3166    */
       
  3167   wsptr = workspace;
       
  3168   for (ctr = 0; ctr < 7; ctr++) {
       
  3169     outptr = output_buf[ctr] + output_col;
       
  3170 
       
  3171     /* Even part */
       
  3172 
       
  3173     /* Add fudge factor here for final descale. */
       
  3174     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  3175     z1 <<= CONST_BITS;
       
  3176     z4 = (INT32) wsptr[4];
       
  3177     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
       
  3178     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
       
  3179     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
       
  3180 
       
  3181     tmp10 = z1 + z2;
       
  3182     tmp11 = z1 + z3;
       
  3183     tmp12 = z1 - z4;
       
  3184 
       
  3185     tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
       
  3186 
       
  3187     z1 = (INT32) wsptr[2];
       
  3188     z2 = (INT32) wsptr[6];
       
  3189 
       
  3190     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
       
  3191 
       
  3192     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
       
  3193     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
       
  3194     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
       
  3195 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
       
  3196 
       
  3197     tmp20 = tmp10 + tmp13;
       
  3198     tmp26 = tmp10 - tmp13;
       
  3199     tmp21 = tmp11 + tmp14;
       
  3200     tmp25 = tmp11 - tmp14;
       
  3201     tmp22 = tmp12 + tmp15;
       
  3202     tmp24 = tmp12 - tmp15;
       
  3203 
       
  3204     /* Odd part */
       
  3205 
       
  3206     z1 = (INT32) wsptr[1];
       
  3207     z2 = (INT32) wsptr[3];
       
  3208     z3 = (INT32) wsptr[5];
       
  3209     z4 = (INT32) wsptr[7];
       
  3210     z4 <<= CONST_BITS;
       
  3211 
       
  3212     tmp14 = z1 + z3;
       
  3213     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
       
  3214     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
       
  3215     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
       
  3216     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
       
  3217     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
       
  3218     z1    -= z2;
       
  3219     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
       
  3220     tmp16 += tmp15;
       
  3221     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
       
  3222     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
       
  3223     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
       
  3224     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
       
  3225     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
       
  3226     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
       
  3227 
       
  3228     tmp13 = ((z1 - z3) << CONST_BITS) + z4;
       
  3229 
       
  3230     /* Final output stage */
       
  3231 
       
  3232     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
       
  3233 					       CONST_BITS+PASS1_BITS+3)
       
  3234 			     & RANGE_MASK];
       
  3235     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
       
  3236 					       CONST_BITS+PASS1_BITS+3)
       
  3237 			     & RANGE_MASK];
       
  3238     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
       
  3239 					       CONST_BITS+PASS1_BITS+3)
       
  3240 			     & RANGE_MASK];
       
  3241     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
       
  3242 					       CONST_BITS+PASS1_BITS+3)
       
  3243 			     & RANGE_MASK];
       
  3244     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
       
  3245 					       CONST_BITS+PASS1_BITS+3)
       
  3246 			     & RANGE_MASK];
       
  3247     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
       
  3248 					       CONST_BITS+PASS1_BITS+3)
       
  3249 			     & RANGE_MASK];
       
  3250     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
       
  3251 					       CONST_BITS+PASS1_BITS+3)
       
  3252 			     & RANGE_MASK];
       
  3253     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
       
  3254 					       CONST_BITS+PASS1_BITS+3)
       
  3255 			     & RANGE_MASK];
       
  3256     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
       
  3257 					       CONST_BITS+PASS1_BITS+3)
       
  3258 			     & RANGE_MASK];
       
  3259     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
       
  3260 					       CONST_BITS+PASS1_BITS+3)
       
  3261 			     & RANGE_MASK];
       
  3262     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
       
  3263 					       CONST_BITS+PASS1_BITS+3)
       
  3264 			     & RANGE_MASK];
       
  3265     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
       
  3266 					       CONST_BITS+PASS1_BITS+3)
       
  3267 			     & RANGE_MASK];
       
  3268     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
       
  3269 					       CONST_BITS+PASS1_BITS+3)
       
  3270 			     & RANGE_MASK];
       
  3271     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
       
  3272 					       CONST_BITS+PASS1_BITS+3)
       
  3273 			     & RANGE_MASK];
       
  3274 
       
  3275     wsptr += 8;		/* advance pointer to next row */
       
  3276   }
       
  3277 }
       
  3278 
       
  3279 
       
  3280 /*
       
  3281  * Perform dequantization and inverse DCT on one block of coefficients,
       
  3282  * producing a 12x6 output block.
       
  3283  *
       
  3284  * 6-point IDCT in pass 1 (columns), 12-point in pass 2 (rows).
       
        *
       
        * cinfo is used only to obtain the range-limit table; compptr->dct_table
       
        * supplies the multipliers handed to DEQUANTIZE.  Pass 1 reads
       
        * inptr[DCTSIZE*0] .. inptr[DCTSIZE*5] for each of 8 columns of coef_block
       
        * and stores 6 intermediate rows of 8 values in a local workspace.
       
        * Pass 2 expands each workspace row to 12 samples, which are written to
       
        * output_buf[0..5] starting at offset output_col.
       
  3285  */
       
  3286 
       
  3287 GLOBAL(void)
       
  3288 jpeg_idct_12x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  3289 		JCOEFPTR coef_block,
       
  3290 		JSAMPARRAY output_buf, JDIMENSION output_col)
       
  3291 {
       
  3292   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
       
  3293   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
       
  3294   INT32 z1, z2, z3, z4;
       
  3295   JCOEFPTR inptr;
       
  3296   ISLOW_MULT_TYPE * quantptr;
       
  3297   int * wsptr;
       
  3298   JSAMPROW outptr;
       
  3299   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  3300   int ctr;
       
  3301   int workspace[8*6];	/* buffers data between passes: 6 rows of 8 values */
       
  3302   SHIFT_TEMPS
       
  3303 
       
  3304   /* Pass 1: process columns from input, store into work array.
       
  3305    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
       
  3306    */
       
  3307   inptr = coef_block;
       
  3308   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  3309   wsptr = workspace;
       
         /* One iteration per column; all three pointers step to the next column. */
       
  3310   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  3311     /* Even part */
       
  3312 
       
  3313     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  3314     tmp10 <<= CONST_BITS;
       
  3315     /* Add fudge factor here for final descale. */
       
           /* The bias is half of the pass 1 divisor 2**(CONST_BITS-PASS1_BITS),
       
            * so the descaling shifts below round instead of truncating
       
            * (assuming RIGHT_SHIFT is an arithmetic right shift).
       
            */
       
  3316     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  3317     tmp12 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  3318     tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
       
  3319     tmp11 = tmp10 + tmp20;
       
           /* tmp21 is descaled immediately; its odd partner tmp11 (below) is only
       
            * scaled by PASS1_BITS, so outputs 1 and 4 need no further shift.
       
            */
       
  3320     tmp21 = RIGHT_SHIFT(tmp10 - tmp20 - tmp20, CONST_BITS-PASS1_BITS);
       
  3321     tmp20 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  3322     tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
       
  3323     tmp20 = tmp11 + tmp10;
       
  3324     tmp22 = tmp11 - tmp10;
       
  3325 
       
  3326     /* Odd part */
       
  3327 
       
  3328     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  3329     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  3330     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  3331     tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
       
  3332     tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
       
  3333     tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
       
  3334     tmp11 = (z1 - z2 - z3) << PASS1_BITS;
       
  3335 
       
  3336     /* Final output stage */
       
           /* Workspace rows k and 5-k receive even+odd and even-odd respectively. */
       
  3337 
       
  3338     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
       
  3339     wsptr[8*5] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
       
  3340     wsptr[8*1] = (int) (tmp21 + tmp11);
       
  3341     wsptr[8*4] = (int) (tmp21 - tmp11);
       
  3342     wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
       
  3343     wsptr[8*3] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
       
  3344   }
       
  3345 
       
  3346   /* Pass 2: process 6 rows from work array, store into output array.
       
  3347    * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
       
  3348    */
       
  3349   wsptr = workspace;
       
  3350   for (ctr = 0; ctr < 6; ctr++) {
       
  3351     outptr = output_buf[ctr] + output_col;
       
  3352 
       
  3353     /* Even part */
       
  3354 
       
  3355     /* Add fudge factor here for final descale. */
       
           /* After the CONST_BITS scaling on the next line the bias equals half
       
            * of the final divisor 2**(CONST_BITS+PASS1_BITS+3).
       
            */
       
  3356     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  3357     z3 <<= CONST_BITS;
       
  3358 
       
  3359     z4 = (INT32) wsptr[4];
       
  3360     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
       
  3361 
       
  3362     tmp10 = z3 + z4;
       
  3363     tmp11 = z3 - z4;
       
  3364 
       
  3365     z1 = (INT32) wsptr[2];
       
  3366     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
       
  3367     z1 <<= CONST_BITS;
       
  3368     z2 = (INT32) wsptr[6];
       
  3369     z2 <<= CONST_BITS;
       
  3370 
       
           /* tmp12 is a scratch value that is overwritten three times below;
       
            * each value feeds one pair of even-part results.
       
            */
       
  3371     tmp12 = z1 - z2;
       
  3372 
       
  3373     tmp21 = z3 + tmp12;
       
  3374     tmp24 = z3 - tmp12;
       
  3375 
       
  3376     tmp12 = z4 + z2;
       
  3377 
       
  3378     tmp20 = tmp10 + tmp12;
       
  3379     tmp25 = tmp10 - tmp12;
       
  3380 
       
  3381     tmp12 = z4 - z1 - z2;
       
  3382 
       
  3383     tmp22 = tmp11 + tmp12;
       
  3384     tmp23 = tmp11 - tmp12;
       
  3385 
       
  3386     /* Odd part */
       
  3387 
       
  3388     z1 = (INT32) wsptr[1];
       
  3389     z2 = (INT32) wsptr[3];
       
  3390     z3 = (INT32) wsptr[5];
       
  3391     z4 = (INT32) wsptr[7];
       
  3392 
       
  3393     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
       
  3394     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
       
  3395 
       
  3396     tmp10 = z1 + z3;
       
  3397     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
       
  3398     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
       
  3399     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
       
  3400     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
       
  3401     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
       
  3402     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
       
  3403     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
       
  3404 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
       
  3405 
       
           /* tmp10, tmp12, tmp13 and tmp15 are final here.  z1..z3 become scratch
       
            * and tmp11/tmp14 are recomputed from the differences below.
       
            */
       
  3406     z1 -= z4;
       
  3407     z2 -= z3;
       
  3408     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
       
  3409     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
       
  3410     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
       
  3411 
       
  3412     /* Final output stage */
       
           /* Samples k and 11-k are even+odd and even-odd of the pair
       
            * (tmp2k, tmp1k); each is descaled, masked with RANGE_MASK and looked
       
            * up in range_limit.
       
            */
       
  3413 
       
  3414     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
       
  3415 					       CONST_BITS+PASS1_BITS+3)
       
  3416 			     & RANGE_MASK];
       
  3417     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
       
  3418 					       CONST_BITS+PASS1_BITS+3)
       
  3419 			     & RANGE_MASK];
       
  3420     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
       
  3421 					       CONST_BITS+PASS1_BITS+3)
       
  3422 			     & RANGE_MASK];
       
  3423     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
       
  3424 					       CONST_BITS+PASS1_BITS+3)
       
  3425 			     & RANGE_MASK];
       
  3426     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
       
  3427 					       CONST_BITS+PASS1_BITS+3)
       
  3428 			     & RANGE_MASK];
       
  3429     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
       
  3430 					       CONST_BITS+PASS1_BITS+3)
       
  3431 			     & RANGE_MASK];
       
  3432     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
       
  3433 					       CONST_BITS+PASS1_BITS+3)
       
  3434 			     & RANGE_MASK];
       
  3435     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
       
  3436 					       CONST_BITS+PASS1_BITS+3)
       
  3437 			     & RANGE_MASK];
       
  3438     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
       
  3439 					       CONST_BITS+PASS1_BITS+3)
       
  3440 			     & RANGE_MASK];
       
  3441     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
       
  3442 					       CONST_BITS+PASS1_BITS+3)
       
  3443 			     & RANGE_MASK];
       
  3444     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
       
  3445 					       CONST_BITS+PASS1_BITS+3)
       
  3446 			     & RANGE_MASK];
       
  3447     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
       
  3448 					       CONST_BITS+PASS1_BITS+3)
       
  3449 			     & RANGE_MASK];
       
  3450 
       
  3451     wsptr += 8;		/* advance pointer to next row */
       
  3452   }
       
  3453 }
       
  3454 
       
  3455 
       
  3456 /*
       
  3457  * Perform dequantization and inverse DCT on one block of coefficients,
       
  3458  * producing a 10x5 output block.
       
  3459  *
       
  3460  * 5-point IDCT in pass 1 (columns), 10-point in pass 2 (rows).
       
        *
       
        * cinfo is used only to obtain the range-limit table; compptr->dct_table
       
        * supplies the multipliers handed to DEQUANTIZE.  Pass 1 reads
       
        * inptr[DCTSIZE*0] .. inptr[DCTSIZE*4] for each of 8 columns of coef_block
       
        * and stores 5 intermediate rows of 8 values in a local workspace.
       
        * Pass 2 expands each workspace row to 10 samples, which are written to
       
        * output_buf[0..4] starting at offset output_col.
       
  3461  */
       
  3462 
       
  3463 GLOBAL(void)
       
  3464 jpeg_idct_10x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  3465 		JCOEFPTR coef_block,
       
  3466 		JSAMPARRAY output_buf, JDIMENSION output_col)
       
  3467 {
       
  3468   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
       
  3469   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
       
  3470   INT32 z1, z2, z3, z4;
       
  3471   JCOEFPTR inptr;
       
  3472   ISLOW_MULT_TYPE * quantptr;
       
  3473   int * wsptr;
       
  3474   JSAMPROW outptr;
       
  3475   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  3476   int ctr;
       
  3477   int workspace[8*5];	/* buffers data between passes: 5 rows of 8 values */
       
  3478   SHIFT_TEMPS
       
  3479 
       
  3480   /* Pass 1: process columns from input, store into work array.
       
  3481    * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
       
  3482    */
       
  3483   inptr = coef_block;
       
  3484   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  3485   wsptr = workspace;
       
         /* One iteration per column; all three pointers step to the next column. */
       
  3486   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  3487     /* Even part */
       
  3488 
       
  3489     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  3490     tmp12 <<= CONST_BITS;
       
  3491     /* Add fudge factor here for final descale. */
       
           /* The bias is half of the pass 1 divisor 2**(CONST_BITS-PASS1_BITS),
       
            * so the descaling shifts below round instead of truncating
       
            * (assuming RIGHT_SHIFT is an arithmetic right shift).
       
            */
       
  3492     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  3493     tmp13 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  3494     tmp14 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  3495     z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
       
  3496     z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
       
  3497     z3 = tmp12 + z2;
       
  3498     tmp10 = z3 + z1;
       
  3499     tmp11 = z3 - z1;
       
           /* tmp12 becomes the middle row (wsptr[8*2]), which gets no odd term. */
       
  3500     tmp12 -= z2 << 2;
       
  3501 
       
  3502     /* Odd part */
       
  3503 
       
  3504     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  3505     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  3506 
       
  3507     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
       
  3508     tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
       
  3509     tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
       
  3510 
       
  3511     /* Final output stage */
       
           /* Workspace rows k and 4-k receive even+odd and even-odd respectively. */
       
  3512 
       
  3513     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp13, CONST_BITS-PASS1_BITS);
       
  3514     wsptr[8*4] = (int) RIGHT_SHIFT(tmp10 - tmp13, CONST_BITS-PASS1_BITS);
       
  3515     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp14, CONST_BITS-PASS1_BITS);
       
  3516     wsptr[8*3] = (int) RIGHT_SHIFT(tmp11 - tmp14, CONST_BITS-PASS1_BITS);
       
  3517     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
       
  3518   }
       
  3519 
       
  3520   /* Pass 2: process 5 rows from work array, store into output array.
       
  3521    * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
       
  3522    */
       
  3523   wsptr = workspace;
       
  3524   for (ctr = 0; ctr < 5; ctr++) {
       
  3525     outptr = output_buf[ctr] + output_col;
       
  3526 
       
  3527     /* Even part */
       
  3528 
       
  3529     /* Add fudge factor here for final descale. */
       
           /* After the CONST_BITS scaling on the next line the bias equals half
       
            * of the final divisor 2**(CONST_BITS+PASS1_BITS+3).
       
            */
       
  3530     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  3531     z3 <<= CONST_BITS;
       
  3532     z4 = (INT32) wsptr[4];
       
  3533     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
       
  3534     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
       
  3535     tmp10 = z3 + z1;
       
  3536     tmp11 = z3 - z2;
       
  3537 
       
  3538     tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
       
  3539 
       
  3540     z2 = (INT32) wsptr[2];
       
  3541     z3 = (INT32) wsptr[6];
       
  3542 
       
  3543     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
       
  3544     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
       
  3545     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
       
  3546 
       
  3547     tmp20 = tmp10 + tmp12;
       
  3548     tmp24 = tmp10 - tmp12;
       
  3549     tmp21 = tmp11 + tmp13;
       
  3550     tmp23 = tmp11 - tmp13;
       
  3551 
       
  3552     /* Odd part */
       
           /* tmp10..tmp14 are reused from here on for the odd-part results. */
       
  3553 
       
  3554     z1 = (INT32) wsptr[1];
       
  3555     z2 = (INT32) wsptr[3];
       
  3556     z3 = (INT32) wsptr[5];
       
  3557     z3 <<= CONST_BITS;
       
  3558     z4 = (INT32) wsptr[7];
       
  3559 
       
  3560     tmp11 = z2 + z4;
       
  3561     tmp13 = z2 - z4;
       
  3562 
       
  3563     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
       
  3564 
       
  3565     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
       
  3566     z4 = z3 + tmp12;
       
  3567 
       
  3568     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
       
  3569     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
       
  3570 
       
  3571     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
       
  3572     z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
       
  3573 
       
           /* tmp12 must be rewritten before tmp13: it still needs the old
       
            * tmp13 (z2 - z4 of the inputs), which is overwritten two lines below.
       
            */
       
  3574     tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
       
  3575 
       
  3576     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
       
  3577     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
       
  3578 
       
  3579     /* Final output stage */
       
           /* Samples k and 9-k are even+odd and even-odd of the pair
       
            * (tmp2k, tmp1k); each is descaled, masked with RANGE_MASK and looked
       
            * up in range_limit.
       
            */
       
  3580 
       
  3581     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
       
  3582 					      CONST_BITS+PASS1_BITS+3)
       
  3583 			    & RANGE_MASK];
       
  3584     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
       
  3585 					      CONST_BITS+PASS1_BITS+3)
       
  3586 			    & RANGE_MASK];
       
  3587     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
       
  3588 					      CONST_BITS+PASS1_BITS+3)
       
  3589 			    & RANGE_MASK];
       
  3590     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
       
  3591 					      CONST_BITS+PASS1_BITS+3)
       
  3592 			    & RANGE_MASK];
       
  3593     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
       
  3594 					      CONST_BITS+PASS1_BITS+3)
       
  3595 			    & RANGE_MASK];
       
  3596     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
       
  3597 					      CONST_BITS+PASS1_BITS+3)
       
  3598 			    & RANGE_MASK];
       
  3599     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
       
  3600 					      CONST_BITS+PASS1_BITS+3)
       
  3601 			    & RANGE_MASK];
       
  3602     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
       
  3603 					      CONST_BITS+PASS1_BITS+3)
       
  3604 			    & RANGE_MASK];
       
  3605     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
       
  3606 					      CONST_BITS+PASS1_BITS+3)
       
  3607 			    & RANGE_MASK];
       
  3608     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
       
  3609 					      CONST_BITS+PASS1_BITS+3)
       
  3610 			    & RANGE_MASK];
       
  3611 
       
  3612     wsptr += 8;		/* advance pointer to next row */
       
  3613   }
       
  3614 }
       
  3615 
       
  3616 
       
  3617 /*
       
  3618  * Perform dequantization and inverse DCT on one block of coefficients,
       
  3619  * producing an 8x4 output block.
       
  3620  *
       
  3621  * 4-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
       
        *
       
        * cinfo is used only to obtain the range-limit table; compptr->dct_table
       
        * supplies the multipliers handed to DEQUANTIZE.  Pass 1 reads
       
        * inptr[DCTSIZE*0] .. inptr[DCTSIZE*3] for each of 8 columns of coef_block
       
        * and stores 4 intermediate rows of 8 values in a local workspace.
       
        * Pass 2 transforms each workspace row into 8 samples, which are written
       
        * to output_buf[0..3] starting at offset output_col.
       
  3622  */
       
  3623 
       
  3624 GLOBAL(void)
       
  3625 jpeg_idct_8x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  3626 	       JCOEFPTR coef_block,
       
  3627 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
  3628 {
       
  3629   INT32 tmp0, tmp1, tmp2, tmp3;
       
  3630   INT32 tmp10, tmp11, tmp12, tmp13;
       
  3631   INT32 z1, z2, z3;
       
  3632   JCOEFPTR inptr;
       
  3633   ISLOW_MULT_TYPE * quantptr;
       
  3634   int * wsptr;
       
  3635   JSAMPROW outptr;
       
  3636   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  3637   int ctr;
       
  3638   int workspace[8*4];	/* buffers data between passes: 4 rows of 8 values */
       
  3639   SHIFT_TEMPS
       
  3640 
       
  3641   /* Pass 1: process columns from input, store into work array.
       
  3642    * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
       
  3643    */
       
  3644   inptr = coef_block;
       
  3645   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  3646   wsptr = workspace;
       
         /* One iteration per column; all three pointers step to the next column. */
       
  3647   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  3648     /* Even part */
       
           /* No multiplication is needed here: the sum and difference are simply
       
            * scaled up by PASS1_BITS.
       
            */
       
  3649 
       
  3650     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  3651     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  3652 
       
  3653     tmp10 = (tmp0 + tmp2) << PASS1_BITS;
       
  3654     tmp12 = (tmp0 - tmp2) << PASS1_BITS;
       
  3655 
       
  3656     /* Odd part */
       
  3657     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
       
  3658 
       
  3659     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  3660     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  3661 
       
  3662     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
       
  3663     /* Add fudge factor here for final descale. */
       
           /* Only the odd part carries CONST_BITS scaling, so the rounding bias
       
            * (half of 2**(CONST_BITS-PASS1_BITS)) is added to z1 and the odd
       
            * terms are descaled before being combined with the even part.
       
            */
       
  3664     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  3665     tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
       
  3666 		       CONST_BITS-PASS1_BITS);
       
  3667     tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
       
  3668 		       CONST_BITS-PASS1_BITS);
       
  3669 
       
  3670     /* Final output stage */
       
           /* Workspace rows k and 3-k receive even+odd and even-odd respectively. */
       
  3671 
       
  3672     wsptr[8*0] = (int) (tmp10 + tmp0);
       
  3673     wsptr[8*3] = (int) (tmp10 - tmp0);
       
  3674     wsptr[8*1] = (int) (tmp12 + tmp2);
       
  3675     wsptr[8*2] = (int) (tmp12 - tmp2);
       
  3676   }
       
  3677 
       
  3678   /* Pass 2: process rows from work array, store into output array. */
       
  3679   /* Note that we must descale the results by a factor of 8 == 2**3, */
       
  3680   /* and also undo the PASS1_BITS scaling. */
       
  3681 
       
  3682   wsptr = workspace;
       
  3683   for (ctr = 0; ctr < 4; ctr++) {
       
  3684     outptr = output_buf[ctr] + output_col;
       
  3685 
       
  3686     /* Even part: reverse the even part of the forward DCT. */
       
  3687     /* The rotator is sqrt(2)*c(-6). */
       
  3688 
       
  3689     z2 = (INT32) wsptr[2];
       
  3690     z3 = (INT32) wsptr[6];
       
  3691     
       
  3692     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
       
  3693     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
       
  3694     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
       
  3695     
       
  3696     /* Add fudge factor here for final descale. */
       
           /* After the CONST_BITS scaling below the bias equals half of the
       
            * final divisor 2**(CONST_BITS+PASS1_BITS+3).
       
            */
       
  3697     z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  3698     z3 = (INT32) wsptr[4];
       
  3699     
       
  3700     tmp0 = (z2 + z3) << CONST_BITS;
       
  3701     tmp1 = (z2 - z3) << CONST_BITS;
       
  3702     
       
  3703     tmp10 = tmp0 + tmp2;
       
  3704     tmp13 = tmp0 - tmp2;
       
  3705     tmp11 = tmp1 + tmp3;
       
  3706     tmp12 = tmp1 - tmp3;
       
  3707 
       
  3708     /* Odd part per figure 8; the matrix is unitary and hence its
       
  3709      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
       
  3710      */
       
           /* tmp0..tmp3 are reused here: they first hold the odd inputs and end
       
            * up holding the four odd-part results.
       
            */
       
  3711 
       
  3712     tmp0 = (INT32) wsptr[7];
       
  3713     tmp1 = (INT32) wsptr[5];
       
  3714     tmp2 = (INT32) wsptr[3];
       
  3715     tmp3 = (INT32) wsptr[1];
       
  3716 
       
  3717     z2 = tmp0 + tmp2;
       
  3718     z3 = tmp1 + tmp3;
       
  3719 
       
  3720     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
       
  3721     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
       
  3722     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
       
  3723     z2 += z1;
       
  3724     z3 += z1;
       
  3725 
       
  3726     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
       
  3727     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
       
  3728     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
       
  3729     tmp0 += z1 + z2;
       
  3730     tmp3 += z1 + z3;
       
  3731 
       
  3732     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
       
  3733     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
       
  3734     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
       
  3735     tmp1 += z1 + z3;
       
  3736     tmp2 += z1 + z2;
       
  3737 
       
  3738     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
       
           /* Samples k and 7-k are even+odd and even-odd of the pair
       
            * (tmp1k, tmp(3-k)); each is descaled, masked with RANGE_MASK and
       
            * looked up in range_limit.
       
            */
       
  3739 
       
  3740     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
       
  3741 					      CONST_BITS+PASS1_BITS+3)
       
  3742 			    & RANGE_MASK];
       
  3743     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
       
  3744 					      CONST_BITS+PASS1_BITS+3)
       
  3745 			    & RANGE_MASK];
       
  3746     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
       
  3747 					      CONST_BITS+PASS1_BITS+3)
       
  3748 			    & RANGE_MASK];
       
  3749     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
       
  3750 					      CONST_BITS+PASS1_BITS+3)
       
  3751 			    & RANGE_MASK];
       
  3752     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
       
  3753 					      CONST_BITS+PASS1_BITS+3)
       
  3754 			    & RANGE_MASK];
       
  3755     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
       
  3756 					      CONST_BITS+PASS1_BITS+3)
       
  3757 			    & RANGE_MASK];
       
  3758     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
       
  3759 					      CONST_BITS+PASS1_BITS+3)
       
  3760 			    & RANGE_MASK];
       
  3761     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
       
  3762 					      CONST_BITS+PASS1_BITS+3)
       
  3763 			    & RANGE_MASK];
       
  3764 
       
           /* NOTE(review): pass 1 stores rows with a literal stride of 8 while
       
            * this advance uses DCTSIZE; the two agree only if DCTSIZE == 8,
       
            * which is not visible in this function -- confirm.
       
            */
       
  3765     wsptr += DCTSIZE;		/* advance pointer to next row */
       
  3766   }
       
  3767 }
       
  3768 
       
  3769 
       
  3770 /*
       
  3771  * Perform dequantization and inverse DCT on one block of coefficients,
       
  3772  * producing a reduced-size 6x3 output block.
       
  3773  *
       
  3774  * 3-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
       
        *
       
        * cinfo is used only to obtain the range-limit table; compptr->dct_table
       
        * supplies the multipliers handed to DEQUANTIZE.  Pass 1 reads
       
        * inptr[DCTSIZE*0] .. inptr[DCTSIZE*2] for the first 6 columns of
       
        * coef_block and stores 3 intermediate rows of 6 values in a local
       
        * workspace.  Pass 2 transforms each workspace row into 6 samples, which
       
        * are written to output_buf[0..2] starting at offset output_col.
       
  3775  */
       
  3776 
       
  3777 GLOBAL(void)
       
  3778 jpeg_idct_6x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  3779 	       JCOEFPTR coef_block,
       
  3780 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
  3781 {
       
  3782   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
       
  3783   INT32 z1, z2, z3;
       
  3784   JCOEFPTR inptr;
       
  3785   ISLOW_MULT_TYPE * quantptr;
       
  3786   int * wsptr;
       
  3787   JSAMPROW outptr;
       
  3788   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  3789   int ctr;
       
  3790   int workspace[6*3];	/* buffers data between passes: 3 rows of 6 values */
       
  3791   SHIFT_TEMPS
       
  3792 
       
  3793   /* Pass 1: process columns from input, store into work array.
       
  3794    * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
       
  3795    */
       
  3796   inptr = coef_block;
       
  3797   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  3798   wsptr = workspace;
       
         /* Only 6 columns are processed; the workspace row stride is 6 as well. */
       
  3799   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
       
  3800     /* Even part */
       
  3801 
       
  3802     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  3803     tmp0 <<= CONST_BITS;
       
  3804     /* Add fudge factor here for final descale. */
       
           /* The bias is half of the pass 1 divisor 2**(CONST_BITS-PASS1_BITS),
       
            * so the descaling shifts below round instead of truncating
       
            * (assuming RIGHT_SHIFT is an arithmetic right shift).
       
            */
       
  3805     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  3806     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  3807     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
       
  3808     tmp10 = tmp0 + tmp12;
       
           /* tmp2 becomes the middle row (wsptr[6*1]), which gets no odd term. */
       
  3809     tmp2 = tmp0 - tmp12 - tmp12;
       
  3810 
       
  3811     /* Odd part */
       
           /* tmp12 and tmp0 are reused: tmp0 now holds the single odd term. */
       
  3812 
       
  3813     tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  3814     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
       
  3815 
       
  3816     /* Final output stage */
       
  3817 
       
  3818     wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
       
  3819     wsptr[6*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
       
  3820     wsptr[6*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
       
  3821   }
       
  3822   
       
  3823   /* Pass 2: process 3 rows from work array, store into output array.
       
  3824    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
       
  3825    */
       
  3826   wsptr = workspace;
       
  3827   for (ctr = 0; ctr < 3; ctr++) {
       
  3828     outptr = output_buf[ctr] + output_col;
       
  3829 
       
  3830     /* Even part */
       
  3831 
       
  3832     /* Add fudge factor here for final descale. */
       
           /* After the CONST_BITS scaling on the next line the bias equals half
       
            * of the final divisor 2**(CONST_BITS+PASS1_BITS+3).
       
            */
       
  3833     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  3834     tmp0 <<= CONST_BITS;
       
  3835     tmp2 = (INT32) wsptr[4];
       
  3836     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
       
  3837     tmp1 = tmp0 + tmp10;
       
  3838     tmp11 = tmp0 - tmp10 - tmp10;
       
  3839     tmp10 = (INT32) wsptr[2];
       
  3840     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
       
  3841     tmp10 = tmp1 + tmp0;
       
  3842     tmp12 = tmp1 - tmp0;
       
  3843 
       
  3844     /* Odd part */
       
           /* tmp0..tmp2 are reused from here on for the odd-part results. */
       
  3845 
       
  3846     z1 = (INT32) wsptr[1];
       
  3847     z2 = (INT32) wsptr[3];
       
  3848     z3 = (INT32) wsptr[5];
       
  3849     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
       
  3850     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
       
  3851     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
       
  3852     tmp1 = (z1 - z2 - z3) << CONST_BITS;
       
  3853 
       
  3854     /* Final output stage */
       
           /* Samples k and 5-k are even+odd and even-odd of the pair
       
            * (tmp1k, tmpk); each is descaled, masked with RANGE_MASK and looked
       
            * up in range_limit.
       
            */
       
  3855 
       
  3856     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
       
  3857 					      CONST_BITS+PASS1_BITS+3)
       
  3858 			    & RANGE_MASK];
       
  3859     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
       
  3860 					      CONST_BITS+PASS1_BITS+3)
       
  3861 			    & RANGE_MASK];
       
  3862     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
       
  3863 					      CONST_BITS+PASS1_BITS+3)
       
  3864 			    & RANGE_MASK];
       
  3865     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
       
  3866 					      CONST_BITS+PASS1_BITS+3)
       
  3867 			    & RANGE_MASK];
       
  3868     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
       
  3869 					      CONST_BITS+PASS1_BITS+3)
       
  3870 			    & RANGE_MASK];
       
  3871     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
       
  3872 					      CONST_BITS+PASS1_BITS+3)
       
  3873 			    & RANGE_MASK];
       
  3874 
       
  3875     wsptr += 6;		/* advance pointer to next row */
       
  3876   }
       
  3877 }
       
  3878 
       
  3879 
       
  3880 /*
       
  3881  * Perform dequantization and inverse DCT on one block of coefficients,
       
  3882  * producing a 4x2 output block.
       
  3883  *
       
  3884  * 2-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
       
  3885  */
       
  3886 
       
  3887 GLOBAL(void)
       
  3888 jpeg_idct_4x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  3889 	       JCOEFPTR coef_block,
       
  3890 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
  3891 {
       
  3892   INT32 tmp0, tmp2, tmp10, tmp12;
       
  3893   INT32 z1, z2, z3;
       
  3894   JCOEFPTR inptr;
       
  3895   ISLOW_MULT_TYPE * quantptr;
       
  3896   INT32 * wsptr;
       
  3897   JSAMPROW outptr;
       
  3898   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  3899   int ctr;
       
  3900   INT32 workspace[4*2];	/* buffers data between passes */
       
  3901   SHIFT_TEMPS
       
  3902 
       
  3903   /* Pass 1: process columns from input, store into work array. */
       
  3904 
       
  3905   inptr = coef_block;
       
  3906   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  3907   wsptr = workspace;
       
  3908   for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
       
  3909     /* Even part */
       
  3910 
       
  3911     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  3912 
       
  3913     /* Odd part */
       
  3914 
       
  3915     tmp0 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  3916 
       
  3917     /* Final output stage */
       
  3918 
       
  3919     wsptr[4*0] = tmp10 + tmp0;
       
  3920     wsptr[4*1] = tmp10 - tmp0;
       
  3921   }
       
  3922 
       
  3923   /* Pass 2: process 2 rows from work array, store into output array.
       
  3924    * 4-point IDCT kernel,
       
  3925    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
       
  3926    */
       
  3927   wsptr = workspace;
       
  3928   for (ctr = 0; ctr < 2; ctr++) {
       
  3929     outptr = output_buf[ctr] + output_col;
       
  3930 
       
  3931     /* Even part */
       
  3932 
       
  3933     /* Add fudge factor here for final descale. */
       
  3934     tmp0 = wsptr[0] + (ONE << 2);
       
  3935     tmp2 = wsptr[2];
       
  3936 
       
  3937     tmp10 = (tmp0 + tmp2) << CONST_BITS;
       
  3938     tmp12 = (tmp0 - tmp2) << CONST_BITS;
       
  3939 
       
  3940     /* Odd part */
       
  3941     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
       
  3942 
       
  3943     z2 = wsptr[1];
       
  3944     z3 = wsptr[3];
       
  3945 
       
  3946     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
       
  3947     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
       
  3948     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
       
  3949 
       
  3950     /* Final output stage */
       
  3951 
       
  3952     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
       
  3953 					      CONST_BITS+3)
       
  3954 			    & RANGE_MASK];
       
  3955     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
       
  3956 					      CONST_BITS+3)
       
  3957 			    & RANGE_MASK];
       
  3958     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
       
  3959 					      CONST_BITS+3)
       
  3960 			    & RANGE_MASK];
       
  3961     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
       
  3962 					      CONST_BITS+3)
       
  3963 			    & RANGE_MASK];
       
  3964 
       
  3965     wsptr += 4;		/* advance pointer to next row */
       
  3966   }
       
  3967 }
       
  3968 
       
  3969 
       
  3970 /*
       
  3971  * Perform dequantization and inverse DCT on one block of coefficients,
       
  3972  * producing a 2x1 output block.
       
  3973  *
       
  3974  * 1-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
       
  3975  */
       
  3976 
       
  3977 GLOBAL(void)
       
  3978 jpeg_idct_2x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  3979 	       JCOEFPTR coef_block,
       
  3980 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
  3981 {
       
  3982   INT32 tmp0, tmp10;
       
  3983   ISLOW_MULT_TYPE * quantptr;
       
  3984   JSAMPROW outptr;
       
  3985   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  3986   SHIFT_TEMPS
       
  3987 
       
  3988   /* Pass 1: empty. */
       
  3989 
       
  3990   /* Pass 2: process 1 row from input, store into output array. */
       
  3991 
       
  3992   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  3993   outptr = output_buf[0] + output_col;
       
  3994 
       
  3995   /* Even part */
       
  3996 
       
  3997   tmp10 = DEQUANTIZE(coef_block[0], quantptr[0]);
       
  3998   /* Add fudge factor here for final descale. */
       
  3999   tmp10 += ONE << 2;
       
  4000 
       
  4001   /* Odd part */
       
  4002 
       
  4003   tmp0 = DEQUANTIZE(coef_block[1], quantptr[1]);
       
  4004 
       
  4005   /* Final output stage */
       
  4006 
       
  4007   outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3) & RANGE_MASK];
       
  4008   outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3) & RANGE_MASK];
       
  4009 }
       
  4010 
       
  4011 
       
  4012 /*
       
  4013  * Perform dequantization and inverse DCT on one block of coefficients,
       
  4014  * producing a 8x16 output block.
       
  4015  *
       
  4016  * 16-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
       
  4017  */
       
  4018 
       
  4019 GLOBAL(void)
       
  4020 jpeg_idct_8x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  4021 		JCOEFPTR coef_block,
       
  4022 		JSAMPARRAY output_buf, JDIMENSION output_col)
       
  4023 {
       
  4024   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
       
  4025   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
       
  4026   INT32 z1, z2, z3, z4;
       
  4027   JCOEFPTR inptr;
       
  4028   ISLOW_MULT_TYPE * quantptr;
       
  4029   int * wsptr;
       
  4030   JSAMPROW outptr;
       
  4031   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  4032   int ctr;
       
  4033   int workspace[8*16];	/* buffers data between passes */
       
  4034   SHIFT_TEMPS
       
  4035 
       
  4036   /* Pass 1: process columns from input, store into work array.
       
  4037    * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
       
  4038    */
       
  4039   inptr = coef_block;
       
  4040   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  4041   wsptr = workspace;
       
  4042   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
       
  4043     /* Even part */
       
  4044 
       
  4045     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  4046     tmp0 <<= CONST_BITS;
       
  4047     /* Add fudge factor here for final descale. */
       
  4048     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  4049 
       
  4050     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  4051     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
       
  4052     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
       
  4053 
       
  4054     tmp10 = tmp0 + tmp1;
       
  4055     tmp11 = tmp0 - tmp1;
       
  4056     tmp12 = tmp0 + tmp2;
       
  4057     tmp13 = tmp0 - tmp2;
       
  4058 
       
  4059     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  4060     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  4061     z3 = z1 - z2;
       
  4062     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
       
  4063     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
       
  4064 
       
  4065     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
       
  4066     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
       
  4067     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
       
  4068     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
       
  4069 
       
  4070     tmp20 = tmp10 + tmp0;
       
  4071     tmp27 = tmp10 - tmp0;
       
  4072     tmp21 = tmp12 + tmp1;
       
  4073     tmp26 = tmp12 - tmp1;
       
  4074     tmp22 = tmp13 + tmp2;
       
  4075     tmp25 = tmp13 - tmp2;
       
  4076     tmp23 = tmp11 + tmp3;
       
  4077     tmp24 = tmp11 - tmp3;
       
  4078 
       
  4079     /* Odd part */
       
  4080 
       
  4081     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  4082     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  4083     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  4084     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
  4085 
       
  4086     tmp11 = z1 + z3;
       
  4087 
       
  4088     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
       
  4089     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
       
  4090     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
       
  4091     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
       
  4092     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
       
  4093     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
       
  4094     tmp0  = tmp1 + tmp2 + tmp3 -
       
  4095 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
       
  4096     tmp13 = tmp10 + tmp11 + tmp12 -
       
  4097 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
       
  4098     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
       
  4099     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
       
  4100     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
       
  4101     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
       
  4102     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
       
  4103     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
       
  4104     z2    += z4;
       
  4105     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
       
  4106     tmp1  += z1;
       
  4107     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
       
  4108     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
       
  4109     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
       
  4110     tmp12 += z2;
       
  4111     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
       
  4112     tmp2  += z2;
       
  4113     tmp3  += z2;
       
  4114     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
       
  4115     tmp10 += z2;
       
  4116     tmp11 += z2;
       
  4117 
       
  4118     /* Final output stage */
       
  4119 
       
  4120     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
       
  4121     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
       
  4122     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
       
  4123     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
       
  4124     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
       
  4125     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
       
  4126     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
       
  4127     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
       
  4128     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
       
  4129     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
       
  4130     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
       
  4131     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
       
  4132     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
       
  4133     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
       
  4134     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
       
  4135     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
       
  4136   }
   271   
  4137   
   272   /* Pass 2: process rows from work array, store into output array. */
  4138   /* Pass 2: process rows from work array, store into output array. */
   273   /* Note that we must descale the results by a factor of 8 == 2**3, */
  4139   /* Note that we must descale the results by a factor of 8 == 2**3, */
   274   /* and also undo the PASS1_BITS scaling. */
  4140   /* and also undo the PASS1_BITS scaling. */
   275 
  4141 
   276   wsptr = workspace;
  4142   wsptr = workspace;
   277   for (ctr = 0; ctr < DCTSIZE; ctr++) {
  4143   for (ctr = 0; ctr < 16; ctr++) {
   278     outptr = output_buf[ctr] + output_col;
  4144     outptr = output_buf[ctr] + output_col;
   279     /* Rows of zeroes can be exploited in the same way as we did with columns.
       
   280      * However, the column calculation has created many nonzero AC terms, so
       
   281      * the simplification applies less often (typically 5% to 10% of the time).
       
   282      * On machines with very fast multiplication, it's possible that the
       
   283      * test takes more time than it's worth.  In that case this section
       
   284      * may be commented out.
       
   285      */
       
   286     
       
   287 #ifndef NO_ZERO_ROW_TEST
       
   288     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
       
   289 	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       
   290       /* AC terms all zero */
       
   291       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
       
   292 				  & RANGE_MASK];
       
   293       
       
   294       outptr[0] = dcval;
       
   295       outptr[1] = dcval;
       
   296       outptr[2] = dcval;
       
   297       outptr[3] = dcval;
       
   298       outptr[4] = dcval;
       
   299       outptr[5] = dcval;
       
   300       outptr[6] = dcval;
       
   301       outptr[7] = dcval;
       
   302 
       
   303       wsptr += DCTSIZE;		/* advance pointer to next row */
       
   304       continue;
       
   305     }
       
   306 #endif
       
   307     
  4145     
   308     /* Even part: reverse the even part of the forward DCT. */
  4146     /* Even part: reverse the even part of the forward DCT. */
   309     /* The rotator is sqrt(2)*c(-6). */
  4147     /* The rotator is sqrt(2)*c(-6). */
   310     
  4148     
   311     z2 = (INT32) wsptr[2];
  4149     z2 = (INT32) wsptr[2];
   312     z3 = (INT32) wsptr[6];
  4150     z3 = (INT32) wsptr[6];
   313     
  4151     
   314     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
  4152     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
   315     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
  4153     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
   316     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
  4154     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
   317     
  4155     
   318     tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
  4156     /* Add fudge factor here for final descale. */
   319     tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
  4157     z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  4158     z3 = (INT32) wsptr[4];
   320     
  4159     
   321     tmp10 = tmp0 + tmp3;
  4160     tmp0 = (z2 + z3) << CONST_BITS;
   322     tmp13 = tmp0 - tmp3;
  4161     tmp1 = (z2 - z3) << CONST_BITS;
   323     tmp11 = tmp1 + tmp2;
  4162     
   324     tmp12 = tmp1 - tmp2;
  4163     tmp10 = tmp0 + tmp2;
       
  4164     tmp13 = tmp0 - tmp2;
       
  4165     tmp11 = tmp1 + tmp3;
       
  4166     tmp12 = tmp1 - tmp3;
   325     
  4167     
   326     /* Odd part per figure 8; the matrix is unitary and hence its
  4168     /* Odd part per figure 8; the matrix is unitary and hence its
   327      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
  4169      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
   328      */
  4170      */
   329     
  4171     
   330     tmp0 = (INT32) wsptr[7];
  4172     tmp0 = (INT32) wsptr[7];
   331     tmp1 = (INT32) wsptr[5];
  4173     tmp1 = (INT32) wsptr[5];
   332     tmp2 = (INT32) wsptr[3];
  4174     tmp2 = (INT32) wsptr[3];
   333     tmp3 = (INT32) wsptr[1];
  4175     tmp3 = (INT32) wsptr[1];
   334     
  4176     
   335     z1 = tmp0 + tmp3;
  4177     z2 = tmp0 + tmp2;
   336     z2 = tmp1 + tmp2;
  4178     z3 = tmp1 + tmp3;
   337     z3 = tmp0 + tmp2;
  4179 
   338     z4 = tmp1 + tmp3;
  4180     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
   339     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
  4181     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
   340     
  4182     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
       
  4183     z2 += z1;
       
  4184     z3 += z1;
       
  4185 
       
  4186     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
   341     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
  4187     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
       
  4188     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
       
  4189     tmp0 += z1 + z2;
       
  4190     tmp3 += z1 + z3;
       
  4191 
       
  4192     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
   342     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
  4193     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
   343     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
  4194     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
   344     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
  4195     tmp1 += z1 + z3;
   345     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
  4196     tmp2 += z1 + z2;
   346     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
       
   347     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
       
   348     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
       
   349     
       
   350     z3 += z5;
       
   351     z4 += z5;
       
   352     
       
   353     tmp0 += z1 + z3;
       
   354     tmp1 += z2 + z4;
       
   355     tmp2 += z2 + z3;
       
   356     tmp3 += z1 + z4;
       
   357     
  4197     
   358     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
  4198     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
   359     
  4199     
   360     outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
  4200     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
   361 					  CONST_BITS+PASS1_BITS+3)
  4201 					      CONST_BITS+PASS1_BITS+3)
   362 			    & RANGE_MASK];
  4202 			    & RANGE_MASK];
   363     outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
  4203     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
   364 					  CONST_BITS+PASS1_BITS+3)
  4204 					      CONST_BITS+PASS1_BITS+3)
   365 			    & RANGE_MASK];
  4205 			    & RANGE_MASK];
   366     outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
  4206     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
   367 					  CONST_BITS+PASS1_BITS+3)
  4207 					      CONST_BITS+PASS1_BITS+3)
   368 			    & RANGE_MASK];
  4208 			    & RANGE_MASK];
   369     outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
  4209     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
   370 					  CONST_BITS+PASS1_BITS+3)
  4210 					      CONST_BITS+PASS1_BITS+3)
   371 			    & RANGE_MASK];
  4211 			    & RANGE_MASK];
   372     outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
  4212     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
   373 					  CONST_BITS+PASS1_BITS+3)
  4213 					      CONST_BITS+PASS1_BITS+3)
   374 			    & RANGE_MASK];
  4214 			    & RANGE_MASK];
   375     outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
  4215     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
   376 					  CONST_BITS+PASS1_BITS+3)
  4216 					      CONST_BITS+PASS1_BITS+3)
   377 			    & RANGE_MASK];
  4217 			    & RANGE_MASK];
   378     outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
  4218     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
   379 					  CONST_BITS+PASS1_BITS+3)
  4219 					      CONST_BITS+PASS1_BITS+3)
   380 			    & RANGE_MASK];
  4220 			    & RANGE_MASK];
   381     outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
  4221     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
   382 					  CONST_BITS+PASS1_BITS+3)
  4222 					      CONST_BITS+PASS1_BITS+3)
   383 			    & RANGE_MASK];
  4223 			    & RANGE_MASK];
   384     
  4224     
   385     wsptr += DCTSIZE;		/* advance pointer to next row */
  4225     wsptr += DCTSIZE;		/* advance pointer to next row */
   386   }
  4226   }
   387 }
  4227 }
   388 
  4228 
       
  4229 
       
  4230 /*
       
  4231  * Perform dequantization and inverse DCT on one block of coefficients,
       
  4232  * producing a 7x14 output block.
       
  4233  *
       
  4234  * 14-point IDCT in pass 1 (columns), 7-point in pass 2 (rows).
       
  4235  */
       
  4236 
       
  4237 GLOBAL(void)
       
  4238 jpeg_idct_7x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  4239 		JCOEFPTR coef_block,
       
  4240 		JSAMPARRAY output_buf, JDIMENSION output_col)
       
  4241 {
       
  4242   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
       
  4243   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
       
  4244   INT32 z1, z2, z3, z4;
       
  4245   JCOEFPTR inptr;
       
  4246   ISLOW_MULT_TYPE * quantptr;
       
  4247   int * wsptr;
       
  4248   JSAMPROW outptr;
       
  4249   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  4250   int ctr;
       
  4251   int workspace[7*14];	/* buffers data between passes */
       
  4252   SHIFT_TEMPS
       
  4253 
       
  4254   /* Pass 1: process columns from input, store into work array.
       
  4255    * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
       
  4256    */
       
  4257   inptr = coef_block;
       
  4258   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  4259   wsptr = workspace;
       
  4260   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
       
  4261     /* Even part */
       
  4262 
       
  4263     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  4264     z1 <<= CONST_BITS;
       
  4265     /* Add fudge factor here for final descale. */
       
  4266     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  4267     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  4268     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
       
  4269     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
       
  4270     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
       
  4271 
       
  4272     tmp10 = z1 + z2;
       
  4273     tmp11 = z1 + z3;
       
  4274     tmp12 = z1 - z4;
       
  4275 
       
  4276     tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
       
  4277 			CONST_BITS-PASS1_BITS);
       
  4278 
       
  4279     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  4280     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  4281 
       
  4282     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
       
  4283 
       
  4284     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
       
  4285     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
       
  4286     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
       
  4287 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
       
  4288 
       
  4289     tmp20 = tmp10 + tmp13;
       
  4290     tmp26 = tmp10 - tmp13;
       
  4291     tmp21 = tmp11 + tmp14;
       
  4292     tmp25 = tmp11 - tmp14;
       
  4293     tmp22 = tmp12 + tmp15;
       
  4294     tmp24 = tmp12 - tmp15;
       
  4295 
       
  4296     /* Odd part */
       
  4297 
       
  4298     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  4299     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  4300     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  4301     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
  4302     tmp13 = z4 << CONST_BITS;
       
  4303 
       
  4304     tmp14 = z1 + z3;
       
  4305     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
       
  4306     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
       
  4307     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
       
  4308     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
       
  4309     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
       
  4310     z1    -= z2;
       
  4311     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
       
  4312     tmp16 += tmp15;
       
  4313     z1    += z4;
       
  4314     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
       
  4315     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
       
  4316     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
       
  4317     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
       
  4318     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
       
  4319     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
       
  4320 
       
  4321     tmp13 = (z1 - z3) << PASS1_BITS;
       
  4322 
       
  4323     /* Final output stage */
       
  4324 
       
  4325     wsptr[7*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
       
  4326     wsptr[7*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
       
  4327     wsptr[7*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
       
  4328     wsptr[7*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
       
  4329     wsptr[7*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
       
  4330     wsptr[7*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
       
  4331     wsptr[7*3]  = (int) (tmp23 + tmp13);
       
  4332     wsptr[7*10] = (int) (tmp23 - tmp13);
       
  4333     wsptr[7*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
       
  4334     wsptr[7*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
       
  4335     wsptr[7*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
       
  4336     wsptr[7*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
       
  4337     wsptr[7*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
       
  4338     wsptr[7*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
       
  4339   }
       
  4340 
       
  4341   /* Pass 2: process 14 rows from work array, store into output array.
       
  4342    * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
       
  4343    */
       
  4344   wsptr = workspace;
       
  4345   for (ctr = 0; ctr < 14; ctr++) {
       
  4346     outptr = output_buf[ctr] + output_col;
       
  4347 
       
  4348     /* Even part */
       
  4349 
       
  4350     /* Add fudge factor here for final descale. */
       
  4351     tmp23 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  4352     tmp23 <<= CONST_BITS;
       
  4353 
       
  4354     z1 = (INT32) wsptr[2];
       
  4355     z2 = (INT32) wsptr[4];
       
  4356     z3 = (INT32) wsptr[6];
       
  4357 
       
  4358     tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
       
  4359     tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
       
  4360     tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
       
  4361     tmp10 = z1 + z3;
       
  4362     z2 -= tmp10;
       
  4363     tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
       
  4364     tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
       
  4365     tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
       
  4366     tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
       
  4367 
       
  4368     /* Odd part */
       
  4369 
       
  4370     z1 = (INT32) wsptr[1];
       
  4371     z2 = (INT32) wsptr[3];
       
  4372     z3 = (INT32) wsptr[5];
       
  4373 
       
  4374     tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
       
  4375     tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
       
  4376     tmp10 = tmp11 - tmp12;
       
  4377     tmp11 += tmp12;
       
  4378     tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
       
  4379     tmp11 += tmp12;
       
  4380     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
       
  4381     tmp10 += z2;
       
  4382     tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
       
  4383 
       
  4384     /* Final output stage */
       
  4385 
       
  4386     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
       
  4387 					      CONST_BITS+PASS1_BITS+3)
       
  4388 			    & RANGE_MASK];
       
  4389     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
       
  4390 					      CONST_BITS+PASS1_BITS+3)
       
  4391 			    & RANGE_MASK];
       
  4392     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
       
  4393 					      CONST_BITS+PASS1_BITS+3)
       
  4394 			    & RANGE_MASK];
       
  4395     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
       
  4396 					      CONST_BITS+PASS1_BITS+3)
       
  4397 			    & RANGE_MASK];
       
  4398     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
       
  4399 					      CONST_BITS+PASS1_BITS+3)
       
  4400 			    & RANGE_MASK];
       
  4401     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
       
  4402 					      CONST_BITS+PASS1_BITS+3)
       
  4403 			    & RANGE_MASK];
       
  4404     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23,
       
  4405 					      CONST_BITS+PASS1_BITS+3)
       
  4406 			    & RANGE_MASK];
       
  4407 
       
  4408     wsptr += 7;		/* advance pointer to next row */
       
  4409   }
       
  4410 }
       
  4411 
       
  4412 
       
  4413 /*
       
  4414  * Perform dequantization and inverse DCT on one block of coefficients,
       
  4415  * producing a 6x12 output block.
       
  4416  *
       
  4417  * 12-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
       
  4418  */
       
  4419 
       
  4420 GLOBAL(void)
       
  4421 jpeg_idct_6x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  4422 		JCOEFPTR coef_block,
       
  4423 		JSAMPARRAY output_buf, JDIMENSION output_col)
       
  4424 {
       
  4425   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
       
  4426   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
       
  4427   INT32 z1, z2, z3, z4;
       
  4428   JCOEFPTR inptr;
       
  4429   ISLOW_MULT_TYPE * quantptr;
       
  4430   int * wsptr;
       
  4431   JSAMPROW outptr;
       
  4432   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  4433   int ctr;
       
  4434   int workspace[6*12];	/* buffers data between passes */
       
  4435   SHIFT_TEMPS
       
  4436 
       
  4437   /* Pass 1: process columns from input, store into work array.
       
  4438    * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
       
  4439    */
       
  4440   inptr = coef_block;
       
  4441   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  4442   wsptr = workspace;
       
  4443   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
       
  4444     /* Even part */
       
  4445 
       
  4446     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  4447     z3 <<= CONST_BITS;
       
  4448     /* Add fudge factor here for final descale. */
       
  4449     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  4450 
       
  4451     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  4452     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
       
  4453 
       
  4454     tmp10 = z3 + z4;
       
  4455     tmp11 = z3 - z4;
       
  4456 
       
  4457     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  4458     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
       
  4459     z1 <<= CONST_BITS;
       
  4460     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  4461     z2 <<= CONST_BITS;
       
  4462 
       
  4463     tmp12 = z1 - z2;
       
  4464 
       
  4465     tmp21 = z3 + tmp12;
       
  4466     tmp24 = z3 - tmp12;
       
  4467 
       
  4468     tmp12 = z4 + z2;
       
  4469 
       
  4470     tmp20 = tmp10 + tmp12;
       
  4471     tmp25 = tmp10 - tmp12;
       
  4472 
       
  4473     tmp12 = z4 - z1 - z2;
       
  4474 
       
  4475     tmp22 = tmp11 + tmp12;
       
  4476     tmp23 = tmp11 - tmp12;
       
  4477 
       
  4478     /* Odd part */
       
  4479 
       
  4480     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  4481     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  4482     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  4483     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
  4484 
       
  4485     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
       
  4486     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
       
  4487 
       
  4488     tmp10 = z1 + z3;
       
  4489     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
       
  4490     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
       
  4491     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
       
  4492     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
       
  4493     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
       
  4494     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
       
  4495     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
       
  4496 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
       
  4497 
       
  4498     z1 -= z4;
       
  4499     z2 -= z3;
       
  4500     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
       
  4501     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
       
  4502     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
       
  4503 
       
  4504     /* Final output stage */
       
  4505 
       
  4506     wsptr[6*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
       
  4507     wsptr[6*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
       
  4508     wsptr[6*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
       
  4509     wsptr[6*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
       
  4510     wsptr[6*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
       
  4511     wsptr[6*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
       
  4512     wsptr[6*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
       
  4513     wsptr[6*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
       
  4514     wsptr[6*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
       
  4515     wsptr[6*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
       
  4516     wsptr[6*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
       
  4517     wsptr[6*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
       
  4518   }
       
  4519 
       
  4520   /* Pass 2: process 12 rows from work array, store into output array.
       
  4521    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
       
  4522    */
       
  4523   wsptr = workspace;
       
  4524   for (ctr = 0; ctr < 12; ctr++) {
       
  4525     outptr = output_buf[ctr] + output_col;
       
  4526 
       
  4527     /* Even part */
       
  4528 
       
  4529     /* Add fudge factor here for final descale. */
       
  4530     tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  4531     tmp10 <<= CONST_BITS;
       
  4532     tmp12 = (INT32) wsptr[4];
       
  4533     tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
       
  4534     tmp11 = tmp10 + tmp20;
       
  4535     tmp21 = tmp10 - tmp20 - tmp20;
       
  4536     tmp20 = (INT32) wsptr[2];
       
  4537     tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
       
  4538     tmp20 = tmp11 + tmp10;
       
  4539     tmp22 = tmp11 - tmp10;
       
  4540 
       
  4541     /* Odd part */
       
  4542 
       
  4543     z1 = (INT32) wsptr[1];
       
  4544     z2 = (INT32) wsptr[3];
       
  4545     z3 = (INT32) wsptr[5];
       
  4546     tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
       
  4547     tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
       
  4548     tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
       
  4549     tmp11 = (z1 - z2 - z3) << CONST_BITS;
       
  4550 
       
  4551     /* Final output stage */
       
  4552 
       
  4553     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
       
  4554 					      CONST_BITS+PASS1_BITS+3)
       
  4555 			    & RANGE_MASK];
       
  4556     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
       
  4557 					      CONST_BITS+PASS1_BITS+3)
       
  4558 			    & RANGE_MASK];
       
  4559     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
       
  4560 					      CONST_BITS+PASS1_BITS+3)
       
  4561 			    & RANGE_MASK];
       
  4562     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
       
  4563 					      CONST_BITS+PASS1_BITS+3)
       
  4564 			    & RANGE_MASK];
       
  4565     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
       
  4566 					      CONST_BITS+PASS1_BITS+3)
       
  4567 			    & RANGE_MASK];
       
  4568     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
       
  4569 					      CONST_BITS+PASS1_BITS+3)
       
  4570 			    & RANGE_MASK];
       
  4571 
       
  4572     wsptr += 6;		/* advance pointer to next row */
       
  4573   }
       
  4574 }
       
  4575 
       
  4576 
       
  4577 /*
       
  4578  * Perform dequantization and inverse DCT on one block of coefficients,
       
  4579  * producing a 5x10 output block.
       
  4580  *
       
  4581  * 10-point IDCT in pass 1 (columns), 5-point in pass 2 (rows).
       
  4582  */
       
  4583 
       
  4584 GLOBAL(void)
       
  4585 jpeg_idct_5x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  4586 		JCOEFPTR coef_block,
       
  4587 		JSAMPARRAY output_buf, JDIMENSION output_col)
       
  4588 {
       
  4589   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
       
  4590   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
       
  4591   INT32 z1, z2, z3, z4, z5;
       
  4592   JCOEFPTR inptr;
       
  4593   ISLOW_MULT_TYPE * quantptr;
       
  4594   int * wsptr;
       
  4595   JSAMPROW outptr;
       
  4596   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  4597   int ctr;
       
  4598   int workspace[5*10];	/* buffers data between passes */
       
  4599   SHIFT_TEMPS
       
  4600 
       
  4601   /* Pass 1: process columns from input, store into work array.
       
  4602    * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
       
  4603    */
       
  4604   inptr = coef_block;
       
  4605   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  4606   wsptr = workspace;
       
  4607   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
       
  4608     /* Even part */
       
  4609 
       
  4610     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  4611     z3 <<= CONST_BITS;
       
  4612     /* Add fudge factor here for final descale. */
       
  4613     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  4614     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  4615     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
       
  4616     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
       
  4617     tmp10 = z3 + z1;
       
  4618     tmp11 = z3 - z2;
       
  4619 
       
  4620     tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
       
  4621 			CONST_BITS-PASS1_BITS);
       
  4622 
       
  4623     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  4624     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  4625 
       
  4626     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
       
  4627     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
       
  4628     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
       
  4629 
       
  4630     tmp20 = tmp10 + tmp12;
       
  4631     tmp24 = tmp10 - tmp12;
       
  4632     tmp21 = tmp11 + tmp13;
       
  4633     tmp23 = tmp11 - tmp13;
       
  4634 
       
  4635     /* Odd part */
       
  4636 
       
  4637     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  4638     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  4639     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  4640     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
  4641 
       
  4642     tmp11 = z2 + z4;
       
  4643     tmp13 = z2 - z4;
       
  4644 
       
  4645     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
       
  4646     z5 = z3 << CONST_BITS;
       
  4647 
       
  4648     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
       
  4649     z4 = z5 + tmp12;
       
  4650 
       
  4651     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
       
  4652     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
       
  4653 
       
  4654     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
       
  4655     z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
       
  4656 
       
  4657     tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
       
  4658 
       
  4659     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
       
  4660     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
       
  4661 
       
  4662     /* Final output stage */
       
  4663 
       
  4664     wsptr[5*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
       
  4665     wsptr[5*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
       
  4666     wsptr[5*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
       
  4667     wsptr[5*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
       
  4668     wsptr[5*2] = (int) (tmp22 + tmp12);
       
  4669     wsptr[5*7] = (int) (tmp22 - tmp12);
       
  4670     wsptr[5*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
       
  4671     wsptr[5*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
       
  4672     wsptr[5*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
       
  4673     wsptr[5*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
       
  4674   }
       
  4675 
       
  4676   /* Pass 2: process 10 rows from work array, store into output array.
       
  4677    * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
       
  4678    */
       
  4679   wsptr = workspace;
       
  4680   for (ctr = 0; ctr < 10; ctr++) {
       
  4681     outptr = output_buf[ctr] + output_col;
       
  4682 
       
  4683     /* Even part */
       
  4684 
       
  4685     /* Add fudge factor here for final descale. */
       
  4686     tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  4687     tmp12 <<= CONST_BITS;
       
  4688     tmp13 = (INT32) wsptr[2];
       
  4689     tmp14 = (INT32) wsptr[4];
       
  4690     z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
       
  4691     z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
       
  4692     z3 = tmp12 + z2;
       
  4693     tmp10 = z3 + z1;
       
  4694     tmp11 = z3 - z1;
       
  4695     tmp12 -= z2 << 2;
       
  4696 
       
  4697     /* Odd part */
       
  4698 
       
  4699     z2 = (INT32) wsptr[1];
       
  4700     z3 = (INT32) wsptr[3];
       
  4701 
       
  4702     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
       
  4703     tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
       
  4704     tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
       
  4705 
       
  4706     /* Final output stage */
       
  4707 
       
  4708     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp13,
       
  4709 					      CONST_BITS+PASS1_BITS+3)
       
  4710 			    & RANGE_MASK];
       
  4711     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp13,
       
  4712 					      CONST_BITS+PASS1_BITS+3)
       
  4713 			    & RANGE_MASK];
       
  4714     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp14,
       
  4715 					      CONST_BITS+PASS1_BITS+3)
       
  4716 			    & RANGE_MASK];
       
  4717     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp14,
       
  4718 					      CONST_BITS+PASS1_BITS+3)
       
  4719 			    & RANGE_MASK];
       
  4720     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
       
  4721 					      CONST_BITS+PASS1_BITS+3)
       
  4722 			    & RANGE_MASK];
       
  4723 
       
  4724     wsptr += 5;		/* advance pointer to next row */
       
  4725   }
       
  4726 }
       
  4727 
       
  4728 
       
  4729 /*
       
  4730  * Perform dequantization and inverse DCT on one block of coefficients,
       
  4731  * producing a 4x8 output block.
       
  4732  *
       
  4733  * 8-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
       
  4734  */
       
  4735 
       
  4736 GLOBAL(void)
       
  4737 jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  4738 	       JCOEFPTR coef_block,
       
  4739 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
  4740 {
       
  4741   INT32 tmp0, tmp1, tmp2, tmp3;
       
  4742   INT32 tmp10, tmp11, tmp12, tmp13;
       
  4743   INT32 z1, z2, z3;
       
  4744   JCOEFPTR inptr;
       
  4745   ISLOW_MULT_TYPE * quantptr;
       
  4746   int * wsptr;
       
  4747   JSAMPROW outptr;
       
  4748   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  4749   int ctr;
       
  4750   int workspace[4*8];	/* buffers data between passes */
       
  4751   SHIFT_TEMPS
       
  4752 
       
  4753   /* Pass 1: process columns from input, store into work array. */
       
  4754   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
       
  4755   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  4756 
       
  4757   inptr = coef_block;
       
  4758   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  4759   wsptr = workspace;
       
  4760   for (ctr = 4; ctr > 0; ctr--) {
       
  4761     /* Due to quantization, we will usually find that many of the input
       
  4762      * coefficients are zero, especially the AC terms.  We can exploit this
       
  4763      * by short-circuiting the IDCT calculation for any column in which all
       
  4764      * the AC terms are zero.  In that case each output is equal to the
       
  4765      * DC coefficient (with scale factor as needed).
       
  4766      * With typical images and quantization tables, half or more of the
       
  4767      * column DCT calculations can be simplified this way.
       
  4768      */
       
  4769 
       
  4770     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
       
  4771 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
       
  4772 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
       
  4773 	inptr[DCTSIZE*7] == 0) {
       
  4774       /* AC terms all zero */
       
  4775       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
       
  4776 
       
  4777       wsptr[4*0] = dcval;
       
  4778       wsptr[4*1] = dcval;
       
  4779       wsptr[4*2] = dcval;
       
  4780       wsptr[4*3] = dcval;
       
  4781       wsptr[4*4] = dcval;
       
  4782       wsptr[4*5] = dcval;
       
  4783       wsptr[4*6] = dcval;
       
  4784       wsptr[4*7] = dcval;
       
  4785 
       
  4786       inptr++;			/* advance pointers to next column */
       
  4787       quantptr++;
       
  4788       wsptr++;
       
  4789       continue;
       
  4790     }
       
  4791 
       
  4792     /* Even part: reverse the even part of the forward DCT. */
       
  4793     /* The rotator is sqrt(2)*c(-6). */
       
  4794 
       
  4795     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  4796     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
       
  4797     
       
  4798     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
       
  4799     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
       
  4800     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
       
  4801     
       
  4802     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  4803     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  4804     z2 <<= CONST_BITS;
       
  4805     z3 <<= CONST_BITS;
       
  4806     /* Add fudge factor here for final descale. */
       
  4807     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  4808 
       
  4809     tmp0 = z2 + z3;
       
  4810     tmp1 = z2 - z3;
       
  4811     
       
  4812     tmp10 = tmp0 + tmp2;
       
  4813     tmp13 = tmp0 - tmp2;
       
  4814     tmp11 = tmp1 + tmp3;
       
  4815     tmp12 = tmp1 - tmp3;
       
  4816 
       
  4817     /* Odd part per figure 8; the matrix is unitary and hence its
       
  4818      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
       
  4819      */
       
  4820 
       
  4821     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
       
  4822     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  4823     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  4824     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  4825 
       
  4826     z2 = tmp0 + tmp2;
       
  4827     z3 = tmp1 + tmp3;
       
  4828 
       
  4829     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
       
  4830     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
       
  4831     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
       
  4832     z2 += z1;
       
  4833     z3 += z1;
       
  4834 
       
  4835     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
       
  4836     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
       
  4837     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
       
  4838     tmp0 += z1 + z2;
       
  4839     tmp3 += z1 + z3;
       
  4840 
       
  4841     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
       
  4842     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
       
  4843     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
       
  4844     tmp1 += z1 + z3;
       
  4845     tmp2 += z1 + z2;
       
  4846 
       
  4847     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
       
  4848 
       
  4849     wsptr[4*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
       
  4850     wsptr[4*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
       
  4851     wsptr[4*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
       
  4852     wsptr[4*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
       
  4853     wsptr[4*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
       
  4854     wsptr[4*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
       
  4855     wsptr[4*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
       
  4856     wsptr[4*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
       
  4857 
       
  4858     inptr++;			/* advance pointers to next column */
       
  4859     quantptr++;
       
  4860     wsptr++;
       
  4861   }
       
  4862 
       
  4863   /* Pass 2: process 8 rows from work array, store into output array.
       
  4864    * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
       
  4865    */
       
  4866   wsptr = workspace;
       
  4867   for (ctr = 0; ctr < 8; ctr++) {
       
  4868     outptr = output_buf[ctr] + output_col;
       
  4869 
       
  4870     /* Even part */
       
  4871 
       
  4872     /* Add fudge factor here for final descale. */
       
  4873     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  4874     tmp2 = (INT32) wsptr[2];
       
  4875 
       
  4876     tmp10 = (tmp0 + tmp2) << CONST_BITS;
       
  4877     tmp12 = (tmp0 - tmp2) << CONST_BITS;
       
  4878 
       
  4879     /* Odd part */
       
  4880     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
       
  4881 
       
  4882     z2 = (INT32) wsptr[1];
       
  4883     z3 = (INT32) wsptr[3];
       
  4884 
       
  4885     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
       
  4886     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
       
  4887     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
       
  4888 
       
  4889     /* Final output stage */
       
  4890 
       
  4891     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
       
  4892 					      CONST_BITS+PASS1_BITS+3)
       
  4893 			    & RANGE_MASK];
       
  4894     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
       
  4895 					      CONST_BITS+PASS1_BITS+3)
       
  4896 			    & RANGE_MASK];
       
  4897     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
       
  4898 					      CONST_BITS+PASS1_BITS+3)
       
  4899 			    & RANGE_MASK];
       
  4900     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
       
  4901 					      CONST_BITS+PASS1_BITS+3)
       
  4902 			    & RANGE_MASK];
       
  4903     
       
  4904     wsptr += 4;		/* advance pointer to next row */
       
  4905   }
       
  4906 }
       
  4907 
       
  4908 
       
  4909 /*
       
  4910  * Perform dequantization and inverse DCT on one block of coefficients,
       
  4911  * producing a reduced-size 3x6 output block.
       
  4912  *
       
  4913  * 6-point IDCT in pass 1 (columns), 3-point in pass 2 (rows).
       
  4914  */
       
  4915 
       
  4916 GLOBAL(void)
       
  4917 jpeg_idct_3x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  4918 	       JCOEFPTR coef_block,
       
  4919 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
  4920 {
       
  4921   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
       
  4922   INT32 z1, z2, z3;
       
  4923   JCOEFPTR inptr;
       
  4924   ISLOW_MULT_TYPE * quantptr;
       
  4925   int * wsptr;
       
  4926   JSAMPROW outptr;
       
  4927   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  4928   int ctr;
       
  4929   int workspace[3*6];	/* buffers data between passes */
       
  4930   SHIFT_TEMPS
       
  4931 
       
  4932   /* Pass 1: process columns from input, store into work array.
       
  4933    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
       
  4934    */
       
  4935   inptr = coef_block;
       
  4936   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  4937   wsptr = workspace;
       
  4938   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
       
  4939     /* Even part */
       
  4940 
       
  4941     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  4942     tmp0 <<= CONST_BITS;
       
  4943     /* Add fudge factor here for final descale. */
       
  4944     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
       
  4945     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
       
  4946     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
       
  4947     tmp1 = tmp0 + tmp10;
       
  4948     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
       
  4949     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  4950     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
       
  4951     tmp10 = tmp1 + tmp0;
       
  4952     tmp12 = tmp1 - tmp0;
       
  4953 
       
  4954     /* Odd part */
       
  4955 
       
  4956     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  4957     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  4958     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
       
  4959     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
       
  4960     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
       
  4961     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
       
  4962     tmp1 = (z1 - z2 - z3) << PASS1_BITS;
       
  4963 
       
  4964     /* Final output stage */
       
  4965 
       
  4966     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
       
  4967     wsptr[3*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
       
  4968     wsptr[3*1] = (int) (tmp11 + tmp1);
       
  4969     wsptr[3*4] = (int) (tmp11 - tmp1);
       
  4970     wsptr[3*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
       
  4971     wsptr[3*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
       
  4972   }
       
  4973 
       
  4974   /* Pass 2: process 6 rows from work array, store into output array.
       
  4975    * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
       
  4976    */
       
  4977   wsptr = workspace;
       
  4978   for (ctr = 0; ctr < 6; ctr++) {
       
  4979     outptr = output_buf[ctr] + output_col;
       
  4980 
       
  4981     /* Even part */
       
  4982 
       
  4983     /* Add fudge factor here for final descale. */
       
  4984     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
       
  4985     tmp0 <<= CONST_BITS;
       
  4986     tmp2 = (INT32) wsptr[2];
       
  4987     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
       
  4988     tmp10 = tmp0 + tmp12;
       
  4989     tmp2 = tmp0 - tmp12 - tmp12;
       
  4990 
       
  4991     /* Odd part */
       
  4992 
       
  4993     tmp12 = (INT32) wsptr[1];
       
  4994     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
       
  4995 
       
  4996     /* Final output stage */
       
  4997 
       
  4998     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
       
  4999 					      CONST_BITS+PASS1_BITS+3)
       
  5000 			    & RANGE_MASK];
       
  5001     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
       
  5002 					      CONST_BITS+PASS1_BITS+3)
       
  5003 			    & RANGE_MASK];
       
  5004     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
       
  5005 					      CONST_BITS+PASS1_BITS+3)
       
  5006 			    & RANGE_MASK];
       
  5007 
       
  5008     wsptr += 3;		/* advance pointer to next row */
       
  5009   }
       
  5010 }
       
  5011 
       
  5012 
       
  5013 /*
       
  5014  * Perform dequantization and inverse DCT on one block of coefficients,
       
  5015  * producing a 2x4 output block.
       
  5016  *
       
  5017  * 4-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
       
  5018  */
       
  5019 
       
  5020 GLOBAL(void)
       
  5021 jpeg_idct_2x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  5022 	       JCOEFPTR coef_block,
       
  5023 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
  5024 {
       
  5025   INT32 tmp0, tmp2, tmp10, tmp12;
       
  5026   INT32 z1, z2, z3;
       
  5027   JCOEFPTR inptr;
       
  5028   ISLOW_MULT_TYPE * quantptr;
       
  5029   INT32 * wsptr;
       
  5030   JSAMPROW outptr;
       
  5031   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  5032   int ctr;
       
  5033   INT32 workspace[2*4];	/* buffers data between passes */
       
  5034   SHIFT_TEMPS
       
  5035 
       
  5036   /* Pass 1: process columns from input, store into work array.
       
  5037    * 4-point IDCT kernel,
       
  5038    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
       
  5039    */
       
  5040   inptr = coef_block;
       
  5041   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  5042   wsptr = workspace;
       
  5043   for (ctr = 0; ctr < 2; ctr++, inptr++, quantptr++, wsptr++) {
       
  5044     /* Even part */
       
  5045 
       
  5046     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  5047     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
       
  5048 
       
  5049     tmp10 = (tmp0 + tmp2) << CONST_BITS;
       
  5050     tmp12 = (tmp0 - tmp2) << CONST_BITS;
       
  5051 
       
  5052     /* Odd part */
       
  5053     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
       
  5054 
       
  5055     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  5056     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
       
  5057 
       
  5058     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
       
  5059     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
       
  5060     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
       
  5061 
       
  5062     /* Final output stage */
       
  5063 
       
  5064     wsptr[2*0] = tmp10 + tmp0;
       
  5065     wsptr[2*3] = tmp10 - tmp0;
       
  5066     wsptr[2*1] = tmp12 + tmp2;
       
  5067     wsptr[2*2] = tmp12 - tmp2;
       
  5068   }
       
  5069 
       
  5070   /* Pass 2: process 4 rows from work array, store into output array. */
       
  5071 
       
  5072   wsptr = workspace;
       
  5073   for (ctr = 0; ctr < 4; ctr++) {
       
  5074     outptr = output_buf[ctr] + output_col;
       
  5075 
       
  5076     /* Even part */
       
  5077 
       
  5078     /* Add fudge factor here for final descale. */
       
  5079     tmp10 = wsptr[0] + (ONE << (CONST_BITS+2));
       
  5080 
       
  5081     /* Odd part */
       
  5082 
       
  5083     tmp0 = wsptr[1];
       
  5084 
       
  5085     /* Final output stage */
       
  5086 
       
  5087     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS+3)
       
  5088 			    & RANGE_MASK];
       
  5089     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS+3)
       
  5090 			    & RANGE_MASK];
       
  5091 
       
  5092     wsptr += 2;		/* advance pointer to next row */
       
  5093   }
       
  5094 }
       
  5095 
       
  5096 
       
  5097 /*
       
  5098  * Perform dequantization and inverse DCT on one block of coefficients,
       
  5099  * producing a 1x2 output block.
       
  5100  *
       
  5101  * 2-point IDCT in pass 1 (columns), 1-point in pass 2 (rows).
       
  5102  */
       
  5103 
       
  5104 GLOBAL(void)
       
  5105 jpeg_idct_1x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
       
  5106 	       JCOEFPTR coef_block,
       
  5107 	       JSAMPARRAY output_buf, JDIMENSION output_col)
       
  5108 {
       
  5109   INT32 tmp0, tmp10;
       
  5110   ISLOW_MULT_TYPE * quantptr;
       
  5111   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
       
  5112   SHIFT_TEMPS
       
  5113 
       
  5114   /* Process 1 column from input, store into output array. */
       
  5115 
       
  5116   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
       
  5117 
       
  5118   /* Even part */
       
  5119     
       
  5120   tmp10 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
  5121   /* Add fudge factor here for final descale. */
       
  5122   tmp10 += ONE << 2;
       
  5123 
       
  5124   /* Odd part */
       
  5125 
       
  5126   tmp0 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
       
  5127 
       
  5128   /* Final output stage */
       
  5129 
       
  5130   output_buf[0][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3)
       
  5131 					  & RANGE_MASK];
       
  5132   output_buf[1][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3)
       
  5133 					  & RANGE_MASK];
       
  5134 }
       
  5135 
       
  5136 #endif /* IDCT_SCALING_SUPPORTED */
   389 #endif /* DCT_ISLOW_SUPPORTED */
  5137 #endif /* DCT_ISLOW_SUPPORTED */