src/3rdparty/libjpeg/jfdctint.c
branch GCC_SURGE
changeset 31 5daf16870df6
parent 30 5dc02b23752f
equal deleted inserted replaced
27:93b982ccede2 31:5daf16870df6
     1 /*
     1 /*
     2  * jfdctint.c
     2  * jfdctint.c
     3  *
     3  *
     4  * Copyright (C) 1991-1996, Thomas G. Lane.
     4  * Copyright (C) 1991-1996, Thomas G. Lane.
       
     5  * Modification developed 2003-2009 by Guido Vollbeding.
     5  * This file is part of the Independent JPEG Group's software.
     6  * This file is part of the Independent JPEG Group's software.
     6  * For conditions of distribution and use, see the accompanying README file.
     7  * For conditions of distribution and use, see the accompanying README file.
     7  *
     8  *
     8  * This file contains a slow-but-accurate integer implementation of the
     9  * This file contains a slow-but-accurate integer implementation of the
     9  * forward DCT (Discrete Cosine Transform).
    10  * forward DCT (Discrete Cosine Transform).
    19  * The primary algorithm described there uses 11 multiplies and 29 adds.
    20  * The primary algorithm described there uses 11 multiplies and 29 adds.
    20  * We use their alternate method with 12 multiplies and 32 adds.
    21  * We use their alternate method with 12 multiplies and 32 adds.
    21  * The advantage of this method is that no data path contains more than one
    22  * The advantage of this method is that no data path contains more than one
    22  * multiplication; this allows a very simple and accurate implementation in
    23  * multiplication; this allows a very simple and accurate implementation in
    23  * scaled fixed-point arithmetic, with a minimal number of shifts.
    24  * scaled fixed-point arithmetic, with a minimal number of shifts.
       
    25  *
       
    26  * We also provide FDCT routines with various input sample block sizes for
       
     27  * direct resolution reduction or enlargement and for directly resolving the
       
    28  * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
       
    29  * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
       
    30  *
       
    31  * For N<8 we fill the remaining block coefficients with zero.
       
    32  * For N>8 we apply a partial N-point FDCT on the input samples, computing
       
    33  * just the lower 8 frequency coefficients and discarding the rest.
       
    34  *
       
    35  * We must scale the output coefficients of the N-point FDCT appropriately
       
    36  * to the standard 8-point FDCT level by 8/N per 1-D pass.  This scaling
       
    37  * is folded into the constant multipliers (pass 2) and/or final/initial
       
    38  * shifting.
       
    39  *
       
    40  * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
       
    41  * since there would be too many additional constants to pre-calculate.
    24  */
    42  */
    25 
    43 
    26 #define JPEG_INTERNALS
    44 #define JPEG_INTERNALS
    27 #include "jinclude.h"
    45 #include "jinclude.h"
    28 #include "jpeglib.h"
    46 #include "jpeglib.h"
    34 /*
    52 /*
    35  * This module is specialized to the case DCTSIZE = 8.
    53  * This module is specialized to the case DCTSIZE = 8.
    36  */
    54  */
    37 
    55 
    38 #if DCTSIZE != 8
    56 #if DCTSIZE != 8
    39   Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
    57   Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
    40 #endif
    58 #endif
    41 
    59 
    42 
    60 
    43 /*
    61 /*
    44  * The poop on this scaling stuff is as follows:
    62  * The poop on this scaling stuff is as follows:
   135 /*
   153 /*
   136  * Perform the forward DCT on one block of samples.
   154  * Perform the forward DCT on one block of samples.
   137  */
   155  */
   138 
   156 
   139 GLOBAL(void)
   157 GLOBAL(void)
   140 jpeg_fdct_islow (DCTELEM * data)
   158 jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
   141 {
   159 {
   142   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   160   INT32 tmp0, tmp1, tmp2, tmp3;
   143   INT32 tmp10, tmp11, tmp12, tmp13;
   161   INT32 tmp10, tmp11, tmp12, tmp13;
   144   INT32 z1, z2, z3, z4, z5;
   162   INT32 z1;
   145   DCTELEM *dataptr;
   163   DCTELEM *dataptr;
       
   164   JSAMPROW elemptr;
   146   int ctr;
   165   int ctr;
   147   SHIFT_TEMPS
   166   SHIFT_TEMPS
   148 
   167 
   149   /* Pass 1: process rows. */
   168   /* Pass 1: process rows. */
   150   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
   169   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
   151   /* furthermore, we scale the results by 2**PASS1_BITS. */
   170   /* furthermore, we scale the results by 2**PASS1_BITS. */
   152 
   171 
   153   dataptr = data;
   172   dataptr = data;
   154   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
   173   for (ctr = 0; ctr < DCTSIZE; ctr++) {
   155     tmp0 = dataptr[0] + dataptr[7];
   174     elemptr = sample_data[ctr] + start_col;
   156     tmp7 = dataptr[0] - dataptr[7];
   175 
   157     tmp1 = dataptr[1] + dataptr[6];
       
   158     tmp6 = dataptr[1] - dataptr[6];
       
   159     tmp2 = dataptr[2] + dataptr[5];
       
   160     tmp5 = dataptr[2] - dataptr[5];
       
   161     tmp3 = dataptr[3] + dataptr[4];
       
   162     tmp4 = dataptr[3] - dataptr[4];
       
   163     
       
   164     /* Even part per LL&M figure 1 --- note that published figure is faulty;
   176     /* Even part per LL&M figure 1 --- note that published figure is faulty;
   165      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
   177      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
   166      */
   178      */
   167     
   179 
       
   180     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
       
   181     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
       
   182     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
       
   183     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
       
   184 
   168     tmp10 = tmp0 + tmp3;
   185     tmp10 = tmp0 + tmp3;
   169     tmp13 = tmp0 - tmp3;
   186     tmp12 = tmp0 - tmp3;
   170     tmp11 = tmp1 + tmp2;
   187     tmp11 = tmp1 + tmp2;
   171     tmp12 = tmp1 - tmp2;
   188     tmp13 = tmp1 - tmp2;
   172     
   189 
   173     dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
   190     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
       
   191     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
       
   192     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
       
   193     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
       
   194 
       
   195     /* Apply unsigned->signed conversion */
       
   196     dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
   174     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
   197     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
   175     
   198 
   176     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
   199     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
   177     dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
   200     /* Add fudge factor here for final descale. */
   178 				   CONST_BITS-PASS1_BITS);
   201     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   179     dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
   202     dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
   180 				   CONST_BITS-PASS1_BITS);
   203 				       CONST_BITS-PASS1_BITS);
   181     
   204     dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
       
   205 				       CONST_BITS-PASS1_BITS);
       
   206 
   182     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
   207     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
   183      * cK represents cos(K*pi/16).
   208      * cK represents sqrt(2) * cos(K*pi/16).
   184      * i0..i3 in the paper are tmp4..tmp7 here.
   209      * i0..i3 in the paper are tmp0..tmp3 here.
   185      */
   210      */
   186     
   211 
   187     z1 = tmp4 + tmp7;
   212     tmp10 = tmp0 + tmp3;
   188     z2 = tmp5 + tmp6;
   213     tmp11 = tmp1 + tmp2;
   189     z3 = tmp4 + tmp6;
   214     tmp12 = tmp0 + tmp2;
   190     z4 = tmp5 + tmp7;
   215     tmp13 = tmp1 + tmp3;
   191     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
   216     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
   192     
   217     /* Add fudge factor here for final descale. */
   193     tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
   218     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   194     tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
   219 
   195     tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
   220     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
   196     tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
   221     tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
   197     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
   222     tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
   198     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
   223     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
   199     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
   224     tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
   200     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
   225     tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
   201     
   226     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
   202     z3 += z5;
   227     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
   203     z4 += z5;
   228 
   204     
   229     tmp12 += z1;
   205     dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
   230     tmp13 += z1;
   206     dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
   231 
   207     dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
   232     dataptr[1] = (DCTELEM)
   208     dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
   233       RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
   209     
   234     dataptr[3] = (DCTELEM)
       
   235       RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
       
   236     dataptr[5] = (DCTELEM)
       
   237       RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
       
   238     dataptr[7] = (DCTELEM)
       
   239       RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
       
   240 
   210     dataptr += DCTSIZE;		/* advance pointer to next row */
   241     dataptr += DCTSIZE;		/* advance pointer to next row */
   211   }
   242   }
   212 
   243 
   213   /* Pass 2: process columns.
   244   /* Pass 2: process columns.
   214    * We remove the PASS1_BITS scaling, but leave the results scaled up
   245    * We remove the PASS1_BITS scaling, but leave the results scaled up
   215    * by an overall factor of 8.
   246    * by an overall factor of 8.
   216    */
   247    */
   217 
   248 
   218   dataptr = data;
   249   dataptr = data;
   219   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
   250   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
   220     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
       
   221     tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
       
   222     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
       
   223     tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
       
   224     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
       
   225     tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
       
   226     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
       
   227     tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
       
   228     
       
   229     /* Even part per LL&M figure 1 --- note that published figure is faulty;
   251     /* Even part per LL&M figure 1 --- note that published figure is faulty;
   230      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
   252      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
   231      */
   253      */
   232     
   254 
       
   255     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
       
   256     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
       
   257     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
       
   258     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
       
   259 
       
   260     /* Add fudge factor here for final descale. */
       
   261     tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
       
   262     tmp12 = tmp0 - tmp3;
       
   263     tmp11 = tmp1 + tmp2;
       
   264     tmp13 = tmp1 - tmp2;
       
   265 
       
   266     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
       
   267     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
       
   268     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
       
   269     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
       
   270 
       
   271     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
       
   272     dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
       
   273 
       
   274     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
       
   275     /* Add fudge factor here for final descale. */
       
   276     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
       
   277     dataptr[DCTSIZE*2] = (DCTELEM)
       
   278       RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
       
   279     dataptr[DCTSIZE*6] = (DCTELEM)
       
   280       RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
       
   281 
       
   282     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
       
   283      * cK represents sqrt(2) * cos(K*pi/16).
       
   284      * i0..i3 in the paper are tmp0..tmp3 here.
       
   285      */
       
   286 
   233     tmp10 = tmp0 + tmp3;
   287     tmp10 = tmp0 + tmp3;
   234     tmp13 = tmp0 - tmp3;
       
   235     tmp11 = tmp1 + tmp2;
   288     tmp11 = tmp1 + tmp2;
   236     tmp12 = tmp1 - tmp2;
   289     tmp12 = tmp0 + tmp2;
   237     
   290     tmp13 = tmp1 + tmp3;
   238     dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
   291     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
   239     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
   292     /* Add fudge factor here for final descale. */
   240     
   293     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
       
   294 
       
   295     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
       
   296     tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
       
   297     tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
       
   298     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
       
   299     tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
       
   300     tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
       
   301     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
       
   302     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
       
   303 
       
   304     tmp12 += z1;
       
   305     tmp13 += z1;
       
   306 
       
   307     dataptr[DCTSIZE*1] = (DCTELEM)
       
   308       RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
       
   309     dataptr[DCTSIZE*3] = (DCTELEM)
       
   310       RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
       
   311     dataptr[DCTSIZE*5] = (DCTELEM)
       
   312       RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
       
   313     dataptr[DCTSIZE*7] = (DCTELEM)
       
   314       RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
       
   315 
       
   316     dataptr++;			/* advance pointer to next column */
       
   317   }
       
   318 }
       
   319 
       
   320 #ifdef DCT_SCALING_SUPPORTED
       
   321 
       
   322 
       
   323 /*
       
   324  * Perform the forward DCT on a 7x7 sample block.
       
   325  */
       
   326 
       
   327 GLOBAL(void)
       
   328 jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
   329 {
       
   330   INT32 tmp0, tmp1, tmp2, tmp3;
       
   331   INT32 tmp10, tmp11, tmp12;
       
   332   INT32 z1, z2, z3;
       
   333   DCTELEM *dataptr;
       
   334   JSAMPROW elemptr;
       
   335   int ctr;
       
   336   SHIFT_TEMPS
       
   337 
       
   338   /* Pre-zero output coefficient block. */
       
   339   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
   340 
       
   341   /* Pass 1: process rows. */
       
   342   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
   343   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
   344   /* cK represents sqrt(2) * cos(K*pi/14). */
       
   345 
       
   346   dataptr = data;
       
   347   for (ctr = 0; ctr < 7; ctr++) {
       
   348     elemptr = sample_data[ctr] + start_col;
       
   349 
       
   350     /* Even part */
       
   351 
       
   352     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
       
   353     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
       
   354     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
       
   355     tmp3 = GETJSAMPLE(elemptr[3]);
       
   356 
       
   357     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
       
   358     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
       
   359     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
       
   360 
       
   361     z1 = tmp0 + tmp2;
       
   362     /* Apply unsigned->signed conversion */
       
   363     dataptr[0] = (DCTELEM)
       
   364       ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
       
   365     tmp3 += tmp3;
       
   366     z1 -= tmp3;
       
   367     z1 -= tmp3;
       
   368     z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
       
   369     z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
       
   370     z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
       
   371     dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
       
   372     z1 -= z2;
       
   373     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
       
   374     dataptr[4] = (DCTELEM)
       
   375       DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
       
   376 	      CONST_BITS-PASS1_BITS);
       
   377     dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
       
   378 
       
   379     /* Odd part */
       
   380 
       
   381     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
       
   382     tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
       
   383     tmp0 = tmp1 - tmp2;
       
   384     tmp1 += tmp2;
       
   385     tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
       
   386     tmp1 += tmp2;
       
   387     tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
       
   388     tmp0 += tmp3;
       
   389     tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */
       
   390 
       
   391     dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
       
   392     dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
       
   393     dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
       
   394 
       
   395     dataptr += DCTSIZE;		/* advance pointer to next row */
       
   396   }
       
   397 
       
   398   /* Pass 2: process columns.
       
   399    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
   400    * by an overall factor of 8.
       
   401    * We must also scale the output by (8/7)**2 = 64/49, which we fold
       
   402    * into the constant multipliers:
       
   403    * cK now represents sqrt(2) * cos(K*pi/14) * 64/49.
       
   404    */
       
   405 
       
   406   dataptr = data;
       
   407   for (ctr = 0; ctr < 7; ctr++) {
       
   408     /* Even part */
       
   409 
       
   410     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
       
   411     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
       
   412     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
       
   413     tmp3 = dataptr[DCTSIZE*3];
       
   414 
       
   415     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
       
   416     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
       
   417     tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
       
   418 
       
   419     z1 = tmp0 + tmp2;
       
   420     dataptr[DCTSIZE*0] = (DCTELEM)
       
   421       DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
       
   422 	      CONST_BITS+PASS1_BITS);
       
   423     tmp3 += tmp3;
       
   424     z1 -= tmp3;
       
   425     z1 -= tmp3;
       
   426     z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
       
   427     z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
       
   428     z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
       
   429     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
       
   430     z1 -= z2;
       
   431     z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
       
   432     dataptr[DCTSIZE*4] = (DCTELEM)
       
   433       DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
       
   434 	      CONST_BITS+PASS1_BITS);
       
   435     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
       
   436 
       
   437     /* Odd part */
       
   438 
       
   439     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
       
   440     tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
       
   441     tmp0 = tmp1 - tmp2;
       
   442     tmp1 += tmp2;
       
   443     tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
       
   444     tmp1 += tmp2;
       
   445     tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
       
   446     tmp0 += tmp3;
       
   447     tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */
       
   448 
       
   449     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
       
   450     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
       
   451     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
       
   452 
       
   453     dataptr++;			/* advance pointer to next column */
       
   454   }
       
   455 }
       
   456 
       
   457 
       
   458 /*
       
   459  * Perform the forward DCT on a 6x6 sample block.
       
   460  */
       
   461 
       
   462 GLOBAL(void)
       
   463 jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
   464 {
       
   465   INT32 tmp0, tmp1, tmp2;
       
   466   INT32 tmp10, tmp11, tmp12;
       
   467   DCTELEM *dataptr;
       
   468   JSAMPROW elemptr;
       
   469   int ctr;
       
   470   SHIFT_TEMPS
       
   471 
       
   472   /* Pre-zero output coefficient block. */
       
   473   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
   474 
       
   475   /* Pass 1: process rows. */
       
   476   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
   477   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
   478   /* cK represents sqrt(2) * cos(K*pi/12). */
       
   479 
       
   480   dataptr = data;
       
   481   for (ctr = 0; ctr < 6; ctr++) {
       
   482     elemptr = sample_data[ctr] + start_col;
       
   483 
       
   484     /* Even part */
       
   485 
       
   486     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
       
   487     tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
       
   488     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
       
   489 
       
   490     tmp10 = tmp0 + tmp2;
       
   491     tmp12 = tmp0 - tmp2;
       
   492 
       
   493     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
       
   494     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
       
   495     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
       
   496 
       
   497     /* Apply unsigned->signed conversion */
       
   498     dataptr[0] = (DCTELEM)
       
   499       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
       
   500     dataptr[2] = (DCTELEM)
       
   501       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
       
   502 	      CONST_BITS-PASS1_BITS);
       
   503     dataptr[4] = (DCTELEM)
       
   504       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
       
   505 	      CONST_BITS-PASS1_BITS);
       
   506 
       
   507     /* Odd part */
       
   508 
       
   509     tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
       
   510 		    CONST_BITS-PASS1_BITS);
       
   511 
       
   512     dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
       
   513     dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
       
   514     dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
       
   515 
       
   516     dataptr += DCTSIZE;		/* advance pointer to next row */
       
   517   }
       
   518 
       
   519   /* Pass 2: process columns.
       
   520    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
   521    * by an overall factor of 8.
       
   522    * We must also scale the output by (8/6)**2 = 16/9, which we fold
       
   523    * into the constant multipliers:
       
   524    * cK now represents sqrt(2) * cos(K*pi/12) * 16/9.
       
   525    */
       
   526 
       
   527   dataptr = data;
       
   528   for (ctr = 0; ctr < 6; ctr++) {
       
   529     /* Even part */
       
   530 
       
   531     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
       
   532     tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
       
   533     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
       
   534 
       
   535     tmp10 = tmp0 + tmp2;
       
   536     tmp12 = tmp0 - tmp2;
       
   537 
       
   538     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
       
   539     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
       
   540     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
       
   541 
       
   542     dataptr[DCTSIZE*0] = (DCTELEM)
       
   543       DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
       
   544 	      CONST_BITS+PASS1_BITS);
       
   545     dataptr[DCTSIZE*2] = (DCTELEM)
       
   546       DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
       
   547 	      CONST_BITS+PASS1_BITS);
       
   548     dataptr[DCTSIZE*4] = (DCTELEM)
       
   549       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
       
   550 	      CONST_BITS+PASS1_BITS);
       
   551 
       
   552     /* Odd part */
       
   553 
       
   554     tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
       
   555 
       
   556     dataptr[DCTSIZE*1] = (DCTELEM)
       
   557       DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
       
   558 	      CONST_BITS+PASS1_BITS);
       
   559     dataptr[DCTSIZE*3] = (DCTELEM)
       
   560       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
       
   561 	      CONST_BITS+PASS1_BITS);
       
   562     dataptr[DCTSIZE*5] = (DCTELEM)
       
   563       DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
       
   564 	      CONST_BITS+PASS1_BITS);
       
   565 
       
   566     dataptr++;			/* advance pointer to next column */
       
   567   }
       
   568 }
       
   569 
       
   570 
       
   571 /*
       
   572  * Perform the forward DCT on a 5x5 sample block.
       
   573  */
       
   574 
       
   575 GLOBAL(void)
       
   576 jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
   577 {
       
   578   INT32 tmp0, tmp1, tmp2;
       
   579   INT32 tmp10, tmp11;
       
   580   DCTELEM *dataptr;
       
   581   JSAMPROW elemptr;
       
   582   int ctr;
       
   583   SHIFT_TEMPS
       
   584 
       
   585   /* Pre-zero output coefficient block. */
       
   586   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
   587 
       
   588   /* Pass 1: process rows. */
       
   589   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
   590   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
    591   /* We scale the results further by 2 as part of output adaptation */
       
   592   /* scaling for different DCT size. */
       
   593   /* cK represents sqrt(2) * cos(K*pi/10). */
       
   594 
       
   595   dataptr = data;
       
   596   for (ctr = 0; ctr < 5; ctr++) {
       
   597     elemptr = sample_data[ctr] + start_col;
       
   598 
       
   599     /* Even part */
       
   600 
       
   601     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
       
   602     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
       
   603     tmp2 = GETJSAMPLE(elemptr[2]);
       
   604 
       
   605     tmp10 = tmp0 + tmp1;
       
   606     tmp11 = tmp0 - tmp1;
       
   607 
       
   608     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
       
   609     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
       
   610 
       
   611     /* Apply unsigned->signed conversion */
       
   612     dataptr[0] = (DCTELEM)
       
   613       ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
       
   614     tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
       
   615     tmp10 -= tmp2 << 2;
       
   616     tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
       
   617     dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
       
   618     dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
       
   619 
       
   620     /* Odd part */
       
   621 
       
   622     tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
       
   623 
       
   624     dataptr[1] = (DCTELEM)
       
   625       DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
       
   626 	      CONST_BITS-PASS1_BITS-1);
       
   627     dataptr[3] = (DCTELEM)
       
   628       DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
       
   629 	      CONST_BITS-PASS1_BITS-1);
       
   630 
       
   631     dataptr += DCTSIZE;		/* advance pointer to next row */
       
   632   }
       
   633 
       
   634   /* Pass 2: process columns.
       
   635    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
   636    * by an overall factor of 8.
       
   637    * We must also scale the output by (8/5)**2 = 64/25, which we partially
       
    638    * fold into the constant multipliers (the other part was done in pass 1):
       
   639    * cK now represents sqrt(2) * cos(K*pi/10) * 32/25.
       
   640    */
       
   641 
       
   642   dataptr = data;
       
   643   for (ctr = 0; ctr < 5; ctr++) {
       
   644     /* Even part */
       
   645 
       
   646     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
       
   647     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
       
   648     tmp2 = dataptr[DCTSIZE*2];
       
   649 
       
   650     tmp10 = tmp0 + tmp1;
       
   651     tmp11 = tmp0 - tmp1;
       
   652 
       
   653     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
       
   654     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
       
   655 
       
   656     dataptr[DCTSIZE*0] = (DCTELEM)
       
   657       DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
       
   658 	      CONST_BITS+PASS1_BITS);
       
   659     tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
       
   660     tmp10 -= tmp2 << 2;
       
   661     tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
       
   662     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
       
   663     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
       
   664 
       
   665     /* Odd part */
       
   666 
       
   667     tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */
       
   668 
       
   669     dataptr[DCTSIZE*1] = (DCTELEM)
       
   670       DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
       
   671 	      CONST_BITS+PASS1_BITS);
       
   672     dataptr[DCTSIZE*3] = (DCTELEM)
       
   673       DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
       
   674 	      CONST_BITS+PASS1_BITS);
       
   675 
       
   676     dataptr++;			/* advance pointer to next column */
       
   677   }
       
   678 }
       
   679 
       
   680 
       
   681 /*
       
   682  * Perform the forward DCT on a 4x4 sample block.
       
   683  */
       
   684 
       
   685 GLOBAL(void)
       
   686 jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
   687 {
       
   688   INT32 tmp0, tmp1;
       
   689   INT32 tmp10, tmp11;
       
   690   DCTELEM *dataptr;
       
   691   JSAMPROW elemptr;
       
   692   int ctr;
       
   693   SHIFT_TEMPS
       
   694 
       
   695   /* Pre-zero output coefficient block. */
       
   696   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
   697 
       
   698   /* Pass 1: process rows. */
       
   699   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
   700   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
   701   /* We must also scale the output by (8/4)**2 = 2**2, which we add here. */
       
   702   /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
       
   703 
       
   704   dataptr = data;
       
   705   for (ctr = 0; ctr < 4; ctr++) {
       
   706     elemptr = sample_data[ctr] + start_col;
       
   707 
       
   708     /* Even part */
       
   709 
       
   710     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
       
   711     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
       
   712 
       
   713     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
       
   714     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
       
   715 
       
   716     /* Apply unsigned->signed conversion */
       
   717     dataptr[0] = (DCTELEM)
       
   718       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
       
   719     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
       
   720 
       
   721     /* Odd part */
       
   722 
       
   723     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
       
   724     /* Add fudge factor here for final descale. */
       
   725     tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
       
   726 
       
   727     dataptr[1] = (DCTELEM)
       
   728       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
       
   729 		  CONST_BITS-PASS1_BITS-2);
       
   730     dataptr[3] = (DCTELEM)
       
   731       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
       
   732 		  CONST_BITS-PASS1_BITS-2);
       
   733 
       
   734     dataptr += DCTSIZE;		/* advance pointer to next row */
       
   735   }
       
   736 
       
   737   /* Pass 2: process columns.
       
   738    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
   739    * by an overall factor of 8.
       
   740    */
       
   741 
       
   742   dataptr = data;
       
   743   for (ctr = 0; ctr < 4; ctr++) {
       
   744     /* Even part */
       
   745 
       
   746     /* Add fudge factor here for final descale. */
       
   747     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
       
   748     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
       
   749 
       
   750     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
       
   751     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
       
   752 
       
   753     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
       
   754     dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
       
   755 
       
   756     /* Odd part */
       
   757 
       
   758     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
       
   759     /* Add fudge factor here for final descale. */
       
   760     tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
       
   761 
       
   762     dataptr[DCTSIZE*1] = (DCTELEM)
       
   763       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
       
   764 		  CONST_BITS+PASS1_BITS);
       
   765     dataptr[DCTSIZE*3] = (DCTELEM)
       
   766       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
       
   767 		  CONST_BITS+PASS1_BITS);
       
   768 
       
   769     dataptr++;			/* advance pointer to next column */
       
   770   }
       
   771 }
       
   772 
       
   773 
       
   774 /*
       
   775  * Perform the forward DCT on a 3x3 sample block.
       
   776  */
       
   777 
       
   778 GLOBAL(void)
       
   779 jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
   780 {
       
   781   INT32 tmp0, tmp1, tmp2;
       
   782   DCTELEM *dataptr;
       
   783   JSAMPROW elemptr;
       
   784   int ctr;
       
   785   SHIFT_TEMPS
       
   786 
       
   787   /* Pre-zero output coefficient block. */
       
   788   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
   789 
       
   790   /* Pass 1: process rows. */
       
   791   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
   792   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
   793   /* We scale the results further by 2**2 as part of output adaption */
       
   794   /* scaling for different DCT size. */
       
   795   /* cK represents sqrt(2) * cos(K*pi/6). */
       
   796 
       
   797   dataptr = data;
       
   798   for (ctr = 0; ctr < 3; ctr++) {
       
   799     elemptr = sample_data[ctr] + start_col;
       
   800 
       
   801     /* Even part */
       
   802 
       
   803     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
       
   804     tmp1 = GETJSAMPLE(elemptr[1]);
       
   805 
       
   806     tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
       
   807 
       
   808     /* Apply unsigned->signed conversion */
       
   809     dataptr[0] = (DCTELEM)
       
   810       ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
       
   811     dataptr[2] = (DCTELEM)
       
   812       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
       
   813 	      CONST_BITS-PASS1_BITS-2);
       
   814 
       
   815     /* Odd part */
       
   816 
       
   817     dataptr[1] = (DCTELEM)
       
   818       DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
       
   819 	      CONST_BITS-PASS1_BITS-2);
       
   820 
       
   821     dataptr += DCTSIZE;		/* advance pointer to next row */
       
   822   }
       
   823 
       
   824   /* Pass 2: process columns.
       
   825    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
   826    * by an overall factor of 8.
       
   827    * We must also scale the output by (8/3)**2 = 64/9, which we partially
       
   828    * fold into the constant multipliers (other part was done in pass 1):
       
   829    * cK now represents sqrt(2) * cos(K*pi/6) * 16/9.
       
   830    */
       
   831 
       
   832   dataptr = data;
       
   833   for (ctr = 0; ctr < 3; ctr++) {
       
   834     /* Even part */
       
   835 
       
   836     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
       
   837     tmp1 = dataptr[DCTSIZE*1];
       
   838 
       
   839     tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
       
   840 
       
   841     dataptr[DCTSIZE*0] = (DCTELEM)
       
   842       DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
       
   843 	      CONST_BITS+PASS1_BITS);
       
   844     dataptr[DCTSIZE*2] = (DCTELEM)
       
   845       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
       
   846 	      CONST_BITS+PASS1_BITS);
       
   847 
       
   848     /* Odd part */
       
   849 
       
   850     dataptr[DCTSIZE*1] = (DCTELEM)
       
   851       DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
       
   852 	      CONST_BITS+PASS1_BITS);
       
   853 
       
   854     dataptr++;			/* advance pointer to next column */
       
   855   }
       
   856 }
       
   857 
       
   858 
       
   859 /*
       
   860  * Perform the forward DCT on a 2x2 sample block.
       
   861  */
       
   862 
       
   863 GLOBAL(void)
       
   864 jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
   865 {
       
   866   INT32 tmp0, tmp1, tmp2, tmp3;
       
   867   JSAMPROW elemptr;
       
   868 
       
   869   /* Pre-zero output coefficient block. */
       
   870   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
   871 
       
   872   /* Pass 1: process rows. */
       
   873   /* Note results are scaled up by sqrt(8) compared to a true DCT. */
       
   874 
       
   875   /* Row 0 */
       
   876   elemptr = sample_data[0] + start_col;
       
   877 
       
   878   tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
       
   879   tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
       
   880 
       
   881   /* Row 1 */
       
   882   elemptr = sample_data[1] + start_col;
       
   883 
       
   884   tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
       
   885   tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
       
   886 
       
   887   /* Pass 2: process columns.
       
   888    * We leave the results scaled up by an overall factor of 8.
       
   889    * We must also scale the output by (8/2)**2 = 2**4.
       
   890    */
       
   891 
       
   892   /* Column 0 */
       
   893   /* Apply unsigned->signed conversion */
       
   894   data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4);
       
   895   data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4);
       
   896 
       
   897   /* Column 1 */
       
   898   data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4);
       
   899   data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4);
       
   900 }
       
   901 
       
   902 
       
   903 /*
       
   904  * Perform the forward DCT on a 1x1 sample block.
       
   905  */
       
   906 
       
   907 GLOBAL(void)
       
   908 jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
   909 {
       
   910   /* Pre-zero output coefficient block. */
       
   911   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
   912 
       
   913   /* We leave the result scaled up by an overall factor of 8. */
       
   914   /* We must also scale the output by (8/1)**2 = 2**6. */
       
   915   /* Apply unsigned->signed conversion */
       
   916   data[0] = (DCTELEM)
       
   917     ((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6);
       
   918 }
       
   919 
       
   920 
       
   921 /*
       
   922  * Perform the forward DCT on a 9x9 sample block.
       
   923  */
       
   924 
       
   925 GLOBAL(void)
       
   926 jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
   927 {
       
   928   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
       
   929   INT32 tmp10, tmp11, tmp12, tmp13;
       
   930   INT32 z1, z2;
       
   931   DCTELEM workspace[8];
       
   932   DCTELEM *dataptr;
       
   933   DCTELEM *wsptr;
       
   934   JSAMPROW elemptr;
       
   935   int ctr;
       
   936   SHIFT_TEMPS
       
   937 
       
   938   /* Pass 1: process rows. */
       
   939   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
   940   /* we scale the results further by 2 as part of output adaption */
       
   941   /* scaling for different DCT size. */
       
   942   /* cK represents sqrt(2) * cos(K*pi/18). */
       
   943 
       
   944   dataptr = data;
       
   945   ctr = 0;
       
   946   for (;;) {
       
   947     elemptr = sample_data[ctr] + start_col;
       
   948 
       
   949     /* Even part */
       
   950 
       
   951     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
       
   952     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
       
   953     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
       
   954     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
       
   955     tmp4 = GETJSAMPLE(elemptr[4]);
       
   956 
       
   957     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
       
   958     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
       
   959     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
       
   960     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
       
   961 
       
   962     z1 = tmp0 + tmp2 + tmp3;
       
   963     z2 = tmp1 + tmp4;
       
   964     /* Apply unsigned->signed conversion */
       
   965     dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
       
   966     dataptr[6] = (DCTELEM)
       
   967       DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)),  /* c6 */
       
   968 	      CONST_BITS-1);
       
   969     z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049));        /* c2 */
       
   970     z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
       
   971     dataptr[2] = (DCTELEM)
       
   972       DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441))    /* c4 */
       
   973 	      + z1 + z2, CONST_BITS-1);
       
   974     dataptr[4] = (DCTELEM)
       
   975       DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608))    /* c8 */
       
   976 	      + z1 - z2, CONST_BITS-1);
       
   977 
       
   978     /* Odd part */
       
   979 
       
   980     dataptr[3] = (DCTELEM)
       
   981       DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
       
   982 	      CONST_BITS-1);
       
   983 
       
   984     tmp11 = MULTIPLY(tmp11, FIX(1.224744871));        /* c3 */
       
   985     tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
       
   986     tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
       
   987 
       
   988     dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);
       
   989 
       
   990     tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
       
   991 
       
   992     dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
       
   993     dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);
       
   994 
       
   995     ctr++;
       
   996 
       
   997     if (ctr != DCTSIZE) {
       
   998       if (ctr == 9)
       
   999 	break;			/* Done. */
       
  1000       dataptr += DCTSIZE;	/* advance pointer to next row */
       
  1001     } else
       
  1002       dataptr = workspace;	/* switch pointer to extended workspace */
       
  1003   }
       
  1004 
       
  1005   /* Pass 2: process columns.
       
  1006    * We leave the results scaled up by an overall factor of 8.
       
  1007    * We must also scale the output by (8/9)**2 = 64/81, which we partially
       
  1008    * fold into the constant multipliers and final/initial shifting:
       
  1009    * cK now represents sqrt(2) * cos(K*pi/18) * 128/81.
       
  1010    */
       
  1011 
       
  1012   dataptr = data;
       
  1013   wsptr = workspace;
       
  1014   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  1015     /* Even part */
       
  1016 
       
  1017     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0];
       
  1018     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7];
       
  1019     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6];
       
  1020     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5];
       
  1021     tmp4 = dataptr[DCTSIZE*4];
       
  1022 
       
  1023     tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0];
       
  1024     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7];
       
  1025     tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6];
       
  1026     tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5];
       
  1027 
       
  1028     z1 = tmp0 + tmp2 + tmp3;
       
  1029     z2 = tmp1 + tmp4;
       
  1030     dataptr[DCTSIZE*0] = (DCTELEM)
       
  1031       DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)),       /* 128/81 */
       
  1032 	      CONST_BITS+2);
       
  1033     dataptr[DCTSIZE*6] = (DCTELEM)
       
  1034       DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)),  /* c6 */
       
  1035 	      CONST_BITS+2);
       
  1036     z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287));        /* c2 */
       
  1037     z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
       
  1038     dataptr[DCTSIZE*2] = (DCTELEM)
       
  1039       DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190))    /* c4 */
       
  1040 	      + z1 + z2, CONST_BITS+2);
       
  1041     dataptr[DCTSIZE*4] = (DCTELEM)
       
  1042       DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096))    /* c8 */
       
  1043 	      + z1 - z2, CONST_BITS+2);
       
  1044 
       
  1045     /* Odd part */
       
  1046 
       
  1047     dataptr[DCTSIZE*3] = (DCTELEM)
       
  1048       DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
       
  1049 	      CONST_BITS+2);
       
  1050 
       
  1051     tmp11 = MULTIPLY(tmp11, FIX(1.935399303));        /* c3 */
       
  1052     tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
       
  1053     tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
       
  1054 
       
  1055     dataptr[DCTSIZE*1] = (DCTELEM)
       
  1056       DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);
       
  1057 
       
  1058     tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
       
  1059 
       
  1060     dataptr[DCTSIZE*5] = (DCTELEM)
       
  1061       DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
       
  1062     dataptr[DCTSIZE*7] = (DCTELEM)
       
  1063       DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);
       
  1064 
       
  1065     dataptr++;			/* advance pointer to next column */
       
  1066     wsptr++;			/* advance pointer to next column */
       
  1067   }
       
  1068 }
       
  1069 
       
  1070 
       
  1071 /*
       
  1072  * Perform the forward DCT on a 10x10 sample block.
       
  1073  */
       
  1074 
       
  1075 GLOBAL(void)
       
  1076 jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  1077 {
       
  1078   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
       
  1079   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
       
  1080   DCTELEM workspace[8*2];
       
  1081   DCTELEM *dataptr;
       
  1082   DCTELEM *wsptr;
       
  1083   JSAMPROW elemptr;
       
  1084   int ctr;
       
  1085   SHIFT_TEMPS
       
  1086 
       
  1087   /* Pass 1: process rows. */
       
  1088   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  1089   /* we scale the results further by 2 as part of output adaption */
       
  1090   /* scaling for different DCT size. */
       
  1091   /* cK represents sqrt(2) * cos(K*pi/20). */
       
  1092 
       
  1093   dataptr = data;
       
  1094   ctr = 0;
       
  1095   for (;;) {
       
  1096     elemptr = sample_data[ctr] + start_col;
       
  1097 
       
  1098     /* Even part */
       
  1099 
       
  1100     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
       
  1101     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
       
  1102     tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
       
  1103     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
       
  1104     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
       
  1105 
       
  1106     tmp10 = tmp0 + tmp4;
       
  1107     tmp13 = tmp0 - tmp4;
       
  1108     tmp11 = tmp1 + tmp3;
       
  1109     tmp14 = tmp1 - tmp3;
       
  1110 
       
  1111     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
       
  1112     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
       
  1113     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
       
  1114     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
       
  1115     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
       
  1116 
       
  1117     /* Apply unsigned->signed conversion */
       
  1118     dataptr[0] = (DCTELEM)
       
  1119       ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
       
  1120     tmp12 += tmp12;
       
  1121     dataptr[4] = (DCTELEM)
       
  1122       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
       
  1123 	      MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
       
  1124 	      CONST_BITS-1);
       
  1125     tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
       
  1126     dataptr[2] = (DCTELEM)
       
  1127       DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
       
  1128 	      CONST_BITS-1);
       
  1129     dataptr[6] = (DCTELEM)
       
  1130       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
       
  1131 	      CONST_BITS-1);
       
  1132 
       
  1133     /* Odd part */
       
  1134 
       
  1135     tmp10 = tmp0 + tmp4;
       
  1136     tmp11 = tmp1 - tmp3;
       
  1137     dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
       
  1138     tmp2 <<= CONST_BITS;
       
  1139     dataptr[1] = (DCTELEM)
       
  1140       DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
       
  1141 	      MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
       
  1142 	      MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
       
  1143 	      MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
       
  1144 	      CONST_BITS-1);
       
  1145     tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
       
  1146 	    MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
       
  1147     tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
       
  1148 	    (tmp11 << (CONST_BITS - 1)) - tmp2;
       
  1149     dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
       
  1150     dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);
       
  1151 
       
  1152     ctr++;
       
  1153 
       
  1154     if (ctr != DCTSIZE) {
       
  1155       if (ctr == 10)
       
  1156 	break;			/* Done. */
       
  1157       dataptr += DCTSIZE;	/* advance pointer to next row */
       
  1158     } else
       
  1159       dataptr = workspace;	/* switch pointer to extended workspace */
       
  1160   }
       
  1161 
       
  1162   /* Pass 2: process columns.
       
  1163    * We leave the results scaled up by an overall factor of 8.
       
  1164    * We must also scale the output by (8/10)**2 = 16/25, which we partially
       
  1165    * fold into the constant multipliers and final/initial shifting:
       
  1166    * cK now represents sqrt(2) * cos(K*pi/20) * 32/25.
       
  1167    */
       
  1168 
       
  1169   dataptr = data;
       
  1170   wsptr = workspace;
       
  1171   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  1172     /* Even part */
       
  1173 
       
  1174     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
       
  1175     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
       
  1176     tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
       
  1177     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
       
  1178     tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
       
  1179 
       
  1180     tmp10 = tmp0 + tmp4;
       
  1181     tmp13 = tmp0 - tmp4;
       
  1182     tmp11 = tmp1 + tmp3;
       
  1183     tmp14 = tmp1 - tmp3;
       
  1184 
       
  1185     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
       
  1186     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
       
  1187     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
       
  1188     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
       
  1189     tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
       
  1190 
       
  1191     dataptr[DCTSIZE*0] = (DCTELEM)
       
  1192       DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
       
  1193 	      CONST_BITS+2);
       
  1194     tmp12 += tmp12;
       
  1195     dataptr[DCTSIZE*4] = (DCTELEM)
       
  1196       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
       
  1197 	      MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
       
  1198 	      CONST_BITS+2);
       
  1199     tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
       
  1200     dataptr[DCTSIZE*2] = (DCTELEM)
       
  1201       DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
       
  1202 	      CONST_BITS+2);
       
  1203     dataptr[DCTSIZE*6] = (DCTELEM)
       
  1204       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
       
  1205 	      CONST_BITS+2);
       
  1206 
       
  1207     /* Odd part */
       
  1208 
       
  1209     tmp10 = tmp0 + tmp4;
       
  1210     tmp11 = tmp1 - tmp3;
       
  1211     dataptr[DCTSIZE*5] = (DCTELEM)
       
  1212       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
       
  1213 	      CONST_BITS+2);
       
  1214     tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
       
  1215     dataptr[DCTSIZE*1] = (DCTELEM)
       
  1216       DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
       
  1217 	      MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
       
  1218 	      MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
       
  1219 	      MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
       
  1220 	      CONST_BITS+2);
       
  1221     tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
       
  1222 	    MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
       
  1223     tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
       
  1224 	    MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
       
  1225     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
       
  1226     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);
       
  1227 
       
  1228     dataptr++;			/* advance pointer to next column */
       
  1229     wsptr++;			/* advance pointer to next column */
       
  1230   }
       
  1231 }
       
  1232 
       
  1233 
       
  1234 /*
       
  1235  * Perform the forward DCT on an 11x11 sample block.
       
  1236  */
       
  1237 
       
  1238 GLOBAL(void)
       
  1239 jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  1240 {
       
  1241   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
       
  1242   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
       
  1243   INT32 z1, z2, z3;
       
  1244   DCTELEM workspace[8*3];
       
  1245   DCTELEM *dataptr;
       
  1246   DCTELEM *wsptr;
       
  1247   JSAMPROW elemptr;
       
  1248   int ctr;
       
  1249   SHIFT_TEMPS
       
  1250 
       
  1251   /* Pass 1: process rows. */
       
  1252   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  1253   /* we scale the results further by 2 as part of output adaption */
       
  1254   /* scaling for different DCT size. */
       
  1255   /* cK represents sqrt(2) * cos(K*pi/22). */
       
  1256 
       
  1257   dataptr = data;
       
  1258   ctr = 0;
       
  1259   for (;;) {
       
  1260     elemptr = sample_data[ctr] + start_col;
       
  1261 
       
  1262     /* Even part */
       
  1263 
       
  1264     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
       
  1265     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
       
  1266     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
       
  1267     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
       
  1268     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
       
  1269     tmp5 = GETJSAMPLE(elemptr[5]);
       
  1270 
       
  1271     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
       
  1272     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
       
  1273     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
       
  1274     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
       
  1275     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
       
  1276 
       
  1277     /* Apply unsigned->signed conversion */
       
  1278     dataptr[0] = (DCTELEM)
       
  1279       ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
       
  1280     tmp5 += tmp5;
       
  1281     tmp0 -= tmp5;
       
  1282     tmp1 -= tmp5;
       
  1283     tmp2 -= tmp5;
       
  1284     tmp3 -= tmp5;
       
  1285     tmp4 -= tmp5;
       
  1286     z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) +       /* c2 */
       
  1287 	 MULTIPLY(tmp2 + tmp4, FIX(0.201263574));        /* c10 */
       
  1288     z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931));        /* c6 */
       
  1289     z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156));        /* c4 */
       
  1290     dataptr[2] = (DCTELEM)
       
  1291       DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
       
  1292 	      - MULTIPLY(tmp4, FIX(1.390975730)),        /* c4+c10 */
       
  1293 	      CONST_BITS-1);
       
  1294     dataptr[4] = (DCTELEM)
       
  1295       DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
       
  1296 	      - MULTIPLY(tmp2, FIX(1.356927976))         /* c2 */
       
  1297 	      + MULTIPLY(tmp4, FIX(0.587485545)),        /* c8 */
       
  1298 	      CONST_BITS-1);
       
  1299     dataptr[6] = (DCTELEM)
       
  1300       DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
       
  1301 	      - MULTIPLY(tmp2, FIX(0.788749120)),        /* c8+c10 */
       
  1302 	      CONST_BITS-1);
       
  1303 
       
  1304     /* Odd part */
       
  1305 
       
  1306     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905));    /* c3 */
       
  1307     tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298));    /* c5 */
       
  1308     tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576));    /* c7 */
       
  1309     tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
       
  1310 	   + MULTIPLY(tmp14, FIX(0.398430003));          /* c9 */
       
  1311     tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576));  /* -c7 */
       
  1312     tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907));  /* -c1 */
       
  1313     tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
       
  1314 	    - MULTIPLY(tmp14, FIX(1.068791298));         /* c5 */
       
  1315     tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003));   /* c9 */
       
  1316     tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
       
  1317 	    + MULTIPLY(tmp14, FIX(1.399818907));         /* c1 */
       
  1318     tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
       
  1319 	    - MULTIPLY(tmp14, FIX(1.286413905));         /* c3 */
       
  1320 
       
  1321     dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
       
  1322     dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
       
  1323     dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
       
  1324     dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);
       
  1325 
       
  1326     ctr++;
       
  1327 
       
  1328     if (ctr != DCTSIZE) {
       
  1329       if (ctr == 11)
       
  1330 	break;			/* Done. */
       
  1331       dataptr += DCTSIZE;	/* advance pointer to next row */
       
  1332     } else
       
  1333       dataptr = workspace;	/* switch pointer to extended workspace */
       
  1334   }
       
  1335 
       
  1336   /* Pass 2: process columns.
       
  1337    * We leave the results scaled up by an overall factor of 8.
       
  1338    * We must also scale the output by (8/11)**2 = 64/121, which we partially
       
  1339    * fold into the constant multipliers and final/initial shifting:
       
  1340    * cK now represents sqrt(2) * cos(K*pi/22) * 128/121.
       
  1341    */
       
  1342 
       
  1343   dataptr = data;
       
  1344   wsptr = workspace;
       
  1345   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  1346     /* Even part */
       
  1347 
       
  1348     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2];
       
  1349     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1];
       
  1350     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0];
       
  1351     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7];
       
  1352     tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6];
       
  1353     tmp5 = dataptr[DCTSIZE*5];
       
  1354 
       
  1355     tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2];
       
  1356     tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1];
       
  1357     tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0];
       
  1358     tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7];
       
  1359     tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6];
       
  1360 
       
  1361     dataptr[DCTSIZE*0] = (DCTELEM)
       
  1362       DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
       
  1363 		       FIX(1.057851240)),                /* 128/121 */
       
  1364 	      CONST_BITS+2);
       
  1365     tmp5 += tmp5;
       
  1366     tmp0 -= tmp5;
       
  1367     tmp1 -= tmp5;
       
  1368     tmp2 -= tmp5;
       
  1369     tmp3 -= tmp5;
       
  1370     tmp4 -= tmp5;
       
  1371     z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) +       /* c2 */
       
  1372 	 MULTIPLY(tmp2 + tmp4, FIX(0.212906922));        /* c10 */
       
  1373     z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713));        /* c6 */
       
  1374     z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479));        /* c4 */
       
  1375     dataptr[DCTSIZE*2] = (DCTELEM)
       
  1376       DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
       
  1377 	      - MULTIPLY(tmp4, FIX(1.471445400)),        /* c4+c10 */
       
  1378 	      CONST_BITS+2);
       
  1379     dataptr[DCTSIZE*4] = (DCTELEM)
       
  1380       DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
       
  1381 	      - MULTIPLY(tmp2, FIX(1.435427942))         /* c2 */
       
  1382 	      + MULTIPLY(tmp4, FIX(0.621472312)),        /* c8 */
       
  1383 	      CONST_BITS+2);
       
  1384     dataptr[DCTSIZE*6] = (DCTELEM)
       
  1385       DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
       
  1386 	      - MULTIPLY(tmp2, FIX(0.834379234)),        /* c8+c10 */
       
  1387 	      CONST_BITS+2);
       
  1388 
       
  1389     /* Odd part */
       
  1390 
       
  1391     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544));    /* c3 */
       
  1392     tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199));    /* c5 */
       
  1393     tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568));    /* c7 */
       
  1394     tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
       
  1395 	   + MULTIPLY(tmp14, FIX(0.421479672));          /* c9 */
       
  1396     tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568));  /* -c7 */
       
  1397     tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167));  /* -c1 */
       
  1398     tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
       
  1399 	    - MULTIPLY(tmp14, FIX(1.130622199));         /* c5 */
       
  1400     tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672));   /* c9 */
       
  1401     tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
       
  1402 	    + MULTIPLY(tmp14, FIX(1.480800167));         /* c1 */
       
  1403     tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
       
  1404 	    - MULTIPLY(tmp14, FIX(1.360834544));         /* c3 */
       
  1405 
       
  1406     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
       
  1407     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
       
  1408     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
       
  1409     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
       
  1410 
       
  1411     dataptr++;			/* advance pointer to next column */
       
  1412     wsptr++;			/* advance pointer to next column */
       
  1413   }
       
  1414 }
       
  1415 
       
  1416 
       
  1417 /*
       
  1418  * Perform the forward DCT on a 12x12 sample block.
       
  1419  */
       
  1420 
       
  1421 GLOBAL(void)
       
  1422 jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  1423 {
       
  1424   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
       
  1425   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
       
  1426   DCTELEM workspace[8*4];
       
  1427   DCTELEM *dataptr;
       
  1428   DCTELEM *wsptr;
       
  1429   JSAMPROW elemptr;
       
  1430   int ctr;
       
  1431   SHIFT_TEMPS
       
  1432 
       
  1433   /* Pass 1: process rows. */
       
  1434   /* Note results are scaled up by sqrt(8) compared to a true DCT. */
       
  1435   /* cK represents sqrt(2) * cos(K*pi/24). */
       
  1436 
       
  1437   dataptr = data;
       
  1438   ctr = 0;
       
  1439   for (;;) {
       
  1440     elemptr = sample_data[ctr] + start_col;
       
  1441 
       
  1442     /* Even part */
       
  1443 
       
  1444     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
       
  1445     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
       
  1446     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
       
  1447     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
       
  1448     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
       
  1449     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
       
  1450 
       
  1451     tmp10 = tmp0 + tmp5;
       
  1452     tmp13 = tmp0 - tmp5;
       
  1453     tmp11 = tmp1 + tmp4;
       
  1454     tmp14 = tmp1 - tmp4;
       
  1455     tmp12 = tmp2 + tmp3;
       
  1456     tmp15 = tmp2 - tmp3;
       
  1457 
       
  1458     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
       
  1459     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
       
  1460     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
       
  1461     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
       
  1462     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
       
  1463     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
       
  1464 
       
  1465     /* Apply unsigned->signed conversion */
       
  1466     dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
       
  1467     dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
       
  1468     dataptr[4] = (DCTELEM)
       
  1469       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
       
  1470 	      CONST_BITS);
       
  1471     dataptr[2] = (DCTELEM)
       
  1472       DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
       
  1473 	      CONST_BITS);
       
  1474 
       
  1475     /* Odd part */
       
  1476 
       
  1477     tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
       
  1478     tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
       
  1479     tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
       
  1480     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
       
  1481     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
       
  1482     tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
       
  1483 	    + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
       
  1484     tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
       
  1485     tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
       
  1486 	    + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
       
  1487     tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
       
  1488 	    - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
       
  1489     tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
       
  1490 	    - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */
       
  1491 
       
  1492     dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
       
  1493     dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
       
  1494     dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
       
  1495     dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
       
  1496 
       
  1497     ctr++;
       
  1498 
       
  1499     if (ctr != DCTSIZE) {
       
  1500       if (ctr == 12)
       
  1501 	break;			/* Done. */
       
  1502       dataptr += DCTSIZE;	/* advance pointer to next row */
       
  1503     } else
       
  1504       dataptr = workspace;	/* switch pointer to extended workspace */
       
  1505   }
       
  1506 
       
  1507   /* Pass 2: process columns.
       
  1508    * We leave the results scaled up by an overall factor of 8.
       
  1509    * We must also scale the output by (8/12)**2 = 4/9, which we partially
       
  1510    * fold into the constant multipliers and final shifting:
       
  1511    * cK now represents sqrt(2) * cos(K*pi/24) * 8/9.
       
  1512    */
       
  1513 
       
  1514   dataptr = data;
       
  1515   wsptr = workspace;
       
  1516   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  1517     /* Even part */
       
  1518 
       
  1519     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
       
  1520     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
       
  1521     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
       
  1522     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
       
  1523     tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
       
  1524     tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
       
  1525 
       
  1526     tmp10 = tmp0 + tmp5;
       
  1527     tmp13 = tmp0 - tmp5;
       
  1528     tmp11 = tmp1 + tmp4;
       
  1529     tmp14 = tmp1 - tmp4;
       
  1530     tmp12 = tmp2 + tmp3;
       
  1531     tmp15 = tmp2 - tmp3;
       
  1532 
       
  1533     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
       
  1534     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
       
  1535     tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
       
  1536     tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
       
  1537     tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
       
  1538     tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
       
  1539 
       
  1540     dataptr[DCTSIZE*0] = (DCTELEM)
       
  1541       DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
       
  1542 	      CONST_BITS+1);
       
  1543     dataptr[DCTSIZE*6] = (DCTELEM)
       
  1544       DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
       
  1545 	      CONST_BITS+1);
       
  1546     dataptr[DCTSIZE*4] = (DCTELEM)
       
  1547       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
       
  1548 	      CONST_BITS+1);
       
  1549     dataptr[DCTSIZE*2] = (DCTELEM)
       
  1550       DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
       
  1551 	      MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
       
  1552 	      CONST_BITS+1);
       
  1553 
       
  1554     /* Odd part */
       
  1555 
       
  1556     tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
       
  1557     tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
       
  1558     tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
       
  1559     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
       
  1560     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
       
  1561     tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
       
  1562 	    + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
       
  1563     tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
       
  1564     tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
       
  1565 	    + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
       
  1566     tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
       
  1567 	    - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
       
  1568     tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
       
  1569 	    - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
       
  1570 
       
  1571     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
       
  1572     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
       
  1573     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
       
  1574     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);
       
  1575 
       
  1576     dataptr++;			/* advance pointer to next column */
       
  1577     wsptr++;			/* advance pointer to next column */
       
  1578   }
       
  1579 }
       
  1580 
       
  1581 
       
  1582 /*
       
  1583  * Perform the forward DCT on a 13x13 sample block.
       
  1584  */
       
  1585 
       
  1586 GLOBAL(void)
       
  1587 jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  1588 {
       
  1589   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
       
  1590   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
       
  1591   INT32 z1, z2;
       
  1592   DCTELEM workspace[8*5];
       
  1593   DCTELEM *dataptr;
       
  1594   DCTELEM *wsptr;
       
  1595   JSAMPROW elemptr;
       
  1596   int ctr;
       
  1597   SHIFT_TEMPS
       
  1598 
       
  1599   /* Pass 1: process rows. */
       
  1600   /* Note results are scaled up by sqrt(8) compared to a true DCT. */
       
  1601   /* cK represents sqrt(2) * cos(K*pi/26). */
       
  1602 
       
  1603   dataptr = data;
       
  1604   ctr = 0;
       
  1605   for (;;) {
       
  1606     elemptr = sample_data[ctr] + start_col;
       
  1607 
       
  1608     /* Even part */
       
  1609 
       
  1610     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
       
  1611     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
       
  1612     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
       
  1613     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
       
  1614     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
       
  1615     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
       
  1616     tmp6 = GETJSAMPLE(elemptr[6]);
       
  1617 
       
  1618     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
       
  1619     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
       
  1620     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
       
  1621     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
       
  1622     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
       
  1623     tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
       
  1624 
       
  1625     /* Apply unsigned->signed conversion */
       
  1626     dataptr[0] = (DCTELEM)
       
  1627       (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
       
  1628     tmp6 += tmp6;
       
  1629     tmp0 -= tmp6;
       
  1630     tmp1 -= tmp6;
       
  1631     tmp2 -= tmp6;
       
  1632     tmp3 -= tmp6;
       
  1633     tmp4 -= tmp6;
       
  1634     tmp5 -= tmp6;
       
  1635     dataptr[2] = (DCTELEM)
       
  1636       DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) +   /* c2 */
       
  1637 	      MULTIPLY(tmp1, FIX(1.058554052)) +   /* c6 */
       
  1638 	      MULTIPLY(tmp2, FIX(0.501487041)) -   /* c10 */
       
  1639 	      MULTIPLY(tmp3, FIX(0.170464608)) -   /* c12 */
       
  1640 	      MULTIPLY(tmp4, FIX(0.803364869)) -   /* c8 */
       
  1641 	      MULTIPLY(tmp5, FIX(1.252223920)),    /* c4 */
       
  1642 	      CONST_BITS);
       
  1643     z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
       
  1644 	 MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
       
  1645 	 MULTIPLY(tmp1 - tmp5, FIX(0.316450131));  /* (c8-c12)/2 */
       
  1646     z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
       
  1647 	 MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
       
  1648 	 MULTIPLY(tmp1 + tmp5, FIX(0.486914739));  /* (c8+c12)/2 */
       
  1649 
       
  1650     dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
       
  1651     dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
       
  1652 
       
  1653     /* Odd part */
       
  1654 
       
  1655     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651));   /* c3 */
       
  1656     tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945));   /* c5 */
       
  1657     tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) +  /* c7 */
       
  1658 	   MULTIPLY(tmp14 + tmp15, FIX(0.338443458));   /* c11 */
       
  1659     tmp0 = tmp1 + tmp2 + tmp3 -
       
  1660 	   MULTIPLY(tmp10, FIX(2.020082300)) +          /* c3+c5+c7-c1 */
       
  1661 	   MULTIPLY(tmp14, FIX(0.318774355));           /* c9-c11 */
       
  1662     tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) -  /* c7 */
       
  1663 	   MULTIPLY(tmp11 + tmp12, FIX(0.338443458));   /* c11 */
       
  1664     tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
       
  1665     tmp1 += tmp4 + tmp5 +
       
  1666 	    MULTIPLY(tmp11, FIX(0.837223564)) -         /* c5+c9+c11-c3 */
       
  1667 	    MULTIPLY(tmp14, FIX(2.341699410));          /* c1+c7 */
       
  1668     tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
       
  1669     tmp2 += tmp4 + tmp6 -
       
  1670 	    MULTIPLY(tmp12, FIX(1.572116027)) +         /* c1+c5-c9-c11 */
       
  1671 	    MULTIPLY(tmp15, FIX(2.260109708));          /* c3+c7 */
       
  1672     tmp3 += tmp5 + tmp6 +
       
  1673 	    MULTIPLY(tmp13, FIX(2.205608352)) -         /* c3+c5+c9-c7 */
       
  1674 	    MULTIPLY(tmp15, FIX(1.742345811));          /* c1+c11 */
       
  1675 
       
  1676     dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
       
  1677     dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
       
  1678     dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
       
  1679     dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
       
  1680 
       
  1681     ctr++;
       
  1682 
       
  1683     if (ctr != DCTSIZE) {
       
  1684       if (ctr == 13)
       
  1685 	break;			/* Done. */
       
  1686       dataptr += DCTSIZE;	/* advance pointer to next row */
       
  1687     } else
       
  1688       dataptr = workspace;	/* switch pointer to extended workspace */
       
  1689   }
       
  1690 
       
  1691   /* Pass 2: process columns.
       
  1692    * We leave the results scaled up by an overall factor of 8.
       
  1693    * We must also scale the output by (8/13)**2 = 64/169, which we partially
       
  1694    * fold into the constant multipliers and final shifting:
       
  1695    * cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
       
  1696    */
       
  1697 
       
  1698   dataptr = data;
       
  1699   wsptr = workspace;
       
  1700   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  1701     /* Even part */
       
  1702 
       
  1703     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4];
       
  1704     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3];
       
  1705     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2];
       
  1706     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1];
       
  1707     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0];
       
  1708     tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7];
       
  1709     tmp6 = dataptr[DCTSIZE*6];
       
  1710 
       
  1711     tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4];
       
  1712     tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3];
       
  1713     tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2];
       
  1714     tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1];
       
  1715     tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0];
       
  1716     tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7];
       
  1717 
       
  1718     dataptr[DCTSIZE*0] = (DCTELEM)
       
  1719       DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
       
  1720 		       FIX(0.757396450)),          /* 128/169 */
       
  1721 	      CONST_BITS+1);
       
  1722     tmp6 += tmp6;
       
  1723     tmp0 -= tmp6;
       
  1724     tmp1 -= tmp6;
       
  1725     tmp2 -= tmp6;
       
  1726     tmp3 -= tmp6;
       
  1727     tmp4 -= tmp6;
       
  1728     tmp5 -= tmp6;
       
  1729     dataptr[DCTSIZE*2] = (DCTELEM)
       
  1730       DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) +   /* c2 */
       
  1731 	      MULTIPLY(tmp1, FIX(0.801745081)) +   /* c6 */
       
  1732 	      MULTIPLY(tmp2, FIX(0.379824504)) -   /* c10 */
       
  1733 	      MULTIPLY(tmp3, FIX(0.129109289)) -   /* c12 */
       
  1734 	      MULTIPLY(tmp4, FIX(0.608465700)) -   /* c8 */
       
  1735 	      MULTIPLY(tmp5, FIX(0.948429952)),    /* c4 */
       
  1736 	      CONST_BITS+1);
       
  1737     z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
       
  1738 	 MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
       
  1739 	 MULTIPLY(tmp1 - tmp5, FIX(0.239678205));  /* (c8-c12)/2 */
       
  1740     z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
       
  1741 	 MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
       
  1742 	 MULTIPLY(tmp1 + tmp5, FIX(0.368787494));  /* (c8+c12)/2 */
       
  1743 
       
  1744     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
       
  1745     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);
       
  1746 
       
  1747     /* Odd part */
       
  1748 
       
  1749     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908));   /* c3 */
       
  1750     tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751));   /* c5 */
       
  1751     tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) +  /* c7 */
       
  1752 	   MULTIPLY(tmp14 + tmp15, FIX(0.256335874));   /* c11 */
       
  1753     tmp0 = tmp1 + tmp2 + tmp3 -
       
  1754 	   MULTIPLY(tmp10, FIX(1.530003162)) +          /* c3+c5+c7-c1 */
       
  1755 	   MULTIPLY(tmp14, FIX(0.241438564));           /* c9-c11 */
       
  1756     tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) -  /* c7 */
       
  1757 	   MULTIPLY(tmp11 + tmp12, FIX(0.256335874));   /* c11 */
       
  1758     tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
       
  1759     tmp1 += tmp4 + tmp5 +
       
  1760 	    MULTIPLY(tmp11, FIX(0.634110155)) -         /* c5+c9+c11-c3 */
       
  1761 	    MULTIPLY(tmp14, FIX(1.773594819));          /* c1+c7 */
       
  1762     tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
       
  1763     tmp2 += tmp4 + tmp6 -
       
  1764 	    MULTIPLY(tmp12, FIX(1.190715098)) +         /* c1+c5-c9-c11 */
       
  1765 	    MULTIPLY(tmp15, FIX(1.711799069));          /* c3+c7 */
       
  1766     tmp3 += tmp5 + tmp6 +
       
  1767 	    MULTIPLY(tmp13, FIX(1.670519935)) -         /* c3+c5+c9-c7 */
       
  1768 	    MULTIPLY(tmp15, FIX(1.319646532));          /* c1+c11 */
       
  1769 
       
  1770     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
       
  1771     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
       
  1772     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
       
  1773     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);
       
  1774 
       
  1775     dataptr++;			/* advance pointer to next column */
       
  1776     wsptr++;			/* advance pointer to next column */
       
  1777   }
       
  1778 }
       
  1779 
       
  1780 
       
  1781 /*
       
  1782  * Perform the forward DCT on a 14x14 sample block.
       
  1783  */
       
  1784 
       
  1785 GLOBAL(void)
       
  1786 jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  1787 {
       
  1788   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
       
  1789   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
       
  1790   DCTELEM workspace[8*6];
       
  1791   DCTELEM *dataptr;
       
  1792   DCTELEM *wsptr;
       
  1793   JSAMPROW elemptr;
       
  1794   int ctr;
       
  1795   SHIFT_TEMPS
       
  1796 
       
  1797   /* Pass 1: process rows. */
       
  1798   /* Note results are scaled up by sqrt(8) compared to a true DCT. */
       
  1799   /* cK represents sqrt(2) * cos(K*pi/28). */
       
  1800 
       
  1801   dataptr = data;
       
  1802   ctr = 0;
       
  1803   for (;;) {
       
  1804     elemptr = sample_data[ctr] + start_col;
       
  1805 
       
  1806     /* Even part */
       
  1807 
       
  1808     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
       
  1809     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
       
  1810     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
       
  1811     tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
       
  1812     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
       
  1813     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
       
  1814     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
       
  1815 
       
  1816     tmp10 = tmp0 + tmp6;
       
  1817     tmp14 = tmp0 - tmp6;
       
  1818     tmp11 = tmp1 + tmp5;
       
  1819     tmp15 = tmp1 - tmp5;
       
  1820     tmp12 = tmp2 + tmp4;
       
  1821     tmp16 = tmp2 - tmp4;
       
  1822 
       
  1823     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
       
  1824     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
       
  1825     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
       
  1826     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
       
  1827     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
       
  1828     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
       
  1829     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
       
  1830 
       
  1831     /* Apply unsigned->signed conversion */
       
  1832     dataptr[0] = (DCTELEM)
       
  1833       (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
       
  1834     tmp13 += tmp13;
       
  1835     dataptr[4] = (DCTELEM)
       
  1836       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
       
  1837 	      MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
       
  1838 	      MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
       
  1839 	      CONST_BITS);
       
  1840 
       
  1841     tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
       
  1842 
       
  1843     dataptr[2] = (DCTELEM)
       
  1844       DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
       
  1845 	      + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
       
  1846 	      CONST_BITS);
       
  1847     dataptr[6] = (DCTELEM)
       
  1848       DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
       
  1849 	      - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
       
  1850 	      CONST_BITS);
       
  1851 
       
  1852     /* Odd part */
       
  1853 
       
  1854     tmp10 = tmp1 + tmp2;
       
  1855     tmp11 = tmp5 - tmp4;
       
  1856     dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
       
  1857     tmp3 <<= CONST_BITS;
       
  1858     tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
       
  1859     tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
       
  1860     tmp10 += tmp11 - tmp3;
       
  1861     tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
       
  1862 	    MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
       
  1863     dataptr[5] = (DCTELEM)
       
  1864       DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
       
  1865 	      + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
       
  1866 	      CONST_BITS);
       
  1867     tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
       
  1868 	    MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
       
  1869     dataptr[3] = (DCTELEM)
       
  1870       DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
       
  1871 	      - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
       
  1872 	      CONST_BITS);
       
  1873     dataptr[1] = (DCTELEM)
       
  1874       DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
       
  1875 	      MULTIPLY(tmp0 + tmp6, FIX(1.126980169)),    /* c3+c5-c1 */
       
  1876 	      CONST_BITS);
       
  1877 
       
  1878     ctr++;
       
  1879 
       
  1880     if (ctr != DCTSIZE) {
       
  1881       if (ctr == 14)
       
  1882 	break;			/* Done. */
       
  1883       dataptr += DCTSIZE;	/* advance pointer to next row */
       
  1884     } else
       
  1885       dataptr = workspace;	/* switch pointer to extended workspace */
       
  1886   }
       
  1887 
       
  1888   /* Pass 2: process columns.
       
  1889    * We leave the results scaled up by an overall factor of 8.
       
  1890    * We must also scale the output by (8/14)**2 = 16/49, which we partially
       
  1891    * fold into the constant multipliers and final shifting:
       
  1892    * cK now represents sqrt(2) * cos(K*pi/28) * 32/49.
       
  1893    */
       
  1894 
       
  1895   dataptr = data;
       
  1896   wsptr = workspace;
       
  1897   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  1898     /* Even part */
       
  1899 
       
  1900     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
       
  1901     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
       
  1902     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
       
  1903     tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
       
  1904     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
       
  1905     tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
       
  1906     tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
       
  1907 
       
  1908     tmp10 = tmp0 + tmp6;
       
  1909     tmp14 = tmp0 - tmp6;
       
  1910     tmp11 = tmp1 + tmp5;
       
  1911     tmp15 = tmp1 - tmp5;
       
  1912     tmp12 = tmp2 + tmp4;
       
  1913     tmp16 = tmp2 - tmp4;
       
  1914 
       
  1915     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
       
  1916     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
       
  1917     tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
       
  1918     tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
       
  1919     tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
       
  1920     tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
       
  1921     tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
       
  1922 
       
  1923     dataptr[DCTSIZE*0] = (DCTELEM)
       
  1924       DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
       
  1925 		       FIX(0.653061224)),                 /* 32/49 */
       
  1926 	      CONST_BITS+1);
       
  1927     tmp13 += tmp13;
       
  1928     dataptr[DCTSIZE*4] = (DCTELEM)
       
  1929       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
       
  1930 	      MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
       
  1931 	      MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
       
  1932 	      CONST_BITS+1);
       
  1933 
       
  1934     tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */
       
  1935 
       
  1936     dataptr[DCTSIZE*2] = (DCTELEM)
       
  1937       DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
       
  1938 	      + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
       
  1939 	      CONST_BITS+1);
       
  1940     dataptr[DCTSIZE*6] = (DCTELEM)
       
  1941       DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
       
  1942 	      - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
       
  1943 	      CONST_BITS+1);
       
  1944 
       
  1945     /* Odd part */
       
  1946 
       
  1947     tmp10 = tmp1 + tmp2;
       
  1948     tmp11 = tmp5 - tmp4;
       
  1949     dataptr[DCTSIZE*7] = (DCTELEM)
       
  1950       DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
       
  1951 		       FIX(0.653061224)),                 /* 32/49 */
       
  1952 	      CONST_BITS+1);
       
  1953     tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
       
  1954     tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
       
  1955     tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
       
  1956     tmp10 += tmp11 - tmp3;
       
  1957     tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
       
  1958 	    MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
       
  1959     dataptr[DCTSIZE*5] = (DCTELEM)
       
  1960       DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
       
  1961 	      + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
       
  1962 	      CONST_BITS+1);
       
  1963     tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
       
  1964 	    MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
       
  1965     dataptr[DCTSIZE*3] = (DCTELEM)
       
  1966       DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
       
  1967 	      - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
       
  1968 	      CONST_BITS+1);
       
  1969     dataptr[DCTSIZE*1] = (DCTELEM)
       
  1970       DESCALE(tmp11 + tmp12 + tmp3
       
  1971 	      - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
       
  1972 	      - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
       
  1973 	      CONST_BITS+1);
       
  1974 
       
  1975     dataptr++;			/* advance pointer to next column */
       
  1976     wsptr++;			/* advance pointer to next column */
       
  1977   }
       
  1978 }
       
  1979 
       
  1980 
       
  1981 /*
       
  1982  * Perform the forward DCT on a 15x15 sample block.
       
  1983  */
       
  1984 
       
  1985 GLOBAL(void)
       
  1986 jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  1987 {
       
  1988   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
       
  1989   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
       
  1990   INT32 z1, z2, z3;
       
  1991   DCTELEM workspace[8*7];
       
  1992   DCTELEM *dataptr;
       
  1993   DCTELEM *wsptr;
       
  1994   JSAMPROW elemptr;
       
  1995   int ctr;
       
  1996   SHIFT_TEMPS
       
  1997 
       
  1998   /* Pass 1: process rows. */
       
  1999   /* Note results are scaled up by sqrt(8) compared to a true DCT. */
       
  2000   /* cK represents sqrt(2) * cos(K*pi/30). */
       
  2001 
       
  2002   dataptr = data;
       
  2003   ctr = 0;
       
  2004   for (;;) {
       
  2005     elemptr = sample_data[ctr] + start_col;
       
  2006 
       
  2007     /* Even part */
       
  2008 
       
  2009     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
       
  2010     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
       
  2011     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
       
  2012     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
       
  2013     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
       
  2014     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
       
  2015     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
       
  2016     tmp7 = GETJSAMPLE(elemptr[7]);
       
  2017 
       
  2018     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
       
  2019     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
       
  2020     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
       
  2021     tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
       
  2022     tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
       
  2023     tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
       
  2024     tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
       
  2025 
       
  2026     z1 = tmp0 + tmp4 + tmp5;
       
  2027     z2 = tmp1 + tmp3 + tmp6;
       
  2028     z3 = tmp2 + tmp7;
       
  2029     /* Apply unsigned->signed conversion */
       
  2030     dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
       
  2031     z3 += z3;
       
  2032     dataptr[6] = (DCTELEM)
       
  2033       DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
       
  2034 	      MULTIPLY(z2 - z3, FIX(0.437016024)),  /* c12 */
       
  2035 	      CONST_BITS);
       
  2036     tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
       
  2037     z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) -  /* c2+c14 */
       
  2038          MULTIPLY(tmp6 - tmp2, FIX(2.238241955));   /* c4+c8 */
       
  2039     z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) -  /* c8-c14 */
       
  2040 	 MULTIPLY(tmp0 - tmp2, FIX(0.091361227));   /* c2-c4 */
       
  2041     z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) +  /* c2 */
       
  2042 	 MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) +  /* c8 */
       
  2043 	 MULTIPLY(tmp1 - tmp4, FIX(0.790569415));   /* (c6+c12)/2 */
       
  2044 
       
  2045     dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
       
  2046     dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
       
  2047 
       
  2048     /* Odd part */
       
  2049 
       
  2050     tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
       
  2051 		    FIX(1.224744871));                         /* c5 */
       
  2052     tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
       
  2053 	   MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876));  /* c9 */
       
  2054     tmp12 = MULTIPLY(tmp12, FIX(1.224744871));                 /* c5 */
       
  2055     tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) +         /* c1 */
       
  2056 	   MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) +         /* c3 */
       
  2057 	   MULTIPLY(tmp13 + tmp15, FIX(0.575212477));          /* c11 */
       
  2058     tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) -                 /* c7-c11 */
       
  2059 	   MULTIPLY(tmp14, FIX(0.513743148)) +                 /* c3-c9 */
       
  2060 	   MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12;   /* c1+c13 */
       
  2061     tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) -               /* -(c1-c7) */
       
  2062 	   MULTIPLY(tmp11, FIX(2.176250899)) -                 /* c3+c9 */
       
  2063 	   MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12;   /* c11+c13 */
       
  2064 
       
  2065     dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
       
  2066     dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
       
  2067     dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
       
  2068     dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
       
  2069 
       
  2070     ctr++;
       
  2071 
       
  2072     if (ctr != DCTSIZE) {
       
  2073       if (ctr == 15)
       
  2074 	break;			/* Done. */
       
  2075       dataptr += DCTSIZE;	/* advance pointer to next row */
       
  2076     } else
       
  2077       dataptr = workspace;	/* switch pointer to extended workspace */
       
  2078   }
       
  2079 
       
  2080   /* Pass 2: process columns.
       
  2081    * We leave the results scaled up by an overall factor of 8.
       
  2082    * We must also scale the output by (8/15)**2 = 64/225, which we partially
       
  2083    * fold into the constant multipliers and final shifting:
       
  2084    * cK now represents sqrt(2) * cos(K*pi/30) * 256/225.
       
  2085    */
       
  2086 
       
  2087   dataptr = data;
       
  2088   wsptr = workspace;
       
  2089   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  2090     /* Even part */
       
  2091 
       
  2092     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6];
       
  2093     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5];
       
  2094     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4];
       
  2095     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3];
       
  2096     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2];
       
  2097     tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1];
       
  2098     tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0];
       
  2099     tmp7 = dataptr[DCTSIZE*7];
       
  2100 
       
  2101     tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6];
       
  2102     tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5];
       
  2103     tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4];
       
  2104     tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3];
       
  2105     tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2];
       
  2106     tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1];
       
  2107     tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0];
       
  2108 
       
  2109     z1 = tmp0 + tmp4 + tmp5;
       
  2110     z2 = tmp1 + tmp3 + tmp6;
       
  2111     z3 = tmp2 + tmp7;
       
  2112     dataptr[DCTSIZE*0] = (DCTELEM)
       
  2113       DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
       
  2114 	      CONST_BITS+2);
       
  2115     z3 += z3;
       
  2116     dataptr[DCTSIZE*6] = (DCTELEM)
       
  2117       DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
       
  2118 	      MULTIPLY(z2 - z3, FIX(0.497227121)),  /* c12 */
       
  2119 	      CONST_BITS+2);
       
  2120     tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
       
  2121     z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) -  /* c2+c14 */
       
  2122          MULTIPLY(tmp6 - tmp2, FIX(2.546621957));   /* c4+c8 */
       
  2123     z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) -  /* c8-c14 */
       
  2124 	 MULTIPLY(tmp0 - tmp2, FIX(0.103948774));   /* c2-c4 */
       
  2125     z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) +  /* c2 */
       
  2126 	 MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) +  /* c8 */
       
  2127 	 MULTIPLY(tmp1 - tmp4, FIX(0.899492312));   /* (c6+c12)/2 */
       
  2128 
       
  2129     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
       
  2130     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);
       
  2131 
       
  2132     /* Odd part */
       
  2133 
       
  2134     tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
       
  2135 		    FIX(1.393487498));                         /* c5 */
       
  2136     tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
       
  2137 	   MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187));  /* c9 */
       
  2138     tmp12 = MULTIPLY(tmp12, FIX(1.393487498));                 /* c5 */
       
  2139     tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) +         /* c1 */
       
  2140 	   MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) +         /* c3 */
       
  2141 	   MULTIPLY(tmp13 + tmp15, FIX(0.654463974));          /* c11 */
       
  2142     tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) -                 /* c7-c11 */
       
  2143 	   MULTIPLY(tmp14, FIX(0.584525538)) +                 /* c3-c9 */
       
  2144 	   MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12;   /* c1+c13 */
       
  2145     tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) -               /* -(c1-c7) */
       
  2146 	   MULTIPLY(tmp11, FIX(2.476089912)) -                 /* c3+c9 */
       
  2147 	   MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12;   /* c11+c13 */
       
  2148 
       
  2149     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
       
  2150     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
       
  2151     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
       
  2152     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
       
  2153 
       
  2154     dataptr++;			/* advance pointer to next column */
       
  2155     wsptr++;			/* advance pointer to next column */
       
  2156   }
       
  2157 }
       
  2158 
       
  2159 
       
  2160 /*
       
  2161  * Perform the forward DCT on a 16x16 sample block.
       
  2162  */
       
  2163 
       
  2164 GLOBAL(void)
       
  2165 jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  2166 {
       
  2167   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
       
  2168   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
       
  2169   DCTELEM workspace[DCTSIZE2];
       
  2170   DCTELEM *dataptr;
       
  2171   DCTELEM *wsptr;
       
  2172   JSAMPROW elemptr;
       
  2173   int ctr;
       
  2174   SHIFT_TEMPS
       
  2175 
       
  2176   /* Pass 1: process rows. */
       
  2177   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  2178   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  2179   /* cK represents sqrt(2) * cos(K*pi/32). */
       
  2180 
       
  2181   dataptr = data;
       
  2182   ctr = 0;
       
  2183   for (;;) {
       
  2184     elemptr = sample_data[ctr] + start_col;
       
  2185 
       
  2186     /* Even part */
       
  2187 
       
  2188     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
       
  2189     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
       
  2190     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
       
  2191     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
       
  2192     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
       
  2193     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
       
  2194     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
       
  2195     tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
       
  2196 
       
  2197     tmp10 = tmp0 + tmp7;
       
  2198     tmp14 = tmp0 - tmp7;
       
  2199     tmp11 = tmp1 + tmp6;
       
  2200     tmp15 = tmp1 - tmp6;
       
  2201     tmp12 = tmp2 + tmp5;
       
  2202     tmp16 = tmp2 - tmp5;
       
  2203     tmp13 = tmp3 + tmp4;
       
  2204     tmp17 = tmp3 - tmp4;
       
  2205 
       
  2206     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
       
  2207     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
       
  2208     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
       
  2209     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
       
  2210     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
       
  2211     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
       
  2212     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
       
  2213     tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
       
  2214 
       
  2215     /* Apply unsigned->signed conversion */
       
  2216     dataptr[0] = (DCTELEM)
       
  2217       ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
       
  2218     dataptr[4] = (DCTELEM)
       
  2219       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
       
  2220 	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
       
  2221 	      CONST_BITS-PASS1_BITS);
       
  2222 
       
  2223     tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
       
  2224 	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
       
  2225 
       
  2226     dataptr[2] = (DCTELEM)
       
  2227       DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
       
  2228 	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
       
  2229 	      CONST_BITS-PASS1_BITS);
       
  2230     dataptr[6] = (DCTELEM)
       
  2231       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
       
  2232 	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
       
  2233 	      CONST_BITS-PASS1_BITS);
       
  2234 
       
  2235     /* Odd part */
       
  2236 
       
  2237     tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
       
  2238 	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
       
  2239     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
       
  2240 	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
       
  2241     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
       
  2242 	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
       
  2243     tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
       
  2244 	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
       
  2245     tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
       
  2246 	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
       
  2247     tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
       
  2248 	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
       
  2249     tmp10 = tmp11 + tmp12 + tmp13 -
       
  2250 	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
       
  2251 	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
       
  2252     tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
       
  2253 	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
       
  2254     tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
       
  2255 	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
       
  2256     tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
       
  2257 	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
       
  2258 
       
  2259     dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
       
  2260     dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
       
  2261     dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
       
  2262     dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
       
  2263 
       
  2264     ctr++;
       
  2265 
       
  2266     if (ctr != DCTSIZE) {
       
  2267       if (ctr == DCTSIZE * 2)
       
  2268 	break;			/* Done. */
       
  2269       dataptr += DCTSIZE;	/* advance pointer to next row */
       
  2270     } else
       
  2271       dataptr = workspace;	/* switch pointer to extended workspace */
       
  2272   }
       
  2273 
       
  2274   /* Pass 2: process columns.
       
  2275    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  2276    * by an overall factor of 8.
       
  2277    * We must also scale the output by (8/16)**2 = 1/2**2.
       
  2278    */
       
  2279 
       
  2280   dataptr = data;
       
  2281   wsptr = workspace;
       
  2282   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  2283     /* Even part */
       
  2284 
       
  2285     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
       
  2286     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
       
  2287     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
       
  2288     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
       
  2289     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
       
  2290     tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
       
  2291     tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
       
  2292     tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
       
  2293 
       
  2294     tmp10 = tmp0 + tmp7;
       
  2295     tmp14 = tmp0 - tmp7;
       
  2296     tmp11 = tmp1 + tmp6;
       
  2297     tmp15 = tmp1 - tmp6;
       
  2298     tmp12 = tmp2 + tmp5;
       
  2299     tmp16 = tmp2 - tmp5;
       
  2300     tmp13 = tmp3 + tmp4;
       
  2301     tmp17 = tmp3 - tmp4;
       
  2302 
       
  2303     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
       
  2304     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
       
  2305     tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
       
  2306     tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
       
  2307     tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
       
  2308     tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
       
  2309     tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
       
  2310     tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
       
  2311 
       
  2312     dataptr[DCTSIZE*0] = (DCTELEM)
       
  2313       DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
       
  2314     dataptr[DCTSIZE*4] = (DCTELEM)
       
  2315       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
       
  2316 	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
       
  2317 	      CONST_BITS+PASS1_BITS+2);
       
  2318 
       
  2319     tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
       
  2320 	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
       
  2321 
       
  2322     dataptr[DCTSIZE*2] = (DCTELEM)
       
  2323       DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
       
  2324 	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+10 */
       
  2325 	      CONST_BITS+PASS1_BITS+2);
       
  2326     dataptr[DCTSIZE*6] = (DCTELEM)
       
  2327       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
       
  2328 	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
       
  2329 	      CONST_BITS+PASS1_BITS+2);
       
  2330 
       
  2331     /* Odd part */
       
  2332 
       
  2333     tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
       
  2334 	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
       
  2335     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
       
  2336 	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
       
  2337     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
       
  2338 	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
       
  2339     tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
       
  2340 	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
       
  2341     tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
       
  2342 	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
       
  2343     tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
       
  2344 	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
       
  2345     tmp10 = tmp11 + tmp12 + tmp13 -
       
  2346 	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
       
  2347 	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
       
  2348     tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
       
  2349 	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
       
  2350     tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
       
  2351 	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
       
  2352     tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
       
  2353 	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
       
  2354 
       
  2355     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
       
  2356     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
       
  2357     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
       
  2358     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);
       
  2359 
       
  2360     dataptr++;			/* advance pointer to next column */
       
  2361     wsptr++;			/* advance pointer to next column */
       
  2362   }
       
  2363 }
       
  2364 
       
  2365 
       
  2366 /*
       
  2367  * Perform the forward DCT on a 16x8 sample block.
       
  2368  *
       
  2369  * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
       
  2370  */
       
  2371 
       
  2372 GLOBAL(void)
       
  2373 jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  2374 {
       
  2375   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
       
  2376   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
       
  2377   INT32 z1;
       
  2378   DCTELEM *dataptr;
       
  2379   JSAMPROW elemptr;
       
  2380   int ctr;
       
  2381   SHIFT_TEMPS
       
  2382 
       
  2383   /* Pass 1: process rows. */
       
  2384   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  2385   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  2386   /* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). */
       
  2387 
       
  2388   dataptr = data;
       
  2389   ctr = 0;
       
  2390   for (ctr = 0; ctr < DCTSIZE; ctr++) {
       
  2391     elemptr = sample_data[ctr] + start_col;
       
  2392 
       
  2393     /* Even part */
       
  2394 
       
  2395     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
       
  2396     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
       
  2397     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
       
  2398     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
       
  2399     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
       
  2400     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
       
  2401     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
       
  2402     tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
       
  2403 
       
  2404     tmp10 = tmp0 + tmp7;
       
  2405     tmp14 = tmp0 - tmp7;
       
  2406     tmp11 = tmp1 + tmp6;
       
  2407     tmp15 = tmp1 - tmp6;
       
  2408     tmp12 = tmp2 + tmp5;
       
  2409     tmp16 = tmp2 - tmp5;
       
  2410     tmp13 = tmp3 + tmp4;
       
  2411     tmp17 = tmp3 - tmp4;
       
  2412 
       
  2413     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
       
  2414     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
       
  2415     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
       
  2416     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
       
  2417     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
       
  2418     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
       
  2419     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
       
  2420     tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
       
  2421 
       
  2422     /* Apply unsigned->signed conversion */
       
  2423     dataptr[0] = (DCTELEM)
       
  2424       ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
       
  2425     dataptr[4] = (DCTELEM)
       
  2426       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
       
  2427 	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
       
  2428 	      CONST_BITS-PASS1_BITS);
       
  2429 
       
  2430     tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
       
  2431 	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
       
  2432 
       
  2433     dataptr[2] = (DCTELEM)
       
  2434       DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
       
  2435 	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
       
  2436 	      CONST_BITS-PASS1_BITS);
       
  2437     dataptr[6] = (DCTELEM)
       
  2438       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
       
  2439 	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
       
  2440 	      CONST_BITS-PASS1_BITS);
       
  2441 
       
  2442     /* Odd part */
       
  2443 
       
  2444     tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
       
  2445 	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
       
  2446     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
       
  2447 	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
       
  2448     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
       
  2449 	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
       
  2450     tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
       
  2451 	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
       
  2452     tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
       
  2453 	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
       
  2454     tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
       
  2455 	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
       
  2456     tmp10 = tmp11 + tmp12 + tmp13 -
       
  2457 	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
       
  2458 	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
       
  2459     tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
       
  2460 	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
       
  2461     tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
       
  2462 	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
       
  2463     tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
       
  2464 	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
       
  2465 
       
  2466     dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
       
  2467     dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
       
  2468     dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
       
  2469     dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
       
  2470 
       
  2471     dataptr += DCTSIZE;		/* advance pointer to next row */
       
  2472   }
       
  2473 
       
  2474   /* Pass 2: process columns.
       
  2475    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  2476    * by an overall factor of 8.
       
  2477    * We must also scale the output by 8/16 = 1/2.
       
  2478    */
       
  2479 
       
  2480   dataptr = data;
       
  2481   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  2482     /* Even part per LL&M figure 1 --- note that published figure is faulty;
       
  2483      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
       
  2484      */
       
  2485 
       
  2486     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
       
  2487     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
       
  2488     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
       
  2489     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
       
  2490 
       
  2491     tmp10 = tmp0 + tmp3;
       
  2492     tmp12 = tmp0 - tmp3;
       
  2493     tmp11 = tmp1 + tmp2;
       
  2494     tmp13 = tmp1 - tmp2;
       
  2495 
       
  2496     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
       
  2497     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
       
  2498     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
       
  2499     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
       
  2500 
       
  2501     dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
       
  2502     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
       
  2503 
   241     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
  2504     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
   242     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
  2505     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
   243 					   CONST_BITS+PASS1_BITS);
  2506 					   CONST_BITS+PASS1_BITS+1);
   244     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
  2507     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
   245 					   CONST_BITS+PASS1_BITS);
  2508 					   CONST_BITS+PASS1_BITS+1);
   246     
  2509 
   247     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  2510     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
   248      * cK represents cos(K*pi/16).
  2511      * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
   249      * i0..i3 in the paper are tmp4..tmp7 here.
  2512      * i0..i3 in the paper are tmp0..tmp3 here.
   250      */
  2513      */
   251     
  2514 
   252     z1 = tmp4 + tmp7;
  2515     tmp10 = tmp0 + tmp3;
   253     z2 = tmp5 + tmp6;
  2516     tmp11 = tmp1 + tmp2;
   254     z3 = tmp4 + tmp6;
  2517     tmp12 = tmp0 + tmp2;
   255     z4 = tmp5 + tmp7;
  2518     tmp13 = tmp1 + tmp3;
   256     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
  2519     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
   257     
  2520 
   258     tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
  2521     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
   259     tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
  2522     tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
   260     tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
  2523     tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
   261     tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
  2524     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
   262     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
  2525     tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
   263     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
  2526     tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
   264     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
  2527     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
   265     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
  2528     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
   266     
  2529 
   267     z3 += z5;
  2530     tmp12 += z1;
   268     z4 += z5;
  2531     tmp13 += z1;
   269     
  2532 
   270     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
  2533     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
   271 					   CONST_BITS+PASS1_BITS);
  2534 					   CONST_BITS+PASS1_BITS+1);
   272     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
  2535     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
   273 					   CONST_BITS+PASS1_BITS);
  2536 					   CONST_BITS+PASS1_BITS+1);
   274     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
  2537     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
   275 					   CONST_BITS+PASS1_BITS);
  2538 					   CONST_BITS+PASS1_BITS+1);
   276     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
  2539     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
   277 					   CONST_BITS+PASS1_BITS);
  2540 					   CONST_BITS+PASS1_BITS+1);
   278     
  2541 
   279     dataptr++;			/* advance pointer to next column */
  2542     dataptr++;			/* advance pointer to next column */
   280   }
  2543   }
   281 }
  2544 }
   282 
  2545 
       
  2546 
       
  2547 /*
       
  2548  * Perform the forward DCT on a 14x7 sample block.
       
  2549  *
       
  2550  * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
        *
        * data         coefficient block, addressed as data[row*DCTSIZE + col].
        *              Pass 1 fills rows 0..6 (eight values per row), pass 2
        *              transforms every column in place over rows 0..6; row 7
        *              is zeroed up front and never written again.
        * sample_data  input sample rows; rows 0..6 are read.
        * start_col    offset of the block inside each sample row; samples
        *              start_col .. start_col+13 of every row are read.
       
  2551  */
       
  2552 
       
  2553 GLOBAL(void)
       
  2554 jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  2555 {
       
  2556   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
       
  2557   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
       
  2558   INT32 z1, z2, z3;
       
  2559   DCTELEM *dataptr;
       
  2560   JSAMPROW elemptr;
       
  2561   int ctr;
       
  2562   SHIFT_TEMPS
       
  2563 
       
  2564   /* Zero bottom row of output coefficient block. */
       
  2565   MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);
       
  2566 
       
  2567   /* Pass 1: process rows. */
       
  2568   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  2569   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  2570   /* 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28). */
       
  2571 
       
  2572   dataptr = data;
         /* One iteration per input row: 14 samples in, 8 coefficients out. */
       
  2573   for (ctr = 0; ctr < 7; ctr++) {
       
  2574     elemptr = sample_data[ctr] + start_col;
       
  2575 
       
  2576     /* Even part */
       
  2577 
           /* Sums of mirrored sample pairs (k and 13-k).  The middle pair goes
            * straight into tmp13 because it has no partner in the second
            * butterfly stage below.
            */
       
  2578     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
       
  2579     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
       
  2580     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
       
  2581     tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
       
  2582     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
       
  2583     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
       
  2584     tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
       
  2585 
           /* Second butterfly stage on the pair sums:
            * tmp10..tmp12 are sums, tmp14..tmp16 are differences.
            */
       
  2586     tmp10 = tmp0 + tmp6;
       
  2587     tmp14 = tmp0 - tmp6;
       
  2588     tmp11 = tmp1 + tmp5;
       
  2589     tmp15 = tmp1 - tmp5;
       
  2590     tmp12 = tmp2 + tmp4;
       
  2591     tmp16 = tmp2 - tmp4;
       
  2592 
           /* tmp0..tmp6 are reused from here on for the differences of the
            * mirrored sample pairs; they feed the odd part further down.
            */
       
  2593     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
       
  2594     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
       
  2595     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
       
  2596     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
       
  2597     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
       
  2598     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
       
  2599     tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
       
  2600 
       
  2601     /* Apply unsigned->signed conversion */
       
  2602     dataptr[0] = (DCTELEM)
       
  2603       ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
           /* Doubled once; each of the three products below subtracts it. */
       
  2604     tmp13 += tmp13;
       
  2605     dataptr[4] = (DCTELEM)
       
  2606       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
       
  2607 	      MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
       
  2608 	      MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
       
  2609 	      CONST_BITS-PASS1_BITS);
       
  2610 
           /* tmp10 is recycled as the product shared by outputs 2 and 6. */
       
  2611     tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
       
  2612 
       
  2613     dataptr[2] = (DCTELEM)
       
  2614       DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
       
  2615 	      + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
       
  2616 	      CONST_BITS-PASS1_BITS);
       
  2617     dataptr[6] = (DCTELEM)
       
  2618       DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
       
  2619 	      - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
       
  2620 	      CONST_BITS-PASS1_BITS);
       
  2621 
       
  2622     /* Odd part */
       
  2623 
       
  2624     tmp10 = tmp1 + tmp2;
       
  2625     tmp11 = tmp5 - tmp4;
           /* Output 7 needs no multiply, so it is only shifted up by PASS1_BITS. */
       
  2626     dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
           /* tmp3 is combined with MULTIPLY() products below without being
            * multiplied itself, so it is shifted up by CONST_BITS first
            * (presumably to match the fixed-point scale of those products).
            */
       
  2627     tmp3 <<= CONST_BITS;
       
  2628     tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
       
  2629     tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
           /* tmp10 now holds the part shared by outputs 5 and 3. */
       
  2630     tmp10 += tmp11 - tmp3;
       
  2631     tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
       
  2632 	    MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
       
  2633     dataptr[5] = (DCTELEM)
       
  2634       DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
       
  2635 	      + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
       
  2636 	      CONST_BITS-PASS1_BITS);
       
  2637     tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
       
  2638 	    MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
       
  2639     dataptr[3] = (DCTELEM)
       
  2640       DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
       
  2641 	      - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
       
  2642 	      CONST_BITS-PASS1_BITS);
           /* NOTE(review): tmp3 was shifted by CONST_BITS above but tmp6 is
            * added here unshifted -- looks intentional upstream, confirm before
            * touching.
            */
       
  2643     dataptr[1] = (DCTELEM)
       
  2644       DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
       
  2645 	      MULTIPLY(tmp0 + tmp6, FIX(1.126980169)),    /* c3+c5-c1 */
       
  2646 	      CONST_BITS-PASS1_BITS);
       
  2647 
       
  2648     dataptr += DCTSIZE;		/* advance pointer to next row */
       
  2649   }
       
  2650 
       
  2651   /* Pass 2: process columns.
       
  2652    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  2653    * by an overall factor of 8.
       
  2654    * We must also scale the output by (8/14)*(8/7) = 32/49, which we
       
  2655    * partially fold into the constant multipliers and final shifting:
       
  2656    * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49.
       
  2657    */
       
  2658 
       
  2659   dataptr = data;
         /* One iteration per column.  Only rows 0..6 are read and written, so
          * row 7 keeps the zeros stored by MEMZERO at the top of the function.
          */
       
  2660   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  2661     /* Even part */
       
  2662 
           /* Sums of mirrored rows (k and 6-k); the middle row stands alone. */
       
  2663     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
       
  2664     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
       
  2665     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
       
  2666     tmp3 = dataptr[DCTSIZE*3];
       
  2667 
           /* Differences of the same mirrored rows, used by the odd part. */
       
  2668     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
       
  2669     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
       
  2670     tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
       
  2671 
       
  2672     z1 = tmp0 + tmp2;
       
  2673     dataptr[DCTSIZE*0] = (DCTELEM)
       
  2674       DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
       
  2675 	      CONST_BITS+PASS1_BITS+1);
           /* Net effect of the next three statements:
            * z1 = tmp0 + tmp2 - 4 * (middle row), and tmp3 stays doubled for
            * the row-4 term below.
            */
       
  2676     tmp3 += tmp3;
       
  2677     z1 -= tmp3;
       
  2678     z1 -= tmp3;
       
  2679     z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
       
  2680     z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
       
  2681     z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
       
  2682     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
       
  2683     z1 -= z2;
           /* z2 is recycled for the product shared by rows 4 and 6. */
       
  2684     z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
       
  2685     dataptr[DCTSIZE*4] = (DCTELEM)
       
  2686       DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
       
  2687 	      CONST_BITS+PASS1_BITS+1);
       
  2688     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);
       
  2689 
       
  2690     /* Odd part */
       
  2691 
           /* tmp0..tmp3 are reused as accumulators for rows 1, 3 and 5. */
       
  2692     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
       
  2693     tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
       
  2694     tmp0 = tmp1 - tmp2;
       
  2695     tmp1 += tmp2;
       
  2696     tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
       
  2697     tmp1 += tmp2;
       
  2698     tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
       
  2699     tmp0 += tmp3;
       
  2700     tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */
       
  2701 
       
  2702     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
       
  2703     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
       
  2704     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
       
  2705 
       
  2706     dataptr++;			/* advance pointer to next column */
       
  2707   }
       
  2708 }
       
  2709 
       
  2710 
       
  2711 /*
       
  2712  * Perform the forward DCT on a 12x6 sample block.
       
  2713  *
       
  2714  * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
       
  2715  */
       
  2716 
       
  2717 GLOBAL(void)
       
  2718 jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  2719 {
       
  2720   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
       
  2721   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
       
  2722   DCTELEM *dataptr;
       
  2723   JSAMPROW elemptr;
       
  2724   int ctr;
       
  2725   SHIFT_TEMPS
       
  2726 
       
  2727   /* Zero 2 bottom rows of output coefficient block. */
       
  2728   MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
       
  2729 
       
  2730   /* Pass 1: process rows. */
       
  2731   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  2732   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  2733   /* 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24). */
       
  2734 
       
  2735   dataptr = data;
       
  2736   for (ctr = 0; ctr < 6; ctr++) {
       
  2737     elemptr = sample_data[ctr] + start_col;
       
  2738 
       
  2739     /* Even part */
       
  2740 
       
  2741     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
       
  2742     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
       
  2743     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
       
  2744     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
       
  2745     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
       
  2746     tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
       
  2747 
       
  2748     tmp10 = tmp0 + tmp5;
       
  2749     tmp13 = tmp0 - tmp5;
       
  2750     tmp11 = tmp1 + tmp4;
       
  2751     tmp14 = tmp1 - tmp4;
       
  2752     tmp12 = tmp2 + tmp3;
       
  2753     tmp15 = tmp2 - tmp3;
       
  2754 
       
  2755     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
       
  2756     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
       
  2757     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
       
  2758     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
       
  2759     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
       
  2760     tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
       
  2761 
       
  2762     /* Apply unsigned->signed conversion */
       
  2763     dataptr[0] = (DCTELEM)
       
  2764       ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
       
  2765     dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
       
  2766     dataptr[4] = (DCTELEM)
       
  2767       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
       
  2768 	      CONST_BITS-PASS1_BITS);
       
  2769     dataptr[2] = (DCTELEM)
       
  2770       DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
       
  2771 	      CONST_BITS-PASS1_BITS);
       
  2772 
       
  2773     /* Odd part */
       
  2774 
       
  2775     tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
       
  2776     tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
       
  2777     tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
       
  2778     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
       
  2779     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
       
  2780     tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
       
  2781 	    + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
       
  2782     tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
       
  2783     tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
       
  2784 	    + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
       
  2785     tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
       
  2786 	    - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
       
  2787     tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
       
  2788 	    - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */
       
  2789 
       
  2790     dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
       
  2791     dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
       
  2792     dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
       
  2793     dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
       
  2794 
       
  2795     dataptr += DCTSIZE;		/* advance pointer to next row */
       
  2796   }
       
  2797 
       
  2798   /* Pass 2: process columns.
       
  2799    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  2800    * by an overall factor of 8.
       
  2801    * We must also scale the output by (8/12)*(8/6) = 8/9, which we
       
  2802    * partially fold into the constant multipliers and final shifting:
       
  2803    * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
       
  2804    */
       
  2805 
       
  2806   dataptr = data;
       
  2807   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  2808     /* Even part */
       
  2809 
       
  2810     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
       
  2811     tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
       
  2812     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
       
  2813 
       
  2814     tmp10 = tmp0 + tmp2;
       
  2815     tmp12 = tmp0 - tmp2;
       
  2816 
       
  2817     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
       
  2818     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
       
  2819     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
       
  2820 
       
  2821     dataptr[DCTSIZE*0] = (DCTELEM)
       
  2822       DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
       
  2823 	      CONST_BITS+PASS1_BITS+1);
       
  2824     dataptr[DCTSIZE*2] = (DCTELEM)
       
  2825       DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
       
  2826 	      CONST_BITS+PASS1_BITS+1);
       
  2827     dataptr[DCTSIZE*4] = (DCTELEM)
       
  2828       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
       
  2829 	      CONST_BITS+PASS1_BITS+1);
       
  2830 
       
  2831     /* Odd part */
       
  2832 
       
  2833     tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
       
  2834 
       
  2835     dataptr[DCTSIZE*1] = (DCTELEM)
       
  2836       DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
       
  2837 	      CONST_BITS+PASS1_BITS+1);
       
  2838     dataptr[DCTSIZE*3] = (DCTELEM)
       
  2839       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
       
  2840 	      CONST_BITS+PASS1_BITS+1);
       
  2841     dataptr[DCTSIZE*5] = (DCTELEM)
       
  2842       DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
       
  2843 	      CONST_BITS+PASS1_BITS+1);
       
  2844 
       
  2845     dataptr++;			/* advance pointer to next column */
       
  2846   }
       
  2847 }
       
  2848 
       
  2849 
       
  2850 /*
       
  2851  * Perform the forward DCT on a 10x5 sample block.
       
  2852  *
       
  2853  * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
        *
        * data         coefficient block, addressed as data[row*DCTSIZE + col].
        *              Pass 1 fills rows 0..4 (eight values per row), pass 2
        *              transforms every column in place over rows 0..4; rows
        *              5..7 are zeroed up front and never written again.
        * sample_data  input sample rows; rows 0..4 are read.
        * start_col    offset of the block inside each sample row; samples
        *              start_col .. start_col+9 of every row are read.
       
  2854  */
       
  2855 
       
  2856 GLOBAL(void)
       
  2857 jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  2858 {
       
  2859   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
       
  2860   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
       
  2861   DCTELEM *dataptr;
       
  2862   JSAMPROW elemptr;
       
  2863   int ctr;
       
  2864   SHIFT_TEMPS
       
  2865 
       
  2866   /* Zero 3 bottom rows of output coefficient block. */
       
  2867   MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
       
  2868 
       
  2869   /* Pass 1: process rows. */
       
  2870   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  2871   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  2872   /* 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20). */
       
  2873 
       
  2874   dataptr = data;
         /* One iteration per input row: 10 samples in, 8 coefficients out. */
       
  2875   for (ctr = 0; ctr < 5; ctr++) {
       
  2876     elemptr = sample_data[ctr] + start_col;
       
  2877 
       
  2878     /* Even part */
       
  2879 
           /* Sums of mirrored sample pairs (k and 9-k).  The middle pair goes
            * straight into tmp12 because it has no partner in the second
            * butterfly stage below.
            */
       
  2880     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
       
  2881     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
       
  2882     tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
       
  2883     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
       
  2884     tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
       
  2885 
           /* Second butterfly stage: tmp10/tmp11 sums, tmp13/tmp14 differences. */
       
  2886     tmp10 = tmp0 + tmp4;
       
  2887     tmp13 = tmp0 - tmp4;
       
  2888     tmp11 = tmp1 + tmp3;
       
  2889     tmp14 = tmp1 - tmp3;
       
  2890 
           /* tmp0..tmp4 are reused from here on for the differences of the
            * mirrored sample pairs; they feed the odd part further down.
            */
       
  2891     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
       
  2892     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
       
  2893     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
       
  2894     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
       
  2895     tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
       
  2896 
       
  2897     /* Apply unsigned->signed conversion */
       
  2898     dataptr[0] = (DCTELEM)
       
  2899       ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
           /* Doubled once; both products below subtract it. */
       
  2900     tmp12 += tmp12;
       
  2901     dataptr[4] = (DCTELEM)
       
  2902       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
       
  2903 	      MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
       
  2904 	      CONST_BITS-PASS1_BITS);
           /* tmp10 is recycled as the product shared by outputs 2 and 6. */
       
  2905     tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
       
  2906     dataptr[2] = (DCTELEM)
       
  2907       DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
       
  2908 	      CONST_BITS-PASS1_BITS);
       
  2909     dataptr[6] = (DCTELEM)
       
  2910       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
       
  2911 	      CONST_BITS-PASS1_BITS);
       
  2912 
       
  2913     /* Odd part */
       
  2914 
       
  2915     tmp10 = tmp0 + tmp4;
       
  2916     tmp11 = tmp1 - tmp3;
           /* Output 5 needs no multiply, so it is only shifted up by PASS1_BITS. */
       
  2917     dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
           /* tmp2 is combined with MULTIPLY() products below without being
            * multiplied itself, so it is shifted up by CONST_BITS first
            * (presumably to match the fixed-point scale of those products).
            */
       
  2918     tmp2 <<= CONST_BITS;
       
  2919     dataptr[1] = (DCTELEM)
       
  2920       DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
       
  2921 	      MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
       
  2922 	      MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
       
  2923 	      MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
       
  2924 	      CONST_BITS-PASS1_BITS);
           /* tmp12 and tmp13 are recycled: outputs 3 and 7 are their sum and
            * difference.
            */
       
  2925     tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
       
  2926 	    MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
           /* The (CONST_BITS - 1) shift adds tmp11 * 2**(CONST_BITS-1), i.e. one
            * bit less than the full CONST_BITS shift applied to tmp2 above.
            */
       
  2927     tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
       
  2928 	    (tmp11 << (CONST_BITS - 1)) - tmp2;
       
  2929     dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
       
  2930     dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
       
  2931 
       
  2932     dataptr += DCTSIZE;		/* advance pointer to next row */
       
  2933   }
       
  2934 
       
  2935   /* Pass 2: process columns.
       
  2936    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  2937    * by an overall factor of 8.
       
  2938    * We must also scale the output by (8/10)*(8/5) = 32/25, which we
       
  2939    * fold into the constant multipliers:
       
  2940    * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25.
       
  2941    */
       
  2942 
       
  2943   dataptr = data;
         /* One iteration per column.  Only rows 0..4 are read and written, so
          * rows 5..7 keep the zeros stored by MEMZERO at the top of the function.
          */
       
  2944   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  2945     /* Even part */
       
  2946 
           /* Sums of mirrored rows (k and 4-k); the middle row stands alone. */
       
  2947     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
       
  2948     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
       
  2949     tmp2 = dataptr[DCTSIZE*2];
       
  2950 
       
  2951     tmp10 = tmp0 + tmp1;
       
  2952     tmp11 = tmp0 - tmp1;
       
  2953 
           /* tmp0/tmp1 are reused for the differences of the mirrored rows. */
       
  2954     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
       
  2955     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
       
  2956 
       
  2957     dataptr[DCTSIZE*0] = (DCTELEM)
       
  2958       DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
       
  2959 	      CONST_BITS+PASS1_BITS);
       
  2960     tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
           /* tmp10 becomes tmp0 + tmp1 - 4 * (middle row) before its multiply. */
       
  2961     tmp10 -= tmp2 << 2;
       
  2962     tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
       
  2963     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
       
  2964     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
       
  2965 
       
  2966     /* Odd part */
       
  2967 
           /* Product shared by rows 1 and 3. */
       
  2968     tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */
       
  2969 
       
  2970     dataptr[DCTSIZE*1] = (DCTELEM)
       
  2971       DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
       
  2972 	      CONST_BITS+PASS1_BITS);
       
  2973     dataptr[DCTSIZE*3] = (DCTELEM)
       
  2974       DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
       
  2975 	      CONST_BITS+PASS1_BITS);
       
  2976 
       
  2977     dataptr++;			/* advance pointer to next column */
       
  2978   }
       
  2979 }
       
  2980 
       
  2981 
       
  2982 /*
       
  2983  * Perform the forward DCT on an 8x4 sample block.
       
  2984  *
       
  2985  * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
       
  2986  */
       
  2987 
       
  2988 GLOBAL(void)
       
  2989 jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  2990 {
       
  2991   INT32 tmp0, tmp1, tmp2, tmp3;
       
  2992   INT32 tmp10, tmp11, tmp12, tmp13;
       
  2993   INT32 z1;
       
  2994   DCTELEM *dataptr;
       
  2995   JSAMPROW elemptr;
       
  2996   int ctr;
       
  2997   SHIFT_TEMPS
       
  2998 
       
  2999   /* Zero 4 bottom rows of output coefficient block. */
       
  3000   MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
       
  3001 
       
  3002   /* Pass 1: process rows. */
       
  3003   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  3004   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  3005   /* We must also scale the output by 8/4 = 2, which we add here. */
       
  3006 
       
  3007   dataptr = data;
       
  3008   for (ctr = 0; ctr < 4; ctr++) {
       
  3009     elemptr = sample_data[ctr] + start_col;
       
  3010 
       
  3011     /* Even part per LL&M figure 1 --- note that published figure is faulty;
       
  3012      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
       
  3013      */
       
  3014 
       
  3015     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
       
  3016     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
       
  3017     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
       
  3018     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
       
  3019 
       
  3020     tmp10 = tmp0 + tmp3;
       
  3021     tmp12 = tmp0 - tmp3;
       
  3022     tmp11 = tmp1 + tmp2;
       
  3023     tmp13 = tmp1 - tmp2;
       
  3024 
       
  3025     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
       
  3026     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
       
  3027     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
       
  3028     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
       
  3029 
       
  3030     /* Apply unsigned->signed conversion */
       
  3031     dataptr[0] = (DCTELEM)
       
  3032       ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
       
  3033     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
       
  3034 
       
  3035     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
       
  3036     /* Add fudge factor here for final descale. */
       
  3037     z1 += ONE << (CONST_BITS-PASS1_BITS-2);
       
  3038     dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
       
  3039 				       CONST_BITS-PASS1_BITS-1);
       
  3040     dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
       
  3041 				       CONST_BITS-PASS1_BITS-1);
       
  3042 
       
  3043     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
       
  3044      * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
       
  3045      * i0..i3 in the paper are tmp0..tmp3 here.
       
  3046      */
       
  3047 
       
  3048     tmp10 = tmp0 + tmp3;
       
  3049     tmp11 = tmp1 + tmp2;
       
  3050     tmp12 = tmp0 + tmp2;
       
  3051     tmp13 = tmp1 + tmp3;
       
  3052     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
       
  3053     /* Add fudge factor here for final descale. */
       
  3054     z1 += ONE << (CONST_BITS-PASS1_BITS-2);
       
  3055 
       
  3056     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
       
  3057     tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
       
  3058     tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
       
  3059     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
       
  3060     tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
       
  3061     tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
       
  3062     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
       
  3063     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
       
  3064 
       
  3065     tmp12 += z1;
       
  3066     tmp13 += z1;
       
  3067 
       
  3068     dataptr[1] = (DCTELEM)
       
  3069       RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1);
       
  3070     dataptr[3] = (DCTELEM)
       
  3071       RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1);
       
  3072     dataptr[5] = (DCTELEM)
       
  3073       RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1);
       
  3074     dataptr[7] = (DCTELEM)
       
  3075       RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1);
       
  3076 
       
  3077     dataptr += DCTSIZE;		/* advance pointer to next row */
       
  3078   }
       
  3079 
       
  3080   /* Pass 2: process columns.
       
  3081    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  3082    * by an overall factor of 8.
       
  3083    * 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
       
  3084    */
       
  3085 
       
  3086   dataptr = data;
       
  3087   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  3088     /* Even part */
       
  3089 
       
  3090     /* Add fudge factor here for final descale. */
       
  3091     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
       
  3092     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
       
  3093 
       
  3094     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
       
  3095     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
       
  3096 
       
  3097     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
       
  3098     dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
       
  3099 
       
  3100     /* Odd part */
       
  3101 
       
  3102     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);   /* c6 */
       
  3103     /* Add fudge factor here for final descale. */
       
  3104     tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
       
  3105 
       
  3106     dataptr[DCTSIZE*1] = (DCTELEM)
       
  3107       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
       
  3108 		  CONST_BITS+PASS1_BITS);
       
  3109     dataptr[DCTSIZE*3] = (DCTELEM)
       
  3110       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
       
  3111 		  CONST_BITS+PASS1_BITS);
       
  3112 
       
  3113     dataptr++;			/* advance pointer to next column */
       
  3114   }
       
  3115 }
       
  3116 
       
  3117 
       
  3118 /*
       
  3119  * Perform the forward DCT on a 6x3 sample block.
       
  3120  *
       
  3121  * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
       
        *
        * data:        receives the coefficients.  The whole DCTSIZE2-element
        *              block is pre-zeroed; afterwards only rows 0..2,
        *              columns 0..5 are stored.
        * sample_data: rows 0..2 are read.
        * start_col:   index of the first of the 6 samples read from each row.
  3122  */
       
  3123 
       
  3124 GLOBAL(void)
       
  3125 jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  3126 {
       
  3127   INT32 tmp0, tmp1, tmp2;
       
  3128   INT32 tmp10, tmp11, tmp12;
       
  3129   DCTELEM *dataptr;
       
  3130   JSAMPROW elemptr;
       
  3131   int ctr;
       
  3132   SHIFT_TEMPS
       
  3133 
       
  3134   /* Pre-zero output coefficient block. */
       
  3135   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
  3136 
       
  3137   /* Pass 1: process rows. */
       
  3138   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  3139   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  3140   /* We scale the results further by 2 as part of output adaption */
       
  3141   /* scaling for different DCT size. */
       
  3142   /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
       
  3143 
       
  3144   dataptr = data;
       
         /* One iteration per input row: 6 samples in, dataptr[0..5] out. */
  3145   for (ctr = 0; ctr < 3; ctr++) {
       
  3146     elemptr = sample_data[ctr] + start_col;
       
  3147 
       
  3148     /* Even part */
       
  3149 
       
           /* Sums of the mirrored sample pairs (0,5), (1,4) and (2,3). */
  3150     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
       
  3151     tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
       
  3152     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
       
  3153 
       
  3154     tmp10 = tmp0 + tmp2;
       
  3155     tmp12 = tmp0 - tmp2;
       
  3156 
       
           /* tmp0 and tmp2 are overwritten here: from now on tmp0..tmp2 hold
            * the differences of the same pairs, used by the odd part below.
            */
  3157     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
       
  3158     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
       
  3159     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
       
  3160 
       
  3161     /* Apply unsigned->signed conversion */
       
  3162     dataptr[0] = (DCTELEM)
       
  3163       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
       
  3164     dataptr[2] = (DCTELEM)
       
  3165       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
       
  3166 	      CONST_BITS-PASS1_BITS-1);
       
  3167     dataptr[4] = (DCTELEM)
       
  3168       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
       
  3169 	      CONST_BITS-PASS1_BITS-1);
       
  3170 
       
  3171     /* Odd part */
       
  3172 
       
           /* tmp10 is reused: it now holds the already descaled c5 term that
            * is shared by outputs 1 and 5.
            */
  3173     tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
       
  3174 		    CONST_BITS-PASS1_BITS-1);
       
  3175 
       
  3176     dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
       
  3177     dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
       
  3178     dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
       
  3179 
       
  3180     dataptr += DCTSIZE;		/* advance pointer to next row */
       
  3181   }
       
  3182 
       
  3183   /* Pass 2: process columns.
       
  3184    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  3185    * by an overall factor of 8.
       
  3186    * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
       
  3187    * fold into the constant multipliers (other part was done in pass 1):
       
  3188    * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9.
       
  3189    */
       
  3190 
       
  3191   dataptr = data;
       
         /* One iteration per column stored by pass 1.  Rows 0..2 of the column
          * are read into temporaries before any of them is overwritten.
          */
  3192   for (ctr = 0; ctr < 6; ctr++) {
       
  3193     /* Even part */
       
  3194 
       
  3195     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
       
  3196     tmp1 = dataptr[DCTSIZE*1];
       
  3197 
       
  3198     tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
       
  3199 
       
  3200     dataptr[DCTSIZE*0] = (DCTELEM)
       
  3201       DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
       
  3202 	      CONST_BITS+PASS1_BITS);
       
  3203     dataptr[DCTSIZE*2] = (DCTELEM)
       
  3204       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
       
  3205 	      CONST_BITS+PASS1_BITS);
       
  3206 
       
  3207     /* Odd part */
       
  3208 
       
  3209     dataptr[DCTSIZE*1] = (DCTELEM)
       
  3210       DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
       
  3211 	      CONST_BITS+PASS1_BITS);
       
  3212 
       
  3213     dataptr++;			/* advance pointer to next column */
       
  3214   }
       
  3215 }
       
  3216 
       
  3217 
       
  3218 /*
       
  3219  * Perform the forward DCT on a 4x2 sample block.
       
  3220  *
       
  3221  * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
       
        *
        * data:        receives the coefficients.  The whole DCTSIZE2-element
        *              block is pre-zeroed; afterwards only rows 0..1,
        *              columns 0..3 are stored.
        * sample_data: rows 0..1 are read.
        * start_col:   index of the first of the 4 samples read from each row.
  3222  */
       
  3223 
       
  3224 GLOBAL(void)
       
  3225 jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  3226 {
       
  3227   INT32 tmp0, tmp1;
       
  3228   INT32 tmp10, tmp11;
       
  3229   DCTELEM *dataptr;
       
  3230   JSAMPROW elemptr;
       
  3231   int ctr;
       
  3232   SHIFT_TEMPS
       
  3233 
       
  3234   /* Pre-zero output coefficient block. */
       
  3235   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
  3236 
       
  3237   /* Pass 1: process rows. */
       
  3238   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  3239   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  3240   /* We must also scale the output by (8/4)*(8/2) = 2**3, which we add here. */
       
  3241   /* 4-point FDCT kernel, */
       
  3242   /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
       
  3243 
       
  3244   dataptr = data;
       
         /* One iteration per input row: 4 samples in, dataptr[0..3] out. */
  3245   for (ctr = 0; ctr < 2; ctr++) {
       
  3246     elemptr = sample_data[ctr] + start_col;
       
  3247 
       
  3248     /* Even part */
       
  3249 
       
           /* tmp0/tmp1 hold the sums of the mirrored pairs (0,3) and (1,2);
            * tmp10/tmp11 hold the differences of the same pairs.
            */
  3250     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
       
  3251     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
       
  3252 
       
  3253     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
       
  3254     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
       
  3255 
       
  3256     /* Apply unsigned->signed conversion */
       
  3257     dataptr[0] = (DCTELEM)
       
  3258       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
       
  3259     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
       
  3260 
       
  3261     /* Odd part */
       
  3262 
       
           /* tmp0 is reused: it now carries the c6 product shared by both
            * odd outputs.
            */
  3263     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
       
  3264     /* Add fudge factor here for final descale. */
       
           /* NOTE(review): ONE is defined outside this chunk; if it is the
            * integer 1, the term added here is exactly half of
            * 2**(CONST_BITS-PASS1_BITS-3), the divisor implied by the two
            * RIGHT_SHIFTs below.
            */
  3265     tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);
       
  3266 
       
  3267     dataptr[1] = (DCTELEM)
       
  3268       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
       
  3269 		  CONST_BITS-PASS1_BITS-3);
       
  3270     dataptr[3] = (DCTELEM)
       
  3271       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
       
  3272 		  CONST_BITS-PASS1_BITS-3);
       
  3273 
       
  3274     dataptr += DCTSIZE;		/* advance pointer to next row */
       
  3275   }
       
  3276 
       
  3277   /* Pass 2: process columns.
       
  3278    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  3279    * by an overall factor of 8.
       
  3280    */
       
  3281 
       
  3282   dataptr = data;
       
         /* One iteration per column stored by pass 1; rows 0 and 1 of the
          * column are read before either is overwritten.
          */
  3283   for (ctr = 0; ctr < 4; ctr++) {
       
  3284     /* Even part */
       
  3285 
       
  3286     /* Add fudge factor here for final descale. */
       
           /* The term is folded into tmp0 once and therefore reaches both the
            * sum and the difference computed below.
            */
  3287     tmp0 = dataptr[DCTSIZE*0] + (ONE << (PASS1_BITS-1));
       
  3288     tmp1 = dataptr[DCTSIZE*1];
       
  3289 
       
  3290     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
       
  3291 
       
  3292     /* Odd part */
       
  3293 
       
  3294     dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
       
  3295 
       
  3296     dataptr++;			/* advance pointer to next column */
       
  3297   }
       
  3298 }
       
  3299 
       
  3300 
       
  3301 /*
       
  3302  * Perform the forward DCT on a 2x1 sample block.
       
  3303  *
       
  3304  * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
       
        *
        * data:        receives the coefficients.  The whole DCTSIZE2-element
        *              block is pre-zeroed; afterwards only data[0] and data[1]
        *              are stored.
        * sample_data: only row 0 is read.
        * start_col:   index of the first of the 2 samples read from that row.
        *
        * No loops or intermediate passes are needed: both outputs are formed
        * directly from the two samples.
  3305  */
       
  3306 
       
  3307 GLOBAL(void)
       
  3308 jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  3309 {
       
  3310   INT32 tmp0, tmp1;
       
  3311   JSAMPROW elemptr;
       
  3312 
       
  3313   /* Pre-zero output coefficient block. */
       
  3314   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
  3315 
       
  3316   elemptr = sample_data[0] + start_col;
       
  3317 
       
  3318   tmp0 = GETJSAMPLE(elemptr[0]);
       
  3319   tmp1 = GETJSAMPLE(elemptr[1]);
       
  3320 
       
  3321   /* We leave the results scaled up by an overall factor of 8.
       
  3322    * We must also scale the output by (8/2)*(8/1) = 2**5.
       
  3323    */
       
  3324 
       
  3325   /* Even part */
       
  3326   /* Apply unsigned->signed conversion */
       
         /* Sum of the two samples, with CENTERJSAMPLE removed once per sample. */
  3327   data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
       
  3328 
       
  3329   /* Odd part */
       
         /* The CENTERJSAMPLE offsets cancel in the difference, so none is
          * subtracted here.
          */
  3330   data[1] = (DCTELEM) ((tmp0 - tmp1) << 5);
       
  3331 }
       
  3332 
       
  3333 
       
  3334 /*
       
  3335  * Perform the forward DCT on an 8x16 sample block.
       
  3336  *
       
  3337  * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
       
        *
        * data:        receives the coefficients.  Unlike the smaller-size
        *              routines next to it, this one does not pre-zero the
        *              block: pass 2 stores rows 0..7 of each of the DCTSIZE
        *              columns.
        * sample_data: rows 0..DCTSIZE*2-1 are read.
        * start_col:   index of the first of the 8 samples read from each row.
        *
        * Pass 1 produces more rows than data is used for here, so the rows
        * from DCTSIZE onwards are kept in a local workspace until pass 2.
  3338  */
       
  3339 
       
  3340 GLOBAL(void)
       
  3341 jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  3342 {
       
  3343   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
       
  3344   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
       
  3345   INT32 z1;
       
  3346   DCTELEM workspace[DCTSIZE2];
       
  3347   DCTELEM *dataptr;
       
  3348   DCTELEM *wsptr;
       
  3349   JSAMPROW elemptr;
       
  3350   int ctr;
       
  3351   SHIFT_TEMPS
       
  3352 
       
  3353   /* Pass 1: process rows. */
       
  3354   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  3355   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  3356 
       
  3357   dataptr = data;
       
  3358   ctr = 0;
       
         /* ctr counts input rows.  Results for rows 0..DCTSIZE-1 go to data;
          * once ctr reaches DCTSIZE the output pointer is switched to workspace,
          * which receives the remaining rows until ctr reaches DCTSIZE*2.
          * The exit test sits at the bottom of the loop body.
          */
  3359   for (;;) {
       
  3360     elemptr = sample_data[ctr] + start_col;
       
  3361 
       
  3362     /* Even part per LL&M figure 1 --- note that published figure is faulty;
       
  3363      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
       
  3364      */
       
  3365 
       
  3366     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
       
  3367     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
       
  3368     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
       
  3369     tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
       
  3370 
       
  3371     tmp10 = tmp0 + tmp3;
       
  3372     tmp12 = tmp0 - tmp3;
       
  3373     tmp11 = tmp1 + tmp2;
       
  3374     tmp13 = tmp1 - tmp2;
       
  3375 
       
           /* tmp0..tmp3 are overwritten here with the differences of the same
            * mirrored pairs; they feed the odd part further down.
            */
  3376     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
       
  3377     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
       
  3378     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
       
  3379     tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
       
  3380 
       
  3381     /* Apply unsigned->signed conversion */
       
  3382     dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
       
  3383     dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
       
  3384 
       
  3385     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
       
  3386     dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
       
  3387 				   CONST_BITS-PASS1_BITS);
       
  3388     dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
       
  3389 				   CONST_BITS-PASS1_BITS);
       
  3390 
       
  3391     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
       
  3392      * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
       
  3393      * i0..i3 in the paper are tmp0..tmp3 here.
       
  3394      */
       
  3395 
       
           /* tmp10..tmp13 are reused for the pairwise sums of the differences;
            * the even-part values they held are no longer needed.
            */
  3396     tmp10 = tmp0 + tmp3;
       
  3397     tmp11 = tmp1 + tmp2;
       
  3398     tmp12 = tmp0 + tmp2;
       
  3399     tmp13 = tmp1 + tmp3;
       
  3400     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
       
  3401 
       
           /* z1 above must be formed before the next block, which replaces
            * every tmp with its product in place.
            */
  3402     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
       
  3403     tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
       
  3404     tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
       
  3405     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
       
  3406     tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
       
  3407     tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
       
  3408     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
       
  3409     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
       
  3410 
       
  3411     tmp12 += z1;
       
  3412     tmp13 += z1;
       
  3413 
       
  3414     dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
       
  3415     dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
       
  3416     dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
       
  3417     dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
       
  3418 
       
  3419     ctr++;
       
  3420 
       
           /* ctr is now the index of the next row to process. */
  3421     if (ctr != DCTSIZE) {
       
  3422       if (ctr == DCTSIZE * 2)
       
  3423 	break;			/* Done. */
       
  3424       dataptr += DCTSIZE;	/* advance pointer to next row */
       
  3425     } else
       
  3426       dataptr = workspace;	/* switch pointer to extended workspace */
       
  3427   }
       
  3428 
       
  3429   /* Pass 2: process columns.
       
  3430    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  3431    * by an overall factor of 8.
       
  3432    * We must also scale the output by 8/16 = 1/2.
       
  3433    * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
       
  3434    */
       
  3435 
       
  3436   dataptr = data;
       
  3437   wsptr = workspace;
       
         /* One iteration per column.  Row k of data is paired with row 7-k of
          * workspace; since workspace row j was filled from input row
          * DCTSIZE+j, that is the mirrored pairing of the 16-point transform
          * (assuming DCTSIZE is 8 -- it is defined outside this chunk).
          * Every row of the column is read into temporaries before any of
          * them is overwritten.
          */
  3438   for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
       
  3439     /* Even part */
       
  3440 
       
  3441     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
       
  3442     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
       
  3443     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
       
  3444     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
       
  3445     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
       
  3446     tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
       
  3447     tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
       
  3448     tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
       
  3449 
       
  3450     tmp10 = tmp0 + tmp7;
       
  3451     tmp14 = tmp0 - tmp7;
       
  3452     tmp11 = tmp1 + tmp6;
       
  3453     tmp15 = tmp1 - tmp6;
       
  3454     tmp12 = tmp2 + tmp5;
       
  3455     tmp16 = tmp2 - tmp5;
       
  3456     tmp13 = tmp3 + tmp4;
       
  3457     tmp17 = tmp3 - tmp4;
       
  3458 
       
           /* tmp0..tmp7 are overwritten with the differences of the same
            * pairs; they are consumed by the odd part.
            */
  3459     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
       
  3460     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
       
  3461     tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
       
  3462     tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
       
  3463     tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
       
  3464     tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
       
  3465     tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
       
  3466     tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
       
  3467 
       
           /* Every descale in this pass uses one bit more than PASS1_BITS
            * alone would need; this is where the 1/2 output scaling announced
            * in the pass header is applied.
            */
  3468     dataptr[DCTSIZE*0] = (DCTELEM)
       
  3469       DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
       
  3470     dataptr[DCTSIZE*4] = (DCTELEM)
       
  3471       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
       
  3472 	      MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
       
  3473 	      CONST_BITS+PASS1_BITS+1);
       
  3474 
       
           /* tmp10 is reused for the term common to outputs 2 and 6. */
  3475     tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
       
  3476 	    MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
       
  3477 
       
  3478     dataptr[DCTSIZE*2] = (DCTELEM)
       
  3479       DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
       
  3480 	      + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
       
  3481 	      CONST_BITS+PASS1_BITS+1);
       
  3482     dataptr[DCTSIZE*6] = (DCTELEM)
       
  3483       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
       
  3484 	      - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
       
  3485 	      CONST_BITS+PASS1_BITS+1);
       
  3486 
       
  3487     /* Odd part */
       
  3488 
       
           /* tmp10..tmp16 are reused once more as partial sums built from the
            * differences tmp0..tmp7; tmp11..tmp13 are first assigned and then
            * accumulated into, so the statement order below matters.
            */
  3489     tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
       
  3490 	    MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
       
  3491     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
       
  3492 	    MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
       
  3493     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
       
  3494 	    MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
       
  3495     tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
       
  3496 	    MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
       
  3497     tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
       
  3498 	    MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
       
  3499     tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
       
  3500 	    MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
       
           /* tmp10 must be formed before tmp11..tmp13 are updated below. */
  3501     tmp10 = tmp11 + tmp12 + tmp13 -
       
  3502 	    MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
       
  3503 	    MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
       
  3504     tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
       
  3505 	     - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
       
  3506     tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
       
  3507 	     + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
       
  3508     tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
       
  3509 	     + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
       
  3510 
       
  3511     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
       
  3512     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
       
  3513     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
       
  3514     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);
       
  3515 
       
  3516     dataptr++;			/* advance pointer to next column */
       
  3517     wsptr++;			/* advance pointer to next column */
       
  3518   }
       
  3519 }
       
  3520 
       
  3521 
       
  3522 /*
       
  3523  * Perform the forward DCT on a 7x14 sample block.
       
  3524  *
       
  3525  * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
       
        *
        * data:        receives the coefficients.  The whole DCTSIZE2-element
        *              block is pre-zeroed; pass 2 stores rows 0..7 of
        *              columns 0..6 only.
        * sample_data: rows 0..13 are read.
        * start_col:   index of the first of the 7 samples read from each row.
        *
        * Pass 1 produces 14 rows; the first DCTSIZE of them are kept in data
        * and the rest in a local workspace until pass 2 combines them.
  3526  */
       
  3527 
       
  3528 GLOBAL(void)
       
  3529 jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  3530 {
       
  3531   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
       
  3532   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
       
  3533   INT32 z1, z2, z3;
       
         /* NOTE(review): sized for 6 rows of 8 entries; this matches the rows
          * left over after the first DCTSIZE only if DCTSIZE is 8 (defined
          * outside this chunk -- confirm).
          */
  3534   DCTELEM workspace[8*6];
       
  3535   DCTELEM *dataptr;
       
  3536   DCTELEM *wsptr;
       
  3537   JSAMPROW elemptr;
       
  3538   int ctr;
       
  3539   SHIFT_TEMPS
       
  3540 
       
  3541   /* Pre-zero output coefficient block. */
       
  3542   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
  3543 
       
  3544   /* Pass 1: process rows. */
       
  3545   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  3546   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  3547   /* 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14). */
       
  3548 
       
  3549   dataptr = data;
       
  3550   ctr = 0;
       
         /* ctr counts input rows.  Results for rows 0..DCTSIZE-1 go to data;
          * once ctr reaches DCTSIZE the output pointer is switched to workspace,
          * which receives the remaining rows until ctr reaches 14.
          * Only dataptr[0..6] of each row is stored.
          */
  3551   for (;;) {
       
  3552     elemptr = sample_data[ctr] + start_col;
       
  3553 
       
  3554     /* Even part */
       
  3555 
       
           /* Sums of the mirrored pairs (0,6), (1,5), (2,4); sample 3 is the
            * unpaired centre.  tmp10..tmp12 hold the matching differences.
            */
  3556     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
       
  3557     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
       
  3558     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
       
  3559     tmp3 = GETJSAMPLE(elemptr[3]);
       
  3560 
       
  3561     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
       
  3562     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
       
  3563     tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
       
  3564 
       
  3565     z1 = tmp0 + tmp2;
       
  3566     /* Apply unsigned->signed conversion */
       
  3567     dataptr[0] = (DCTELEM)
       
  3568       ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
       
           /* tmp3 is doubled in place and then subtracted twice, so z1 ends up
            * as tmp0 + tmp2 - 4 * (centre sample).  tmp3 stays doubled for the
            * dataptr[4] expression below.
            */
  3569     tmp3 += tmp3;
       
  3570     z1 -= tmp3;
       
  3571     z1 -= tmp3;
       
  3572     z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
       
  3573     z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
       
  3574     z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
       
  3575     dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
       
           /* z1 and z2 are updated in place after dataptr[2] has been stored;
            * the new values are used by outputs 4 and 6 only.
            */
  3576     z1 -= z2;
       
  3577     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
       
  3578     dataptr[4] = (DCTELEM)
       
  3579       DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
       
  3580 	      CONST_BITS-PASS1_BITS);
       
  3581     dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
       
  3582 
       
  3583     /* Odd part */
       
  3584 
       
           /* tmp0..tmp3 are reused as accumulators for the three odd outputs;
            * the even-part values they held are no longer needed.
            */
  3585     tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
       
  3586     tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
       
  3587     tmp0 = tmp1 - tmp2;
       
  3588     tmp1 += tmp2;
       
  3589     tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
       
  3590     tmp1 += tmp2;
       
  3591     tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
       
  3592     tmp0 += tmp3;
       
  3593     tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */
       
  3594 
       
  3595     dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
       
  3596     dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
       
  3597     dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
       
  3598 
       
  3599     ctr++;
       
  3600 
       
           /* ctr is now the index of the next row to process. */
  3601     if (ctr != DCTSIZE) {
       
  3602       if (ctr == 14)
       
  3603 	break;			/* Done. */
       
  3604       dataptr += DCTSIZE;	/* advance pointer to next row */
       
  3605     } else
       
  3606       dataptr = workspace;	/* switch pointer to extended workspace */
       
  3607   }
       
  3608 
       
  3609   /* Pass 2: process columns.
       
  3610    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  3611    * by an overall factor of 8.
       
  3612    * We must also scale the output by (8/7)*(8/14) = 32/49, which we
       
  3613    * fold into the constant multipliers:
       
  3614    * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49.
       
  3615    */
       
  3616 
       
  3617   dataptr = data;
       
  3618   wsptr = workspace;
       
         /* One iteration per column stored by pass 1.  Rows 0..5 of data are
          * paired with rows 5..0 of workspace, and row 6 of data with row 7
          * of data.  Every row of the column is read into temporaries before
          * any of them is overwritten.
          */
  3619   for (ctr = 0; ctr < 7; ctr++) {
       
  3620     /* Even part */
       
  3621 
       
           /* The sum for row 3 goes straight into tmp13; tmp3 is only ever
            * used for the corresponding difference.
            */
  3622     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
       
  3623     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
       
  3624     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
       
  3625     tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
       
  3626     tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
       
  3627     tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
       
  3628     tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
       
  3629 
       
  3630     tmp10 = tmp0 + tmp6;
       
  3631     tmp14 = tmp0 - tmp6;
       
  3632     tmp11 = tmp1 + tmp5;
       
  3633     tmp15 = tmp1 - tmp5;
       
  3634     tmp12 = tmp2 + tmp4;
       
  3635     tmp16 = tmp2 - tmp4;
       
  3636 
       
           /* tmp0..tmp6 are overwritten with the differences of the same
            * pairs; they are consumed by the odd part.
            */
  3637     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
       
  3638     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
       
  3639     tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
       
  3640     tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
       
  3641     tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
       
  3642     tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
       
  3643     tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
       
  3644 
       
  3645     dataptr[DCTSIZE*0] = (DCTELEM)
       
  3646       DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
       
  3647 		       FIX(0.653061224)),                 /* 32/49 */
       
  3648 	      CONST_BITS+PASS1_BITS);
       
           /* tmp13 is doubled in place after output 0 has used it; output 4
            * below needs the doubled value.
            */
  3649     tmp13 += tmp13;
       
  3650     dataptr[DCTSIZE*4] = (DCTELEM)
       
  3651       DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
       
  3652 	      MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
       
  3653 	      MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
       
  3654 	      CONST_BITS+PASS1_BITS);
       
  3655 
       
           /* tmp10 is reused for the term common to outputs 2 and 6. */
  3656     tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */
       
  3657 
       
  3658     dataptr[DCTSIZE*2] = (DCTELEM)
       
  3659       DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
       
  3660 	      + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
       
  3661 	      CONST_BITS+PASS1_BITS);
       
  3662     dataptr[DCTSIZE*6] = (DCTELEM)
       
  3663       DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
       
  3664 	      - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
       
  3665 	      CONST_BITS+PASS1_BITS);
       
  3666 
       
  3667     /* Odd part */
       
  3668 
       
           /* tmp10..tmp12 are reused again as partial sums of the differences.
            * Output 7 is stored first because it needs tmp3, tmp10 and tmp11
            * before they are replaced by their scaled versions just below.
            */
  3669     tmp10 = tmp1 + tmp2;
       
  3670     tmp11 = tmp5 - tmp4;
       
  3671     dataptr[DCTSIZE*7] = (DCTELEM)
       
  3672       DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
       
  3673 		       FIX(0.653061224)),                 /* 32/49 */
       
  3674 	      CONST_BITS+PASS1_BITS);
       
  3675     tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
       
  3676     tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
       
  3677     tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
       
           /* tmp10 becomes the part shared by outputs 5 and 3; tmp11 is then
            * free to be reassigned.
            */
  3678     tmp10 += tmp11 - tmp3;
       
  3679     tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
       
  3680 	    MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
       
  3681     dataptr[DCTSIZE*5] = (DCTELEM)
       
  3682       DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
       
  3683 	      + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
       
  3684 	      CONST_BITS+PASS1_BITS);
       
  3685     tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
       
  3686 	    MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
       
  3687     dataptr[DCTSIZE*3] = (DCTELEM)
       
  3688       DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
       
  3689 	      - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
       
  3690 	      CONST_BITS+PASS1_BITS);
       
  3691     dataptr[DCTSIZE*1] = (DCTELEM)
       
  3692       DESCALE(tmp11 + tmp12 + tmp3
       
  3693 	      - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
       
  3694 	      - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
       
  3695 	      CONST_BITS+PASS1_BITS);
       
  3696 
       
  3697     dataptr++;			/* advance pointer to next column */
       
  3698     wsptr++;			/* advance pointer to next column */
       
  3699   }
       
  3700 }
       
  3701 
       
  3702 
       
  3703 /*
       
  3704  * Perform the forward DCT on a 6x12 sample block.
       
  3705  *
       
  3706  * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
       
  3707  */
       
  3708 
       
  3709 GLOBAL(void)
       
  3710 jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  3711 {
       
  3712   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
       
  3713   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
       
  3714   DCTELEM workspace[8*4];
       
  3715   DCTELEM *dataptr;
       
  3716   DCTELEM *wsptr;
       
  3717   JSAMPROW elemptr;
       
  3718   int ctr;
       
  3719   SHIFT_TEMPS
       
  3720 
       
  3721   /* Pre-zero output coefficient block. */
       
  3722   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
  3723 
       
  3724   /* Pass 1: process rows. */
       
  3725   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  3726   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  3727   /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
       
  3728 
       
  3729   dataptr = data;
       
  3730   ctr = 0;
       
  3731   for (;;) {
       
  3732     elemptr = sample_data[ctr] + start_col;
       
  3733 
       
  3734     /* Even part */
       
  3735 
       
  3736     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
       
  3737     tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
       
  3738     tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
       
  3739 
       
  3740     tmp10 = tmp0 + tmp2;
       
  3741     tmp12 = tmp0 - tmp2;
       
  3742 
       
  3743     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
       
  3744     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
       
  3745     tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
       
  3746 
       
  3747     /* Apply unsigned->signed conversion */
       
  3748     dataptr[0] = (DCTELEM)
       
  3749       ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
       
  3750     dataptr[2] = (DCTELEM)
       
  3751       DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
       
  3752 	      CONST_BITS-PASS1_BITS);
       
  3753     dataptr[4] = (DCTELEM)
       
  3754       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
       
  3755 	      CONST_BITS-PASS1_BITS);
       
  3756 
       
  3757     /* Odd part */
       
  3758 
       
  3759     tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
       
  3760 		    CONST_BITS-PASS1_BITS);
       
  3761 
       
  3762     dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
       
  3763     dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
       
  3764     dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
       
  3765 
       
  3766     ctr++;
       
  3767 
       
  3768     if (ctr != DCTSIZE) {
       
  3769       if (ctr == 12)
       
  3770 	break;			/* Done. */
       
  3771       dataptr += DCTSIZE;	/* advance pointer to next row */
       
  3772     } else
       
  3773       dataptr = workspace;	/* switch pointer to extended workspace */
       
  3774   }
       
  3775 
       
  3776   /* Pass 2: process columns.
       
  3777    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  3778    * by an overall factor of 8.
       
  3779    * We must also scale the output by (8/6)*(8/12) = 8/9, which we
       
  3780    * fold into the constant multipliers:
       
  3781    * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9.
       
  3782    */
       
  3783 
       
  3784   dataptr = data;
       
  3785   wsptr = workspace;
       
  3786   for (ctr = 0; ctr < 6; ctr++) {
       
  3787     /* Even part */
       
  3788 
       
  3789     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
       
  3790     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
       
  3791     tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
       
  3792     tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
       
  3793     tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
       
  3794     tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
       
  3795 
       
  3796     tmp10 = tmp0 + tmp5;
       
  3797     tmp13 = tmp0 - tmp5;
       
  3798     tmp11 = tmp1 + tmp4;
       
  3799     tmp14 = tmp1 - tmp4;
       
  3800     tmp12 = tmp2 + tmp3;
       
  3801     tmp15 = tmp2 - tmp3;
       
  3802 
       
  3803     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
       
  3804     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
       
  3805     tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
       
  3806     tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
       
  3807     tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
       
  3808     tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
       
  3809 
       
  3810     dataptr[DCTSIZE*0] = (DCTELEM)
       
  3811       DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
       
  3812 	      CONST_BITS+PASS1_BITS);
       
  3813     dataptr[DCTSIZE*6] = (DCTELEM)
       
  3814       DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
       
  3815 	      CONST_BITS+PASS1_BITS);
       
  3816     dataptr[DCTSIZE*4] = (DCTELEM)
       
  3817       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
       
  3818 	      CONST_BITS+PASS1_BITS);
       
  3819     dataptr[DCTSIZE*2] = (DCTELEM)
       
  3820       DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
       
  3821 	      MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
       
  3822 	      CONST_BITS+PASS1_BITS);
       
  3823 
       
  3824     /* Odd part */
       
  3825 
       
  3826     tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
       
  3827     tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
       
  3828     tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
       
  3829     tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
       
  3830     tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
       
  3831     tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
       
  3832 	    + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
       
  3833     tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
       
  3834     tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
       
  3835 	    + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
       
  3836     tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
       
  3837 	    - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
       
  3838     tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
       
  3839 	    - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
       
  3840 
       
  3841     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
       
  3842     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
       
  3843     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
       
  3844     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
       
  3845 
       
  3846     dataptr++;			/* advance pointer to next column */
       
  3847     wsptr++;			/* advance pointer to next column */
       
  3848   }
       
  3849 }
       
  3850 
       
  3851 
       
  3852 /*
       
  3853  * Perform the forward DCT on a 5x10 sample block.
       
  3854  *
       
  3855  * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
       
  3856  */
       
  3857 
       
  3858 GLOBAL(void)
       
  3859 jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  3860 {
       
  3861   INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
       
  3862   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
       
  3863   DCTELEM workspace[8*2];
       
  3864   DCTELEM *dataptr;
       
  3865   DCTELEM *wsptr;
       
  3866   JSAMPROW elemptr;
       
  3867   int ctr;
       
  3868   SHIFT_TEMPS
       
  3869 
       
  3870   /* Pre-zero output coefficient block. */
       
  3871   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
  3872 
       
  3873   /* Pass 1: process rows. */
       
  3874   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  3875   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  3876   /* 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10). */
       
  3877 
       
  3878   dataptr = data;
       
  3879   ctr = 0;
       
  3880   for (;;) {
       
  3881     elemptr = sample_data[ctr] + start_col;
       
  3882 
       
  3883     /* Even part */
       
  3884 
       
  3885     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
       
  3886     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
       
  3887     tmp2 = GETJSAMPLE(elemptr[2]);
       
  3888 
       
  3889     tmp10 = tmp0 + tmp1;
       
  3890     tmp11 = tmp0 - tmp1;
       
  3891 
       
  3892     tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
       
  3893     tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
       
  3894 
       
  3895     /* Apply unsigned->signed conversion */
       
  3896     dataptr[0] = (DCTELEM)
       
  3897       ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
       
  3898     tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
       
  3899     tmp10 -= tmp2 << 2;
       
  3900     tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
       
  3901     dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
       
  3902     dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
       
  3903 
       
  3904     /* Odd part */
       
  3905 
       
  3906     tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
       
  3907 
       
  3908     dataptr[1] = (DCTELEM)
       
  3909       DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
       
  3910 	      CONST_BITS-PASS1_BITS);
       
  3911     dataptr[3] = (DCTELEM)
       
  3912       DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
       
  3913 	      CONST_BITS-PASS1_BITS);
       
  3914 
       
  3915     ctr++;
       
  3916 
       
  3917     if (ctr != DCTSIZE) {
       
  3918       if (ctr == 10)
       
  3919 	break;			/* Done. */
       
  3920       dataptr += DCTSIZE;	/* advance pointer to next row */
       
  3921     } else
       
  3922       dataptr = workspace;	/* switch pointer to extended workspace */
       
  3923   }
       
  3924 
       
  3925   /* Pass 2: process columns.
       
  3926    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  3927    * by an overall factor of 8.
       
  3928    * We must also scale the output by (8/5)*(8/10) = 32/25, which we
       
  3929    * fold into the constant multipliers:
       
  3930    * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25.
       
  3931    */
       
  3932 
       
  3933   dataptr = data;
       
  3934   wsptr = workspace;
       
  3935   for (ctr = 0; ctr < 5; ctr++) {
       
  3936     /* Even part */
       
  3937 
       
  3938     tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
       
  3939     tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
       
  3940     tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
       
  3941     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
       
  3942     tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
       
  3943 
       
  3944     tmp10 = tmp0 + tmp4;
       
  3945     tmp13 = tmp0 - tmp4;
       
  3946     tmp11 = tmp1 + tmp3;
       
  3947     tmp14 = tmp1 - tmp3;
       
  3948 
       
  3949     tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
       
  3950     tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
       
  3951     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
       
  3952     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
       
  3953     tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
       
  3954 
       
  3955     dataptr[DCTSIZE*0] = (DCTELEM)
       
  3956       DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
       
  3957 	      CONST_BITS+PASS1_BITS);
       
  3958     tmp12 += tmp12;
       
  3959     dataptr[DCTSIZE*4] = (DCTELEM)
       
  3960       DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
       
  3961 	      MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
       
  3962 	      CONST_BITS+PASS1_BITS);
       
  3963     tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
       
  3964     dataptr[DCTSIZE*2] = (DCTELEM)
       
  3965       DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
       
  3966 	      CONST_BITS+PASS1_BITS);
       
  3967     dataptr[DCTSIZE*6] = (DCTELEM)
       
  3968       DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
       
  3969 	      CONST_BITS+PASS1_BITS);
       
  3970 
       
  3971     /* Odd part */
       
  3972 
       
  3973     tmp10 = tmp0 + tmp4;
       
  3974     tmp11 = tmp1 - tmp3;
       
  3975     dataptr[DCTSIZE*5] = (DCTELEM)
       
  3976       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
       
  3977 	      CONST_BITS+PASS1_BITS);
       
  3978     tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
       
  3979     dataptr[DCTSIZE*1] = (DCTELEM)
       
  3980       DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
       
  3981 	      MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
       
  3982 	      MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
       
  3983 	      MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
       
  3984 	      CONST_BITS+PASS1_BITS);
       
  3985     tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
       
  3986 	    MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
       
  3987     tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
       
  3988 	    MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
       
  3989     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
       
  3990     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
       
  3991 
       
  3992     dataptr++;			/* advance pointer to next column */
       
  3993     wsptr++;			/* advance pointer to next column */
       
  3994   }
       
  3995 }
       
  3996 
       
  3997 
       
  3998 /*
       
  3999  * Perform the forward DCT on a 4x8 sample block.
       
  4000  *
       
  4001  * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
       
  4002  */
       
  4003 
       
  4004 GLOBAL(void)
       
  4005 jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  4006 {
       
  4007   INT32 tmp0, tmp1, tmp2, tmp3;
       
  4008   INT32 tmp10, tmp11, tmp12, tmp13;
       
  4009   INT32 z1;
       
  4010   DCTELEM *dataptr;
       
  4011   JSAMPROW elemptr;
       
  4012   int ctr;
       
  4013   SHIFT_TEMPS
       
  4014 
       
  4015   /* Pre-zero output coefficient block. */
       
  4016   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
  4017 
       
  4018   /* Pass 1: process rows. */
       
  4019   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  4020   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  4021   /* We must also scale the output by 8/4 = 2, which we add here. */
       
  4022   /* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). */
       
  4023 
       
  4024   dataptr = data;
       
  4025   for (ctr = 0; ctr < DCTSIZE; ctr++) {
       
  4026     elemptr = sample_data[ctr] + start_col;
       
  4027 
       
  4028     /* Even part */
       
  4029 
       
  4030     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
       
  4031     tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
       
  4032 
       
  4033     tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
       
  4034     tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
       
  4035 
       
  4036     /* Apply unsigned->signed conversion */
       
  4037     dataptr[0] = (DCTELEM)
       
  4038       ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
       
  4039     dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
       
  4040 
       
  4041     /* Odd part */
       
  4042 
       
  4043     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
       
  4044     /* Add fudge factor here for final descale. */
       
  4045     tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
       
  4046 
       
  4047     dataptr[1] = (DCTELEM)
       
  4048       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
       
  4049 		  CONST_BITS-PASS1_BITS-1);
       
  4050     dataptr[3] = (DCTELEM)
       
  4051       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
       
  4052 		  CONST_BITS-PASS1_BITS-1);
       
  4053 
       
  4054     dataptr += DCTSIZE;		/* advance pointer to next row */
       
  4055   }
       
  4056 
       
  4057   /* Pass 2: process columns.
       
  4058    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  4059    * by an overall factor of 8.
       
  4060    */
       
  4061 
       
  4062   dataptr = data;
       
  4063   for (ctr = 0; ctr < 4; ctr++) {
       
  4064     /* Even part per LL&M figure 1 --- note that published figure is faulty;
       
  4065      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
       
  4066      */
       
  4067 
       
  4068     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
       
  4069     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
       
  4070     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
       
  4071     tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
       
  4072 
       
  4073     /* Add fudge factor here for final descale. */
       
  4074     tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
       
  4075     tmp12 = tmp0 - tmp3;
       
  4076     tmp11 = tmp1 + tmp2;
       
  4077     tmp13 = tmp1 - tmp2;
       
  4078 
       
  4079     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
       
  4080     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
       
  4081     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
       
  4082     tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
       
  4083 
       
  4084     dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
       
  4085     dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
       
  4086 
       
  4087     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
       
  4088     /* Add fudge factor here for final descale. */
       
  4089     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
       
  4090     dataptr[DCTSIZE*2] = (DCTELEM)
       
  4091       RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
       
  4092     dataptr[DCTSIZE*6] = (DCTELEM)
       
  4093       RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
       
  4094 
       
  4095     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
       
  4096      * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
       
  4097      * i0..i3 in the paper are tmp0..tmp3 here.
       
  4098      */
       
  4099 
       
  4100     tmp10 = tmp0 + tmp3;
       
  4101     tmp11 = tmp1 + tmp2;
       
  4102     tmp12 = tmp0 + tmp2;
       
  4103     tmp13 = tmp1 + tmp3;
       
  4104     z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /*  c3 */
       
  4105     /* Add fudge factor here for final descale. */
       
  4106     z1 += ONE << (CONST_BITS+PASS1_BITS-1);
       
  4107 
       
  4108     tmp0  = MULTIPLY(tmp0,    FIX_1_501321110);    /*  c1+c3-c5-c7 */
       
  4109     tmp1  = MULTIPLY(tmp1,    FIX_3_072711026);    /*  c1+c3+c5-c7 */
       
  4110     tmp2  = MULTIPLY(tmp2,    FIX_2_053119869);    /*  c1+c3-c5+c7 */
       
  4111     tmp3  = MULTIPLY(tmp3,    FIX_0_298631336);    /* -c1+c3+c5-c7 */
       
  4112     tmp10 = MULTIPLY(tmp10, - FIX_0_899976223);    /*  c7-c3 */
       
  4113     tmp11 = MULTIPLY(tmp11, - FIX_2_562915447);    /* -c1-c3 */
       
  4114     tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);    /*  c5-c3 */
       
  4115     tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);    /* -c3-c5 */
       
  4116 
       
  4117     tmp12 += z1;
       
  4118     tmp13 += z1;
       
  4119 
       
  4120     dataptr[DCTSIZE*1] = (DCTELEM)
       
  4121       RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
       
  4122     dataptr[DCTSIZE*3] = (DCTELEM)
       
  4123       RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
       
  4124     dataptr[DCTSIZE*5] = (DCTELEM)
       
  4125       RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
       
  4126     dataptr[DCTSIZE*7] = (DCTELEM)
       
  4127       RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
       
  4128 
       
  4129     dataptr++;			/* advance pointer to next column */
       
  4130   }
       
  4131 }
       
  4132 
       
  4133 
       
  4134 /*
       
  4135  * Perform the forward DCT on a 3x6 sample block.
       
  4136  *
       
  4137  * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
       
  4138  */
       
  4139 
       
  4140 GLOBAL(void)
       
  4141 jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  4142 {
       
  4143   INT32 tmp0, tmp1, tmp2;
       
  4144   INT32 tmp10, tmp11, tmp12;
       
  4145   DCTELEM *dataptr;
       
  4146   JSAMPROW elemptr;
       
  4147   int ctr;
       
  4148   SHIFT_TEMPS
       
  4149 
       
  4150   /* Pre-zero output coefficient block. */
       
  4151   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
  4152 
       
  4153   /* Pass 1: process rows. */
       
  4154   /* Note results are scaled up by sqrt(8) compared to a true DCT; */
       
  4155   /* furthermore, we scale the results by 2**PASS1_BITS. */
       
  4156   /* We scale the results further by 2 as part of output adaption */
       
  4157   /* scaling for different DCT size. */
       
  4158   /* 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6). */
       
  4159 
       
  4160   dataptr = data;
       
  4161   for (ctr = 0; ctr < 6; ctr++) {
       
  4162     elemptr = sample_data[ctr] + start_col;
       
  4163 
       
  4164     /* Even part */
       
  4165 
       
  4166     tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
       
  4167     tmp1 = GETJSAMPLE(elemptr[1]);
       
  4168 
       
  4169     tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
       
  4170 
       
  4171     /* Apply unsigned->signed conversion */
       
  4172     dataptr[0] = (DCTELEM)
       
  4173       ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
       
  4174     dataptr[2] = (DCTELEM)
       
  4175       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
       
  4176 	      CONST_BITS-PASS1_BITS-1);
       
  4177 
       
  4178     /* Odd part */
       
  4179 
       
  4180     dataptr[1] = (DCTELEM)
       
  4181       DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
       
  4182 	      CONST_BITS-PASS1_BITS-1);
       
  4183 
       
  4184     dataptr += DCTSIZE;		/* advance pointer to next row */
       
  4185   }
       
  4186 
       
  4187   /* Pass 2: process columns.
       
  4188    * We remove the PASS1_BITS scaling, but leave the results scaled up
       
  4189    * by an overall factor of 8.
       
  4190    * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
       
  4191    * fold into the constant multipliers (other part was done in pass 1):
       
  4192    * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
       
  4193    */
       
  4194 
       
  4195   dataptr = data;
       
  4196   for (ctr = 0; ctr < 3; ctr++) {
       
  4197     /* Even part */
       
  4198 
       
  4199     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
       
  4200     tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
       
  4201     tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
       
  4202 
       
  4203     tmp10 = tmp0 + tmp2;
       
  4204     tmp12 = tmp0 - tmp2;
       
  4205 
       
  4206     tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
       
  4207     tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
       
  4208     tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
       
  4209 
       
  4210     dataptr[DCTSIZE*0] = (DCTELEM)
       
  4211       DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
       
  4212 	      CONST_BITS+PASS1_BITS);
       
  4213     dataptr[DCTSIZE*2] = (DCTELEM)
       
  4214       DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
       
  4215 	      CONST_BITS+PASS1_BITS);
       
  4216     dataptr[DCTSIZE*4] = (DCTELEM)
       
  4217       DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
       
  4218 	      CONST_BITS+PASS1_BITS);
       
  4219 
       
  4220     /* Odd part */
       
  4221 
       
  4222     tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
       
  4223 
       
  4224     dataptr[DCTSIZE*1] = (DCTELEM)
       
  4225       DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
       
  4226 	      CONST_BITS+PASS1_BITS);
       
  4227     dataptr[DCTSIZE*3] = (DCTELEM)
       
  4228       DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
       
  4229 	      CONST_BITS+PASS1_BITS);
       
  4230     dataptr[DCTSIZE*5] = (DCTELEM)
       
  4231       DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
       
  4232 	      CONST_BITS+PASS1_BITS);
       
  4233 
       
  4234     dataptr++;			/* advance pointer to next column */
       
  4235   }
       
  4236 }
       
  4237 
       
  4238 
       
  4239 /*
       
  4240  * Perform the forward DCT on a 2x4 sample block.
       
  4241  *
       
  4242  * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
       
  4243  */
       
  4244 
       
  4245 GLOBAL(void)
       
  4246 jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  4247 {
       
  4248   INT32 tmp0, tmp1;
       
  4249   INT32 tmp10, tmp11;
       
  4250   DCTELEM *dataptr;
       
  4251   JSAMPROW elemptr;
       
  4252   int ctr;
       
  4253   SHIFT_TEMPS
       
  4254 
       
  4255   /* Pre-zero output coefficient block. */
       
  4256   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
  4257 
       
  4258   /* Pass 1: process rows. */
       
  4259   /* Note results are scaled up by sqrt(8) compared to a true DCT. */
       
  4260   /* We must also scale the output by (8/2)*(8/4) = 2**3, which we add here. */
       
  4261 
       
  4262   dataptr = data;
       
  4263   for (ctr = 0; ctr < 4; ctr++) {
       
  4264     elemptr = sample_data[ctr] + start_col;
       
  4265 
       
  4266     /* Even part */
       
  4267 
       
  4268     tmp0 = GETJSAMPLE(elemptr[0]);
       
  4269     tmp1 = GETJSAMPLE(elemptr[1]);
       
  4270 
       
  4271     /* Apply unsigned->signed conversion */
       
  4272     dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
       
  4273 
       
  4274     /* Odd part */
       
  4275 
       
  4276     dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
       
  4277 
       
  4278     dataptr += DCTSIZE;		/* advance pointer to next row */
       
  4279   }
       
  4280 
       
  4281   /* Pass 2: process columns.
       
  4282    * We leave the results scaled up by an overall factor of 8.
       
  4283    * 4-point FDCT kernel,
       
  4284    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
       
  4285    */
       
  4286 
       
  4287   dataptr = data;
       
  4288   for (ctr = 0; ctr < 2; ctr++) {
       
  4289     /* Even part */
       
  4290 
       
  4291     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
       
  4292     tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
       
  4293 
       
  4294     tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
       
  4295     tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
       
  4296 
       
  4297     dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
       
  4298     dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
       
  4299 
       
  4300     /* Odd part */
       
  4301 
       
  4302     tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
       
  4303     /* Add fudge factor here for final descale. */
       
  4304     tmp0 += ONE << (CONST_BITS-1);
       
  4305 
       
  4306     dataptr[DCTSIZE*1] = (DCTELEM)
       
  4307       RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
       
  4308 		  CONST_BITS);
       
  4309     dataptr[DCTSIZE*3] = (DCTELEM)
       
  4310       RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
       
  4311 		  CONST_BITS);
       
  4312 
       
  4313     dataptr++;			/* advance pointer to next column */
       
  4314   }
       
  4315 }
       
  4316 
       
  4317 
       
  4318 /*
       
  4319  * Perform the forward DCT on a 1x2 sample block.
       
  4320  *
       
  4321  * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
       
  4322  */
       
  4323 
       
  4324 GLOBAL(void)
       
  4325 jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
       
  4326 {
       
  4327   INT32 tmp0, tmp1;
       
  4328 
       
  4329   /* Pre-zero output coefficient block. */
       
  4330   MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
       
  4331 
       
  4332   tmp0 = GETJSAMPLE(sample_data[0][start_col]);
       
  4333   tmp1 = GETJSAMPLE(sample_data[1][start_col]);
       
  4334 
       
  4335   /* We leave the results scaled up by an overall factor of 8.
       
  4336    * We must also scale the output by (8/1)*(8/2) = 2**5.
       
  4337    */
       
  4338 
       
  4339   /* Even part */
       
  4340   /* Apply unsigned->signed conversion */
       
  4341   data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
       
  4342 
       
  4343   /* Odd part */
       
  4344   data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5);
       
  4345 }
       
  4346 
       
  4347 #endif /* DCT_SCALING_SUPPORTED */
   283 #endif /* DCT_ISLOW_SUPPORTED */
  4348 #endif /* DCT_ISLOW_SUPPORTED */