FCam: src/processing/Demosaic_ARM.cpp Source File

00001 #ifdef FCAM_ARCH_ARM
00002 #include "Demosaic_ARM.h"
00003 #include <arm_neon.h>
00004 
00005 namespace FCam {
00006 
00007 // Make a linear luminance -> pixel value lookup table
00008 extern void makeLUT(const Frame &f, float contrast, int blackLevel, float gamma, unsigned char *lut);
00009 
00010 Image demosaic_ARM(Frame src, float contrast, bool denoise, int blackLevel, float gamma) {
00011 
00012     const int BLOCK_WIDTH  = 40;
00013     const int BLOCK_HEIGHT = 24;
00014 
00015     Image input = src.image();
00016 
00017     // Check we're the right bayer pattern. If not crop and continue.
00018     switch ((int)src.platform().bayerPattern()) {
00019     case GRBG:
00020         break;
00021     case RGGB:
00022         input = input.subImage(1, 0, Size(input.width()-2, input.height()));
00023         break;
00024     case BGGR:
00025         input = input.subImage(0, 1, Size(input.width(), input.height()-2));
00026         break;
00027     case GBRG:
00028         input = input.subImage(1, 1, Size(input.width()-2, input.height()-2));
00029     default:
00030         error(Event::DemosaicError, "Can't demosaic from a non-bayer sensor\n");
00031         return Image();
00032     }
00033 
00034     int rawWidth = input.width();
00035     int rawHeight = input.height();
00036 
00037     const int VEC_WIDTH = ((BLOCK_WIDTH + 8)/8);
00038     const int VEC_HEIGHT = ((BLOCK_HEIGHT + 8)/2);
00039 
00040     int rawPixelsPerRow = input.bytesPerRow()/2 ; // Assumes bytesPerRow is even
00041 
00042     int outWidth = rawWidth-8;
00043     int outHeight = rawHeight-8;
00044     outWidth /= BLOCK_WIDTH;
00045     outWidth *= BLOCK_WIDTH;
00046     outHeight /= BLOCK_HEIGHT;
00047     outHeight *= BLOCK_HEIGHT;
00048 
00049     Image out(outWidth, outHeight, RGB24);
00050 
00051     // Check we're the right size, if not, crop center
00052     if (((input.width() - 8) != (unsigned)outWidth) ||
00053         ((input.height() - 8) != (unsigned)outHeight)) {
00054         int offX = (input.width() - 8 - outWidth)/2;
00055         int offY = (input.height() - 8 - outHeight)/2;
00056         offX -= offX&1;
00057         offY -= offY&1;
00058 
00059         if (offX || offY) {
00060             input = input.subImage(offX, offY, Size(outWidth+8, outHeight+8));
00061         }
00062     }
00063 
00064     Time startTime = Time::now();
00065 
00066     // Prepare the color matrix in S8.8 fixed point
00067     float colorMatrix_f[12];
00068 
00069     // Check if there's a custom color matrix
00070     if (src.shot().colorMatrix().size() == 12) {
00071         for (int i = 0; i < 12; i++) {
00072             colorMatrix_f[i] = src.shot().colorMatrix()[i];
00073         }
00074     } else {
00075         // Otherwise use the platform version
00076         src.platform().rawToRGBColorMatrix(src.shot().whiteBalance, colorMatrix_f);
00077     }
00078 
00079     int16x4_t colorMatrix[3];
00080     for (int i = 0; i < 3; i++) {
00081         int16_t val = (int16_t)(colorMatrix_f[i*4+0] * 256 + 0.5);
00082         colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 0);
00083         val = (int16_t)(colorMatrix_f[i*4+1] * 256 + 0.5);
00084         colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 1);
00085         val = (int16_t)(colorMatrix_f[i*4+2] * 256 + 0.5);
00086         colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 2);
00087         val = (int16_t)(colorMatrix_f[i*4+3] * 256 + 0.5);
00088         colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 3);
00089     }
00090 
00091     // A buffer to store data after demosaic and color correction
00092     // but before gamma correction
00093     uint16_t out16[BLOCK_WIDTH*BLOCK_HEIGHT*3];
00094 
00095     // Various color channels. Only 4 of them are defined before
00096     // demosaic, all of them are defined after demosaic
00097     int16_t scratch[VEC_WIDTH*VEC_HEIGHT*4*12];
00098 
00099 #define R_R_OFF  (VEC_WIDTH*VEC_HEIGHT*4*0)
00100 #define R_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*1)
00101 #define R_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*2)
00102 #define R_B_OFF  (VEC_WIDTH*VEC_HEIGHT*4*3)
00103 
00104 #define G_R_OFF  (VEC_WIDTH*VEC_HEIGHT*4*4)
00105 #define G_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*5)
00106 #define G_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*6)
00107 #define G_B_OFF  (VEC_WIDTH*VEC_HEIGHT*4*7)
00108 
00109 #define B_R_OFF  (VEC_WIDTH*VEC_HEIGHT*4*8)
00110 #define B_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*9)
00111 #define B_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*10)
00112 #define B_B_OFF  (VEC_WIDTH*VEC_HEIGHT*4*11)
00113 
00114 #define R_R(i)  (scratch+(i)+R_R_OFF)
00115 #define R_GR(i) (scratch+(i)+R_GR_OFF)
00116 #define R_GB(i) (scratch+(i)+R_GB_OFF)
00117 #define R_B(i)  (scratch+(i)+R_B_OFF)
00118 
00119 #define G_R(i)  (scratch+(i)+G_R_OFF)
00120 #define G_GR(i) (scratch+(i)+G_GR_OFF)
00121 #define G_GB(i) (scratch+(i)+G_GB_OFF)
00122 #define G_B(i)  (scratch+(i)+G_B_OFF)
00123 
00124 #define B_R(i)  (scratch+(i)+B_R_OFF)
00125 #define B_GR(i) (scratch+(i)+B_GR_OFF)
00126 #define B_GB(i) (scratch+(i)+B_GB_OFF)
00127 #define B_B(i)  (scratch+(i)+B_B_OFF)
00128 
00129     // Reuse some of the output scratch area for the noisy inputs
00130 #define G_GR_NOISY B_GR
00131 #define B_B_NOISY  G_B
00132 #define R_R_NOISY  G_R
00133 #define G_GB_NOISY B_GB
00134 
00135     // Prepare the lookup table
00136     unsigned char lut[4096];
00137     makeLUT(src, contrast, blackLevel, gamma, lut);
00138 
00139     // For each block in the input
00140     for (int by = 0; by < rawHeight-8-BLOCK_HEIGHT+1; by += BLOCK_HEIGHT) {
00141         const short *__restrict__ blockPtr = (const short *)input(0,by);
00142         unsigned char *__restrict__ outBlockPtr = out(0, by);
00143         for (int bx = 0; bx < rawWidth-8-BLOCK_WIDTH+1; bx += BLOCK_WIDTH) {
00144 
00145             // Stage 1) Demux a block of input into L1
00146             if (1) {
00147                 register const int16_t *__restrict__ rawPtr = blockPtr;
00148                 register const int16_t *__restrict__ rawPtr2 = blockPtr + rawPixelsPerRow;
00149 
00150                 register const int rawJump = rawPixelsPerRow*2 - VEC_WIDTH*8;
00151 
00152                 register int16_t *__restrict__ g_gr_ptr = denoise ? G_GR_NOISY(0) : G_GR(0);
00153                 register int16_t *__restrict__ r_r_ptr  = denoise ? R_R_NOISY(0)  : R_R(0);
00154                 register int16_t *__restrict__ b_b_ptr  = denoise ? B_B_NOISY(0)  : B_B(0);
00155                 register int16_t *__restrict__ g_gb_ptr = denoise ? G_GB_NOISY(0) : G_GB(0);
00156 
00157                 for (int y = 0; y < VEC_HEIGHT; y++) {
00158                     for (int x = 0; x < VEC_WIDTH/2; x++) {
00159 
00160                         asm volatile("# Stage 1) Demux\n");
00161 
00162                         // The below needs to be volatile, but
00163                         // it's not clear why (if it's not, it
00164                         // gets optimized out entirely)
00165                         asm volatile(
00166                             "vld2.16  {d6-d9}, [%[rawPtr]]! \n\t"
00167                             "vld2.16  {d10-d13}, [%[rawPtr2]]! \n\t"
00168                             "vst1.16  {d6-d7}, [%[g_gr_ptr]]! \n\t"
00169                             "vst1.16  {d8-d9}, [%[r_r_ptr]]! \n\t"
00170                             "vst1.16  {d10-d11}, [%[b_b_ptr]]! \n\t"
00171                             "vst1.16  {d12-d13}, [%[g_gb_ptr]]! \n\t" :
00172                             [rawPtr]"+r"(rawPtr),
00173                             [rawPtr2]"+r"(rawPtr2),
00174                             [g_gr_ptr]"+r"(g_gr_ptr),
00175                             [r_r_ptr]"+r"(r_r_ptr),
00176                             [b_b_ptr]"+r"(b_b_ptr),
00177                             [g_gb_ptr]"+r"(g_gb_ptr) ::
00178                             "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "memory");
00179 
00180                     }
00181 
00182                     rawPtr += rawJump;
00183                     rawPtr2 += rawJump;
00184                 }
00185             }
00186 
00187             // Stage 1.5) Denoise sensor input (noisy pixel suppression)
00188 
00189             // A pixel can't be brighter than its brightest neighbor, or darker than its darkest neighbor
00190 
00191             if (denoise) {
00192                 register int16_t *__restrict__ ptr_in = NULL;
00193                 register int16_t *__restrict__ ptr_out = NULL;
00194                 asm volatile("#Stage 1.5: Denoise\n\t");
00195                 for (int b=0; b<4; b++) {
00196                     if (b==0) { ptr_in = G_GR_NOISY(0); }
00197                     if (b==1) { ptr_in = R_R_NOISY(0); }
00198                     if (b==2) { ptr_in = B_B_NOISY(0); }
00199                     if (b==3) { ptr_in = G_GB_NOISY(0); }
00200                     if (b==0) { ptr_out = G_GR(0); }
00201                     if (b==1) { ptr_out = R_R(0); }
00202                     if (b==2) { ptr_out = B_B(0); }
00203                     if (b==3) { ptr_out = G_GB(0); }
00204 
00205                     // write the top block pixels that aren't being denoised
00206                     for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00207                         int16x8_t in = vld1q_s16(ptr_in);
00208                         vst1q_s16(ptr_out, in);
00209                         ptr_in+=8;
00210                         ptr_out+=8;
00211                     }
00212 
00213                     for (int y = 1; y < VEC_HEIGHT - 1; y++) {
00214                         for (int x = 0; x < VEC_WIDTH/2; x++) {
00215                             int16x8_t here  = vld1q_s16(ptr_in);
00216                             int16x8_t above = vld1q_s16(ptr_in + VEC_WIDTH*4);
00217                             int16x8_t under = vld1q_s16(ptr_in - VEC_WIDTH*4);
00218                             int16x8_t right = vld1q_s16(ptr_in + 1);
00219                             int16x8_t left  = vld1q_s16(ptr_in - 1);
00220                             int16x8_t max, min;
00221 
00222                             // find the max and min of the neighbors
00223                             max = vmaxq_s16(left, right);
00224                             max = vmaxq_s16(above, max);
00225                             max = vmaxq_s16(under, max);
00226 
00227                             min = vminq_s16(left, right);
00228                             min = vminq_s16(above, min);
00229                             min = vminq_s16(under, min);
00230 
00231                             // clamp here to be within this range
00232                             here  = vminq_s16(max, here);
00233                             here  = vmaxq_s16(min, here);
00234 
00235                             vst1q_s16(ptr_out, here);
00236                             ptr_in += 8;
00237                             ptr_out += 8;
00238                         }
00239                     }
00240 
00241                     // write the bottom block pixels that aren't being denoised
00242                     for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00243                         int16x8_t in = vld1q_s16(ptr_in);
00244                         vst1q_s16(ptr_out, in);
00245                         ptr_in+=8;
00246                         ptr_out+=8;
00247                     }
00248                 }
00249             }
00250 
00251             // Stage 2 and 3) Do horizontal and vertical
00252             // interpolation of green, as well as picking the
00253             // output for green
00254             /*
00255               gv_r = (gb[UP] + gb[HERE])/2;
00256               gvd_r = (gb[UP] - gb[HERE]);
00257               gh_r = (gr[HERE] + gr[RIGHT])/2;
00258               ghd_r = (gr[HERE] - gr[RIGHT]);
00259               g_r = ghd_r < gvd_r ? gh_r : gv_r;
00260 
00261               gv_b = (gr[DOWN] + gr[HERE])/2;
00262               gvd_b = (gr[DOWN] - gr[HERE]);
00263               gh_b = (gb[LEFT] + gb[HERE])/2;
00264               ghd_b = (gb[LEFT] - gb[HERE]);
00265               g_b = ghd_b < gvd_b ? gh_b : gv_b;
00266             */
00267             if (1) {
00268 
00269                 int i = VEC_WIDTH*4;
00270 
00271                 register int16_t *g_gb_up_ptr = G_GB(i) - VEC_WIDTH*4;
00272                 register int16_t *g_gb_here_ptr = G_GB(i);
00273                 register int16_t *g_gb_left_ptr = G_GB(i) - 1;
00274                 register int16_t *g_gr_down_ptr = G_GR(i) + VEC_WIDTH*4;
00275                 register int16_t *g_gr_here_ptr = G_GR(i);
00276                 register int16_t *g_gr_right_ptr = G_GR(i) + 1;
00277                 register int16_t *g_r_ptr = G_R(i);
00278                 register int16_t *g_b_ptr = G_B(i);
00279 
00280                 for (int y = 1; y < VEC_HEIGHT-1; y++) {
00281                     for (int x = 0; x < VEC_WIDTH/2; x++) {
00282 
00283                         asm volatile("#Stage 2) Green interpolation\n");
00284 
00285                         // Load the inputs
00286 
00287                         int16x8_t gb_up = vld1q_s16(g_gb_up_ptr);
00288                         g_gb_up_ptr+=8;
00289                         int16x8_t gb_here = vld1q_s16(g_gb_here_ptr);
00290                         g_gb_here_ptr+=8;
00291                         int16x8_t gb_left = vld1q_s16(g_gb_left_ptr); // unaligned
00292                         g_gb_left_ptr+=8;
00293                         int16x8_t gr_down = vld1q_s16(g_gr_down_ptr);
00294                         g_gr_down_ptr+=8;
00295                         int16x8_t gr_here = vld1q_s16(g_gr_here_ptr);
00296                         g_gr_here_ptr+=8;
00297                         int16x8_t gr_right = vld1q_s16(g_gr_right_ptr); // unaligned
00298                         g_gr_right_ptr+=8;
00299 
00300                         //I couldn't get this assembly to work, and I don't know which
00301                         // of the three blocks of assembly is wrong
00302                         // This asm should load the inputs
00303                         /*
00304                         asm volatile(
00305                         "vld1.16        {d16-d17}, [%[gb_up]]!\n\t"
00306                         "vld1.16        {d18-d19}, [%[gb_here]]!\n\t"
00307                         "vld1.16        {d20-d21}, [%[gb_left]]!\n\t"
00308                         "vld1.16        {d22-d23}, [%[gr_down]]!\n\t"
00309                         "vld1.16        {d24-d25}, [%[gr_here]]!\n\t"
00310                         "vld1.16        {d26-d27}, [%[gr_right]]!\n\t"
00311                         :
00312                         [gb_up]"+r"(g_gb_up_ptr),
00313                         [gb_here]"+r"(g_gb_here_ptr),
00314                         [gb_left]"+r"(g_gb_left_ptr),
00315                         [gr_down]"+r"(g_gr_down_ptr),
00316                         [gr_here]"+r"(g_gr_here_ptr),
00317                         [gr_right]"+r"(g_gr_right_ptr) ::
00318                         //"d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27",
00319                         "q8","q9","q10","q11","q12","q13");
00320 
00321                         //q8 - gb_up
00322                         //q9 - gb_here
00323                         //q10 - gb_left
00324                         //q11 - gr_down
00325                         //q12 - gr_here
00326                         //q13 - gr_right
00327                         */
00328 
00329                         // Do the processing
00330                         int16x8_t gv_r  = vhaddq_s16(gb_up, gb_here);
00331                         int16x8_t gvd_r = vabdq_s16(gb_up, gb_here);
00332                         int16x8_t gh_r  = vhaddq_s16(gr_right, gr_here);
00333                         int16x8_t ghd_r = vabdq_s16(gr_here, gr_right);
00334                         int16x8_t g_r = vbslq_s16(vcltq_s16(ghd_r, gvd_r), gh_r, gv_r);
00335 
00336                         int16x8_t gv_b  = vhaddq_s16(gr_down, gr_here);
00337                         int16x8_t gvd_b = vabdq_s16(gr_down, gr_here);
00338                         int16x8_t gh_b  = vhaddq_s16(gb_left, gb_here);
00339                         int16x8_t ghd_b = vabdq_s16(gb_left, gb_here);
00340                         int16x8_t g_b = vbslq_s16(vcltq_s16(ghd_b, gvd_b), gh_b, gv_b);
00341 
00342                         //this asm should do the above selection/interpolation
00343                         /*
00344                         asm volatile(
00345                         "vabd.s16       q0, q12, q13\n\t" //ghd_r
00346                         "vabd.s16       q1, q8, q9\n\t" //gvd_r
00347                         "vabd.s16       q2, q10, q9\n\t" //ghd_b
00348                         "vabd.s16       q3, q11, q12\n\t" //gvd_b
00349                         "vcgt.s16       q1, q0, q1\n\t" //select ghd_r or gvd_r
00350                         "vcgt.s16       q2, q2, q3\n\t" //select gvd_b or ghd_b
00351                         "vhadd.s16      q8, q8, q9\n\t" //gv_r
00352                         "vhadd.s16      q11, q11, q12\n\t" //gv_b
00353                         "vhadd.s16      q12, q12, q13\n\t" //gh_r
00354                         "vhadd.s16      q9, q9, q10\n\t" //gh_b
00355                         "vbsl           q1, q12, q8\n\t" //g_r
00356                         "vbsl           q2, q9, q11\n\t" //g_b
00357                          ::: "q0","q1","q2","q3","q8","q9","q10","q11","q12","q13");
00358                         */
00359 
00360                         //this should save the output
00361                         /*
00362                         asm volatile(
00363                         "vst1.16        {d2-d3}, [%[g_r]]!\n\t"
00364                         "vst1.16        {d4-d5}, [%[g_b]]!\n\t" :
00365                         [g_r]"+r"(g_r_ptr),[g_b]"+r"(g_b_ptr)
00366                         :: "memory");
00367                         */
00368 
00369                         // Save the outputs
00370                         vst1q_s16(g_r_ptr, g_r);
00371                         g_r_ptr+=8;
00372                         vst1q_s16(g_b_ptr, g_b);
00373                         g_b_ptr+=8;
00374                     }
00375                 }
00376             }
00377             asm volatile("#End of stage 2 (green interpolation)\n");
00378             // Stages 4-9
00379 
00380             if (1) {
00381 
00382                 /*
00383                   r_gr = (r[LEFT] + r[HERE])/2 + gr[HERE] - (g_r[LEFT] + g_r[HERE])/2;
00384                   b_gr = (b[UP] + b[HERE])/2 + gr[HERE] - (g_b[UP] + g_b[HERE])/2;
00385                   r_gb = (r[HERE] + r[DOWN])/2 + gb[HERE] - (g_r[HERE] + g_r[DOWN])/2;
00386                   b_gb = (b[HERE] + b[RIGHT])/2 + gb[HERE] - (g_b[HERE] + g_b[RIGHT])/2;
00387 
00388                   rp_b = (r[DOWNLEFT] + r[HERE])/2 + g_b[HERE] - (g_r[DOWNLEFT] + g_r[HERE])/2;
00389                   rn_b = (r[LEFT] + r[DOWN])/2 + g_b[HERE] - (g_r[LEFT] + g_r[DOWN])/2;
00390                   rpd_b = (r[DOWNLEFT] - r[HERE]);
00391                   rnd_b = (r[LEFT] - r[DOWN]);
00392                   r_b = rpd_b < rnd_b ? rp_b : rn_b;
00393 
00394                   bp_r = (b[UPRIGHT] + b[HERE])/2 + g_r[HERE] - (g_b[UPRIGHT] + g_b[HERE])/2;
00395                   bn_r = (b[RIGHT] + b[UP])/2 + g_r[HERE] - (g_b[RIGHT] + g_b[UP])/2;
00396                   bpd_r = (b[UPRIGHT] - b[HERE]);
00397                   bnd_r = (b[RIGHT] - b[UP]);
00398                   b_r = bpd_r < bnd_r ? bp_r : bn_r;
00399                 */
00400 
00401                 int i = 2*VEC_WIDTH*4;
00402 
00403                 for (int y = 2; y < VEC_HEIGHT-2; y++) {
00404                     for (int x = 0; x < VEC_WIDTH; x++) {
00405 
00406                         asm volatile("#Stage 4) r/b interpolation\n");
00407 
00408                         // Load the inputs
00409                         int16x4_t r_here       = vld1_s16(R_R(i));
00410                         int16x4_t r_left       = vld1_s16(R_R(i) - 1);
00411                         int16x4_t r_down       = vld1_s16(R_R(i) + VEC_WIDTH*4);
00412 
00413                         int16x4_t g_r_left     = vld1_s16(G_R(i) - 1);
00414                         int16x4_t g_r_here     = vld1_s16(G_R(i));
00415                         int16x4_t g_r_down     = vld1_s16(G_R(i) + VEC_WIDTH*4);
00416 
00417                         int16x4_t b_up         = vld1_s16(B_B(i) - VEC_WIDTH*4);
00418                         int16x4_t b_here       = vld1_s16(B_B(i));
00419                         int16x4_t b_right      = vld1_s16(B_B(i) + 1);
00420 
00421                         int16x4_t g_b_up       = vld1_s16(G_B(i) - VEC_WIDTH*4);
00422                         int16x4_t g_b_here     = vld1_s16(G_B(i));
00423                         int16x4_t g_b_right    = vld1_s16(G_B(i) + 1);
00424 
00425                         // Do the processing
00426                         int16x4_t gr_here      = vld1_s16(G_GR(i));
00427                         int16x4_t gb_here      = vld1_s16(G_GB(i));
00428 
00429                         {
00430                             // red at green
00431                             int16x4_t r_gr  = vadd_s16(vhadd_s16(r_left, r_here),
00432                                                        vsub_s16(gr_here,
00433                                                                 vhadd_s16(g_r_left, g_r_here)));
00434                             int16x4_t r_gb  = vadd_s16(vhadd_s16(r_here, r_down),
00435                                                        vsub_s16(gb_here,
00436                                                                 vhadd_s16(g_r_down, g_r_here)));
00437                             vst1_s16(R_GR(i), r_gr);
00438                             vst1_s16(R_GB(i), r_gb);
00439                         }
00440 
00441                         {
00442                             // red at blue
00443                             int16x4_t r_downleft   = vld1_s16(R_R(i) + VEC_WIDTH*4 - 1);
00444                             int16x4_t g_r_downleft = vld1_s16(G_R(i) + VEC_WIDTH*4 - 1);
00445 
00446                             int16x4_t rp_b  = vadd_s16(vhadd_s16(r_downleft, r_here),
00447                                                        vsub_s16(g_b_here,
00448                                                                 vhadd_s16(g_r_downleft, g_r_here)));
00449                             int16x4_t rn_b  = vadd_s16(vhadd_s16(r_left, r_down),
00450                                                        vsub_s16(g_b_here,
00451                                                                 vhadd_s16(g_r_left, g_r_down)));
00452                             int16x4_t rpd_b = vabd_s16(r_downleft, r_here);
00453                             int16x4_t rnd_b = vabd_s16(r_left, r_down);
00454                             int16x4_t r_b   = vbsl_s16(vclt_s16(rpd_b, rnd_b), rp_b, rn_b);
00455                             vst1_s16(R_B(i), r_b);
00456                         }
00457 
00458                         {
00459                             // blue at green
00460                             int16x4_t b_gr  = vadd_s16(vhadd_s16(b_up, b_here),
00461                                                        vsub_s16(gr_here,
00462                                                                 vhadd_s16(g_b_up, g_b_here)));
00463                             int16x4_t b_gb  = vadd_s16(vhadd_s16(b_here, b_right),
00464                                                        vsub_s16(gb_here,
00465                                                                 vhadd_s16(g_b_right, g_b_here)));
00466                             vst1_s16(B_GR(i), b_gr);
00467                             vst1_s16(B_GB(i), b_gb);
00468                         }
00469 
00470                         {
00471                             // blue at red
00472                             int16x4_t b_upright    = vld1_s16(B_B(i) - VEC_WIDTH*4 + 1);
00473                             int16x4_t g_b_upright  = vld1_s16(G_B(i) - VEC_WIDTH*4 + 1);
00474 
00475                             int16x4_t bp_r  = vadd_s16(vhadd_s16(b_upright, b_here),
00476                                                        vsub_s16(g_r_here,
00477                                                                 vhadd_s16(g_b_upright, g_b_here)));
00478                             int16x4_t bn_r  = vadd_s16(vhadd_s16(b_right, b_up),
00479                                                        vsub_s16(g_r_here,
00480                                                                 vhadd_s16(g_b_right, g_b_up)));
00481                             int16x4_t bpd_r = vabd_s16(b_upright, b_here);
00482                             int16x4_t bnd_r = vabd_s16(b_right, b_up);
00483                             int16x4_t b_r   = vbsl_s16(vclt_s16(bpd_r, bnd_r), bp_r, bn_r);
00484                             vst1_s16(B_R(i), b_r);
00485                         }
00486 
00487                         // Advance the index
00488                         i += 4;
00489                     }
00490                 }
00491                 asm volatile("#End of stage 4 - what_ever\n\t");
00492             }
00493 
00494             // Stage 10)
00495             if (1) {
00496                 // Color-correct and save the results into a 16-bit buffer for gamma correction
00497 
00498                 asm volatile("#Stage 10) Color Correction\n");
00499 
00500                 uint16_t *__restrict__ out16Ptr = out16;
00501 
00502                 int i = 2*VEC_WIDTH*4;
00503 
00504                 const uint16x4_t bound = vdup_n_u16(1023);
00505 
00506                 for (int y = 2; y < VEC_HEIGHT-2; y++) {
00507 
00508                     // skip the first vec in each row
00509 
00510                     int16x4x2_t r0 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00511                     int16x4x2_t g0 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00512                     int16x4x2_t b0 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00513                     i += 4;
00514 
00515                     for (int x = 1; x < VEC_WIDTH; x++) {
00516 
00517                         int16x4x2_t r1 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00518                         int16x4x2_t g1 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00519                         int16x4x2_t b1 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00520 
00521                         // do the color matrix
00522                         int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00523                         rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00524                         rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00525                         rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00526 
00527                         int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00528                         gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00529                         gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00530                         gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00531 
00532                         int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00533                         bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00534                         bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00535                         bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00536 
00537                         uint16x4x3_t col16;
00538                         col16.val[0] = vqrshrun_n_s32(rout, 8);
00539                         col16.val[1] = vqrshrun_n_s32(gout, 8);
00540                         col16.val[2] = vqrshrun_n_s32(bout, 8);
00541                         col16.val[0] = vmin_u16(col16.val[0], bound);
00542                         col16.val[1] = vmin_u16(col16.val[1], bound);
00543                         col16.val[2] = vmin_u16(col16.val[2], bound);
00544                         vst3_u16(out16Ptr, col16);
00545                         out16Ptr += 12;
00546 
00547                         rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00548                         rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00549                         rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00550                         rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00551 
00552                         gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00553                         gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00554                         gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00555                         gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00556 
00557                         bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00558                         bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00559                         bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00560                         bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00561 
00562                         col16.val[0] = vqrshrun_n_s32(rout, 8);
00563                         col16.val[1] = vqrshrun_n_s32(gout, 8);
00564                         col16.val[2] = vqrshrun_n_s32(bout, 8);
00565                         col16.val[0] = vmin_u16(col16.val[0], bound);
00566                         col16.val[1] = vmin_u16(col16.val[1], bound);
00567                         col16.val[2] = vmin_u16(col16.val[2], bound);
00568                         vst3_u16(out16Ptr, col16);
00569                         out16Ptr += 12;
00570 
00571                         r0 = r1;
00572                         g0 = g1;
00573                         b0 = b1;
00574 
00575                         i += 4;
00576                     }
00577 
00578                     // jump back
00579                     i -= VEC_WIDTH*4;
00580 
00581                     r0 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00582                     g0 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00583                     b0 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));
00584                     i += 4;
00585 
00586                     for (int x = 1; x < VEC_WIDTH; x++) {
00587                         int16x4x2_t r1 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00588                         int16x4x2_t g1 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00589                         int16x4x2_t b1 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));
00590 
00591                         // do the color matrix
00592                         int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00593                         rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00594                         rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00595                         rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00596 
00597                         int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00598                         gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00599                         gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00600                         gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00601 
00602                         int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00603                         bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00604                         bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00605                         bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00606 
00607                         uint16x4x3_t col16;
00608                         col16.val[0] = vqrshrun_n_s32(rout, 8);
00609                         col16.val[1] = vqrshrun_n_s32(gout, 8);
00610                         col16.val[2] = vqrshrun_n_s32(bout, 8);
00611                         col16.val[0] = vmin_u16(col16.val[0], bound);
00612                         col16.val[1] = vmin_u16(col16.val[1], bound);
00613                         col16.val[2] = vmin_u16(col16.val[2], bound);
00614                         vst3_u16(out16Ptr, col16);
00615                         out16Ptr += 12;
00616 
00617                         rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00618                         rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00619                         rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00620                         rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00621 
00622                         gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00623                         gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00624                         gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00625                         gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00626 
00627                         bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00628                         bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00629                         bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00630                         bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00631 
00632                         col16.val[0] = vqrshrun_n_s32(rout, 8);
00633                         col16.val[1] = vqrshrun_n_s32(gout, 8);
00634                         col16.val[2] = vqrshrun_n_s32(bout, 8);
00635                         col16.val[0] = vmin_u16(col16.val[0], bound);
00636                         col16.val[1] = vmin_u16(col16.val[1], bound);
00637                         col16.val[2] = vmin_u16(col16.val[2], bound);
00638                         vst3_u16(out16Ptr, col16);
00639                         out16Ptr += 12;
00640 
00641                         r0 = r1;
00642                         g0 = g1;
00643                         b0 = b1;
00644 
00645                         i += 4;
00646                     }
00647                 }
00648                 asm volatile("#End of stage 10) - color correction\n\t");
00649             }
00650 
00651 
00652             if (1) {
00653 
00654                 asm volatile("#Gamma Correction\n");
00655                 // Gamma correction (on the CPU, not the NEON)
00656                 const uint16_t *__restrict__ out16Ptr = out16;
00657 
00658                 for (int y = 0; y < BLOCK_HEIGHT; y++) {
00659                     unsigned int *__restrict__ outPtr32 = (unsigned int *)(outBlockPtr + y * outWidth * 3);
00660                     for (int x = 0; x < (BLOCK_WIDTH*3)/4; x++) {
00661                         unsigned val = ((lut[out16Ptr[0]] << 0) |
00662                                         (lut[out16Ptr[1]] << 8) |
00663                                         (lut[out16Ptr[2]] << 16) |
00664                                         (lut[out16Ptr[3]] << 24));
00665                         *outPtr32++ = val;
00666                         out16Ptr += 4;
00667                         // *outPtr++ = lut[*out16Ptr++];
00668                     }
00669                 }
00670                 asm volatile("#end of Gamma Correction\n");
00671 
00672                 /*
00673                 const uint16_t * __restrict__ out16Ptr = out16;
00674                 for (int y = 0; y < BLOCK_HEIGHT; y++) {
00675                     unsigned char * __restrict__ outPtr = (outBlockPtr + y * outWidth * 3);
00676                     for (int x = 0; x < (BLOCK_WIDTH*3); x++) {
00677                         *outPtr++ = lut[*out16Ptr++];
00678                     }
00679                 }
00680                 */
00681 
00682             }
00683 
00684 
00685             blockPtr += BLOCK_WIDTH;
00686             outBlockPtr += BLOCK_WIDTH*3;
00687         }
00688     }
00689 
00690     //std::cout << "Done demosaicking. time = " << ((Time::now() - startTime)/1000) << std::endl;
00691     return out;
00692 }
00693 
00694 Image makeThumbnailRAW_ARM(Frame src, float contrast, int blackLevel, float gamma) {
00695     // Assuming we want a slightly-cropped thumbnail into 640x480, Bayer pattern GRBG
00696     // This means averaging together a 4x4 block of Bayer pattern for one RGB24 pixel
00697     // Also want to convert to sRGB, which includes a color matrix multiply and a gamma transform
00698     // using a lookup table.
00699 
00700     // Implementation:
00701     //   Uses ARM NEON SIMD vector instructions and inline assembly.
00702     //   Reads in a 16x4 block of pixels at a time, in 16-bit GRBG Bayer format, and outputs a 4x1 block of RGB24 pixels.
00703     // Important note: Due to some apparent bugs in GCC's inline assembly register mapping between C variables and NEON registers,
00704     //   namely that trying to reference an int16x4 variable creates a reference to a s register instead of a d register, all the
00705     //   int16x4 variables are forced into specific NEON registers, and then referred to using that register, not by name.
00706     //   This bug seems to be in gcc 4.2.1, should be fixed by 4.4 based on some gcc bug reports.
00707 
00708     Image thumb(640, 480, RGB24);
00709     const unsigned int w = 2592, tw = 640;
00710     const unsigned int h = 1968, th = 480;
00711     const unsigned int scale = 4;
00712     const unsigned int cw = tw*scale;
00713     const unsigned int ch = th*scale;
00714     const unsigned int startX = (w-cw)/2;
00715     const unsigned int startY = (h-ch)/2;
00716     const unsigned int bytesPerRow = src.image().bytesPerRow();
00717 
00718     // Make the response curve
00719     unsigned char lut[4096];
00720     makeLUT(src, contrast, blackLevel, gamma, lut);
00721 
00722     unsigned char *row = src.image()(startX, startY);
00723 
00724     Time startTime = Time::now();
00725     float colorMatrix_f[12];
00726 
00727     // Check if there's a custom color matrix
00728     if (src.shot().colorMatrix().size() == 12) {
00729         for (int i = 0; i < 12; i++) {
00730             colorMatrix_f[i] = src.shot().colorMatrix()[i];
00731         }
00732         printf("Making thumbnail with custom WB\n");
00733     } else {
00734         // Otherwise use the platform version
00735         src.platform().rawToRGBColorMatrix(src.shot().whiteBalance, colorMatrix_f);
00736         printf("Making thumbnail with platform WB\n");
00737     }
00738 
00739     register int16x4_t colorMatrix0 asm("d0");  // ASM assignments are essential - they're implicitly trusted by the inline code.
00740     register int16x4_t colorMatrix1 asm("d1");
00741     register int16x4_t colorMatrix2 asm("d2");
00742     register int16x4_t wCoord asm("d20");  // Workaround for annoyances with scalar addition.
00743     register int16x4_t maxValue asm("d21");  // Maximum allowed signed 16-bit value
00744     register int16x4_t minValue asm("d22");  // Minimum allowed signed 16-bit value
00745 
00746     asm volatile(
00747         // Load matrix into colorMatrix0-2, set to be d0-d2
00748         "vldm %[colorMatrix_f], {q2,q3,q4}  \n\t"
00749         "vcvt.s32.f32 q2, q2, #8  \n\t" // Float->fixed-point conversion
00750         "vcvt.s32.f32 q3, q3, #8  \n\t"
00751         "vcvt.s32.f32 q4, q4, #8  \n\t"
00752         "vmovn.i32 d0, q2  \n\t" // Narrowing to 16-bit
00753         "vmovn.i32 d1, q3  \n\t"
00754         "vmovn.i32 d2, q4  \n\t"
00755         // Load homogenous coordinate, pixel value limits
00756         "vmov.i16  d20, #0x4   \n\t"  // Homogenous coordinate.
00757         "vmov.i16  d21, #0x00FF  \n\t"  // Maximum pixel value: 1023
00758         "vorr.i16  d21, #0x0300  \n\t"  // Maximum pixel value part 2
00759         "vmov.i16  d22, #0x0     \n\t"  // Minimum pixel value: 0
00760         : [colorMatrix0] "=w"(colorMatrix0),
00761         [colorMatrix1] "=w"(colorMatrix1),
00762         [colorMatrix2] "=w"(colorMatrix2),
00763         [wCoord] "=w"(wCoord),
00764         [maxValue] "=w"(maxValue),
00765         [minValue] "=w"(minValue)
00766         :  [colorMatrix_f] "r"(colorMatrix_f)
00767         : "memory",
00768         "d3", "d4", "d5", "d6", "d7", "d8", "d9");
00769 
00770     for (unsigned int ty = 0; ty <480; ty++, row+=4*bytesPerRow) {
00771         register unsigned short *px0 = (unsigned short *)row;
00772         register unsigned short *px1 = (unsigned short *)(row+1*bytesPerRow);
00773         register unsigned short *px2 = (unsigned short *)(row+2*bytesPerRow);
00774         register unsigned short *px3 = (unsigned short *)(row+3*bytesPerRow);
00775 
00776         register unsigned char *dst = thumb(0,ty);
00777         for (register unsigned int tx =0; tx < 640; tx+=scale) {
00778             // Assembly block for fast downsample/demosaic, color correction, and gamma curve lookup
00779             asm volatile(
00780                 // *px0: GRGR GRGR GRGR GRGR
00781                 // *px1: BGBG BGBG BGBG BGBG
00782                 // *px2: GRGR GRGR GRGR GRGR
00783                 // *px3: BGBG BGBG BGBG BGBG
00785                 "vld2.16 {d4-d7}, [%[px0]]!  \n\t"
00786                 "vld2.16 {d8-d11}, [%[px1]]! \n\t"
00787                 "vld2.16 {d12-d15}, [%[px2]]! \n\t"
00788                 "vld2.16 {d16-d19}, [%[px3]]! \n\t"
00789                 //  d4    d5    d6    d7
00790                 // GG|GG GG|GG RR|RR RR|RR
00791                 //  d8    d9    d10   d11
00792                 // BB|BB BB|BB GG|GG GG|GG
00793                 //  d12   d13   d14   d15
00794                 // GG|GG GG|GG RR|RR RR|RR
00795                 //  d16   d17   d18   d19
00796                 // BB|BB BB|BB GG|GG GG|GG
00797 
00799                 "vpadd.u16 d4, d4, d5  \n\t"   // G1
00800                 "vpadd.u16 d5, d6, d7  \n\t"   // R1
00801                 "vpadd.u16 d6, d8, d9  \n\t"   // B1
00802                 "vpadd.u16 d7, d10, d11 \n\t"  // G2
00803                 "vpadd.u16 d8, d12, d13 \n\t"  // G3
00804                 "vpadd.u16 d9, d14, d15 \n\t"  // R2
00805                 "vpadd.u16 d10, d16, d17 \n\t" // B2
00806                 "vpadd.u16 d11, d18, d19 \n\t" // G4
00807                 //    d4       d5       d6       d7
00808                 // G|G|G|G  R|R|R|R  B|B|B|B  G|G|G|G
00809                 //    d8       d9       d10      d11
00810                 // G|G|G|G  R|R|R|R  B|B|B|B  G|G|G|G
00811 
00813                 "vadd.u16 d7, d8   \n\t"
00814                 "vadd.u16 d4, d11   \n\t"
00815                 "vhadd.u16 d4, d7  \n\t"
00817                 "vadd.u16 d5, d9  \n\t"
00819                 "vadd.u16 d6, d10 \n\t"
00820                 //    d4       d5       d6
00821                 // G|G|G|G  R|R|R|R  B|B|B|B
00822                 //
00823                 // Assuming sRGB affine matrix stored in fixed precision (lsb = 1/256)
00824                 // Trusting GCC to properly assign colorMatrix0-2 to d0-d2.  Direct reference seems to be broken on g++ 4.2.1 at least.
00825                 // r   colorMatrix0[0] [1] [2] [3]   r_in
00826                 // g = colorMatrix1[0] [1] [2] [3] * g_in
00827                 // b   colorMatrix2[0] [1] [2] [3]   b_in
00828 
00830 
00831                 "vmull.s16 q4, d5, d0[0] \n\t"
00832                 "vmlal.s16 q4, d4, d0[1] \n\t"
00833                 "vmlal.s16 q4, d6, d0[2] \n\t"
00834                 "vmlal.s16 q4, d20, d0[3] \n\t"
00835 
00836                 "vmull.s16 q5, d5, d1[0] \n\t"
00837                 "vmlal.s16 q5, d4, d1[1] \n\t"
00838                 "vmlal.s16 q5, d6, d1[2] \n\t"
00839                 "vmlal.s16 q5, d20, d1[3] \n\t"
00840 
00841                 "vmull.s16 q6, d5, d2[0] \n\t"
00842                 "vmlal.s16 q6, d4, d2[1] \n\t"
00843                 "vmlal.s16 q6, d6, d2[2] \n\t"
00844                 "vmlal.s16 q6, d20, d2[3] \n\t"
00845 
00846                 //  d08  d09  d10  d11  d12  d13
00847                 //  R|R  R|R  G|G  G|G  B|B  B|B
00848 
00850                 "vrshrn.s32 d3, q4, #10  \n\t"
00851                 "vrshrn.s32 d4, q5, #10  \n\t"
00852                 "vrshrn.s32 d5, q6, #10  \n\t"
00854                 "vmin.s16 d3, d3, d21    \n\t"
00855                 "vmin.s16 d4, d4, d21    \n\t"
00856                 "vmin.s16 d5, d5, d21    \n\t"
00857                 "vmax.s16 d3, d3, d22    \n\t"
00858                 "vmax.s16 d4, d4, d22    \n\t"
00859                 "vmax.s16 d5, d5, d22    \n\t"
00860 
00861                 //    d3       d4       d2
00862                 // R|R|R|R  G|G|G|G  B|B|B|B
00863 
00865                 "vmov r0,r1, d3                        \n\t"
00866                 //    r0       r1
00867                 // R16|R16  R16|R16
00868                 "uxth r2, r0                           \n\t" // Extract first red pixel into r2
00869                 "ldrb r4, [%[gammaTable], r2]          \n\t" // Table lookup, byte result
00870 
00871                 "uxth r2, r0, ROR #16                  \n\t"
00872                 "ldrb r3, [%[gammaTable], r2]          \n\t"
00873                 "orr  r4, r4, r3, LSL #24              \n\t"
00874 
00875                 "uxth r2, r1                           \n\t"
00876                 "ldrb r3, [%[gammaTable], r2]          \n\t"
00877                 "mov  r5, r3, LSL #16                  \n\t"
00878 
00879                 "uxth r2, r1, ROR #16                  \n\t"
00880                 "ldrb r3, [%[gammaTable], r2]          \n\t"
00881                 "mov  r6, r3, LSL #8                   \n\t"
00882 
00883                 //   r4   r5   r6
00884                 //  R__R __R_ _R__  -> increasing mem address (and increasing left shift)
00885 
00886                 "vmov r0,r1, d4                        \n\t"
00887                 //    r0       r1
00888                 // G16|G16  G16|G16
00889                 "uxth r2, r0                           \n\t"
00890                 "ldrb r3, [%[gammaTable], r2]          \n\t"
00891                 "orr  r4, r4, r3, LSL #8               \n\t"
00892 
00893                 "uxth r2, r0, ROR #16                  \n\t"
00894                 "ldrb r3, [%[gammaTable], r2]          \n\t"
00895                 "orr  r5, r5, r3                       \n\t"
00896 
00897                 "uxth r2, r1                           \n\t"
00898                 "ldrb r3, [%[gammaTable], r2]          \n\t"
00899                 "orr  r5, r5, r3, LSL #24              \n\t"
00900 
00901                 "uxth r2, r1, ROR #16                  \n\t"
00902                 "ldrb r3, [%[gammaTable], r2]          \n\t"
00903                 "orr  r6, r6, r3, LSL #16              \n\t"
00904 
00905                 //   r4   r5   r6
00906                 //  RG_R G_RG _RG_  -> increasing mem address (and increasing left shift)
00907 
00908                 "vmov r0,r1, d5                        \n\t"
00909                 //    r0       r1
00910                 // B16|B16  B16|B16
00911                 "uxth r2, r0                           \n\t"
00912                 "ldrb r3, [%[gammaTable], r2]          \n\t"
00913                 "orr  r4, r4, r3, LSL #16              \n\t"
00914 
00915                 "uxth r2, r0, ROR #16                  \n\t"
00916                 "ldrb r3, [%[gammaTable], r2]          \n\t"
00917                 "orr  r5, r5, r3, LSL #8               \n\t"
00918 
00919                 "uxth r2, r1                           \n\t"
00920                 "ldrb r3, [%[gammaTable], r2]          \n\t"
00921                 "orr  r6, r6, r3                       \n\t"
00922 
00923                 "uxth r2, r1, ROR #16                  \n\t"
00924                 "ldrb r3, [%[gammaTable], r2]          \n\t"
00925                 "orr  r6, r6, r3, LSL #24              \n\t"
00926 
00927                 //   r4   r5   r6
00928                 //  RGBR GBRG BRGB
00929 
00930                 "stm %[dst]!, {r4,r5,r6}                   \n\t" // multi-store!
00931                 : [px0] "+&r"(px0),
00932                 [px1] "+&r"(px1),
00933                 [px2] "+&r"(px2),
00934                 [px3] "+&r"(px3),
00935                 [dst] "+&r"(dst)
00936                 : [gammaTable] "r"(lut),
00937                 [colorMatrix0] "w"(colorMatrix0),  // Implicitly referenced only (d0)
00938                 [colorMatrix1] "w"(colorMatrix1),  // Implicitly referenced only (d1)
00939                 [colorMatrix2] "w"(colorMatrix2),  // Implicitly referenced only (d2)
00940                 [wCoord] "w"(wCoord),              // Implicitly referenced only (d20)
00941                 [maxValue] "w"(maxValue),          // Implicitly referenced only (d21)
00942                 [minValue] "w"(minValue)           // Implicitly referenced only (d22)
00943                 : "memory",
00944                 "r0", "r1", "r2", "r3", "r4", "r5", "r6",
00945                 "d3", "d4", "d5", "d6",
00946                 "d7", "d8", "d9", "d10",
00947                 "d11", "d12", "d13", "d14",
00948                 "d15", "d16", "d17", "d18", "d19"
00949             );
00950 
00951         }
00952     }
00953 
00954     //std::cout << "Done creating fast thumbnail. time = " << ((Time::now()-startTime)/1000) << std::endl;
00955 
00956     return thumb;
00957 }
00958 }
00959 
00960 
00961 #endif