00001 #ifdef FCAM_ARCH_ARM
00002 #include "Demosaic_ARM.h"
00003 #include <arm_neon.h>
00004
00005 namespace FCam {
00006
00007
00008 extern void makeLUT(const Frame &f, float contrast, int blackLevel, float gamma, unsigned char *lut);
00009
00010 Image demosaic_ARM(Frame src, float contrast, bool denoise, int blackLevel, float gamma) {
00011
00012 const int BLOCK_WIDTH = 40;
00013 const int BLOCK_HEIGHT = 24;
00014
00015 Image input = src.image();
00016
00017
00018 switch ((int)src.platform().bayerPattern()) {
00019 case GRBG:
00020 break;
00021 case RGGB:
00022 input = input.subImage(1, 0, Size(input.width()-2, input.height()));
00023 break;
00024 case BGGR:
00025 input = input.subImage(0, 1, Size(input.width(), input.height()-2));
00026 break;
00027 case GBRG:
00028 input = input.subImage(1, 1, Size(input.width()-2, input.height()-2));
00029 default:
00030 error(Event::DemosaicError, "Can't demosaic from a non-bayer sensor\n");
00031 return Image();
00032 }
00033
00034 int rawWidth = input.width();
00035 int rawHeight = input.height();
00036
00037 const int VEC_WIDTH = ((BLOCK_WIDTH + 8)/8);
00038 const int VEC_HEIGHT = ((BLOCK_HEIGHT + 8)/2);
00039
00040 int rawPixelsPerRow = input.bytesPerRow()/2 ;
00041
00042 int outWidth = rawWidth-8;
00043 int outHeight = rawHeight-8;
00044 outWidth /= BLOCK_WIDTH;
00045 outWidth *= BLOCK_WIDTH;
00046 outHeight /= BLOCK_HEIGHT;
00047 outHeight *= BLOCK_HEIGHT;
00048
00049 Image out(outWidth, outHeight, RGB24);
00050
00051
00052 if (((input.width() - 8) != (unsigned)outWidth) ||
00053 ((input.height() - 8) != (unsigned)outHeight)) {
00054 int offX = (input.width() - 8 - outWidth)/2;
00055 int offY = (input.height() - 8 - outHeight)/2;
00056 offX -= offX&1;
00057 offY -= offY&1;
00058
00059 if (offX || offY) {
00060 input = input.subImage(offX, offY, Size(outWidth+8, outHeight+8));
00061 }
00062 }
00063
00064 Time startTime = Time::now();
00065
00066
00067 float colorMatrix_f[12];
00068
00069
00070 if (src.shot().colorMatrix().size() == 12) {
00071 for (int i = 0; i < 12; i++) {
00072 colorMatrix_f[i] = src.shot().colorMatrix()[i];
00073 }
00074 } else {
00075
00076 src.platform().rawToRGBColorMatrix(src.shot().whiteBalance, colorMatrix_f);
00077 }
00078
00079 int16x4_t colorMatrix[3];
00080 for (int i = 0; i < 3; i++) {
00081 int16_t val = (int16_t)(colorMatrix_f[i*4+0] * 256 + 0.5);
00082 colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 0);
00083 val = (int16_t)(colorMatrix_f[i*4+1] * 256 + 0.5);
00084 colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 1);
00085 val = (int16_t)(colorMatrix_f[i*4+2] * 256 + 0.5);
00086 colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 2);
00087 val = (int16_t)(colorMatrix_f[i*4+3] * 256 + 0.5);
00088 colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 3);
00089 }
00090
00091
00092
00093 uint16_t out16[BLOCK_WIDTH*BLOCK_HEIGHT*3];
00094
00095
00096
00097 int16_t scratch[VEC_WIDTH*VEC_HEIGHT*4*12];
00098
00099 #define R_R_OFF (VEC_WIDTH*VEC_HEIGHT*4*0)
00100 #define R_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*1)
00101 #define R_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*2)
00102 #define R_B_OFF (VEC_WIDTH*VEC_HEIGHT*4*3)
00103
00104 #define G_R_OFF (VEC_WIDTH*VEC_HEIGHT*4*4)
00105 #define G_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*5)
00106 #define G_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*6)
00107 #define G_B_OFF (VEC_WIDTH*VEC_HEIGHT*4*7)
00108
00109 #define B_R_OFF (VEC_WIDTH*VEC_HEIGHT*4*8)
00110 #define B_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*9)
00111 #define B_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*10)
00112 #define B_B_OFF (VEC_WIDTH*VEC_HEIGHT*4*11)
00113
00114 #define R_R(i) (scratch+(i)+R_R_OFF)
00115 #define R_GR(i) (scratch+(i)+R_GR_OFF)
00116 #define R_GB(i) (scratch+(i)+R_GB_OFF)
00117 #define R_B(i) (scratch+(i)+R_B_OFF)
00118
00119 #define G_R(i) (scratch+(i)+G_R_OFF)
00120 #define G_GR(i) (scratch+(i)+G_GR_OFF)
00121 #define G_GB(i) (scratch+(i)+G_GB_OFF)
00122 #define G_B(i) (scratch+(i)+G_B_OFF)
00123
00124 #define B_R(i) (scratch+(i)+B_R_OFF)
00125 #define B_GR(i) (scratch+(i)+B_GR_OFF)
00126 #define B_GB(i) (scratch+(i)+B_GB_OFF)
00127 #define B_B(i) (scratch+(i)+B_B_OFF)
00128
00129
00130 #define G_GR_NOISY B_GR
00131 #define B_B_NOISY G_B
00132 #define R_R_NOISY G_R
00133 #define G_GB_NOISY B_GB
00134
00135
00136 unsigned char lut[4096];
00137 makeLUT(src, contrast, blackLevel, gamma, lut);
00138
00139
00140 for (int by = 0; by < rawHeight-8-BLOCK_HEIGHT+1; by += BLOCK_HEIGHT) {
00141 const short *__restrict__ blockPtr = (const short *)input(0,by);
00142 unsigned char *__restrict__ outBlockPtr = out(0, by);
00143 for (int bx = 0; bx < rawWidth-8-BLOCK_WIDTH+1; bx += BLOCK_WIDTH) {
00144
00145
00146 if (1) {
00147 register const int16_t *__restrict__ rawPtr = blockPtr;
00148 register const int16_t *__restrict__ rawPtr2 = blockPtr + rawPixelsPerRow;
00149
00150 register const int rawJump = rawPixelsPerRow*2 - VEC_WIDTH*8;
00151
00152 register int16_t *__restrict__ g_gr_ptr = denoise ? G_GR_NOISY(0) : G_GR(0);
00153 register int16_t *__restrict__ r_r_ptr = denoise ? R_R_NOISY(0) : R_R(0);
00154 register int16_t *__restrict__ b_b_ptr = denoise ? B_B_NOISY(0) : B_B(0);
00155 register int16_t *__restrict__ g_gb_ptr = denoise ? G_GB_NOISY(0) : G_GB(0);
00156
00157 for (int y = 0; y < VEC_HEIGHT; y++) {
00158 for (int x = 0; x < VEC_WIDTH/2; x++) {
00159
00160 asm volatile("# Stage 1) Demux\n");
00161
00162
00163
00164
00165 asm volatile(
00166 "vld2.16 {d6-d9}, [%[rawPtr]]! \n\t"
00167 "vld2.16 {d10-d13}, [%[rawPtr2]]! \n\t"
00168 "vst1.16 {d6-d7}, [%[g_gr_ptr]]! \n\t"
00169 "vst1.16 {d8-d9}, [%[r_r_ptr]]! \n\t"
00170 "vst1.16 {d10-d11}, [%[b_b_ptr]]! \n\t"
00171 "vst1.16 {d12-d13}, [%[g_gb_ptr]]! \n\t" :
00172 [rawPtr]"+r"(rawPtr),
00173 [rawPtr2]"+r"(rawPtr2),
00174 [g_gr_ptr]"+r"(g_gr_ptr),
00175 [r_r_ptr]"+r"(r_r_ptr),
00176 [b_b_ptr]"+r"(b_b_ptr),
00177 [g_gb_ptr]"+r"(g_gb_ptr) ::
00178 "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "memory");
00179
00180 }
00181
00182 rawPtr += rawJump;
00183 rawPtr2 += rawJump;
00184 }
00185 }
00186
00187
00188
00189
00190
00191 if (denoise) {
00192 register int16_t *__restrict__ ptr_in = NULL;
00193 register int16_t *__restrict__ ptr_out = NULL;
00194 asm volatile("#Stage 1.5: Denoise\n\t");
00195 for (int b=0; b<4; b++) {
00196 if (b==0) { ptr_in = G_GR_NOISY(0); }
00197 if (b==1) { ptr_in = R_R_NOISY(0); }
00198 if (b==2) { ptr_in = B_B_NOISY(0); }
00199 if (b==3) { ptr_in = G_GB_NOISY(0); }
00200 if (b==0) { ptr_out = G_GR(0); }
00201 if (b==1) { ptr_out = R_R(0); }
00202 if (b==2) { ptr_out = B_B(0); }
00203 if (b==3) { ptr_out = G_GB(0); }
00204
00205
00206 for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00207 int16x8_t in = vld1q_s16(ptr_in);
00208 vst1q_s16(ptr_out, in);
00209 ptr_in+=8;
00210 ptr_out+=8;
00211 }
00212
00213 for (int y = 1; y < VEC_HEIGHT - 1; y++) {
00214 for (int x = 0; x < VEC_WIDTH/2; x++) {
00215 int16x8_t here = vld1q_s16(ptr_in);
00216 int16x8_t above = vld1q_s16(ptr_in + VEC_WIDTH*4);
00217 int16x8_t under = vld1q_s16(ptr_in - VEC_WIDTH*4);
00218 int16x8_t right = vld1q_s16(ptr_in + 1);
00219 int16x8_t left = vld1q_s16(ptr_in - 1);
00220 int16x8_t max, min;
00221
00222
00223 max = vmaxq_s16(left, right);
00224 max = vmaxq_s16(above, max);
00225 max = vmaxq_s16(under, max);
00226
00227 min = vminq_s16(left, right);
00228 min = vminq_s16(above, min);
00229 min = vminq_s16(under, min);
00230
00231
00232 here = vminq_s16(max, here);
00233 here = vmaxq_s16(min, here);
00234
00235 vst1q_s16(ptr_out, here);
00236 ptr_in += 8;
00237 ptr_out += 8;
00238 }
00239 }
00240
00241
00242 for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00243 int16x8_t in = vld1q_s16(ptr_in);
00244 vst1q_s16(ptr_out, in);
00245 ptr_in+=8;
00246 ptr_out+=8;
00247 }
00248 }
00249 }
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267 if (1) {
00268
00269 int i = VEC_WIDTH*4;
00270
00271 register int16_t *g_gb_up_ptr = G_GB(i) - VEC_WIDTH*4;
00272 register int16_t *g_gb_here_ptr = G_GB(i);
00273 register int16_t *g_gb_left_ptr = G_GB(i) - 1;
00274 register int16_t *g_gr_down_ptr = G_GR(i) + VEC_WIDTH*4;
00275 register int16_t *g_gr_here_ptr = G_GR(i);
00276 register int16_t *g_gr_right_ptr = G_GR(i) + 1;
00277 register int16_t *g_r_ptr = G_R(i);
00278 register int16_t *g_b_ptr = G_B(i);
00279
00280 for (int y = 1; y < VEC_HEIGHT-1; y++) {
00281 for (int x = 0; x < VEC_WIDTH/2; x++) {
00282
00283 asm volatile("#Stage 2) Green interpolation\n");
00284
00285
00286
00287 int16x8_t gb_up = vld1q_s16(g_gb_up_ptr);
00288 g_gb_up_ptr+=8;
00289 int16x8_t gb_here = vld1q_s16(g_gb_here_ptr);
00290 g_gb_here_ptr+=8;
00291 int16x8_t gb_left = vld1q_s16(g_gb_left_ptr);
00292 g_gb_left_ptr+=8;
00293 int16x8_t gr_down = vld1q_s16(g_gr_down_ptr);
00294 g_gr_down_ptr+=8;
00295 int16x8_t gr_here = vld1q_s16(g_gr_here_ptr);
00296 g_gr_here_ptr+=8;
00297 int16x8_t gr_right = vld1q_s16(g_gr_right_ptr);
00298 g_gr_right_ptr+=8;
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330 int16x8_t gv_r = vhaddq_s16(gb_up, gb_here);
00331 int16x8_t gvd_r = vabdq_s16(gb_up, gb_here);
00332 int16x8_t gh_r = vhaddq_s16(gr_right, gr_here);
00333 int16x8_t ghd_r = vabdq_s16(gr_here, gr_right);
00334 int16x8_t g_r = vbslq_s16(vcltq_s16(ghd_r, gvd_r), gh_r, gv_r);
00335
00336 int16x8_t gv_b = vhaddq_s16(gr_down, gr_here);
00337 int16x8_t gvd_b = vabdq_s16(gr_down, gr_here);
00338 int16x8_t gh_b = vhaddq_s16(gb_left, gb_here);
00339 int16x8_t ghd_b = vabdq_s16(gb_left, gb_here);
00340 int16x8_t g_b = vbslq_s16(vcltq_s16(ghd_b, gvd_b), gh_b, gv_b);
00341
00342
00343
00344
00345
00346
00347
00348
00349
00350
00351
00352
00353
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369
00370 vst1q_s16(g_r_ptr, g_r);
00371 g_r_ptr+=8;
00372 vst1q_s16(g_b_ptr, g_b);
00373 g_b_ptr+=8;
00374 }
00375 }
00376 }
00377 asm volatile("#End of stage 2 (green interpolation)\n");
00378
00379
00380 if (1) {
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401 int i = 2*VEC_WIDTH*4;
00402
00403 for (int y = 2; y < VEC_HEIGHT-2; y++) {
00404 for (int x = 0; x < VEC_WIDTH; x++) {
00405
00406 asm volatile("#Stage 4) r/b interpolation\n");
00407
00408
00409 int16x4_t r_here = vld1_s16(R_R(i));
00410 int16x4_t r_left = vld1_s16(R_R(i) - 1);
00411 int16x4_t r_down = vld1_s16(R_R(i) + VEC_WIDTH*4);
00412
00413 int16x4_t g_r_left = vld1_s16(G_R(i) - 1);
00414 int16x4_t g_r_here = vld1_s16(G_R(i));
00415 int16x4_t g_r_down = vld1_s16(G_R(i) + VEC_WIDTH*4);
00416
00417 int16x4_t b_up = vld1_s16(B_B(i) - VEC_WIDTH*4);
00418 int16x4_t b_here = vld1_s16(B_B(i));
00419 int16x4_t b_right = vld1_s16(B_B(i) + 1);
00420
00421 int16x4_t g_b_up = vld1_s16(G_B(i) - VEC_WIDTH*4);
00422 int16x4_t g_b_here = vld1_s16(G_B(i));
00423 int16x4_t g_b_right = vld1_s16(G_B(i) + 1);
00424
00425
00426 int16x4_t gr_here = vld1_s16(G_GR(i));
00427 int16x4_t gb_here = vld1_s16(G_GB(i));
00428
00429 {
00430
00431 int16x4_t r_gr = vadd_s16(vhadd_s16(r_left, r_here),
00432 vsub_s16(gr_here,
00433 vhadd_s16(g_r_left, g_r_here)));
00434 int16x4_t r_gb = vadd_s16(vhadd_s16(r_here, r_down),
00435 vsub_s16(gb_here,
00436 vhadd_s16(g_r_down, g_r_here)));
00437 vst1_s16(R_GR(i), r_gr);
00438 vst1_s16(R_GB(i), r_gb);
00439 }
00440
00441 {
00442
00443 int16x4_t r_downleft = vld1_s16(R_R(i) + VEC_WIDTH*4 - 1);
00444 int16x4_t g_r_downleft = vld1_s16(G_R(i) + VEC_WIDTH*4 - 1);
00445
00446 int16x4_t rp_b = vadd_s16(vhadd_s16(r_downleft, r_here),
00447 vsub_s16(g_b_here,
00448 vhadd_s16(g_r_downleft, g_r_here)));
00449 int16x4_t rn_b = vadd_s16(vhadd_s16(r_left, r_down),
00450 vsub_s16(g_b_here,
00451 vhadd_s16(g_r_left, g_r_down)));
00452 int16x4_t rpd_b = vabd_s16(r_downleft, r_here);
00453 int16x4_t rnd_b = vabd_s16(r_left, r_down);
00454 int16x4_t r_b = vbsl_s16(vclt_s16(rpd_b, rnd_b), rp_b, rn_b);
00455 vst1_s16(R_B(i), r_b);
00456 }
00457
00458 {
00459
00460 int16x4_t b_gr = vadd_s16(vhadd_s16(b_up, b_here),
00461 vsub_s16(gr_here,
00462 vhadd_s16(g_b_up, g_b_here)));
00463 int16x4_t b_gb = vadd_s16(vhadd_s16(b_here, b_right),
00464 vsub_s16(gb_here,
00465 vhadd_s16(g_b_right, g_b_here)));
00466 vst1_s16(B_GR(i), b_gr);
00467 vst1_s16(B_GB(i), b_gb);
00468 }
00469
00470 {
00471
00472 int16x4_t b_upright = vld1_s16(B_B(i) - VEC_WIDTH*4 + 1);
00473 int16x4_t g_b_upright = vld1_s16(G_B(i) - VEC_WIDTH*4 + 1);
00474
00475 int16x4_t bp_r = vadd_s16(vhadd_s16(b_upright, b_here),
00476 vsub_s16(g_r_here,
00477 vhadd_s16(g_b_upright, g_b_here)));
00478 int16x4_t bn_r = vadd_s16(vhadd_s16(b_right, b_up),
00479 vsub_s16(g_r_here,
00480 vhadd_s16(g_b_right, g_b_up)));
00481 int16x4_t bpd_r = vabd_s16(b_upright, b_here);
00482 int16x4_t bnd_r = vabd_s16(b_right, b_up);
00483 int16x4_t b_r = vbsl_s16(vclt_s16(bpd_r, bnd_r), bp_r, bn_r);
00484 vst1_s16(B_R(i), b_r);
00485 }
00486
00487
00488 i += 4;
00489 }
00490 }
00491 asm volatile("#End of stage 4 - what_ever\n\t");
00492 }
00493
00494
00495 if (1) {
00496
00497
00498 asm volatile("#Stage 10) Color Correction\n");
00499
00500 uint16_t *__restrict__ out16Ptr = out16;
00501
00502 int i = 2*VEC_WIDTH*4;
00503
00504 const uint16x4_t bound = vdup_n_u16(1023);
00505
00506 for (int y = 2; y < VEC_HEIGHT-2; y++) {
00507
00508
00509
00510 int16x4x2_t r0 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00511 int16x4x2_t g0 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00512 int16x4x2_t b0 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00513 i += 4;
00514
00515 for (int x = 1; x < VEC_WIDTH; x++) {
00516
00517 int16x4x2_t r1 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00518 int16x4x2_t g1 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00519 int16x4x2_t b1 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00520
00521
00522 int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00523 rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00524 rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00525 rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00526
00527 int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00528 gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00529 gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00530 gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00531
00532 int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00533 bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00534 bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00535 bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00536
00537 uint16x4x3_t col16;
00538 col16.val[0] = vqrshrun_n_s32(rout, 8);
00539 col16.val[1] = vqrshrun_n_s32(gout, 8);
00540 col16.val[2] = vqrshrun_n_s32(bout, 8);
00541 col16.val[0] = vmin_u16(col16.val[0], bound);
00542 col16.val[1] = vmin_u16(col16.val[1], bound);
00543 col16.val[2] = vmin_u16(col16.val[2], bound);
00544 vst3_u16(out16Ptr, col16);
00545 out16Ptr += 12;
00546
00547 rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00548 rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00549 rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00550 rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00551
00552 gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00553 gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00554 gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00555 gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00556
00557 bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00558 bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00559 bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00560 bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00561
00562 col16.val[0] = vqrshrun_n_s32(rout, 8);
00563 col16.val[1] = vqrshrun_n_s32(gout, 8);
00564 col16.val[2] = vqrshrun_n_s32(bout, 8);
00565 col16.val[0] = vmin_u16(col16.val[0], bound);
00566 col16.val[1] = vmin_u16(col16.val[1], bound);
00567 col16.val[2] = vmin_u16(col16.val[2], bound);
00568 vst3_u16(out16Ptr, col16);
00569 out16Ptr += 12;
00570
00571 r0 = r1;
00572 g0 = g1;
00573 b0 = b1;
00574
00575 i += 4;
00576 }
00577
00578
00579 i -= VEC_WIDTH*4;
00580
00581 r0 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00582 g0 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00583 b0 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));
00584 i += 4;
00585
00586 for (int x = 1; x < VEC_WIDTH; x++) {
00587 int16x4x2_t r1 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00588 int16x4x2_t g1 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00589 int16x4x2_t b1 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));
00590
00591
00592 int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00593 rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00594 rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00595 rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00596
00597 int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00598 gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00599 gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00600 gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00601
00602 int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00603 bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00604 bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00605 bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00606
00607 uint16x4x3_t col16;
00608 col16.val[0] = vqrshrun_n_s32(rout, 8);
00609 col16.val[1] = vqrshrun_n_s32(gout, 8);
00610 col16.val[2] = vqrshrun_n_s32(bout, 8);
00611 col16.val[0] = vmin_u16(col16.val[0], bound);
00612 col16.val[1] = vmin_u16(col16.val[1], bound);
00613 col16.val[2] = vmin_u16(col16.val[2], bound);
00614 vst3_u16(out16Ptr, col16);
00615 out16Ptr += 12;
00616
00617 rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));
00618 rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00619 rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00620 rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00621
00622 gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));
00623 gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00624 gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00625 gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00626
00627 bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00628 bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00629 bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00630 bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00631
00632 col16.val[0] = vqrshrun_n_s32(rout, 8);
00633 col16.val[1] = vqrshrun_n_s32(gout, 8);
00634 col16.val[2] = vqrshrun_n_s32(bout, 8);
00635 col16.val[0] = vmin_u16(col16.val[0], bound);
00636 col16.val[1] = vmin_u16(col16.val[1], bound);
00637 col16.val[2] = vmin_u16(col16.val[2], bound);
00638 vst3_u16(out16Ptr, col16);
00639 out16Ptr += 12;
00640
00641 r0 = r1;
00642 g0 = g1;
00643 b0 = b1;
00644
00645 i += 4;
00646 }
00647 }
00648 asm volatile("#End of stage 10) - color correction\n\t");
00649 }
00650
00651
00652 if (1) {
00653
00654 asm volatile("#Gamma Correction\n");
00655
00656 const uint16_t *__restrict__ out16Ptr = out16;
00657
00658 for (int y = 0; y < BLOCK_HEIGHT; y++) {
00659 unsigned int *__restrict__ outPtr32 = (unsigned int *)(outBlockPtr + y * outWidth * 3);
00660 for (int x = 0; x < (BLOCK_WIDTH*3)/4; x++) {
00661 unsigned val = ((lut[out16Ptr[0]] << 0) |
00662 (lut[out16Ptr[1]] << 8) |
00663 (lut[out16Ptr[2]] << 16) |
00664 (lut[out16Ptr[3]] << 24));
00665 *outPtr32++ = val;
00666 out16Ptr += 4;
00667
00668 }
00669 }
00670 asm volatile("#end of Gamma Correction\n");
00671
00672
00673
00674
00675
00676
00677
00678
00679
00680
00681
00682 }
00683
00684
00685 blockPtr += BLOCK_WIDTH;
00686 outBlockPtr += BLOCK_WIDTH*3;
00687 }
00688 }
00689
00690
00691 return out;
00692 }
00693
00694 Image makeThumbnailRAW_ARM(Frame src, float contrast, int blackLevel, float gamma) {
00695
00696
00697
00698
00699
00700
00701
00702
00703
00704
00705
00706
00707
00708 Image thumb(640, 480, RGB24);
00709 const unsigned int w = 2592, tw = 640;
00710 const unsigned int h = 1968, th = 480;
00711 const unsigned int scale = 4;
00712 const unsigned int cw = tw*scale;
00713 const unsigned int ch = th*scale;
00714 const unsigned int startX = (w-cw)/2;
00715 const unsigned int startY = (h-ch)/2;
00716 const unsigned int bytesPerRow = src.image().bytesPerRow();
00717
00718
00719 unsigned char lut[4096];
00720 makeLUT(src, contrast, blackLevel, gamma, lut);
00721
00722 unsigned char *row = src.image()(startX, startY);
00723
00724 Time startTime = Time::now();
00725 float colorMatrix_f[12];
00726
00727
00728 if (src.shot().colorMatrix().size() == 12) {
00729 for (int i = 0; i < 12; i++) {
00730 colorMatrix_f[i] = src.shot().colorMatrix()[i];
00731 }
00732 printf("Making thumbnail with custom WB\n");
00733 } else {
00734
00735 src.platform().rawToRGBColorMatrix(src.shot().whiteBalance, colorMatrix_f);
00736 printf("Making thumbnail with platform WB\n");
00737 }
00738
00739 register int16x4_t colorMatrix0 asm("d0");
00740 register int16x4_t colorMatrix1 asm("d1");
00741 register int16x4_t colorMatrix2 asm("d2");
00742 register int16x4_t wCoord asm("d20");
00743 register int16x4_t maxValue asm("d21");
00744 register int16x4_t minValue asm("d22");
00745
00746 asm volatile(
00747
00748 "vldm %[colorMatrix_f], {q2,q3,q4} \n\t"
00749 "vcvt.s32.f32 q2, q2, #8 \n\t"
00750 "vcvt.s32.f32 q3, q3, #8 \n\t"
00751 "vcvt.s32.f32 q4, q4, #8 \n\t"
00752 "vmovn.i32 d0, q2 \n\t"
00753 "vmovn.i32 d1, q3 \n\t"
00754 "vmovn.i32 d2, q4 \n\t"
00755
00756 "vmov.i16 d20, #0x4 \n\t"
00757 "vmov.i16 d21, #0x00FF \n\t"
00758 "vorr.i16 d21, #0x0300 \n\t"
00759 "vmov.i16 d22, #0x0 \n\t"
00760 : [colorMatrix0] "=w"(colorMatrix0),
00761 [colorMatrix1] "=w"(colorMatrix1),
00762 [colorMatrix2] "=w"(colorMatrix2),
00763 [wCoord] "=w"(wCoord),
00764 [maxValue] "=w"(maxValue),
00765 [minValue] "=w"(minValue)
00766 : [colorMatrix_f] "r"(colorMatrix_f)
00767 : "memory",
00768 "d3", "d4", "d5", "d6", "d7", "d8", "d9");
00769
00770 for (unsigned int ty = 0; ty <480; ty++, row+=4*bytesPerRow) {
00771 register unsigned short *px0 = (unsigned short *)row;
00772 register unsigned short *px1 = (unsigned short *)(row+1*bytesPerRow);
00773 register unsigned short *px2 = (unsigned short *)(row+2*bytesPerRow);
00774 register unsigned short *px3 = (unsigned short *)(row+3*bytesPerRow);
00775
00776 register unsigned char *dst = thumb(0,ty);
00777 for (register unsigned int tx =0; tx < 640; tx+=scale) {
00778
00779 asm volatile(
00780
00781
00782
00783
00785 "vld2.16 {d4-d7}, [%[px0]]! \n\t"
00786 "vld2.16 {d8-d11}, [%[px1]]! \n\t"
00787 "vld2.16 {d12-d15}, [%[px2]]! \n\t"
00788 "vld2.16 {d16-d19}, [%[px3]]! \n\t"
00789
00790
00791
00792
00793
00794
00795
00796
00797
00799 "vpadd.u16 d4, d4, d5 \n\t"
00800 "vpadd.u16 d5, d6, d7 \n\t"
00801 "vpadd.u16 d6, d8, d9 \n\t"
00802 "vpadd.u16 d7, d10, d11 \n\t"
00803 "vpadd.u16 d8, d12, d13 \n\t"
00804 "vpadd.u16 d9, d14, d15 \n\t"
00805 "vpadd.u16 d10, d16, d17 \n\t"
00806 "vpadd.u16 d11, d18, d19 \n\t"
00807
00808
00809
00810
00811
00813 "vadd.u16 d7, d8 \n\t"
00814 "vadd.u16 d4, d11 \n\t"
00815 "vhadd.u16 d4, d7 \n\t"
00817 "vadd.u16 d5, d9 \n\t"
00819 "vadd.u16 d6, d10 \n\t"
00820
00821
00822
00823
00824
00825
00826
00827
00828
00830
00831 "vmull.s16 q4, d5, d0[0] \n\t"
00832 "vmlal.s16 q4, d4, d0[1] \n\t"
00833 "vmlal.s16 q4, d6, d0[2] \n\t"
00834 "vmlal.s16 q4, d20, d0[3] \n\t"
00835
00836 "vmull.s16 q5, d5, d1[0] \n\t"
00837 "vmlal.s16 q5, d4, d1[1] \n\t"
00838 "vmlal.s16 q5, d6, d1[2] \n\t"
00839 "vmlal.s16 q5, d20, d1[3] \n\t"
00840
00841 "vmull.s16 q6, d5, d2[0] \n\t"
00842 "vmlal.s16 q6, d4, d2[1] \n\t"
00843 "vmlal.s16 q6, d6, d2[2] \n\t"
00844 "vmlal.s16 q6, d20, d2[3] \n\t"
00845
00846
00847
00848
00850 "vrshrn.s32 d3, q4, #10 \n\t"
00851 "vrshrn.s32 d4, q5, #10 \n\t"
00852 "vrshrn.s32 d5, q6, #10 \n\t"
00854 "vmin.s16 d3, d3, d21 \n\t"
00855 "vmin.s16 d4, d4, d21 \n\t"
00856 "vmin.s16 d5, d5, d21 \n\t"
00857 "vmax.s16 d3, d3, d22 \n\t"
00858 "vmax.s16 d4, d4, d22 \n\t"
00859 "vmax.s16 d5, d5, d22 \n\t"
00860
00861
00862
00863
00865 "vmov r0,r1, d3 \n\t"
00866
00867
00868 "uxth r2, r0 \n\t"
00869 "ldrb r4, [%[gammaTable], r2] \n\t"
00870
00871 "uxth r2, r0, ROR #16 \n\t"
00872 "ldrb r3, [%[gammaTable], r2] \n\t"
00873 "orr r4, r4, r3, LSL #24 \n\t"
00874
00875 "uxth r2, r1 \n\t"
00876 "ldrb r3, [%[gammaTable], r2] \n\t"
00877 "mov r5, r3, LSL #16 \n\t"
00878
00879 "uxth r2, r1, ROR #16 \n\t"
00880 "ldrb r3, [%[gammaTable], r2] \n\t"
00881 "mov r6, r3, LSL #8 \n\t"
00882
00883
00884
00885
00886 "vmov r0,r1, d4 \n\t"
00887
00888
00889 "uxth r2, r0 \n\t"
00890 "ldrb r3, [%[gammaTable], r2] \n\t"
00891 "orr r4, r4, r3, LSL #8 \n\t"
00892
00893 "uxth r2, r0, ROR #16 \n\t"
00894 "ldrb r3, [%[gammaTable], r2] \n\t"
00895 "orr r5, r5, r3 \n\t"
00896
00897 "uxth r2, r1 \n\t"
00898 "ldrb r3, [%[gammaTable], r2] \n\t"
00899 "orr r5, r5, r3, LSL #24 \n\t"
00900
00901 "uxth r2, r1, ROR #16 \n\t"
00902 "ldrb r3, [%[gammaTable], r2] \n\t"
00903 "orr r6, r6, r3, LSL #16 \n\t"
00904
00905
00906
00907
00908 "vmov r0,r1, d5 \n\t"
00909
00910
00911 "uxth r2, r0 \n\t"
00912 "ldrb r3, [%[gammaTable], r2] \n\t"
00913 "orr r4, r4, r3, LSL #16 \n\t"
00914
00915 "uxth r2, r0, ROR #16 \n\t"
00916 "ldrb r3, [%[gammaTable], r2] \n\t"
00917 "orr r5, r5, r3, LSL #8 \n\t"
00918
00919 "uxth r2, r1 \n\t"
00920 "ldrb r3, [%[gammaTable], r2] \n\t"
00921 "orr r6, r6, r3 \n\t"
00922
00923 "uxth r2, r1, ROR #16 \n\t"
00924 "ldrb r3, [%[gammaTable], r2] \n\t"
00925 "orr r6, r6, r3, LSL #24 \n\t"
00926
00927
00928
00929
00930 "stm %[dst]!, {r4,r5,r6} \n\t"
00931 : [px0] "+&r"(px0),
00932 [px1] "+&r"(px1),
00933 [px2] "+&r"(px2),
00934 [px3] "+&r"(px3),
00935 [dst] "+&r"(dst)
00936 : [gammaTable] "r"(lut),
00937 [colorMatrix0] "w"(colorMatrix0),
00938 [colorMatrix1] "w"(colorMatrix1),
00939 [colorMatrix2] "w"(colorMatrix2),
00940 [wCoord] "w"(wCoord),
00941 [maxValue] "w"(maxValue),
00942 [minValue] "w"(minValue)
00943 : "memory",
00944 "r0", "r1", "r2", "r3", "r4", "r5", "r6",
00945 "d3", "d4", "d5", "d6",
00946 "d7", "d8", "d9", "d10",
00947 "d11", "d12", "d13", "d14",
00948 "d15", "d16", "d17", "d18", "d19"
00949 );
00950
00951 }
00952 }
00953
00954
00955
00956 return thumb;
00957 }
00958 }
00959
00960
00961 #endif