MagickCore  6.9.6
accelerate-private.h
Go to the documentation of this file.
1 /*
2  Copyright 1999-2016 ImageMagick Studio LLC, a non-profit organization
3  dedicated to making software imaging solutions freely available.
4 
5  You may not use this file except in compliance with the License.
6  obtain a copy of the License at
7 
8  http://www.imagemagick.org/script/license.php
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  MagickCore private methods for accelerated functions.
17 */
18 
19 #ifndef MAGICKCORE_ACCELERATE_PRIVATE_H
20 #define MAGICKCORE_ACCELERATE_PRIVATE_H
21 
22 #if defined(__cplusplus) || defined(c_plusplus)
23 extern "C" {
24 #endif
25 
26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
27 
28 /*
29  Define declarations.
30 */
31 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
32 #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
33 #define OPENCL_ELSE() "\n #""else " " \n"
34 #define OPENCL_ENDIF() "\n #""endif " " \n"
35 #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
37 
38 /*
39  Typedef declarations.
40 */
41 
42 typedef struct _FloatPixelPacket
43 {
44 #ifdef MAGICK_PIXEL_RGBA
46  red,
47  green,
48  blue,
49  opacity;
50 #endif
51 #ifdef MAGICK_PIXEL_BGRA
53  blue,
54  green,
55  red,
56  opacity;
57 #endif
58 } FloatPixelPacket;
59 
60 const char* accelerateKernels =
61 
62 /*
63  Define declarations.
64 */
65  OPENCL_DEFINE(GetPixelAlpha(pixel),(QuantumRange-(pixel).w))
66  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
67  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
68  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
69  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
70  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
71  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
72  OPENCL_DEFINE(SigmaRandom, (attenuate))
73  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
74  OPENCL_DEFINE(MagickMax(x, y), (((x) > (y)) ? (x) : (y)))
75  OPENCL_DEFINE(MagickMin(x, y), (((x) < (y)) ? (x) : (y)))
76 
77 /*
78  Typedef declarations.
79 */
80  STRINGIFY(
81  typedef enum
82  {
84  RGBColorspace, /* Linear RGB colorspace */
85  GRAYColorspace, /* greyscale (linear) image (faked 1 channel) */
95  CMYKColorspace, /* negared linear RGB with black separated */
96  sRGBColorspace, /* Default: non-lienar sRGB colorspace */
105  CMYColorspace, /* negated linear RGB colorspace */
108  LCHColorspace, /* alias for LCHuv */
110  LCHabColorspace, /* Cylindrical (Polar) Lab */
111  LCHuvColorspace, /* Cylindrical (Polar) Luv */
114  HSVColorspace, /* alias for HSB */
117  } ColorspaceType;
118  )
119 
120  STRINGIFY(
121  typedef enum
122  {
178  /* These are new operators, added after the above was last sorted.
179  * The list should be re-sorted only when a new library version is
180  * created.
181  */
196  )
197 
198  STRINGIFY(
199  typedef enum
200  {
206  } MagickFunction;
207  )
208 
209  STRINGIFY(
210  typedef enum
211  {
213  UniformNoise,
216  ImpulseNoise,
218  PoissonNoise,
220  } NoiseType;
221  )
222 
223  STRINGIFY(
224  typedef enum
225  {
237  )
238 
239  STRINGIFY(
240  typedef enum {
258  )
259 
260  STRINGIFY(
261  typedef enum
262  {
264  RedChannel = 0x0001,
265  GrayChannel = 0x0001,
266  CyanChannel = 0x0001,
267  GreenChannel = 0x0002,
268  MagentaChannel = 0x0002,
269  BlueChannel = 0x0004,
270  YellowChannel = 0x0004,
271  AlphaChannel = 0x0008,
272  OpacityChannel = 0x0008,
273  MatteChannel = 0x0008, /* deprecated */
274  BlackChannel = 0x0020,
275  IndexChannel = 0x0020,
276  CompositeChannels = 0x002F,
277  AllChannels = 0x7ffffff,
278  /*
279  Special purpose channel types.
280  */
281  TrueAlphaChannel = 0x0040, /* extract actual alpha channel from opacity */
282  RGBChannels = 0x0080, /* set alpha from grayscale mask in RGB */
283  GrayChannels = 0x0080,
284  SyncChannels = 0x0100, /* channels should be modified equally */
286  } ChannelType;
287  )
288 
289 /*
290  Helper functions.
291 */
292 
293 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
294 
295  STRINGIFY(
296  inline CLQuantum ScaleCharToQuantum(const unsigned char value)
297  {
298  return((CLQuantum) value);
299  }
300  )
301 
302 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
303 
304  STRINGIFY(
305  inline CLQuantum ScaleCharToQuantum(const unsigned char value)
306  {
307  return((CLQuantum) (257.0f*value));
308  }
309  )
310 
311 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
312 
313  STRINGIFY(
314  inline CLQuantum ScaleCharToQuantum(const unsigned char value)
315  {
316  return((CLQuantum) (16843009.0*value));
317  }
318  )
319 
320 OPENCL_ENDIF()
321 
322 STRINGIFY(
323  inline int ClampToCanvas(const int offset, const int range)
324  {
325  return clamp(offset, (int)0, range - 1);
326  }
327  )
328 
329  STRINGIFY(
330  inline int ClampToCanvasWithHalo(const int offset, const int range, const int edge, const int section)
331  {
332  return clamp(offset, section ? (int)(0 - edge) : (int)0, section ? (range - 1) : (range - 1 + edge));
333  }
334  )
335 
336  STRINGIFY(
337  inline CLQuantum ClampToQuantum(const float value)
338  {
339  return (CLQuantum)(clamp(value, 0.0f, (float)QuantumRange) + 0.5f);
340  }
341  )
342 
343  STRINGIFY(
344  inline uint ScaleQuantumToMap(CLQuantum value)
345  {
346  if (value >= (CLQuantum)MaxMap)
347  return ((uint)MaxMap);
348  else
349  return ((uint)value);
350  }
351  )
352 
353  STRINGIFY(
354  inline float PerceptibleReciprocal(const float x)
355  {
356  float sign = x < (float) 0.0 ? (float)-1.0 : (float) 1.0;
357  return((sign*x) >= MagickEpsilon ? (float) 1.0 / x : sign*((float) 1.0 / MagickEpsilon));
358  }
359  )
360 
361  STRINGIFY(
362  inline float RoundToUnity(const float value)
363  {
364  return clamp(value, 0.0f, 1.0f);
365  }
366  )
367 
368  STRINGIFY(
369 
370  inline CLQuantum getBlue(CLPixelType p) { return p.x; }
371  inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
372  inline float getBlueF4(float4 p) { return p.x; }
373  inline void setBlueF4(float4* p, float value) { (*p).x = value; }
374 
375  inline CLQuantum getGreen(CLPixelType p) { return p.y; }
376  inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
377  inline float getGreenF4(float4 p) { return p.y; }
378  inline void setGreenF4(float4* p, float value) { (*p).y = value; }
379 
380  inline CLQuantum getRed(CLPixelType p) { return p.z; }
381  inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
382  inline float getRedF4(float4 p) { return p.z; }
383  inline void setRedF4(float4* p, float value) { (*p).z = value; }
384 
385  inline CLQuantum getOpacity(CLPixelType p) { return p.w; }
386  inline void setOpacity(CLPixelType* p, CLQuantum value) { (*p).w = value; }
387  inline float getOpacityF4(float4 p) { return p.w; }
388  inline void setOpacityF4(float4* p, float value) { (*p).w = value; }
389 
390  inline void setGray(CLPixelType* p, CLQuantum value) { (*p).z = value; (*p).y = value; (*p).x = value; }
391 
392  inline float GetPixelIntensity(const int method, const int colorspace, CLPixelType p)
393  {
394  float red = getRed(p);
395  float green = getGreen(p);
396  float blue = getBlue(p);
397 
398  float intensity;
399 
400  if (colorspace == GRAYColorspace)
401  return red;
402 
403  switch (method)
404  {
406  {
407  intensity = (red + green + blue) / 3.0;
408  break;
409  }
411  {
412  intensity = MagickMax(MagickMax(red, green), blue);
413  break;
414  }
416  {
417  intensity = (MagickMin(MagickMin(red, green), blue) +
418  MagickMax(MagickMax(red, green), blue)) / 2.0;
419  break;
420  }
422  {
423  intensity = (float)(((float)red*red + green*green + blue*blue) /
424  (3.0*QuantumRange));
425  break;
426  }
428  {
429  /*
430  if (image->colorspace == RGBColorspace)
431  {
432  red=EncodePixelGamma(red);
433  green=EncodePixelGamma(green);
434  blue=EncodePixelGamma(blue);
435  }
436  */
437  intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
438  break;
439  }
441  {
442  /*
443  if (image->colorspace == sRGBColorspace)
444  {
445  red=DecodePixelGamma(red);
446  green=DecodePixelGamma(green);
447  blue=DecodePixelGamma(blue);
448  }
449  */
450  intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
451  break;
452  }
454  default:
455  {
456  /*
457  if (image->colorspace == RGBColorspace)
458  {
459  red=EncodePixelGamma(red);
460  green=EncodePixelGamma(green);
461  blue=EncodePixelGamma(blue);
462  }
463  */
464  intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
465  break;
466  }
468  {
469  /*
470  if (image->colorspace == sRGBColorspace)
471  {
472  red=DecodePixelGamma(red);
473  green=DecodePixelGamma(green);
474  blue=DecodePixelGamma(blue);
475  }
476  */
477  intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
478  break;
479  }
481  {
482  intensity = (float)(sqrt((float)red*red + green*green + blue*blue) /
483  sqrt(3.0));
484  break;
485  }
486  }
487 
488  return intensity;
489 
490  }
491  )
492 
493 /*
494 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
495 % %
496 % %
497 % %
498 % A d d N o i s e %
499 % %
500 % %
501 % %
502 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
503 */
504 
505 STRINGIFY(
506 
507 /*
508 Part of MWC64X by David Thomas, dt10@imperial.ac.uk
509 This is provided under BSD, full license is with the main package.
510 See http://www.doc.ic.ac.uk/~dt10/research
511 */
512 
513 // Pre: a<M, b<M
514 // Post: r=(a+b) mod M
515 ulong MWC_AddMod64(ulong a, ulong b, ulong M)
516 {
517  ulong v=a+b;
518  //if( (v>=M) || (v<a) )
519  if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
520  v=v-M;
521  return v;
522 }
523 
524 // Pre: a<M,b<M
525 // Post: r=(a*b) mod M
526 // This could be done more efficently, but it is portable, and should
527 // be easy to understand. It can be replaced with any of the better
528 // modular multiplication algorithms (for example if you know you have
529 // double precision available or something).
530 ulong MWC_MulMod64(ulong a, ulong b, ulong M)
531 {
532  ulong r=0;
533  while(a!=0){
534  if(a&1)
535  r=MWC_AddMod64(r,b,M);
536  b=MWC_AddMod64(b,b,M);
537  a=a>>1;
538  }
539  return r;
540 }
541 
542 
543 // Pre: a<M, e>=0
544 // Post: r=(a^b) mod M
545 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
546 // most architectures
547 ulong MWC_PowMod64(ulong a, ulong e, ulong M)
548 {
549  ulong sqr=a, acc=1;
550  while(e!=0){
551  if(e&1)
552  acc=MWC_MulMod64(acc,sqr,M);
553  sqr=MWC_MulMod64(sqr,sqr,M);
554  e=e>>1;
555  }
556  return acc;
557 }
558 
559 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
560 {
561  ulong m=MWC_PowMod64(A, distance, M);
562  ulong x=curr.x*(ulong)A+curr.y;
563  x=MWC_MulMod64(x, m, M);
564  return (uint2)((uint)(x/A), (uint)(x%A));
565 }
566 
567 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
568 {
569  // This is an arbitrary constant for starting LCG jumping from. I didn't
570  // want to start from 1, as then you end up with the two or three first values
571  // being a bit poor in ones - once you've decided that, one constant is as
572  // good as any another. There is no deep mathematical reason for it, I just
573  // generated a random number.
574  enum{ MWC_BASEID = 4077358422479273989UL };
575 
576  ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
577  ulong m=MWC_PowMod64(A, dist, M);
578 
579  ulong x=MWC_MulMod64(MWC_BASEID, m, M);
580  return (uint2)((uint)(x/A), (uint)(x%A));
581 }
582 
584 typedef struct{ uint x; uint c; } mwc64x_state_t;
585 
586 enum{ MWC64X_A = 4294883355U };
587 enum{ MWC64X_M = 18446383549859758079UL };
588 
589 void MWC64X_Step(mwc64x_state_t *s)
590 {
591  uint X=s->x, C=s->c;
592 
593  uint Xn=MWC64X_A*X+C;
594  uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
595  uint Cn=mad_hi(MWC64X_A,X,carry);
596 
597  s->x=Xn;
598  s->c=Cn;
599 }
600 
601 void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
602 {
603  uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
604  s->x=tmp.x;
605  s->c=tmp.y;
606 }
607 
608 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
609 {
610  uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
611  s->x=tmp.x;
612  s->c=tmp.y;
613 }
614 
616 uint MWC64X_NextUint(mwc64x_state_t *s)
617 {
618  uint res=s->x ^ s->c;
619  MWC64X_Step(s);
620  return res;
621 }
622 
623 //
624 // End of MWC64X excerpt
625 //
626 
627  float mwcReadPseudoRandomValue(mwc64x_state_t* rng) {
628  return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
629  }
630 
631 
632  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {
633 
634  float
635  alpha,
636  beta,
637  noise,
638  sigma;
639 
640  noise = 0.0f;
641  alpha=mwcReadPseudoRandomValue(r);
642  switch(noise_type) {
643  case UniformNoise:
644  default:
645  {
646  noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
647  break;
648  }
649  case GaussianNoise:
650  {
651  float
652  gamma,
653  tau;
654 
655  if (alpha == 0.0f)
656  alpha=1.0f;
657  beta=mwcReadPseudoRandomValue(r);
658  gamma=sqrt(-2.0f*log(alpha));
659  sigma=gamma*cospi((2.0f*beta));
660  tau=gamma*sinpi((2.0f*beta));
661  noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+
663  break;
664  }
665 
666 
667  case ImpulseNoise:
668  {
669  if (alpha < (SigmaImpulse/2.0f))
670  noise=0.0f;
671  else
672  if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
673  noise=(float)QuantumRange;
674  else
675  noise=(float)pixel;
676  break;
677  }
678  case LaplacianNoise:
679  {
680  if (alpha <= 0.5f)
681  {
682  if (alpha <= MagickEpsilon)
683  noise=(float) (pixel-QuantumRange);
684  else
685  noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
686  0.5f);
687  break;
688  }
689  beta=1.0f-alpha;
690  if (beta <= (0.5f*MagickEpsilon))
691  noise=(float) (pixel+QuantumRange);
692  else
693  noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
694  break;
695  }
697  {
698  sigma=1.0f;
699  if (alpha > MagickEpsilon)
700  sigma=sqrt(-2.0f*log(alpha));
701  beta=mwcReadPseudoRandomValue(r);
702  noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma*
703  cospi((float) (2.0f*beta))/2.0f);
704  break;
705  }
706  case PoissonNoise:
707  {
708  float
709  poisson;
710  unsigned int i;
711  poisson=exp(-SigmaPoisson*QuantumScale*pixel);
712  for (i=0; alpha > poisson; i++)
713  {
714  beta=mwcReadPseudoRandomValue(r);
715  alpha*=beta;
716  }
717  noise=(float) (QuantumRange*i/SigmaPoisson);
718  break;
719  }
720  case RandomNoise:
721  {
722  noise=(float) (QuantumRange*SigmaRandom*alpha);
723  break;
724  }
725 
726  };
727  return noise;
728  }
729 
730  __kernel
731  void AddNoise(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage
732  ,const unsigned int inputPixelCount, const unsigned int pixelsPerWorkItem
733  ,const ChannelType channel
734  ,const NoiseType noise_type, const float attenuate
735  ,const unsigned int seed0, const unsigned int seed1
736  ,const unsigned int numRandomNumbersPerPixel) {
737 
738  mwc64x_state_t rng;
739  rng.x = seed0;
740  rng.c = seed1;
741 
742  uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
743  uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
744 
745  MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
746 
747  uint pos = get_local_size(0) * get_group_id(0) * pixelsPerWorkItem + get_local_id(0); // pixel to process
748 
749  uint count = pixelsPerWorkItem;
750 
751  while (count > 0) {
752  if (pos < inputPixelCount) {
753  CLPixelType p = inputImage[pos];
754 
755  if ((channel&RedChannel)!=0) {
756  setRed(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getRed(p),noise_type,attenuate)));
757  }
758 
759  if ((channel&GreenChannel)!=0) {
760  setGreen(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getGreen(p),noise_type,attenuate)));
761  }
762 
763  if ((channel&BlueChannel)!=0) {
764  setBlue(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getBlue(p),noise_type,attenuate)));
765  }
766 
767  if ((channel & OpacityChannel) != 0) {
768  setOpacity(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getOpacity(p),noise_type,attenuate)));
769  }
770 
771  filteredImage[pos] = p;
772  //filteredImage[pos] = (CLPixelType)(MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, 255);
773  }
774  pos += get_local_size(0);
775  --count;
776  }
777  }
778  )
779 
780 /*
781 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
782 % %
783 % %
784 % %
785 % B l u r %
786 % %
787 % %
788 % %
789 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
790 */
791 
792  STRINGIFY(
793  /*
794  Reduce image noise and reduce detail levels by row
795  im: input pixels filtered_in filtered_im: output pixels
796  filter : convolve kernel width: convolve kernel size
797  channel : define which channel is blured
798  is_RGBA_BGRA : define the input is RGBA or BGRA
799  */
800  __kernel void BlurRow(__global CLPixelType *im, __global float4 *filtered_im,
801  const ChannelType channel, __constant float *filter,
802  const unsigned int width,
803  const unsigned int imageColumns, const unsigned int imageRows,
804  __local CLPixelType *temp)
805  {
806  const int x = get_global_id(0);
807  const int y = get_global_id(1);
808 
809  const int columns = imageColumns;
810 
811  const unsigned int radius = (width-1)/2;
812  const int wsize = get_local_size(0);
813  const unsigned int loadSize = wsize+width;
814 
815  //load chunk only for now
816  //event_t e = async_work_group_copy(temp+radius, im+x+y*columns, wsize, 0);
817  //wait_group_events(1,&e);
818 
819  //parallel load and clamp
820  /*
821  int count = 0;
822  for (int i=0; i < loadSize; i=i+wsize)
823  {
824  int currentX = x + wsize*(count++);
825 
826  int localId = get_local_id(0);
827 
828  if ((localId+i) > loadSize)
829  break;
830 
831  temp[localId+i] = im[y*columns+ClampToCanvas(currentX-radius, columns)];
832 
833  if (y==0 && get_group_id(0) == 0)
834  {
835  printf("(%d %d) temp %d load %d currentX %d\n", x, y, localId+i, ClampToCanvas(currentX-radius, columns), currentX);
836  }
837  }
838  */
839 
840  //group coordinate
841  const int groupX=get_local_size(0)*get_group_id(0);
842  const int groupY=get_local_size(1)*get_group_id(1);
843 
844  //parallel load and clamp
845  for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
846  {
847  //int cx = ClampToCanvas(groupX+i, columns);
848  temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
849 
850  /*if (0 && y==0 && get_group_id(1) == 0)
851  {
852  printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
853  }*/
854  }
855 
856  // barrier
857  barrier(CLK_LOCAL_MEM_FENCE);
858 
859  // only do the work if this is not a patched item
860  if (get_global_id(0) < columns)
861  {
862  // compute
863  float4 result = (float4) 0;
864 
865  int i = 0;
866 
867  \n #ifndef UFACTOR \n
868  \n #define UFACTOR 8 \n
869  \n #endif \n
870 
871  for ( ; i+UFACTOR < width; )
872  {
873  \n #pragma unroll UFACTOR\n
874  for (int j=0; j < UFACTOR; j++, i++)
875  {
876  result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
877  }
878  }
879 
880  for ( ; i < width; i++)
881  {
882  result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
883  }
884 
885  result.x = ClampToQuantum(result.x);
886  result.y = ClampToQuantum(result.y);
887  result.z = ClampToQuantum(result.z);
888  result.w = ClampToQuantum(result.w);
889 
890  // write back to global
891  filtered_im[y*columns+x] = result;
892  }
893  }
894  )
895 
896  STRINGIFY(
897  /*
898  Reduce image noise and reduce detail levels by line
899  im: input pixels filtered_in filtered_im: output pixels
900  filter : convolve kernel width: convolve kernel size
901  channel : define which channel is blured\
902  is_RGBA_BGRA : define the input is RGBA or BGRA
903  */
904  __kernel void BlurColumn(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
905  const ChannelType channel, __constant float *filter,
906  const unsigned int width,
907  const unsigned int imageColumns, const unsigned int imageRows,
908  __local float4 *temp)
909  {
910  const int x = get_global_id(0);
911  const int y = get_global_id(1);
912 
913  //const int columns = get_global_size(0);
914  //const int rows = get_global_size(1);
915  const int columns = imageColumns;
916  const int rows = imageRows;
917 
918  unsigned int radius = (width-1)/2;
919  const int wsize = get_local_size(1);
920  const unsigned int loadSize = wsize+width;
921 
922  //group coordinate
923  const int groupX=get_local_size(0)*get_group_id(0);
924  const int groupY=get_local_size(1)*get_group_id(1);
925  //notice that get_local_size(0) is 1, so
926  //groupX=get_group_id(0);
927 
928  //parallel load and clamp
929  for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
930  {
931  temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
932  }
933 
934  // barrier
935  barrier(CLK_LOCAL_MEM_FENCE);
936 
937  // only do the work if this is not a patched item
938  if (get_global_id(1) < rows)
939  {
940  // compute
941  float4 result = (float4) 0;
942 
943  int i = 0;
944 
945  \n #ifndef UFACTOR \n
946  \n #define UFACTOR 8 \n
947  \n #endif \n
948 
949  for ( ; i+UFACTOR < width; )
950  {
951  \n #pragma unroll UFACTOR \n
952  for (int j=0; j < UFACTOR; j++, i++)
953  {
954  result+=filter[i]*temp[i+get_local_id(1)];
955  }
956  }
957 
958  for ( ; i < width; i++)
959  {
960  result+=filter[i]*temp[i+get_local_id(1)];
961  }
962 
963  result.x = ClampToQuantum(result.x);
964  result.y = ClampToQuantum(result.y);
965  result.z = ClampToQuantum(result.z);
966  result.w = ClampToQuantum(result.w);
967 
968  // write back to global
969  filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
970  }
971 
972  }
973  )
974 
975 /*
976 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
977 % %
978 % %
979 % %
980 % C o m p o s i t e %
981 % %
982 % %
983 % %
984 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
985 */
986 
987  STRINGIFY(
988  inline float ColorDodge(const float Sca,
989  const float Sa,const float Dca,const float Da)
990  {
991  /*
992  Oct 2004 SVG specification.
993  */
994  if ((Sca*Da+Dca*Sa) >= Sa*Da)
995  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
996  return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
997 
998 
999  /*
1000  New specification, March 2009 SVG specification. This specification was
1001  also wrong of non-overlap cases.
1002  */
1003  /*
1004  if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
1005  return(Sca*(1.0-Da));
1006  if (fabs(Sca-Sa) < MagickEpsilon)
1007  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
1008  return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca)));
1009  */
1010 
1011  /*
1012  Working from first principles using the original formula:
1013 
1014  f(Sc,Dc) = Dc/(1-Sc)
1015 
1016  This works correctly! Looks like the 2004 model was right but just
1017  required a extra condition for correct handling.
1018  */
1019 
1020  /*
1021  if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
1022  return(Sca*(1.0-Da)+Dca*(1.0-Sa));
1023  if (fabs(Sca-Sa) < MagickEpsilon)
1024  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
1025  return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
1026  */
1027  }
1028 
1029  inline void CompositeColorDodge(const float4 *p,
1030  const float4 *q,float4 *composite) {
1031 
1032  float
1033  Da,
1034  gamma,
1035  Sa;
1036 
1037  Sa=1.0f-QuantumScale*getOpacityF4(*p); /* simplify and speed up equations */
1038  Da=1.0f-QuantumScale*getOpacityF4(*q);
1039  gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */
1040  setOpacityF4(composite, QuantumRange*(1.0-gamma));
1041  gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma);
1042  setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale*
1043  getRedF4(*q)*Da,Da));
1044  setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale*
1045  getGreenF4(*q)*Da,Da));
1046  setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale*
1047  getBlueF4(*q)*Da,Da));
1048  }
1049  )
1050 
1051  STRINGIFY(
1052  inline void MagickPixelCompositePlus(const float4 *p,
1053  const float alpha,const float4 *q,
1054  const float beta,float4 *composite)
1055  {
1056  float
1057  gamma;
1058 
1059  float
1060  Da,
1061  Sa;
1062  /*
1063  Add two pixels with the given opacities.
1064  */
1065  Sa=1.0-QuantumScale*alpha;
1066  Da=1.0-QuantumScale*beta;
1067  gamma=RoundToUnity(Sa+Da); /* 'Plus' blending -- not 'Over' blending */
1068  setOpacityF4(composite,(float) QuantumRange*(1.0-gamma));
1069  gamma=PerceptibleReciprocal(gamma);
1070  setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q)));
1071  setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q)));
1072  setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q)));
1073  }
1074  )
1075 
1076  STRINGIFY(
1077  inline void MagickPixelCompositeBlend(const float4 *p,
1078  const float alpha,const float4 *q,
1079  const float beta,float4 *composite)
1080  {
1081  MagickPixelCompositePlus(p,(float) (QuantumRange-alpha*
1082  (QuantumRange-getOpacityF4(*p))),q,(float) (QuantumRange-beta*
1083  (QuantumRange-getOpacityF4(*q))),composite);
1084  }
1085  )
1086 
1087  STRINGIFY(
1088  __kernel
1089  void Composite(__global CLPixelType *image,
1090  const unsigned int imageWidth,
1091  const unsigned int imageHeight,
1092  const unsigned int imageMatte,
1093  const __global CLPixelType *compositeImage,
1094  const unsigned int compositeWidth,
1095  const unsigned int compositeHeight,
1096  const unsigned int compositeMatte,
1097  const unsigned int compose,
1098  const ChannelType channel,
1099  const float destination_dissolve,
1100  const float source_dissolve) {
1101 
1102  uint2 index;
1103  index.x = get_global_id(0);
1104  index.y = get_global_id(1);
1105 
1106 
1107  if (index.x >= imageWidth
1108  || index.y >= imageHeight) {
1109  return;
1110  }
1111  const CLPixelType inputPixel = image[index.y*imageWidth+index.x];
1112  float4 destination;
1113  setRedF4(&destination,getRed(inputPixel));
1114  setGreenF4(&destination,getGreen(inputPixel));
1115  setBlueF4(&destination,getBlue(inputPixel));
1116 
1117 
1118  const CLPixelType compositePixel
1119  = compositeImage[index.y*imageWidth+index.x];
1120  float4 source;
1121  setRedF4(&source,getRed(compositePixel));
1122  setGreenF4(&source,getGreen(compositePixel));
1123  setBlueF4(&source,getBlue(compositePixel));
1124 
1125  if (imageMatte != 0) {
1126  setOpacityF4(&destination,getOpacity(inputPixel));
1127  }
1128  else {
1129  setOpacityF4(&destination,0.0f);
1130  }
1131 
1132  if (compositeMatte != 0) {
1133  setOpacityF4(&source,getOpacity(compositePixel));
1134  }
1135  else {
1136  setOpacityF4(&source,0.0f);
1137  }
1138 
1139  float4 composite=destination;
1140 
1141  CompositeOperator op = (CompositeOperator)compose;
1142  switch (op) {
1143  case ColorDodgeCompositeOp:
1144  CompositeColorDodge(&source,&destination,&composite);
1145  break;
1146  case BlendCompositeOp:
1147  MagickPixelCompositeBlend(&source,source_dissolve,&destination,
1148  destination_dissolve,&composite);
1149  break;
1150  default:
1151  // unsupported operators
1152  break;
1153  };
1154 
1155  CLPixelType outputPixel;
1156  setRed(&outputPixel, ClampToQuantum(getRedF4(composite)));
1157  setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite)));
1158  setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite)));
1159  setOpacity(&outputPixel, ClampToQuantum(getOpacityF4(composite)));
1160  image[index.y*imageWidth+index.x] = outputPixel;
1161  }
1162  )
1163 
1164 /*
1165 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1166 % %
1167 % %
1168 % %
1169 % C o n t r a s t %
1170 % %
1171 % %
1172 % %
1173 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1174 */
1175 
1176  STRINGIFY(
1177 
1178  inline float3 ConvertRGBToHSB(CLPixelType pixel) {
1179  float3 HueSaturationBrightness;
1180  HueSaturationBrightness.x = 0.0f; // Hue
1181  HueSaturationBrightness.y = 0.0f; // Saturation
1182  HueSaturationBrightness.z = 0.0f; // Brightness
1183 
1184  float r=(float) getRed(pixel);
1185  float g=(float) getGreen(pixel);
1186  float b=(float) getBlue(pixel);
1187 
1188  float tmin=MagickMin(MagickMin(r,g),b);
1189  float tmax= MagickMax(MagickMax(r,g),b);
1190 
1191  if (tmax!=0.0f) {
1192  float delta=tmax-tmin;
1193  HueSaturationBrightness.y=delta/tmax;
1194  HueSaturationBrightness.z=QuantumScale*tmax;
1195 
1196  if (delta != 0.0f) {
1197  HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
1198  HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
1199  HueSaturationBrightness.x/=6.0f;
1200  HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
1201  }
1202  }
1203  return HueSaturationBrightness;
1204  }
1205 
1206  inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
1207 
1208  float hue = HueSaturationBrightness.x;
1209  float brightness = HueSaturationBrightness.z;
1210  float saturation = HueSaturationBrightness.y;
1211 
1212  CLPixelType rgb;
1213 
1214  if (saturation == 0.0f) {
1215  setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
1216  setGreen(&rgb,getRed(rgb));
1217  setBlue(&rgb,getRed(rgb));
1218  }
1219  else {
1220 
1221  float h=6.0f*(hue-floor(hue));
1222  float f=h-floor(h);
1223  float p=brightness*(1.0f-saturation);
1224  float q=brightness*(1.0f-saturation*f);
1225  float t=brightness*(1.0f-(saturation*(1.0f-f)));
1226 
1227  float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
1228  float clamped_t = ClampToQuantum(QuantumRange*t);
1229  float clamped_p = ClampToQuantum(QuantumRange*p);
1230  float clamped_q = ClampToQuantum(QuantumRange*q);
1231  int ih = (int)h;
1232  setRed(&rgb, (ih == 1)?clamped_q:
1233  (ih == 2 || ih == 3)?clamped_p:
1234  (ih == 4)?clamped_t:
1235  clampedBrightness);
1236 
1237  setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
1238  (ih == 3)?clamped_q:
1239  (ih == 4 || ih == 5)?clamped_p:
1240  clamped_t);
1241 
1242  setBlue(&rgb, (ih == 2)?clamped_t:
1243  (ih == 3 || ih == 4)?clampedBrightness:
1244  (ih == 5)?clamped_q:
1245  clamped_p);
1246  }
1247  return rgb;
1248  }
1249 
1250  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
1251  {
1252 
1253  const int sign = sharpen!=0?1:-1;
1254  const int x = get_global_id(0);
1255  const int y = get_global_id(1);
1256  const int columns = get_global_size(0);
1257  const int c = x + y * columns;
1258 
1259  CLPixelType pixel = im[c];
1260  float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
1261  float brightness = HueSaturationBrightness.z;
1262  brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1263  brightness = clamp(brightness,0.0f,1.0f);
1264  HueSaturationBrightness.z = brightness;
1265 
1266  CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
1267  filteredPixel.w = pixel.w;
1268  im[c] = filteredPixel;
1269  }
1270  )
1271 
1272 /*
1273 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1274 % %
1275 % %
1276 % %
1277 % C o n t r a s t S t r e t c h %
1278 % %
1279 % %
1280 % %
1281 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1282 */
1283 
1284  STRINGIFY(
1285  /*
1286  */
1287  __kernel void Histogram(__global CLPixelType * restrict im,
1288  const ChannelType channel,
1289  const int method,
1290  const int colorspace,
1291  __global uint4 * restrict histogram)
1292  {
1293  const int x = get_global_id(0);
1294  const int y = get_global_id(1);
1295  const int columns = get_global_size(0);
1296  const int c = x + y * columns;
1297  if ((channel & SyncChannels) != 0)
1298  {
1299  float intensity = GetPixelIntensity(method, colorspace,im[c]);
1300  uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1301  atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1302  }
1303  else
1304  {
1305  // for equalizing, we always need all channels?
1306  // otherwise something more
1307  }
1308  }
1309  )
1310 
1311  STRINGIFY(
1312  /*
1313  */
1314  __kernel void ContrastStretch(__global CLPixelType * restrict im,
1315  const ChannelType channel,
1316  __global CLPixelType * restrict stretch_map,
1317  const float4 white, const float4 black)
1318  {
1319  const int x = get_global_id(0);
1320  const int y = get_global_id(1);
1321  const int columns = get_global_size(0);
1322  const int c = x + y * columns;
1323 
1324  uint ePos;
1325  CLPixelType oValue, eValue;
1326  CLQuantum red, green, blue, opacity;
1327 
1328  //read from global
1329  oValue=im[c];
1330 
1331  if ((channel & RedChannel) != 0)
1332  {
1333  if (getRedF4(white) != getRedF4(black))
1334  {
1335  ePos = ScaleQuantumToMap(getRed(oValue));
1336  eValue = stretch_map[ePos];
1337  red = getRed(eValue);
1338  }
1339  }
1340 
1341  if ((channel & GreenChannel) != 0)
1342  {
1343  if (getGreenF4(white) != getGreenF4(black))
1344  {
1345  ePos = ScaleQuantumToMap(getGreen(oValue));
1346  eValue = stretch_map[ePos];
1347  green = getGreen(eValue);
1348  }
1349  }
1350 
1351  if ((channel & BlueChannel) != 0)
1352  {
1353  if (getBlueF4(white) != getBlueF4(black))
1354  {
1355  ePos = ScaleQuantumToMap(getBlue(oValue));
1356  eValue = stretch_map[ePos];
1357  blue = getBlue(eValue);
1358  }
1359  }
1360 
1361  if ((channel & OpacityChannel) != 0)
1362  {
1363  if (getOpacityF4(white) != getOpacityF4(black))
1364  {
1365  ePos = ScaleQuantumToMap(getOpacity(oValue));
1366  eValue = stretch_map[ePos];
1367  opacity = getOpacity(eValue);
1368  }
1369  }
1370 
1371  //write back
1372  im[c]=(CLPixelType)(blue, green, red, opacity);
1373 
1374  }
1375  )
1376 
1377 /*
1378 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1379 % %
1380 % %
1381 % %
1382 % C o n v o l v e %
1383 % %
1384 % %
1385 % %
1386 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1387 */
1388 
1389  STRINGIFY(
1390  __kernel
1391  void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1392  const unsigned int imageWidth, const unsigned int imageHeight,
1393  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1394  const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1395 
1396  int2 blockID;
1397  blockID.x = get_group_id(0);
1398  blockID.y = get_group_id(1);
1399 
1400  // image area processed by this workgroup
1401  int2 imageAreaOrg;
1402  imageAreaOrg.x = blockID.x * get_local_size(0);
1403  imageAreaOrg.y = blockID.y * get_local_size(1);
1404 
1405  int2 midFilterDimen;
1406  midFilterDimen.x = (filterWidth-1)/2;
1407  midFilterDimen.y = (filterHeight-1)/2;
1408 
1409  int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1410 
1411  // dimension of the local cache
1412  int2 cachedAreaDimen;
1413  cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1414  cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1415 
1416  // cache the pixels accessed by this workgroup in local memory
1417  int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1418  int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1419  int groupSize = get_local_size(0) * get_local_size(1);
1420  for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1421 
1422  int2 cachedAreaIndex;
1423  cachedAreaIndex.x = i % cachedAreaDimen.x;
1424  cachedAreaIndex.y = i / cachedAreaDimen.x;
1425 
1426  int2 imagePixelIndex;
1427  imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1428 
1429  // only support EdgeVirtualPixelMethod through ClampToCanvas
1430  // TODO: implement other virtual pixel method
1431  imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1432  imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1433 
1434  pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1435  }
1436 
1437  // cache the filter
1438  for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1439  filterCache[i] = filter[i];
1440  }
1441  barrier(CLK_LOCAL_MEM_FENCE);
1442 
1443 
1444  int2 imageIndex;
1445  imageIndex.x = imageAreaOrg.x + get_local_id(0);
1446  imageIndex.y = imageAreaOrg.y + get_local_id(1);
1447 
1448  // if out-of-range, stops here and quit
1449  if (imageIndex.x >= imageWidth
1450  || imageIndex.y >= imageHeight) {
1451  return;
1452  }
1453 
1454  int filterIndex = 0;
1455  float4 sum = (float4)0.0f;
1456  float gamma = 0.0f;
1457  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1458  int cacheIndexY = get_local_id(1);
1459  for (int j = 0; j < filterHeight; j++) {
1460  int cacheIndexX = get_local_id(0);
1461  for (int i = 0; i < filterWidth; i++) {
1462  CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1463  float f = filterCache[filterIndex];
1464 
1465  sum.x += f * p.x;
1466  sum.y += f * p.y;
1467  sum.z += f * p.z;
1468  sum.w += f * p.w;
1469 
1470  gamma += f;
1471  filterIndex++;
1472  cacheIndexX++;
1473  }
1474  cacheIndexY++;
1475  }
1476  }
1477  else {
1478  int cacheIndexY = get_local_id(1);
1479  for (int j = 0; j < filterHeight; j++) {
1480  int cacheIndexX = get_local_id(0);
1481  for (int i = 0; i < filterWidth; i++) {
1482 
1483  CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1484  float alpha = QuantumScale*(QuantumRange-p.w);
1485  float f = filterCache[filterIndex];
1486  float g = alpha * f;
1487 
1488  sum.x += g*p.x;
1489  sum.y += g*p.y;
1490  sum.z += g*p.z;
1491  sum.w += f*p.w;
1492 
1493  gamma += g;
1494  filterIndex++;
1495  cacheIndexX++;
1496  }
1497  cacheIndexY++;
1498  }
1499  gamma = PerceptibleReciprocal(gamma);
1500  sum.xyz = gamma*sum.xyz;
1501  }
1502  CLPixelType outputPixel;
1503  outputPixel.x = ClampToQuantum(sum.x);
1504  outputPixel.y = ClampToQuantum(sum.y);
1505  outputPixel.z = ClampToQuantum(sum.z);
1506  outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1507 
1508  output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1509  }
1510  )
1511 
1512  STRINGIFY(
1513  __kernel
1514  void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1515  const uint imageWidth, const uint imageHeight,
1516  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1517  const uint matte, const ChannelType channel) {
1518 
1519  int2 imageIndex;
1520  imageIndex.x = get_global_id(0);
1521  imageIndex.y = get_global_id(1);
1522 
1523  /*
1524  unsigned int imageWidth = get_global_size(0);
1525  unsigned int imageHeight = get_global_size(1);
1526  */
1527  if (imageIndex.x >= imageWidth
1528  || imageIndex.y >= imageHeight)
1529  return;
1530 
1531  int2 midFilterDimen;
1532  midFilterDimen.x = (filterWidth-1)/2;
1533  midFilterDimen.y = (filterHeight-1)/2;
1534 
1535  int filterIndex = 0;
1536  float4 sum = (float4)0.0f;
1537  float gamma = 0.0f;
1538  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1539  for (int j = 0; j < filterHeight; j++) {
1540  int2 inputPixelIndex;
1541  inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1542  inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1543  for (int i = 0; i < filterWidth; i++) {
1544  inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1545  inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1546 
1547  CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1548  float f = filter[filterIndex];
1549 
1550  sum.x += f * p.x;
1551  sum.y += f * p.y;
1552  sum.z += f * p.z;
1553  sum.w += f * p.w;
1554 
1555  gamma += f;
1556 
1557  filterIndex++;
1558  }
1559  }
1560  }
1561  else {
1562 
1563  for (int j = 0; j < filterHeight; j++) {
1564  int2 inputPixelIndex;
1565  inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1566  inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1567  for (int i = 0; i < filterWidth; i++) {
1568  inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1569  inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1570 
1571  CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1572  float alpha = QuantumScale*(QuantumRange-p.w);
1573  float f = filter[filterIndex];
1574  float g = alpha * f;
1575 
1576  sum.x += g*p.x;
1577  sum.y += g*p.y;
1578  sum.z += g*p.z;
1579  sum.w += f*p.w;
1580 
1581  gamma += g;
1582 
1583 
1584  filterIndex++;
1585  }
1586  }
1587  gamma = PerceptibleReciprocal(gamma);
1588  sum.xyz = gamma*sum.xyz;
1589  }
1590 
1591  CLPixelType outputPixel;
1592  outputPixel.x = ClampToQuantum(sum.x);
1593  outputPixel.y = ClampToQuantum(sum.y);
1594  outputPixel.z = ClampToQuantum(sum.z);
1595  outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1596 
1597  output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1598  }
1599  )
1600 
1601 /*
1602 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1603 % %
1604 % %
1605 % %
1606 % D e s p e c k l e %
1607 % %
1608 % %
1609 % %
1610 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1611 */
1612 
1613  STRINGIFY(
1614 
1615  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1616  , const unsigned int imageWidth, const unsigned int imageHeight
1617  , const int2 offset, const int polarity, const int matte) {
1618 
1619  int x = get_global_id(0);
1620  int y = get_global_id(1);
1621 
1622  CLPixelType v = inputImage[y*imageWidth+x];
1623 
1624  int2 neighbor;
1625  neighbor.y = y + offset.y;
1626  neighbor.x = x + offset.x;
1627 
1628  int2 clampedNeighbor;
1629  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1630  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1631 
1632  CLPixelType r = (clampedNeighbor.x == neighbor.x
1633  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1634  :(CLPixelType)0;
1635 
1636  int sv[4];
1637  sv[0] = (int)v.x;
1638  sv[1] = (int)v.y;
1639  sv[2] = (int)v.z;
1640  sv[3] = (int)v.w;
1641 
1642  int sr[4];
1643  sr[0] = (int)r.x;
1644  sr[1] = (int)r.y;
1645  sr[2] = (int)r.z;
1646  sr[3] = (int)r.w;
1647 
1648  if (polarity > 0) {
1649  \n #pragma unroll 4\n
1650  for (unsigned int i = 0; i < 4; i++) {
1651  sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1652  }
1653  }
1654  else {
1655  \n #pragma unroll 4\n
1656  for (unsigned int i = 0; i < 4; i++) {
1657  sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1658  }
1659 
1660  }
1661 
1662  v.x = (CLQuantum)sv[0];
1663  v.y = (CLQuantum)sv[1];
1664  v.z = (CLQuantum)sv[2];
1665 
1666  if (matte!=0)
1667  v.w = (CLQuantum)sv[3];
1668 
1669  outputImage[y*imageWidth+x] = v;
1670 
1671  }
1672 
1673 
1674  )
1675 
1676 
1677 
1678  STRINGIFY(
1679 
1680  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1681  , const unsigned int imageWidth, const unsigned int imageHeight
1682  , const int2 offset, const int polarity, const int matte) {
1683 
1684  int x = get_global_id(0);
1685  int y = get_global_id(1);
1686 
1687  CLPixelType v = inputImage[y*imageWidth+x];
1688 
1689  int2 neighbor, clampedNeighbor;
1690 
1691  neighbor.y = y + offset.y;
1692  neighbor.x = x + offset.x;
1693  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1694  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1695 
1696  CLPixelType r = (clampedNeighbor.x == neighbor.x
1697  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1698  :(CLPixelType)0;
1699 
1700 
1701  neighbor.y = y - offset.y;
1702  neighbor.x = x - offset.x;
1703  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1704  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1705 
1706  CLPixelType s = (clampedNeighbor.x == neighbor.x
1707  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1708  :(CLPixelType)0;
1709 
1710 
1711  int sv[4];
1712  sv[0] = (int)v.x;
1713  sv[1] = (int)v.y;
1714  sv[2] = (int)v.z;
1715  sv[3] = (int)v.w;
1716 
1717  int sr[4];
1718  sr[0] = (int)r.x;
1719  sr[1] = (int)r.y;
1720  sr[2] = (int)r.z;
1721  sr[3] = (int)r.w;
1722 
1723  int ss[4];
1724  ss[0] = (int)s.x;
1725  ss[1] = (int)s.y;
1726  ss[2] = (int)s.z;
1727  ss[3] = (int)s.w;
1728 
1729  if (polarity > 0) {
1730  \n #pragma unroll 4\n
1731  for (unsigned int i = 0; i < 4; i++) {
1732  //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1733  //
1734  //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1735  //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1736  sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1737  }
1738  }
1739  else {
1740  \n #pragma unroll 4\n
1741  for (unsigned int i = 0; i < 4; i++) {
1742  //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1743  //
1744  //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1745  sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1746  }
1747  }
1748 
1749  v.x = (CLQuantum)sv[0];
1750  v.y = (CLQuantum)sv[1];
1751  v.z = (CLQuantum)sv[2];
1752 
1753  if (matte!=0)
1754  v.w = (CLQuantum)sv[3];
1755 
1756  outputImage[y*imageWidth+x] = v;
1757 
1758  }
1759 
1760 
1761  )
1762 
1763 /*
1764 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1765 % %
1766 % %
1767 % %
1768 % E q u a l i z e %
1769 % %
1770 % %
1771 % %
1772 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1773 */
1774 
1775  STRINGIFY(
1776  /*
1777  */
1778  __kernel void Equalize(__global CLPixelType * restrict im,
1779  const ChannelType channel,
1780  __global CLPixelType * restrict equalize_map,
1781  const float4 white, const float4 black)
1782  {
1783  const int x = get_global_id(0);
1784  const int y = get_global_id(1);
1785  const int columns = get_global_size(0);
1786  const int c = x + y * columns;
1787 
1788  uint ePos;
1789  CLPixelType oValue, eValue;
1790  CLQuantum red, green, blue, opacity;
1791 
1792  //read from global
1793  oValue=im[c];
1794 
1795  if ((channel & SyncChannels) != 0)
1796  {
1797  if (getRedF4(white) != getRedF4(black))
1798  {
1799  ePos = ScaleQuantumToMap(getRed(oValue));
1800  eValue = equalize_map[ePos];
1801  red = getRed(eValue);
1802  ePos = ScaleQuantumToMap(getGreen(oValue));
1803  eValue = equalize_map[ePos];
1804  green = getRed(eValue);
1805  ePos = ScaleQuantumToMap(getBlue(oValue));
1806  eValue = equalize_map[ePos];
1807  blue = getRed(eValue);
1808  ePos = ScaleQuantumToMap(getOpacity(oValue));
1809  eValue = equalize_map[ePos];
1810  opacity = getRed(eValue);
1811 
1812  //write back
1813  im[c]=(CLPixelType)(blue, green, red, opacity);
1814  }
1815 
1816  }
1817 
1818  // for equalizing, we always need all channels?
1819  // otherwise something more
1820 
1821  }
1822  )
1823 
1824 /*
1825 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1826 % %
1827 % %
1828 % %
1829 % F u n c t i o n %
1830 % %
1831 % %
1832 % %
1833 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1834 */
1835 
1836  STRINGIFY(
1837 
1838  /*
1839  apply FunctionImageChannel(braightness-contrast)
1840  */
1841  CLPixelType ApplyFunction(CLPixelType pixel,const MagickFunction function,
1842  const unsigned int number_parameters,
1843  __constant float *parameters)
1844  {
1845  float4 result = (float4) 0.0f;
1846  switch (function)
1847  {
1848  case PolynomialFunction:
1849  {
1850  for (unsigned int i=0; i < number_parameters; i++)
1851  result = result*(float4)QuantumScale*convert_float4(pixel) + parameters[i];
1852  result *= (float4)QuantumRange;
1853  break;
1854  }
1855  case SinusoidFunction:
1856  {
1857  float freq,phase,ampl,bias;
1858  freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1859  phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1860  ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1861  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1862  result.x = QuantumRange*(ampl*sin(2.0f*MagickPI*
1863  (freq*QuantumScale*(float)pixel.x + phase/360.0f)) + bias);
1864  result.y = QuantumRange*(ampl*sin(2.0f*MagickPI*
1865  (freq*QuantumScale*(float)pixel.y + phase/360.0f)) + bias);
1866  result.z = QuantumRange*(ampl*sin(2.0f*MagickPI*
1867  (freq*QuantumScale*(float)pixel.z + phase/360.0f)) + bias);
1868  result.w = QuantumRange*(ampl*sin(2.0f*MagickPI*
1869  (freq*QuantumScale*(float)pixel.w + phase/360.0f)) + bias);
1870  break;
1871  }
1872  case ArcsinFunction:
1873  {
1874  float width,range,center,bias;
1875  width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1876  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1877  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1878  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1879 
1880  result.x = 2.0f/width*(QuantumScale*(float)pixel.x - center);
1881  result.x = range/MagickPI*asin(result.x)+bias;
1882  result.x = ( result.x <= -1.0f ) ? bias - range/2.0f : result.x;
1883  result.x = ( result.x >= 1.0f ) ? bias + range/2.0f : result.x;
1884 
1885  result.y = 2.0f/width*(QuantumScale*(float)pixel.y - center);
1886  result.y = range/MagickPI*asin(result.y)+bias;
1887  result.y = ( result.y <= -1.0f ) ? bias - range/2.0f : result.y;
1888  result.y = ( result.y >= 1.0f ) ? bias + range/2.0f : result.y;
1889 
1890  result.z = 2.0f/width*(QuantumScale*(float)pixel.z - center);
1891  result.z = range/MagickPI*asin(result.z)+bias;
1892  result.z = ( result.z <= -1.0f ) ? bias - range/2.0f : result.x;
1893  result.z = ( result.z >= 1.0f ) ? bias + range/2.0f : result.x;
1894 
1895 
1896  result.w = 2.0f/width*(QuantumScale*(float)pixel.w - center);
1897  result.w = range/MagickPI*asin(result.w)+bias;
1898  result.w = ( result.w <= -1.0f ) ? bias - range/2.0f : result.w;
1899  result.w = ( result.w >= 1.0f ) ? bias + range/2.0f : result.w;
1900 
1901  result *= (float4)QuantumRange;
1902  break;
1903  }
1904  case ArctanFunction:
1905  {
1906  float slope,range,center,bias;
1907  slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1908  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1909  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1910  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1911  result = (float4)MagickPI*(float4)slope*((float4)QuantumScale*convert_float4(pixel)-(float4)center);
1912  result = (float4)QuantumRange*((float4)range/(float4)MagickPI*atan(result) + (float4)bias);
1913  break;
1914  }
1915  case UndefinedFunction:
1916  break;
1917  }
1918  return (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
1919  ClampToQuantum(result.z), ClampToQuantum(result.w));
1920  }
1921  )
1922 
1923  STRINGIFY(
1924  /*
1925  Improve brightness / contrast of the image
1926  channel : define which channel is improved
1927  function : the function called to enchance the brightness contrast
1928  number_parameters : numbers of parameters
1929  parameters : the parameter
1930  */
1931  __kernel void ComputeFunction(__global CLPixelType *im,
1932  const ChannelType channel, const MagickFunction function,
1933  const unsigned int number_parameters, __constant float *parameters)
1934  {
1935  const int x = get_global_id(0);
1936  const int y = get_global_id(1);
1937  const int columns = get_global_size(0);
1938  const int c = x + y * columns;
1939  im[c] = ApplyFunction(im[c], function, number_parameters, parameters);
1940  }
1941  )
1942 
1943 /*
1944 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1945 % %
1946 % %
1947 % %
1948 % G r a y s c a l e %
1949 % %
1950 % %
1951 % %
1952 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1953 */
1954 
1955  STRINGIFY(
1956  __kernel void Grayscale(__global CLPixelType *im,
1957  const int method, const int colorspace)
1958  {
1959 
1960  const int x = get_global_id(0);
1961  const int y = get_global_id(1);
1962  const int columns = get_global_size(0);
1963  const int c = x + y * columns;
1964 
1965  CLPixelType pixel = im[c];
1966 
1967  float
1968  blue,
1969  green,
1970  intensity,
1971  red;
1972 
1973  red=(float)getRed(pixel);
1974  green=(float)getGreen(pixel);
1975  blue=(float)getBlue(pixel);
1976 
1977  intensity=0.0;
1978 
1979  CLPixelType filteredPixel;
1980 
1981  switch (method)
1982  {
1984  {
1985  intensity=(red+green+blue)/3.0;
1986  break;
1987  }
1989  {
1990  intensity=MagickMax(MagickMax(red,green),blue);
1991  break;
1992  }
1994  {
1995  intensity=(MagickMin(MagickMin(red,green),blue)+
1996  MagickMax(MagickMax(red,green),blue))/2.0;
1997  break;
1998  }
2000  {
2001  intensity=(float) (((float) red*red+green*green+
2002  blue*blue)/(3.0*QuantumRange));
2003  break;
2004  }
2006  {
2007  /*
2008  if (colorspace == RGBColorspace)
2009  {
2010  red=EncodePixelGamma(red);
2011  green=EncodePixelGamma(green);
2012  blue=EncodePixelGamma(blue);
2013  }
2014  */
2015  intensity=0.298839*red+0.586811*green+0.114350*blue;
2016  break;
2017  }
2019  {
2020  /*
2021  if (image->colorspace == sRGBColorspace)
2022  {
2023  red=DecodePixelGamma(red);
2024  green=DecodePixelGamma(green);
2025  blue=DecodePixelGamma(blue);
2026  }
2027  */
2028  intensity=0.298839*red+0.586811*green+0.114350*blue;
2029  break;
2030  }
2032  default:
2033  {
2034  /*
2035  if (image->colorspace == RGBColorspace)
2036  {
2037  red=EncodePixelGamma(red);
2038  green=EncodePixelGamma(green);
2039  blue=EncodePixelGamma(blue);
2040  }
2041  */
2042  intensity=0.212656*red+0.715158*green+0.072186*blue;
2043  break;
2044  }
2046  {
2047  /*
2048  if (image->colorspace == sRGBColorspace)
2049  {
2050  red=DecodePixelGamma(red);
2051  green=DecodePixelGamma(green);
2052  blue=DecodePixelGamma(blue);
2053  }
2054  */
2055  intensity=0.212656*red+0.715158*green+0.072186*blue;
2056  break;
2057  }
2059  {
2060  intensity=(float) (sqrt((float) red*red+green*green+
2061  blue*blue)/sqrt(3.0));
2062  break;
2063  }
2064 
2065  }
2066 
2067  setGray(&filteredPixel, ClampToQuantum(intensity));
2068 
2069  filteredPixel.w = pixel.w;
2070 
2071  im[c] = filteredPixel;
2072  }
2073  )
2074 
2075 /*
2076 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2077 % %
2078 % %
2079 % %
2080 % L o c a l C o n t r a s t %
2081 % %
2082 % %
2083 % %
2084 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2085 */
2086 
2087  STRINGIFY(
2088  inline int mirrorBottom(int value)
2089  {
2090  return (value < 0) ? - (value) : value;
2091  }
2092  inline int mirrorTop(int value, int width)
2093  {
2094  return (value >= width) ? (2 * width - value - 1) : value;
2095  }
2096 
2097  __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
2098  const int radius,
2099  const int imageWidth,
2100  const int imageHeight)
2101  {
2102  const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
2103 
2104  int x = get_local_id(0);
2105  int y = get_global_id(1);
2106 
2107  if ((x >= imageWidth) || (y >= imageHeight))
2108  return;
2109 
2110  global CLPixelType *src = srcImage + y * imageWidth;
2111 
2112  for (int i = x; i < imageWidth; i += get_local_size(0)) {
2113  float sum = 0.0f;
2114  float weight = 1.0f;
2115 
2116  int j = i - radius;
2117  while ((j + 7) < i) {
2118  for (int k = 0; k < 8; ++k) // Unroll 8x
2119  sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
2120  weight += 8.0f;
2121  j+=8;
2122  }
2123  while (j < i) {
2124  sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
2125  weight += 1.0f;
2126  ++j;
2127  }
2128 
2129  while ((j + 7) < radius + i) {
2130  for (int k = 0; k < 8; ++k) // Unroll 8x
2131  sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
2132  weight -= 8.0f;
2133  j+=8;
2134  }
2135  while (j < radius + i) {
2136  sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
2137  weight -= 1.0f;
2138  ++j;
2139  }
2140 
2141  tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
2142  }
2143  }
2144  )
2145 
2146  STRINGIFY(
2147  __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
2148  const int radius,
2149  const float strength,
2150  const int imageWidth,
2151  const int imageHeight)
2152  {
2153  const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
2154 
2155  int x = get_global_id(0);
2156  int y = get_global_id(1);
2157 
2158  if ((x >= imageWidth) || (y >= imageHeight))
2159  return;
2160 
2161  global float *src = blurImage + x;
2162 
2163  float sum = 0.0f;
2164  float weight = 1.0f;
2165 
2166  int j = y - radius;
2167  while ((j + 7) < y) {
2168  for (int k = 0; k < 8; ++k) // Unroll 8x
2169  sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
2170  weight += 8.0f;
2171  j+=8;
2172  }
2173  while (j < y) {
2174  sum += weight * src[mirrorBottom(j) * imageWidth];
2175  weight += 1.0f;
2176  ++j;
2177  }
2178 
2179  while ((j + 7) < radius + y) {
2180  for (int k = 0; k < 8; ++k) // Unroll 8x
2181  sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
2182  weight -= 8.0f;
2183  j+=8;
2184  }
2185  while (j < radius + y) {
2186  sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
2187  weight -= 1.0f;
2188  ++j;
2189  }
2190 
2191  CLPixelType pixel = srcImage[x + y * imageWidth];
2192  float srcVal = dot(RGB, convert_float4(pixel));
2193  float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
2194  mult = (srcVal + mult) / srcVal;
2195 
2196  pixel.x = ClampToQuantum(pixel.x * mult);
2197  pixel.y = ClampToQuantum(pixel.y * mult);
2198  pixel.z = ClampToQuantum(pixel.z * mult);
2199 
2200  dstImage[x + y * imageWidth] = pixel;
2201  }
2202  )
2203 
2204 /*
2205 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2206 % %
2207 % %
2208 % %
2209 % M o d u l a t e %
2210 % %
2211 % %
2212 % %
2213 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2214 */
2215 
2216  STRINGIFY(
2217 
2218  inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
2219  float *hue, float *saturation, float *lightness)
2220  {
2221  float
2222  c,
2223  tmax,
2224  tmin;
2225 
2226  /*
2227  Convert RGB to HSL colorspace.
2228  */
2229  tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
2230  tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
2231 
2232  c=tmax-tmin;
2233 
2234  *lightness=(tmax+tmin)/2.0;
2235  if (c <= 0.0)
2236  {
2237  *hue=0.0;
2238  *saturation=0.0;
2239  return;
2240  }
2241 
2242  if (tmax == (QuantumScale*red))
2243  {
2244  *hue=(QuantumScale*green-QuantumScale*blue)/c;
2245  if ((QuantumScale*green) < (QuantumScale*blue))
2246  *hue+=6.0;
2247  }
2248  else
2249  if (tmax == (QuantumScale*green))
2250  *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
2251  else
2252  *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
2253 
2254  *hue*=60.0/360.0;
2255  if (*lightness <= 0.5)
2256  *saturation=c/(2.0*(*lightness));
2257  else
2258  *saturation=c/(2.0-2.0*(*lightness));
2259  }
2260 
2261  inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2262  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2263  {
2264  float
2265  b,
2266  c,
2267  g,
2268  h,
2269  tmin,
2270  r,
2271  x;
2272 
2273  /*
2274  Convert HSL to RGB colorspace.
2275  */
2276  h=hue*360.0;
2277  if (lightness <= 0.5)
2278  c=2.0*lightness*saturation;
2279  else
2280  c=(2.0-2.0*lightness)*saturation;
2281  tmin=lightness-0.5*c;
2282  h-=360.0*floor(h/360.0);
2283  h/=60.0;
2284  x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2285  switch ((int) floor(h) % 6)
2286  {
2287  case 0:
2288  {
2289  r=tmin+c;
2290  g=tmin+x;
2291  b=tmin;
2292  break;
2293  }
2294  case 1:
2295  {
2296  r=tmin+x;
2297  g=tmin+c;
2298  b=tmin;
2299  break;
2300  }
2301  case 2:
2302  {
2303  r=tmin;
2304  g=tmin+c;
2305  b=tmin+x;
2306  break;
2307  }
2308  case 3:
2309  {
2310  r=tmin;
2311  g=tmin+x;
2312  b=tmin+c;
2313  break;
2314  }
2315  case 4:
2316  {
2317  r=tmin+x;
2318  g=tmin;
2319  b=tmin+c;
2320  break;
2321  }
2322  case 5:
2323  {
2324  r=tmin+c;
2325  g=tmin;
2326  b=tmin+x;
2327  break;
2328  }
2329  default:
2330  {
2331  r=0.0;
2332  g=0.0;
2333  b=0.0;
2334  }
2335  }
2336  *red=ClampToQuantum(QuantumRange*r);
2337  *green=ClampToQuantum(QuantumRange*g);
2338  *blue=ClampToQuantum(QuantumRange*b);
2339  }
2340 
2341  inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2342  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2343  {
2344  float
2345  hue,
2346  lightness,
2347  saturation;
2348 
2349  /*
2350  Increase or decrease color lightness, saturation, or hue.
2351  */
2352  ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2353  hue+=0.5*(0.01*percent_hue-1.0);
2354  while (hue < 0.0)
2355  hue+=1.0;
2356  while (hue >= 1.0)
2357  hue-=1.0;
2358  saturation*=0.01*percent_saturation;
2359  lightness*=0.01*percent_lightness;
2360  ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2361  }
2362 
2363  __kernel void Modulate(__global CLPixelType *im,
2364  const float percent_brightness,
2365  const float percent_hue,
2366  const float percent_saturation,
2367  const int colorspace)
2368  {
2369 
2370  const int x = get_global_id(0);
2371  const int y = get_global_id(1);
2372  const int columns = get_global_size(0);
2373  const int c = x + y * columns;
2374 
2375  CLPixelType pixel = im[c];
2376 
2377  CLQuantum
2378  blue,
2379  green,
2380  red;
2381 
2382  red=getRed(pixel);
2383  green=getGreen(pixel);
2384  blue=getBlue(pixel);
2385 
2386  switch (colorspace)
2387  {
2388  case HSLColorspace:
2389  default:
2390  {
2391  ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2392  &red, &green, &blue);
2393  }
2394 
2395  }
2396 
2397  CLPixelType filteredPixel;
2398 
2399  setRed(&filteredPixel, red);
2400  setGreen(&filteredPixel, green);
2401  setBlue(&filteredPixel, blue);
2402  filteredPixel.w = pixel.w;
2403 
2404  im[c] = filteredPixel;
2405  }
2406  )
2407 
2408 /*
2409 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2410 % %
2411 % %
2412 % %
2413 % M o t i o n B l u r %
2414 % %
2415 % %
2416 % %
2417 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2418 */
2419 
2420  STRINGIFY(
2421  __kernel
2422  void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2423  const unsigned int imageWidth, const unsigned int imageHeight,
2424  const __global float *filter, const unsigned int width, const __global int2* offset,
2425  const float4 bias,
2426  const ChannelType channel, const unsigned int matte) {
2427 
2428  int2 currentPixel;
2429  currentPixel.x = get_global_id(0);
2430  currentPixel.y = get_global_id(1);
2431 
2432  if (currentPixel.x >= imageWidth
2433  || currentPixel.y >= imageHeight)
2434  return;
2435 
2436  float4 pixel;
2437  pixel.x = (float)bias.x;
2438  pixel.y = (float)bias.y;
2439  pixel.z = (float)bias.z;
2440  pixel.w = (float)bias.w;
2441 
2442  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
2443 
2444  for (int i = 0; i < width; i++) {
2445  // only support EdgeVirtualPixelMethod through ClampToCanvas
2446  // TODO: implement other virtual pixel method
2447  int2 samplePixel = currentPixel + offset[i];
2448  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2449  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2450  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2451 
2452  pixel.x += (filter[i] * (float)samplePixelValue.x);
2453  pixel.y += (filter[i] * (float)samplePixelValue.y);
2454  pixel.z += (filter[i] * (float)samplePixelValue.z);
2455  pixel.w += (filter[i] * (float)samplePixelValue.w);
2456  }
2457 
2458  CLPixelType outputPixel;
2459  outputPixel.x = ClampToQuantum(pixel.x);
2460  outputPixel.y = ClampToQuantum(pixel.y);
2461  outputPixel.z = ClampToQuantum(pixel.z);
2462  outputPixel.w = ClampToQuantum(pixel.w);
2463  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2464  }
2465  else {
2466 
2467  float gamma = 0.0f;
2468  for (int i = 0; i < width; i++) {
2469  // only support EdgeVirtualPixelMethod through ClampToCanvas
2470  // TODO: implement other virtual pixel method
2471  int2 samplePixel = currentPixel + offset[i];
2472  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2473  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2474 
2475  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2476 
2477  float alpha = QuantumScale*(QuantumRange-samplePixelValue.w);
2478  float k = filter[i];
2479  pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2480  pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2481  pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2482 
2483  pixel.w += k * alpha * samplePixelValue.w;
2484 
2485  gamma+=k*alpha;
2486  }
2487  gamma = PerceptibleReciprocal(gamma);
2488  pixel.xyz = gamma*pixel.xyz;
2489 
2490  CLPixelType outputPixel;
2491  outputPixel.x = ClampToQuantum(pixel.x);
2492  outputPixel.y = ClampToQuantum(pixel.y);
2493  outputPixel.z = ClampToQuantum(pixel.z);
2494  outputPixel.w = ClampToQuantum(pixel.w);
2495  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2496  }
2497  }
2498  )
2499 
2500 /*
2501 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2502 % %
2503 % %
2504 % %
2505 % R a d i a l B l u r %
2506 % %
2507 % %
2508 % %
2509 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2510 */
2511 
2512  STRINGIFY(
2513  __kernel void RadialBlur(const __global CLPixelType *im, __global CLPixelType *filtered_im,
2514  const float4 bias,
2515  const unsigned int channel, const unsigned int matte,
2516  const float2 blurCenter,
2517  __constant float *cos_theta, __constant float *sin_theta,
2518  const unsigned int cossin_theta_size)
2519  {
2520  const int x = get_global_id(0);
2521  const int y = get_global_id(1);
2522  const int columns = get_global_size(0);
2523  const int rows = get_global_size(1);
2524  unsigned int step = 1;
2525  float center_x = (float) x - blurCenter.x;
2526  float center_y = (float) y - blurCenter.y;
2527  float radius = hypot(center_x, center_y);
2528 
2529  //float blur_radius = hypot((float) columns/2.0f, (float) rows/2.0f);
2530  float blur_radius = hypot(blurCenter.x, blurCenter.y);
2531 
2532  if (radius > MagickEpsilon)
2533  {
2534  step = (unsigned int) (blur_radius / radius);
2535  if (step == 0)
2536  step = 1;
2537  if (step >= cossin_theta_size)
2538  step = cossin_theta_size-1;
2539  }
2540 
2541  float4 result;
2542  result.x = (float)bias.x;
2543  result.y = (float)bias.y;
2544  result.z = (float)bias.z;
2545  result.w = (float)bias.w;
2546  float normalize = 0.0f;
2547 
2548  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
2549  for (unsigned int i=0; i<cossin_theta_size; i+=step)
2550  {
2551  result += convert_float4(im[
2552  ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
2553  ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
2554  normalize += 1.0f;
2555  }
2556  normalize = PerceptibleReciprocal(normalize);
2557  result = result * normalize;
2558  }
2559  else {
2560  float gamma = 0.0f;
2561  for (unsigned int i=0; i<cossin_theta_size; i+=step)
2562  {
2563  float4 p = convert_float4(im[
2564  ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
2565  ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
2566 
2567  float alpha = (float)(QuantumScale*(QuantumRange-p.w));
2568  result.x += alpha * p.x;
2569  result.y += alpha * p.y;
2570  result.z += alpha * p.z;
2571  result.w += p.w;
2572  gamma+=alpha;
2573  normalize += 1.0f;
2574  }
2575  gamma = PerceptibleReciprocal(gamma);
2576  normalize = PerceptibleReciprocal(normalize);
2577  result.x = gamma*result.x;
2578  result.y = gamma*result.y;
2579  result.z = gamma*result.z;
2580  result.w = normalize*result.w;
2581  }
2582  filtered_im[y * columns + x] = (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
2583  ClampToQuantum(result.z), ClampToQuantum(result.w));
2584  }
2585  )
2586 
2587 /*
2588 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2589 % %
2590 % %
2591 % %
2592 % R e s i z e %
2593 % %
2594 % %
2595 % %
2596 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2597 */
2598 
2599  STRINGIFY(
2600  // Based on Box from resize.c
2601  float BoxResizeFilter(const float x)
2602  {
2603  return 1.0f;
2604  }
2605  )
2606 
2607  STRINGIFY(
2608  // Based on CubicBC from resize.c
2609  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2610  {
2611  /*
2612  Cubic Filters using B,C determined values:
2613  Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter
2614  Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears
2615  Spline B = 1 C = 0 B-Spline Gaussian approximation
2616  Hermite B = 0 C = 0 B-Spline interpolator
2617 
2618  See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2619  Graphics Computer Graphics, Volume 22, Number 4, August 1988
2620  http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2621  Mitchell.pdf.
2622 
2623  Coefficents are determined from B,C values:
2624  P0 = ( 6 - 2*B )/6 = coeff[0]
2625  P1 = 0
2626  P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2627  P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2628  Q0 = ( 8*B +24*C )/6 = coeff[3]
2629  Q1 = ( -12*B -48*C )/6 = coeff[4]
2630  Q2 = ( 6*B +30*C )/6 = coeff[5]
2631  Q3 = ( - 1*B - 6*C )/6 = coeff[6]
2632 
2633  which are used to define the filter:
2634 
2635  P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1
2636  Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2
2637 
2638  which ensures function is continuous in value and derivative (slope).
2639  */
2640  if (x < 1.0)
2641  return(resizeFilterCoefficients[0]+x*(x*
2642  (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2643  if (x < 2.0)
2644  return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2645  (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2646  return(0.0);
2647  }
2648  )
2649 
2650  STRINGIFY(
2651  float Sinc(const float x)
2652  {
2653  if (x != 0.0f)
2654  {
2655  const float alpha=(float) (MagickPI*x);
2656  return sinpi(x)/alpha;
2657  }
2658  return(1.0f);
2659  }
2660  )
2661 
2662  STRINGIFY(
2663  float Triangle(const float x)
2664  {
2665  /*
2666  1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2667  a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function
2668  for Sinc().
2669  */
2670  return ((x<1.0f)?(1.0f-x):0.0f);
2671  }
2672  )
2673 
2674 
2675  STRINGIFY(
2676  float Hanning(const float x)
2677  {
2678  /*
2679  Cosine window function:
2680  0.5+0.5*cos(pi*x).
2681  */
2682  const float cosine=cos((MagickPI*x));
2683  return(0.5f+0.5f*cosine);
2684  }
2685  )
2686 
2687  STRINGIFY(
2688  float Hamming(const float x)
2689  {
2690  /*
2691  Offset cosine window function:
2692  .54 + .46 cos(pi x).
2693  */
2694  const float cosine=cos((MagickPI*x));
2695  return(0.54f+0.46f*cosine);
2696  }
2697  )
2698 
2699  STRINGIFY(
2700  float Blackman(const float x)
2701  {
2702  /*
2703  Blackman: 2nd order cosine windowing function:
2704  0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2705 
2706  Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2707  five flops.
2708  */
2709  const float cosine=cos((MagickPI*x));
2710  return(0.34f+cosine*(0.5f+cosine*0.16f));
2711  }
2712  )
2713 
2714 
2715 
2716 
2717  STRINGIFY(
2718  inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2719  {
2720  switch (filterType)
2721  {
2722  /* Call Sinc even for SincFast to get better precision on GPU
2723  and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/
2724  case SincWeightingFunction:
2726  return Sinc(x);
2728  return CubicBC(x,filterCoefficients);
2729  case BoxWeightingFunction:
2730  return BoxResizeFilter(x);
2732  return Triangle(x);
2734  return Hanning(x);
2736  return Hamming(x);
2738  return Blackman(x);
2739 
2740  default:
2741  return 0.0f;
2742  }
2743  }
2744  )
2745 
2746 
2747  STRINGIFY(
2748  inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2749  , const ResizeWeightingFunctionType resizeWindowType
2750  , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2751  {
2752  float scale;
2753  float xBlur = fabs(x/resizeFilterBlur);
2754  if (resizeWindowSupport < MagickEpsilon
2755  || resizeWindowType == BoxWeightingFunction)
2756  {
2757  scale = 1.0f;
2758  }
2759  else
2760  {
2761  scale = resizeFilterScale;
2762  scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2763  }
2764  float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2765  return weight;
2766  }
2767 
2768  )
2769 
2770  ;
2771  const char* accelerateKernels2 =
2772 
2773  STRINGIFY(
2774 
2775  inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2776  return (numWorkItems/pixelPerWorkgroup);
2777  }
2778 
2779  // returns the index of the pixel for the current workitem to compute.
2780  // returns -1 if this workitem doesn't need to participate in any computation
2781  inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2782  const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2783  int pixelIndex = itemID/numWorkItemsPerPixel;
2784  pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2785  return pixelIndex;
2786  }
2787 
2788  )
2789 
2790  STRINGIFY(
2791  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2792  void ResizeHorizontalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2793  , const float xFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2794  , const int resizeFilterType, const int resizeWindowType
2795  , const __global float* resizeFilterCubicCoefficients
2796  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2797  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2798  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2799 
2800 
2801  // calculate the range of resized image pixels computed by this workgroup
2802  const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2803  const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2804  const unsigned int actualNumPixelToCompute = stopX - startX;
2805 
2806  // calculate the range of input image pixels to cache
2807  float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2808  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2809  scale = PerceptibleReciprocal(scale);
2810 
2811  const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2812  const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2813 
2814  // cache the input pixels into local memory
2815  const unsigned int y = get_global_id(1);
2816  event_t e = async_work_group_copy(inputImageCache,inputImage+y*inputColumns+cacheRangeStartX,cacheRangeEndX-cacheRangeStartX,0);
2817  wait_group_events(1,&e);
2818 
2819  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2820  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2821  {
2822 
2823  const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2824  const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2825  const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2826 
2827  // determine which resized pixel computed by this workitem
2828  const unsigned int itemID = get_local_id(0);
2829  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2830 
2831  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2832 
2833  float4 filteredPixel = (float4)0.0f;
2834  float density = 0.0f;
2835  float gamma = 0.0f;
2836  // -1 means this workitem doesn't participate in the computation
2837  if (pixelIndex != -1) {
2838 
2839  // x coordinated of the resized pixel computed by this workitem
2840  const int x = chunkStartX + pixelIndex;
2841 
2842  // calculate how many steps required for this pixel
2843  const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2844  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2845  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2846  const unsigned int n = stop - start;
2847 
2848  // calculate how many steps this workitem will contribute
2849  unsigned int numStepsPerWorkItem = n / numItems;
2850  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2851 
2852  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2853  if (startStep < n) {
2854  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2855 
2856  unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2857  if (matte == 0) {
2858 
2859  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2860  float4 cp = convert_float4(inputImageCache[cacheIndex]);
2861 
2862  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2863  , (ResizeWeightingFunctionType)resizeWindowType
2864  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2865 
2866  filteredPixel += ((float4)weight)*cp;
2867  density+=weight;
2868  }
2869 
2870 
2871  }
2872  else {
2873  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2874  CLPixelType p = inputImageCache[cacheIndex];
2875 
2876  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2877  , (ResizeWeightingFunctionType)resizeWindowType
2878  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2879 
2880  float alpha = weight * QuantumScale * GetPixelAlpha(p);
2881  float4 cp = convert_float4(p);
2882 
2883  filteredPixel.x += alpha * cp.x;
2884  filteredPixel.y += alpha * cp.y;
2885  filteredPixel.z += alpha * cp.z;
2886  filteredPixel.w += weight * cp.w;
2887 
2888  density+=weight;
2889  gamma+=alpha;
2890  }
2891  }
2892  }
2893  }
2894 
2895  // initialize the accumulators to zero
2896  if (itemID < actualNumPixelInThisChunk) {
2897  outputPixelCache[itemID] = (float4)0.0f;
2898  densityCache[itemID] = 0.0f;
2899  if (matte != 0)
2900  gammaCache[itemID] = 0.0f;
2901  }
2902  barrier(CLK_LOCAL_MEM_FENCE);
2903 
2904  // accumulatte the filtered pixel value and the density
2905  for (unsigned int i = 0; i < numItems; i++) {
2906  if (pixelIndex != -1) {
2907  if (itemID%numItems == i) {
2908  outputPixelCache[pixelIndex]+=filteredPixel;
2909  densityCache[pixelIndex]+=density;
2910  if (matte!=0) {
2911  gammaCache[pixelIndex]+=gamma;
2912  }
2913  }
2914  }
2915  barrier(CLK_LOCAL_MEM_FENCE);
2916  }
2917 
2918  if (itemID < actualNumPixelInThisChunk) {
2919  if (matte==0) {
2920  float density = densityCache[itemID];
2921  float4 filteredPixel = outputPixelCache[itemID];
2922  if (density!= 0.0f && density != 1.0)
2923  {
2924  density = PerceptibleReciprocal(density);
2925  filteredPixel *= (float4)density;
2926  }
2927  filteredImage[y*filteredColumns+chunkStartX+itemID] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
2928  , ClampToQuantum(filteredPixel.y)
2929  , ClampToQuantum(filteredPixel.z)
2930  , ClampToQuantum(filteredPixel.w));
2931  }
2932  else {
2933  float density = densityCache[itemID];
2934  float gamma = gammaCache[itemID];
2935  float4 filteredPixel = outputPixelCache[itemID];
2936 
2937  if (density!= 0.0f && density != 1.0) {
2938  density = PerceptibleReciprocal(density);
2939  filteredPixel *= (float4)density;
2940  gamma *= density;
2941  }
2942  gamma = PerceptibleReciprocal(gamma);
2943 
2944  CLPixelType fp;
2945  fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
2946  , ClampToQuantum(gamma*filteredPixel.y)
2947  , ClampToQuantum(gamma*filteredPixel.z)
2948  , ClampToQuantum(filteredPixel.w));
2949 
2950  filteredImage[y*filteredColumns+chunkStartX+itemID] = fp;
2951 
2952  }
2953  }
2954 
2955  } // end of chunking loop
2956  }
2957  )
2958 
2959 
2960  STRINGIFY(
2961  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2962  void ResizeVerticalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2963  , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2964  , const int resizeFilterType, const int resizeWindowType
2965  , const __global float* resizeFilterCubicCoefficients
2966  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2967  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2968  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2969 
2970 
2971  // calculate the range of resized image pixels computed by this workgroup
2972  const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2973  const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2974  const unsigned int actualNumPixelToCompute = stopY - startY;
2975 
2976  // calculate the range of input image pixels to cache
2977  float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2978  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2979  scale = PerceptibleReciprocal(scale);
2980 
2981  const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2982  const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2983 
2984  // cache the input pixels into local memory
2985  const unsigned int x = get_global_id(0);
2986  event_t e = async_work_group_strided_copy(inputImageCache, inputImage+cacheRangeStartY*inputColumns+x, cacheRangeEndY-cacheRangeStartY, inputColumns, 0);
2987  wait_group_events(1,&e);
2988 
2989  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2990  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2991  {
2992 
2993  const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2994  const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2995  const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2996 
2997  // determine which resized pixel computed by this workitem
2998  const unsigned int itemID = get_local_id(1);
2999  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
3000 
3001  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
3002 
3003  float4 filteredPixel = (float4)0.0f;
3004  float density = 0.0f;
3005  float gamma = 0.0f;
3006  // -1 means this workitem doesn't participate in the computation
3007  if (pixelIndex != -1) {
3008 
3009  // x coordinated of the resized pixel computed by this workitem
3010  const int y = chunkStartY + pixelIndex;
3011 
3012  // calculate how many steps required for this pixel
3013  const float bisect = (y+0.5)/yFactor+MagickEpsilon;
3014  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
3015  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
3016  const unsigned int n = stop - start;
3017 
3018  // calculate how many steps this workitem will contribute
3019  unsigned int numStepsPerWorkItem = n / numItems;
3020  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
3021 
3022  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
3023  if (startStep < n) {
3024  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
3025 
3026  unsigned int cacheIndex = start+startStep-cacheRangeStartY;
3027  if (matte == 0) {
3028 
3029  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
3030  float4 cp = convert_float4(inputImageCache[cacheIndex]);
3031 
3032  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
3033  , (ResizeWeightingFunctionType)resizeWindowType
3034  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
3035 
3036  filteredPixel += ((float4)weight)*cp;
3037  density+=weight;
3038  }
3039 
3040 
3041  }
3042  else {
3043  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
3044  CLPixelType p = inputImageCache[cacheIndex];
3045 
3046  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
3047  , (ResizeWeightingFunctionType)resizeWindowType
3048  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
3049 
3050  float alpha = weight * QuantumScale * GetPixelAlpha(p);
3051  float4 cp = convert_float4(p);
3052 
3053  filteredPixel.x += alpha * cp.x;
3054  filteredPixel.y += alpha * cp.y;
3055  filteredPixel.z += alpha * cp.z;
3056  filteredPixel.w += weight * cp.w;
3057 
3058  density+=weight;
3059  gamma+=alpha;
3060  }
3061  }
3062  }
3063  }
3064 
3065  // initialize the accumulators to zero
3066  if (itemID < actualNumPixelInThisChunk) {
3067  outputPixelCache[itemID] = (float4)0.0f;
3068  densityCache[itemID] = 0.0f;
3069  if (matte != 0)
3070  gammaCache[itemID] = 0.0f;
3071  }
3072  barrier(CLK_LOCAL_MEM_FENCE);
3073 
3074  // accumulatte the filtered pixel value and the density
3075  for (unsigned int i = 0; i < numItems; i++) {
3076  if (pixelIndex != -1) {
3077  if (itemID%numItems == i) {
3078  outputPixelCache[pixelIndex]+=filteredPixel;
3079  densityCache[pixelIndex]+=density;
3080  if (matte!=0) {
3081  gammaCache[pixelIndex]+=gamma;
3082  }
3083  }
3084  }
3085  barrier(CLK_LOCAL_MEM_FENCE);
3086  }
3087 
3088  if (itemID < actualNumPixelInThisChunk) {
3089  if (matte==0) {
3090  float density = densityCache[itemID];
3091  float4 filteredPixel = outputPixelCache[itemID];
3092  if (density!= 0.0f && density != 1.0)
3093  {
3094  density = PerceptibleReciprocal(density);
3095  filteredPixel *= (float4)density;
3096  }
3097  filteredImage[(chunkStartY+itemID)*filteredColumns+x] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
3098  , ClampToQuantum(filteredPixel.y)
3099  , ClampToQuantum(filteredPixel.z)
3100  , ClampToQuantum(filteredPixel.w));
3101  }
3102  else {
3103  float density = densityCache[itemID];
3104  float gamma = gammaCache[itemID];
3105  float4 filteredPixel = outputPixelCache[itemID];
3106 
3107  if (density!= 0.0f && density != 1.0) {
3108  density = PerceptibleReciprocal(density);
3109  filteredPixel *= (float4)density;
3110  gamma *= density;
3111  }
3112  gamma = PerceptibleReciprocal(gamma);
3113 
3114  CLPixelType fp;
3115  fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
3116  , ClampToQuantum(gamma*filteredPixel.y)
3117  , ClampToQuantum(gamma*filteredPixel.z)
3118  , ClampToQuantum(filteredPixel.w));
3119 
3120  filteredImage[(chunkStartY+itemID)*filteredColumns+x] = fp;
3121 
3122  }
3123  }
3124 
3125  } // end of chunking loop
3126  }
3127  )
3128 
3129 /*
3130 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3131 % %
3132 % %
3133 % %
3134 % U n s h a r p M a s k %
3135 % %
3136 % %
3137 % %
3138 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3139 */
3140 
3141  STRINGIFY(
3142  __kernel void UnsharpMaskBlurColumn(const __global CLPixelType* inputImage,
3143  const __global float4 *blurRowData, __global CLPixelType *filtered_im,
3144  const unsigned int imageColumns, const unsigned int imageRows,
3145  __local float4* cachedData, __local float* cachedFilter,
3146  const ChannelType channel, const __global float *filter, const unsigned int width,
3147  const float gain, const float threshold)
3148  {
3149  const unsigned int radius = (width-1)/2;
3150 
3151  // cache the pixel shared by the workgroup
3152  const int groupX = get_group_id(0);
3153  const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
3154  const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
3155 
3156  if (groupStartY >= 0
3157  && groupStopY < imageRows) {
3158  event_t e = async_work_group_strided_copy(cachedData
3159  ,blurRowData+groupStartY*imageColumns+groupX
3160  ,groupStopY-groupStartY,imageColumns,0);
3161  wait_group_events(1,&e);
3162  }
3163  else {
3164  for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
3165  cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,imageRows)*imageColumns+ groupX];
3166  }
3167  barrier(CLK_LOCAL_MEM_FENCE);
3168  }
3169  // cache the filter as well
3170  event_t e = async_work_group_copy(cachedFilter,filter,width,0);
3171  wait_group_events(1,&e);
3172 
3173  // only do the work if this is not a patched item
3174  //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
3175  const int cy = get_global_id(1);
3176 
3177  if (cy < imageRows) {
3178  float4 blurredPixel = (float4) 0.0f;
3179 
3180  int i = 0;
3181 
3182  \n #ifndef UFACTOR \n
3183  \n #define UFACTOR 8 \n
3184  \n #endif \n
3185 
3186  for ( ; i+UFACTOR < width; )
3187  {
3188  \n #pragma unroll UFACTOR \n
3189  for (int j=0; j < UFACTOR; j++, i++)
3190  {
3191  blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3192  }
3193  }
3194 
3195  for ( ; i < width; i++)
3196  {
3197  blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3198  }
3199 
3200  blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
3201  ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
3202 
3203  float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
3204  float4 outputPixel = inputImagePixel - blurredPixel;
3205 
3206  float quantumThreshold = QuantumRange*threshold;
3207 
3208  int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
3209  outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
3210 
3211  //write back
3212  filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
3213  ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
3214 
3215  }
3216  }
3217  )
3218 
3219 
3220 
3221  STRINGIFY(
3222  __kernel void UnsharpMask(__global CLPixelType *im, __global CLPixelType *filtered_im,
3223  __constant float *filter,
3224  const unsigned int width,
3225  const unsigned int imageColumns, const unsigned int imageRows,
3226  __local float4 *pixels,
3227  const float gain, const float threshold, const unsigned int justBlur)
3228  {
3229  const int x = get_global_id(0);
3230  const int y = get_global_id(1);
3231 
3232  const unsigned int radius = (width - 1) / 2;
3233 
3234  int row = y - radius;
3235  int baseRow = get_group_id(1) * get_local_size(1) - radius;
3236  int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
3237 
3238  while (row < endRow) {
3239  int srcy = (row < 0) ? -row : row; // mirror pad
3240  srcy = (srcy >= imageRows) ? (2 * imageRows - srcy - 1) : srcy;
3241 
3242  float4 value = 0.0f;
3243 
3244  int ix = x - radius;
3245  int i = 0;
3246 
3247  while (i + 7 < width) {
3248  for (int j = 0; j < 8; ++j) { // unrolled
3249  int srcx = ix + j;
3250  srcx = (srcx < 0) ? -srcx : srcx;
3251  srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
3252  value += filter[i + j] * convert_float4(im[srcx + srcy * imageColumns]);
3253  }
3254  ix += 8;
3255  i += 8;
3256  }
3257 
3258  while (i < width) {
3259  int srcx = (ix < 0) ? -ix : ix; // mirror pad
3260  srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
3261  value += filter[i] * convert_float4(im[srcx + srcy * imageColumns]);
3262  ++i;
3263  ++ix;
3264  }
3265  pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
3266  row += get_local_size(1);
3267  }
3268 
3269 
3270  barrier(CLK_LOCAL_MEM_FENCE);
3271 
3272 
3273  const int px = get_local_id(0);
3274  const int py = get_local_id(1);
3275  const int prp = get_local_size(0);
3276  float4 value = (float4)(0.0f);
3277 
3278  int i = 0;
3279  while (i + 7 < width) { // unrolled
3280  value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3281  value += (float4)(filter[i]) * pixels[px + (py + i + 1) * prp];
3282  value += (float4)(filter[i]) * pixels[px + (py + i + 2) * prp];
3283  value += (float4)(filter[i]) * pixels[px + (py + i + 3) * prp];
3284  value += (float4)(filter[i]) * pixels[px + (py + i + 4) * prp];
3285  value += (float4)(filter[i]) * pixels[px + (py + i + 5) * prp];
3286  value += (float4)(filter[i]) * pixels[px + (py + i + 6) * prp];
3287  value += (float4)(filter[i]) * pixels[px + (py + i + 7) * prp];
3288  i += 8;
3289  }
3290  while (i < width) {
3291  value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3292  ++i;
3293  }
3294 
3295  if (justBlur == 0) { // apply sharpening
3296  float4 srcPixel = convert_float4(im[x + y * imageColumns]);
3297  float4 diff = srcPixel - value;
3298 
3299  float quantumThreshold = QuantumRange*threshold;
3300 
3301  int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3302  value = select(srcPixel + diff * gain, srcPixel, mask);
3303  }
3304 
3305  if ((x < imageColumns) && (y < imageRows))
3306  filtered_im[x + y * imageColumns] = (CLPixelType)(ClampToQuantum(value.s0), ClampToQuantum(value.s1), ClampToQuantum(value.s2), ClampToQuantum(value.s3));
3307  }
3308  )
3309 
3310  STRINGIFY(
3311  __kernel __attribute__((reqd_work_group_size(64, 4, 1))) void WaveletDenoise(__global CLPixelType *srcImage, __global CLPixelType *dstImage,
3312  const float threshold,
3313  const int passes,
3314  const int imageWidth,
3315  const int imageHeight)
3316  {
3317  const int pad = (1 << (passes - 1));;
3318  const int tileSize = 64;
3319  const int tileRowPixels = 64;
3320  const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3321 
3322  CLPixelType stage[16];
3323 
3324  local float buffer[64 * 64];
3325 
3326  int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
3327  int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
3328 
3329  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3330  stage[i / 4] = srcImage[mirrorTop(mirrorBottom(srcx), imageWidth) + (mirrorTop(mirrorBottom(srcy + i) , imageHeight)) * imageWidth];
3331  }
3332 
3333 
3334  for (int channel = 0; channel < 3; ++channel) {
3335  // Load LDS
3336  switch (channel) {
3337  case 0:
3338  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3339  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s0);
3340  break;
3341  case 1:
3342  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3343  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s1);
3344  break;
3345  case 2:
3346  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3347  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s2);
3348  break;
3349  }
3350 
3351 
3352  // Process
3353 
3354  float tmp[16];
3355  float accum[16];
3356  float pixel;
3357 
3358  for (int pass = 0; pass < passes; ++pass) {
3359  const int radius = 1 << pass;
3360  const int x = get_local_id(0);
3361  const float thresh = threshold * noise[pass];
3362 
3363  if (pass == 0)
3364  accum[0] = accum[1] = accum[2] = accum[3] = accum[4] = accum[5] = accum[6] = accum[6] = accum[7] = accum[8] = accum[9] = accum[10] = accum[11] = accum[12] = accum[13] = accum[14] = accum[15] = 0.0f;
3365 
3366  // Snapshot input
3367 
3368  // Apply horizontal hat
3369  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3370  const int offset = i * tileRowPixels;
3371  if (pass == 0)
3372  tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3373  pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3374  barrier(CLK_LOCAL_MEM_FENCE);
3375  buffer[x + offset] = pixel;
3376  }
3377  barrier(CLK_LOCAL_MEM_FENCE);
3378  // Apply vertical hat
3379  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3380  pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3381  float delta = tmp[i / 4] - pixel;
3382  tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3383  if (delta < -thresh)
3384  delta += thresh;
3385  else if (delta > thresh)
3386  delta -= thresh;
3387  else
3388  delta = 0;
3389  accum[i / 4] += delta;
3390 
3391  }
3392  barrier(CLK_LOCAL_MEM_FENCE);
3393  if (pass < passes - 1)
3394  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3395  buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3396  else // last pass
3397  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3398  accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3399  barrier(CLK_LOCAL_MEM_FENCE);
3400  }
3401 
3402  switch (channel) {
3403  case 0:
3404  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3405  stage[i / 4].s0 = ClampToQuantum(accum[i / 4]);
3406  break;
3407  case 1:
3408  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3409  stage[i / 4].s1 = ClampToQuantum(accum[i / 4]);
3410  break;
3411  case 2:
3412  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3413  stage[i / 4].s2 = ClampToQuantum(accum[i / 4]);
3414  break;
3415  }
3416 
3417  barrier(CLK_LOCAL_MEM_FENCE);
3418  }
3419 
3420  // Write from stage to output
3421 
3422  if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3423  //for (int i = pad + get_local_id(1); i < tileSize - pad; i += get_local_size(1)) {
3424  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3425  if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3426  dstImage[srcx + (srcy + i) * imageWidth] = stage[i / 4];
3427  }
3428  }
3429  }
3430  }
3431  )
3432  ;
3433 
3434 #endif // MAGICKCORE_OPENCL_SUPPORT
3435 
3436 #if defined(__cplusplus) || defined(c_plusplus)
3437 }
3438 #endif
3439 
3440 #endif // MAGICKCORE_ACCELERATE_PRIVATE_H
Definition: composite.h:91
Definition: composite.h:94
Definition: composite.h:65
Definition: colorspace.h:44
Definition: resize-private.h:31
Definition: colorspace.h:36
#define SigmaPoisson
Definition: resize-private.h:37
Definition: pixel.h:75
Definition: statistic.h:116
Definition: resize-private.h:33
Definition: magick-type.h:198
Definition: composite.h:75
Definition: pixel.h:72
Definition: colorspace.h:40
static void MagickPixelCompositeBlend(const MagickPixelPacket *p, const MagickRealType alpha, const MagickPixelPacket *q, const MagickRealType beta, MagickPixelPacket *composite)
Definition: composite-private.h:138
Definition: composite.h:31
Definition: composite.h:93
Definition: colorspace.h:45
Definition: colorspace.h:33
Definition: composite.h:80
#define SigmaRandom
Definition: composite.h:33
Definition: resize-private.h:40
Definition: composite.h:90
Definition: resize-private.h:29
static MagickRealType ColorDodge(const MagickRealType Sca, const MagickRealType Sa, const MagickRealType Dca, const MagickRealType Da)
Definition: composite.c:293
Definition: fx.h:34
PixelIntensityMethod
Definition: pixel.h:67
Definition: magick-type.h:187
Definition: composite.h:95
Definition: colorspace.h:59
Definition: magick-type.h:193
Definition: composite.h:59
Definition: composite.h:89
Definition: magick-type.h:182
Definition: composite.h:27
Definition: colorspace.h:41
Definition: colorspace.h:37
static MagickRealType RoundToUnity(const MagickRealType value)
Definition: composite-private.h:33
Definition: composite.h:35
Definition: composite.h:87
#define MagickPI
Definition: image-private.h:36
Definition: colorspace.h:58
Definition: colorspace.h:50
static MagickRealType Hanning(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:287
Definition: colorspace.h:47
Definition: fx.h:29
float MagickRealType
Definition: magick-type.h:76
Definition: statistic.h:115
Definition: colorspace.h:31
#define MAGICKCORE_QUANTUM_DEPTH
Definition: magick-type.h:28
Definition: composite.h:53
Definition: colorspace.h:35
Definition: resize-private.h:38
Definition: pixel.h:77
#define MagickEpsilon
Definition: magick-type.h:139
#define SigmaLaplacian
MagickExport void ConvertRGBToHSL(const Quantum red, const Quantum green, const Quantum blue, double *hue, double *saturation, double *lightness)
Definition: gem.c:1127
Definition: magick-type.h:188
Definition: colorspace.h:48
Definition: statistic.h:117
Definition: magick-type.h:200
NoiseType
Definition: fx.h:27
Definition: colorspace.h:52
Definition: composite.h:47
static MagickRealType Hamming(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:301
Definition: resize-private.h:41
Definition: composite.h:73
Definition: composite.h:29
Definition: composite.h:72
Definition: composite.h:42
Definition: colorspace.h:43
#define SigmaUniform
Definition: composite.h:97
static void ModulateHSL(const double percent_hue, const double percent_saturation, const double percent_lightness, Quantum *red, Quantum *green, Quantum *blue)
Definition: enhance.c:3554
Definition: colorspace.h:34
Definition: colorspace.h:57
Definition: resize-private.h:30
static double PerceptibleReciprocal(const double x)
Definition: pixel-accessor.h:124
Definition: composite.h:54
#define GetPixelAlpha(pixel)
Definition: pixel-accessor.h:36
Definition: composite.h:38
Definition: composite.h:68
Definition: composite.h:96
Definition: magick-type.h:184
Definition: composite.h:71
Definition: resize-private.h:32
Definition: composite.h:55
Definition: composite.h:56
Definition: fx.h:31
#define SigmaGaussian
Definition: composite.h:69
Definition: pixel.h:71
static Quantum ApplyFunction(Quantum pixel, const MagickFunction function, const size_t number_parameters, const double *parameters, ExceptionInfo *exception)
Definition: statistic.c:938
Definition: colorspace.h:38
Definition: pixel.h:70
Definition: composite.h:86
Definition: resize-private.h:36
Definition: colorspace.h:30
#define SigmaMultiplicativeGaussian
Definition: composite.h:49
Definition: composite.h:44
#define TauGaussian
MagickExport void ConvertRGBToHSB(const Quantum red, const Quantum green, const Quantum blue, double *hue, double *saturation, double *brightness)
Definition: gem.c:994
Definition: magick-type.h:186
static void Contrast(const int sign, Quantum *red, Quantum *green, Quantum *blue)
Definition: enhance.c:913
Definition: magick-type.h:201
Definition: composite.h:46
Definition: statistic.h:113
Definition: composite.h:28
Definition: magick-type.h:181
Definition: magick-type.h:190
Definition: colorspace.h:54
Definition: magick-type.h:189
Definition: resize-private.h:39
Definition: composite.h:78
Definition: resize-private.h:34
#define QuantumScale
Definition: magick-type.h:142
Definition: colorspace.h:55
Definition: fx.h:33
Definition: composite.h:62
Definition: colorspace.h:39
#define MaxMap
Definition: magick-type.h:70
Definition: magick-type.h:197
#define MagickMax(x, y)
Definition: image-private.h:34
Definition: composite.h:98
Definition: composite.h:39
static void CompositeColorDodge(const MagickPixelPacket *p, const MagickPixelPacket *q, MagickPixelPacket *composite)
Definition: composite.c:330
MagickExport void ConvertHSBToRGB(const double hue, const double saturation, const double brightness, Quantum *red, Quantum *green, Quantum *blue)
Definition: gem.c:284
Definition: composite.h:45
ChannelType
Definition: magick-type.h:177
Definition: composite.h:70
Definition: colorspace.h:46
Definition: resize-private.h:28
Definition: composite.h:81
Definition: composite.h:41
Definition: composite.h:52
Definition: pixel.h:69
Definition: colorspace.h:49
MagickExport void ConvertHSLToRGB(const double hue, const double saturation, const double lightness, Quantum *red, Quantum *green, Quantum *blue)
Definition: gem.c:460
Definition: composite.h:77
static Quantum ClampToQuantum(const MagickRealType value)
Definition: quantum.h:87
Definition: colorspace.h:53
Definition: composite.h:61
Definition: magick-type.h:183
static void MagickPixelCompositePlus(const MagickPixelPacket *p, const MagickRealType alpha, const MagickPixelPacket *q, const MagickRealType beta, MagickPixelPacket *composite)
Definition: composite-private.h:111
Definition: composite.h:76
Definition: magick-type.h:179
Definition: colorspace.h:28
Definition: resize-private.h:42
Definition: composite.h:50
Definition: composite.h:36
Definition: composite.h:43
MagickExport MagickRealType GetPixelIntensity(const Image *image, const PixelPacket *magick_restrict pixel)
Definition: pixel.c:2285
static MagickRealType Sinc(const MagickRealType, const ResizeFilter *)
Definition: composite.h:37
Definition: composite.h:60
Definition: statistic.h:114
ResizeWeightingFunctionType
Definition: resize-private.h:25
static MagickRealType Blackman(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:148
Definition: colorspace.h:56
#define MagickMin(x, y)
Definition: image-private.h:35
ColorspaceType
Definition: colorspace.h:25
Definition: composite.h:32
Definition: colorspace.h:29
Definition: composite.h:88
Definition: colorspace.h:42
#define SigmaImpulse
Definition: composite.h:48
Definition: composite.h:64
Definition: magick-type.h:185
Definition: colorspace.h:51
CompositeOperator
Definition: composite.h:25
Definition: composite.h:79
Definition: magick-type.h:192
Definition: colorspace.h:32
Definition: pixel.h:78
Definition: composite.h:66
Definition: composite.h:30
Definition: colorspace.h:60
Definition: magick-type.h:180
Definition: composite.h:63
Definition: composite.h:58
Definition: composite.h:92
Definition: magick-type.h:199
Definition: composite.h:34
static MagickRealType CubicBC(const MagickRealType x, const ResizeFilter *resize_filter)
Definition: resize.c:210
Definition: resize-private.h:27
Definition: composite.h:74
Definition: colorspace.h:27
MagickFunction
Definition: statistic.h:111
Definition: fx.h:30
Definition: composite.h:40
Definition: composite.h:67
Definition: resize-private.h:35
#define QuantumRange
Definition: magick-type.h:94
static MagickRealType Triangle(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:514
Definition: fx.h:35
Definition: pixel.h:73
Definition: composite.h:51
Definition: fx.h:32
Definition: magick-type.h:191
Definition: fx.h:36
Definition: composite.h:57