Error message here!

Hide Error message here!

Error message here!

Hide Error message here!

Error message here!

Close

## SSE图像算法优化系列三十：GIMP中的Noise Reduction算法原理及快速实现。

Imageshop 2019-11-18 08:45:00 阅读数:37 评论数:0 点赞数:0 收藏数:0

```static void
noise_reduction (float *src_buf, /* source buffer, one pixel to the left
and up from the starting pixel */
int src_stride, /* stridewidth of buffer in pixels */
float *dst_buf, /* destination buffer */
int dst_width, /* width to render */
int dst_height, /* height to render */
int dst_stride) /* stride of target buffer */
{
int c;
int x,y;
int dst_offset;
#define NEIGHBOURS 8
#define AXES (NEIGHBOURS/2)
#define POW2(a) ((a)*(a))
/* core code/formulas to be tweaked for the tuning the implementation */
#define GEN_METRIC(before, center, after) \
POW2((center) * - (before) - (after))
/* Condition used to bail diffusion from a direction */
#define BAIL_CONDITION(new,original) ((new) > (original))
#define SYMMETRY(a) (NEIGHBOURS - (a) - 1) /* point-symmetric neighbour pixel */
#define O(u,v) (((u)+((v) * src_stride)) * 4)
int offsets[NEIGHBOURS] = { /* array of the relative distance i float
* pointers to each of neighbours
* in source buffer, allows quick referencing.
*/
O( -, -), O(, -), O(, -),
O( -, ), O(, ),
O( -, ), O(, ), O(, )};
#undef O
dst_offset = ;
for (y=; y<dst_height; y++)
{
float *center_pix = src_buf + ((y+) * src_stride + ) * ;
dst_offset = dst_stride * y;
for (x=; x<dst_width; x++)
{
for (c=; c<; c++) /* do each color component individually */
{
float metric_reference[AXES];
int axis;
int direction;
float sum;
int count;
for (axis = ; axis < AXES; axis++)
{ /* initialize original metrics for the horizontal, vertical
and 2 diagonal metrics */
float *before_pix = center_pix + offsets[axis];
float *after_pix = center_pix + offsets[SYMMETRY(axis)];
metric_reference[axis] =
GEN_METRIC (before_pix[c], center_pix[c], after_pix[c]);
}
sum = center_pix[c];
count = ;
/* try smearing in data from all neighbours */
for (direction = ; direction < NEIGHBOURS; direction++)
{
float *pix = center_pix + offsets[direction];
float value = pix[c] * 0.5 + center_pix[c] * 0.5;
int axis;
int valid;
/* check if the non-smoothing operating check is true if
* smearing from this direction for any of the axes */
valid = ; /* assume it will be valid */
for (axis = ; axis < AXES; axis++)
{
float *before_pix = center_pix + offsets[axis];
float *after_pix = center_pix + offsets[SYMMETRY(axis)];
float metric_new =
GEN_METRIC (before_pix[c], value, after_pix[c]);
if (BAIL_CONDITION(metric_new, metric_reference[axis]))
{
valid = ; /* mark as not a valid smoothing, and .. */
break; /* .. break out of loop */
}
}
if (valid) /* we were still smooth in all axes */
{ /* add up contribution to final result */
sum += value;
count ++;
}
}
dst_buf[dst_offset*+c] = sum / count;
}
dst_buf[dst_offset*+] = center_pix[]; /* copy alpha unmodified */
dst_offset++;
center_pix += ;
}
}
}```

这个代码看上去比较混乱，没办法，大型软件没有哪一个代码看上去能让人省心的，而且也不怎么讲究效率，我测试了一个3K*2K的彩色图，在GIMP里大概在4S左右处理完成，属于很慢的了，看这个代码，也知道大概有width * height * 3 * 8 * 4 * Iter个循环，计算量确实是相当的大。

for (c=0; c<3; c++) /* do each color component individually */

```void IM_AnisotropicDiffusion3X3(short *Src, short *Dest, int Width, int Height, int SplitPos, int Stride)
{
int Channel = Stride / Width;
short *RowCopy = (short *)malloc((Width + ) * * Channel * sizeof(short));
short *First = RowCopy;
short *Second = RowCopy + (Width + ) * Channel;
short *Third = RowCopy + (Width + ) * * Channel;
memcpy(Second, Src, Channel * sizeof(short));
memcpy(Second + Channel, Src, Width * Channel * sizeof(short)); // 拷贝数据到中间位置
memcpy(Second + (Width + ) * Channel, Src + (Width - ) * Channel, Channel * sizeof(short));
memcpy(First, Second, (Width + ) * Channel * sizeof(short)); // 第一行和第二行一样

memcpy(Third, Src + Stride, Channel * sizeof(short)); // 拷贝第二行数据
memcpy(Third + Channel, Src + Stride, Width * Channel* sizeof(short));
memcpy(Third + (Width + ) * Channel, Src + Stride + (Width - ) * Channel, Channel* sizeof(short));
for (int Y = ; Y < Height; Y++)
{
short *LinePD = Dest + Y * Stride;
if (Y != )
{
short *Temp = First; First = Second; Second = Third; Third = Temp;
}
if (Y == Height - )
{
memcpy(Third, Second, (Width + ) * Channel * sizeof(short));
}
else
{
memcpy(Third, Src + (Y + ) * Stride, Channel * sizeof(short));
memcpy(Third + Channel, Src + (Y + ) * Stride, Width * Channel * sizeof(short)); // 由于备份了前面一行的数据，这里即使Src和Dest相同也是没有问题的
memcpy(Third + (Width + ) * Channel, Src + (Y + ) * Stride + (Width - ) * Channel, Channel * sizeof(short));
}
for (int X = ; X < SplitPos * Channel; X++)
{
short LT = First[X], T = First[X + Channel], RT = First[X + * Channel];
short L = Second[X], C = Second[X + Channel], R = Second[X + * Channel];
short LB = Third[X], B = Third[X + Channel], RB = Third[X + * Channel];
short LT_RB = LT + RB, RT_LB = RT + LB;
short T_B = T + B, L_R = L + R, C_C = C + C;
short Dist1 = IM_Abs(C_C - LT_RB), Dist2 = IM_Abs(C_C - T_B);
short Dist3 = IM_Abs(C_C - RT_LB), Dist4 = IM_Abs(C_C - L_R);
int Sum = C_C, Amount = ;
short LT_C = LT + C;
if ((IM_Abs(LT_C - LT_RB) < Dist1) && (IM_Abs(LT_C - T_B) < Dist2) && (IM_Abs(LT_C - RT_LB) < Dist3) && (IM_Abs(LT_C - L_R) < Dist4))
{
Sum += LT_C;
Amount += ;
}
short T_C = T + C;
if ((IM_Abs(T_C - LT_RB) < Dist1) && (IM_Abs(T_C - T_B) < Dist2) && (IM_Abs(T_C - RT_LB) < Dist3) && (IM_Abs(T_C - L_R) < Dist4))
{
Sum += T_C;
Amount += ;
}
short RT_C = RT + C;
if ((IM_Abs(RT_C - LT_RB) < Dist1) && (IM_Abs(RT_C - T_B) < Dist2) && (IM_Abs(RT_C - RT_LB) < Dist3) && (IM_Abs(RT_C - L_R) < Dist4))
{
Sum += RT_C;
Amount += ;
}
short L_C = L + C;
if ((IM_Abs(L_C - LT_RB) < Dist1) && (IM_Abs(L_C - T_B) < Dist2) && (IM_Abs(L_C - RT_LB) < Dist3) && (IM_Abs(L_C - L_R) < Dist4))
{
Sum += L_C;
Amount += ;
}
short R_C = R + C;
if ((IM_Abs(R_C - LT_RB) < Dist1) && (IM_Abs(R_C - T_B) < Dist2) && (IM_Abs(R_C - RT_LB) < Dist3) && (IM_Abs(R_C - L_R) < Dist4))
{
Sum += R_C;
Amount += ;
}
short LB_C = LB + C;
if ((IM_Abs(LB_C - LT_RB) < Dist1) && (IM_Abs(LB_C - T_B) < Dist2) && (IM_Abs(LB_C - RT_LB) < Dist3) && (IM_Abs(LB_C - L_R) < Dist4))
{
Sum += LB_C;
Amount += ;
}
short B_C = B + C;
if ((IM_Abs(B_C - LT_RB) < Dist1) && (IM_Abs(B_C - T_B) < Dist2) && (IM_Abs(B_C - RT_LB) < Dist3) && (IM_Abs(B_C - L_R) < Dist4))
{
Sum += B_C;
Amount += ;
}
short RB_C = RB + C;
if ((IM_Abs(RB_C - LT_RB) < Dist1) && (IM_Abs(RB_C - T_B) < Dist2) && (IM_Abs(RB_C - RT_LB) < Dist3) && (IM_Abs(RB_C - L_R) < Dist4))
{
Sum += RB_C;
Amount += ;
}
LinePD[X] = Sum / Amount;
}
}
free(RowCopy);
}```

```int IM_ReduceNoise(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int SplitPos, int Strength)
{
int Channel = Stride / Width;
if ((Src == NULL) || (Dest == NULL)) return IM_STATUS_NULLREFRENCE;
if ((Width <= ) || (Height <= )) return IM_STATUS_INVALIDPARAMETER;
if ((Channel != ) && (Channel != )) return IM_STATUS_INVALIDPARAMETER;
Strength = IM_ClampI(Strength, , );
SplitPos = IM_ClampI(SplitPos, , Width);
int Status = IM_STATUS_OK;
short *Temp = (short *)malloc(Height * Stride * sizeof(short));
if (Temp == NULL) return IM_STATUS_OUTOFMEMORY;
for (int Y = ; Y < Height * Stride; Y++)
{
Temp[Y] = Src[Y] << ;
}
for (int Y = ; Y < Strength; Y++)
{
IM_AnisotropicDiffusion3X3(Temp, Temp, Width, Height, SplitPos, Stride);
}
for (int Y = ; Y < Height * Stride; Y++)
{
Dest[Y] = Temp[Y] >> ;
}
free(Temp);
return IM_STATUS_OK;
}```

我还是老套路，使用SIMD指令做处理，看到上面的代码，其实真的觉得好容易改成SIMD的。

``` short LT_RB = LT + RB, RT_LB = RT + LB;
short T_B = T + B, L_R = L + R, C_C = C + C;
short Dist1 = IM_Abs(C_C - LT_RB), Dist2 = IM_Abs(C_C - T_B);
short Dist3 = IM_Abs(C_C - RT_LB), Dist4 = IM_Abs(C_C - L_R);  这些加减绝对值都有完全对应的SSE指令。 _mm_add_epi16、 _mm_sub_epi16、_mm_abs_epi16，基本上就是照着写。 稍微复杂一点就是这里：```
``` if ((IM_Abs(LT_C - LT_RB) < Dist1) && (IM_Abs(LT_C - T_B) < Dist2) && (IM_Abs(LT_C - RT_LB) < Dist3) && (IM_Abs(LT_C - L_R) < Dist4))
{
Sum += LT_C;
Amount += 2;
} 在C语言里，这里判断会进行短路计算，即如果前一个条件已经不满足了，后续的计算就不会进行。但是在SIMD指令里，是没有这样的机制的。我们只能全部计算，然后在通过某一种条件组合。 在合理，要实现符合条件就进行累加，不符合条件就不做处理的需求，我们需要稍作修改，即不符合条件不是不做处理，而是加0，加0对结果没有影响的。主要借助下面的_mm_blendv_epi8来实现。```
``` __m128i LT_C = _mm_add_epi16(LT, C);
Flag1 = _mm_cmplt_epi16(_mm_abs_epi16(_mm_sub_epi16(LT_C, LT_RB)), Dist1); // 只能全部都计算，但还是能提速
Flag2 = _mm_cmplt_epi16(_mm_abs_epi16(_mm_sub_epi16(LT_C, T_B)), Dist2);
Flag3 = _mm_cmplt_epi16(_mm_abs_epi16(_mm_sub_epi16(LT_C, RT_LB)), Dist3);
Flag4 = _mm_cmplt_epi16(_mm_abs_epi16(_mm_sub_epi16(LT_C, L_R)), Dist4);
Flag = _mm_and_si128(_mm_and_si128(Flag1, Flag2), _mm_and_si128(Flag3, Flag4));
Sum = _mm_adds_epu16(Sum, _mm_blendv_epi8(Zero, LT_C, Flag));
Amount = _mm_adds_epu16(Amount, _mm_blendv_epi8(Zero, Two, Flag));```

另外，在GIMP里也提供了这个算法的OPENCL实现，有兴趣的可以源代码里找一找，不晓得速度怎么样。

https://www.cnblogs.com/Imageshop/p/11873947.html