着重介绍了阈值二值化及ostu二值化的实现,算法本身很简单,主要是将算法并行化,有点费事,同时也注意到了对于时序性较强的算法,cuda移植后性能并不会得到大幅度提升,提升往往在10倍左右,值得深思。
阈值二值化
阈值二值化,这里是指阈值是人为设定的,简单的二值化,鲁棒性低,但计算量小。
/*
* Compare 'threshold()' funciton in OpenCV
* When:
* thresholdMin = thresholdMax and valMin = 0 ==> THRESH_BINARY
* thresholdMin = thresholdMax and valMax = 0 ==> THRESH_BINARY_INV
* thresholdMax = valMax and thresholdMin = 0 ==> THRESH_TRUNC
* thresholdMax = 255 and valMin = 0 ==> THRESH_TOZERO
* thresholdMin = 0 and valMax = 0 ==> THRESH_TOZERO_INV
*/
__global__ void thresholdBinarization(unsigned char* dataIn,
unsigned char* dataOut,
short int imgRows,
short int imgCols,
unsigned char thresholdMin,
unsigned char thresholdMax,
unsigned char valMin,
unsigned char valMax)
{
int xIndex = threadIdx.x + __umul24(blockIdx.x, blockDim.x);
int yIndex = threadIdx.y + __umul24(blockIdx.y, blockDim.y);
int tid = __umul24(yIndex,imgCols)+xIndex;
unsigned char val=dataIn[tid];
unsigned char res = val;
if(xIndex < imgCols && yIndex < imgRows)
{
if(val>thresholdMax)
{
res = valMax;
}
if(val<=thresholdMin)
{
res = valMin;
}
dataOut[tid] = res;
}
}
这个算法非常简单,设置与图像像素数一样的thread,thread读取每个像素值,直接进行判断,这里使用分支语句的影响很小,貌似可以被编译器优化,不必太担心效率。
而且通过适当的参数设置(见代码中的注释部分),可以得到与opencv中threshold函数一样的效果。
OSTU二值化
OSTU又称为自适应阈值二值化,它的主要思想是迭代,从灰度级0到255,每次按照当前的灰度值将图像分为出背景和前景,如果当前两个部分(背景和前景)的类间方差最大,则说明此时的阈值是最优的。
求直方图:
__global__ void getHist(unsigned char* dataIn, unsigned int* hist)
{
int xdx = threadIdx.x + __umul24(blockIdx.x, blockDim.x);
int ydx = threadIdx.y + __umul24(blockIdx.y, blockDim.y);
int tid = xdx + ydx*gridDim.x*blockDim.x;
if(tid < 256)
{
hist[tid]=0;
}
__syncthreads();
atomicAdd(&hist[dataIn[tid]],1);
}
这里需要注意的是,由于我们面向视频处理,像atomicAdd这类的操作,之前需要将整个数组清空,因为上一帧的处理结果还在内存中,并没有删除。并且还需要同步一下线程,确保整个hist都已被清空。
计算类间方差
/*
* w_A:前景像素级占的比例,Pi的和
* u_A:前景像素级的均值,i×Pi的和除以w_A
*/
__global__ void ostu(unsigned int* hist,
float* sum_Pi,
float* sum_i_Pi,
float* u_0,
float* varance,
short int imgRows,
short int imgCols)
{
//清空相关数组,清空上一次的计算结果
if(blockIdx.x == 0)
{
sum_Pi[threadIdx.x] = 0.0f;
sum_i_Pi[threadIdx.x] = 0.0f;
if(threadIdx.x==0)
{
u_0[0] = 0.0f;
}
}
__syncthreads();
//计算整幅图的平均灰度、前景的概率、前景的平均灰度值
unsigned int current_val = hist[threadIdx.x];
if(blockIdx.x==0)
{
atomicAdd(&u_0[0],current_val*threadIdx.x);//sum(i*Pi)+sum(j*Pj)
}
else
{
if(threadIdx.x < blockIdx.x)
{
atomicAdd(&sum_Pi[blockIdx.x-1],current_val);//sum()
atomicAdd(&sum_i_Pi[blockIdx.x-1],current_val*threadIdx.x);//sum(i*Pi)
}
}
__syncthreads();
//now we get sum_Pi[256] and sum_i_Pi[256] and w_0
//下面开始计算类间方差
int imgSize = imgRows*imgCols;
if(blockIdx.x>0)
{
float f_sum_pi = sum_Pi[blockIdx.x-1]/imgSize;
float f_sum_pj = 1-f_sum_pi;
if(f_sum_pj==0)
{
varance[blockIdx.x-1]=0;
}else
{
float temp = (u_0[0]/imgSize-sum_i_Pi[blockIdx.x-1]/(f_sum_pi*imgSize));
varance[blockIdx.x-1] = temp*temp*f_sum_pi/f_sum_pj;
}
}
}
最后面计算类间方差的公式是化简后得到公式,参考。上述程序会得到一个大小为256的类间方差数组,里面的值对应每个灰度级作为二值化阈值时对应的类间方差。
寻找最优阈值
__global__ void tree_max(float* varance,int* thres)
{
__shared__ float var[256];
__shared__ int varId[256]; // 512*4/1024 = 2KB
varId[threadIdx.x] = threadIdx.x;
var[threadIdx.x] = varance[threadIdx.x];
for (int i = 1; i < 256; i*=2)
{
if(threadIdx.x%(2*i)==0)
{
if(var[threadIdx.x]<var[threadIdx.x+i])
{
var[threadIdx.x] = var[threadIdx.x+i];
varId[threadIdx.x] = varId[threadIdx.x+i];
}
}
__syncthreads();
}
if(threadIdx.x == 0)
{
thres[0] = varId[0];
}
}
这一步是借用并行求和的思想,来并行求最大值,由于我们最终是要得到类间方差最大时的阈值,也就是类间方差最大那个值的索引,所以我们又定义了一个varId
变量来保存每一次树型对比时获得的最大类间方差对应的索引,则对比完后,varId[0]
保存的就是最大类间方对应的索引。最后我们再使用之前的阈值二值化进行二值化即可。
整体流程及thread、block设置
void ostu_gpu(unsigned char* dataIn,
unsigned char* dataOut,
unsigned int* hist,
float* sum_Pi,
float* sum_i_Pi,
float* u_0,
float* varance,
int* thres,
short int imgRows,
short int imgCols)
{
dim3 tPerBlock_hist(32,32);
dim3 bPerGrid_hist((imgCols+32-1)/32,(imgRows+32-1)/32);
//求直方图
getHist_gpu(dataIn,hist,tPerBlock_hist,bPerGrid_hist);
dim3 tPerBlock_ostu(256,1);
dim3 bPerGrid_ostu(257,1);
//计算类间方差
ostu<<<bPerGrid_ostu,tPerBlock_ostu>>>(hist,sum_Pi,sum_i_Pi,u_0,varance,imgRows,imgCols);
//寻找最优阈值
tree_max<<<1,256>>>(varance,thres);
//阈值值化
thresholdBinarization_inner<<<bPerGrid_hist,tPerBlock_hist>>>(dataIn,dataOut,imgRows,imgCols,thres);
}