模板匹配算法:实现图像匹配并进行归一化处理
#include <common.hpp>
#include
namespace CAROTENE_NS {
#define ENABLE4LINESMATCHING false //Disabled since overall time for simultaneous 4 lines matching is greater than //time for simultaneous 2 lines matching for the same amount of data
bool isMatchTemplateSupported(const Size2D &tmplSize) { return isSupportedConfiguration() && tmplSize.width >= 8 && // Actually the function could process even shorter templates // but there will be no NEON optimization in this case (tmplSize.width * tmplSize.height) <= 256; }
void matchTemplate(const Size2D &srcSize, const u8 * srcBase, ptrdiff_t srcStride, const Size2D &tmplSize, const u8 * tmplBase, ptrdiff_t tmplStride, f32 * dstBase, ptrdiff_t dstStride, bool normalize) { internal::assertSupportedConfiguration(isMatchTemplateSupported(tmplSize)); #ifdef CAROTENE_NEON const size_t tmplW = tmplSize.width; const size_t tmplH = tmplSize.height; const size_t dstW = srcSize.width - tmplSize.width + 1; const size_t dstH = srcSize.height - tmplSize.height + 1;
//template correlation part
{
#if ENABLE4LINESMATCHING const size_t dstroiw4 = dstW & ~3u; #endif const size_t dstroiw2 = dstW & ~1u; const size_t tmplroiw = tmplW & ~7u; const size_t dstride = dstStride >> 2;
f32 *corr = dstBase;
const u8 *imgrrow = srcBase;
for(size_t r = 0; r < dstH; ++r, corr+=dstride, imgrrow+=srcStride)
{
size_t c = 0;
#if ENABLE4LINESMATCHING for(; c < dstroiw4; c+=4) { u32 dot[4] = {0, 0, 0, 0}; uint32x4_t vdot0 = vmovq_n_u32(0); uint32x4_t vdot1 = vmovq_n_u32(0); uint32x4_t vdot2 = vmovq_n_u32(0); uint32x4_t vdot3 = vmovq_n_u32(0);
const u8 *img = imgrrow;
const u8 *tmpl = tmplBase;
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
{
size_t j = 0;
for(; j < tmplroiw; j+=8)
{
uint8x8_t vtmpl = vld1_u8(tmpl + j);
uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
uint8x8_t vimg2 = vld1_u8(img + j + c + 2);
uint8x8_t vimg3 = vld1_u8(img + j + c + 3);
uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
uint16x8_t vd2 = vmull_u8(vtmpl, vimg2);
uint16x8_t vd3 = vmull_u8(vtmpl, vimg3);
vdot0 = vpadalq_u16(vdot0, vd0);
vdot1 = vpadalq_u16(vdot1, vd1);
vdot2 = vpadalq_u16(vdot2, vd2);
vdot3 = vpadalq_u16(vdot3, vd3);
}
for(; j < tmplW; ++j)
{
dot[0] += tmpl[j] * img[j + c + 0];
dot[1] += tmpl[j] * img[j + c + 1];
dot[2] += tmpl[j] * img[j + c + 2];
dot[3] += tmpl[j] * img[j + c + 3];
}
}
uint32x4_t vdotx = vld1q_u32(dot);
uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
uint32x2_t vdot_2 = vpadd_u32(vget_low_u32(vdot2), vget_high_u32(vdot2));
uint32x2_t vdot_3 = vpadd_u32(vget_low_u32(vdot3), vget_high_u32(vdot3));
uint32x2_t vdot_01 = vpadd_u32(vdot_0, vdot_1);
uint32x2_t vdot_23 = vpadd_u32(vdot_2, vdot_3);
vst1q_f32(corr + c, vcvtq_f32_u32(vaddq_u32(vdotx, vcombine_u32(vdot_01, vdot_23))));
}
#endif
for(; c < dstroiw2; c+=2)
{
u32 dot[2] = {0, 0};
uint32x4_t vdot0 = vmovq_n_u32(0);
uint32x4_t vdot1 = vmovq_n_u32(0);
const u8 *img = imgrrow;
const u8 *tmpl = tmplBase;
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
{
size_t j = 0;
for(; j < tmplroiw; j+=8)
{
uint8x8_t vtmpl = vld1_u8(tmpl + j);
uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
vdot0 = vpadalq_u16(vdot0, vd0);
vdot1 = vpadalq_u16(vdot1, vd1);
}
for(; j < tmplW; ++j)
{
dot[0] += tmpl[j] * img[j + c + 0];
dot[1] += tmpl[j] * img[j + c + 1];
}
}
uint32x2_t vdotx = vld1_u32(dot);
uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
uint32x2_t vdot_ = vpadd_u32(vdot_0, vdot_1);
vst1_f32(corr + c, vcvt_f32_u32(vadd_u32(vdotx, vdot_)));
}
for(; c < dstW; ++c)
{
u32 dot = 0;
uint32x4_t vdot = vmovq_n_u32(0);
const u8 *img = imgrrow;
const u8 *tmpl = tmplBase;
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
{
size_t j = 0;
for(; j < tmplroiw; j+=8)
{
uint8x8_t vtmpl = vld1_u8(tmpl + j);
uint8x8_t vimg = vld1_u8(img + j + c);
uint16x8_t vd = vmull_u8(vtmpl, vimg);
vdot = vpadalq_u16(vdot, vd);
}
for(; j < tmplW; ++j)
dot += tmpl[j] * img[j + c];
}
u32 wdot[2];
vst1_u32(wdot, vpadd_u32(vget_low_u32(vdot), vget_high_u32(vdot)));
dot += wdot[0] + wdot[1];
corr[c] = (f32)dot;
}
}
}
if(normalize)
{
f32 tn = std::sqrt((f32)normL2(tmplSize, tmplBase, tmplStride));
size_t iw = srcSize.width+1;
size_t ih = srcSize.height+1;
std::vector<f64> _sqsum(iw*ih);
f64 *sqsum = &_sqsum[0];
memset(sqsum, 0, iw*sizeof(f64));
for(size_t i = 1; i < ih; ++i)
sqsum[iw*i] = 0.;
sqrIntegral(srcSize, srcBase, srcStride, sqsum + iw + 1, iw*sizeof(f64));
for(size_t i = 0; i < dstH; ++i)
{
f32 *result = internal::getRowPtr(dstBase, dstStride, i);
for(size_t j = 0; j < dstW; ++j)
{
double s2 = sqsum[iw*i + j] +
sqsum[iw*(i + tmplSize.height) + j + tmplSize.width] -
sqsum[iw*(i + tmplSize.height) + j] -
sqsum[iw*i + j + tmplSize.width];
result[j] /= tn * std::sqrt(s2);
}
}
}
#else (void)srcSize; (void)srcBase; (void)srcStride; (void)tmplBase; (void)tmplStride; (void)dstBase; (void)dstStride; (void)normalize; #endif }
} 该程序在实现模板匹配时,通过计算模板和图像的相关性来找到最佳匹配位置。在计算相关性时,如果指定了归一化参数normalize为true,程序会对相关性进行归一化处理。
具体的归一化处理方法如下:
-
首先计算模板的L2范数,即模板内所有元素的平方和的平方根,保存为tn。
-
创建一个大小为(srcSize.width+1) x (srcSize.height+1)的数组sqsum,用于计算图像的平方积分图。
-
首先将sqsum数组初始化为0,然后调用sqrIntegral函数计算图像的平方积分图,将结果保存在sqsum数组中。平方积分图的计算方法是通过遍历图像的每个像素,将当前像素的值加上它左上方像素的平方积分值,再减去它左边像素的平方积分值和上方像素的平方积分值,最后加上它左上方像素的平方积分值。
-
遍历dstBase数组中的每一行,对于每个位置(i, j),计算相关性结果result[j]除以tn乘以图像区域的平方和s2的平方根。图像区域的平方和s2是通过使用平方积分图计算得到的,具体计算方法是将右下角点为(i+tmplSize.height, j+tmplSize.width)、左上角点为(i, j)的矩形区域的平方和s2计算为sqsum[(i+tmplSize.height)iw + j+tmplSize.width] + sqsum[iwi + j] - sqsum[(i+tmplSize.height)iw + j] - sqsum[iwi + j+tmplSize.width],其中iw为sqsum数组的宽度。
通过上述归一化处理,可以将相关性结果归一到与模板和图像的尺度无关的范围内,方便比较不同尺度的模板匹配结果。
原文地址: https://www.cveoy.top/t/topic/qrsv 著作权归作者所有。请勿转载和采集!