模板匹配算法：实现图像匹配并进行归一化处理

#include <common.hpp>

#include #include

namespace CAROTENE_NS {

#define ENABLE4LINESMATCHING false //Disabled since overall time for simultaneous 4 lines matching is greater than //time for simultaneous 2 lines matching for the same amount of data

bool isMatchTemplateSupported(const Size2D &tmplSize) { return isSupportedConfiguration() && tmplSize.width >= 8 && // Actually the function could process even shorter templates // but there will be no NEON optimization in this case (tmplSize.width * tmplSize.height) <= 256; }

void matchTemplate(const Size2D &srcSize, const u8 * srcBase, ptrdiff_t srcStride, const Size2D &tmplSize, const u8 * tmplBase, ptrdiff_t tmplStride, f32 * dstBase, ptrdiff_t dstStride, bool normalize) { internal::assertSupportedConfiguration(isMatchTemplateSupported(tmplSize)); #ifdef CAROTENE_NEON const size_t tmplW = tmplSize.width; const size_t tmplH = tmplSize.height; const size_t dstW = srcSize.width - tmplSize.width + 1; const size_t dstH = srcSize.height - tmplSize.height + 1;

//template correlation part
{

#if ENABLE4LINESMATCHING const size_t dstroiw4 = dstW & ~3u; #endif const size_t dstroiw2 = dstW & ~1u; const size_t tmplroiw = tmplW & ~7u; const size_t dstride = dstStride >> 2;

    f32 *corr = dstBase;
    const u8  *imgrrow = srcBase;
    for(size_t r = 0; r < dstH; ++r, corr+=dstride, imgrrow+=srcStride)
    {
        size_t c = 0;

#if ENABLE4LINESMATCHING for(; c < dstroiw4; c+=4) { u32 dot[4] = {0, 0, 0, 0}; uint32x4_t vdot0 = vmovq_n_u32(0); uint32x4_t vdot1 = vmovq_n_u32(0); uint32x4_t vdot2 = vmovq_n_u32(0); uint32x4_t vdot3 = vmovq_n_u32(0);

            const u8  *img = imgrrow;
            const u8 *tmpl = tmplBase;
            for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
            {
                size_t j = 0;
                for(; j < tmplroiw; j+=8)
                {
                    uint8x8_t vtmpl = vld1_u8(tmpl + j);

                    uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
                    uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
                    uint8x8_t vimg2 = vld1_u8(img + j + c + 2);
                    uint8x8_t vimg3 = vld1_u8(img + j + c + 3);

                    uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
                    uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
                    uint16x8_t vd2 = vmull_u8(vtmpl, vimg2);
                    uint16x8_t vd3 = vmull_u8(vtmpl, vimg3);

                    vdot0 = vpadalq_u16(vdot0, vd0);
                    vdot1 = vpadalq_u16(vdot1, vd1);
                    vdot2 = vpadalq_u16(vdot2, vd2);
                    vdot3 = vpadalq_u16(vdot3, vd3);
                }
                for(; j < tmplW; ++j)
                {
                    dot[0] += tmpl[j] * img[j + c + 0];
                    dot[1] += tmpl[j] * img[j + c + 1];
                    dot[2] += tmpl[j] * img[j + c + 2];
                    dot[3] += tmpl[j] * img[j + c + 3];
                }
            }
            uint32x4_t vdotx   = vld1q_u32(dot);
            uint32x2_t vdot_0  = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
            uint32x2_t vdot_1  = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
            uint32x2_t vdot_2  = vpadd_u32(vget_low_u32(vdot2), vget_high_u32(vdot2));
            uint32x2_t vdot_3  = vpadd_u32(vget_low_u32(vdot3), vget_high_u32(vdot3));
            uint32x2_t vdot_01 = vpadd_u32(vdot_0, vdot_1);
            uint32x2_t vdot_23 = vpadd_u32(vdot_2, vdot_3);

            vst1q_f32(corr + c, vcvtq_f32_u32(vaddq_u32(vdotx, vcombine_u32(vdot_01, vdot_23))));
        }

#endif

        for(; c < dstroiw2; c+=2)
        {
            u32 dot[2] = {0, 0};
            uint32x4_t vdot0 = vmovq_n_u32(0);
            uint32x4_t vdot1 = vmovq_n_u32(0);
            const u8  *img = imgrrow;
            const u8 *tmpl = tmplBase;
            for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
            {
                size_t j = 0;
                for(; j < tmplroiw; j+=8)
                {
                    uint8x8_t vtmpl = vld1_u8(tmpl + j);

                    uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
                    uint8x8_t vimg1 = vld1_u8(img + j + c + 1);

                    uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
                    uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);

                    vdot0 = vpadalq_u16(vdot0, vd0);
                    vdot1 = vpadalq_u16(vdot1, vd1);
                }
                for(; j < tmplW; ++j)
                {
                    dot[0] += tmpl[j] * img[j + c + 0];
                    dot[1] += tmpl[j] * img[j + c + 1];
                }
            }
            uint32x2_t vdotx  = vld1_u32(dot);
            uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
            uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
            uint32x2_t vdot_  = vpadd_u32(vdot_0, vdot_1);
            vst1_f32(corr + c, vcvt_f32_u32(vadd_u32(vdotx, vdot_)));
        }

        for(; c < dstW; ++c)
        {
            u32 dot = 0;
            uint32x4_t vdot = vmovq_n_u32(0);
            const u8  *img = imgrrow;
            const u8 *tmpl = tmplBase;
            for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
            {
                size_t j = 0;
                for(; j < tmplroiw; j+=8)
                {
                    uint8x8_t vtmpl = vld1_u8(tmpl + j);
                    uint8x8_t vimg  = vld1_u8(img + j + c);
                    uint16x8_t vd   = vmull_u8(vtmpl, vimg);
                    vdot = vpadalq_u16(vdot, vd);
                }
                for(; j < tmplW; ++j)
                    dot += tmpl[j] * img[j + c];
            }
            u32 wdot[2];
            vst1_u32(wdot, vpadd_u32(vget_low_u32(vdot), vget_high_u32(vdot)));
            dot += wdot[0] + wdot[1];
            corr[c] = (f32)dot;
        }
    }
}

if(normalize)
{
    f32 tn = std::sqrt((f32)normL2(tmplSize, tmplBase, tmplStride));

    size_t iw = srcSize.width+1;
    size_t ih = srcSize.height+1;
    std::vector<f64> _sqsum(iw*ih);
    f64 *sqsum = &_sqsum[0];
    memset(sqsum, 0, iw*sizeof(f64));
    for(size_t i = 1; i < ih; ++i)
        sqsum[iw*i] = 0.;
    sqrIntegral(srcSize, srcBase, srcStride, sqsum + iw + 1, iw*sizeof(f64));

    for(size_t i = 0; i < dstH; ++i)
    {
        f32 *result = internal::getRowPtr(dstBase, dstStride, i);
        for(size_t j = 0; j < dstW; ++j)
        {
            double s2 = sqsum[iw*i + j] +
                        sqsum[iw*(i + tmplSize.height) + j + tmplSize.width] -
                        sqsum[iw*(i + tmplSize.height) + j] -
                        sqsum[iw*i + j + tmplSize.width];

            result[j] /= tn * std::sqrt(s2);
        }
    }
}

#else (void)srcSize; (void)srcBase; (void)srcStride; (void)tmplBase; (void)tmplStride; (void)dstBase; (void)dstStride; (void)normalize; #endif }

} 该程序在实现模板匹配时，通过计算模板和图像的相关性来找到最佳匹配位置。在计算相关性时，如果指定了归一化参数normalize为true，程序会对相关性进行归一化处理。

具体的归一化处理方法如下：

首先计算模板的L2范数，即模板内所有元素的平方和的平方根，保存为tn。
创建一个大小为(srcSize.width+1) x (srcSize.height+1)的数组sqsum，用于计算图像的平方积分图。
首先将sqsum数组初始化为0，然后调用sqrIntegral函数计算图像的平方积分图，将结果保存在sqsum数组中。平方积分图的计算方法是通过遍历图像的每个像素，将当前像素的值加上它左上方像素的平方积分值，再减去它左边像素的平方积分值和上方像素的平方积分值，最后加上它左上方像素的平方积分值。
遍历dstBase数组中的每一行，对于每个位置(i, j)，计算相关性结果result[j]除以tn乘以图像区域的平方和s2的平方根。图像区域的平方和s2是通过使用平方积分图计算得到的，具体计算方法是将右下角点为(i+tmplSize.height, j+tmplSize.width)、左上角点为(i, j)的矩形区域的平方和s2计算为sqsum[(i+tmplSize.height)iw + j+tmplSize.width] + sqsum[iwi + j] - sqsum[(i+tmplSize.height)iw + j] - sqsum[iwi + j+tmplSize.width]，其中iw为sqsum数组的宽度。

通过上述归一化处理，可以将相关性结果归一到与模板和图像的尺度无关的范围内，方便比较不同尺度的模板匹配结果。