Below is example code that optimizes the routine using NEON and OpenMP:
#include <arm_neon.h>
#include <omp.h>
#include <vector>
#include <cstring>
#include <cmath>
// plus the CAROTENE_NS headers that declare Size2D, u8/f32, normL2, sqrIntegral
// and the internal:: helpers used below
namespace CAROTENE_NS {
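// The NEON path needs a template row wide enough for at least one 8-byte load;
// the small area bound likely keeps the u16 products accumulated by vpadalq_u16
// comfortably inside the u32 accumulator lanes.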
bool isMatchTemplateSupported(const Size2D &tmplSize) {
    return isSupportedConfiguration() &&
           tmplSize.width >= 8 &&
           (tmplSize.width * tmplSize.height) <= 256;
}
void matchTemplate(const Size2D &srcSize, const u8 * srcBase, ptrdiff_t srcStride,
                   const Size2D &tmplSize, const u8 * tmplBase, ptrdiff_t tmplStride,
                   f32 * dstBase, ptrdiff_t dstStride,
                   bool normalize) {
    internal::assertSupportedConfiguration(isMatchTemplateSupported(tmplSize));
    const size_t tmplW = tmplSize.width;
    const size_t tmplH = tmplSize.height;
    const size_t dstW = srcSize.width - tmplSize.width + 1;
    const size_t dstH = srcSize.height - tmplSize.height + 1;

    // Calculate the number of threads to use
    int numThreads = omp_get_max_threads();

    // Initialize the vector for storing temporary results
    std::vector<std::vector<f32>> threadResults(numThreads, std::vector<f32>(dstW));

    // Template correlation part
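    // Rows of the result are distributed across OpenMP threads; each thread
    // accumulates one row at a time into its own scratch buffer and then copies
    // it into the output plane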
    #pragma omp parallel for schedule(dynamic)
    for (int r = 0; r < (int)dstH; ++r) {
        // Get the thread ID
        int threadID = omp_get_thread_num();
        // Get the temporary result vector for this thread
        std::vector<f32>& result = threadResults[threadID];
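        // tmplroiw: template width rounded down to a multiple of 8 for the NEON loop;
        // dstStride is given in bytes, so >> 2 converts it to a stride in f32 elements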
        const size_t tmplroiw = tmplW & ~7u;
        const size_t dstride = dstStride >> 2;
        f32 *corr = dstBase + r * dstride;
        const u8 *imgrrow = srcBase + r * srcStride;

        for (size_t c = 0; c < dstW; ++c) {
            u32 dot = 0;
            uint32x4_t vdot = vmovq_n_u32(0);
            const u8 *img = imgrrow;
            const u8 *tmpl = tmplBase;
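            // Dot product of the template with the image window at (r, c):
            // vmull_u8 widens eight u8*u8 products to u16 and vpadalq_u16 folds them
            // pairwise into four u32 accumulator lanes; the tail runs in scalar code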
            for (size_t i = 0; i < tmplH; ++i, tmpl += tmplStride, img += srcStride) {
                size_t j = 0;
                for (; j < tmplroiw; j += 8) {
                    uint8x8_t vtmpl = vld1_u8(tmpl + j);
                    uint8x8_t vimg = vld1_u8(img + j + c);
                    uint16x8_t vd = vmull_u8(vtmpl, vimg);
                    vdot = vpadalq_u16(vdot, vd);
                }
                for (; j < tmplW; ++j)
                    dot += tmpl[j] * img[j + c];
            }
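            // Reduce the four u32 lanes of vdot to a single value and add the scalar tail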
            u32 wdot[2];
            vst1_u32(wdot, vpadd_u32(vget_low_u32(vdot), vget_high_u32(vdot)));
            dot += wdot[0] + wdot[1];
            result[c] = (f32)dot;
        }

        // Copy the temporary result vector to the final result array
        memcpy(corr, result.data(), dstW * sizeof(f32));
    }

    if (normalize) {
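        // Normalized cross-correlation: each raw correlation is divided by the product
        // of the template L2 norm (tn) and the L2 norm of the matching image window,
        // which is read from a squared-sum integral image with a zero top row and column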
        f32 tn = std::sqrt((f32)normL2(tmplSize, tmplBase, tmplStride));

        size_t iw = srcSize.width + 1;
        size_t ih = srcSize.height + 1;
        std::vector<f64> _sqsum(iw * ih);
        f64 *sqsum = &_sqsum[0];
        memset(sqsum, 0, iw * sizeof(f64));
        for (size_t i = 1; i < ih; ++i)
            sqsum[iw * i] = 0.;
        sqrIntegral(srcSize, srcBase, srcStride, sqsum + iw + 1, iw * sizeof(f64));

        #pragma omp parallel for schedule(dynamic)
        for (size_t i = 0; i < dstH; ++i) {
            f32 *result = internal::getRowPtr(dstBase, dstStride, i);
            for (size_t j = 0; j < dstW; ++j) {
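                // Sum of squared pixels in the tmplW x tmplH window at (i, j),
                // taken from the four corners of the padded integral image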
                double s2 = sqsum[iw * i + j] +
                            sqsum[iw * (i + tmplSize.height) + j + tmplSize.width] -
                            sqsum[iw * (i + tmplSize.height) + j] -
                            sqsum[iw * i + j + tmplSize.width];
                result[j] /= tn * std::sqrt(s2);
            }
        }
    }
}
} // namespace CAROTENE_NS
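For reference, a minimal calling sketch is shown below. The helper name runMatchTemplateExample and the concrete sizes are placeholders; it assumes the CAROTENE_NS types (Size2D, u8, f32) and the functions above are visible, and that the image buffers are tightly packed 8-bit grayscale with strides given in bytes:

#include <vector>

void runMatchTemplateExample() {
    using namespace CAROTENE_NS;

    // Placeholder sizes: 64x48 source, 16x8 template (width >= 8, area <= 256)
    Size2D srcSize, tmplSize;
    srcSize.width = 64;  srcSize.height = 48;
    tmplSize.width = 16; tmplSize.height = 8;

    // Constant gray test data (non-zero, so the normalized path does not divide by zero)
    std::vector<u8> src(srcSize.width * srcSize.height, 128);
    std::vector<u8> tmpl(tmplSize.width * tmplSize.height, 128);

    // One correlation value per valid template position
    const size_t dstW = srcSize.width - tmplSize.width + 1;
    const size_t dstH = srcSize.height - tmplSize.height + 1;
    std::vector<f32> dst(dstW * dstH);

    if (isMatchTemplateSupported(tmplSize)) {
        // Row strides are in bytes; these buffers are tightly packed
        matchTemplate(srcSize, src.data(), srcSize.width,
                      tmplSize, tmpl.data(), tmplSize.width,
                      dst.data(), dstW * sizeof(f32),
                      /*normalize=*/true);
    }
}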