GPU加速模板匹配：使用 NPP 库的 C++ 代码示例

以下是使用 NVIDIA NPP（NVIDIA Performance Primitives）库在 GPU 上实现模板匹配的 C++ 代码示例：
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

#include <npp.h>

int main() {
    // 读取输入图像和模板图像
    Npp8u* inputImage = nullptr;
    Npp8u* templateImage = nullptr;
    int inputWidth, inputHeight, inputPitch;
    int templateWidth, templateHeight, templatePitch;
    
    // 初始化NPP库
    nppSetDebugLogLevel(NPP_DEBUG_INFO); // 设置日志级别为INFO以获取调试信息
    nppSetStream(NULL); // 使用默认CUDA流

    // 创建NPP图像描述符
    NppiSize inputSize = {inputWidth, inputHeight};
    NppiSize templateSize = {templateWidth, templateHeight};
    NppiRect roi = {0, 0, inputWidth, inputHeight};
    NppiSize resultSize = {inputWidth - templateWidth + 1, inputHeight - templateHeight + 1};

    // 在GPU上分配内存
    Npp8u* d_inputImage = nullptr;
    Npp8u* d_templateImage = nullptr;
    Npp32f* d_result = nullptr;
    size_t d_inputPitch, d_templatePitch, d_resultPitch;
    
    cudaMallocPitch(reinterpret_cast<void**>(&d_inputImage), &d_inputPitch, inputWidth * sizeof(Npp8u), inputHeight);
    cudaMallocPitch(reinterpret_cast<void**>(&d_templateImage), &d_templatePitch, templateWidth * sizeof(Npp8u), templateHeight);
    cudaMallocPitch(reinterpret_cast<void**>(&d_result), &d_resultPitch, resultSize.width * sizeof(Npp32f), resultSize.height);

    // 将输入图像和模板图像从主机内存复制到GPU内存
    cudaMemcpy2D(d_inputImage, d_inputPitch, inputImage, inputPitch, inputWidth * sizeof(Npp8u), inputHeight, cudaMemcpyHostToDevice);
    cudaMemcpy2D(d_templateImage, d_templatePitch, templateImage, templatePitch, templateWidth * sizeof(Npp8u), templateHeight, cudaMemcpyHostToDevice);

    // 执行模板匹配
    Npp32f correlation = 0.0f;
    NppiSize maskSize = {templateWidth, templateHeight};

    nppiFilterTemplate_32f_C1R(d_inputImage, d_inputPitch, inputSize, roi, d_templateImage, d_templatePitch, maskSize, d_result, d_resultPitch, NPP_FILTER_SQDIFF_NORMED);
    nppiMinIndx_32f_C1R(d_result, d_resultPitch, resultSize, &correlation, NPP_MINMAX_NOABS);

    // 将结果从GPU内存复制到主机内存
    Npp32f* result = new Npp32f[resultSize.width * resultSize.height];
    cudaMemcpy2D(result, resultSize.width * sizeof(Npp32f), d_result, d_resultPitch, resultSize.width * sizeof(Npp32f), resultSize.height, cudaMemcpyDeviceToHost);

    // 打印匹配结果
    std::cout << 'Correlation: ' << correlation << std::endl;

    // 释放GPU内存
    cudaFree(d_inputImage);
    cudaFree(d_templateImage);
    cudaFree(d_result);

    // 清理NPP库资源
    nppiFree(inputImage);
    nppiFree(templateImage);

    return 0;
}
请注意，这只是一个简单的示例代码，其中并未包含从文件读取真实图像数据的逻辑；实际使用时需要自行加载输入图像和模板图像，并根据具体需求进行修改和优化。