C++ 代码：根据列分组计算 MIC 系数

#include #include #include #include #include #include #include #include "cppmine.h" #include <boost/algorithm/string.hpp> #include <boost/numeric/ublas/matrix.hpp> #include <boost/numeric/ublas/io.hpp> #include <boost/program_options.hpp> #include #include <omp.h>

namespace po = boost::program_options; using namespace boost::numeric::ublas;

// 读取文件 matrix<double, row_major> read_tsv(const std::string& filename, std::vectorstd::string& col_names, int& group_index) { std::ifstream in(filename); std::string line; std::vector<std::vector> data;

// 读取第一行并解析为列名
if (std::getline(in, line)) {
    boost::algorithm::split(col_names, line, boost::is_any_of("\t"));
    col_names.erase(col_names.begin(), col_names.begin() + 1);  // 去除第一个空列
    // 查找名为'group'的列索引
    for (size_t i = 0; i < col_names.size(); ++i) {
        if (col_names[i] == "group") {
            group_index = i;
            break;
        }
    }
}
while (std::getline(in, line)) {
    std::vector<std::string> str_vec;
    boost::algorithm::split(str_vec, line, boost::is_any_of("\t"));
    std::vector<double> num_vec(str_vec.size() - 1);
    std::transform(str_vec.begin() + 1, str_vec.end(), num_vec.begin(),
                   [](const std::string& str) { return std::stod(str); });
    data.push_back(num_vec);
}

matrix<double, row_major> mat(data.size(), data[0].size());
for (size_t i = 0; i < mat.size1(); ++i)
    for (size_t j = 0; j < mat.size2(); ++j)
        mat(i, j) = data[i][j];

return mat;

}

//计算mic系数 std::map<std::string, double> calc_mic(const matrix<double, row_major>& mat, int threads, int group_index) { // 当样本量小于6个时，终止程序 if (mat.size1() < 6) { std::cerr << "error: the number of data rows should be greater than 5." << std::endl; } if (mat.size2() < 2) { std::cerr << "error: the number of data columns should be greater than 1." << std::endl; }

// 创建空的对称矩阵
matrix<double, row_major> result(mat.size2(), mat.size2());

// 创建MINE对象
MINE mine(0.5, 15, EST_MIC_APPROX);

// 创建map对象，用于存储每个组的mic系数
std::map<std::string, double> mic_map;

// 计算每列两两之间的mic系数，并填充对称矩阵
omp_set_num_threads(threads);
#pragma omp parallel for
for (size_t i = 0; i < mat.size2(); i++) {
    for (size_t j = i + 1; j < mat.size2(); j++) {  // 计算上三角部分
        double x[mat.size1()];
        double y[mat.size1()];
        for (size_t k = 0; k < mat.size1(); k++) {
            x[k] = mat(k, i);
            y[k] = mat(k, j);
        }
        mine.compute_score(x, y, mat.size1());
        result(i, j) = mine.mic();
        result(j, i) = result(i, j);  // 填充下三角部分

        // 根据当前的组名称将mic系数存储到map对象中
        std::string group_name = col_names[group_index];
        mic_map[group_name] = mine.mic();
    }
}

return mic_map;

}

//导出结果 void write_tsv(const std::string& filename, const std::map<std::string, double>& mic_map) { std::ofstream out(filename);

// 输出列名
out << "Group\tMIC\n";

// 输出每个组的mic系数
for (const auto& pair : mic_map) {
    out << pair.first << "\t" << std::setprecision(3) << std::fixed << pair.second << "\n";
}

}

//main函数 int main(int argc, char** argv) { // 声明命令行参数选项 int default_threads = 4; int threads; po::options_description desc(" Multi-threaded computation of maximum information coefficient.\n" " For more detailed information, please refer to https://minepy.readthedocs.io/en/latest/"); desc.add_options() ("help,h", "help message") ("input,i", po::valuestd::string(), "input.txt") ("output,o", po::valuestd::string(), "output.txt,The first column is the name of the indicator, such as ASV/OTU or gene") ("threads,t", po::value(&threads)->default_value(default_threads), "number of threads (default 4)");

// 解析命令行参数选项
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);

if (vm.count("help")) {
    std::cout << desc << "\n";
    return 0;
}
if (vm.empty()) {
    std::cout << desc << "\n";
    return 0;
}

int coreNum = omp_get_num_procs();  // 获得处理器个数
if (coreNum > 1 && threads > coreNum) {
    fprintf(stderr, "\n%s: error: only %d threads are available\n", argv[0], coreNum);
    exit(1);
}
if (vm.count("input")) {
    std::string filename = vm["input"].as<std::string>();
    std::ifstream file(filename);
    if (file) {
        // 文件存在，执行你的操作
        if (vm.count("output")) {
            std::vector<std::string> col_names;
            int group_index = -1;
            auto mat = read_tsv(vm["input"].as<std::string>(), col_names, group_index);
            auto mic_map = calc_mic(mat, threads, group_index);
            write_tsv(vm["output"].as<std::string>(), mic_map);
        } else {
            std::cerr << "output is not specified!" << std::endl;
            return 1;
        }
    } else {
        // 文件不存在
        std::cerr << vm["input"].as<std::string>() << " not found!" << std::endl;
        return 1;
    }
} else {
    std::cerr << "No file provided!" << std::endl;
    return 1;
}

return 0;

}