C++多线程计算最大互信息系数(MIC)并按列分组

#include #include #include #include #include #include #include 'cppmine.h' #include <boost/algorithm/string.hpp> #include <boost/numeric/ublas/matrix.hpp> #include <boost/numeric/ublas/io.hpp> #include <boost/program_options.hpp> #include #include <omp.h> namespace po = boost::program_options; using namespace boost::numeric::ublas;

// 读取文件 matrix<double, row_major> read_tsv(const std::string& filename, std::vectorstd::string& col_names) { std::ifstream in(filename); std::string line; std::vector<std::vector> data;

// 读取第一行并解析为列名
if (std::getline(in, line)) {
    boost::algorithm::split(col_names, line, boost::is_any_of('\t'));
    col_names.erase(col_names.begin());  // 去除第一个空列
}
while (std::getline(in, line)) {
    std::vector<std::string> str_vec;
    boost::algorithm::split(str_vec, line, boost::is_any_of('\t'));
    std::vector<double> num_vec(str_vec.size() - 1);
    std::transform(str_vec.begin() + 1, str_vec.end(), num_vec.begin(),
                   [](const std::string& str) { return std::stod(str); });
    data.push_back(num_vec);
}

matrix<double, row_major> mat(data.size(), data[0].size());
for (size_t i = 0; i < mat.size1(); ++i)
    for (size_t j = 0; j < mat.size2(); ++j)
        mat(i, j) = data[i][j];

return mat;

}

//计算mic系数

matrix<double, row_major> calc_mic(const matrix<double, row_major>& mat, const std::vectorstd::string& col_names, int threads) { //当样本量小于6个时，终止程序 if(mat.size1()<6){ std::cerr << 'error: the number of data rows should be greater than 5.' << std::endl; exit(1); } if(mat.size2()<2){ std::cerr << 'error: the number of data columns should be greater than 1.' << std::endl; exit(1); } // 创建空的对称矩阵 matrix<double, row_major> result(mat.size2(), mat.size2()); // 根据列名分组 std::map<std::string, std::vector<size_t>> groups; for (size_t i = 0; i < col_names.size(); ++i) { groups[col_names[i]].push_back(i); } // 计算每列两两之间的mic系数，并填充对称矩阵 omp_set_num_threads(threads); #pragma omp parallel for for (size_t g = 0; g < groups.size(); ++g) { for (size_t i = 0; i < groups[col_names[g]].size(); ++i) { MINE mine(0.5, 15, EST_MIC_APPROX); for (size_t j = i + 1; j < groups[col_names[g]].size(); ++j) { // 计算上三角部分 double x[mat.size1()]; double y[mat.size1()]; for (size_t k = 0; k < mat.size1(); ++k) { x[k] = mat(k,groups[col_names[g]][i]); y[k] = mat(k,groups[col_names[g]][j]); } mine.compute_score(x, y, mat.size1()); result(groups[col_names[g]][i],groups[col_names[g]][j]) = mine.mic(); result(groups[col_names[g]][j],groups[col_names[g]][i]) = result(groups[col_names[g]][i],groups[col_names[g]][j]); // 填充下三角部分 } } } return result; }

//导出结果

void write_tsv(const std::string& filename, const matrix<double, row_major>& mat, const std::vectorstd::string& col_names) { std::ofstream out(filename); // 输出列名 out << '\t'; for (const auto& col_name : col_names) out << col_name << '\t'; out << '\n'; // 输出数据行 for (size_t i = 0; i < mat.size1(); ++i) { out << col_names[i] << '\t'; for (size_t j = 0; j < mat.size2(); ++j) { out << std::setprecision(3) <<std::fixed << mat(i, j); if (j < mat.size2() - 1) out << '\t'; } out << '\n'; } }

//main函数 int main(int argc, char **argv) { // 声明命令行参数选项 int default_threads = 4; int threads; po::options_description desc(" Multi-threaded computation of maximum information coefficient.\n" " For more detailed information, please refer to https://minepy.readthedocs.io/en/latest/"); desc.add_options() ("help,h", "help message") ("input,i", po::valuestd::string(), "input.txt") ("output,o", po::valuestd::string(), "output.txt,The first column is the name of the indicator, such as ASV/OTU or gene") ("threads,t", po::value(&threads)->default_value(default_threads), "number of threads (default 4)"); // 解析命令行参数选项 po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); po::notify(vm);

if (vm.count("help")) {
    std::cout << desc << '\n';
    return 0;
}
if (vm.empty()) {
    std::cout << desc << '\n';
    return 0;
}
//
int coreNum = omp_get_num_procs();//获得处理器个数
// thread_ct = vm.count["threads"].as<int>();
if(coreNum > 1 &&  threads > coreNum){
	fprintf(stderr, "\n%s: error: only %d threads are available\n",  argv[0], coreNum); 
	exit(1);
}
 if (vm.count("input")) {
    std::string filename = vm["input"].as<std::string>();
    std::ifstream file(filename);
    if (file) {
        // 文件存在，执行你的操作
		if (vm.count("output")){
		  std::vector<std::string> col_names;
		  auto mat = read_tsv(vm["input"].as<std::string>(), col_names);
		  auto result = calc_mic(mat,col_names,threads);
		  write_tsv(vm["output"].as<std::string>(), result, col_names);
		}else{
		std::cerr << "output is not specified!" << std::endl;
        return 1;	
		}
    } else {
        // 文件不存在
        std::cerr << vm["input"].as<std::string>() << " not found!" << std::endl;
        return 1;
    }
} else {
    std::cerr << "No file provided!" << std::endl;
    return 1;
}

return 0;

}