文件中储存着样本的真实标签和预测值。请根据我们所讲的计算方法,用C语言编程求解这五个子集的AUROC和AUPR并画图(注意:不要直接调用现成的计算AUROC和AUPR的函数),并计算这五个子集的AUROC和AUPR的均值。然后将这五个文件合并到一起,计算整个大数据集的AUROC和AUPR并画图,并查看它跟前面的均值是否相同。
下面是一个用C语言编写的示例代码,用于计算五个子集的AUROC和AUPR,并将ROC曲线和PR曲线的数据点写入文本文件(C标准库本身不提供绘图功能,可再用gnuplot等工具根据这些数据文件画图)。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/**
 * Compute the area under the ROC curve (AUROC) by trapezoidal integration.
 *
 * Side effect: `labels` and `scores` are reordered in place, as pairs, by
 * descending score. Callers that need the original order must pass copies.
 *
 * Samples that share the same score are applied to the curve as one step, so
 * a tie between a positive and a negative contributes a diagonal segment
 * (half credit) and the result does not depend on the input order.
 *
 * @param labels       ground-truth labels; a value equal to 1 is a positive,
 *                     any other value is a negative
 * @param scores       predicted scores; a higher score ranks a sample earlier
 * @param num_samples  number of entries in both arrays
 * @return the AUROC in [0, 1]; 0.0 when the data contains no positives or no
 *         negatives (the ROC curve is undefined for single-class data)
 */
double calculate_auroc(double* labels, double* scores, int num_samples) {
    // Sort the (label, score) pairs by descending score (simple O(n^2)
    // exchange sort; labels move together with their scores).
    for (int i = 0; i < num_samples; i++) {
        for (int j = i + 1; j < num_samples; j++) {
            if (scores[j] > scores[i]) {
                double temp_score = scores[i];
                scores[i] = scores[j];
                scores[j] = temp_score;
                double temp_label = labels[i];
                labels[i] = labels[j];
                labels[j] = temp_label;
            }
        }
    }

    // Count positives and negatives.
    int num_positives = 0;
    int num_negatives = 0;
    for (int i = 0; i < num_samples; i++) {
        if (labels[i] == 1) {
            num_positives++;
        } else {
            num_negatives++;
        }
    }
    if (num_positives == 0 || num_negatives == 0) {
        return 0.0;
    }

    // Walk the ranking one distinct score at a time and accumulate the
    // trapezoid between consecutive ROC points. Running counters replace the
    // per-sample TPR/FPR arrays, so no stack space proportional to the input
    // size is needed.
    double auroc = 0.0;
    double prev_tpr = 0.0;
    double prev_fpr = 0.0;
    int true_positives = 0;
    int false_positives = 0;
    int next = 0;
    while (next < num_samples) {
        double threshold = scores[next];
        // do/while always consumes at least one sample, so a NaN score
        // (which never compares equal to itself) cannot stall the loop.
        do {
            if (labels[next] == 1) {
                true_positives++;
            } else {
                false_positives++;
            }
            next++;
        } while (next < num_samples && scores[next] == threshold);

        double tpr = (double)true_positives / num_positives;
        double fpr = (double)false_positives / num_negatives;
        auroc += (fpr - prev_fpr) * (tpr + prev_tpr) / 2.0;
        prev_tpr = tpr;
        prev_fpr = fpr;
    }
    return auroc;
}
/**
 * Compute the area under the precision-recall curve (AUPR) as the step-wise
 * sum  sum_k (R_k - R_{k-1}) * P_k  over the distinct score thresholds, where
 * P_k = TP_k / (TP_k + FP_k) and R_k = TP_k / (number of positives).
 *
 * Side effect: `labels` and `scores` are reordered in place, as pairs, by
 * descending score. Callers that need the original order must pass copies.
 *
 * Samples that share the same score form a single threshold, so the result
 * does not depend on how ties happen to be ordered in the input.
 *
 * @param labels       ground-truth labels; a value equal to 1 is a positive,
 *                     any other value is a negative
 * @param scores       predicted scores; a higher score ranks a sample earlier
 * @param num_samples  number of entries in both arrays
 * @return the AUPR in [0, 1]; 0.0 when the data contains no positives
 *         (recall is undefined in that case)
 */
double calculate_aupr(double* labels, double* scores, int num_samples) {
    // Sort the (label, score) pairs by descending score (simple O(n^2)
    // exchange sort; labels move together with their scores).
    for (int i = 0; i < num_samples; i++) {
        for (int j = i + 1; j < num_samples; j++) {
            if (scores[j] > scores[i]) {
                double temp_score = scores[i];
                scores[i] = scores[j];
                scores[j] = temp_score;
                double temp_label = labels[i];
                labels[i] = labels[j];
                labels[j] = temp_label;
            }
        }
    }

    // Recall needs the total number of positives as its denominator.
    int num_positives = 0;
    for (int i = 0; i < num_samples; i++) {
        if (labels[i] == 1) {
            num_positives++;
        }
    }
    if (num_positives == 0) {
        return 0.0;
    }

    // Walk the ranking one distinct score at a time. After consuming a group,
    // `seen` is the number of samples predicted positive at that threshold,
    // i.e. TP + FP, which is the precision denominator.
    double aupr = 0.0;
    double prev_recall = 0.0;
    int true_positives = 0;
    int seen = 0;
    while (seen < num_samples) {
        double threshold = scores[seen];
        // do/while always consumes at least one sample, so a NaN score
        // (which never compares equal to itself) cannot stall the loop.
        do {
            if (labels[seen] == 1) {
                true_positives++;
            }
            seen++;
        } while (seen < num_samples && scores[seen] == threshold);

        double recall = (double)true_positives / num_positives;
        double precision = (double)true_positives / seen;
        aupr += (recall - prev_recall) * precision;
        prev_recall = recall;
    }
    return aupr;
}
enum {
    NUM_SUBSETS = 5,
    INITIAL_CAPACITY = 128,
    /* Upper bound per input file; keeps every count comfortably inside int. */
    MAX_SAMPLES_PER_FILE = 1 << 20
};

/* One data set: parallel heap arrays of labels and scores. */
struct sample_set {
    double *labels;
    double *scores;
    int count;
};

/*
 * Read whitespace-separated "label score" pairs from `path` into `set`,
 * growing the arrays as needed. Reading stops at the first token pair that
 * does not parse as two numbers (normally end of file).
 * Returns 0 on success, -1 on failure (a message has been printed).
 */
static int load_samples(const char *path, struct sample_set *set)
{
    FILE *in = fopen(path, "r");
    if (in == NULL) {
        printf("Failed to open file %s\n", path);
        return -1;
    }

    int capacity = 0;
    int status = 0;
    double label;
    double score;
    while (fscanf(in, "%lf %lf", &label, &score) == 2) {
        if (set->count == capacity) {
            if (capacity >= MAX_SAMPLES_PER_FILE) {
                printf("Too many samples in file %s\n", path);
                status = -1;
                break;
            }
            int grown = capacity > 0 ? capacity * 2 : INITIAL_CAPACITY;
            /* Grow through temporaries so the old buffers stay valid (and
             * are still freed by the caller) if realloc fails. */
            double *new_labels = realloc(set->labels, (size_t)grown * sizeof *new_labels);
            if (new_labels == NULL) {
                printf("Out of memory while reading file %s\n", path);
                status = -1;
                break;
            }
            set->labels = new_labels;
            double *new_scores = realloc(set->scores, (size_t)grown * sizeof *new_scores);
            if (new_scores == NULL) {
                printf("Out of memory while reading file %s\n", path);
                status = -1;
                break;
            }
            set->scores = new_scores;
            capacity = grown;
        }
        set->labels[set->count] = label;
        set->scores[set->count] = score;
        set->count++;
    }

    fclose(in);
    return status;
}

/*
 * Write the ROC points ("FPR TPR" per line) to `roc_path` and the PR points
 * ("recall precision" per line) to `pr_path`, one point per distinct score.
 * The arrays must already be ordered by descending score.
 * Returns 0 on success, -1 on failure (a message has been printed).
 */
static int write_curves(const char *roc_path, const char *pr_path,
                        const double *labels, const double *scores, int n)
{
    FILE *roc_file = fopen(roc_path, "w");
    if (roc_file == NULL) {
        printf("Failed to create file %s\n", roc_path);
        return -1;
    }
    FILE *pr_file = fopen(pr_path, "w");
    if (pr_file == NULL) {
        printf("Failed to create file %s\n", pr_path);
        fclose(roc_file);
        return -1;
    }

    int positives = 0;
    for (int i = 0; i < n; i++) {
        if (labels[i] == 1) {
            positives++;
        }
    }
    int negatives = n - positives;

    /* Curve origins: ROC starts at (0, 0); the PR curve is conventionally
     * anchored at recall 0 with precision 1. */
    fprintf(roc_file, "%lf %lf\n", 0.0, 0.0);
    fprintf(pr_file, "%lf %lf\n", 0.0, 1.0);

    int true_positives = 0;
    int seen = 0;
    while (seen < n) {
        double threshold = scores[seen];
        /* Tied scores share one threshold and therefore one curve point. */
        do {
            if (labels[seen] == 1) {
                true_positives++;
            }
            seen++;
        } while (seen < n && scores[seen] == threshold);

        int false_positives = seen - true_positives;
        /* TPR and recall are the same quantity. */
        double tpr = positives > 0 ? (double)true_positives / positives : 0.0;
        double fpr = negatives > 0 ? (double)false_positives / negatives : 0.0;
        double precision = (double)true_positives / seen;
        fprintf(roc_file, "%lf %lf\n", fpr, tpr);
        fprintf(pr_file, "%lf %lf\n", tpr, precision);
    }

    int status = 0;
    if (fclose(roc_file) != 0) {
        printf("Failed to write file %s\n", roc_path);
        status = -1;
    }
    if (fclose(pr_file) != 0) {
        printf("Failed to write file %s\n", pr_path);
        status = -1;
    }
    return status;
}

/*
 * Program entry point.
 *
 * Reads subset0.txt .. subset4.txt, prints AUROC/AUPR for each subset and
 * their mean, writes the curve points of subset i to roc_curve_<i>.txt and
 * pr_curve_<i>.txt, then merges all subsets (also saved to merged_data.txt),
 * prints AUROC/AUPR of the merged data and its difference from the mean, and
 * writes whole_dataset_roc_curve.txt / whole_dataset_pr_curve.txt.
 *
 * Returns 0 on success and 1 on any I/O or allocation failure.
 */
int main(void) {
    struct sample_set subsets[NUM_SUBSETS] = {{NULL, NULL, 0}};
    double *merged_labels = NULL;
    double *merged_scores = NULL;
    FILE *merged_file = NULL;
    double mean_auroc = 0.0;
    double mean_aupr = 0.0;
    int total_samples = 0;
    int exit_code = 1;

    /* Load the five subsets. */
    for (int i = 0; i < NUM_SUBSETS; i++) {
        char filename[32];
        snprintf(filename, sizeof filename, "subset%d.txt", i);
        if (load_samples(filename, &subsets[i]) != 0) {
            goto cleanup;
        }
    }

    /* Per-subset metrics and curve files. */
    for (int i = 0; i < NUM_SUBSETS; i++) {
        /* calculate_auroc sorts the arrays in place by descending score,
         * which is the order write_curves relies on below. */
        double auroc = calculate_auroc(subsets[i].labels, subsets[i].scores, subsets[i].count);
        double aupr = calculate_aupr(subsets[i].labels, subsets[i].scores, subsets[i].count);
        printf("Subset %d - AUROC: %lf, AUPR: %lf\n", i, auroc, aupr);
        mean_auroc += auroc;
        mean_aupr += aupr;

        /* One pair of files per subset so later subsets do not overwrite
         * earlier ones. */
        char roc_name[32];
        char pr_name[32];
        snprintf(roc_name, sizeof roc_name, "roc_curve_%d.txt", i);
        snprintf(pr_name, sizeof pr_name, "pr_curve_%d.txt", i);
        if (write_curves(roc_name, pr_name, subsets[i].labels, subsets[i].scores,
                         subsets[i].count) != 0) {
            goto cleanup;
        }
    }

    /* Mean of the per-subset metrics. */
    mean_auroc /= NUM_SUBSETS;
    mean_aupr /= NUM_SUBSETS;
    printf("Mean AUROC: %lf, Mean AUPR: %lf\n", mean_auroc, mean_aupr);

    /* Merge in memory; the values are not re-read from the text file, so the
     * merged metrics are computed from exactly the numbers that were loaded. */
    for (int i = 0; i < NUM_SUBSETS; i++) {
        total_samples += subsets[i].count;
    }
    /* Allocate at least one element so an empty data set does not request a
     * zero-sized block. */
    size_t merged_capacity = total_samples > 0 ? (size_t)total_samples : 1;
    merged_labels = malloc(merged_capacity * sizeof *merged_labels);
    merged_scores = malloc(merged_capacity * sizeof *merged_scores);
    if (merged_labels == NULL || merged_scores == NULL) {
        printf("Out of memory while merging subsets\n");
        goto cleanup;
    }

    merged_file = fopen("merged_data.txt", "w");
    if (merged_file == NULL) {
        printf("Failed to create file merged_data.txt\n");
        goto cleanup;
    }
    int next = 0;
    for (int i = 0; i < NUM_SUBSETS; i++) {
        for (int j = 0; j < subsets[i].count; j++) {
            merged_labels[next] = subsets[i].labels[j];
            merged_scores[next] = subsets[i].scores[j];
            /* 17 significant digits round-trip a double exactly. */
            fprintf(merged_file, "%.17g %.17g\n", merged_labels[next], merged_scores[next]);
            next++;
        }
    }
    int close_status = fclose(merged_file);
    merged_file = NULL;
    if (close_status != 0) {
        printf("Failed to write file merged_data.txt\n");
        goto cleanup;
    }

    /* Metrics and curves for the whole data set. */
    double whole_auroc = calculate_auroc(merged_labels, merged_scores, total_samples);
    double whole_aupr = calculate_aupr(merged_labels, merged_scores, total_samples);
    printf("Whole Dataset - AUROC: %lf, AUPR: %lf\n", whole_auroc, whole_aupr);
    printf("Difference (Whole - Mean) - AUROC: %lf, AUPR: %lf\n",
           whole_auroc - mean_auroc, whole_aupr - mean_aupr);

    if (write_curves("whole_dataset_roc_curve.txt", "whole_dataset_pr_curve.txt",
                     merged_labels, merged_scores, total_samples) != 0) {
        goto cleanup;
    }

    exit_code = 0;

cleanup:
    if (merged_file != NULL) {
        fclose(merged_file);
    }
    free(merged_labels);
    free(merged_scores);
    for (int i = 0; i < NUM_SUBSETS; i++) {
        free(subsets[i].labels);
        free(subsets[i].scores);
    }
    return exit_code;
}
请注意,上述代码中调用的函数 calculate_fpr、calculate_tpr、calculate_recall 和 calculate_precision 并没有给出定义,需要根据具体的计算方法自行实现,否则程序无法通过编译和链接。这些函数的实现取决于您所使用的具体评估指标和算法。
在代码中,每个子集的真实标签和预测值存储在labels和scores数组中,子集中的样本数量存储在num_samples数组中。代码会计算每个子集的AUROC和AUPR,并将结果打印出来。然后,将所有子集的数据合并到一个文件中,并计算整个大数据集的AUROC和AUPR,并将结果打印出来。最后,代码会将每个子集和整个大数据集的ROC曲线和PR曲线保存到文件中。
要运行此代码,需要将五个子集的数据分别保存在名为 subset0.txt、subset1.txt、subset2.txt、subset3.txt 和 subset4.txt 的文件中,并确保文件中的数据格式正确:每个样本由两个以空白字符分隔的数值组成,依次为真实标签和预测值。另外,还需要根据具体的计算方法实现上文提到的缺失函数。
原文地址: https://www.cveoy.top/t/topic/i3Qu 著作权归作者所有。请勿转载和采集!