数据集划分:将文本数据集分割为训练集、验证集和测试集
import random
# Split a labelled text dataset into train / validation / test files.
#
# Input format: one sample per line, fields separated by commas, with the
# integer class label (0 .. NUM_CLASSES - 1) as the last field.

# Default per-class quotas. Samples of a class beyond the sum of the three
# quotas are dropped.
NUM_CLASSES = 8
TRAIN_PER_CLASS = 8000
VAL_PER_CLASS = 800
TEST_PER_CLASS = 800


def read_dataset(path, encoding='utf-8'):
    """Read a comma-separated dataset file into a list of field lists.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding of the file. Given explicitly so the result
            does not depend on the platform's default encoding.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        comma-separated string fields of that line.
    """
    records = []
    with open(path, 'r', encoding=encoding) as file:
        for line in file:
            stripped = line.strip()
            # A blank line (e.g. a trailing newline at end of file) carries
            # no sample and would otherwise fail label parsing later on.
            if not stripped:
                continue
            records.append(stripped.split(','))
    return records


def split_dataset(dataset, num_classes=NUM_CLASSES,
                  train_per_class=TRAIN_PER_CLASS,
                  val_per_class=VAL_PER_CLASS,
                  test_per_class=TEST_PER_CLASS):
    """Distribute samples into train/val/test with per-class quotas.

    Samples are consumed in the order given. For each class, the first
    ``train_per_class`` samples go to the training set, the next
    ``val_per_class`` to the validation set, the next ``test_per_class`` to
    the test set, and any remaining samples of that class are dropped.
    This function does not shuffle; shuffle ``dataset`` beforehand if a
    random assignment is wanted.

    Args:
        dataset: Iterable of samples; the last field of each sample is the
            class label (an integer, or a string parseable by ``int``).
        num_classes: Number of classes; valid labels are
            ``0 .. num_classes - 1``.
        train_per_class: Maximum training samples per class.
        val_per_class: Maximum validation samples per class.
        test_per_class: Maximum test samples per class.

    Returns:
        A tuple ``(train_set, val_set, test_set)`` of lists of samples.

    Raises:
        ValueError: If a label is not an integer or lies outside
            ``0 .. num_classes - 1``.
    """
    train_set = []
    val_set = []
    test_set = []
    val_limit = train_per_class + val_per_class
    test_limit = val_limit + test_per_class
    seen_per_class = [0] * num_classes  # samples already placed, per class

    for index, data in enumerate(dataset):
        try:
            label = int(data[-1])
        except (TypeError, ValueError) as err:
            raise ValueError(
                f'sample {index}: label {data[-1]!r} is not an integer'
            ) from err
        # Reject negative labels explicitly: plain list indexing would wrap
        # around and silently count them under the wrong class.
        if not 0 <= label < num_classes:
            raise ValueError(
                f'sample {index}: label {label} is outside '
                f'0..{num_classes - 1}'
            )

        placed = seen_per_class[label]
        if placed < train_per_class:
            target = train_set
        elif placed < val_limit:
            target = val_set
        elif placed < test_limit:
            target = test_set
        else:
            continue  # quota for this class exhausted; drop the sample
        target.append(data)
        seen_per_class[label] += 1

    return train_set, val_set, test_set


def write_dataset(path, rows, encoding='utf-8'):
    """Write samples to ``path``, one comma-joined sample per line.

    Args:
        path: Path of the output file (overwritten if it exists).
        rows: Iterable of samples, each a sequence of string fields.
        encoding: Text encoding of the output file.
    """
    with open(path, 'w', encoding=encoding) as file:
        file.writelines(','.join(row) + '\n' for row in rows)


def main(data_path='data.txt', train_path='train.txt',
         val_path='val.txt', test_path='test.txt'):
    """Read the dataset, shuffle it, split it and write the three files."""
    dataset = read_dataset(data_path)
    # Shuffle first so that which samples land in which split is random.
    random.shuffle(dataset)
    train_set, val_set, test_set = split_dataset(dataset)

    # Shuffle each split again, as the original script did.
    random.shuffle(train_set)
    random.shuffle(val_set)
    random.shuffle(test_set)

    write_dataset(train_path, train_set)
    write_dataset(val_path, val_set)
    write_dataset(test_path, test_set)


if __name__ == '__main__':
    main()
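请将代码保存为一个Python脚本,并将数据集文件命名为"data.txt",放在运行脚本时的当前工作目录下(每行一条样本,字段之间用英文逗号分隔,最后一个字段为 0~7 的整数类别标签),然后运行该脚本即可。执行完毕后,将分别生成"train.txt"、"val.txt"和"test.txt"这三个文件,它们分别包含划分后的训练集、验证集和测试集数据:每个类别最多 8000 条样本进入训练集、800 条进入验证集、800 条进入测试集,超出部分的样本会被丢弃。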
原文地址: https://www.cveoy.top/t/topic/jNJD 著作权归作者所有。请勿转载和采集!