import random

# Per-class quotas used when splitting the dataset.  Class labels are the
# integers 0 .. NUM_CLASSES - 1 stored in the last comma-separated field.
NUM_CLASSES = 8
TRAIN_PER_CLASS = 8000
VAL_PER_CLASS = 800
TEST_PER_CLASS = 800


def load_dataset(path):
    """Read a comma-separated text file into a list of field lists.

    Each non-blank line is stripped of surrounding whitespace and split on
    ','.  Blank lines are skipped, because they would otherwise produce a
    sample whose "label" is the empty string.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one list of string fields per non-blank line.
    """
    dataset = []
    with open(path, 'r') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            dataset.append(stripped.split(','))
    return dataset


def split_dataset(dataset,
                  num_classes=NUM_CLASSES,
                  train_per_class=TRAIN_PER_CLASS,
                  val_per_class=VAL_PER_CLASS,
                  test_per_class=TEST_PER_CLASS):
    """Split samples into train/validation/test sets with per-class quotas.

    Samples are visited in the order given.  For every class, the first
    ``train_per_class`` samples go to the training set, the next
    ``val_per_class`` to the validation set and the next ``test_per_class``
    to the test set.  Any further samples of that class are dropped.

    Args:
        dataset: Iterable of samples; the last field of each sample is the
            class label (anything ``int()`` accepts).
        num_classes: Number of classes; valid labels are 0 .. num_classes-1.
        train_per_class: Training-set quota per class.
        val_per_class: Validation-set quota per class.
        test_per_class: Test-set quota per class.

    Returns:
        A ``(train_set, val_set, test_set)`` tuple of lists.

    Raises:
        ValueError: If a label is not an integer or lies outside
            ``0 .. num_classes - 1``.
    """
    train_set = []
    val_set = []
    test_set = []
    # Number of samples of each class already assigned to some split.
    label_counter = [0] * num_classes
    val_limit = train_per_class + val_per_class
    test_limit = val_limit + test_per_class

    for data in dataset:
        label = int(data[-1])
        # Reject out-of-range labels explicitly: a negative label would
        # otherwise silently index the counter list from the end.
        if not 0 <= label < num_classes:
            raise ValueError(
                'label %d is outside the expected range 0..%d'
                % (label, num_classes - 1)
            )

        assigned = label_counter[label]
        if assigned < train_per_class:
            train_set.append(data)
        elif assigned < val_limit:
            val_set.append(data)
        elif assigned < test_limit:
            test_set.append(data)
        else:
            # All quotas for this class are full; drop the sample.
            continue
        label_counter[label] += 1

    return train_set, val_set, test_set


def save_dataset(path, rows):
    """Write samples to ``path``, one comma-joined sample per line.

    Args:
        path: Path of the text file to (over)write.
        rows: Iterable of samples, each an iterable of string fields.
    """
    with open(path, 'w') as file:
        file.writelines(','.join(row) + '\n' for row in rows)


def main():
    """Split 'data.txt' into 'train.txt', 'val.txt' and 'test.txt'."""
    dataset = load_dataset('data.txt')

    # Shuffle first so that which samples fill each quota is random.
    random.shuffle(dataset)

    train_set, val_set, test_set = split_dataset(dataset)

    # Shuffle each split again before writing it out.
    random.shuffle(train_set)
    random.shuffle(val_set)
    random.shuffle(test_set)

    save_dataset('train.txt', train_set)
    save_dataset('val.txt', val_set)
    save_dataset('test.txt', test_set)


if __name__ == '__main__':
    main()

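请将上述代码保存为一个 Python 脚本，并将数据集文件命名为 "data.txt"，放在运行脚本时的当前工作目录下，然后运行该脚本即可。数据文件的每一行是一条以逗号分隔的样本，最后一个字段为整数类别标签（0–7）。执行完毕后，将分别生成 "train.txt"、"val.txt" 和 "test.txt" 这三个文件，它们分别包含按要求划分的训练集、验证集和测试集：每个类别依次取 8000 条样本作为训练集、800 条作为验证集、800 条作为测试集，超出部分将被丢弃。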

数据集划分:将文本数据集分割为训练集、验证集和测试集

原文地址: https://www.cveoy.top/t/topic/jNJD 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录