import random

# Row layout in data.txt: 23 comma-separated feature fields followed by the
# class label in the 24th field (index 23).
NUM_FEATURES = 23
LABEL_INDEX = NUM_FEATURES

# Number of samples taken per label for each split.
TRAIN_PER_LABEL = 1000
VALID_PER_LABEL = 100
TEST_PER_LABEL = 100


def load_samples(path, label_index=LABEL_INDEX):
    """Read comma-separated samples from *path*.

    Each kept row holds the fields up to and including the label column, so
    the label stays available for splitting and is written back on save.
    Blank lines are skipped.

    Args:
        path: Path of the text file to read.
        label_index: Zero-based index of the label field in each row.

    Returns:
        A ``(rows, labels)`` tuple: ``rows`` is a list of field lists and
        ``labels`` is the sorted list of distinct labels seen.

    Raises:
        ValueError: If a non-blank line has no field at ``label_index``.
    """
    rows = []
    labels = set()
    with open(path, 'r') as file:
        for line_no, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank lines, e.g. a trailing newline
            fields = stripped.split(',')
            if len(fields) <= label_index:
                raise ValueError(
                    f"{path}:{line_no}: expected at least {label_index + 1} "
                    f"fields, got {len(fields)}"
                )
            rows.append(fields[:label_index + 1])
            labels.add(fields[label_index])
    # Sorted so the output order does not depend on set iteration order.
    return rows, sorted(labels)


def split_by_label(rows, labels, n_train=TRAIN_PER_LABEL,
                   n_valid=VALID_PER_LABEL, n_test=TEST_PER_LABEL,
                   label_index=LABEL_INDEX):
    """Split *rows* into train/validation/test lists with per-label quotas.

    For every label in *labels* (in the given order), the first ``n_train``
    matching rows go to the training set, the next ``n_valid`` to the
    validation set and the next ``n_test`` to the test set. Rows are taken
    in the order they appear in *rows*; surplus rows are dropped, and a
    label with fewer rows than the quotas simply fills the splits as far as
    its rows reach.

    Args:
        rows: Sequence of field lists, each with a label at ``label_index``.
        labels: Labels to include, in output order.
        n_train: Training rows per label.
        n_valid: Validation rows per label.
        n_test: Test rows per label.
        label_index: Zero-based index of the label field in each row.

    Returns:
        A ``(train, valid, test)`` tuple of row lists.
    """
    # Group once instead of rescanning every row for every label.
    by_label = {}
    for row in rows:
        by_label.setdefault(row[label_index], []).append(row)

    valid_end = n_train + n_valid
    test_end = valid_end + n_test

    train, valid, test = [], [], []
    for label in labels:
        bucket = by_label.get(label, [])
        train.extend(bucket[:n_train])
        valid.extend(bucket[n_train:valid_end])
        test.extend(bucket[valid_end:test_end])
    return train, valid, test


def save_samples(path, rows):
    """Write *rows* to *path*, one comma-joined row per line."""
    with open(path, 'w') as file:
        file.writelines(','.join(row) + '\n' for row in rows)


if __name__ == "__main__":
    # Load the dataset
    data, labels = load_samples('data.txt')
    random.shuffle(data)

    # Split the dataset
    train_set, valid_set, test_set = split_by_label(data, labels)

    # Save the splits
    save_samples('train_set.txt', train_set)
    save_samples('valid_set.txt', valid_set)
    save_samples('test_set.txt', test_set)
# Python 数据集分割:将大型数据集划分为训练集、验证集和测试集
#
# 原文地址: https://www.cveoy.top/t/topic/jO8C 著作权归作者所有。请勿转载和采集!
#
# 免费AI点我,无需注册和登录