Python 数据集分割:将大型数据集划分为训练集、验证集和测试集
import random
# Zero-based index of the column that holds the class label; every row must
# have at least LABEL_INDEX + 1 comma-separated fields.
LABEL_INDEX = 23


def parse_samples(lines, label_index=LABEL_INDEX):
    """Parse comma-separated text lines into samples and their distinct labels.

    Args:
        lines: Iterable of text lines (an open file works).
        label_index: Zero-based position of the label field in each row.

    Returns:
        A ``(samples, labels)`` tuple. ``samples`` is a list of field lists,
        each kept whole so the label column stays available to the split
        step and to the output files. ``labels`` lists each distinct label
        once, in order of first appearance.

    Raises:
        ValueError: If a non-blank line has too few fields to hold a label.
    """
    samples = []
    labels = []
    seen_labels = set()
    for line_number, line in enumerate(lines, start=1):
        stripped = line.strip()
        if not stripped:
            # Blank lines (for example a trailing newline) carry no sample.
            continue
        fields = stripped.split(',')
        if len(fields) <= label_index:
            raise ValueError(
                'line %d has %d fields; expected at least %d'
                % (line_number, len(fields), label_index + 1)
            )
        samples.append(fields)
        label = fields[label_index]
        if label not in seen_labels:
            seen_labels.add(label)
            labels.append(label)
    return samples, labels


def split_by_label(samples, labels, train_size=1000, valid_size=100,
                   test_size=100, label_index=LABEL_INDEX):
    """Split samples into train/validation/test sets with per-label quotas.

    For every label, the first ``train_size`` matching samples (in the order
    they appear in ``samples``) go to the training set, the next
    ``valid_size`` to the validation set and the next ``test_size`` to the
    test set. Further samples of that label are dropped. Each output list is
    ordered label by label, following the order of ``labels``.

    Args:
        samples: List of field lists, each containing the label column.
        labels: Labels to draw samples for.
        train_size: Maximum training samples per label.
        valid_size: Maximum validation samples per label.
        test_size: Maximum test samples per label.
        label_index: Zero-based position of the label field in each row.

    Returns:
        A ``(train_set, valid_set, test_set)`` tuple of lists.
    """
    valid_end = train_size + valid_size
    per_label_cap = valid_end + test_size

    # One pass over the data: bucket the samples by label, keeping at most
    # the number of rows that the three subsets can absorb.
    buckets = {label: [] for label in labels}
    for sample in samples:
        bucket = buckets.get(sample[label_index])
        if bucket is not None and len(bucket) < per_label_cap:
            bucket.append(sample)

    train_set = []
    valid_set = []
    test_set = []
    for label in labels:
        bucket = buckets[label]
        train_set.extend(bucket[:train_size])
        valid_set.extend(bucket[train_size:valid_end])
        test_set.extend(bucket[valid_end:])
    return train_set, valid_set, test_set


def save_samples(path, samples):
    """Write each sample to ``path`` as one comma-separated line."""
    with open(path, 'w') as file:
        file.writelines(','.join(sample) + '\n' for sample in samples)


def main(data_path='data.txt', train_path='train_set.txt',
         valid_path='valid_set.txt', test_path='test_set.txt'):
    """Read the dataset, shuffle it, split it per label and save the subsets."""
    # Read the dataset.
    with open(data_path, 'r') as file:
        data, labels = parse_samples(file)
    # Shuffle so that the per-label quotas pick a random subset of rows.
    random.shuffle(data)

    # Split the dataset.
    train_set, valid_set, test_set = split_by_label(data, labels)

    # Save the subsets.
    save_samples(train_path, train_set)
    save_samples(valid_path, valid_set)
    save_samples(test_path, test_set)


if __name__ == '__main__':
    main()
原文地址: https://www.cveoy.top/t/topic/jO8C 著作权归作者所有。请勿转载和采集!