西瓜数据集分类:使用决策树算法实现
以下是使用决策树算法对西瓜数据集进行分类的 Python 代码示例:
import numpy as np
# Toy watermelon dataset: each row is one sample with three binary feature
# columns followed by the class label in the last position.
data = [
[1, 1, 1, 1],
[1, 1, 1, 0],
[0, 1, 1, 1],
[0, 1, 1, 0],
[1, 0, 1, 1],
[1, 0, 1, 0],
[0, 0, 0, 0]
]
# Feature names (色泽=color, 根蒂=root, 敲声=knock, 纹理=texture).
# NOTE(review): only the first 3 columns of `data` are features (the 4th is
# the class label), so the 4th name here is never selected as a split
# feature — confirm the intended column layout.
labels = ['色泽', '根蒂', '敲声', '纹理']
# Possible class values (是=yes/ripe, 否=no). Informational only: the code
# below reads class labels directly from each sample's last column.
class_labels = ['是', '否']
def calc_entropy(data):
    """Return the Shannon entropy (base 2) of the class labels in *data*.

    Each sample's last element is treated as its class label.
    """
    total = len(data)
    counts = {}
    for row in data:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    result = 0.0
    for count in counts.values():
        p = count / float(total)
        result -= p * np.log2(p)
    return result
def split_data(data, axis, value):
    """Return the samples whose feature at *axis* equals *value*,
    with that feature column removed from each returned sample."""
    return [row[:axis] + row[axis + 1:] for row in data if row[axis] == value]
def choose_best_feature(data):
    """Return the index of the feature with the largest information gain.

    Returns -1 when no feature yields a strictly positive gain.
    """
    feature_count = len(data[0]) - 1  # last column is the class label
    entropy_before = calc_entropy(data)
    best_gain, best_index = 0.0, -1
    for idx in range(feature_count):
        distinct_values = {row[idx] for row in data}
        entropy_after = 0.0
        for v in distinct_values:
            subset = split_data(data, idx, v)
            weight = len(subset) / float(len(data))
            entropy_after += weight * calc_entropy(subset)
        gain = entropy_before - entropy_after
        if gain > best_gain:
            best_gain, best_index = gain, idx
    return best_index
def create_decision_tree(data, labels):
    """Recursively build an ID3 decision tree.

    Args:
        data: list of samples; the last element of each sample is its class
            label, the leading elements are feature values.
        labels: feature names for the leading columns of *data*. The list is
            NOT modified. (The original version did ``del labels[best_feature]``
            in place, which shortened the caller's list and made the later
            ``feature_labels.index(root_label)`` lookup in classify() fail.)

    Returns:
        A nested dict ``{feature_label: {feature_value: subtree_or_class}}``,
        or a bare class label for a leaf node.
    """
    class_list = [sample[-1] for sample in data]
    # All samples share one class: leaf node.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # Only the class column remains: majority vote.
    if len(data[0]) == 1:
        return max(set(class_list), key=class_list.count)
    best_feature = choose_best_feature(data)
    best_feature_label = labels[best_feature]
    decision_tree = {best_feature_label: {}}
    # Build the reduced label list without mutating the caller's argument.
    remaining_labels = labels[:best_feature] + labels[best_feature + 1:]
    feature_values = set(sample[best_feature] for sample in data)
    for value in feature_values:
        # Each branch gets its own copy, matching the original sub_labels[:]
        # semantics.
        decision_tree[best_feature_label][value] = create_decision_tree(
            split_data(data, best_feature, value), remaining_labels[:])
    return decision_tree
def classify(decision_tree, feature_labels, test_sample):
    """Classify *test_sample* by walking *decision_tree*.

    Args:
        decision_tree: nested dict as produced by create_decision_tree,
            ``{feature_label: {feature_value: subtree_or_class}}``.
        feature_labels: feature names in the sample's column order.
        test_sample: list of feature values for one sample.

    Returns:
        The predicted class label, or None when the sample's value for the
        current node's feature has no matching branch in the tree. (The
        original left ``class_label`` unassigned in that case and raised
        UnboundLocalError.)
    """
    root_label = next(iter(decision_tree))
    branches = decision_tree[root_label]
    feature_index = feature_labels.index(root_label)
    class_label = None  # default when no branch matches the sample's value
    for key, subtree in branches.items():
        if test_sample[feature_index] == key:
            if isinstance(subtree, dict):
                class_label = classify(subtree, feature_labels, test_sample)
            else:
                class_label = subtree
    return class_label
# Build the tree from a COPY of the label list: the in-place tree builder
# consumed entries from `labels`, which then broke the
# `feature_labels.index(...)` lookup inside classify(). Passing labels[:]
# keeps the original list intact for the classification step below.
decision_tree = create_decision_tree(data, labels[:])
# Classify one held-out sample (feature values in the same column order as
# the training data).
test_sample = [1, 0, 1, 0]
predicted_class = classify(decision_tree, labels, test_sample)
print('预测类别:', predicted_class)
请注意,这只是一个简单的决策树实现示例,可能无法处理更复杂的数据集。使用更成熟的机器学习库(如 scikit-learn)可能更适合处理实际问题。
原文地址: https://www.cveoy.top/t/topic/ba0A 著作权归作者所有。请勿转载和采集!