以下是使用决策树算法对西瓜数据集进行分类的 Python 代码示例:

import numpy as np

# Toy watermelon-style dataset: each row is one sample; the first three
# columns are binary feature values and the LAST column is the class
# label (1 = positive "是", 0 = negative "否").
data = [
    [1, 1, 1, 1],
    [1, 1, 1, 0],
    [0, 1, 1, 1],
    [0, 1, 1, 0],
    [1, 0, 1, 1],
    [1, 0, 1, 0],
    [0, 0, 0, 0]
]

# Feature names, indexed by column position.
# NOTE(review): rows carry only 3 feature columns (the 4th is the class),
# so the 4th name '纹理' never maps to a real feature — confirm whether a
# feature column is missing from `data`.
labels = ['色泽', '根蒂', '敲声', '纹理']
# Human-readable class names; not referenced anywhere else in this script.
class_labels = ['是', '否']

# Shannon entropy of the class column.
def calc_entropy(data):
    """Return the base-2 Shannon entropy of the class labels in *data*.

    Each sample's last element is treated as its class label.  An empty
    dataset yields 0.0.
    """
    total = len(data)
    counts = {}
    for row in data:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    # H = -sum(p * log2(p)) over the class distribution.
    return -sum((c / float(total)) * np.log2(c / float(total))
                for c in counts.values()) + 0.0

# Partition helper: select matching rows and drop the tested column.
def split_data(data, axis, value):
    """Return the samples whose feature at *axis* equals *value*,
    with that feature column removed from each returned sample.
    """
    return [row[:axis] + row[axis + 1:]
            for row in data
            if row[axis] == value]

# Feature selection by information gain (ID3 criterion).
def choose_best_feature(data):
    """Return the index of the feature with the highest information gain.

    NOTE: returns -1 when no feature yields a strictly positive gain
    (same sentinel as the classic reference implementation).
    """
    base_entropy = calc_entropy(data)
    best_feature, best_gain = -1, 0.0

    for feature in range(len(data[0]) - 1):
        # Weighted entropy of the partition induced by this feature.
        weighted_entropy = 0.0
        for value in {row[feature] for row in data}:
            subset = split_data(data, feature, value)
            weight = len(subset) / float(len(data))
            weighted_entropy += weight * calc_entropy(subset)

        gain = base_entropy - weighted_entropy
        if gain > best_gain:
            best_feature, best_gain = feature, gain

    return best_feature

# 创建决策树
def create_decision_tree(data, labels):
    """Recursively build an ID3-style decision tree.

    data   -- list of samples; each sample's last element is its class label
    labels -- feature names aligned with the current feature columns of data
    Returns either a class label (leaf) or a nested dict of the form
    {feature_label: {feature_value: subtree_or_class, ...}}.

    BUGFIX: the original did `del labels[best_feature]`, mutating the
    CALLER's list.  That corrupted the module-level `labels`, so the later
    `classify(decision_tree, labels, ...)` call crashed with ValueError
    when looking up the deleted feature name.  This version never mutates
    its argument.
    """
    class_list = [sample[-1] for sample in data]

    # All samples share one class: return that class as a leaf.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]

    # No features left (only the class column): return the majority class.
    if len(data[0]) == 1:
        return max(set(class_list), key=class_list.count)

    best_feature = choose_best_feature(data)
    best_feature_label = labels[best_feature]

    # Labels remaining after removing the chosen feature -- built as a new
    # list instead of deleting from the caller's list.
    remaining_labels = labels[:best_feature] + labels[best_feature + 1:]

    decision_tree = {best_feature_label: {}}
    for value in set(sample[best_feature] for sample in data):
        subtree = create_decision_tree(
            split_data(data, best_feature, value), remaining_labels[:])
        decision_tree[best_feature_label][value] = subtree

    return decision_tree

# 使用决策树进行分类
def classify(decision_tree, feature_labels, test_sample):
    """Classify *test_sample* by walking *decision_tree*.

    decision_tree  -- nested dict {feature_label: {feature_value: subtree_or_class}}
    feature_labels -- maps a feature label to its column index in test_sample
    Returns the predicted class label, or None when the sample's feature
    value has no matching branch in the tree.

    BUGFIX: the original never initialized `class_label`, so an unmatched
    feature value raised UnboundLocalError instead of failing gracefully.
    """
    root_label = list(decision_tree.keys())[0]
    branches = decision_tree[root_label]
    feature_index = feature_labels.index(root_label)

    class_label = None  # default when no branch matches the sample's value
    value = test_sample[feature_index]
    if value in branches:
        subtree = branches[value]
        if isinstance(subtree, dict):
            # Internal node: keep descending.
            class_label = classify(subtree, feature_labels, test_sample)
        else:
            # Leaf node: the stored value is the class label.
            class_label = subtree

    return class_label

# Build the tree from a COPY of `labels`: create_decision_tree deletes
# entries from the list it is given, and classify() below still needs the
# original, complete label ordering (passing `labels` directly made the
# `feature_labels.index(...)` lookup in classify raise ValueError).
decision_tree = create_decision_tree(data, labels[:])

# Classify one held-out sample (its last element is the unused class column).
test_sample = [1, 0, 1, 0]
predicted_class = classify(decision_tree, labels, test_sample)

print('预测类别:', predicted_class)

请注意,这只是一个简单的决策树实现示例,可能无法处理更复杂的数据集。使用更成熟的机器学习库(如 scikit-learn)可能更适合处理实际问题。


原文地址: https://www.cveoy.top/t/topic/ba0A 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录