Decision Tree Pruning: Techniques, Principles, and Practice

I. Introduction

In the world of machine learning, decision trees are popular because they are intuitive and easy to understand. But just as a naturally growing tree needs pruning to thrive, a decision tree model needs pruning to avoid overfitting and to generalize well. In this post we take a close look at the techniques behind decision tree pruning, why they work, and how to apply them in practice.

II. What Is Overfitting?

Imagine a student who, instead of understanding the concepts, memorizes every practice problem and its answer in order to pass an exam. Faced with a new problem, he is lost. That is overfitting. In a decision tree, overfitting shows up as:

- a tree that is too complex, with too many branches;
- a perfect fit on the training data but poor performance on test data;
- splits that capture noise rather than real patterns.
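The visualization example that originally appeared here did not survive extraction. As a stand-in, here is a minimal sketch that makes the pattern visible (the breast-cancer dataset is my assumption; the original figure's data is unknown). A fully grown tree memorizes the training set almost perfectly, while a depth-limited tree trades some training accuracy for better held-out accuracy:

```python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

for depth in (None, 3):  # None = grow until pure; 3 = depth-limited
    clf = DecisionTreeClassifier(max_depth=depth, random_state=42)
    clf.fit(X_train, y_train)
    print(f"max_depth={depth}: "
          f"train={clf.score(X_train, y_train):.3f}, "
          f"test={clf.score(X_test, y_test):.3f}")
```

Typically the unpruned tree reaches 1.000 on the training split while the shallow tree scores lower there but as well or better on the test split; the exact numbers depend on the split.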
III. Two Main Approaches to Decision Tree Pruning

1. Pre-pruning

Stop the tree early while it is still growing, before it becomes too complex:

```python
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree with pre-pruning parameters
tree = DecisionTreeClassifier(
    # 1. Maximum depth: cap how tall the tree may grow
    max_depth=5,
    # 2. Minimum samples to split: a node needs at least this many
    #    samples before a split is even considered
    min_samples_split=20,
    # 3. Minimum samples per leaf
    min_samples_leaf=10,
    # 4. Maximum number of leaf nodes (31 = 2^5 - 1)
    max_leaf_nodes=31,
    # 5. Minimum impurity decrease a split must achieve
    min_impurity_decrease=0.01,
    # 6. Maximum features considered per split (common in random forests)
    max_features="sqrt",
    random_state=42,
)
```
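These thresholds interact, so in practice they are usually tuned jointly rather than hand-picked. A minimal sketch using scikit-learn's GridSearchCV (the grid values below are illustrative assumptions, not recommendations; X_train and y_train come from the overfitting sketch above):

```python
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Illustrative grid over the most influential pre-pruning parameters
param_grid = {
    "max_depth": [3, 5, 7, None],
    "min_samples_leaf": [1, 5, 10],
    "min_samples_split": [2, 10, 20],
}
search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                      param_grid, cv=5, scoring="accuracy")
search.fit(X_train, y_train)
print(search.best_params_, f"{search.best_score_:.3f}")
```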
2. Post-pruning

Let the tree grow to full size first, then cut unnecessary branches back from the bottom up. scikit-learn implements this as cost-complexity pruning:

```python
import numpy as np
import matplotlib.pyplot as plt  # needed for the plots below
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

# Load the data
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Let the tree grow fully
tree_full = DecisionTreeClassifier(random_state=42, max_depth=None)
tree_full.fit(X_train, y_train)

# Get the pruning path
path = tree_full.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

print(f"Found {len(ccp_alphas)} alpha values usable for pruning")
print(f"Alpha range: {ccp_alphas.min():.6f} to {ccp_alphas.max():.6f}")

# Train one tree per alpha value
trees = []
for ccp_alpha in ccp_alphas:
    tree = DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha)
    tree.fit(X_train, y_train)
    trees.append(tree)

# Drop the last tree: it is pruned down to a single node
trees = trees[:-1]
ccp_alphas = ccp_alphas[:-1]

# Evaluate each alpha
train_scores = [tree.score(X_train, y_train) for tree in trees]
test_scores = [tree.score(X_test, y_test) for tree in trees]

# Visualize
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Accuracy vs. alpha
ax1.plot(ccp_alphas, train_scores, marker="o", label="train",
         drawstyle="steps-post")
ax1.plot(ccp_alphas, test_scores, marker="s", label="test",
         drawstyle="steps-post")
ax1.set_xlabel("Alpha (regularization parameter)")
ax1.set_ylabel("Accuracy")
ax1.set_title("Model performance across alpha values")
ax1.legend()
ax1.grid(True, alpha=0.3)

# Node count vs. alpha
node_counts = [tree.tree_.node_count for tree in trees]
ax2.plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
ax2.set_xlabel("Alpha (regularization parameter)")
ax2.set_ylabel("Number of nodes")
ax2.set_title("Tree complexity vs. alpha")
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Pick the best alpha
best_idx = np.argmax(test_scores)
best_alpha = ccp_alphas[best_idx]
print(f"\nBest alpha: {best_alpha:.6f}")
print(f"Best test accuracy: {test_scores[best_idx]:.4f}")
print(f"Corresponding train accuracy: {train_scores[best_idx]:.4f}")
print(f"Tree node count: {node_counts[best_idx]}")
```
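Cost-complexity pruning minimizes R_alpha(T) = R(T) + alpha * |T|, where R(T) is the total impurity of the tree's leaves and |T| is the number of leaves, so larger alpha values buy simpler trees. One caveat about the snippet above: it picks the alpha that maximizes test accuracy, which quietly leaks the test set into model selection. A safer pattern, sketched below reusing the names defined above, is to choose alpha by cross-validation on the training data and touch the test set only once at the end:

```python
from sklearn.model_selection import cross_val_score

# Mean 5-fold CV accuracy on the training split for each candidate alpha
cv_scores = [
    cross_val_score(DecisionTreeClassifier(random_state=42, ccp_alpha=a),
                    X_train, y_train, cv=5).mean()
    for a in ccp_alphas
]
best_alpha_cv = ccp_alphas[int(np.argmax(cv_scores))]
final_tree = DecisionTreeClassifier(
    random_state=42, ccp_alpha=best_alpha_cv).fit(X_train, y_train)
print(f"Alpha chosen by CV: {best_alpha_cv:.6f}, "
      f"test accuracy: {final_tree.score(X_test, y_test):.4f}")
```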
IV. Application

The original post shows a small loan-approval table here; the same data is reproduced in generate_sample_data() below (features: age group, has job, owns house, credit rating; label: whether the loan is approved). We solve this problem with a from-scratch decision tree that supports both pre- and post-pruning. Complete code:

```python
import math
from collections import Counter


class DecisionTree:
    def __init__(self, criterion="info_gain", max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, pruning="post", alpha=0.1):
        self.criterion = criterion                  # "info_gain" or "gain_ratio"
        self.max_depth = max_depth                  # pre-pruning: maximum depth
        self.min_samples_split = min_samples_split  # pre-pruning: min samples to split
        self.min_samples_leaf = min_samples_leaf    # pre-pruning: min samples per leaf
        self.pruning = pruning                      # "pre", "post", or None
        self.alpha = alpha                          # post-pruning penalty coefficient
        self.tree = None
        self.feature_names = ["age group", "has job", "owns house", "credit rating"]
        self.train_data = None                      # kept for post-pruning
        self.train_labels = None

    def fit(self, X, y):
        # Validate the input
        if len(X) == 0 or len(y) == 0 or len(X) != len(y):
            raise ValueError("Training data is empty or features and labels "
                             "differ in length")
        self.train_data = X
        self.train_labels = y
        # Build the raw tree
        self.tree = self._build_tree(X, y, depth=0)
        # Post-pruning
        if self.pruning == "post":
            self.tree = self._post_prune(self.tree, X, y)

    def predict(self, X):
        if len(X) == 0:
            return []
        return [self._predict_single(x, self.tree) for x in X]

    def _entropy(self, y):
        """Shannon entropy of a list of labels."""
        counter = Counter(y)
        entropy = 0.0
        for count in counter.values():
            p = count / len(y)
            entropy -= p * math.log2(p) if p > 0 else 0
        return entropy

    def _information_gain(self, X, y, feature_idx):
        """Information gain (or gain ratio) of splitting on feature_idx."""
        base_entropy = self._entropy(y)
        unique_values = set(x[feature_idx] for x in X)
        conditional_entropy = 0.0
        split_info = 0.0
        for value in unique_values:
            sub_y = [y[i] for i in range(len(X)) if X[i][feature_idx] == value]
            if len(sub_y) == 0:
                continue
            p = len(sub_y) / len(y)
            conditional_entropy += p * self._entropy(sub_y)
            # Split information, used by the gain ratio
            if self.criterion == "gain_ratio":
                split_info -= p * math.log2(p) if p > 0 else 0
        info_gain = base_entropy - conditional_entropy
        if self.criterion == "gain_ratio":
            if split_info == 0:  # avoid division by zero
                return 0
            return info_gain / split_info
        return info_gain

    def _majority_vote(self, y):
        """Most common label (defaults to 0 on empty input)."""
        if len(y) == 0:
            return 0
        counter = Counter(y)
        return counter.most_common(1)[0][0] if counter else 0

    def _build_tree(self, X, y, depth):
        """Recursively build the tree, applying the pre-pruning checks."""
        # Stop 1: all samples belong to one class
        if len(set(y)) == 1:
            return {"class": y[0], "samples": len(y)}
        # Stop 2: maximum depth reached
        if self.max_depth and depth >= self.max_depth:
            return {"class": self._majority_vote(y), "samples": len(y)}
        # Stop 3: too few samples to split
        if len(X) < self.min_samples_split:
            return {"class": self._majority_vote(y), "samples": len(y)}
        # Stop 4: no features left
        if len(X[0]) == 0:
            return {"class": self._majority_vote(y), "samples": len(y)}
        # Choose the best feature
        best_gain = -1
        best_feature = None
        for feature_idx in range(len(X[0])):
            gain = self._information_gain(X, y, feature_idx)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature_idx
        # Stop 5: no split yields any information gain
        if best_gain <= 0:
            return {"class": self._majority_vote(y), "samples": len(y)}
        # Build the subtree
        tree = {
            "feature": best_feature,
            "feature_name": self.feature_names[best_feature],
            "children": {},
            "samples": len(y),
            "entropy": self._entropy(y),
        }
        # Partition the rows by the best feature's value. The full feature
        # vector is kept in each subset so that feature indices (and names)
        # stay aligned across the whole tree; the zero-gain stop above
        # prevents re-splitting on an already-used feature.
        for value in set(x[best_feature] for x in X):
            sub_X = [x for x in X if x[best_feature] == value]
            sub_y = [y[i] for i in range(len(X)) if X[i][best_feature] == value]
            # Pre-pruning: a child with too few samples becomes a leaf
            if len(sub_X) < self.min_samples_leaf:
                tree["children"][value] = {"class": self._majority_vote(sub_y),
                                           "samples": len(sub_y)}
            else:
                tree["children"][value] = self._build_tree(sub_X, sub_y, depth + 1)
        return tree

    def _calculate_tree_error(self, tree):
        """Pessimistic error of a (sub)tree: (errors + 0.5) per leaf."""
        if "class" in tree:  # leaf
            counter = Counter(
                self.train_labels[i] for i in range(len(self.train_data))
                if self._is_sample_in_leaf(self.train_data[i], tree))
            correct = counter[tree["class"]] if tree["class"] in counter else 0
            error = ((tree["samples"] - correct + 0.5) / tree["samples"]
                     if tree["samples"] > 0 else 0)
            return error * tree["samples"], tree["samples"]
        # Internal node: accumulate over the children
        total_error = 0
        total_samples = 0
        for child in tree["children"].values():
            child_error, child_samples = self._calculate_tree_error(child)
            total_error += child_error
            total_samples += child_samples
        return total_error, total_samples

    def _is_sample_in_leaf(self, x, leaf_node):
        """Whether a sample falls into this leaf (used for the error estimate).
        Simplified to always True for this demo; a real implementation would
        match the root-to-leaf path."""
        return True

    def _post_prune(self, tree, X, y):
        """Bottom-up post-pruning: pessimistic error plus a complexity penalty."""
        if "class" in tree:  # leaves cannot be pruned further
            return tree
        # Recursively prune the children first
        pruned_children = {}
        for value, child in tree["children"].items():
            sub_X = [x for x in X if x[tree["feature"]] == value]
            sub_y = [y[i] for i in range(len(X)) if X[i][tree["feature"]] == value]
            pruned_children[value] = self._post_prune(child, sub_X, sub_y)
        tree["children"] = pruned_children
        # Cost of keeping the branches (penalty: number of children)
        before_prune_error, _ = self._calculate_tree_error(tree)
        before_cost = before_prune_error + self.alpha * len(tree["children"])
        # Cost of collapsing this node into a single leaf
        leaf_class = self._majority_vote(y)
        after_prune_error = (len(y)
                             - sum(1 for label in y if label == leaf_class)
                             + 0.5)
        after_cost = after_prune_error + self.alpha * 1  # one leaf
        # Prune if the collapsed leaf is cheaper
        if after_cost < before_cost:
            return {"class": leaf_class, "samples": len(y)}
        return tree

    def _predict_single(self, x, tree):
        """Predict a single sample by walking down the tree."""
        if "class" in tree:  # leaf
            return tree["class"]
        feature_value = x[tree["feature"]]
        if feature_value in tree["children"]:
            return self._predict_single(x, tree["children"][feature_value])
        # Unseen feature value: fall back to the most common training label
        return self._majority_vote(self.train_labels)


def load_data(filename):
    """Load a comma-separated data file (4 features + 1 label per row)."""
    try:
        with open(filename, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except FileNotFoundError:
        return []
    except Exception as e:
        print(f"Error reading file: {e}")
        return []
    data = []
    for line_num, line in enumerate(lines, 1):
        line = line.strip()
        if not line:
            continue
        try:
            values = list(map(int, line.split(",")))
            # Each row must have 5 columns: 4 features + 1 label
            if len(values) != 5:
                print(f"Warning: line {line_num} skipped "
                      f"(need 5 columns, got {len(values)})")
                continue
            data.append(values)
        except ValueError:
            print(f"Warning: line {line_num} contains non-numeric values, skipped")
            continue
    return data


def evaluate_model(y_true, y_pred):
    """Accuracy, precision, recall, and F1, guarding against division by zero."""
    if len(y_true) == 0 or len(y_pred) == 0 or len(y_true) != len(y_pred):
        return {"accuracy": 0.0, "precision": 0.0, "recall": 0.0,
                "f1_score": 0.0, "error": "empty input or length mismatch"}
    accuracy = sum(1 for i in range(len(y_true))
                   if y_true[i] == y_pred[i]) / len(y_true)
    tp = sum(1 for i in range(len(y_true)) if y_true[i] == 1 and y_pred[i] == 1)
    fp = sum(1 for i in range(len(y_true)) if y_true[i] == 0 and y_pred[i] == 1)
    fn = sum(1 for i in range(len(y_true)) if y_true[i] == 1 and y_pred[i] == 0)
    tn = sum(1 for i in range(len(y_true)) if y_true[i] == 0 and y_pred[i] == 0)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = (2 * precision * recall / (precision + recall)
          if (precision + recall) > 0 else 0)
    return {"accuracy": accuracy, "precision": precision, "recall": recall,
            "f1_score": f1, "error": None,
            "confusion_matrix": {"tp": tp, "fp": fp, "fn": fn, "tn": tn}}


def print_tree(tree, indent=0):
    """Pretty-print the tree structure."""
    if tree is None:
        print("  " * indent + "Empty tree")
        return
    if "class" in tree:
        print("  " * indent + f"Class: {tree['class']} "
                              f"(samples: {tree['samples']})")
    else:
        print("  " * indent + f"Feature: {tree['feature_name']} "
                              f"(samples: {tree['samples']}, "
                              f"entropy: {tree['entropy']:.3f})")
        for value, subtree in tree["children"].items():
            print("  " * (indent + 1) + f"Value {value}:")
            print_tree(subtree, indent + 2)


def generate_sample_data():
    """Built-in sample data so the demo always has something to run on."""
    # Row format: [age group, has job, owns house, credit rating, loan approved]
    # age group: 0 (young), 1 (middle-aged), 2 (senior)
    # has job / owns house: 0 (no), 1 (yes)
    # credit rating: 0 (poor), 1 (fair), 2 (good); loan: 0 (no), 1 (yes)
    train_data = [
        [0, 0, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 1, 1], [0, 1, 1, 0, 1],
        [0, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 1, 0], [1, 1, 1, 1, 1],
        [1, 0, 1, 2, 1], [1, 0, 1, 2, 1], [2, 0, 1, 2, 1], [2, 0, 1, 1, 1],
        [2, 1, 0, 1, 1], [2, 1, 0, 2, 1], [2, 0, 0, 0, 0],
    ]
    test_data = [
        [0, 0, 0, 2, 0], [1, 0, 1, 1, 1], [2, 1, 0, 0, 1], [0, 1, 0, 2, 1],
    ]
    return train_data, test_data


def main():
    print("=" * 70)
    print("Decision tree classifier - loan approval prediction (with pruning)")
    print("=" * 70)

    # Load the data, falling back to the built-in sample data
    train_data = load_data("dataset.txt")
    test_data = load_data("testset.txt")
    if len(train_data) == 0 or len(test_data) == 0:
        print("\nWarning: no valid data files found; "
              "using the built-in sample data instead...")
        train_data, test_data = generate_sample_data()

    # Split features and labels
    X_train = [sample[:-1] for sample in train_data]
    y_train = [sample[-1] for sample in train_data]
    X_test = [sample[:-1] for sample in test_data]
    y_test = [sample[-1] for sample in test_data]

    print("\nData overview:")
    print(f"  Training samples: {len(X_train)}")
    print(f"  Test samples: {len(X_test)}")

    def report(metrics, model):
        """Print the metrics and the tree for one trained model."""
        print("Performance:")
        if metrics["error"]:
            print(f"  Error: {metrics['error']}")
        else:
            print(f"  Accuracy:  {metrics['accuracy']:.3f}")
            print(f"  Precision: {metrics['precision']:.3f}")
            print(f"  Recall:    {metrics['recall']:.3f}")
            print(f"  F1 score:  {metrics['f1_score']:.3f}")
        print("\nTree structure:")
        print_tree(model.tree)

    # Initialize so the comparison below never hits an undefined name
    metrics_no_prune = metrics_pre = metrics_post = metrics_ratio = None

    # 1. No pruning
    print("\n1. Unpruned decision tree:")
    print("-" * 50)
    try:
        dt = DecisionTree(criterion="info_gain", pruning=None)
        dt.fit(X_train, y_train)
        metrics_no_prune = evaluate_model(y_test, dt.predict(X_test))
        report(metrics_no_prune, dt)
    except Exception as e:
        print(f"Training/prediction failed: {e}")

    # 2. Pre-pruning
    print("\n\n2. Pre-pruned decision tree (max_depth=3, min_samples_split=3):")
    print("-" * 50)
    try:
        dt = DecisionTree(criterion="info_gain", pruning="pre",
                          max_depth=3, min_samples_split=3, min_samples_leaf=2)
        dt.fit(X_train, y_train)
        metrics_pre = evaluate_model(y_test, dt.predict(X_test))
        report(metrics_pre, dt)
    except Exception as e:
        print(f"Training/prediction failed: {e}")

    # 3. Post-pruning
    print("\n\n3. Post-pruned decision tree (alpha=0.1):")
    print("-" * 50)
    try:
        dt = DecisionTree(criterion="info_gain", pruning="post", alpha=0.1)
        dt.fit(X_train, y_train)
        metrics_post = evaluate_model(y_test, dt.predict(X_test))
        report(metrics_post, dt)
    except Exception as e:
        print(f"Training/prediction failed: {e}")

    # 4. Gain ratio with post-pruning
    print("\n\n4. Gain-ratio post-pruned decision tree:")
    print("-" * 50)
    try:
        dt = DecisionTree(criterion="gain_ratio", pruning="post", alpha=0.1)
        dt.fit(X_train, y_train)
        metrics_ratio = evaluate_model(y_test, dt.predict(X_test))
        report(metrics_ratio, dt)
    except Exception as e:
        print(f"Training/prediction failed: {e}")

    # 5. Side-by-side comparison
    print("\n\n5. Performance comparison:")
    print("-" * 60)
    print(f"{'Strategy':<24}{'Accuracy':<10}{'Precision':<11}"
          f"{'Recall':<8}{'F1':<8}")
    print("-" * 60)

    def safe_get_metric(metrics, key):
        """Return a metric, or 0.0 if the model failed to train."""
        return metrics.get(key, 0.0) if isinstance(metrics, dict) else 0.0

    for name, metrics in [("No pruning", metrics_no_prune),
                          ("Pre-pruning", metrics_pre),
                          ("Post-pruning", metrics_post),
                          ("Gain-ratio post-pruning", metrics_ratio)]:
        print(f"{name:<24}"
              f"{safe_get_metric(metrics, 'accuracy'):<10.3f}"
              f"{safe_get_metric(metrics, 'precision'):<11.3f}"
              f"{safe_get_metric(metrics, 'recall'):<8.3f}"
              f"{safe_get_metric(metrics, 'f1_score'):<8.3f}")


if __name__ == "__main__":
    # Catch-all so a failure still prints a traceback
    try:
        main()
    except Exception as e:
        print(f"\nProgram failed: {e}")
        import traceback
        traceback.print_exc()
```
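As a quick sanity check on the entropy numbers that appear in the printed trees below: at the root, 9 of the 15 training samples are positive, and within the owns house = 0 branch, 3 of 9 are positive. Plugging those proportions into the binary entropy function reproduces the printed values:

```python
import math

def binary_entropy(p):
    """Entropy, in bits, of a two-class distribution with positive rate p."""
    return -p * math.log2(p) - (1 - p) * math.log2(1 - p)

print(f"{binary_entropy(9 / 15):.3f}")  # 0.971, the root entropy
print(f"{binary_entropy(3 / 9):.3f}")   # 0.918, the owns house = 0 branch
```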
Running the program yields:

```
======================================================================
Decision tree classifier - loan approval prediction (with pruning)
======================================================================

Warning: no valid data files found; using the built-in sample data instead...

Data overview:
  Training samples: 15
  Test samples: 4

1. Unpruned decision tree:
--------------------------------------------------
Performance:
  Accuracy:  1.000
  Precision: 1.000
  Recall:    1.000
  F1 score:  1.000

Tree structure:
Feature: owns house (samples: 15, entropy: 0.971)
  Value 0:
    Feature: has job (samples: 9, entropy: 0.918)
      Value 0:
        Class: 0 (samples: 6)
      Value 1:
        Class: 1 (samples: 3)
  Value 1:
    Class: 1 (samples: 6)
```

Sections 2 (pre-pruning, max_depth=3, min_samples_split=3), 3 (post-pruning, alpha=0.1), and 4 (gain-ratio post-pruning) print exactly the same metrics and the same tree, and the comparison table confirms it:

```
5. Performance comparison:
------------------------------------------------------------
Strategy                Accuracy  Precision  Recall  F1
------------------------------------------------------------
No pruning              1.000     1.000      1.000   1.000
Pre-pruning             1.000     1.000      1.000   1.000
Post-pruning            1.000     1.000      1.000   1.000
Gain-ratio post-pruning 1.000     1.000      1.000   1.000
```
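On a dataset this small and this clean, the fully grown tree is already minimal (three leaves), so there is nothing for pruning to remove and all four strategies coincide. Pruning starts to matter once the data is noisy. As a rough illustration (a sketch, not part of the original experiment; the number of flipped labels and the alpha value are arbitrary assumptions), you can corrupt a few training labels and compare the unpruned and post-pruned trees:

```python
import random

# Rebuild the sample data, then flip a few training labels
train_data, test_data = generate_sample_data()
X_test = [row[:-1] for row in test_data]
y_test = [row[-1] for row in test_data]

random.seed(0)
noisy_train = [row[:] for row in train_data]
for row in random.sample(noisy_train, 3):  # flip 3 of the 15 labels
    row[-1] = 1 - row[-1]
X_noisy = [row[:-1] for row in noisy_train]
y_noisy = [row[-1] for row in noisy_train]

for name, kwargs in [("no pruning", {"pruning": None}),
                     ("post-pruning", {"pruning": "post", "alpha": 0.5})]:
    model = DecisionTree(criterion="info_gain", **kwargs)
    model.fit(X_noisy, y_noisy)
    preds = model.predict(X_test)
    accuracy = sum(p == t for p, t in zip(preds, y_test)) / len(y_test)
    print(f"{name}: test accuracy {accuracy:.3f}")
```

The unpruned tree grows extra branches to chase the flipped labels, while the pruned tree tends to collapse them back into leaves. Which variant wins on only four test samples varies with the seed, so treat this as a demonstration of the mechanism rather than a benchmark.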
V. Conclusion

Decision tree pruning is not just a technique; it is a balancing act between model complexity and generalization. With the pruning methods covered in this post you can:

- avoid overfitting, so the model generalizes better to new data;
- improve interpretability by keeping the model simple;
- cut unnecessary computation and storage costs;
- deliver more business value through sturdier, more reliable predictive models.

Remember: no single pruning method fits every scenario. The best approach is to experiment with different pruning strategies on your own data and your own business problem until you find the balance point that suits them.