茂名建设企业网站热点网站建设
2026/1/1 23:42:37 网站建设 项目流程
"""Feature-selection comparison on a heart-disease dataset.

Overview of the methods compared (from the accompanying notes):

- Variance threshold (filter): drop near-constant features. Very fast, but
  looks only at the feature itself, never at the target.
- Pearson correlation (filter): keep the features most linearly correlated
  with the target. Intuitive, but blind to non-linear relations.
- Lasso / L1 (embedded): L1 regularisation in a linear model shrinks
  unimportant coefficients to exactly zero; selection and training happen
  together, which suits high-dimensional data.
- Tree importance (embedded): split-contribution importance from a random
  forest; captures non-linearity but can be biased toward high-cardinality
  features.
- SHAP importance: attribution-based importance; also shows the direction
  (positive/negative) of each feature's effect, so it is more interpretable.
- RFE (wrapper): repeatedly retrain a model and drop its weakest feature
  until a preset count remains; accurate but computationally expensive.

Task: run every method on the heart-disease data and compare model accuracy.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.metrics import accuracy_score, classification_report
import shap
import xgboost as xgb

# ---------------------- 1. Load data and preprocess ----------------------
df = pd.read_csv('E:/PythonStudy/data.csv')

# Separate the feature matrix from the binary target column.
X = df.drop('target', axis=1)
y = df['target']

# Impute missing values with each column's mean.
X = X.fillna(X.mean())
# ---------------------- 2. Exploratory visualisation ----------------------
def data_explore(df):
    """Visualise the class balance (donut chart) and the feature/target
    correlation matrix (heatmap) for the raw dataframe *df*."""
    # Donut chart of the target distribution.
    target_count = df['target'].value_counts().reset_index()
    target_count['target'] = target_count['target'].map(
        lambda x: '患病' if x == 1 else '正常'
    )
    target_count['占比'] = target_count['count'] / target_count['count'].sum() * 100
    plt.figure(figsize=(6, 6))
    # pie() returns (wedges, texts, autotexts) when autopct is given —
    # unpacking into two names would raise ValueError.
    wedges, texts, autotexts = plt.pie(
        target_count['count'],
        wedgeprops={'width': 0.4, 'edgecolor': '#000'},
        labels=target_count['target'],
        autopct='%1.1f%%'
    )
    plt.title('样本患病情况分布', fontsize=15, pad=20)
    plt.show()

    # Heatmap of pairwise correlations between all features and the target.
    corr = df.corr()
    plt.figure(figsize=(12, 8))
    sns.heatmap(
        corr, annot=True, cmap='RdBu_r', vmin=-1, vmax=1,
        annot_kws={'fontsize': 10}
    )
    plt.title('特征-目标相关性热力图', fontsize=15)
    plt.show()

# Run the exploration once on the raw data.
data_explore(df)

# ---------------------- 3. Train/test split and scaling ----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
scaler = StandardScaler()
# Keep the original row index so the scaled frames stay aligned with y.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

# ---------------------- 4. Feature-selection methods ----------------------
def select_by_variance(X_train, X_test, threshold=0.1):
    """Variance filter: drop features whose variance is below *threshold*.

    Returns the reduced (train, test) frames with the surviving columns.
    """
    vt = VarianceThreshold(threshold)
    X_train_sel = vt.fit_transform(X_train)
    X_test_sel = vt.transform(X_test)
    cols = X_train.columns[vt.get_support()]
    return (pd.DataFrame(X_train_sel, columns=cols),
            pd.DataFrame(X_test_sel, columns=cols))

def select_by_pearson(X_train, y_train, X_test, top_n=5):
    """Pearson filter: keep the *top_n* features with the largest absolute
    linear correlation with the target."""
    corr = [abs(X_train[col].corr(y_train)) for col in X_train.columns]
    top_cols = X_train.columns[np.argsort(corr)[-top_n:]]
    return X_train[top_cols], X_test[top_cols]

def select_by_lasso(X_train_scaled, y_train, X_test_scaled):
    """L1 (Lasso-style) embedded selection: fit an L1-penalised logistic
    regression and keep the features whose coefficient is non-zero.

    Expects standardised inputs — L1 shrinkage is scale-sensitive.
    """
    lasso = LogisticRegression(
        penalty='l1', solver='liblinear', C=0.1, random_state=42
    )
    lasso.fit(X_train_scaled, y_train)
    top_cols = X_train_scaled.columns[lasso.coef_[0] != 0]
    return X_train_scaled[top_cols], X_test_scaled[top_cols]

def select_by_tree_importance(X_train, y_train, X_test, top_n=5):
    """Embedded selection via random-forest feature importances: keep the
    *top_n* features with the highest split-contribution importance."""
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    top_cols = X_train.columns[np.argsort(rf.feature_importances_)[-top_n:]]
    return X_train[top_cols], X_test[top_cols]

def select_by_shap(X_train, y_train, X_test, top_n=5):
    """SHAP-based selection: rank features by mean |SHAP value| on the
    positive class and keep the *top_n* highest."""
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    explainer = shap.TreeExplainer(rf)
    shap_values = explainer.shap_values(X_train)
    # NOTE(review): assumes shap_values is a per-class list (index 1 = the
    # positive class). Newer shap releases may return a single 3-D array
    # instead — confirm against the installed shap version.
    shap_importance = np.mean(np.abs(shap_values[1]), axis=0)
    top_cols = X_train.columns[np.argsort(shap_importance)[-top_n:]]
    return X_train[top_cols], X_test[top_cols]

def select_by_rfe(X_train, y_train, X_test, n_features=5):
    """Wrapper selection via recursive feature elimination: repeatedly drop
    the weakest feature until *n_features* remain."""
    lr = LogisticRegression(max_iter=1000, random_state=42)
    rfe = RFE(estimator=lr, n_features_to_select=n_features)
    rfe.fit(X_train, y_train)
    top_cols = X_train.columns[rfe.get_support()]
    return X_train[top_cols], X_test[top_cols]
# ---------------------- 5. Model evaluation ----------------------
# Models to compare on each reduced feature set.
models = {
    '逻辑回归': LogisticRegression(max_iter=1000, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42)
}
# Feature-selection methods to compare.
methods = {
    '方差筛选': select_by_variance,
    '皮尔逊相关': select_by_pearson,
    'Lasso': select_by_lasso,
    '树模型重要性': select_by_tree_importance,
    'SHAP重要性': select_by_shap,
    'RFE': select_by_rfe
}

# {method -> {model -> {metric -> value}}}
all_results = {}
for method_name, method_func in methods.items():
    # Dispatch with the argument list each selector actually expects:
    # Lasso needs the standardised frames, and the variance filter takes
    # no target (passing it the generic (X_train, y_train, X_test) would
    # silently feed y_train in as X_test).
    if method_name == 'Lasso':
        X_train_sel, X_test_sel = method_func(X_train_scaled, y_train, X_test_scaled)
    elif method_name == '方差筛选':
        X_train_sel, X_test_sel = method_func(X_train, X_test)
    else:
        X_train_sel, X_test_sel = method_func(X_train, y_train, X_test)

    # Fit and score every model on the reduced feature set.
    method_res = {}
    for model_name, model in models.items():
        model.fit(X_train_sel, y_train)
        y_pred = model.predict(X_test_sel)
        acc = accuracy_score(y_test, y_pred)
        f1 = classification_report(
            y_test, y_pred, output_dict=True
        )['weighted avg']['f1-score']
        method_res[model_name] = {'准确率': round(acc, 4), 'F1值': round(f1, 4)}
        print(f'{method_name}-{model_name}:准确率{acc:.4f},F1{f1:.4f}')
    all_results[method_name] = method_res
    print('-' * 60)

# ---------------------- 6. Visualise results ----------------------
def plot_accuracy_compare(all_results):
    """Grouped bar chart of test accuracy per selection method and model."""
    plt.figure(figsize=(12, 6))
    x = np.arange(len(all_results))
    width = 0.35
    # One accuracy series per model, in method order.
    lr_acc = [all_results[m]['逻辑回归']['准确率'] for m in all_results]
    xgb_acc = [all_results[m]['XGBoost']['准确率'] for m in all_results]
    plt.bar(x - width / 2, lr_acc, width, label='逻辑回归', color='#1f77b4')
    plt.bar(x + width / 2, xgb_acc, width, label='XGBoost', color='#ff7f0e')
    plt.xticks(x, all_results.keys(), rotation=45)
    plt.ylabel('准确率', fontsize=12)
    plt.title('不同特征筛选方法的模型准确率对比', fontsize=15)
    plt.legend()
    plt.tight_layout()
    plt.show()

plot_accuracy_compare(all_results)

需要专业的网站建设服务?

联系我们获取免费的网站建设咨询和方案报价,让我们帮助您实现业务目标

立即咨询