# -*- coding:utf-8 -*-
# @author:Ye Zhoubing
# @datetime:2025/8/6 15:36
# @software: PyCharm
"""
Select an optimal feature subset with SHAP
"""
import shap
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# --------- 1. Load data ----------
train_df = pd.read_csv('train_2014.csv', parse_dates=['date'])
predict_df = pd.read_csv('predict_2025.csv', parse_dates=['date'])
train_df.set_index('date', inplace=True)
predict_df.set_index('date', inplace=True)

X_train_raw = train_df.iloc[:, :8].values
y_train_raw = train_df.iloc[:, 8].values.reshape(-1, 1)

# Raw data (not standardized)
X = pd.DataFrame(X_train_raw, columns=[f'var{i+1}' for i in range(X_train_raw.shape[1])])
y = y_train_raw.flatten()

# Train/validation split (optional)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an XGBoost regression model
model = xgb.XGBRegressor()
model.fit(X_train, y_train)
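# Optional: sanity-check the fitted model on the held-out split before
# interpreting it. A minimal sketch -- using R^2 from sklearn is an assumption
# here, not part of the original pipeline (X_val / y_val are otherwise unused).
from sklearn.metrics import r2_score
print("validation R^2:", r2_score(y_val, model.predict(X_val)))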
# Create the SHAP explainer
explainer = shap.Explainer(model)

# Compute SHAP values
shap_values = explainer(X_train)

# Visualize
shap.plots.beeswarm(shap_values)
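# To also keep a copy of the beeswarm on disk, re-render it with show=False so
# matplotlib can save the open figure -- a minimal sketch; the filename and dpi
# below are arbitrary choices, not from the original script.
import matplotlib.pyplot as plt
shap.plots.beeswarm(shap_values, show=False)
plt.savefig('shap_beeswarm.png', dpi=200, bbox_inches='tight')
plt.close()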
# Mean absolute SHAP value as the feature-importance metric
shap_importance = np.abs(shap_values.values).mean(axis=0)
importance_df = pd.DataFrame({
    'feature': X.columns,
    'shap_importance': shap_importance
}).sort_values(by='shap_importance', ascending=False)

# Inspect the top-ranked features
print(importance_df)

# Suppose we keep the 5 most important features
top_features = importance_df['feature'].head(5).tolist()
print("建议用于LSTM的特征子集:", top_features)

The beeswarm plot shows the distribution of SHAP values across all samples for every feature: feature names run along the y-axis and SHAP values along the x-axis. Each point's color encodes the feature's value (blue for low, red for high), and the spread of the points reflects both the direction and the strength of that feature's effect on the model output. Points concentrated on the right indicate a positive contribution to the prediction; points concentrated on the left indicate a negative one. The density and range of the points give a visual read on how important each feature is.
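If a single number per feature is easier to read than the beeswarm's point cloud, SHAP's bar plot summarizes the mean absolute SHAP value per feature, the same quantity the script computes into shap_importance. A minimal sketch, reusing the shap_values object from the script above:

# Bar chart of mean(|SHAP|) per feature; its ranking should match importance_df
shap.plots.bar(shap_values)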