基于 KMeans 聚类的特征重要性分析:使用 SHAP 和随机森林
本文演示了如何结合 KMeans 聚类、随机森林分类器和 SHAP 值,对心脏病数据集进行特征重要性分析。核心流程包括数据预处理、聚类评估、特征重要性排序和可视化。
1. 数据准备
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
# Suppress every warning category for the rest of the script.
warnings.filterwarnings("ignore")

# Matplotlib font setup: SimHei is requested so the Chinese titles and axis
# labels used below can be rendered; 'axes.unicode_minus' is switched off
# alongside it (presumably because the font lacks the Unicode minus glyph).
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the raw dataset from the working directory.
df = pd.read_csv('heart.csv')

# Impute missing values column by column with that column's median.
# Only columns that actually contain at least one NaN are touched.
has_missing = df.isnull().any()
missing_features = df.columns[has_missing].tolist()
for feat in missing_features:
    df[feat] = df[feat].fillna(df[feat].median())

# Separate the label column ('target') from the predictor columns.
target = df['target']
features = df.drop(columns='target')

# Hold out 20% of the rows as a test set; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)
2. 数据标准化与聚类评估
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
# Standardize the feature matrix before clustering (StandardScaler with its
# default settings), so all columns are on a comparable scale.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(features)

# Sweep k = 2..10 and record four clustering-quality metrics per k:
#   inertia_list - within-cluster sum of squares reported by KMeans
#   sil_list     - silhouette score
#   ch_list      - Calinski-Harabasz score
#   db_list      - Davies-Bouldin score
k_values = range(2, 11)
inertia_list = []
sil_list = []
ch_list = []
db_list = []
for k in k_values:
    km = KMeans(n_clusters=k, random_state=42)
    labels = km.fit_predict(scaled_data)

    # Each metric is computed exactly once and stored; the report line
    # below reads the stored values back. In particular the silhouette
    # score (pairwise-distance based, the most expensive of the four) is
    # no longer recomputed just for printing.
    inertia_list.append(km.inertia_)
    sil_list.append(silhouette_score(scaled_data, labels))
    ch_list.append(calinski_harabasz_score(scaled_data, labels))
    db_list.append(davies_bouldin_score(scaled_data, labels))

    print(f"k={k}, 惯性:{inertia_list[-1]:.2f}, 轮廓:{sil_list[-1]:.3f}, CH:{ch_list[-1]:.2f}, DB:{db_list[-1]:.3f}")
# Fix the number of clusters at 3 (in practice, pick it from the elbow
# method or the metrics collected during the k sweep).
final_k = 3
kmeans = KMeans(n_clusters=final_k, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_data)

# Attach the assignment to the feature table as a new 'Cluster' column.
features['Cluster'] = cluster_labels

# Project the standardized data onto its first two principal components
# purely for a 2-D visualization of the clusters.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)
pc1 = pca_data[:, 0]
pc2 = pca_data[:, 1]

plt.figure(figsize=(6, 5))
sns.scatterplot(x=pc1, y=pc2, hue=cluster_labels, palette='viridis')
plt.title(f'KMeans 聚类结果 (k={final_k})')
plt.xlabel('主成分1')
plt.ylabel('主成分2')
plt.show()

# Report how many samples landed in each cluster.
cluster_sizes = features['Cluster'].value_counts()
print("各簇样本数:")
print(cluster_sizes)
3. 使用随机森林和 SHAP 分析特征重要性
首先以聚类标签作为目标变量训练随机森林分类器,使其学习"哪些特征决定了样本属于哪个簇";然后用 SHAP 的 TreeExplainer 解释该模型,计算各特征对簇划分的贡献(即特征重要性)。
import shap
from sklearn.ensemble import RandomForestClassifier
# Build a supervised problem out of the clustering result: the predictors
# are the original feature columns (cluster column removed) and the label
# is the cluster id assigned by KMeans.
X_cluster = features.drop('Cluster', axis=1)
y_cluster = features['Cluster']

# Fit a random forest that predicts the cluster id from the features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_cluster, y_cluster)

# Explain the forest with SHAP's tree explainer.
explainer = shap.TreeExplainer(rf_model)
shap_vals = explainer.shap_values(X_cluster)

# Inspect the raw layout of the SHAP output.
shap_vals_array = np.array(shap_vals)
print(f"SHAP 值形状: {shap_vals_array.shape}")

# Select the SHAP matrix (n_samples, n_features) of the FIRST class.
# The multi-class return layout depends on the installed SHAP version:
#   - legacy releases: a list with one (n_samples, n_features) array per class
#   - newer releases:  one array of shape (n_samples, n_features, n_classes)
# Indexing with [0] is only correct for the legacy list; on the 3-D array it
# would select the first *sample* instead of the first class.
if isinstance(shap_vals, list):
    first_class_shap = shap_vals[0]
elif shap_vals_array.ndim == 3:
    first_class_shap = shap_vals_array[:, :, 0]
else:
    # Already a single (n_samples, n_features) matrix.
    first_class_shap = shap_vals_array

# Bar chart of feature importance for the first class. show=False keeps the
# figure open so the title can be added before it is displayed.
shap.summary_plot(first_class_shap, X_cluster, plot_type="bar", show=False)
plt.title("SHAP 特征重要性(条形图)")
plt.show()
4. 选定特征分布分析
选取年龄、性别、胸痛类型、静息血压四个特征进行分布分析。
# Features inspected in detail: age, sex, chest-pain type (cp) and resting
# blood pressure (trestbps).
selected_feats = ['age', 'sex', 'cp', 'trestbps']

# Heuristic type check: a feature with fewer than 10 distinct values is
# reported as discrete, otherwise as continuous.
for feat in selected_feats:
    unique_cnt = df[feat].nunique()
    if unique_cnt < 10:
        feat_type = "离散型"
    else:
        feat_type = "连续型"
    print(f"{feat}: {unique_cnt} 个唯一值 -> {feat_type}")

# Histograms over the whole dataset, one panel per feature on a 2x2 grid
# filled row by row.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
for idx, feat in enumerate(selected_feats):
    row, col = divmod(idx, 2)
    ax = axes[row, col]
    ax.hist(df[feat], bins=20)
    ax.set_title(f'{feat} 分布')
    ax.set_xlabel(feat)
    ax.set_ylabel('频数')
plt.tight_layout()
plt.show()
5. 按簇分析特征分布
# Split the data by cluster and draw, for each cluster, the same 2x2
# histogram grid of the selected features.
# The cluster ids are sorted so the figures always appear in ascending
# cluster order; unique() alone returns ids in order of first appearance,
# which depends on how the rows happen to be ordered.
clusters = np.sort(features['Cluster'].unique())
for cluster in clusters:
    # Rows assigned to the current cluster.
    subset = features[features['Cluster'] == cluster]

    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    for idx, feat in enumerate(selected_feats):
        # Map the flat feature index onto the (row, column) of the grid.
        ax = axes[idx // 2, idx % 2]
        ax.hist(subset[feat], bins=20)
        ax.set_title(f'簇 {cluster} - {feat} 分布')
        ax.set_xlabel(feat)
        ax.set_ylabel('频数')
    plt.tight_layout()
    plt.show()
通过以上步骤,我们将数据划分为 3 个簇,利用随机森林和 SHAP 识别出对簇划分贡献最大的特征,并进一步观察了选定特征(年龄、性别、胸痛类型、静息血压)在不同簇中的分布差异。这有助于理解数据的内部结构。