import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')  # keep the notebook output free of library warnings

Data Cleaning

# Load the data
data = pd.read_csv('C:/Users/y1735/Desktop/heart.csv', encoding='gbk')

The dataset contains the following 14 features; a quick structural check follows the list:

1. age - age in years
2. sex - sex
3. cp - chest pain type
4. trestbps - resting blood pressure
5. chol - serum cholesterol
6. fbs - fasting blood sugar
7. restecg - resting electrocardiographic results
8. thalach - maximum heart rate
9. exang - exercise-induced angina
10. oldpeak - ST depression induced by exercise
11. slope - slope of the peak exercise ST segment
12. ca - number of major vessels
13. thal - thalassemia
14. target - target variable (presence of heart disease)
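
A quick structural check (a minimal sketch) confirms that the columns match this list and shows how balanced the label is:

print("Shape:", data.shape)                # rows x columns
print(data.dtypes)                         # all 14 columns should be numeric
print(data['target'].value_counts())       # class balance of the label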

Check for missing values

print("缺失值检查:")
print(data.isnull().sum())
缺失值检查:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
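
No column has missing values, so no imputation is needed here. If gaps did appear (for example after merging in new records), a median fill is one simple option; the sketch below is hypothetical and a no-op on this data:

# Hypothetical handling: median fill is a robust default for skewed
# clinical measurements such as chol or oldpeak (no-op on this dataset)
data = data.fillna(data.median(numeric_only=True))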

Remove duplicate rows

data = data.drop_duplicates()
print("\n去重后数据形状:", data.shape)
去重后数据形状: (302, 14)
# Check for outliers via summary statistics
print("\nData description:")
print(data.describe())
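
describe() only surfaces summary statistics; a common follow-up is the 1.5 x IQR rule to count how many values fall outside the usual range in each column. A sketch (the 1.5 multiplier is a convention, not part of the original analysis):

# Flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for each column
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
outliers = (data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))
print(outliers.sum())  # count of flagged values per column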

Data Preprocessing

X = data.drop('target', axis=1)
y = data['target']
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
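
Tree ensembles are insensitive to monotonic feature scaling, so the StandardScaler step mainly matters if linear or distance-based models are added later. Because the split above is not stratified, it can also help to verify the class proportions; a stratified variant is sketched below (not what produced the results in this notebook):

# Stratified variant: preserves the target's class proportions in both splits
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print(y_tr.value_counts(normalize=True))
print(y_te.value_counts(normalize=True))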

XGBoost Model

# Initialize the XGBoost model
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

# Train the model
xgb_model.fit(X_train_scaled, y_train)

# Predict on the test set
xgb_pred = xgb_model.predict(X_test_scaled)

Evaluation Metrics

xgb_accuracy = accuracy_score(y_test, xgb_pred)
xgb_precision = precision_score(y_test, xgb_pred)
xgb_recall = recall_score(y_test, xgb_pred)
xgb_f1 = f1_score(y_test, xgb_pred)

# Print the results
print("\nXGBoost model performance:")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1 score: {xgb_f1:.4f}")
XGBoost model performance:
Accuracy: 0.8242
Precision: 0.7647
Recall: 0.9070
F1 score: 0.8298
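
Accuracy, precision, recall, and F1 all depend on the default 0.5 decision threshold; ROC AUC summarizes ranking quality across all thresholds. A sketch using the fitted model (roc_auc_score comes from sklearn.metrics and is not imported above):

from sklearn.metrics import roc_auc_score

# Probability of the positive class (heart disease present)
xgb_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]
print(f"ROC AUC: {roc_auc_score(y_test, xgb_proba):.4f}")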

Confusion Matrix

xgb_cm = confusion_matrix(y_test, xgb_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(xgb_cm, annot=True, fmt='d', cmap='Blues')
plt.title('XGBoost Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

Feature Importance

xgb_feature_imp = pd.Series(xgb_model.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=xgb_feature_imp, y=xgb_feature_imp.index)
plt.title('XGBoost Feature Importance')
plt.show()

cp contributes the most to the XGBoost model.

AdaBoost Model

# Initialize the AdaBoost model
# (scikit-learn >= 1.2 uses `estimator`; the old `base_estimator` keyword was removed)
ada_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    random_state=42
)

# Train the model
ada_model.fit(X_train_scaled, y_train)

# Predict on the test set
ada_pred = ada_model.predict(X_test_scaled)

Evaluation Metrics

ada_accuracy = accuracy_score(y_test, ada_pred)
ada_precision = precision_score(y_test, ada_pred)
ada_recall = recall_score(y_test, ada_pred)
ada_f1 = f1_score(y_test, ada_pred)

# Print the results
print("\nAdaBoost model performance:")
print(f"Accuracy: {ada_accuracy:.4f}")
print(f"Precision: {ada_precision:.4f}")
print(f"Recall: {ada_recall:.4f}")
print(f"F1 score: {ada_f1:.4f}")
AdaBoost model performance:
Accuracy: 0.8462
Precision: 0.8085
Recall: 0.8837
F1 score: 0.8444
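
The AdaBoost results above use the library defaults (50 stumps, learning rate 1.0). A small grid search over n_estimators and learning_rate is a natural next step; the grid values below are illustrative, not tuned:

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.5, 1.0],
}
grid = GridSearchCV(
    AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), random_state=42),
    param_grid, scoring='f1', cv=5
)
grid.fit(X_train_scaled, y_train)
print("Best parameters:", grid.best_params_)
print(f"Best CV F1: {grid.best_score_:.4f}")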

Confusion Matrix

ada_cm = confusion_matrix(y_test, ada_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(ada_cm, annot=True, fmt='d', cmap='Greens')
plt.title('AdaBoost Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

Feature Importance

ada_feature_imp = pd.Series(ada_model.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=ada_feature_imp, y=ada_feature_imp.index)
plt.title('AdaBoost Feature Importance')
plt.show()

chol contributes the most to the AdaBoost model.

Bagging Model

# Initialize the Bagging model
# (scikit-learn >= 1.2 uses `estimator`; the old `base_estimator` keyword was removed)
bag_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=42
)

# Train the model
bag_model.fit(X_train_scaled, y_train)

# Predict on the test set
bag_pred = bag_model.predict(X_test_scaled)

Evaluation Metrics

bag_accuracy = accuracy_score(y_test, bag_pred)
bag_precision = precision_score(y_test, bag_pred)
bag_recall = recall_score(y_test, bag_pred)
bag_f1 = f1_score(y_test, bag_pred)

# Print the results
print("\nBagging model performance:")
print(f"Accuracy: {bag_accuracy:.4f}")
print(f"Precision: {bag_precision:.4f}")
print(f"Recall: {bag_recall:.4f}")
print(f"F1 score: {bag_f1:.4f}")
Bagging model performance:
Accuracy: 0.7802
Precision: 0.7255
Recall: 0.8605
F1 score: 0.7872
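
Bagging also offers an out-of-bag estimate: every tree can be scored on the rows left out of its bootstrap sample, without touching the test set. A sketch (n_estimators=100 is an illustrative choice, larger than the default of 10):

# Out-of-bag estimate of generalization accuracy
bag_oob = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    oob_score=True,
    random_state=42
)
bag_oob.fit(X_train_scaled, y_train)
print(f"OOB accuracy: {bag_oob.oob_score_:.4f}")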

Confusion Matrix

bag_cm = confusion_matrix(y_test, bag_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(bag_cm, annot=True, fmt='d', cmap='Oranges')
plt.title('Bagging Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

Random Forest Model

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train_scaled, y_train)

# Predict on the test set
rf_pred = rf_model.predict(X_test_scaled)

Evaluation Metrics

rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred)

# Print the results
print("\nRandom Forest model performance:")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1 score: {rf_f1:.4f}")
Random Forest model performance:
Accuracy: 0.7473
Precision: 0.7174
Recall: 0.7674
F1 score: 0.7416
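
With only about 300 rows, a single 70/30 split gives fairly noisy estimates; cross-validation over the full data gives a steadier picture. A sketch (trees do not require scaling, so the raw features are used here):

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated F1 for the random forest
cv_f1 = cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=5, scoring='f1')
print(f"CV F1: {cv_f1.mean():.4f} +/- {cv_f1.std():.4f}")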

Confusion Matrix

rf_cm = confusion_matrix(y_test, rf_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(rf_cm, annot=True, fmt='d', cmap='Reds')
plt.title('Random Forest Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

Feature Importance

rf_feature_imp = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=rf_feature_imp, y=rf_feature_imp.index)
plt.title('Random Forest Feature Importance')
plt.show()

thalach contributes the most to the Random Forest model.
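
The three models that expose feature_importances_ disagree on the top feature (cp, chol, thalach respectively), so it helps to line their importance vectors up side by side. A sketch reusing the Series computed above:

# Align the three importance Series on the feature index for comparison
importance_df = pd.concat(
    {'XGBoost': xgb_feature_imp, 'AdaBoost': ada_feature_imp, 'Random Forest': rf_feature_imp},
    axis=1
)
print(importance_df.round(3).sort_values('XGBoost', ascending=False))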

Model Comparison

# Summarize the results
results = {
    'Model': ['XGBoost', 'AdaBoost', 'Bagging', 'Random Forest'],
    'Accuracy': [xgb_accuracy, ada_accuracy, bag_accuracy, rf_accuracy],
    'Precision': [xgb_precision, ada_precision, bag_precision, rf_precision],
    'Recall': [xgb_recall, ada_recall, bag_recall, rf_recall],
    'F1 Score': [xgb_f1, ada_f1, bag_f1, rf_f1]
}

results_df = pd.DataFrame(results)
print("\n模型性能比较:")
print(results_df)
模型性能比较:
           Model  Accuracy  Precision    Recall  F1 Score
0        XGBoost  0.824176   0.764706  0.906977  0.829787
1       AdaBoost  0.846154   0.808511  0.883721  0.844444
2        Bagging  0.780220   0.725490  0.860465  0.787234
3  Random Forest  0.747253   0.717391  0.767442  0.741573

Visual Comparison

plt.figure(figsize=(12, 8))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

for i, metric in enumerate(metrics, 1):
    plt.subplot(2, 2, i)
    sns.barplot(x='Model', y=metric, data=results_df)
    plt.title(metric)
    plt.ylim(0, 1.1)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

Best Model Selection

Select the best model based on the F1 score, which balances precision and recall.

best_model = results_df.loc[results_df['F1 Score'].idxmax()]
print("\n最佳模型:")
print(best_model)
最佳模型:
Model        AdaBoost
Accuracy     0.846154
Precision    0.808511
Recall       0.883721
F1 Score     0.844444
Name: 1, dtype: object
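
If AdaBoost is kept as the final model, it can be refit and saved together with the scaler for later use. A minimal sketch with joblib (the filename is arbitrary):

import joblib

# Refit the chosen configuration and persist model + scaler together
final_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1), random_state=42
)
final_model.fit(X_train_scaled, y_train)
joblib.dump({'model': final_model, 'scaler': scaler}, 'heart_adaboost.joblib')  # arbitrary filename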