import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Load the data
data = pd.read_csv('C:/Users/y1735/Desktop/heart.csv',encoding='gbk')
The dataset contains the following 14 features:
1. age - age in years
2. sex - sex
3. cp - chest pain type
4. trestbps - resting blood pressure
5. chol - serum cholesterol
6. fbs - fasting blood sugar
7. restecg - resting electrocardiographic results
8. thalach - maximum heart rate achieved
9. exang - exercise-induced angina
10. oldpeak - ST depression induced by exercise
11. slope - slope of the peak exercise ST segment
12. ca - number of major vessels
13. thal - thalassemia
14. target - target variable (presence of heart disease)
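As a quick sanity check (not part of the original workflow), the column types and the class balance of target can be inspected before modeling; the snippet assumes the column layout listed above.
# Optional sanity check: column types and target class balance
print(data.dtypes)
print(data['target'].value_counts(normalize=True))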
print("缺失值检查:")
print(data.isnull().sum())
Missing value check:
age 0
sex 0
cp 0
trestbps 0
chol 0
fbs 0
restecg 0
thalach 0
exang 0
oldpeak 0
slope 0
ca 0
thal 0
target 0
dtype: int64
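No missing values are present here. If a different export of the CSV did contain gaps, median imputation is one conventional fix; the sketch below is illustrative only and is not used in the analysis that follows.
# Sketch only: median imputation for a hypothetical version of the data with gaps
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)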
data = data.drop_duplicates()
print("\n去重后数据形状:", data.shape)
Shape after removing duplicates: (302, 14)
# Check for outliers
print("\nData description:")
print(data.describe())
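describe() only surfaces the ranges; a simple 1.5×IQR rule, sketched below purely as an illustration (no rows are dropped in this analysis), counts how many values per column fall outside the whiskers.
# Illustration only: count values outside 1.5*IQR per column (rows are not removed)
q1, q3 = data.quantile(0.25), data.quantile(0.75)
iqr = q3 - q1
outlier_mask = (data < q1 - 1.5 * iqr) | (data > q3 + 1.5 * iqr)
print(outlier_mask.sum())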
X = data.drop('target', axis=1)
y = data['target']
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
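One optional refinement, not required for the results below: the tree ensembles used here are insensitive to feature scaling, but wrapping the scaled arrays back into DataFrames keeps the column names attached, which helps when indexing feature importances. The `_df` variable names are illustrative.
# Optional: keep feature names attached to the scaled data (tree ensembles do not need scaling)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)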
# Initialize the XGBoost model
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
# Train the model
xgb_model.fit(X_train_scaled, y_train)
# Predict on the test set
xgb_pred = xgb_model.predict(X_test_scaled)
xgb_accuracy = accuracy_score(y_test, xgb_pred)
xgb_precision = precision_score(y_test, xgb_pred)
xgb_recall = recall_score(y_test, xgb_pred)
xgb_f1 = f1_score(y_test, xgb_pred)
# Print the results
print("\nXGBoost model performance:")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1 score: {xgb_f1:.4f}")
XGBoost model performance:
Accuracy: 0.8242
Precision: 0.7647
Recall: 0.9070
F1 score: 0.8298
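A single 70/30 split of 302 rows leaves only 91 test samples, so the scores above carry some noise. As a supplementary check (not part of the reported results), 5-fold cross-validation on the full data gives a steadier F1 estimate for XGBoost; trees do not require scaling, so the raw features are used here.
# Supplementary sketch: 5-fold cross-validated F1 for XGBoost
from sklearn.model_selection import cross_val_score
cv_f1 = cross_val_score(XGBClassifier(random_state=42, eval_metric='logloss'), X, y, cv=5, scoring='f1')
print(f"CV F1: {cv_f1.mean():.4f} +/- {cv_f1.std():.4f}")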
xgb_cm = confusion_matrix(y_test, xgb_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(xgb_cm, annot=True, fmt='d', cmap='Blues')
plt.title('XGBoost Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
xgb_feature_imp = pd.Series(xgb_model.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=xgb_feature_imp, y=xgb_feature_imp.index)
plt.title('XGBoost Feature Importances')
plt.show()
cp (chest pain type) contributes the most to the XGBoost model.
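The bar chart ranks the features visually; printing the head of xgb_feature_imp reports the same ranking numerically.
# Print the top five XGBoost importances numerically
print(xgb_feature_imp.head(5))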
# Initialize the AdaBoost model with a decision stump as the base learner
# (the parameter is `estimator` in scikit-learn >= 1.2; older versions use `base_estimator`)
ada_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    random_state=42
)
# Train the model
ada_model.fit(X_train_scaled, y_train)
# Predict on the test set
ada_pred = ada_model.predict(X_test_scaled)
ada_accuracy = accuracy_score(y_test, ada_pred)
ada_precision = precision_score(y_test, ada_pred)
ada_recall = recall_score(y_test, ada_pred)
ada_f1 = f1_score(y_test, ada_pred)
# Print the results
print("\nAdaBoost model performance:")
print(f"Accuracy: {ada_accuracy:.4f}")
print(f"Precision: {ada_precision:.4f}")
print(f"Recall: {ada_recall:.4f}")
print(f"F1 score: {ada_f1:.4f}")
AdaBoost model performance:
Accuracy: 0.8462
Precision: 0.8085
Recall: 0.8837
F1 score: 0.8444
ada_cm = confusion_matrix(y_test, ada_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(ada_cm, annot=True, fmt='d', cmap='Greens')
plt.title('AdaBoost Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
ada_feature_imp = pd.Series(ada_model.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=ada_feature_imp, y=ada_feature_imp.index)
plt.title('AdaBoost Feature Importances')
plt.show()
chol (serum cholesterol) contributes the most to the AdaBoost model.
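AdaBoost runs with its default n_estimators and learning_rate above; a small grid search is the usual way to tune them. The grid values below are illustrative rather than tuned results, and the snippet assumes scikit-learn >= 1.2 for the `estimator` argument.
# Possible follow-up: tune AdaBoost with a small grid search (values are illustrative)
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.5, 1.0]}
grid = GridSearchCV(AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), random_state=42),
                    param_grid, cv=5, scoring='f1')
grid.fit(X_train_scaled, y_train)
print(grid.best_params_, grid.best_score_)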
# Initialize the Bagging model
# (the parameter is `estimator` in scikit-learn >= 1.2; older versions use `base_estimator`)
bag_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=42
)
# Train the model
bag_model.fit(X_train_scaled, y_train)
# Predict on the test set
bag_pred = bag_model.predict(X_test_scaled)
bag_accuracy = accuracy_score(y_test, bag_pred)
bag_precision = precision_score(y_test, bag_pred)
bag_recall = recall_score(y_test, bag_pred)
bag_f1 = f1_score(y_test, bag_pred)
# Print the results
print("\nBagging model performance:")
print(f"Accuracy: {bag_accuracy:.4f}")
print(f"Precision: {bag_precision:.4f}")
print(f"Recall: {bag_recall:.4f}")
print(f"F1 score: {bag_f1:.4f}")
Bagging model performance:
Accuracy: 0.7802
Precision: 0.7255
Recall: 0.8605
F1 score: 0.7872
bag_cm = confusion_matrix(y_test, bag_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(bag_cm, annot=True, fmt='d', cmap='Oranges')
plt.title('Bagging Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
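Bagging can also report an out-of-bag estimate when oob_score=True, which approximates held-out accuracy without touching the test split. The sketch below is an optional check, not the model scored above; n_estimators is raised to 100 so that every training row is likely to receive an out-of-bag prediction.
# Optional check: out-of-bag accuracy for a Bagging ensemble
bag_oob = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100,
                            oob_score=True, random_state=42)
bag_oob.fit(X_train_scaled, y_train)
print(f"OOB accuracy: {bag_oob.oob_score_:.4f}")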
# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
# Train the model
rf_model.fit(X_train_scaled, y_train)
# Predict on the test set
rf_pred = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred)
# Print the results
print("\nRandom Forest model performance:")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1 score: {rf_f1:.4f}")
Random Forest model performance:
Accuracy: 0.7473
Precision: 0.7174
Recall: 0.7674
F1 score: 0.7416
rf_cm = confusion_matrix(y_test, rf_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(rf_cm, annot=True, fmt='d', cmap='Reds')
plt.title('Random Forest Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
rf_feature_imp = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=rf_feature_imp, y=rf_feature_imp.index)
plt.title('Random Forest Feature Importances')
plt.show()
thalach (maximum heart rate) contributes the most to the Random Forest model.
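Impurity-based importances can overstate features with many possible split points; permutation importance on the test split is a common cross-check. The sketch below is optional and is not used for the conclusion above.
# Optional cross-check: permutation importance of the Random Forest on the test split
from sklearn.inspection import permutation_importance
perm = permutation_importance(rf_model, X_test_scaled, y_test, n_repeats=10, random_state=42)
perm_imp = pd.Series(perm.importances_mean, index=X.columns).sort_values(ascending=False)
print(perm_imp.head(5))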
# Summarize the results
results = {
    'Model': ['XGBoost', 'AdaBoost', 'Bagging', 'Random Forest'],
    'Accuracy': [xgb_accuracy, ada_accuracy, bag_accuracy, rf_accuracy],
    'Precision': [xgb_precision, ada_precision, bag_precision, rf_precision],
    'Recall': [xgb_recall, ada_recall, bag_recall, rf_recall],
    'F1 Score': [xgb_f1, ada_f1, bag_f1, rf_f1]
}
results_df = pd.DataFrame(results)
print("\n模型性能比较:")
print(results_df)
Model performance comparison:
Model Accuracy Precision Recall F1 Score
0 XGBoost 0.824176 0.764706 0.906977 0.829787
1 AdaBoost 0.846154 0.808511 0.883721 0.844444
2 Bagging 0.780220 0.725490 0.860465 0.787234
3 Random Forest 0.747253 0.717391 0.767442 0.741573
plt.figure(figsize=(12, 8))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
for i, metric in enumerate(metrics, 1):
    plt.subplot(2, 2, i)
    sns.barplot(x='Model', y=metric, data=results_df)
    plt.title(metric)
    plt.ylim(0, 1.1)
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
The best model is selected by F1 score, which balances precision and recall:
best_model = results_df.loc[results_df['F1 Score'].idxmax()]
print("\n最佳模型:")
print(best_model)
Best model:
Model AdaBoost
Accuracy 0.846154
Precision 0.808511
Recall 0.883721
F1 Score 0.844444
Name: 1, dtype: object
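If the selected AdaBoost model is to be reused outside this notebook, it can be persisted with joblib; the filename below is illustrative.
# Persist the chosen model for later reuse (filename is illustrative)
import joblib
joblib.dump(ada_model, 'ada_heart_model.joblib')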