Heart Disease Classification Models¶
Using machine learning classification models to predict heart disease
Data loading and summary¶
In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df= pd.read_csv("/Users/nnthieu/Downloads/archive/heart_final.csv")
df.head()
Out[ ]:
age | sex | chest pain type | resting bp s | cholesterol | fasting blood sugar | resting ecg | max heart rate | exercise angina | oldpeak | ST slope | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 40 | 1 | 2 | 140 | 289 | 0 | 0 | 172 | 0 | 0.0 | 1 | 0 |
1 | 49 | 0 | 3 | 160 | 180 | 0 | 0 | 156 | 0 | 1.0 | 2 | 1 |
2 | 37 | 1 | 2 | 130 | 283 | 0 | 1 | 98 | 0 | 0.0 | 1 | 0 |
3 | 48 | 0 | 4 | 138 | 214 | 0 | 0 | 108 | 1 | 1.5 | 2 | 1 |
4 | 54 | 1 | 3 | 150 | 195 | 0 | 0 | 122 | 0 | 0.0 | 1 | 0 |
In [ ]:
df.shape
Out[ ]:
(1190, 12)
In [ ]:
df.isnull().sum()
Out[ ]:
age 0 sex 0 chest pain type 0 resting bp s 0 cholesterol 0 fasting blood sugar 0 resting ecg 0 max heart rate 0 exercise angina 0 oldpeak 0 ST slope 0 target 0 dtype: int64
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1190 entries, 0 to 1189 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1190 non-null int64 1 sex 1190 non-null int64 2 chest pain type 1190 non-null int64 3 resting bp s 1190 non-null int64 4 cholesterol 1190 non-null int64 5 fasting blood sugar 1190 non-null int64 6 resting ecg 1190 non-null int64 7 max heart rate 1190 non-null int64 8 exercise angina 1190 non-null int64 9 oldpeak 1190 non-null float64 10 ST slope 1190 non-null int64 11 target 1190 non-null int64 dtypes: float64(1), int64(11) memory usage: 111.7 KB
In [ ]:
# Create a mapping dictionary to remove spaces
mapping = {col: col.replace(' ', '') for col in df.columns}
# Rename columns using the mapping dictionary
df.rename(columns=mapping, inplace=True)
df.head()
Out[ ]:
age | sex | chestpaintype | restingbps | cholesterol | fastingbloodsugar | restingecg | maxheartrate | exerciseangina | oldpeak | STslope | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 40 | 1 | 2 | 140 | 289 | 0 | 0 | 172 | 0 | 0.0 | 1 | 0 |
1 | 49 | 0 | 3 | 160 | 180 | 0 | 0 | 156 | 0 | 1.0 | 2 | 1 |
2 | 37 | 1 | 2 | 130 | 283 | 0 | 1 | 98 | 0 | 0.0 | 1 | 0 |
3 | 48 | 0 | 4 | 138 | 214 | 0 | 0 | 108 | 1 | 1.5 | 2 | 1 |
4 | 54 | 1 | 3 | 150 | 195 | 0 | 0 | 122 | 0 | 0.0 | 1 | 0 |
Building models¶
Decision tree model
In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
x = df.drop('target', axis =1)
y = df['target']
x_train , x_test, y_train , y_test = train_test_split(x,y, test_size =0.3, random_state = 42)
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)
Accuracy: 0.87 Classification Report: precision recall f1-score support 0 0.83 0.90 0.86 154 1 0.92 0.86 0.89 203 accuracy 0.87 357 macro avg 0.87 0.88 0.87 357 weighted avg 0.88 0.87 0.87 357
KNN model
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
model2 = KNeighborsClassifier(n_neighbors=5)
model2.fit(x_train,y_train)
y_pred = model2.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)
Accuracy: 0.75 Classification Report: precision recall f1-score support 0 0.70 0.73 0.71 154 1 0.79 0.76 0.77 203 accuracy 0.75 357 macro avg 0.74 0.74 0.74 357 weighted avg 0.75 0.75 0.75 357
XGB model
In [ ]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
model = XGBClassifier(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)
Accuracy: 0.91 Classification Report: precision recall f1-score support 0 0.91 0.88 0.89 154 1 0.91 0.93 0.92 203 accuracy 0.91 357 macro avg 0.91 0.91 0.91 357 weighted avg 0.91 0.91 0.91 357
Turning models to find the best model
In [ ]:
from sklearn.model_selection import GridSearchCV
param_grid = {
'n_estimators': [50, 100, 150, 200, 250, 300],
'max_depth': [1,3,5,7,9],
'learning_rate': [0.1, 0.01, 0.001]
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=2)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Best Parameters:", best_params)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100} Accuracy: 0.93 Classification Report: precision recall f1-score support 0 0.93 0.90 0.91 154 1 0.93 0.95 0.94 203 accuracy 0.93 357 macro avg 0.93 0.92 0.93 357 weighted avg 0.93 0.93 0.93 357
.