Heart Disease Classification Models¶

Using machine learning classification models to predict heart disease

Data loading and summary¶

In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Location of the heart-disease CSV. Set the HEART_DATA_PATH environment
# variable to point at your own copy; when it is unset, the original
# author's local path is used so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "HEART_DATA_PATH",
    "/Users/nnthieu/Downloads/archive/heart_final.csv",
)

# Load the dataset into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()
Out[ ]:
age sex chest pain type resting bp s cholesterol fasting blood sugar resting ecg max heart rate exercise angina oldpeak ST slope target
0 40 1 2 140 289 0 0 172 0 0.0 1 0
1 49 0 3 160 180 0 0 156 0 1.0 2 1
2 37 1 2 130 283 0 1 98 0 0.0 1 0
3 48 0 4 138 214 0 0 108 1 1.5 2 1
4 54 1 3 150 195 0 0 122 0 0.0 1 0
In [ ]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape
Out[ ]:
(1190, 12)
In [ ]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()
Out[ ]:
age                    0
sex                    0
chest pain type        0
resting bp s           0
cholesterol            0
fasting blood sugar    0
resting ecg            0
max heart rate         0
exercise angina        0
oldpeak                0
ST slope               0
target                 0
dtype: int64
In [ ]:
# Print each column's non-null count and dtype, plus the frame's memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  target               1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB
In [ ]:
# Build an old-label -> new-label lookup in which every space is stripped
# from the column name (e.g. 'chest pain type' -> 'chestpaintype').
mapping = dict(zip(df.columns, (label.replace(' ', '') for label in df.columns)))

# Apply the lookup to the column axis; this mutates df itself.
df.rename(mapping, axis='columns', inplace=True)

# Preview the first rows to confirm the new labels.
df.head()
Out[ ]:
age sex chestpaintype restingbps cholesterol fastingbloodsugar restingecg maxheartrate exerciseangina oldpeak STslope target
0 40 1 2 140 289 0 0 172 0 0.0 1 0
1 49 0 3 160 180 0 0 156 0 1.0 2 1
2 37 1 2 130 283 0 1 98 0 0.0 1 0
3 48 0 4 138 214 0 0 108 1 1.5 2 1
4 54 1 3 150 195 0 0 122 0 0.0 1 0

Building models¶

Decision tree model

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Separate the label column from the predictor columns.
y = df['target']
x = df.drop(columns='target')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable. These four names are reused by the later model cells.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Fit a decision tree (default hyperparameters, fixed seed) on the training part.
model = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# Score the fitted tree on the held-out rows.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Emit the three report lines in one call, one item per line.
print(f"Accuracy: {accuracy:.2f}", "Classification Report:", report, sep="\n")
Accuracy: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       154
           1       0.92      0.86      0.89       203

    accuracy                           0.87       357
   macro avg       0.87      0.88      0.87       357
weighted avg       0.88      0.87      0.87       357

KNN model

In [ ]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier on the same training split.
# NOTE(review): the features are passed in unscaled; k-NN is distance-based,
# so wide-range columns may dominate the distance -- consider standardising
# the features before fitting and re-running this cell.
model2 = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)

# Score the fitted classifier on the held-out rows.
y_pred = model2.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Emit the three report lines in one call, one item per line.
print(f"Accuracy: {accuracy:.2f}", "Classification Report:", report, sep="\n")
Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.73      0.71       154
           1       0.79      0.76      0.77       203

    accuracy                           0.75       357
   macro avg       0.74      0.74      0.74       357
weighted avg       0.75      0.75      0.75       357

XGB model

In [ ]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


# Fit a gradient-boosted classifier with default hyperparameters.
# Note: this rebinds `model`, replacing the decision tree fitted earlier;
# the grid-search cell below uses this XGB instance as its estimator.
model = XGBClassifier(random_state=42)
model.fit(x_train, y_train)

# Score the fitted booster on the held-out rows.
y_pred = model.predict(x_test)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Emit the three report lines in one call, one item per line.
print(f"Accuracy: {accuracy:.2f}", "Classification Report:", report, sep="\n")
Accuracy: 0.91
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89       154
           1       0.91      0.93      0.92       203

    accuracy                           0.91       357
   macro avg       0.91      0.91      0.91       357
weighted avg       0.91      0.91      0.91       357

Tuning the XGB model to find the best hyperparameters

In [ ]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values: 6 x 5 x 3 = 90 combinations in total.
param_grid = dict(
    n_estimators=[50, 100, 150, 200, 250, 300],
    max_depth=[1, 3, 5, 7, 9],
    learning_rate=[0.1, 0.01, 0.001],
)

# Exhaustive search over the grid, ranking candidates by accuracy under
# 2-fold cross-validation on the training split only. `model` is the
# XGBClassifier instance created in the previous cell.
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=2)
grid_search.fit(x_train, y_train)

# Winning parameter combination and the estimator built with it.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Score the tuned estimator on the held-out rows.
y_pred = best_model.predict(x_test)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Best Parameters:", best_params)
# Emit the remaining three report lines in one call, one item per line.
print(f"Accuracy: {accuracy:.2f}", "Classification Report:", report, sep="\n")
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Accuracy: 0.93
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91       154
           1       0.93      0.95      0.94       203

    accuracy                           0.93       357
   macro avg       0.93      0.92      0.93       357
weighted avg       0.93      0.93      0.93       357

.