import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the heart-disease table from a local CSV export and preview its first rows.
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
csv_path = "/Users/nnthieu/Downloads/archive/heart_final.csv"
df = pd.read_csv(csv_path)
df.head()

# (rows, columns) of the loaded frame; shown as the cell's output.
df.shape

(1190, 12)

# Count missing entries per column: build the boolean "is missing" mask,
# then sum it column-wise (True counts as 1).
missing_mask = df.isna()
missing_mask.sum()

age                    0
sex                    0
chest pain type        0
resting bp s           0
cholesterol            0
fasting blood sugar    0
resting ecg            0
max heart rate         0
exercise angina        0
oldpeak                0
ST slope               0
target                 0
dtype: int64

# Print the per-column non-null counts and dtypes plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  target               1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB

# Pair each current column label with the same label minus its spaces,
# e.g. "chest pain type" -> "chestpaintype".
mapping = dict(
    zip(df.columns, (label.replace(' ', '') for label in df.columns))
)

# Apply the new labels to the existing frame (mutates df in place).
df.rename(columns=mapping, inplace=True)

# Preview the frame to confirm the renamed headers.
df.head()

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Separate the predictor columns from the label column.
x = df.drop(columns='target')
y = df['target']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Baseline model: a seeded decision tree with otherwise default settings.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)

# Score the fitted tree on the held-out rows.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:", report, sep="\n")

Accuracy: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       154
           1       0.92      0.86      0.89       203

    accuracy                           0.87       357
   macro avg       0.87      0.88      0.87       357
weighted avg       0.88      0.87      0.87       357

from sklearn.neighbors import KNeighborsClassifier

# Second baseline: k-nearest-neighbours voting over the 5 closest training rows.
# NOTE(review): the features are passed in unscaled; since KNN is distance-based,
# consider standardising them before fitting. Confirm whether that is intended.
model2 = KNeighborsClassifier(n_neighbors=5)
model2.fit(x_train, y_train)

# Evaluate on the same held-out split used for the other models.
y_pred = model2.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:", report, sep="\n")

Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.73      0.71       154
           1       0.79      0.76      0.77       203

    accuracy                           0.75       357
   macro avg       0.74      0.74      0.74       357
weighted avg       0.75      0.75      0.75       357

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


# Third model: a seeded XGBoost classifier with otherwise default settings.
# This rebinds `model`, replacing the decision tree fitted earlier.
model = XGBClassifier(random_state=42)
model.fit(x_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:", report, sep="\n")

Accuracy: 0.91
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89       154
           1       0.91      0.93      0.92       203

    accuracy                           0.91       357
   macro avg       0.91      0.91      0.91       357
weighted avg       0.91      0.91      0.91       357

from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid to search exhaustively: 6 x 5 x 3 = 90 combinations.
param_grid = {
    'n_estimators': list(range(50, 301, 50)),   # 50, 100, ..., 300
    'max_depth': list(range(1, 10, 2)),         # 1, 3, 5, 7, 9
    'learning_rate': [0.1, 0.01, 0.001],
}

# Tune the XGBoost classifier bound to `model`, ranking candidates by
# 2-fold cross-validated accuracy on the training split only.
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=2)
grid_search.fit(x_train, y_train)

# Keep the winning configuration and the estimator the search selected.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the selected estimator on the held-out test rows.
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Best Parameters:", best_params)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:", report, sep="\n")

Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Accuracy: 0.93
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91       154
           1       0.93      0.95      0.94       203

    accuracy                           0.93       357
   macro avg       0.93      0.92      0.93       357
weighted avg       0.93      0.93      0.93       357

	age	sex	chest pain type	resting bp s	cholesterol	resting ecg	max heart rate	exercise angina	oldpeak	ST slope	target
0	40	1	2	140	289	0	172	0	0.0	1	0
1	49	0	3	160	180	0	156	0	1.0	2	1
2	37	1	2	130	283	1	98	0	0.0	1	0
3	48	0	4	138	214	0	108	1	1.5	2	1
4	54	1	3	150	195	0	122	0	0.0	1	0

	age	sex	chestpaintype	restingbps	cholesterol	restingecg	maxheartrate	exerciseangina	oldpeak	STslope	target
0	40	1	2	140	289	0	172	0	0.0	1	0
1	49	0	3	160	180	0	156	0	1.0	2	1
2	37	1	2	130	283	1	98	0	0.0	1	0
3	48	0	4	138	214	0	108	1	1.5	2	1
4	54	1	3	150	195	0	122	0	0.0	1	0

# Heart Disease Classification Models

## Data loading and summary

## Building models

	age	sex	chest pain type	resting bp s	cholesterol	resting ecg	max heart rate	exercise angina	oldpeak	ST slope	target
0	40	1	2	140	289	0	172	0	0.0	1	0
1	49	0	3	160	180	0	156	0	1.0	2	1
2	37	1	2	130	283	1	98	0	0.0	1	0
3	48	0	4	138	214	0	108	1	1.5	2	1
4	54	1	3	150	195	0	122	0	0.0	1	0

	age	sex	chestpaintype	restingbps	cholesterol	restingecg	maxheartrate	exerciseangina	oldpeak	STslope	target
0	40	1	2	140	289	0	172	0	0.0	1	0
1	49	0	3	160	180	0	156	0	1.0	2	1
2	37	1	2	130	283	1	98	0	0.0	1	0
3	48	0	4	138	214	0	108	1	1.5	2	1
4	54	1	3	150	195	0	122	0	0.0	1	0

	age	sex	chest pain type	resting bp s	cholesterol	resting ecg	max heart rate	exercise angina	oldpeak	ST slope	target
0	40	1	2	140	289	0	172	0	0.0	1	0
1	49	0	3	160	180	0	156	0	1.0	2	1
2	37	1	2	130	283	1	98	0	0.0	1	0
3	48	0	4	138	214	0	108	1	1.5	2	1
4	54	1	3	150	195	0	122	0	0.0	1	0

	age	sex	chestpaintype	restingbps	cholesterol	restingecg	maxheartrate	exerciseangina	oldpeak	STslope	target
0	40	1	2	140	289	0	172	0	0.0	1	0
1	49	0	3	160	180	0	156	0	1.0	2	1
2	37	1	2	130	283	1	98	0	0.0	1	0
3	48	0	4	138	214	0	108	1	1.5	2	1
4	54	1	3	150	195	0	122	0	0.0	1	0

	age	sex	chest pain type	resting bp s	cholesterol	resting ecg	max heart rate	exercise angina	oldpeak	ST slope	target
0	40	1	2	140	289	0	172	0	0.0	1	0
1	49	0	3	160	180	0	156	0	1.0	2	1
2	37	1	2	130	283	1	98	0	0.0	1	0
3	48	0	4	138	214	0	108	1	1.5	2	1
4	54	1	3	150	195	0	122	0	0.0	1	0

	age	sex	chestpaintype	restingbps	cholesterol	restingecg	maxheartrate	exerciseangina	oldpeak	STslope	target
0	40	1	2	140	289	0	172	0	0.0	1	0
1	49	0	3	160	180	0	156	0	1.0	2	1
2	37	1	2	130	283	1	98	0	0.0	1	0
3	48	0	4	138	214	0	108	1	1.5	2	1
4	54	1	3	150	195	0	122	0	0.0	1	0