About data set
This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to this date. The “target” field refers to the presence of heart disease in the patient. It is integer-valued: 0 = no/less chance of heart attack and 1 = more chance of heart attack.
Attribute Information
library(dplyr)
Read the data
# Load the heart-disease records from "heart.csv" in the working directory.
heart <- read.csv(file = "heart.csv")
Check for any missing values
# TRUE if the data frame contains at least one NA, FALSE otherwise.
heart %>% anyNA()
## [1] FALSE
Rename column name
# Give the columns descriptive names (new_name = old_name).
# The first column (age) is selected by position rather than by name: the
# mangled name `ï..age` only appears when read.csv() mis-decodes a UTF-8
# byte-order mark at the start of the file, so the literal name differs
# between platforms and R versions and renaming by name can fail.
heart <- heart %>%
  rename(
    age = 1,
    chest_pain_type = cp,
    resting_blood_pressure = trestbps,
    serum_cholestoral = chol,
    fasting_blood_sugar = fbs,
    resting_electrocardiographic = restecg,
    max_heart_rate = thalach,
    exercise_induced_angina = exang,
    major_vessel = ca,
    heart_attack = target
  )
Change column type
# Convert the categorical variables (stored as numeric codes) to factors so
# that glm() treats them as categories; the remaining columns stay numeric.
heart <- heart %>%
  mutate(
    across(
      c(
        sex,
        chest_pain_type,
        fasting_blood_sugar,
        resting_electrocardiographic,
        exercise_induced_angina,
        slope,
        major_vessel,
        thal,
        heart_attack
      ),
      as.factor
    )
  )
# Preview the first six rows.
head(heart, n = 6)
## age sex chest_pain_type resting_blood_pressure serum_cholestoral
## 1 63 1 3 145 233
## 2 37 1 2 130 250
## 3 41 0 1 130 204
## 4 56 1 1 120 236
## 5 57 0 0 120 354
## 6 57 1 0 140 192
## fasting_blood_sugar resting_electrocardiographic max_heart_rate
## 1 1 0 150
## 2 0 1 187
## 3 0 0 172
## 4 0 1 178
## 5 0 1 163
## 6 0 1 148
## exercise_induced_angina oldpeak slope major_vessel thal heart_attack
## 1 0 2.3 0 0 1 1
## 2 0 3.5 0 0 2 1
## 3 0 1.4 2 0 2 1
## 4 0 0.8 2 0 2 1
## 5 1 0.6 2 0 2 1
## 6 0 0.4 1 0 1 1
Split the data into training and test data frames
# Fix the RNG seed so the random split below is reproducible.
set.seed(10)
library(rsample)
# 80/20 split, stratified on the outcome so both parts keep its class balance.
idx <- heart %>%
  initial_split(prop = 0.8, strata = heart_attack)
log_train <- idx %>% training()
log_test <- idx %>% testing()
Check the class proportions after splitting the data. The classes are reasonably balanced, at roughly 0.45 : 0.55.
# Share of each outcome class (0 / 1) in the training set.
prop.table(x = table(log_train[["heart_attack"]]))
##
## 0 1
## 0.454918 0.545082
Make the model with Logistic Regression Algorithm
# Baseline model: logistic regression of heart_attack on every other column
# of the training data.
model_logistic <- glm(
  heart_attack ~ .,
  family = "binomial",
  data = log_train
)
# Coefficient table, deviances and AIC of the fitted model.
model_logistic %>% summary()
##
## Call:
## glm(formula = heart_attack ~ ., family = "binomial", data = log_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9013 -0.3024 0.1104 0.3950 3.1181
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.334817 3.995641 0.084 0.933219
## age 0.032535 0.030524 1.066 0.286474
## sex1 -1.869731 0.636589 -2.937 0.003313 **
## chest_pain_type1 1.370943 0.715050 1.917 0.055204 .
## chest_pain_type2 2.056159 0.585453 3.512 0.000445 ***
## chest_pain_type3 2.455375 0.822525 2.985 0.002834 **
## resting_blood_pressure -0.031687 0.013325 -2.378 0.017404 *
## serum_cholestoral -0.002372 0.004707 -0.504 0.614353
## fasting_blood_sugar1 0.117041 0.649400 0.180 0.856972
## resting_electrocardiographic1 0.429481 0.454331 0.945 0.344504
## resting_electrocardiographic2 -1.189119 2.342895 -0.508 0.611774
## max_heart_rate 0.025697 0.013836 1.857 0.063286 .
## exercise_induced_angina1 -0.907720 0.538392 -1.686 0.091799 .
## oldpeak -0.389311 0.263284 -1.479 0.139228
## slope1 -1.238649 1.014070 -1.221 0.221911
## slope2 -0.138697 1.126713 -0.123 0.902029
## major_vessel1 -2.139418 0.595676 -3.592 0.000329 ***
## major_vessel2 -3.892381 0.985859 -3.948 7.87e-05 ***
## major_vessel3 -0.962194 0.941728 -1.022 0.306908
## major_vessel4 1.202833 1.978312 0.608 0.543181
## thal1 2.170327 2.683893 0.809 0.418717
## thal2 1.945209 2.596520 0.749 0.453761
## thal3 0.515105 2.597198 0.198 0.842786
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 336.27 on 243 degrees of freedom
## Residual deviance: 142.34 on 221 degrees of freedom
## AIC: 188.34
##
## Number of Fisher Scoring iterations: 6
Make stepwise model with backward method for comparing with base model
# Backward elimination by AIC, starting from the full model; step() prints
# the candidate deletions considered at every step.
logistic_step <- step(model_logistic, direction = "backward")
## Start: AIC=188.34
## heart_attack ~ age + sex + chest_pain_type + resting_blood_pressure +
## serum_cholestoral + fasting_blood_sugar + resting_electrocardiographic +
## max_heart_rate + exercise_induced_angina + oldpeak + slope +
## major_vessel + thal
##
## Df Deviance AIC
## - resting_electrocardiographic 2 143.60 185.60
## - fasting_blood_sugar 1 142.38 186.38
## - serum_cholestoral 1 142.59 186.59
## - age 1 143.50 187.50
## <none> 142.34 188.34
## - oldpeak 1 144.67 188.67
## - exercise_induced_angina 1 145.17 189.17
## - slope 2 147.33 189.33
## - max_heart_rate 1 146.05 190.05
## - resting_blood_pressure 1 148.35 192.35
## - thal 3 152.42 192.42
## - sex 1 151.72 195.72
## - chest_pain_type 3 161.01 201.01
## - major_vessel 4 172.42 210.42
##
## Step: AIC=185.6
## heart_attack ~ age + sex + chest_pain_type + resting_blood_pressure +
## serum_cholestoral + fasting_blood_sugar + max_heart_rate +
## exercise_induced_angina + oldpeak + slope + major_vessel +
## thal
##
## Df Deviance AIC
## - fasting_blood_sugar 1 143.62 183.62
## - serum_cholestoral 1 144.17 184.17
## - age 1 144.56 184.56
## <none> 143.60 185.60
## - oldpeak 1 146.02 186.02
## - exercise_induced_angina 1 146.07 186.07
## - slope 2 148.60 186.60
## - max_heart_rate 1 147.15 187.15
## - thal 3 153.80 189.80
## - resting_blood_pressure 1 149.80 189.80
## - sex 1 153.04 193.04
## - chest_pain_type 3 162.93 198.93
## - major_vessel 4 174.04 208.04
##
## Step: AIC=183.62
## heart_attack ~ age + sex + chest_pain_type + resting_blood_pressure +
## serum_cholestoral + max_heart_rate + exercise_induced_angina +
## oldpeak + slope + major_vessel + thal
##
## Df Deviance AIC
## - serum_cholestoral 1 144.18 182.18
## - age 1 144.58 182.58
## <none> 143.62 183.62
## - exercise_induced_angina 1 146.07 184.07
## - oldpeak 1 146.10 184.10
## - slope 2 148.61 184.61
## - max_heart_rate 1 147.23 185.23
## - resting_blood_pressure 1 149.83 187.83
## - thal 3 153.85 187.85
## - sex 1 153.05 191.05
## - chest_pain_type 3 163.69 197.69
## - major_vessel 4 174.25 206.25
##
## Step: AIC=182.18
## heart_attack ~ age + sex + chest_pain_type + resting_blood_pressure +
## max_heart_rate + exercise_induced_angina + oldpeak + slope +
## major_vessel + thal
##
## Df Deviance AIC
## - age 1 144.98 180.98
## <none> 144.18 182.18
## - exercise_induced_angina 1 146.73 182.73
## - oldpeak 1 146.91 182.91
## - slope 2 149.22 183.22
## - max_heart_rate 1 147.53 183.53
## - resting_blood_pressure 1 150.31 186.31
## - thal 3 154.58 186.58
## - sex 1 153.07 189.07
## - chest_pain_type 3 164.51 196.51
## - major_vessel 4 174.57 204.57
##
## Step: AIC=180.98
## heart_attack ~ sex + chest_pain_type + resting_blood_pressure +
## max_heart_rate + exercise_induced_angina + oldpeak + slope +
## major_vessel + thal
##
## Df Deviance AIC
## <none> 144.98 180.98
## - slope 2 149.54 181.54
## - max_heart_rate 1 147.58 181.58
## - oldpeak 1 147.71 181.71
## - exercise_induced_angina 1 147.81 181.81
## - resting_blood_pressure 1 150.31 184.31
## - thal 3 155.37 185.37
## - sex 1 154.98 188.98
## - chest_pain_type 3 166.63 196.63
## - major_vessel 4 175.53 203.53
Predict the test data with the base model and the stepwise model
# Predicted probabilities (type = "response") from the full model, turned
# into class labels with a 0.5 cut-off.
# The factor levels are fixed to 0 and 1 explicitly: as.factor() would create
# a single-level factor if every prediction fell on one side of the cut-off,
# which would no longer match the levels of heart_attack in confusionMatrix().
log_test$pred <- predict(object = model_logistic, newdata = log_test, type = "response")
log_test$pred <- factor(ifelse(log_test$pred >= 0.5, 1, 0), levels = c(0, 1))
# Same procedure for the stepwise model.
log_test$pred_step <- predict(object = logistic_step, newdata = log_test, type = "response")
log_test$pred_step <- factor(ifelse(log_test$pred_step >= 0.5, 1, 0), levels = c(0, 1))
Check for Accuracy
library(caret)
# Confusion matrix and derived metrics for the full model on the test set,
# treating class "1" as the positive class.
with(
  log_test,
  confusionMatrix(data = pred, reference = heart_attack, positive = "1")
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 21 2
## 1 6 30
##
## Accuracy : 0.8644
## 95% CI : (0.7502, 0.9396)
## No Information Rate : 0.5424
## P-Value [Acc > NIR] : 1.458e-07
##
## Kappa : 0.7237
##
## Mcnemar's Test P-Value : 0.2888
##
## Sensitivity : 0.9375
## Specificity : 0.7778
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.9130
## Prevalence : 0.5424
## Detection Rate : 0.5085
## Detection Prevalence : 0.6102
## Balanced Accuracy : 0.8576
##
## 'Positive' Class : 1
##
# Confusion matrix and derived metrics for the stepwise model on the test
# set, treating class "1" as the positive class.
with(
  log_test,
  confusionMatrix(data = pred_step, reference = heart_attack, positive = "1")
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 22 4
## 1 5 28
##
## Accuracy : 0.8475
## 95% CI : (0.7301, 0.9278)
## No Information Rate : 0.5424
## P-Value [Acc > NIR] : 7.195e-07
##
## Kappa : 0.6918
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8750
## Specificity : 0.8148
## Pos Pred Value : 0.8485
## Neg Pred Value : 0.8462
## Prevalence : 0.5424
## Detection Rate : 0.4746
## Detection Prevalence : 0.5593
## Balanced Accuracy : 0.8449
##
## 'Positive' Class : 1
##
Read the data
# Reload the raw data for the KNN model. Only the outcome is converted to a
# factor; the predictors stay numeric so they can be scaled later.
heart2 <- read.csv("heart.csv")
# Rename the columns (new_name = old_name).
# The first column (age) is selected by position rather than by name: the
# mangled name `ï..age` only appears when read.csv() mis-decodes a UTF-8
# byte-order mark at the start of the file, so the literal name differs
# between platforms and R versions and renaming by name can fail.
heart2 <- heart2 %>%
  rename(
    age = 1,
    chest_pain_type = cp,
    resting_blood_pressure = trestbps,
    serum_cholestoral = chol,
    fasting_blood_sugar = fbs,
    resting_electrocardiographic = restecg,
    max_heart_rate = thalach,
    exercise_induced_angina = exang,
    major_vessel = ca,
    heart_attack = target
  ) %>%
  mutate(heart_attack = as.factor(heart_attack))
Split data to train and test
# 80/20 split stratified on the outcome. No seed is set here, so the split
# depends on the RNG state left by whatever code ran before this point.
idx2 <- heart2 %>%
  initial_split(prop = 0.8, strata = heart_attack)
train_knn <- idx2 %>% training()
test_knn <- idx2 %>% testing()
Split data to each target and predictor
# Predictors: every column except the outcome.
train_knn_x <- select(train_knn, -heart_attack)
# Target: a one-column data frame holding only the outcome.
train_knn_y <- select(train_knn, heart_attack)
# Same separation for the test set.
test_knn_x <- select(test_knn, -heart_attack)
test_knn_y <- select(test_knn, heart_attack)
Scale the training predictor data, then scale the test predictors using the training set's center and scale
# Standardize the training predictors; scale() keeps the column centers and
# scales it used as attributes of the result.
train_knn_x <- train_knn_x %>% scale()
# Apply the training centers and scales to the test predictors, so the test
# set is transformed with the same parameters as the training set.
test_knn_x <- test_knn_x %>%
  scale(
    center = attr(train_knn_x, "scaled:center"),
    scale = attr(train_knn_x, "scaled:scale")
  )
Check for proportion
# Share of each outcome class (0 / 1) in the KNN training target.
prop.table(table(train_knn_y))
## train_knn_y
## 0 1
## 0.454918 0.545082
Use the square root of the number of training rows as the K value
# Rule-of-thumb starting point for k: square root of the training row count.
train_knn_x %>%
  nrow() %>%
  sqrt()
## [1] 15.6205
Make the model with KNN
library(class)
# Classify every test row from its 16 nearest training rows (k is the
# square-root rule of thumb rounded to 16). knn() returns the predicted
# classes as a factor.
# NOTE(review): k is even, so votes can tie with two classes; consider an odd k.
model_knn <- knn(
  train_knn_x,
  test_knn_x,
  cl = train_knn_y[["heart_attack"]],
  k = 16
)
Check for accuracy
# Confusion matrix and derived metrics for the KNN predictions on the test
# set, treating class "1" as the positive class.
with(
  test_knn_y,
  confusionMatrix(data = model_knn, reference = heart_attack, positive = "1")
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19 4
## 1 8 28
##
## Accuracy : 0.7966
## 95% CI : (0.6717, 0.8902)
## No Information Rate : 0.5424
## P-Value [Acc > NIR] : 4.294e-05
##
## Kappa : 0.5855
##
## Mcnemar's Test P-Value : 0.3865
##
## Sensitivity : 0.8750
## Specificity : 0.7037
## Pos Pred Value : 0.7778
## Neg Pred Value : 0.8261
## Prevalence : 0.5424
## Detection Rate : 0.4746
## Detection Prevalence : 0.6102
## Balanced Accuracy : 0.7894
##
## 'Positive' Class : 1
##
We want to prioritize recall (sensitivity) in this case because we want to minimize undetected heart attacks.
For this reason, I think the logistic regression without stepwise selection is the better model here, with an accuracy of about 86% and a sensitivity of about 94%.