Activated Library
library(dplyr) #make plot
library(tidyverse) #wrangling data
library(rsample) #sampling data
library(car) #check vif
library(caret) #confussion matrix
library(lmtest) #cek asumsi
library(class) #knn predict
options(scipen = 999)
Import Data
heart <- read.csv("heart.csv")
heart
Check Data Type
glimpse(heart)
## Rows: 303
## Columns: 14
## $ ï..age <int> 63, 37, 41, 56, 57, 57, 56, 44, 52, 57, 54, 48, 49, 64, 58...
## $ sex <int> 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0...
## $ cp <int> 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 3, 2, 2, 3, 0, 3...
## $ trestbps <int> 145, 130, 130, 120, 120, 140, 140, 120, 172, 150, 140, 130...
## $ chol <int> 233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275...
## $ fbs <int> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0...
## $ restecg <int> 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1...
## $ thalach <int> 150, 187, 172, 178, 163, 148, 153, 173, 162, 174, 160, 139...
## $ exang <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
## $ oldpeak <dbl> 2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0.0, 0.5, 1.6, 1.2, 0.2...
## $ slope <int> 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2...
## $ ca <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2...
## $ thal <int> 1, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ target <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
For variabel sex, cp, fbs, restecg, exang, slope, ca, thal, and target must change with factor data type
Change Data Type
#Change data type
heart <-
heart %>%
mutate_at(vars(sex,cp,fbs,restecg,exang,slope,ca,thal,target),funs(as.factor))
#Check data type after change
glimpse(heart)
## Rows: 303
## Columns: 14
## $ ï..age <int> 63, 37, 41, 56, 57, 57, 56, 44, 52, 57, 54, 48, 49, 64, 58...
## $ sex <fct> 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0...
## $ cp <fct> 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 3, 2, 2, 3, 0, 3...
## $ trestbps <int> 145, 130, 130, 120, 120, 140, 140, 120, 172, 150, 140, 130...
## $ chol <int> 233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275...
## $ fbs <fct> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0...
## $ restecg <fct> 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1...
## $ thalach <int> 150, 187, 172, 178, 163, 148, 153, 173, 162, 174, 160, 139...
## $ exang <fct> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
## $ oldpeak <dbl> 2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0.0, 0.5, 1.6, 1.2, 0.2...
## $ slope <fct> 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2...
## $ ca <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2...
## $ thal <fct> 1, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ target <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
Check Missing Value
colSums(is.na(heart))
## ï..age sex cp trestbps chol fbs restecg thalach
## 0 0 0 0 0 0 0 0
## exang oldpeak slope ca thal target
## 0 0 0 0 0 0
All variabel no have missing data.
Change Description Data
heart <-
heart %>%
mutate(sex = factor(sex, levels = c(0,1), labels = c("Female", "Male")),
fbs =factor(fbs, levels = c(0,1), labels = c("False", "True")),
exang = factor(exang, levels = c(0,1), labels = c("No", "Yes")),
target = factor(target, levels = c(0,1), labels = c("Not Heart Disease", "Heart Disease")))
heart
Check Data Summary
summary(heart)
## ï..age sex cp trestbps chol
## Min. :29.00 Female: 96 0:143 Min. : 94.0 Min. :126.0
## 1st Qu.:47.50 Male :207 1: 50 1st Qu.:120.0 1st Qu.:211.0
## Median :55.00 2: 87 Median :130.0 Median :240.0
## Mean :54.37 3: 23 Mean :131.6 Mean :246.3
## 3rd Qu.:61.00 3rd Qu.:140.0 3rd Qu.:274.5
## Max. :77.00 Max. :200.0 Max. :564.0
## fbs restecg thalach exang oldpeak slope ca
## False:258 0:147 Min. : 71.0 No :204 Min. :0.00 0: 21 0:175
## True : 45 1:152 1st Qu.:133.5 Yes: 99 1st Qu.:0.00 1:140 1: 65
## 2: 4 Median :153.0 Median :0.80 2:142 2: 38
## Mean :149.6 Mean :1.04 3: 20
## 3rd Qu.:166.0 3rd Qu.:1.60 4: 5
## Max. :202.0 Max. :6.20
## thal target
## 0: 2 Not Heart Disease:138
## 1: 18 Heart Disease :165
## 2:166
## 3:117
##
##
Plot Data Numeric
ggplot(
gather(heart %>% select_if(is.numeric)), aes(value)) +
geom_histogram(bins = 20, color="black", fill="lightblue", linetype="dashed") +
facet_wrap(~key, scales = 'free_x') +
labs(x="Count", y="Value")
Plot Data Categoric
ggplot(
gather(heart %>% select_if(is.factor)), aes(value, fill=value)) +
geom_bar(bins = 10) +
facet_wrap(~key, scales = 'free_x') +
theme(legend.position = "none") +
labs(x="Categorical", y="Value")
Proportion Table Target Variable
prop.table(table(heart$target))
##
## Not Heart Disease Heart Disease
## 0.4554455 0.5445545
Insight Data Summary:
-Proportion Age much on range 45-50 years old
-Proportion Cholesterol much on range 200-300 mg/dL
-Blood sugar much on less <120 mg/dL
-Quantity gender Male more than gender Female
-Propotion target varibel is Not Heart Disease (0.45%) and Heart Disease (0.54%)
Cross Validation
Make data train for training model (80% proportion from actual data) and data test for testing model (20% proportion from actual data)
RNGkind(sample.kind = "Rounding")
set.seed(1616)
index <- sample(nrow(heart),
nrow(heart) *0.8)
heart_train <- heart[index, ]
heart_test <- heart[-index, ]
Make model
Make model regression with stepwise “both”
modelheart <- glm(target ~ ., data = heart_train, family = "binomial")
model_log <- step(modelheart, direction = "both")
## Start: AIC=173.27
## target ~ ï..age + sex + cp + trestbps + chol + fbs + restecg +
## thalach + exang + oldpeak + slope + ca + thal
##
## Df Deviance AIC
## - restecg 2 127.99 169.99
## - oldpeak 1 128.71 172.71
## - chol 1 128.74 172.74
## - fbs 1 129.19 173.19
## <none> 127.28 173.28
## - ï..age 1 131.22 175.22
## - exang 1 131.95 175.95
## - slope 2 134.17 176.17
## - thalach 1 132.18 176.18
## - thal 3 137.33 177.33
## - cp 3 139.15 179.15
## - sex 1 135.25 179.25
## - trestbps 1 136.15 180.15
## - ca 4 175.14 213.14
##
## Step: AIC=169.99
## target ~ ï..age + sex + cp + trestbps + chol + fbs + thalach +
## exang + oldpeak + slope + ca + thal
##
## Df Deviance AIC
## - oldpeak 1 129.53 169.53
## - fbs 1 129.97 169.97
## <none> 127.99 169.99
## - chol 1 130.08 170.08
## - ï..age 1 131.81 171.81
## - exang 1 132.67 172.67
## - slope 2 134.77 172.77
## - thalach 1 132.78 172.78
## + restecg 2 127.28 173.28
## - thal 3 137.90 173.90
## - cp 3 139.54 175.54
## - sex 1 136.40 176.40
## - trestbps 1 137.01 177.01
## - ca 4 176.95 210.95
##
## Step: AIC=169.53
## target ~ ï..age + sex + cp + trestbps + chol + fbs + thalach +
## exang + slope + ca + thal
##
## Df Deviance AIC
## <none> 129.53 169.53
## - chol 1 131.71 169.71
## - fbs 1 131.82 169.82
## + oldpeak 1 127.99 169.99
## - ï..age 1 133.42 171.42
## - exang 1 134.48 172.48
## + restecg 2 128.71 172.71
## - thalach 1 134.98 172.98
## - thal 3 139.76 173.76
## - slope 2 138.16 174.16
## - cp 3 140.65 174.65
## - sex 1 138.34 176.34
## - trestbps 1 139.30 177.30
## - ca 4 181.67 213.67
summary(model_log)
##
## Call:
## glm(formula = target ~ ï..age + sex + cp + trestbps + chol +
## fbs + thalach + exang + slope + ca + thal, family = "binomial",
## data = heart_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.82624 -0.26939 0.06826 0.35126 2.51493
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.747449 3.835959 -0.195 0.84551
## ï..age 0.062681 0.032899 1.905 0.05675 .
## sexMale -1.927951 0.687437 -2.805 0.00504 **
## cp1 1.188364 0.750564 1.583 0.11335
## cp2 1.676734 0.586547 2.859 0.00425 **
## cp3 1.923863 0.849410 2.265 0.02352 *
## trestbps -0.043437 0.014820 -2.931 0.00338 **
## chol -0.007406 0.004939 -1.500 0.13370
## fbsTrue 0.998640 0.664862 1.502 0.13309
## thalach 0.035021 0.015700 2.231 0.02570 *
## exangYes -1.227084 0.554712 -2.212 0.02696 *
## slope1 -0.725950 0.872640 -0.832 0.40547
## slope2 0.858743 0.885462 0.970 0.33213
## ca1 -3.445803 0.703807 -4.896 0.000000978 ***
## ca2 -4.375906 0.941797 -4.646 0.000003379 ***
## ca3 -2.898257 0.988520 -2.932 0.00337 **
## ca4 -0.603644 3.552544 -0.170 0.86507
## thal1 2.807489 2.375376 1.182 0.23724
## thal2 2.336847 2.261497 1.033 0.30145
## thal3 0.967776 2.257631 0.429 0.66816
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 335.22 on 241 degrees of freedom
## Residual deviance: 129.53 on 222 degrees of freedom
## AIC: 169.53
##
## Number of Fisher Scoring iterations: 6
After make model and using stepwise “both” for selection prediktor variabel, optimum prediktor variabel is age, sex, cp, trestbps, chol, fbs, thalach, exang, slope, ca, thal.
Assumption
Multicollinearity
vif(model_log)
## GVIF Df GVIF^(1/(2*Df))
## ï..age 1.723208 1 1.312710
## sex 1.598750 1 1.264417
## cp 1.768155 3 1.099647
## trestbps 1.379831 1 1.174662
## chol 1.343845 1 1.159243
## fbs 1.279130 1 1.130986
## thalach 1.610393 1 1.269012
## exang 1.272209 1 1.127922
## slope 1.513998 2 1.109255
## ca 2.359953 4 1.113302
## thal 1.620655 3 1.083798
Value vif each variabel <10 , its mean each variabel no have correlation or same characteristic
Linearity
data.frame(prediksi = model_log$fitted.values,
error = model_log$residuals) %>%
ggplot(aes(x = prediksi, y = error)) +
geom_hline(yintercept = 0, lty = "dashed") +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Prediction
Predict data test an save in new column
heart_test$pred_result <- predict(object = model_log,
newdata = heart_test,
type = "response")
heart_test
Classification result predict with ifelse function
heart_test$pred_label <- factor(ifelse(heart_test$pred_result > 0.5, "Heart Disease","Not Heart Disease"))
heart_test
See result prediction in a graph
ggplot(heart_test, aes(x=pred_result)) +
geom_density() +
geom_hline(yintercept = 0.5 , linetype="dashed") +
theme_minimal()
Based on result predict in logistic regression, value >0.5 (Heart Disease) more than value <0.5 (Not Heart Disease)
Check Proportion
prop.table(table(heart$target))
##
## Not Heart Disease Heart Disease
## 0.4554455 0.5445545
Proportion data for target variabel (Not Heart Disease 45%) and (Heart Disease 54%) still balance
Cross Validation
Cross validation knn variabel predictor for axes x and variabel target for y axis
#Variabel Predictor
model_knn_train_x <- heart_train %>% select(-c(target,sex,cp,fbs,restecg,exang,slope,ca,thal))
model_knn_test_x <- heart_test %>% select(-c(target,sex,cp,fbs,restecg,exang,slope,ca,thal,pred_result,pred_label))
#Variabel Target
model_knn_train_y <- heart_train %>% select(target)
model_knn_test_y <- heart_test %>% select(target)
Scaling
# scale train_x data
model_knn_train_xs <- scale(model_knn_train_x)
# scale test_x data
model_knn_test_xs <- scale(model_knn_test_x,
center = attr(model_knn_train_xs, "scaled:center"),
scale = attr(model_knn_train_xs, "scaled:scale"))
Prediction
Find Optimum KNN
round(sqrt(nrow(model_knn_train_xs)))
## [1] 16
KNN Predict
heart_knn_pred <- knn(train = model_knn_train_xs,
test = model_knn_test_xs,
cl = model_knn_train_y$target,
k=16)
heart_knn_pred
## [1] Not Heart Disease Heart Disease Heart Disease Not Heart Disease
## [5] Heart Disease Heart Disease Not Heart Disease Not Heart Disease
## [9] Heart Disease Heart Disease Heart Disease Not Heart Disease
## [13] Heart Disease Heart Disease Heart Disease Not Heart Disease
## [17] Heart Disease Heart Disease Heart Disease Heart Disease
## [21] Heart Disease Not Heart Disease Heart Disease Heart Disease
## [25] Heart Disease Heart Disease Heart Disease Heart Disease
## [29] Heart Disease Not Heart Disease Heart Disease Not Heart Disease
## [33] Heart Disease Not Heart Disease Heart Disease Heart Disease
## [37] Heart Disease Heart Disease Heart Disease Heart Disease
## [41] Not Heart Disease Heart Disease Not Heart Disease Heart Disease
## [45] Not Heart Disease Heart Disease Not Heart Disease Not Heart Disease
## [49] Not Heart Disease Not Heart Disease Not Heart Disease Not Heart Disease
## [53] Not Heart Disease Heart Disease Not Heart Disease Not Heart Disease
## [57] Not Heart Disease Not Heart Disease Heart Disease Heart Disease
## [61] Not Heart Disease
## Levels: Not Heart Disease Heart Disease
For evaluation using confusion matrix
model_log_eval <- confusionMatrix(heart_test$pred_label, heart_test$target, positive = "Heart Disease")
model_log_eval
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not Heart Disease Heart Disease
## Not Heart Disease 15 6
## Heart Disease 6 34
##
## Accuracy : 0.8033
## 95% CI : (0.6816, 0.894)
## No Information Rate : 0.6557
## P-Value [Acc > NIR] : 0.008813
##
## Kappa : 0.5643
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.8500
## Specificity : 0.7143
## Pos Pred Value : 0.8500
## Neg Pred Value : 0.7143
## Prevalence : 0.6557
## Detection Rate : 0.5574
## Detection Prevalence : 0.6557
## Balanced Accuracy : 0.7821
##
## 'Positive' Class : Heart Disease
##
Summary Evaluation Logistic Regression
-Accuracy : 0.8033 –> 80.3% model to correctly guess the target (Heart Disease / Not Heart Disease).
-Sensitivity (Recall) : 0.8500 –> 85% from all the positive actual data, capable proportion of model to guess right.
-Specificity : 0.7143 –> 71.4% from all the negative actual data, capable proportion of model to guess right.
-Pos Pred (Precision) : 0.8500 –> 85% from all the prediction result, capable model to correctly guess the positive class.
For evaluation using confusion matrix
conf_knn <- confusionMatrix(data=heart_knn_pred,
reference = as.factor(model_knn_test_y$target),
positive="Heart Disease")
conf_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not Heart Disease Heart Disease
## Not Heart Disease 15 10
## Heart Disease 6 30
##
## Accuracy : 0.7377
## 95% CI : (0.6093, 0.842)
## No Information Rate : 0.6557
## P-Value [Acc > NIR] : 0.1111
##
## Kappa : 0.4442
##
## Mcnemar's Test P-Value : 0.4533
##
## Sensitivity : 0.7500
## Specificity : 0.7143
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.6000
## Prevalence : 0.6557
## Detection Rate : 0.4918
## Detection Prevalence : 0.5902
## Balanced Accuracy : 0.7321
##
## 'Positive' Class : Heart Disease
##
Summary Evaluation K-Nearest Neighbor
-Accuracy : 0.7377 –> 73.7% model to correctly guess the target (Heart Disease / Not Heart Disease).
-Sensitivity (Recall) : 0.7500 –> 75% from all the positive actual data, capable proportion of model to guess right.
-Specificity : 0.7143 –> 71.4% from all the negative actual data, capable proportion of model to guess right.
-Pos Pred (Precision) : 0.8333 –> 83.3% from all the prediction result, capable model to correctly guess the positive class.
model <- c("Logistic Regression", "K-Nearest Neighbor")
Accuracy <- c(0.8033,0.7377)
Sensitivity_Recall <- c(0.8500,0.7500)
Specificity <- c(0.7143,0.7143)
Pos.Pred_Precision <- c(0.8500,0.8333)
tabelmodel <- data.frame(model,Accuracy,Sensitivity_Recall,Specificity,Pos.Pred_Precision)
print(tabelmodel)
## model Accuracy Sensitivity_Recall Specificity
## 1 Logistic Regression 0.8033 0.85 0.7143
## 2 K-Nearest Neighbor 0.7377 0.75 0.7143
## Pos.Pred_Precision
## 1 0.8500
## 2 0.8333
Based on the results of each model, both of model have a good results in term of Accuracy, Recall, Specificity and Precision.
From business case, I want to choose patients with Heart Disease, for that I choose a model with high value Post Pred (Precision) and Accuracy. I choose a model with a high value Post Pred (Precision) and Accuracy because I want the model be able to predict accurately and don’t want wrong prediction of patients with Heart Disease or FP (Prediction correct but Actual wrong) is better than FN (Prediction wrong but Actual correct).
For logistic model has an Accuracy value (0.8033) and the Post Pred / Precision value (0.8500) is bigger than the KNN model which an Accuracy value (0.7705) and Pos Pred / Precision value (0.8421), therefore choose logistic regression model.