library(dplyr)
library(class)
normalize <- function(x){
return (
(x - min(x))/(max(x) - min(x))
)
}
zscore <- function(x){
return(
(x-mean(x))/sd(x)
)
}We’re using heart dataset from https://www.kaggle.com/ronitf/heart-disease-uci
Data Set Information:
Read the dataset and mutate the necessary variables into factor for GLM
h <- read.csv("heart.csv")
names(h)[1] <- "age"
h.glm <- h %>%
mutate_at(.funs = funs(as.factor(.)), .vars = c("sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal", "target"))We set Data Train to be 80% of the whole data
set.seed(2902)
nhtrain <- sample(nrow(h), nrow(h)*0.8)
h.train <- h.glm[nhtrain, ]
h.test <- h.glm[-nhtrain, ]
h.mm <- as.data.frame(lapply(h[,-14], normalize))
h.train.knn.mm <- h.mm[nhtrain, -14]
h.test.knn.mm <- h.mm[-nhtrain, -14]
h.z <- as.data.frame(lapply(h[,-14], zscore))
h.train.knn.z <- h.z[nhtrain,]
h.test.knn.z <- h.z[-nhtrain,]
h.train.labels <- h[nhtrain, 14]
h.test.labels <- h[-nhtrain, 14]Create logistic model with no predictors and all predictors from the dataset
h.glm.none <- glm(target ~ 1, family = "binomial", data = h.train)
h.glm.all <- glm(target ~ ., family = "binomial", data = h.train)
summary(h.glm.none)##
## Call:
## glm(formula = target ~ 1, family = "binomial", data = h.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.249 -1.249 1.108 1.108 1.108
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1657 0.1290 1.284 0.199
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 333.83 on 241 degrees of freedom
## Residual deviance: 333.83 on 241 degrees of freedom
## AIC: 335.83
##
## Number of Fisher Scoring iterations: 3
summary(h.glm.all)##
## Call:
## glm(formula = target ~ ., family = "binomial", data = h.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4626 -0.1399 0.0474 0.2542 2.1841
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.365e+01 1.455e+03 -0.009 0.992519
## age 4.370e-02 3.435e-02 1.272 0.203341
## sex1 -1.951e+00 7.674e-01 -2.543 0.010999 *
## cp1 1.364e+00 8.090e-01 1.687 0.091666 .
## cp2 2.747e+00 7.221e-01 3.804 0.000143 ***
## cp3 3.106e+00 9.731e-01 3.192 0.001413 **
## trestbps -1.561e-02 1.717e-02 -0.909 0.363458
## chol -1.213e-03 6.128e-03 -0.198 0.843105
## fbs1 1.654e-03 7.850e-01 0.002 0.998319
## restecg1 6.630e-01 5.519e-01 1.201 0.229633
## restecg2 -7.923e-01 4.303e+00 -0.184 0.853933
## thalach 2.135e-02 1.496e-02 1.427 0.153483
## exang1 -1.648e+00 6.477e-01 -2.545 0.010933 *
## oldpeak -4.143e-01 3.577e-01 -1.158 0.246666
## slope1 -1.928e+00 1.245e+00 -1.549 0.121493
## slope2 4.039e-01 1.311e+00 0.308 0.757927
## ca1 -3.355e+00 7.075e-01 -4.741 2.12e-06 ***
## ca2 -5.271e+00 1.179e+00 -4.472 7.74e-06 ***
## ca3 -3.264e+00 1.415e+00 -2.306 0.021110 *
## ca4 2.104e+00 2.493e+00 0.844 0.398739
## thal1 1.432e+01 1.455e+03 0.010 0.992150
## thal2 1.420e+01 1.455e+03 0.010 0.992217
## thal3 1.236e+01 1.455e+03 0.008 0.993225
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 333.83 on 241 degrees of freedom
## Residual deviance: 108.17 on 219 degrees of freedom
## AIC: 154.17
##
## Number of Fisher Scoring iterations: 14
summary(step(h.glm.all, data=h.train, direction="backward", trace = 0))##
## Call:
## glm(formula = target ~ age + sex + cp + trestbps + thalach +
## exang + slope + ca + thal, family = "binomial", data = h.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3519 -0.1582 0.0460 0.2748 2.2829
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.82864 1455.40189 -0.010 0.99242
## age 0.04822 0.03344 1.442 0.14939
## sex1 -1.83062 0.73313 -2.497 0.01253 *
## cp1 1.58090 0.78945 2.003 0.04523 *
## cp2 2.75665 0.69889 3.944 8.00e-05 ***
## cp3 2.88628 0.90499 3.189 0.00143 **
## trestbps -0.02384 0.01555 -1.533 0.12518
## thalach 0.02292 0.01446 1.585 0.11293
## exang1 -1.68924 0.61312 -2.755 0.00587 **
## slope1 -1.24912 1.02808 -1.215 0.22436
## slope2 1.24745 1.06343 1.173 0.24078
## ca1 -3.43415 0.70076 -4.901 9.55e-07 ***
## ca2 -5.26368 1.11769 -4.709 2.48e-06 ***
## ca3 -3.65395 1.36942 -2.668 0.00762 **
## ca4 2.26912 2.37574 0.955 0.33952
## thal1 14.09961 1455.39794 0.010 0.99227
## thal2 13.89916 1455.39779 0.010 0.99238
## thal3 12.03977 1455.39776 0.008 0.99340
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 333.83 on 241 degrees of freedom
## Residual deviance: 111.22 on 224 degrees of freedom
## AIC: 147.22
##
## Number of Fisher Scoring iterations: 14
summary(step(h.glm.none, scope = list(upper=h.glm.all), data=h.train, direction="forward",trace = 0))##
## Call:
## glm(formula = target ~ cp + ca + thal + slope + exang + sex +
## oldpeak, family = "binomial", data = h.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3110 -0.1737 0.0440 0.2793 2.5039
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.7929 1455.3986 -0.007 0.99463
## cp1 1.3918 0.7473 1.863 0.06253 .
## cp2 2.8232 0.6932 4.073 4.65e-05 ***
## cp3 2.8148 0.8727 3.225 0.00126 **
## ca1 -3.2587 0.6600 -4.938 7.90e-07 ***
## ca2 -4.6827 1.0822 -4.327 1.51e-05 ***
## ca3 -3.5700 1.4506 -2.461 0.01385 *
## ca4 1.7802 1.9565 0.910 0.36288
## thal1 13.7807 1455.3979 0.009 0.99245
## thal2 13.7037 1455.3978 0.009 0.99249
## thal3 11.8825 1455.3977 0.008 0.99349
## slope1 -1.7457 1.1685 -1.494 0.13521
## slope2 0.5824 1.2483 0.467 0.64085
## exang1 -1.7171 0.5942 -2.890 0.00385 **
## sex1 -1.8390 0.6981 -2.634 0.00843 **
## oldpeak -0.4713 0.3318 -1.421 0.15544
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 333.83 on 241 degrees of freedom
## Residual deviance: 113.70 on 226 degrees of freedom
## AIC: 145.7
##
## Number of Fisher Scoring iterations: 14
summary(step(h.glm.all, scope = list(upper=h.glm.all), data=h.train, direction="both",trace = 0))##
## Call:
## glm(formula = target ~ age + sex + cp + trestbps + thalach +
## exang + slope + ca + thal, family = "binomial", data = h.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3519 -0.1582 0.0460 0.2748 2.2829
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.82864 1455.40189 -0.010 0.99242
## age 0.04822 0.03344 1.442 0.14939
## sex1 -1.83062 0.73313 -2.497 0.01253 *
## cp1 1.58090 0.78945 2.003 0.04523 *
## cp2 2.75665 0.69889 3.944 8.00e-05 ***
## cp3 2.88628 0.90499 3.189 0.00143 **
## trestbps -0.02384 0.01555 -1.533 0.12518
## thalach 0.02292 0.01446 1.585 0.11293
## exang1 -1.68924 0.61312 -2.755 0.00587 **
## slope1 -1.24912 1.02808 -1.215 0.22436
## slope2 1.24745 1.06343 1.173 0.24078
## ca1 -3.43415 0.70076 -4.901 9.55e-07 ***
## ca2 -5.26368 1.11769 -4.709 2.48e-06 ***
## ca3 -3.65395 1.36942 -2.668 0.00762 **
## ca4 2.26912 2.37574 0.955 0.33952
## thal1 14.09961 1455.39794 0.010 0.99227
## thal2 13.89916 1455.39779 0.010 0.99238
## thal3 12.03977 1455.39776 0.008 0.99340
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 333.83 on 241 degrees of freedom
## Residual deviance: 111.22 on 224 degrees of freedom
## AIC: 147.22
##
## Number of Fisher Scoring iterations: 14
summary(step(h.glm.none, scope = list(upper=h.glm.all), data=h.train, direction="both",trace = 0))##
## Call:
## glm(formula = target ~ cp + ca + thal + slope + exang + sex +
## oldpeak, family = "binomial", data = h.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3110 -0.1737 0.0440 0.2793 2.5039
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.7929 1455.3986 -0.007 0.99463
## cp1 1.3918 0.7473 1.863 0.06253 .
## cp2 2.8232 0.6932 4.073 4.65e-05 ***
## cp3 2.8148 0.8727 3.225 0.00126 **
## ca1 -3.2587 0.6600 -4.938 7.90e-07 ***
## ca2 -4.6827 1.0822 -4.327 1.51e-05 ***
## ca3 -3.5700 1.4506 -2.461 0.01385 *
## ca4 1.7802 1.9565 0.910 0.36288
## thal1 13.7807 1455.3979 0.009 0.99245
## thal2 13.7037 1455.3978 0.009 0.99249
## thal3 11.8825 1455.3977 0.008 0.99349
## slope1 -1.7457 1.1685 -1.494 0.13521
## slope2 0.5824 1.2483 0.467 0.64085
## exang1 -1.7171 0.5942 -2.890 0.00385 **
## sex1 -1.8390 0.6981 -2.634 0.00843 **
## oldpeak -0.4713 0.3318 -1.421 0.15544
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 333.83 on 241 degrees of freedom
## Residual deviance: 113.70 on 226 degrees of freedom
## AIC: 145.7
##
## Number of Fisher Scoring iterations: 14
Checking from 4 different directions of stepwise, gave us the best AIC if we chose below variables to be included into the model:
Hence our Logarithmic Regression Model call will be
h.glm.adj <- glm(formula = target ~ cp + ca + thal + slope + exang + sex +
oldpeak, family = "binomial", data = h.train)h.test$pred.target <- predict(h.glm.adj, h.test, type = "response")
table("predicted"=as.numeric(h.test$pred.target>=0.5), "actual"=h.test$target)## actual
## predicted 0 1
## 0 18 7
## 1 9 27
Since the target represent the value of having a heart disease, so what really matters in this modeling is the recall value.
Using the confusion matrix, we could calculate the recall value as below:
paste("Recall: ", round((27/(27+7)) * 100,2), "%", sep="")## [1] "Recall: 79.41%"
The value shows that the model is quite good
h.knn.mm <- knn(train = h.train.knn.mm, test=h.test.knn.mm, cl= h.train.labels, k=17)
table("predicted" = h.knn.mm, "actual" = h.test.labels)## actual
## predicted 0 1
## 0 20 9
## 1 7 25
Using the confusion matrix, we could calculate the recall value as below:
paste("Recall: ", round((25/(25+9)) * 100,2), "%", sep="")## [1] "Recall: 73.53%"
h.knn.z <- knn(train = h.train.knn.z, test=h.test.knn.z, cl= h.train.labels, k=13)
table("predicted" = h.knn.z, "actual" = h.test.labels)## actual
## predicted 0 1
## 0 18 7
## 1 9 27
Using the confusion matrix, we could calculate the recall value as below:
paste("Recall: ", round((27/(27+7)) * 100,2), "%", sep="")## [1] "Recall: 79.41%"
For this dataset, We get a better result of KNN using zscore normalization