This data set dates from 1988 and consists of four databases: Cleveland, Hungary, Switzerland, and Long Beach V. It contains 76 attributes, including the predicted attribute, but all published experiments refer to using a subset of 14 of them. The “target” field refers to the presence of heart disease in the patient. It is integer valued 0 = no disease and 1 = disease.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(gtools)
## Warning: package 'gtools' was built under R version 4.2.2
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.2.2
library(ggplot2)
library(class)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(GGally)
## Warning: package 'GGally' was built under R version 4.2.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(tibble)
heart <- read.csv("data_input/heart.csv")
head(heart)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 52 1 0 125 212 0 1 168 0 1.0 2 2 3
## 2 53 1 0 140 203 1 0 155 1 3.1 0 0 3
## 3 70 1 0 145 174 0 1 125 1 2.6 0 0 3
## 4 61 1 0 148 203 0 1 161 0 0.0 2 1 3
## 5 62 0 0 138 294 1 1 106 0 1.9 1 3 2
## 6 58 0 0 100 248 0 0 122 0 1.0 1 0 2
## target
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 1
str(heart)
## 'data.frame': 1025 obs. of 14 variables:
## $ age : int 52 53 70 61 62 58 58 55 46 54 ...
## $ sex : int 1 1 1 1 0 0 1 1 1 1 ...
## $ cp : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trestbps: int 125 140 145 148 138 100 114 160 120 122 ...
## $ chol : int 212 203 174 203 294 248 318 289 249 286 ...
## $ fbs : int 0 1 0 0 1 0 0 0 0 0 ...
## $ restecg : int 1 0 1 1 1 0 2 0 0 0 ...
## $ thalach : int 168 155 125 161 106 122 140 145 144 116 ...
## $ exang : int 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
## $ slope : int 2 0 0 2 1 1 0 1 2 1 ...
## $ ca : int 2 0 0 1 3 0 3 1 0 2 ...
## $ thal : int 3 3 3 3 2 2 1 3 3 2 ...
## $ target : int 0 0 0 0 0 1 0 0 0 0 ...
colSums(is.na(heart))
## age sex cp trestbps chol fbs restecg thalach
## 0 0 0 0 0 0 0 0
## exang oldpeak slope ca thal target
## 0 0 0 0 0 0
summary(heart)
## age sex cp trestbps
## Min. :29.00 Min. :0.0000 Min. :0.0000 Min. : 94.0
## 1st Qu.:48.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:120.0
## Median :56.00 Median :1.0000 Median :1.0000 Median :130.0
## Mean :54.43 Mean :0.6956 Mean :0.9424 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.0000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :3.0000 Max. :200.0
## chol fbs restecg thalach
## Min. :126 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:132.0
## Median :240 Median :0.0000 Median :1.0000 Median :152.0
## Mean :246 Mean :0.1493 Mean :0.5298 Mean :149.1
## 3rd Qu.:275 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0
## Max. :564 Max. :1.0000 Max. :2.0000 Max. :202.0
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.800 Median :1.000 Median :0.0000
## Mean :0.3366 Mean :1.072 Mean :1.385 Mean :0.7541
## 3rd Qu.:1.0000 3rd Qu.:1.800 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.200 Max. :2.000 Max. :4.0000
## thal target
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :1.0000
## Mean :2.324 Mean :0.5132
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
heart <- heart %>%
mutate(sex = factor(sex, levels = c(0,1), labels = c("female","male")),
fbs = factor(fbs, levels = c(0,1), labels = c("false","true")),
exang = factor(exang, levels = c(0,1), labels = c("no","yes")),
target = factor(target, levels = c(0,1),
labels = c("sehat","sakit")))
heart[,c("cp","restecg", "slope", "ca", "thal")] <- lapply(heart[,c("cp","restecg", "slope", "ca", "thal")], as.factor)
str(heart)
## 'data.frame': 1025 obs. of 14 variables:
## $ age : int 52 53 70 61 62 58 58 55 46 54 ...
## $ sex : Factor w/ 2 levels "female","male": 2 2 2 2 1 1 2 2 2 2 ...
## $ cp : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ trestbps: int 125 140 145 148 138 100 114 160 120 122 ...
## $ chol : int 212 203 174 203 294 248 318 289 249 286 ...
## $ fbs : Factor w/ 2 levels "false","true": 1 2 1 1 2 1 1 1 1 1 ...
## $ restecg : Factor w/ 3 levels "0","1","2": 2 1 2 2 2 1 3 1 1 1 ...
## $ thalach : int 168 155 125 161 106 122 140 145 144 116 ...
## $ exang : Factor w/ 2 levels "no","yes": 1 2 2 1 1 1 1 2 1 2 ...
## $ oldpeak : num 1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
## $ slope : Factor w/ 3 levels "0","1","2": 3 1 1 3 2 2 1 2 3 2 ...
## $ ca : Factor w/ 5 levels "0","1","2","3",..: 3 1 1 2 4 1 4 2 1 3 ...
## $ thal : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 3 3 2 4 4 3 ...
## $ target : Factor w/ 2 levels "sehat","sakit": 1 1 1 1 1 2 1 1 1 1 ...
ggcorr(heart, label = TRUE, label_size = 2.9, hjust = 1, layout.exp = 2)
## Warning in ggcorr(heart, label = TRUE, label_size = 2.9, hjust = 1, layout.exp
## = 2): data in column(s) 'sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca',
## 'thal', 'target' are not numeric and were ignored
prop.table(table(heart$target))
##
## sehat sakit
## 0.4868293 0.5131707
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(100)
index <- sample(nrow(heart), nrow(heart)*0.85)
train <- heart[index,]
test <- heart[-index,]
prop.table(table(train$target))
##
## sehat sakit
## 0.489093 0.510907
prop.table(table(test$target))
##
## sehat sakit
## 0.474026 0.525974
set.seed(205)
idx <- sample(nrow(heart), nrow(heart)*0.7)
heart.train <- heart[idx,]
heart.test <- heart[-idx,]
model.full <- glm(target ~ ., heart.train, family = "binomial")
model.log <- step(model.full, direction = "backward")
## Start: AIC=454.76
## target ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach +
## exang + oldpeak + slope + ca + thal
##
## Df Deviance AIC
## - restecg 2 410.77 452.77
## - age 1 410.66 454.66
## - fbs 1 410.73 454.73
## <none> 408.76 454.76
## - chol 1 413.66 457.66
## - oldpeak 1 414.76 458.76
## - exang 1 416.44 460.44
## - thalach 1 418.53 462.53
## - trestbps 1 421.26 465.26
## - slope 2 429.47 471.47
## - thal 3 440.02 480.02
## - sex 1 443.78 487.78
## - cp 3 466.81 506.81
## - ca 4 495.08 533.08
##
## Step: AIC=452.77
## target ~ age + sex + cp + trestbps + chol + fbs + thalach + exang +
## oldpeak + slope + ca + thal
##
## Df Deviance AIC
## - age 1 412.38 452.38
## <none> 410.77 452.77
## - fbs 1 413.05 453.05
## - chol 1 416.99 456.99
## - oldpeak 1 417.06 457.06
## - exang 1 418.65 458.65
## - thalach 1 420.49 460.49
## - trestbps 1 425.65 465.65
## - slope 2 433.22 471.22
## - thal 3 440.88 476.88
## - sex 1 447.16 487.16
## - cp 3 468.60 504.60
## - ca 4 499.36 533.36
##
## Step: AIC=452.38
## target ~ sex + cp + trestbps + chol + fbs + thalach + exang +
## oldpeak + slope + ca + thal
##
## Df Deviance AIC
## <none> 412.38 452.38
## - fbs 1 414.63 452.63
## - chol 1 417.82 455.82
## - oldpeak 1 419.91 457.91
## - thalach 1 420.49 458.49
## - exang 1 420.77 458.77
## - trestbps 1 425.74 463.74
## - slope 2 433.70 469.70
## - thal 3 442.54 476.54
## - sex 1 449.37 487.37
## - cp 3 472.80 506.80
## - ca 4 501.56 533.56
summary(model.log)
##
## Call:
## glm(formula = target ~ sex + cp + trestbps + chol + fbs + thalach +
## exang + oldpeak + slope + ca + thal, family = "binomial",
## data = heart.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8280 -0.2855 0.0687 0.4336 3.3873
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.552726 2.462031 0.631 0.528257
## sexmale -2.245110 0.403341 -5.566 2.60e-08 ***
## cp1 1.219251 0.394198 3.093 0.001982 **
## cp2 2.081339 0.348960 5.964 2.46e-09 ***
## cp3 2.729114 0.454451 6.005 1.91e-09 ***
## trestbps -0.026051 0.007377 -3.531 0.000413 ***
## chol -0.006347 0.002657 -2.388 0.016927 *
## fbstrue 0.600131 0.403079 1.489 0.136522
## thalach 0.020955 0.007697 2.722 0.006483 **
## exangyes -0.881365 0.304872 -2.891 0.003841 **
## oldpeak -0.408690 0.152404 -2.682 0.007326 **
## slope1 -0.254781 0.583910 -0.436 0.662593
## slope2 1.228134 0.620207 1.980 0.047681 *
## ca1 -2.259030 0.348038 -6.491 8.54e-11 ***
## ca2 -3.215414 0.535018 -6.010 1.86e-09 ***
## ca3 -2.535733 0.720520 -3.519 0.000433 ***
## ca4 1.978437 1.284920 1.540 0.123625
## thal1 3.103312 2.011406 1.543 0.122865
## thal2 2.533745 1.965991 1.289 0.197472
## thal3 1.238946 1.967748 0.630 0.528939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.24 on 716 degrees of freedom
## Residual deviance: 412.38 on 697 degrees of freedom
## AIC: 452.38
##
## Number of Fisher Scoring iterations: 6
as.data.frame(exp(model.log$coefficients))
## exp(model.log$coefficients)
## (Intercept) 4.72433000
## sexmale 0.10591585
## cp1 3.38465031
## cp2 8.01519766
## cp3 15.31931008
## trestbps 0.97428563
## chol 0.99367356
## fbstrue 1.82235739
## thalach 1.02117567
## exangyes 0.41421699
## oldpeak 0.66452032
## slope1 0.77508648
## slope2 3.41485100
## ca1 0.10445175
## ca2 0.04013870
## ca3 0.07920364
## ca4 7.23143323
## thal1 22.27159838
## thal2 12.60061258
## thal3 3.45197288
pred.log <- predict(model.log, heart.test, type = "response")
heart.test$pred.log.label <- as.factor(ifelse(pred.log >= 0.4, "sakit", "sehat"))
confusionMatrix(heart.test$pred.log.label, heart.test$target, positive = "sakit")
## Warning in confusionMatrix.default(heart.test$pred.log.label,
## heart.test$target, : Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction sehat sakit
## sehat 118 13
## sakit 34 143
##
## Accuracy : 0.8474
## 95% CI : (0.8023, 0.8857)
## No Information Rate : 0.5065
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6942
##
## Mcnemar's Test P-Value : 0.003531
##
## Sensitivity : 0.9167
## Specificity : 0.7763
## Pos Pred Value : 0.8079
## Neg Pred Value : 0.9008
## Prevalence : 0.5065
## Detection Rate : 0.4643
## Detection Prevalence : 0.5747
## Balanced Accuracy : 0.8465
##
## 'Positive' Class : sakit
##
set.seed(205)
idx <- sample(nrow(heart), nrow(heart)*0.7)
knn.train <- heart[idx,]
knn.test <- heart[-idx,]
knn.train.z <- as.data.frame(lapply(knn.train[,c(4,5,8,10)],scale))
knn.test.z <- as.data.frame(lapply(knn.test[,c(4,5,8,10)],scale))
sqrt(nrow(knn.test))
## [1] 17.54993
knn.pred <- knn (train = knn.train.z,
test = knn.test.z,
cl = knn.train$target,
k = 9)
knn.test$knn.pred <- knn.pred
confusionMatrix(knn.test$knn.pred, knn.test$target, positive = "sakit")
## Confusion Matrix and Statistics
##
## Reference
## Prediction sehat sakit
## sehat 109 38
## sakit 43 118
##
## Accuracy : 0.737
## 95% CI : (0.6841, 0.7853)
## No Information Rate : 0.5065
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.4737
##
## Mcnemar's Test P-Value : 0.6567
##
## Sensitivity : 0.7564
## Specificity : 0.7171
## Pos Pred Value : 0.7329
## Neg Pred Value : 0.7415
## Prevalence : 0.5065
## Detection Rate : 0.3831
## Detection Prevalence : 0.5227
## Balanced Accuracy : 0.7368
##
## 'Positive' Class : sakit
##
When viewed from the results of the accuracy metric that has the highest value is the logistic regression model of 84.74%. while the k-NN method only gets an accuracy value of 73%