Objective

This data set dates from 1988 and consists of four databases: Cleveland, Hungary, Switzerland, and Long Beach V. It contains 76 attributes, including the predicted attribute, but all published experiments refer to using a subset of 14 of them. The “target” field refers to the presence of heart disease in the patient. It is integer valued 0 = no disease and 1 = disease.

Attribute Information

  1. age
  2. sex
  3. chest pain type (4 values)
  4. resting blood pressure
  5. serum cholestoral in mg/dl
  6. fasting blood sugar > 120 mg/dl
  7. resting electrocardiographic results (values 0,1,2)
  8. maximum heart rate achieved
  9. exercise induced angina
  10. oldpeak = ST depression induced by exercise relative to rest
  11. the slope of the peak exercise ST segment
  12. number of major vessels (0-3) colored by flourosopy
  13. thal: 0 = normal; 1 = fixed defect; 2 = reversable defect The names and social security numbers of the patients were recently removed from the database, replaced with dummy values.

Library

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(gtools)
## Warning: package 'gtools' was built under R version 4.2.2
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.2.2
library(ggplot2)
library(class)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(GGally)
## Warning: package 'GGally' was built under R version 4.2.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(tibble)

Load Data

heart <- read.csv("data_input/heart.csv")
head(heart)
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  52   1  0      125  212   0       1     168     0     1.0     2  2    3
## 2  53   1  0      140  203   1       0     155     1     3.1     0  0    3
## 3  70   1  0      145  174   0       1     125     1     2.6     0  0    3
## 4  61   1  0      148  203   0       1     161     0     0.0     2  1    3
## 5  62   0  0      138  294   1       1     106     0     1.9     1  3    2
## 6  58   0  0      100  248   0       0     122     0     1.0     1  0    2
##   target
## 1      0
## 2      0
## 3      0
## 4      0
## 5      0
## 6      1
str(heart)
## 'data.frame':    1025 obs. of  14 variables:
##  $ age     : int  52 53 70 61 62 58 58 55 46 54 ...
##  $ sex     : int  1 1 1 1 0 0 1 1 1 1 ...
##  $ cp      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trestbps: int  125 140 145 148 138 100 114 160 120 122 ...
##  $ chol    : int  212 203 174 203 294 248 318 289 249 286 ...
##  $ fbs     : int  0 1 0 0 1 0 0 0 0 0 ...
##  $ restecg : int  1 0 1 1 1 0 2 0 0 0 ...
##  $ thalach : int  168 155 125 161 106 122 140 145 144 116 ...
##  $ exang   : int  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
##  $ slope   : int  2 0 0 2 1 1 0 1 2 1 ...
##  $ ca      : int  2 0 0 1 3 0 3 1 0 2 ...
##  $ thal    : int  3 3 3 3 2 2 1 3 3 2 ...
##  $ target  : int  0 0 0 0 0 1 0 0 0 0 ...

preparation

check missing value

colSums(is.na(heart))
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0
summary(heart)
##       age             sex               cp            trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :0.0000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :1.0000   Median :130.0  
##  Mean   :54.43   Mean   :0.6956   Mean   :0.9424   Mean   :131.6  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:2.0000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :3.0000   Max.   :200.0  
##       chol          fbs            restecg          thalach     
##  Min.   :126   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:132.0  
##  Median :240   Median :0.0000   Median :1.0000   Median :152.0  
##  Mean   :246   Mean   :0.1493   Mean   :0.5298   Mean   :149.1  
##  3rd Qu.:275   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:166.0  
##  Max.   :564   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak          slope             ca        
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.800   Median :1.000   Median :0.0000  
##  Mean   :0.3366   Mean   :1.072   Mean   :1.385   Mean   :0.7541  
##  3rd Qu.:1.0000   3rd Qu.:1.800   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.200   Max.   :2.000   Max.   :4.0000  
##       thal           target      
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:0.0000  
##  Median :2.000   Median :1.0000  
##  Mean   :2.324   Mean   :0.5132  
##  3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :3.000   Max.   :1.0000
heart <- heart %>% 
  mutate(sex = factor(sex, levels = c(0,1), labels = c("female","male")),
         fbs = factor(fbs, levels = c(0,1), labels = c("false","true")),
         exang = factor(exang, levels = c(0,1), labels = c("no","yes")),
         target = factor(target, levels = c(0,1), 
                        labels = c("sehat","sakit")))
heart[,c("cp","restecg", "slope", "ca", "thal")] <- lapply(heart[,c("cp","restecg", "slope", "ca", "thal")], as.factor)
str(heart)
## 'data.frame':    1025 obs. of  14 variables:
##  $ age     : int  52 53 70 61 62 58 58 55 46 54 ...
##  $ sex     : Factor w/ 2 levels "female","male": 2 2 2 2 1 1 2 2 2 2 ...
##  $ cp      : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ trestbps: int  125 140 145 148 138 100 114 160 120 122 ...
##  $ chol    : int  212 203 174 203 294 248 318 289 249 286 ...
##  $ fbs     : Factor w/ 2 levels "false","true": 1 2 1 1 2 1 1 1 1 1 ...
##  $ restecg : Factor w/ 3 levels "0","1","2": 2 1 2 2 2 1 3 1 1 1 ...
##  $ thalach : int  168 155 125 161 106 122 140 145 144 116 ...
##  $ exang   : Factor w/ 2 levels "no","yes": 1 2 2 1 1 1 1 2 1 2 ...
##  $ oldpeak : num  1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
##  $ slope   : Factor w/ 3 levels "0","1","2": 3 1 1 3 2 2 1 2 3 2 ...
##  $ ca      : Factor w/ 5 levels "0","1","2","3",..: 3 1 1 2 4 1 4 2 1 3 ...
##  $ thal    : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 3 3 2 4 4 3 ...
##  $ target  : Factor w/ 2 levels "sehat","sakit": 1 1 1 1 1 2 1 1 1 1 ...

Exploratory Data Analysis

ggcorr(heart, label = TRUE, label_size = 2.9, hjust = 1, layout.exp = 2)
## Warning in ggcorr(heart, label = TRUE, label_size = 2.9, hjust = 1, layout.exp
## = 2): data in column(s) 'sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca',
## 'thal', 'target' are not numeric and were ignored

prop.table(table(heart$target))
## 
##     sehat     sakit 
## 0.4868293 0.5131707
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(100)
index <- sample(nrow(heart), nrow(heart)*0.85)
train <- heart[index,]
test <- heart[-index,]
prop.table(table(train$target))
## 
##    sehat    sakit 
## 0.489093 0.510907
prop.table(table(test$target))
## 
##    sehat    sakit 
## 0.474026 0.525974

Modelling

Logistic Regression

set.seed(205)
idx <- sample(nrow(heart), nrow(heart)*0.7)
heart.train <- heart[idx,]
heart.test <- heart[-idx,]
model.full <- glm(target ~ ., heart.train, family = "binomial")
model.log <- step(model.full, direction = "backward")
## Start:  AIC=454.76
## target ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + 
##     exang + oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - restecg   2   410.77 452.77
## - age       1   410.66 454.66
## - fbs       1   410.73 454.73
## <none>          408.76 454.76
## - chol      1   413.66 457.66
## - oldpeak   1   414.76 458.76
## - exang     1   416.44 460.44
## - thalach   1   418.53 462.53
## - trestbps  1   421.26 465.26
## - slope     2   429.47 471.47
## - thal      3   440.02 480.02
## - sex       1   443.78 487.78
## - cp        3   466.81 506.81
## - ca        4   495.08 533.08
## 
## Step:  AIC=452.77
## target ~ age + sex + cp + trestbps + chol + fbs + thalach + exang + 
##     oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - age       1   412.38 452.38
## <none>          410.77 452.77
## - fbs       1   413.05 453.05
## - chol      1   416.99 456.99
## - oldpeak   1   417.06 457.06
## - exang     1   418.65 458.65
## - thalach   1   420.49 460.49
## - trestbps  1   425.65 465.65
## - slope     2   433.22 471.22
## - thal      3   440.88 476.88
## - sex       1   447.16 487.16
## - cp        3   468.60 504.60
## - ca        4   499.36 533.36
## 
## Step:  AIC=452.38
## target ~ sex + cp + trestbps + chol + fbs + thalach + exang + 
##     oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## <none>          412.38 452.38
## - fbs       1   414.63 452.63
## - chol      1   417.82 455.82
## - oldpeak   1   419.91 457.91
## - thalach   1   420.49 458.49
## - exang     1   420.77 458.77
## - trestbps  1   425.74 463.74
## - slope     2   433.70 469.70
## - thal      3   442.54 476.54
## - sex       1   449.37 487.37
## - cp        3   472.80 506.80
## - ca        4   501.56 533.56
summary(model.log)
## 
## Call:
## glm(formula = target ~ sex + cp + trestbps + chol + fbs + thalach + 
##     exang + oldpeak + slope + ca + thal, family = "binomial", 
##     data = heart.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8280  -0.2855   0.0687   0.4336   3.3873  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.552726   2.462031   0.631 0.528257    
## sexmale     -2.245110   0.403341  -5.566 2.60e-08 ***
## cp1          1.219251   0.394198   3.093 0.001982 ** 
## cp2          2.081339   0.348960   5.964 2.46e-09 ***
## cp3          2.729114   0.454451   6.005 1.91e-09 ***
## trestbps    -0.026051   0.007377  -3.531 0.000413 ***
## chol        -0.006347   0.002657  -2.388 0.016927 *  
## fbstrue      0.600131   0.403079   1.489 0.136522    
## thalach      0.020955   0.007697   2.722 0.006483 ** 
## exangyes    -0.881365   0.304872  -2.891 0.003841 ** 
## oldpeak     -0.408690   0.152404  -2.682 0.007326 ** 
## slope1      -0.254781   0.583910  -0.436 0.662593    
## slope2       1.228134   0.620207   1.980 0.047681 *  
## ca1         -2.259030   0.348038  -6.491 8.54e-11 ***
## ca2         -3.215414   0.535018  -6.010 1.86e-09 ***
## ca3         -2.535733   0.720520  -3.519 0.000433 ***
## ca4          1.978437   1.284920   1.540 0.123625    
## thal1        3.103312   2.011406   1.543 0.122865    
## thal2        2.533745   1.965991   1.289 0.197472    
## thal3        1.238946   1.967748   0.630 0.528939    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.24  on 716  degrees of freedom
## Residual deviance: 412.38  on 697  degrees of freedom
## AIC: 452.38
## 
## Number of Fisher Scoring iterations: 6
as.data.frame(exp(model.log$coefficients))
##             exp(model.log$coefficients)
## (Intercept)                  4.72433000
## sexmale                      0.10591585
## cp1                          3.38465031
## cp2                          8.01519766
## cp3                         15.31931008
## trestbps                     0.97428563
## chol                         0.99367356
## fbstrue                      1.82235739
## thalach                      1.02117567
## exangyes                     0.41421699
## oldpeak                      0.66452032
## slope1                       0.77508648
## slope2                       3.41485100
## ca1                          0.10445175
## ca2                          0.04013870
## ca3                          0.07920364
## ca4                          7.23143323
## thal1                       22.27159838
## thal2                       12.60061258
## thal3                        3.45197288
pred.log <- predict(model.log, heart.test, type = "response")
heart.test$pred.log.label <- as.factor(ifelse(pred.log >= 0.4, "sakit", "sehat"))
confusionMatrix(heart.test$pred.log.label, heart.test$target, positive = "sakit")
## Warning in confusionMatrix.default(heart.test$pred.log.label,
## heart.test$target, : Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction sehat sakit
##      sehat   118    13
##      sakit    34   143
##                                           
##                Accuracy : 0.8474          
##                  95% CI : (0.8023, 0.8857)
##     No Information Rate : 0.5065          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6942          
##                                           
##  Mcnemar's Test P-Value : 0.003531        
##                                           
##             Sensitivity : 0.9167          
##             Specificity : 0.7763          
##          Pos Pred Value : 0.8079          
##          Neg Pred Value : 0.9008          
##              Prevalence : 0.5065          
##          Detection Rate : 0.4643          
##    Detection Prevalence : 0.5747          
##       Balanced Accuracy : 0.8465          
##                                           
##        'Positive' Class : sakit           
## 

KNN

set.seed(205)
idx <- sample(nrow(heart), nrow(heart)*0.7)
knn.train <- heart[idx,]
knn.test <- heart[-idx,]
knn.train.z <- as.data.frame(lapply(knn.train[,c(4,5,8,10)],scale))
knn.test.z <- as.data.frame(lapply(knn.test[,c(4,5,8,10)],scale))
sqrt(nrow(knn.test))
## [1] 17.54993
knn.pred <- knn (train = knn.train.z,
                      test = knn.test.z,
                      cl = knn.train$target,
                      k = 9)
knn.test$knn.pred <- knn.pred
confusionMatrix(knn.test$knn.pred, knn.test$target, positive = "sakit")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction sehat sakit
##      sehat   109    38
##      sakit    43   118
##                                           
##                Accuracy : 0.737           
##                  95% CI : (0.6841, 0.7853)
##     No Information Rate : 0.5065          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.4737          
##                                           
##  Mcnemar's Test P-Value : 0.6567          
##                                           
##             Sensitivity : 0.7564          
##             Specificity : 0.7171          
##          Pos Pred Value : 0.7329          
##          Neg Pred Value : 0.7415          
##              Prevalence : 0.5065          
##          Detection Rate : 0.3831          
##    Detection Prevalence : 0.5227          
##       Balanced Accuracy : 0.7368          
##                                           
##        'Positive' Class : sakit           
## 

Conclusion

When viewed from the results of the accuracy metric that has the highest value is the logistic regression model of 84.74%. while the k-NN method only gets an accuracy value of 73%