Introduction

Disini kita akan menggunakan model regresi logistik dan K-Nearest Neighbor (K-NN) untuk memprediksi adanya penyakit jantung pada pasien, yang merupakan variabel target dengan 13 atribut, yaitu:

age: in years

sex: male or female

chest pain type (4 values)

resting blood pressure: detect blood pressure during rest (in mmHg)

serum cholestoral in mg/dl

fasting blood sugar > 120 mg/dl

resting electrocardiographic results (values 0,1,2)

maximum heart rate achieved

exercise induced angina

oldpeak = ST depression induced by exercise relative to rest

the slope of the peak exercise ST segment

number of major vessels (0-3) colored by fluoroscopy

thal: 3 = No Thalassemia; 6 = Fixed Defect Thalassemia; 7 = Reversible Defect Thalassemia

# Load the heart-disease data set from disk and preview its first rows
heart <- read.csv(file = "data_input/heart.csv")
head(x = heart)

Exploratory Data Analysis

# Count the missing values in every column
vapply(heart, function(column) sum(is.na(column)), numeric(1))

##   ï..age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0

# Recode the numeric category codes into readable labels and turn every
# labelled column into a factor.
#
# The first column is renamed by position: its name depends on how the
# byte-order mark at the start of the CSV was decoded (it appears as "ï..age"
# in the output above), so matching on that literal name is not portable.
heart <- heart %>%
  rename(age = 1) %>%
  mutate(
    sex = ifelse(sex == 1, "Male", "Female"),
    fbs = ifelse(fbs == 1, "> 120 mg/dl", "< 120 mg/dl"),
    exang = ifelse(
      exang == 1,
      "Exercise Induced Angina",
      "No Exercise Induced Angina"
    ),
    cp = ifelse(
      cp == 0, "Chest Pain Type 0",
      ifelse(
        cp == 1, "Chest Pain Type 1",
        ifelse(cp == 2, "Chest Pain Type 2", "Chest Pain Type 3")
      )
    ),
    restecg = ifelse(
      restecg == 0, "Normal",
      ifelse(restecg == 1, "Abnormality", "Probable or Definite")
    ),
    thal = ifelse(
      thal == 0, "No Thalassemia",
      ifelse(
        thal == 1, "Normal Thalassemia",
        ifelse(
          thal == 2,
          "Fixed Defect Thalassemia",
          "Reversible Defect Thalassemia"
        )
      )
    ),
    target = ifelse(target == 0, "Healthy", "Heart Disease"),
    slope = ifelse(
      slope == 0, "Peak Excercise ST Slope 0",
      ifelse(
        slope == 1,
        "Peak Excercise ST Slope 1",
        "Peak Excercise ST Slope 2"
      )
    )
  ) %>%
  # character -> factor; as.factor() orders the levels alphabetically
  mutate_if(is.character, as.factor)
  

# Preview the first rows of the recoded data frame
head(heart)

# heart <- heart %>% 
#   mutate_if(is.integer, as.factor) %>% 
#   mutate(sex = factor(sex, levels = c(0,1), labels = c("Female", "Male")),
#          fbs =factor(fbs, levels = c(0,1), labels = c("False", "True")),
#          exang = factor(exang, levels = c(0,1), labels = c("No", "Yes")),
#          target = factor(target, levels = c(0,1), 
#                         labels = c("Health", "Not Health")))
# str(heart)

Age

# Age distribution per diagnosis: overlaid, semi-transparent density curves.
# Rows with a missing target or age are dropped before plotting.
ta <- heart %>%
  subset(!is.na(target) & !is.na(age)) %>%
  ggplot(mapping = aes(x = age, fill = target)) +
  geom_density(mapping = aes(fill = factor(target)), alpha = 0.5) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  labs(title = "target density and Age") +
  theme_grey()
ta

Sex

# Patient counts per diagnosis, coloured by sex, with one panel per sex.
# The facet is given as a formula (the documented form) rather than as a raw
# vector, and the x-axis label now names the variable that is plotted.
sex <- ggplot(data = heart, mapping = aes(x = target)) +
  geom_bar(mapping = aes(fill = sex)) +
  facet_wrap(~sex) +
  labs(
    title = "Distribution of Patient by gender",
    x = "Diagnosis",
    y = "Number of patient"
  ) +
  theme_grey()
ggplotly(sex)

## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Chest Pain

# Patient counts per sex, coloured by diagnosis, with one panel per chest pain
# type. Title and axis labels describe this data set (the previous ones were
# left over from an unrelated analysis); the facet is given as a formula.
cp <- ggplot(data = heart, mapping = aes(x = sex)) +
  geom_bar(mapping = aes(fill = target)) +
  facet_wrap(~cp) +
  labs(
    title = "Distribution of Patients by Chest Pain Type",
    x = "Sex",
    y = "Number of patients"
  ) +
  theme_grey()
ggplotly(cp)

Resting Blood Pressure

# Resting blood pressure: density-scaled histogram with a kernel density
# overlay, one stacked panel per diagnosis.
trestbps <- ggplot(data = heart, mapping = aes(x = trestbps, fill = target)) +
  geom_histogram(mapping = aes(y = ..density..), colour = "grey17") +
  geom_density(fill = "black", alpha = 0.2) +
  facet_wrap(~target, ncol = 1, scales = "fixed") +
  labs(
    x = "Resting Blood Pressure (mmHg)",
    y = "Density/Count",
    title = "Resting Blood Pressure"
  )
ggplotly(trestbps)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cholestoral in mg/dl

# Serum cholesterol: density-scaled histogram with a kernel density overlay,
# one stacked panel per diagnosis.
chol <- ggplot(data = heart, mapping = aes(x = chol, fill = target)) +
  geom_histogram(mapping = aes(y = ..density..), colour = "grey17") +
  geom_density(fill = "black", alpha = 0.2) +
  facet_wrap(~target, ncol = 1, scales = "fixed") +
  labs(
    x = "Cholesterol",
    y = "Density/Count",
    title = "Serum Cholestoral (mg/dl)"
  )
ggplotly(chol)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

fasting blood sugar mg/dl

# Patient counts per fasting-blood-sugar group, one panel per diagnosis.
# Title and axis labels describe this data set (the previous ones were left
# over from an unrelated analysis); the facet is given as a formula.
fbs <- ggplot(data = heart, mapping = aes(x = fbs)) +
  geom_bar(mapping = aes(fill = target)) +
  facet_wrap(~target) +
  labs(
    title = "Distribution of Patients by Fasting Blood Sugar",
    x = "Fasting blood sugar",
    y = "Number of patients"
  ) +
  theme_grey()
ggplotly(fbs)

restecg

# Patient counts per resting ECG result, one panel per diagnosis.
# Title and axis labels describe this data set (the previous ones were left
# over from an unrelated analysis); the facet is given as a formula.
restecg <- ggplot(data = heart, mapping = aes(x = restecg)) +
  geom_bar(mapping = aes(fill = target)) +
  facet_wrap(~target) +
  labs(
    title = "Distribution of Patients by Resting ECG Result",
    x = "Resting ECG result",
    y = "Number of patients"
  ) +
  theme_grey()
ggplotly(restecg)

Maximum Heart Rate

# Maximum heart rate: density-scaled histogram with a kernel density overlay,
# one stacked panel per diagnosis.
thalach <- ggplot(data = heart, mapping = aes(x = thalach, fill = target)) +
  geom_histogram(mapping = aes(y = ..density..), colour = "grey17") +
  geom_density(fill = "black", alpha = 0.2) +
  facet_wrap(~target, ncol = 1, scales = "fixed") +
  labs(
    x = "Maximum Heart Rate Achieved",
    y = "Density/Count",
    title = "Maximum Heart Rate Achieved"
  )
ggplotly(thalach)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise Induced Angina

# Patient counts per exercise-induced-angina status, one panel per diagnosis.
# Title and x-axis label now name the plotted variable (they previously said
# "gender" / "Marital"); the facet is given as a formula.
exang <- ggplot(data = heart, mapping = aes(x = exang)) +
  geom_bar(mapping = aes(fill = target)) +
  facet_wrap(~target) +
  labs(
    title = "Distribution of Patients by Exercise Induced Angina",
    x = "Exercise induced angina",
    y = "Number of patients"
  ) +
  theme_grey()
ggplotly(exang)

ST Depression

# ST depression (oldpeak): density-scaled histogram with a kernel density
# overlay, one stacked panel per diagnosis, with a relabelled fill legend and
# a centred title.
stdep <- ggplot(data = heart, mapping = aes(x = oldpeak, fill = target)) +
  geom_histogram(mapping = aes(y = ..density..), colour = "grey17") +
  geom_density(fill = "yellow", alpha = 0.2) +
  facet_wrap(~target, ncol = 1, scales = "fixed") +
  labs(
    x = "Oldpeak",
    y = "Density/Count",
    title = "ST depression induced by exercise relative to rest"
  ) +
  scale_fill_discrete(
    name = "Heart Disease",
    labels = c("Absence", "Presence")
  ) +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(stdep)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Multivariables

# Age against ST depression, coloured by sex and sized by the number of major
# vessels, one stacked panel per diagnosis.
# The fill scale was removed because no layer maps `fill`, so it had no
# effect; a title was added so the centred-title theme setting applies.
ggplot(heart, aes(x = age, y = oldpeak, color = sex, size = ca)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~target, ncol = 1, scales = "fixed") +
  labs(
    title = "Age and ST depression by diagnosis",
    x = "Age",
    y = "Oldpeak"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

Slope of The Peak Exercise

# Patient counts per ST-segment slope category, one panel per diagnosis.
# Title and x-axis label now name the plotted variable (they previously said
# "gender" / "Marital"); the facet is given as a formula.
slope <- ggplot(data = heart, mapping = aes(x = slope)) +
  geom_bar(mapping = aes(fill = target)) +
  facet_wrap(~target) +
  labs(
    title = "Distribution of Patients by Slope of the Peak Exercise ST Segment",
    x = "Slope of the peak exercise ST segment",
    y = "Number of patients"
  ) +
  theme_grey()
ggplotly(slope)

Number of Major Vessels

# Patient counts per number of major vessels, one panel per diagnosis.
# Title and x-axis label now name the plotted variable (they previously said
# "gender" / "Marital"); the facet is given as a formula.
ca <- ggplot(data = heart, mapping = aes(x = ca)) +
  geom_bar(mapping = aes(fill = target)) +
  facet_wrap(~target) +
  labs(
    title = "Distribution of Patients by Number of Major Vessels",
    x = "Number of major vessels",
    y = "Number of patients"
  ) +
  theme_grey()
ggplotly(ca)

Thalassemia

# Patient counts per thalassemia category, one panel per diagnosis.
# Title and x-axis label now name the plotted variable (they previously said
# "gender" / "Marital"); the facet is given as a formula.
thal <- ggplot(data = heart, mapping = aes(x = thal)) +
  geom_bar(mapping = aes(fill = target)) +
  facet_wrap(~target) +
  labs(
    title = "Distribution of Patients by Thalassemia",
    x = "Thalassemia",
    y = "Number of patients"
  ) +
  theme_grey()
ggplotly(thal)

Model

# inspectdf::inspect_cat(heart$target), show_plot = T)

# Check the class balance: share of observations in each target class
prop.table(table(heart$target))

## 
##       Healthy Heart Disease 
##     0.4554455     0.5445545

Logistic Regression

# Train/test split: draw 75% of the row indices at random for training and
# keep the remaining rows for testing. The fixed seed makes the draw
# reproducible.
set.seed(303)
id <- sample(x = nrow(heart), size = nrow(heart) * 0.75)
heart_train <- heart[id, ]
heart_test <- heart[-id, ]

# Inspect the correlation between the numeric predictors.
# The numeric columns are selected explicitly: the previous `heart[, -120]`
# referred to a column that does not exist (so it selected everything) and
# left ggcorr() to warn about and drop the factor columns.
ggcorr(
  Filter(is.numeric, heart),
  hjust = 1,
  layout.exp = 2,
  label = TRUE,
  label_size = 5
)

## Warning in ggcorr(heart[, -120], hjust = 1, layout.exp = 2, label = T,
## label_size = 5): data in column(s) 'sex', 'cp', 'fbs', 'restecg', 'exang',
## 'slope', 'thal', 'target' are not numeric and were ignored

# Full logistic regression: target against every other column of the
# training set
modelall <- glm(formula = target ~ ., family = "binomial", data = heart_train)

summary(modelall)

## 
## Call:
## glm(formula = target ~ ., family = "binomial", data = heart_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7992  -0.3429   0.1546   0.4645   2.8191  
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        2.299211   3.312564   0.694 0.487627    
## age                               -0.007418   0.028227  -0.263 0.792714    
## sexMale                           -1.022120   0.578166  -1.768 0.077083 .  
## cpChest Pain Type 1                0.710026   0.653115   1.087 0.276976    
## cpChest Pain Type 2                1.764556   0.543602   3.246 0.001170 ** 
## cpChest Pain Type 3                1.935198   0.760741   2.544 0.010964 *  
## trestbps                          -0.019759   0.013441  -1.470 0.141540    
## chol                              -0.005541   0.005435  -1.020 0.307925    
## fbs> 120 mg/dl                     0.587834   0.679932   0.865 0.387287    
## restecgNormal                     -0.241410   0.456491  -0.529 0.596917    
## restecgProbable or Definite       -0.142914   2.294098  -0.062 0.950327    
## thalach                            0.019879   0.012566   1.582 0.113650    
## exangNo Exercise Induced Angina    1.108604   0.493709   2.245 0.024739 *  
## oldpeak                           -0.375135   0.261791  -1.433 0.151870    
## slopePeak Excercise ST Slope 1    -0.438271   1.030294  -0.425 0.670556    
## slopePeak Excercise ST Slope 2     0.555018   1.111701   0.499 0.617602    
## ca                                -0.724630   0.262506  -2.760 0.005772 ** 
## thalNo Thalassemia                -1.880193   2.093017  -0.898 0.369017    
## thalNormal Thalassemia            -1.183986   0.961793  -1.231 0.218316    
## thalReversible Defect Thalassemia -1.619954   0.492001  -3.293 0.000993 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 313.41  on 226  degrees of freedom
## Residual deviance: 148.39  on 207  degrees of freedom
## AIC: 188.39
## 
## Number of Fisher Scoring iterations: 6

# Backward stepwise selection by AIC, starting from the full model
model.b <- stepAIC(object = modelall, direction = "backward")

## Start:  AIC=188.39
## target ~ age + sex + cp + trestbps + chol + fbs + restecg + thalach + 
##     exang + oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - restecg   2   148.68 184.68
## - age       1   148.46 186.46
## - fbs       1   149.16 187.16
## - chol      1   149.44 187.44
## - slope     2   152.12 188.12
## <none>          148.40 188.40
## - oldpeak   1   150.54 188.54
## - trestbps  1   150.62 188.62
## - thalach   1   151.01 189.01
## - sex       1   151.63 189.63
## - exang     1   153.46 191.46
## - thal      3   160.24 194.24
## - ca        1   156.26 194.26
## - cp        3   162.75 196.75
## 
## Step:  AIC=184.67
## target ~ age + sex + cp + trestbps + chol + fbs + thalach + exang + 
##     oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - age       1   148.80 182.80
## - fbs       1   149.48 183.48
## - chol      1   149.97 183.97
## <none>          148.68 184.68
## - oldpeak   1   150.79 184.79
## - slope     2   152.88 184.88
## - trestbps  1   150.98 184.98
## - thalach   1   151.30 185.30
## - sex       1   152.22 186.22
## - exang     1   153.73 187.73
## - thal      3   160.25 190.25
## - ca        1   156.54 190.54
## - cp        3   162.85 192.85
## 
## Step:  AIC=182.8
## target ~ sex + cp + trestbps + chol + fbs + thalach + exang + 
##     oldpeak + slope + ca + thal
## 
##            Df Deviance    AIC
## - fbs       1   149.59 181.59
## - chol      1   150.28 182.28
## <none>          148.80 182.80
## - oldpeak   1   150.84 182.84
## - slope     2   152.97 182.97
## - trestbps  1   151.60 183.60
## - sex       1   152.31 184.31
## - thalach   1   152.44 184.44
## - exang     1   153.91 185.91
## - thal      3   160.40 188.40
## - ca        1   157.76 189.76
## - cp        3   163.07 191.07
## 
## Step:  AIC=181.59
## target ~ sex + cp + trestbps + chol + thalach + exang + oldpeak + 
##     slope + ca + thal
## 
##            Df Deviance    AIC
## - chol      1   150.89 180.89
## <none>          149.59 181.59
## - slope     2   153.66 181.66
## - oldpeak   1   151.75 181.75
## - trestbps  1   151.99 181.99
## - sex       1   152.88 182.88
## - thalach   1   153.18 183.18
## - exang     1   154.27 184.27
## - thal      3   161.20 187.20
## - ca        1   157.84 187.84
## - cp        3   165.46 191.46
## 
## Step:  AIC=180.89
## target ~ sex + cp + trestbps + thalach + exang + oldpeak + slope + 
##     ca + thal
## 
##            Df Deviance    AIC
## <none>          150.89 180.89
## - slope     2   155.31 181.31
## - trestbps  1   153.41 181.41
## - oldpeak   1   153.43 181.43
## - sex       1   153.49 181.49
## - thalach   1   154.41 182.41
## - exang     1   155.24 183.24
## - thal      3   162.41 186.41
## - ca        1   159.56 187.56
## - cp        3   167.37 191.37

# Refit the model reached by the backward search, with its formula spelled out
model.b <- glm(
  formula = target ~ sex + cp + trestbps + thalach + exang + oldpeak + slope +
    ca + thal,
  family = "binomial",
  data = heart_train
)
summary(model.b)

## 
## Call:
## glm(formula = target ~ sex + cp + trestbps + thalach + exang + 
##     oldpeak + slope + ca + thal, family = "binomial", data = heart_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7972  -0.3713   0.1809   0.4796   2.7643  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        0.21948    2.58654   0.085 0.932376    
## sexMale                           -0.87424    0.54828  -1.595 0.110823    
## cpChest Pain Type 1                0.76182    0.64011   1.190 0.233996    
## cpChest Pain Type 2                1.88346    0.53225   3.539 0.000402 ***
## cpChest Pain Type 3                1.97437    0.74872   2.637 0.008364 ** 
## trestbps                          -0.01965    0.01264  -1.555 0.120051    
## thalach                            0.02101    0.01151   1.825 0.067987 .  
## exangNo Exercise Induced Angina    1.00303    0.47989   2.090 0.036607 *  
## oldpeak                           -0.39790    0.25501  -1.560 0.118689    
## slopePeak Excercise ST Slope 1    -0.47540    1.00992  -0.471 0.637831    
## slopePeak Excercise ST Slope 2     0.58853    1.08607   0.542 0.587893    
## ca                                -0.70002    0.24603  -2.845 0.004438 ** 
## thalNo Thalassemia                -1.41522    2.38294  -0.594 0.552582    
## thalNormal Thalassemia            -0.96782    0.92548  -1.046 0.295676    
## thalReversible Defect Thalassemia -1.57991    0.47940  -3.296 0.000982 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 313.41  on 226  degrees of freedom
## Residual deviance: 150.89  on 212  degrees of freedom
## AIC: 180.89
## 
## Number of Fisher Scoring iterations: 6

# Model interpretation: exponentiate the coefficients so they read as odds
# ratios
model.b %>%
  coef() %>%
  exp() %>%
  data.frame()

From the model `model.b`, we can interpret several things (each holding the other predictors constant):

The odds of a male patient being diagnosed with heart disease are about 0.42 times those of a female patient, i.e. roughly 58% lower (exp(-0.874) ≈ 0.42).

Patients without exercise-induced angina have about 2.7 times the odds of heart disease compared with patients who have exercise-induced angina (exp(1.003) ≈ 2.73).

Each one-unit increase in oldpeak multiplies the odds of heart disease by about 0.67, i.e. lowers them by roughly 33% (exp(-0.398) ≈ 0.67).

Evaluation

Preparation: we add the values predicted by the model to our data sets, heart_train and heart_test.

# Store the model's response-scale predictions next to the observations
heart_test[["pred.Target"]] <- predict(
  object = model.b,
  newdata = heart_test,
  type = "response"
)

heart_train[["pred.Target"]] <- predict(
  object = model.b,
  newdata = heart_train,
  type = "response"
)

Tuning Cutoff

# Compute recall, accuracy, precision and specificity of a probability
# classifier at a single cutoff.
#
# Args:
#   cutoff:    probability threshold; observations with prob >= cutoff are
#              labelled as the positive class, all others as the negative one.
#   prob:      numeric vector of predicted probabilities.
#   ref:       factor of true labels, same length as `prob`.
#   postarget: label of the positive class.
#   negtarget: label of the negative class.
#
# Returns:
#   A 1 x 4 numeric matrix with columns "recall", "accuracy", "precicion"
#   and "specificity" (the misspelt column name is kept so existing callers
#   that index by name keep working).
performa <- function(cutoff, prob, ref, postarget, negtarget) {
  # Give the predictions the same level set as the reference. At extreme
  # cutoffs every observation falls into one class, and an unconstrained
  # factor() would then have a single level that does not match `ref`.
  class_levels <- if (is.factor(ref)) levels(ref) else c(negtarget, postarget)
  predicted <- factor(
    ifelse(prob >= cutoff, postarget, negtarget),
    levels = class_levels
  )
  conf <- caret::confusionMatrix(predicted, ref, positive = postarget)

  # Pick the metrics by name rather than by position in the result vectors.
  metrics <- c(
    conf$byClass[["Sensitivity"]],
    conf$overall[["Accuracy"]],
    conf$byClass[["Pos Pred Value"]],
    conf$byClass[["Specificity"]]
  )
  matrix(
    metrics,
    nrow = 1,
    dimnames = list(NULL, c("recall", "accuracy", "precicion", "specificity"))
  )
}

# Evaluate the four metrics on a grid of 100 cutoffs between 0.01 and 0.80.
# Each row of `result` holds recall, accuracy, precision and specificity for
# the matching entry of `co`.
# NOTE(review): the cutoff is tuned on heart_test, the same data used for the
# final evaluation below - consider tuning on training or validation data.
co <- seq(0.01, 0.80, length.out = 100)
result <- matrix(0, nrow = length(co), ncol = 4)

for (i in seq_along(co)) {
  result[i, ] <- performa(
    cutoff = co[i],
    prob = heart_test$pred.Target,
    ref = heart_test$target,
    postarget = "Heart Disease",
    negtarget = "Healthy"
  )
}

# Plot every metric against the cutoff to visualise the trade-off.
# The first four tibble columns are stacked into long format so that each
# metric becomes one coloured line.
(
  tibble(
    "Recall" = result[, 1],
    "Accuracy" = result[, 2],
    "Precision" = result[, 3],
    "Specificity" = result[, 4],
    "Cutoff" = co
  ) %>%
    gather(key = "performa", value = "value", 1:4) %>%
    ggplot(mapping = aes(x = Cutoff, y = value, col = performa)) +
    geom_line(lwd = 1.5) +
    scale_color_manual(values = c("darkred", "darkgreen", "orange", "blue")) +
    scale_x_continuous(breaks = seq(0, 1, 0.1)) +
    scale_y_continuous(breaks = seq(0, 1, 0.1), limits = c(0, 1)) +
    labs(title = "Tradeoff model perfomance") +
    theme_minimal() +
    theme(
      legend.position = "top",
      panel.grid.minor.x = element_blank(),
      panel.grid.minor.y = element_blank()
    )
) %>%
  ggplotly()

Before evaluating the model, we need to choose a cutoff (threshold) that balances accuracy, recall and precision. From the graph, 0.63 gives the best balance.

# Turn the predicted probabilities into class labels at the 0.63 cutoff.
# The levels are fixed explicitly so both classes are always present in the
# factor, even if every prediction falls on one side of the cutoff.
heart_test$pred.Label <- factor(
  ifelse(heart_test$pred.Target < 0.63, "Healthy", "Heart Disease"),
  levels = c("Healthy", "Heart Disease")
)

heart_train$pred.Label <- factor(
  ifelse(heart_train$pred.Target < 0.63, "Healthy", "Heart Disease"),
  levels = c("Healthy", "Heart Disease")
)

Data Training Evaluation

# Confusion matrix of the logistic model on the training data
logtrain <- confusionMatrix(
  data = heart_train$pred.Label,
  reference = heart_train$target,
  positive = "Heart Disease"
)
print(logtrain)

## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Healthy Heart Disease
##   Healthy            96            16
##   Heart Disease       9           106
##                                              
##                Accuracy : 0.8899             
##                  95% CI : (0.8417, 0.9274)   
##     No Information Rate : 0.5374             
##     P-Value [Acc > NIR] : <0.0000000000000002
##                                              
##                   Kappa : 0.7795             
##                                              
##  Mcnemar's Test P-Value : 0.2301             
##                                              
##             Sensitivity : 0.8689             
##             Specificity : 0.9143             
##          Pos Pred Value : 0.9217             
##          Neg Pred Value : 0.8571             
##              Prevalence : 0.5374             
##          Detection Rate : 0.4670             
##    Detection Prevalence : 0.5066             
##       Balanced Accuracy : 0.8916             
##                                              
##        'Positive' Class : Heart Disease      
##

Data Test Evaluation

# Confusion matrix of the logistic model on the test data
logtest <- confusionMatrix(
  data = heart_test$pred.Label,
  reference = heart_test$target,
  positive = "Heart Disease"
)
print(logtest)

## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Healthy Heart Disease
##   Healthy            27             6
##   Heart Disease       6            37
##                                           
##                Accuracy : 0.8421          
##                  95% CI : (0.7404, 0.9157)
##     No Information Rate : 0.5658          
##     P-Value [Acc > NIR] : 0.0000002689    
##                                           
##                   Kappa : 0.6786          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8605          
##             Specificity : 0.8182          
##          Pos Pred Value : 0.8605          
##          Neg Pred Value : 0.8182          
##              Prevalence : 0.5658          
##          Detection Rate : 0.4868          
##    Detection Prevalence : 0.5658          
##       Balanced Accuracy : 0.8393          
##                                           
##        'Positive' Class : Heart Disease   
##

The training data give an accuracy of 88.99%, while the test data give an accuracy of 84.21%. Because the two accuracies are close, we can assume the model is not overfitting.

K-Nearest Neighbour

# Variable transforming: expand every factor column into 0/1 dummy columns
dv <- dummyVars(" ~.", data = heart)
heart2 <- predict(dv, newdata = heart) %>%
  data.frame()

str(heart2)

## 'data.frame':    303 obs. of  28 variables:
##  $ age                               : num  63 37 41 56 57 57 56 44 52 57 ...
##  $ sex.Female                        : num  0 0 1 0 1 0 1 0 0 0 ...
##  $ sex.Male                          : num  1 1 0 1 0 1 0 1 1 1 ...
##  $ cp.Chest.Pain.Type.0              : num  0 0 0 0 1 1 0 0 0 0 ...
##  $ cp.Chest.Pain.Type.1              : num  0 0 1 1 0 0 1 1 0 0 ...
##  $ cp.Chest.Pain.Type.2              : num  0 1 0 0 0 0 0 0 1 1 ...
##  $ cp.Chest.Pain.Type.3              : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ trestbps                          : num  145 130 130 120 120 140 140 120 172 150 ...
##  $ chol                              : num  233 250 204 236 354 192 294 263 199 168 ...
##  $ fbs...120.mg.dl                   : num  0 1 1 1 1 1 1 1 0 1 ...
##  $ fbs...120.mg.dl.1                 : num  1 0 0 0 0 0 0 0 1 0 ...
##  $ restecg.Abnormality               : num  0 1 0 1 1 1 0 1 1 1 ...
##  $ restecg.Normal                    : num  1 0 1 0 0 0 1 0 0 0 ...
##  $ restecg.Probable.or.Definite      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ thalach                           : num  150 187 172 178 163 148 153 173 162 174 ...
##  $ exang.Exercise.Induced.Angina     : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ exang.No.Exercise.Induced.Angina  : num  1 1 1 1 0 1 1 1 1 1 ...
##  $ oldpeak                           : num  2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##  $ slope.Peak.Excercise.ST.Slope.0   : num  1 1 0 0 0 0 0 0 0 0 ...
##  $ slope.Peak.Excercise.ST.Slope.1   : num  0 0 0 0 0 1 1 0 0 0 ...
##  $ slope.Peak.Excercise.ST.Slope.2   : num  0 0 1 1 1 0 0 1 1 1 ...
##  $ ca                                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ thal.Fixed.Defect.Thalassemia     : num  0 1 1 1 1 0 1 0 0 1 ...
##  $ thal.No.Thalassemia               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ thal.Normal.Thalassemia           : num  1 0 0 0 0 1 0 0 0 0 ...
##  $ thal.Reversible.Defect.Thalassemia: num  0 0 0 0 0 0 0 1 1 0 ...
##  $ target.Healthy                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ target.Heart.Disease              : num  1 1 1 1 1 1 1 1 1 1 ...

We need to transform all the variables into numeric class, because KNN only works with numeric variables.

# Drop one dummy column of each two-level variable (and of the target), since
# the remaining column already carries the same information
heart2 <- dplyr::select(
  heart2,
  -target.Healthy,
  -sex.Female,
  -fbs...120.mg.dl.1,
  -exang.Exercise.Induced.Angina
)

names(heart2)

##  [1] "age"                                "sex.Male"                          
##  [3] "cp.Chest.Pain.Type.0"               "cp.Chest.Pain.Type.1"              
##  [5] "cp.Chest.Pain.Type.2"               "cp.Chest.Pain.Type.3"              
##  [7] "trestbps"                           "chol"                              
##  [9] "fbs...120.mg.dl"                    "restecg.Abnormality"               
## [11] "restecg.Normal"                     "restecg.Probable.or.Definite"      
## [13] "thalach"                            "exang.No.Exercise.Induced.Angina"  
## [15] "oldpeak"                            "slope.Peak.Excercise.ST.Slope.0"   
## [17] "slope.Peak.Excercise.ST.Slope.1"    "slope.Peak.Excercise.ST.Slope.2"   
## [19] "ca"                                 "thal.Fixed.Defect.Thalassemia"     
## [21] "thal.No.Thalassemia"                "thal.Normal.Thalassemia"           
## [23] "thal.Reversible.Defect.Thalassemia" "target.Heart.Disease"

#Data Splitting
# Select the label and the predictors by name, not by position. After the
# select above, heart2 has 24 columns: column 23 is
# "thal.Reversible.Defect.Thalassemia" and the target is column 24. The
# previous positional split (`1:22` / `23`) therefore trained KNN to predict a
# thalassemia dummy instead of heart disease.
heart2_train <- heart2[id, names(heart2) != "target.Heart.Disease", drop = FALSE]
heart2_test <- heart2[-id, names(heart2) != "target.Heart.Disease", drop = FALSE]

heart2_train_label <- as.factor(heart2[id, "target.Heart.Disease"])
heart2_test_label <- as.factor(heart2[-id, "target.Heart.Disease"])

Then, we need to scale the data as the scale from each variables are different.

# Standardise every predictor column. The test set is scaled with the
# training set's centre and scale, so no test-set statistics are used.
# All columns are scaled instead of a hard-coded `1:22` range, which would
# silently drop any predictor beyond column 22.
heart2_train <- scale(heart2_train)
heart2_test <- scale(
  heart2_test,
  center = attr(heart2_train, "scaled:center"),
  scale = attr(heart2_train, "scaled:scale")
)

KNN Prediction

# Square root of the number of training rows (k = 15 is used for KNN below)
sqrt(nrow(heart2_train))

## [1] 15.06652

library(class)
# Classify the training rows and the test rows with 15 nearest neighbours,
# both times using the training set as the reference points
pred.knn.train <- knn(
  train = heart2_train,
  test = heart2_train,
  cl = heart2_train_label,
  k = 15
)

pred.knn.test <- knn(
  train = heart2_train,
  test = heart2_test,
  cl = heart2_train_label,
  k = 15
)

KNN Model Evaluation

# Confusion matrix of the KNN predictions on the training data
knn.train <- confusionMatrix(
  data = pred.knn.train,
  reference = heart2_train_label,
  positive = "1"
)
print(knn.train)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 129   8
##          1  11  79
##                                              
##                Accuracy : 0.9163             
##                  95% CI : (0.8724, 0.9489)   
##     No Information Rate : 0.6167             
##     P-Value [Acc > NIR] : <0.0000000000000002
##                                              
##                   Kappa : 0.8241             
##                                              
##  Mcnemar's Test P-Value : 0.6464             
##                                              
##             Sensitivity : 0.9080             
##             Specificity : 0.9214             
##          Pos Pred Value : 0.8778             
##          Neg Pred Value : 0.9416             
##              Prevalence : 0.3833             
##          Detection Rate : 0.3480             
##    Detection Prevalence : 0.3965             
##       Balanced Accuracy : 0.9147             
##                                              
##        'Positive' Class : 1                  
##

# Confusion matrix of the KNN predictions on the test data
knn.test <- confusionMatrix(
  data = pred.knn.test,
  reference = heart2_test_label,
  positive = "1"
)
print(knn.test)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 44  5
##          1  2 25
##                                           
##                Accuracy : 0.9079          
##                  95% CI : (0.8194, 0.9622)
##     No Information Rate : 0.6053          
##     P-Value [Acc > NIR] : 0.000000003453  
##                                           
##                   Kappa : 0.8038          
##                                           
##  Mcnemar's Test P-Value : 0.4497          
##                                           
##             Sensitivity : 0.8333          
##             Specificity : 0.9565          
##          Pos Pred Value : 0.9259          
##          Neg Pred Value : 0.8980          
##              Prevalence : 0.3947          
##          Detection Rate : 0.3289          
##    Detection Prevalence : 0.3553          
##       Balanced Accuracy : 0.8949          
##                                           
##        'Positive' Class : 1               
##

The training data give an accuracy of 91.63%, while the test data give an accuracy of 90.79%. Because the two accuracies are close, we can assume the model is not overfitting.

Conclusion

Logistic Model

# Summary of the logistic model on the test data.
# tibble() replaces the deprecated data_frame(), and the metrics are picked by
# name instead of by position in the confusion-matrix result.
eval_logit <- tibble(
  Accuracy = logtest$overall[["Accuracy"]],
  Recall = logtest$byClass[["Sensitivity"]],
  Precision = logtest$byClass[["Pos Pred Value"]]
)
print(eval_logit)

## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

## # A tibble: 1 x 3
##   Accuracy Recall Precision
##      <dbl>  <dbl>     <dbl>
## 1    0.842  0.860     0.860

K-Nearest Neighbour

# Summary of the KNN model on the test data.
# tibble() replaces the deprecated data_frame(), and the metrics are picked by
# name instead of by position in the confusion-matrix result.
eval_knn <- tibble(
  Accuracy = knn.test$overall[["Accuracy"]],
  Recall = knn.test$byClass[["Sensitivity"]],
  Precision = knn.test$byClass[["Pos Pred Value"]]
)
print(eval_knn)

## # A tibble: 1 x 3
##   Accuracy Recall Precision
##      <dbl>  <dbl>     <dbl>
## 1    0.908  0.833     0.926

Logistic regression gives an accuracy of 84.21%, while the KNN model gives an accuracy of 90.79%. Logistic regression is, however, better at detecting patients who actually have heart disease, with a recall of 86.05% compared with 83.33% for the KNN model.

Logistic Regression and KNN with Heart Disease dataset

Sandy Putra Utama

2021-03-01

Introduction

Exploratory Data Analysis

Age

Sex

Chest Pain

resting blood preasure

cholestoral in mg/dl

fasting blood sugar mg/dl

restecg

Maximum Heart Rate

Exercise Induced Angina

ST Depression

Multivariables`

Slope of The Peak Exercise

Number of Major Vessels

Thallasemia

Model

Logistic Regression

Evaluation

Tuning Cutoff

Data Training Evaluation

Data Test Evaluation

K-Nearest Neighbour

KNN Prediction

KNN Model Evaluation

Conclusion

Logistic Model

K-Nearest Neighbour