Objective

The objective of this model is to classify does a patient has heart disease or not based on several predictors. The predictors that used are:

  1. Age
  2. Sex (Male = 1, Female =2 )
  3. Chest pain type (Asymptomatic = 0, Atypical angina = 1, Non-anginal pain = 2, Typical agine = 3)
  4. Blood pressure (mm Hg)
  5. Serum cholestoral (mg/dl)
  6. Fasting blood sugar
  7. Resting electrocardiographic result (Showing probable = 0, Normal = 1, ST-T wave abnormality = 2)
  8. Maximum heart rate
  9. Exercise induced agina (Yes = 1, No = 0)
  10. ST depression
  11. Slope of peak exercise ST segment
  12. Number of major vessel
  13. Thalium (Normal = 3, Fixed defect = 6, Reversable defect = 7)
  14. Target (0 = Disease, 1 = No disease)

The data and the description are obtained from this link:

  1. https://www.kaggle.com/ronitf/heart-disease-uci/discussion/105877
  2. https://www.kaggle.com/ronitf/heart-disease-uci/discussion/105877

Model Used

The model that use to classify the hearth disease are logistic regression and K-Nearest Neighbor

Logistic Regression

Logistic Regression Model

Data Check

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
db <- read.csv("data_input/heart_db.csv")

str(db)
## 'data.frame':    303 obs. of  14 variables:
##  $ age     : int  63 37 41 56 57 57 56 44 52 57 ...
##  $ sex     : int  1 1 0 1 0 1 0 1 1 1 ...
##  $ cp      : int  3 2 1 1 0 0 1 1 2 2 ...
##  $ trestbps: int  145 130 130 120 120 140 140 120 172 150 ...
##  $ chol    : int  233 250 204 236 354 192 294 263 199 168 ...
##  $ fbs     : int  1 0 0 0 0 0 0 0 1 0 ...
##  $ restecg : int  0 1 0 1 1 1 0 1 1 1 ...
##  $ thalach : int  150 187 172 178 163 148 153 173 162 174 ...
##  $ exang   : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ oldpeak : num  2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##  $ slope   : int  0 0 2 2 2 1 1 2 2 2 ...
##  $ ca      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ thal    : int  1 2 2 2 2 1 2 3 3 2 ...
##  $ target  : int  1 1 1 1 1 1 1 1 1 1 ...
db_wrang <- db %>%
  mutate(sex = as.factor(sex),
         cp = as.factor(cp),
         restecg = as.factor(restecg),
         exang = as.factor(exang),
         thal = as.factor(thal),
         target = as.factor(target))

colSums(is.na(db_wrang))
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0
prop.table(table(db_wrang$Channel))
## numeric(0)
head(db_wrang)
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  63   1  3      145  233   1       0     150     0     2.3     0  0    1
## 2  37   1  2      130  250   0       1     187     0     3.5     0  0    2
## 3  41   0  1      130  204   0       0     172     0     1.4     2  0    2
## 4  56   1  1      120  236   0       1     178     0     0.8     2  0    2
## 5  57   0  0      120  354   0       1     163     1     0.6     2  0    2
## 6  57   1  0      140  192   0       1     148     0     0.4     1  0    1
##   target
## 1      1
## 2      1
## 3      1
## 4      1
## 5      1
## 6      1
colSums(is.na(db_wrang))
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0

Creating Training Set and Test Set

# your code here
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(123)
row_data <- nrow(db_wrang)
# sampel dan ambil 80% data secara acak
index <- sample(row_data, row_data*0.8)

data_train <- db_wrang[ index, ]
data_test <- db_wrang[ -index, ]

Check class imbalance

summary(db_wrang$target)
##   0   1 
## 138 165

Insight

Balance of the data is still okay

Model

model_full <- glm(target ~ ., data_train, family = "binomial")
model_step <- step(model_full, direction = "both", trace = 0)
summary(model_step)
## 
## Call:
## glm(formula = target ~ sex + cp + trestbps + thalach + oldpeak + 
##     ca + thal, family = "binomial", data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5829  -0.4654   0.1754   0.5138   2.4143  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -13.90251  882.74573  -0.016 0.987435    
## sex1         -1.36163    0.50143  -2.715 0.006618 ** 
## cp1           1.60739    0.60060   2.676 0.007444 ** 
## cp2           2.49951    0.52261   4.783 1.73e-06 ***
## cp3           2.09854    0.65071   3.225 0.001260 ** 
## trestbps     -0.02124    0.01118  -1.899 0.057545 .  
## thalach       0.02580    0.00967   2.668 0.007630 ** 
## oldpeak      -0.60814    0.21936  -2.772 0.005566 ** 
## ca           -0.64125    0.19192  -3.341 0.000834 ***
## thal1        14.51660  882.74375   0.016 0.986879    
## thal2        14.43642  882.74348   0.016 0.986952    
## thal3        13.17788  882.74347   0.015 0.988089    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 333.83  on 241  degrees of freedom
## Residual deviance: 172.30  on 230  degrees of freedom
## AIC: 196.3
## 
## Number of Fisher Scoring iterations: 13
levels(data_train$target)
## [1] "0" "1"
exp(model_step$coefficients)
##  (Intercept)         sex1          cp1          cp2          cp3     trestbps 
## 9.166759e-07 2.562419e-01 4.989785e+00 1.217646e+01 8.154217e+00 9.789869e-01 
##      thalach      oldpeak           ca        thal1        thal2        thal3 
## 1.026134e+00 5.443619e-01 5.266337e-01 2.015945e+06 1.860612e+06 5.285406e+05

Multicolinearity Check

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
vif(model_full)
##              GVIF Df GVIF^(1/(2*Df))
## age      1.469721  1        1.212320
## sex      1.529873  1        1.236880
## cp       1.785928  3        1.101482
## trestbps 1.241659  1        1.114298
## chol     1.378209  1        1.173971
## fbs      1.137868  1        1.066709
## restecg  1.118337  2        1.028355
## thalach  1.472149  1        1.213321
## exang    1.264174  1        1.124355
## oldpeak  1.671726  1        1.292952
## slope    1.700970  1        1.304212
## ca       1.205816  1        1.098096
## thal     1.397378  3        1.057351

Insight:

  1. According to the logistic model, the significant predictors to predict whether or not a patient has heart disease are sex, chest pain type, resting blood pressure, maximum heart rate achieved, ST depression, number of major vessel, and thalium
  2. The odds of not having heart disease when all coefficient equals to 0 is 9.17e-07
  3. The odds of not having heart disease is dominantly influence by CP = 3 (typical agine), with 8.15 more chance to not having heart disease
  4. This is followed by thalac (maximum heart rate achieved), with 1.03 more chance to not having heart disease in every 1 increase of maximum heart rate achieved
  5. There’s no multicolinearity among predictors

Prediction & Confusion Matrix

pred_test <- predict(model_step, data_test, type = "response")
pred_class <- ifelse(pred_test > 0.5, 1, 0) %>% 
  as.factor()

head(pred_class)
##  3  4  6  8  9 11 
##  1  1  0  1  1  0 
## Levels: 0 1
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
confusionMatrix(pred_class, data_test$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 19  5
##          1  8 29
##                                           
##                Accuracy : 0.7869          
##                  95% CI : (0.6632, 0.8814)
##     No Information Rate : 0.5574          
##     P-Value [Acc > NIR] : 0.0001579       
##                                           
##                   Kappa : 0.5631          
##                                           
##  Mcnemar's Test P-Value : 0.5790997       
##                                           
##             Sensitivity : 0.7037          
##             Specificity : 0.8529          
##          Pos Pred Value : 0.7917          
##          Neg Pred Value : 0.7838          
##              Prevalence : 0.4426          
##          Detection Rate : 0.3115          
##    Detection Prevalence : 0.3934          
##       Balanced Accuracy : 0.7783          
##                                           
##        'Positive' Class : 0               
## 

K-NN Model

Scaling

Data Train

train_scale_x <- data_train %>%
  select(-c(sex, cp, restecg, exang, thal, target))%>%
  scale()

train_y <- data_train$target

attr(train_scale_x, "scaled:center")
##         age    trestbps        chol         fbs     thalach     oldpeak 
##  54.2148760 131.7314050 249.0330579   0.1528926 149.0950413   0.9933884 
##       slope          ca 
##   1.4297521   0.7727273
attr(train_scale_x, "scaled:center")
##         age    trestbps        chol         fbs     thalach     oldpeak 
##  54.2148760 131.7314050 249.0330579   0.1528926 149.0950413   0.9933884 
##       slope          ca 
##   1.4297521   0.7727273

Data Test

test_scale_x <- data_test %>%
  select(-c(sex, cp, restecg, exang, thal, target))%>%
  scale(center = attr(train_scale_x,"scaled:center"),
        scale = attr(train_scale_x,"scaled:scale"))

test_y <- data_test$target

Select K

k_choose <- sqrt(nrow(train_scale_x))%>%
  floor()

Create KNN Model

model_knn <- knn3(x = train_scale_x,
                  y = train_y,
                  k = k_choose)

pred_knn <- predict(model_knn, newdata = test_scale_x, type = "class")

Confusion Matrix

confusionMatrix(pred_knn, test_y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 20  2
##          1  7 32
##                                           
##                Accuracy : 0.8525          
##                  95% CI : (0.7383, 0.9302)
##     No Information Rate : 0.5574          
##     P-Value [Acc > NIR] : 8.993e-07       
##                                           
##                   Kappa : 0.6952          
##                                           
##  Mcnemar's Test P-Value : 0.1824          
##                                           
##             Sensitivity : 0.7407          
##             Specificity : 0.9412          
##          Pos Pred Value : 0.9091          
##          Neg Pred Value : 0.8205          
##              Prevalence : 0.4426          
##          Detection Rate : 0.3279          
##    Detection Prevalence : 0.3607          
##       Balanced Accuracy : 0.8410          
##                                           
##        'Positive' Class : 0               
## 

Conclusion

  1. The accuracy, sensitivity, and specificity of K-NN model is better than using Logistic Model
  2. Sensitivity / Recall is better parameter to evaluate the performance of classification modeel, as the goal of this model is to minimize the False Negative (FN) value