library(readxl)
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(ggplot2)

1. Import data

df <- read_excel("~/Desktop/Misc/Clean data.xlsx")
table1(~ age + sex + bmi + sbp_mean + dbp_mean + tob + exer + kaidan + bs + non_hdl + hyper + 
         diabetes_over_boundary + MetS  | target_st, df)
## Warning in table1.formula(~age + sex + bmi + sbp_mean + dbp_mean + tob + : Terms
## to the right of '|' in formula 'x' define table columns and are expected to be
## factors with meaningful labels.
0
(N=6951)
1
(N=438)
Overall
(N=7389)
age
Mean (SD) 54.6 (13.1) 64.4 (9.78) 55.1 (13.1)
Median [Min, Max] 55.0 [30.0, 84.0] 66.0 [32.0, 80.0] 56.0 [30.0, 84.0]
sex
Mean (SD) 0.548 (0.498) 0.466 (0.499) 0.543 (0.498)
Median [Min, Max] 1.00 [0, 1.00] 0 [0, 1.00] 1.00 [0, 1.00]
bmi
Mean (SD) 22.5 (3.00) 23.1 (3.21) 22.5 (3.01)
Median [Min, Max] 22.3 [14.2, 30.5] 23.0 [14.2, 30.5] 22.3 [14.2, 30.5]
sbp_mean
Mean (SD) 126 (20.5) 138 (21.9) 126 (20.8)
Median [Min, Max] 123 [78.0, 180] 137 [86.0, 180] 124 [78.0, 180]
dbp_mean
Mean (SD) 77.4 (11.7) 81.0 (12.5) 77.6 (11.8)
Median [Min, Max] 77.0 [47.5, 108] 81.0 [47.5, 108] 77.0 [47.5, 108]
tob
Mean (SD) 2.27 (0.879) 2.16 (0.882) 2.26 (0.879)
Median [Min, Max] 3.00 [1.00, 3.00] 2.00 [1.00, 3.00] 3.00 [1.00, 3.00]
exer
Mean (SD) 1.61 (0.488) 1.60 (0.491) 1.61 (0.488)
Median [Min, Max] 2.00 [1.00, 2.00] 2.00 [1.00, 2.00] 2.00 [1.00, 2.00]
kaidan
Mean (SD) 3.35 (1.26) 3.39 (1.27) 3.35 (1.26)
Median [Min, Max] 3.00 [1.00, 5.00] 3.00 [1.00, 5.00] 3.00 [1.00, 5.00]
bs
Mean (SD) 96.1 (9.78) 100 (11.0) 96.4 (9.90)
Median [Min, Max] 95.0 [72.0, 120] 98.0 [72.0, 120] 95.0 [72.0, 120]
non_hdl
Mean (SD) 152 (36.1) 159 (36.1) 152 (36.1)
Median [Min, Max] 150 [53.5, 250] 155 [78.0, 250] 150 [53.5, 250]
hyper
Mean (SD) 0.295 (0.456) 0.550 (0.498) 0.311 (0.463)
Median [Min, Max] 0 [0, 1.00] 1.00 [0, 1.00] 0 [0, 1.00]
diabetes_over_boundary
Mean (SD) 0.115 (0.319) 0.228 (0.420) 0.122 (0.327)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
MetS
Mean (SD) 0.234 (0.424) 0.413 (0.493) 0.245 (0.430)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]

2. Rename by pipeline

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ purrr   0.3.4
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
df2 <- df %>% rename(stroke = target_st, sbp = sbp_mean, dbp = dbp_mean, 
                     eGFR = egfr_ckd_epi, smsts = tob, hypertension = hyper,
                     DM = diabetes_over_boundary)

3. Convert to factor

df2$sex = as.factor(df2$sex)
df2$smsts = as.factor(df2$smsts)
df2$exer = as.factor(df2$exer)
df2$kaidan = as.factor(df2$kaidan)
df2$hypertension = as.factor(df2$hypertension)
df2$DM = as.factor(df2$DM)
df2$MetS = as.factor(df2$MetS)
df2$stroke = as.factor(df2$stroke)

4. Exploratory data analysis

4.1 Stroke Distribution

ggplot(df2, aes(stroke, fill = stroke)) + 
  geom_bar() +
  theme_bw() +
  labs(title = "Stroke Classification", x = "Stroke") +
  theme(plot.title = element_text(hjust = 0.5))

library(ggthemes) # Or
ggplot(data = df2, aes(x = stroke, fill =stroke)) + geom_bar() + 
  theme_economist()+ labs(x = "Stroke") + guides(fill = FALSE) # remove the legend
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.

4.2 Correlation Matrix

# Selecting numerical variables
sel = df2[, c("age", "sbp", "dbp", "eGFR", "non_hdl", "waist", "wt20", "bs", "ua",
          "rbc", "wbc", "tg", "cpk", "alb", "bmi",  "bf", "smsts", "DM", 
          "hypertension", "MetS", "sleep_conv", "stroke", "fh1_yn", "fh5_yn", "fh246_yn")]

# Let’s now see the correlation between all the numerical variables in the dataset.
# Correlation Matrix can be ploted as follows.
cor_data <- cor(sel[,setdiff(names(sel), c("smsts", "DM", 
                             "hypertension", "MetS", "sleep_conv", "stroke", "fh1_yn", "fh5_yn", "fh246_yn"))])
# Numerical Correlation Matrix
cor_data
##                 age         sbp         dbp         eGFR     non_hdl
## age      1.00000000  0.45366897  0.18612989 -0.579516831  0.19953932
## sbp      0.45366897  1.00000000  0.71805544 -0.263306152  0.17256589
## dbp      0.18612989  0.71805544  1.00000000 -0.103267334  0.18020565
## eGFR    -0.57951683 -0.26330615 -0.10326733  1.000000000 -0.18807449
## non_hdl  0.19953932  0.17256589  0.18020565 -0.188074491  1.00000000
## waist    0.21651029  0.29534598  0.32680903 -0.096791475  0.26398393
## wt20    -0.05240172  0.04441302  0.13741367 -0.028136056 -0.06793941
## bs       0.21440422  0.25841835  0.20235615 -0.148171238  0.15430376
## ua       0.05595851  0.15541011  0.20913662 -0.256085361  0.12815722
## rbc     -0.20778253  0.03968653  0.22601528  0.096637557  0.13546604
## wbc     -0.02313799  0.05461379  0.08902853  0.025087005  0.10310387
## tg       0.13471841  0.20629392  0.22859959 -0.115454102  0.44128322
## cpk      0.03765889  0.08233121  0.10791172 -0.125058245  0.07561372
## alb     -0.15991809  0.04486632  0.12655206  0.040265944  0.18012810
## bmi      0.04024757  0.24439570  0.32318478 -0.055370209  0.27467909
## bf       0.01213702  0.08623060  0.08001366  0.001783862  0.27634262
##               waist        wt20          bs          ua         rbc         wbc
## age      0.21651029 -0.05240172  0.21440422  0.05595851 -0.20778253 -0.02313799
## sbp      0.29534598  0.04441302  0.25841835  0.15541011  0.03968653  0.05461379
## dbp      0.32680903  0.13741367  0.20235615  0.20913662  0.22601528  0.08902853
## eGFR    -0.09679147 -0.02813606 -0.14817124 -0.25608536  0.09663756  0.02508701
## non_hdl  0.26398393 -0.06793941  0.15430376  0.12815722  0.13546604  0.10310387
## waist    1.00000000  0.34386258  0.27248381  0.30856125  0.27768115  0.16982907
## wt20     0.34386258  1.00000000  0.11590218  0.35687646  0.33807759  0.12891520
## bs       0.27248381  0.11590218  1.00000000  0.16394421  0.17797831  0.12684171
## ua       0.30856125  0.35687646  0.16394421  1.00000000  0.33443117  0.18045482
## rbc      0.27768115  0.33807759  0.17797831  0.33443117  1.00000000  0.25557041
## wbc      0.16982907  0.12891520  0.12684171  0.18045482  0.25557041  1.00000000
## tg       0.38905550  0.14433729  0.21925995  0.31238166  0.25782755  0.25416920
## cpk      0.12534234  0.19506513  0.03473554  0.18168435  0.06588585 -0.02645865
## alb      0.05391083  0.01783037  0.08891826  0.13676795  0.29780954  0.08792599
## bmi      0.78994573  0.29863394  0.23133388  0.25821915  0.26084257  0.13673400
## bf       0.31383574 -0.28783607  0.01572509 -0.12479430 -0.09784038 -0.02406466
##                  tg         cpk         alb         bmi           bf
## age      0.13471841  0.03765889 -0.15991809  0.04024757  0.012137023
## sbp      0.20629392  0.08233121  0.04486632  0.24439570  0.086230598
## dbp      0.22859959  0.10791172  0.12655206  0.32318478  0.080013662
## eGFR    -0.11545410 -0.12505824  0.04026594 -0.05537021  0.001783862
## non_hdl  0.44128322  0.07561372  0.18012810  0.27467909  0.276342620
## waist    0.38905550  0.12534234  0.05391083  0.78994573  0.313835741
## wt20     0.14433729  0.19506513  0.01783037  0.29863394 -0.287836068
## bs       0.21925995  0.03473554  0.08891826  0.23133388  0.015725091
## ua       0.31238166  0.18168435  0.13676795  0.25821915 -0.124794302
## rbc      0.25782755  0.06588585  0.29780954  0.26084257 -0.097840382
## wbc      0.25416920 -0.02645865  0.08792599  0.13673400 -0.024064664
## tg       1.00000000  0.02314921  0.12003293  0.34314220  0.085053440
## cpk      0.02314921  1.00000000  0.07703153  0.14132311 -0.152825939
## alb      0.12003293  0.07703153  1.00000000  0.09603130  0.100237271
## bmi      0.34314220  0.14132311  0.09603130  1.00000000  0.448686121
## bf       0.08505344 -0.15282594  0.10023727  0.44868612  1.000000000
# Correlation matrix plots
corrplot::corrplot(cor_data)

corrplot::corrplot(cor_data, type = "upper", method = "number")

corrplot::corrplot(cor_data, type = "upper", method = "pie")

4.3 Univariate Analysis

#
sel2 = df2[, c("age", "sbp", "dbp", "eGFR", "non_hdl", "waist", "wt20", "bs", "ua",
               "rbc", "wbc", "tg", "cpk", "alb", "bmi",  "bf", "stroke")]

library(gridExtra) #Multiple plot in single grip space
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
sel2 = as.data.frame(sel2)
univar_graph <- function(univar_name, univar, data, output_var) {
  g_1 <- ggplot(data, aes(x=univar)) + 
    geom_density() + 
    xlab(univar_name) + 
    theme_bw()
  g_2 <- ggplot(data, aes(x=univar, fill=output_var)) + 
    geom_density(alpha=0.4) + 
    xlab(univar_name) + 
    theme_bw()
  gridExtra::grid.arrange(g_1, g_2, ncol=2, top = paste(univar_name,"variable", "/ [ Skew:",timeDate::skewness(univar),"]"))
}
for (x in 1:(ncol(sel2)-1)) {
  univar_graph(univar_name = names(sel2)[x], univar = sel2[,x], data = sel2, output_var = sel2[,'stroke'])
}

5. Machine Learning preparation

# Splitting dataset
partition <- caret::createDataPartition(y = df2$stroke, times = 1, p = 0.7, list = FALSE)
# create training data set
train_set <- df2[partition,]

# create testing data set, subtracting the rows partition to get remaining 30% of the data
test_set <- df2[-partition,]

5.1 Random Forest Model

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# K-folds method
# Convert target variable to factor with two levels
train_set$stroke <- factor(train_set$stroke, levels = c("0", "1"))
test_set$stroke <- factor(test_set$stroke, levels = c("0", "1"))

# Convert level names to valid R variable names
levels(train_set$stroke) <- make.names(levels(train_set$stroke))
levels(test_set$stroke) <- make.names(levels(test_set$stroke))

model_forest <- caret::train(stroke ~., data = train_set,
                             method = "ranger",
                             metric = "ROC",
                             trControl = trainControl(method = "cv", number = 10,
                                                      classProbs = T, summaryFunction = twoClassSummary),
                             preProcess = c("center","scale","pca"))
model_forest
## Random Forest 
## 
## 5173 samples
##   56 predictor
##    2 classes: 'X0', 'X1' 
## 
## Pre-processing: centered (60), scaled (60), principal component
##  signal extraction (60) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4655, 4656, 4656, 4656, 4655, 4656, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   ROC        Sens       Spec
##    2    gini        0.6791999  1.0000000  0   
##    2    extratrees  0.7011567  1.0000000  0   
##   24    gini        0.6707406  0.9993827  0   
##   24    extratrees  0.6968904  1.0000000  0   
##   46    gini        0.6667594  0.9993831  0   
##   46    extratrees  0.6931750  1.0000000  0   
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
##  and min.node.size = 1.
# final ROC Value
model_forest$results[6,4]
## [1] 0.693175

5.2 XGBOOST - eXtreme Gradient BOOSTing

xgb_grid_1  <-  expand.grid(
  nrounds = 50,
  eta = c(0.03),
  max_depth = 1,
  gamma = 0,
  colsample_bytree = 0.6,
  min_child_weight = 1,
  subsample = 0.5
)

model_xgb <- caret::train(stroke ~., data = train_set,
                          method = "xgbTree",
                          metric = "ROC",
                          tuneGrid=xgb_grid_1,
                          trControl = trainControl(method = "cv", number = 10,
                                                   classProbs = T, summaryFunction = twoClassSummary),
                          preProcess = c("center","scale","pca"))
model_xgb
## eXtreme Gradient Boosting 
## 
## 5173 samples
##   56 predictor
##    2 classes: 'X0', 'X1' 
## 
## Pre-processing: centered (60), scaled (60), principal component
##  signal extraction (60) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4655, 4655, 4656, 4657, 4655, 4656, ... 
## Resampling results:
## 
##   ROC        Sens  Spec
##   0.6955346  1     0   
## 
## Tuning parameter 'nrounds' was held constant at a value of 50
## Tuning
##  held constant at a value of 1
## Tuning parameter 'subsample' was held
##  constant at a value of 0.5
# final ROC value
model_xgb$results["ROC"]
##         ROC
## 1 0.6955346

5.3 KNN - K Nearest Neighbours

model_knn <- caret::train(stroke ~., data = train_set,
                          method = "knn",
                          metric = "ROC",
                          tuneGrid = expand.grid(.k = c(3:10)),
                          trControl = trainControl(method = "cv", number = 10,
                                                   classProbs = T, summaryFunction = twoClassSummary),
                          preProcess = c("center","scale","pca"))

model_knn
## k-Nearest Neighbors 
## 
## 5173 samples
##   56 predictor
##    2 classes: 'X0', 'X1' 
## 
## Pre-processing: centered (60), scaled (60), principal component
##  signal extraction (60) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4656, 4655, 4656, 4657, 4655, 4655, ... 
## Resampling results across tuning parameters:
## 
##   k   ROC        Sens       Spec       
##    3  0.5353313  0.9907517  0.019677419
##    4  0.5493395  0.9903402  0.016451613
##    5  0.5587449  0.9977387  0.000000000
##    6  0.5670182  0.9975342  0.003333333
##    7  0.5773121  0.9985622  0.000000000
##    8  0.5891223  0.9993836  0.003225806
##    9  0.5911604  0.9995889  0.000000000
##   10  0.5967023  0.9991778  0.000000000
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 10.
#final ROC value
model_knn$results[8,2]
## [1] 0.5967023

5.4 Logistic Regression

model_glm <- caret::train(stroke ~., data = train_set,
                          method = "glm",
                          metric = "ROC",
                          tuneLength = 10,
                          trControl = trainControl(method = "cv", number = 10,
                                                   classProbs = T, summaryFunction = twoClassSummary),
                          preProcess = c("center","scale","pca"))

model_glm
## Generalized Linear Model 
## 
## 5173 samples
##   56 predictor
##    2 classes: 'X0', 'X1' 
## 
## Pre-processing: centered (60), scaled (60), principal component
##  signal extraction (60) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4656, 4657, 4656, 4656, 4655, 4656, ... 
## Resampling results:
## 
##   ROC        Sens  Spec       
##   0.7343427  1     0.003225806
#final ROC Value
model_glm$results[2]
##         ROC
## 1 0.7343427

5.5 Rpart CART - classification and Regression Trees

library(rpart.plot) #CART Decision Tree
## Loading required package: rpart
model_rpart <- caret::train(stroke ~., data = train_set,
                            method = "rpart",
                            metric = "ROC",
                            tuneLength = 20,
                            trControl = trainControl(method = "cv", number = 10,
                                                     classProbs = T, summaryFunction = twoClassSummary))

model_rpart
## CART 
## 
## 5173 samples
##   56 predictor
##    2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4655, 4655, 4655, 4657, 4657, 4656, ... 
## Resampling results across tuning parameters:
## 
##   cp            ROC        Sens       Spec      
##   0.0000000000  0.6560288  0.9734906  0.04258065
##   0.0001714384  0.6560288  0.9734906  0.04258065
##   0.0003428767  0.6560288  0.9734906  0.04258065
##   0.0005143151  0.6530779  0.9734906  0.04258065
##   0.0006857535  0.6527347  0.9736964  0.04258065
##   0.0008571918  0.6583681  0.9747235  0.04258065
##   0.0010286302  0.6548404  0.9763674  0.04258065
##   0.0012000686  0.6548404  0.9763674  0.04258065
##   0.0013715069  0.6666735  0.9773958  0.04258065
##   0.0015429453  0.6666735  0.9773958  0.04258065
##   0.0017143837  0.6587505  0.9780118  0.03602151
##   0.0018858220  0.6565074  0.9812998  0.03602151
##   0.0020572604  0.6565074  0.9812998  0.03602151
##   0.0022286988  0.6523509  0.9812998  0.03602151
##   0.0024001372  0.6523509  0.9812998  0.03602151
##   0.0025715755  0.6540987  0.9821220  0.03602151
##   0.0027430139  0.6540987  0.9821220  0.03602151
##   0.0029144523  0.6548507  0.9825331  0.03602151
##   0.0030858906  0.6548507  0.9825331  0.03602151
##   0.0032573290  0.6541354  0.9831491  0.03602151
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.001542945.
# Plot model accuracy vs different values of cp (complexity parameter)
plot(model_rpart)

# Best Model Cp (Complexity Parameter)
model_rpart$bestTune
##             cp
## 10 0.001542945
# Structure of final model selected
model_rpart$finalModel
## n= 5173 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##    1) root 5173 307 X0 (0.94065339 0.05934661)  
##      2) age< 55.5 2553  56 X0 (0.97806502 0.02193498) *
##      3) age>=55.5 2620 251 X0 (0.90419847 0.09580153)  
##        6) upr_nega>=0.5 2110 176 X0 (0.91658768 0.08341232)  
##         12) age< 62.5 761  38 X0 (0.95006570 0.04993430) *
##         13) age>=62.5 1349 138 X0 (0.89770200 0.10229800)  
##           26) pulse_mean< 87.5 1259 118 X0 (0.90627482 0.09372518)  
##             52) id>=821.5 1242 111 X0 (0.91062802 0.08937198)  
##              104) hg< 14.05 760  51 X0 (0.93289474 0.06710526)  
##                208) waist< 100.5 732  44 X0 (0.93989071 0.06010929)  
##                  416) ca>=8.85 667  33 X0 (0.95052474 0.04947526) *
##                  417) ca< 8.85 65  11 X0 (0.83076923 0.16923077)  
##                    834) sbp>=108 55   5 X0 (0.90909091 0.09090909) *
##                    835) sbp< 108 10   4 X1 (0.40000000 0.60000000) *
##                209) waist>=100.5 28   7 X0 (0.75000000 0.25000000)  
##                  418) hg< 13.45 18   1 X0 (0.94444444 0.05555556) *
##                  419) hg>=13.45 10   4 X1 (0.40000000 0.60000000) *
##              105) hg>=14.05 482  60 X0 (0.87551867 0.12448133)  
##                210) elbow>=6.1335 399  40 X0 (0.89974937 0.10025063)  
##                  420) frct< 308.25 379  33 X0 (0.91292876 0.08707124)  
##                    840) id>=51006.5 210   9 X0 (0.95714286 0.04285714) *
##                    841) id< 51006.5 169  24 X0 (0.85798817 0.14201183)  
##                     1682) ua>=5.75 95   7 X0 (0.92631579 0.07368421)  
##                       3364) hg< 15.75 73   2 X0 (0.97260274 0.02739726) *
##                       3365) hg>=15.75 22   5 X0 (0.77272727 0.22727273)  
##                         6730) dbp< 83.5 15   1 X0 (0.93333333 0.06666667) *
##                         6731) dbp>=83.5 7   3 X1 (0.42857143 0.57142857) *
##                     1683) ua< 5.75 74  17 X0 (0.77027027 0.22972973)  
##                       3366) eGFR>=71.07033 33   3 X0 (0.90909091 0.09090909) *
##                       3367) eGFR< 71.07033 41  14 X0 (0.65853659 0.34146341)  
##                         6734) wbc< 6.915 32   8 X0 (0.75000000 0.25000000) *
##                         6735) wbc>=6.915 9   3 X1 (0.33333333 0.66666667) *
##                  421) frct>=308.25 20   7 X0 (0.65000000 0.35000000)  
##                    842) bmi>=23.64297 10   0 X0 (1.00000000 0.00000000) *
##                    843) bmi< 23.64297 10   3 X1 (0.30000000 0.70000000) *
##                211) elbow< 6.1335 83  20 X0 (0.75903614 0.24096386)  
##                  422) bmi< 22.64269 52   7 X0 (0.86538462 0.13461538)  
##                    844) wbc>=3.965 45   3 X0 (0.93333333 0.06666667) *
##                    845) wbc< 3.965 7   3 X1 (0.42857143 0.57142857) *
##                  423) bmi>=22.64269 31  13 X0 (0.58064516 0.41935484)  
##                    846) bf>=23.7 22   6 X0 (0.72727273 0.27272727)  
##                     1692) bs< 98 11   0 X0 (1.00000000 0.00000000) *
##                     1693) bs>=98 11   5 X1 (0.45454545 0.54545455) *
##                    847) bf< 23.7 9   2 X1 (0.22222222 0.77777778) *
##             53) id< 821.5 17   7 X0 (0.58823529 0.41176471) *
##           27) pulse_mean>=87.5 90  20 X0 (0.77777778 0.22222222)  
##             54) sbp>=150.5 31   1 X0 (0.96774194 0.03225806) *
##             55) sbp< 150.5 59  19 X0 (0.67796610 0.32203390)  
##              110) wt20>=51.925 19   1 X0 (0.94736842 0.05263158) *
##              111) wt20< 51.925 40  18 X0 (0.55000000 0.45000000)  
##                222) age< 73.5 30  10 X0 (0.66666667 0.33333333)  
##                  444) ztt>=11.85 9   0 X0 (1.00000000 0.00000000) *
##                  445) ztt< 11.85 21  10 X0 (0.52380952 0.47619048)  
##                    890) ztt< 8.65 13   3 X0 (0.76923077 0.23076923) *
##                    891) ztt>=8.65 8   1 X1 (0.12500000 0.87500000) *
##                223) age>=73.5 10   2 X1 (0.20000000 0.80000000) *
##        7) upr_nega< 0.5 510  75 X0 (0.85294118 0.14705882)  
##         14) sbp< 156.5 396  43 X0 (0.89141414 0.10858586) *
##         15) sbp>=156.5 114  32 X0 (0.71929825 0.28070175)  
##           30) eGFR< 79.96992 97  22 X0 (0.77319588 0.22680412)  
##             60) eGFR>=48.67081 78  13 X0 (0.83333333 0.16666667)  
##              120) non_hdl< 159.5 32   1 X0 (0.96875000 0.03125000) *
##              121) non_hdl>=159.5 46  12 X0 (0.73913043 0.26086957)  
##                242) id< 14772 18   1 X0 (0.94444444 0.05555556) *
##                243) id>=14772 28  11 X0 (0.60714286 0.39285714)  
##                  486) got< 22.5 18   4 X0 (0.77777778 0.22222222) *
##                  487) got>=22.5 10   3 X1 (0.30000000 0.70000000) *
##             61) eGFR< 48.67081 19   9 X0 (0.52631579 0.47368421) *
##           31) eGFR>=79.96992 17   7 X1 (0.41176471 0.58823529) *
# Should have refactored diabetes to make pos as the primary factor level to get straight forward decision tree
rpart.plot::rpart.plot(model_rpart$finalModel, type = 2, fallen.leaves = T, extra = 2, cex = 0.70)

# Training Data set - Model Comparision by ROC value
model_list <- list(Random_Forest = model_forest, XGBoost = model_xgb, KNN = model_knn, Logistic_Regression = model_glm, Rpart_DT = model_rpart)
resamples <- resamples(model_list)

#box plot
bwplot(resamples, metric="ROC")

#dot plot
dotplot(resamples, metric="ROC")

6. Predition on Test Data Set

6.1 Random Forest

# prediction on Test data set
pred_rf <- predict(model_forest, test_set)
# Confusion Matrix 
cm_rf <- confusionMatrix(pred_rf, test_set$stroke, positive="X1")

# Prediction Probabilities
pred_prob_rf <- predict(model_forest, test_set, type="prob")

# ROC value
library(pROC) #ROC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc_rf <- roc(test_set$stroke, pred_prob_rf$X1)
## Setting levels: control = X0, case = X1
## Setting direction: controls < cases
# Confusion Matrix for Random Forest Model
cm_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 2085  131
##         X1    0    0
##                                           
##                Accuracy : 0.9409          
##                  95% CI : (0.9302, 0.9503)
##     No Information Rate : 0.9409          
##     P-Value [Acc > NIR] : 0.5232          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.94088         
##              Prevalence : 0.05912         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : X1              
## 
# ROC Value for Random Forest
roc_rf
## 
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_rf$X1)
## 
## Data: pred_prob_rf$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.6946
# AUC - Area under the curve
caTools::colAUC(pred_prob_rf$X1, test_set$stroke, plotROC = T)

##                [,1]
## X0 vs. X1 0.6945668

6.2 XGBOOST - eXtreme Gradient BOOSTing

# prediction on Test data set
pred_xgb <- predict(model_xgb, test_set)
# Confusion Matrix 
cm_xgb <- confusionMatrix(pred_xgb, test_set$stroke, positive="X1")

# Prediction Probabilities
pred_prob_xgb <- predict(model_xgb, test_set, type="prob")
# ROC value
roc_xgb <- roc(test_set$stroke, pred_prob_xgb$X1)
## Setting levels: control = X0, case = X1
## Setting direction: controls < cases
# Confusion matrix 
cm_xgb
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 2085  131
##         X1    0    0
##                                           
##                Accuracy : 0.9409          
##                  95% CI : (0.9302, 0.9503)
##     No Information Rate : 0.9409          
##     P-Value [Acc > NIR] : 0.5232          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.94088         
##              Prevalence : 0.05912         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : X1              
## 
# ROC Value for for XGBoost
roc_xgb
## 
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_xgb$X1)
## 
## Data: pred_prob_xgb$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.7061
# AUC - Area under the curve
caTools::colAUC(pred_prob_xgb$X1, test_set$stroke, plotROC = T)

##                [,1]
## X0 vs. X1 0.7060812

6.3 KNN - K Nearest Neighbours

# prediction on Test data set
pred_knn <- predict(model_knn, test_set)
# Confusion Matrix 
cm_knn <- confusionMatrix(pred_knn, test_set$stroke, positive="X1")

# Prediction Probabilities
pred_prob_knn <- predict(model_knn, test_set, type="prob")
# ROC value
roc_knn <- roc(test_set$stroke, pred_prob_knn$X1)
## Setting levels: control = X0, case = X1
## Setting direction: controls < cases
# Confusion matrix 
cm_knn
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 2085  131
##         X1    0    0
##                                           
##                Accuracy : 0.9409          
##                  95% CI : (0.9302, 0.9503)
##     No Information Rate : 0.9409          
##     P-Value [Acc > NIR] : 0.5232          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.94088         
##              Prevalence : 0.05912         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : X1              
## 
# ROC Value for for XGBoost
roc_knn
## 
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_knn$X1)
## 
## Data: pred_prob_knn$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.6078
# AUC - Area under the curve
caTools::colAUC(pred_prob_knn$X1, test_set$stroke, plotROC = T)

##               [,1]
## X0 vs. X1 0.607824

6.4 Logistic Regression

# prediction on Test data set
pred_glm <- predict(model_glm, test_set)
# Confusion Matrix 
cm_glm <- confusionMatrix(pred_glm, test_set$stroke, positive="X1")

# Prediction Probabilities
pred_prob_glm <- predict(model_glm, test_set, type="prob")
# ROC value
roc_glm <- roc(test_set$stroke, pred_prob_glm$X1)
## Setting levels: control = X0, case = X1
## Setting direction: controls < cases
# Confusion matrix 
cm_glm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 2085  131
##         X1    0    0
##                                           
##                Accuracy : 0.9409          
##                  95% CI : (0.9302, 0.9503)
##     No Information Rate : 0.9409          
##     P-Value [Acc > NIR] : 0.5232          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.94088         
##              Prevalence : 0.05912         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : X1              
## 
roc_glm
## 
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_glm$X1)
## 
## Data: pred_prob_glm$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.7354
# AUC - Area under the curve
caTools::colAUC(pred_prob_glm$X1, test_set$stroke, plotROC = T)

##                [,1]
## X0 vs. X1 0.7353873

6.5 Rpart CART - classification and Regression Trees

# prediction on Test data set
pred_rpart <- predict(model_rpart, test_set)
# Confusion Matrix 
cm_rpart <- confusionMatrix(pred_rpart, test_set$stroke, positive="X1")

# Prediction Probabilities
pred_prob_rpart <- predict(model_rpart, test_set, type="prob")
# ROC value
roc_rpart <- roc(test_set$stroke, pred_prob_rpart$X1)
## Setting levels: control = X0, case = X1
## Setting direction: controls < cases
# Confusion matrix 
cm_rpart
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 2042  127
##         X1   43    4
##                                          
##                Accuracy : 0.9233         
##                  95% CI : (0.9114, 0.934)
##     No Information Rate : 0.9409         
##     P-Value [Acc > NIR] : 0.9997         
##                                          
##                   Kappa : 0.0142         
##                                          
##  Mcnemar's Test P-Value : 1.943e-10      
##                                          
##             Sensitivity : 0.030534       
##             Specificity : 0.979376       
##          Pos Pred Value : 0.085106       
##          Neg Pred Value : 0.941448       
##              Prevalence : 0.059116       
##          Detection Rate : 0.001805       
##    Detection Prevalence : 0.021209       
##       Balanced Accuracy : 0.504955       
##                                          
##        'Positive' Class : X1             
## 
# ROC Value for for XGBoost
roc_rpart
## 
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_rpart$X1)
## 
## Data: pred_prob_rpart$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.6461
# AUC - Area under the curve
caTools::colAUC(pred_prob_rpart$X1, test_set$stroke, plotROC = T)

##                [,1]
## X0 vs. X1 0.6461164

7. Test Set Results Comparision

result_rf <- c(cm_rf$byClass['Sensitivity'], cm_rf$byClass['Specificity'], cm_rf$byClass['Precision'], 
               cm_rf$byClass['Recall'], cm_rf$byClass['F1'], roc_rf$auc)

result_xgb <- c(cm_xgb$byClass['Sensitivity'], cm_xgb$byClass['Specificity'], cm_xgb$byClass['Precision'], 
                cm_xgb$byClass['Recall'], cm_xgb$byClass['F1'], roc_xgb$auc)

result_knn <- c(cm_knn$byClass['Sensitivity'], cm_knn$byClass['Specificity'], cm_knn$byClass['Precision'], 
                cm_knn$byClass['Recall'], cm_knn$byClass['F1'], roc_knn$auc)

result_glm <- c(cm_glm$byClass['Sensitivity'], cm_glm$byClass['Specificity'], cm_glm$byClass['Precision'], 
                cm_glm$byClass['Recall'], cm_glm$byClass['F1'], roc_glm$auc)

result_rpart <- c(cm_rpart$byClass['Sensitivity'], cm_rpart$byClass['Specificity'], cm_rpart$byClass['Precision'], 
                  cm_rpart$byClass['Recall'], cm_rpart$byClass['F1'], roc_rpart$auc)


all_results <- data.frame(rbind(result_rf, result_xgb, result_knn, result_glm, result_rpart))
names(all_results) <- c("Sensitivity", "Specificity", "Precision", "Recall", "F1", "AUC")
all_results
##              Sensitivity Specificity  Precision     Recall         F1       AUC
## result_rf     0.00000000   1.0000000         NA 0.00000000         NA 0.6945668
## result_xgb    0.00000000   1.0000000         NA 0.00000000         NA 0.7060812
## result_knn    0.00000000   1.0000000         NA 0.00000000         NA 0.6078240
## result_glm    0.00000000   1.0000000         NA 0.00000000         NA 0.7353873
## result_rpart  0.03053435   0.9793765 0.08510638 0.03053435 0.04494382 0.6461164

8. Visualization to compare accuracy of ML models

col <- c("#ed3b3b", "#0099ff")

graphics::fourfoldplot(cm_rf$table, color = col, conf.level = 0.95, margin = 1, 
                       main = paste("Random Forest Accuracy(",round(cm_rf$overall[1]*100),"%)", sep = ""))

graphics::fourfoldplot(cm_xgb$table, color = col, conf.level = 0.95, margin = 1, 
                       main = paste("XGB Accuracy(",round(cm_xgb$overall[1]*100),"%)", sep = ""))

graphics::fourfoldplot(cm_knn$table, color = col, conf.level = 0.95, margin = 1, 
                       main = paste("KNN Accuracy(",round(cm_knn$overall[1]*100),"%)", sep = ""))

graphics::fourfoldplot(cm_glm$table, color = col, conf.level = 0.95, margin = 1, 
                       main = paste("Logistic Regression Accuracy(",round(cm_glm$overall[1]*100),"%)", sep = ""))

graphics::fourfoldplot(cm_rpart$table, color = col, conf.level = 0.95, margin = 1, 
                       main = paste("Rpart DT Accuracy(",round(cm_rpart$overall[1]*100),"%)", sep = ""))

https://rpubs.com/Mayank7j_2020/PimaIndiansDiabetesData