library(readxl)
library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

library(ggplot2)

1. Import data

df <- read_excel("~/Desktop/Misc/Clean data.xlsx")
table1(~ age + sex + bmi + sbp_mean + dbp_mean + tob + exer + kaidan + bs + non_hdl + hyper + 
         diabetes_over_boundary + MetS  | target_st, df)

## Warning in table1.formula(~age + sex + bmi + sbp_mean + dbp_mean + tob + : Terms
## to the right of '|' in formula 'x' define table columns and are expected to be
## factors with meaningful labels.

	0 (N=6951)	1 (N=438)	Overall (N=7389)
age
Mean (SD)	54.6 (13.1)	64.4 (9.78)	55.1 (13.1)
Median [Min, Max]	55.0 [30.0, 84.0]	66.0 [32.0, 80.0]	56.0 [30.0, 84.0]
sex
Mean (SD)	0.548 (0.498)	0.466 (0.499)	0.543 (0.498)
Median [Min, Max]	1.00 [0, 1.00]	0 [0, 1.00]	1.00 [0, 1.00]
bmi
Mean (SD)	22.5 (3.00)	23.1 (3.21)	22.5 (3.01)
Median [Min, Max]	22.3 [14.2, 30.5]	23.0 [14.2, 30.5]	22.3 [14.2, 30.5]
sbp_mean
Mean (SD)	126 (20.5)	138 (21.9)	126 (20.8)
Median [Min, Max]	123 [78.0, 180]	137 [86.0, 180]	124 [78.0, 180]
dbp_mean
Mean (SD)	77.4 (11.7)	81.0 (12.5)	77.6 (11.8)
Median [Min, Max]	77.0 [47.5, 108]	81.0 [47.5, 108]	77.0 [47.5, 108]
tob
Mean (SD)	2.27 (0.879)	2.16 (0.882)	2.26 (0.879)
Median [Min, Max]	3.00 [1.00, 3.00]	2.00 [1.00, 3.00]	3.00 [1.00, 3.00]
exer
Mean (SD)	1.61 (0.488)	1.60 (0.491)	1.61 (0.488)
Median [Min, Max]	2.00 [1.00, 2.00]	2.00 [1.00, 2.00]	2.00 [1.00, 2.00]
kaidan
Mean (SD)	3.35 (1.26)	3.39 (1.27)	3.35 (1.26)
Median [Min, Max]	3.00 [1.00, 5.00]	3.00 [1.00, 5.00]	3.00 [1.00, 5.00]
bs
Mean (SD)	96.1 (9.78)	100 (11.0)	96.4 (9.90)
Median [Min, Max]	95.0 [72.0, 120]	98.0 [72.0, 120]	95.0 [72.0, 120]
non_hdl
Mean (SD)	152 (36.1)	159 (36.1)	152 (36.1)
Median [Min, Max]	150 [53.5, 250]	155 [78.0, 250]	150 [53.5, 250]
hyper
Mean (SD)	0.295 (0.456)	0.550 (0.498)	0.311 (0.463)
Median [Min, Max]	0 [0, 1.00]	1.00 [0, 1.00]	0 [0, 1.00]
diabetes_over_boundary
Mean (SD)	0.115 (0.319)	0.228 (0.420)	0.122 (0.327)
Median [Min, Max]	0 [0, 1.00]	0 [0, 1.00]	0 [0, 1.00]
MetS
Mean (SD)	0.234 (0.424)	0.413 (0.493)	0.245 (0.430)
Median [Min, Max]	0 [0, 1.00]	0 [0, 1.00]	0 [0, 1.00]

2. Rename by pipeline

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ tibble  3.1.8     ✔ purrr   0.3.4
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

df2 <- df %>% rename(stroke = target_st, sbp = sbp_mean, dbp = dbp_mean, 
                     eGFR = egfr_ckd_epi, smsts = tob, hypertension = hyper,
                     DM = diabetes_over_boundary)

3. Convert to factor

df2$sex = as.factor(df2$sex)
df2$smsts = as.factor(df2$smsts)
df2$exer = as.factor(df2$exer)
df2$kaidan = as.factor(df2$kaidan)
df2$hypertension = as.factor(df2$hypertension)
df2$DM = as.factor(df2$DM)
df2$MetS = as.factor(df2$MetS)
df2$stroke = as.factor(df2$stroke)

4. Exploratory data analysis

4.1 Stroke Distribution

ggplot(df2, aes(stroke, fill = stroke)) + 
  geom_bar() +
  theme_bw() +
  labs(title = "Stroke Classification", x = "Stroke") +
  theme(plot.title = element_text(hjust = 0.5))

library(ggthemes) # Or
ggplot(data = df2, aes(x = stroke, fill =stroke)) + geom_bar() + 
  theme_economist()+ labs(x = "Stroke") + guides(fill = FALSE) # remove the legend

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.

4.2 Correlation Matrix

# Selecting numerical variables
sel = df2[, c("age", "sbp", "dbp", "eGFR", "non_hdl", "waist", "wt20", "bs", "ua",
          "rbc", "wbc", "tg", "cpk", "alb", "bmi",  "bf", "smsts", "DM", 
          "hypertension", "MetS", "sleep_conv", "stroke", "fh1_yn", "fh5_yn", "fh246_yn")]

# Let’s now see the correlation between all the numerical variables in the dataset.
# Correlation Matrix can be ploted as follows.
cor_data <- cor(sel[,setdiff(names(sel), c("smsts", "DM", 
                             "hypertension", "MetS", "sleep_conv", "stroke", "fh1_yn", "fh5_yn", "fh246_yn"))])
# Numerical Correlation Matrix
cor_data

##                 age         sbp         dbp         eGFR     non_hdl
## age      1.00000000  0.45366897  0.18612989 -0.579516831  0.19953932
## sbp      0.45366897  1.00000000  0.71805544 -0.263306152  0.17256589
## dbp      0.18612989  0.71805544  1.00000000 -0.103267334  0.18020565
## eGFR    -0.57951683 -0.26330615 -0.10326733  1.000000000 -0.18807449
## non_hdl  0.19953932  0.17256589  0.18020565 -0.188074491  1.00000000
## waist    0.21651029  0.29534598  0.32680903 -0.096791475  0.26398393
## wt20    -0.05240172  0.04441302  0.13741367 -0.028136056 -0.06793941
## bs       0.21440422  0.25841835  0.20235615 -0.148171238  0.15430376
## ua       0.05595851  0.15541011  0.20913662 -0.256085361  0.12815722
## rbc     -0.20778253  0.03968653  0.22601528  0.096637557  0.13546604
## wbc     -0.02313799  0.05461379  0.08902853  0.025087005  0.10310387
## tg       0.13471841  0.20629392  0.22859959 -0.115454102  0.44128322
## cpk      0.03765889  0.08233121  0.10791172 -0.125058245  0.07561372
## alb     -0.15991809  0.04486632  0.12655206  0.040265944  0.18012810
## bmi      0.04024757  0.24439570  0.32318478 -0.055370209  0.27467909
## bf       0.01213702  0.08623060  0.08001366  0.001783862  0.27634262
##               waist        wt20          bs          ua         rbc         wbc
## age      0.21651029 -0.05240172  0.21440422  0.05595851 -0.20778253 -0.02313799
## sbp      0.29534598  0.04441302  0.25841835  0.15541011  0.03968653  0.05461379
## dbp      0.32680903  0.13741367  0.20235615  0.20913662  0.22601528  0.08902853
## eGFR    -0.09679147 -0.02813606 -0.14817124 -0.25608536  0.09663756  0.02508701
## non_hdl  0.26398393 -0.06793941  0.15430376  0.12815722  0.13546604  0.10310387
## waist    1.00000000  0.34386258  0.27248381  0.30856125  0.27768115  0.16982907
## wt20     0.34386258  1.00000000  0.11590218  0.35687646  0.33807759  0.12891520
## bs       0.27248381  0.11590218  1.00000000  0.16394421  0.17797831  0.12684171
## ua       0.30856125  0.35687646  0.16394421  1.00000000  0.33443117  0.18045482
## rbc      0.27768115  0.33807759  0.17797831  0.33443117  1.00000000  0.25557041
## wbc      0.16982907  0.12891520  0.12684171  0.18045482  0.25557041  1.00000000
## tg       0.38905550  0.14433729  0.21925995  0.31238166  0.25782755  0.25416920
## cpk      0.12534234  0.19506513  0.03473554  0.18168435  0.06588585 -0.02645865
## alb      0.05391083  0.01783037  0.08891826  0.13676795  0.29780954  0.08792599
## bmi      0.78994573  0.29863394  0.23133388  0.25821915  0.26084257  0.13673400
## bf       0.31383574 -0.28783607  0.01572509 -0.12479430 -0.09784038 -0.02406466
##                  tg         cpk         alb         bmi           bf
## age      0.13471841  0.03765889 -0.15991809  0.04024757  0.012137023
## sbp      0.20629392  0.08233121  0.04486632  0.24439570  0.086230598
## dbp      0.22859959  0.10791172  0.12655206  0.32318478  0.080013662
## eGFR    -0.11545410 -0.12505824  0.04026594 -0.05537021  0.001783862
## non_hdl  0.44128322  0.07561372  0.18012810  0.27467909  0.276342620
## waist    0.38905550  0.12534234  0.05391083  0.78994573  0.313835741
## wt20     0.14433729  0.19506513  0.01783037  0.29863394 -0.287836068
## bs       0.21925995  0.03473554  0.08891826  0.23133388  0.015725091
## ua       0.31238166  0.18168435  0.13676795  0.25821915 -0.124794302
## rbc      0.25782755  0.06588585  0.29780954  0.26084257 -0.097840382
## wbc      0.25416920 -0.02645865  0.08792599  0.13673400 -0.024064664
## tg       1.00000000  0.02314921  0.12003293  0.34314220  0.085053440
## cpk      0.02314921  1.00000000  0.07703153  0.14132311 -0.152825939
## alb      0.12003293  0.07703153  1.00000000  0.09603130  0.100237271
## bmi      0.34314220  0.14132311  0.09603130  1.00000000  0.448686121
## bf       0.08505344 -0.15282594  0.10023727  0.44868612  1.000000000

# Correlation matrix plots
corrplot::corrplot(cor_data)

corrplot::corrplot(cor_data, type = "upper", method = "number")

corrplot::corrplot(cor_data, type = "upper", method = "pie")

4.3 Univariate Analysis

#
sel2 = df2[, c("age", "sbp", "dbp", "eGFR", "non_hdl", "waist", "wt20", "bs", "ua",
               "rbc", "wbc", "tg", "cpk", "alb", "bmi",  "bf", "stroke")]

library(gridExtra) #Multiple plot in single grip space

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

sel2 = as.data.frame(sel2)
univar_graph <- function(univar_name, univar, data, output_var) {
  g_1 <- ggplot(data, aes(x=univar)) + 
    geom_density() + 
    xlab(univar_name) + 
    theme_bw()
  g_2 <- ggplot(data, aes(x=univar, fill=output_var)) + 
    geom_density(alpha=0.4) + 
    xlab(univar_name) + 
    theme_bw()
  gridExtra::grid.arrange(g_1, g_2, ncol=2, top = paste(univar_name,"variable", "/ [ Skew:",timeDate::skewness(univar),"]"))
}
for (x in 1:(ncol(sel2)-1)) {
  univar_graph(univar_name = names(sel2)[x], univar = sel2[,x], data = sel2, output_var = sel2[,'stroke'])
}

5. Machine Learning preparation

# Splitting dataset
partition <- caret::createDataPartition(y = df2$stroke, times = 1, p = 0.7, list = FALSE)
# create training data set
train_set <- df2[partition,]

# create testing data set, subtracting the rows partition to get remaining 30% of the data
test_set <- df2[-partition,]

5.1 Random Forest Model

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

# K-folds method
# Convert target variable to factor with two levels
train_set$stroke <- factor(train_set$stroke, levels = c("0", "1"))
test_set$stroke <- factor(test_set$stroke, levels = c("0", "1"))

# Convert level names to valid R variable names
levels(train_set$stroke) <- make.names(levels(train_set$stroke))
levels(test_set$stroke) <- make.names(levels(test_set$stroke))

model_forest <- caret::train(stroke ~., data = train_set,
                             method = "ranger",
                             metric = "ROC",
                             trControl = trainControl(method = "cv", number = 10,
                                                      classProbs = T, summaryFunction = twoClassSummary),
                             preProcess = c("center","scale","pca"))
model_forest

## Random Forest 
## 
## 5173 samples
##   56 predictor
##    2 classes: 'X0', 'X1' 
## 
## Pre-processing: centered (60), scaled (60), principal component
##  signal extraction (60) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4655, 4656, 4656, 4656, 4655, 4656, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   ROC        Sens       Spec
##    2    gini        0.6791999  1.0000000  0   
##    2    extratrees  0.7011567  1.0000000  0   
##   24    gini        0.6707406  0.9993827  0   
##   24    extratrees  0.6968904  1.0000000  0   
##   46    gini        0.6667594  0.9993831  0   
##   46    extratrees  0.6931750  1.0000000  0   
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
##  and min.node.size = 1.

# final ROC Value
model_forest$results[6,4]

## [1] 0.693175

5.2 XGBOOST - eXtreme Gradient BOOSTing

xgb_grid_1  <-  expand.grid(
  nrounds = 50,
  eta = c(0.03),
  max_depth = 1,
  gamma = 0,
  colsample_bytree = 0.6,
  min_child_weight = 1,
  subsample = 0.5
)

model_xgb <- caret::train(stroke ~., data = train_set,
                          method = "xgbTree",
                          metric = "ROC",
                          tuneGrid=xgb_grid_1,
                          trControl = trainControl(method = "cv", number = 10,
                                                   classProbs = T, summaryFunction = twoClassSummary),
                          preProcess = c("center","scale","pca"))
model_xgb

## eXtreme Gradient Boosting 
## 
## 5173 samples
##   56 predictor
##    2 classes: 'X0', 'X1' 
## 
## Pre-processing: centered (60), scaled (60), principal component
##  signal extraction (60) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4655, 4655, 4656, 4657, 4655, 4656, ... 
## Resampling results:
## 
##   ROC        Sens  Spec
##   0.6955346  1     0   
## 
## Tuning parameter 'nrounds' was held constant at a value of 50
## Tuning
##  held constant at a value of 1
## Tuning parameter 'subsample' was held
##  constant at a value of 0.5

# final ROC value
model_xgb$results["ROC"]

##         ROC
## 1 0.6955346

5.3 KNN - K Nearest Neighbours

model_knn <- caret::train(stroke ~., data = train_set,
                          method = "knn",
                          metric = "ROC",
                          tuneGrid = expand.grid(.k = c(3:10)),
                          trControl = trainControl(method = "cv", number = 10,
                                                   classProbs = T, summaryFunction = twoClassSummary),
                          preProcess = c("center","scale","pca"))

model_knn

## k-Nearest Neighbors 
## 
## 5173 samples
##   56 predictor
##    2 classes: 'X0', 'X1' 
## 
## Pre-processing: centered (60), scaled (60), principal component
##  signal extraction (60) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4656, 4655, 4656, 4657, 4655, 4655, ... 
## Resampling results across tuning parameters:
## 
##   k   ROC        Sens       Spec       
##    3  0.5353313  0.9907517  0.019677419
##    4  0.5493395  0.9903402  0.016451613
##    5  0.5587449  0.9977387  0.000000000
##    6  0.5670182  0.9975342  0.003333333
##    7  0.5773121  0.9985622  0.000000000
##    8  0.5891223  0.9993836  0.003225806
##    9  0.5911604  0.9995889  0.000000000
##   10  0.5967023  0.9991778  0.000000000
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 10.

#final ROC value
model_knn$results[8,2]

## [1] 0.5967023

5.4 Logistic Regression

model_glm <- caret::train(stroke ~., data = train_set,
                          method = "glm",
                          metric = "ROC",
                          tuneLength = 10,
                          trControl = trainControl(method = "cv", number = 10,
                                                   classProbs = T, summaryFunction = twoClassSummary),
                          preProcess = c("center","scale","pca"))

model_glm

## Generalized Linear Model 
## 
## 5173 samples
##   56 predictor
##    2 classes: 'X0', 'X1' 
## 
## Pre-processing: centered (60), scaled (60), principal component
##  signal extraction (60) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4656, 4657, 4656, 4656, 4655, 4656, ... 
## Resampling results:
## 
##   ROC        Sens  Spec       
##   0.7343427  1     0.003225806

#final ROC Value
model_glm$results[2]

##         ROC
## 1 0.7343427

5.5 Rpart CART - classification and Regression Trees

library(rpart.plot) #CART Decision Tree

## Loading required package: rpart

model_rpart <- caret::train(stroke ~., data = train_set,
                            method = "rpart",
                            metric = "ROC",
                            tuneLength = 20,
                            trControl = trainControl(method = "cv", number = 10,
                                                     classProbs = T, summaryFunction = twoClassSummary))

model_rpart

## CART 
## 
## 5173 samples
##   56 predictor
##    2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4655, 4655, 4655, 4657, 4657, 4656, ... 
## Resampling results across tuning parameters:
## 
##   cp            ROC        Sens       Spec      
##   0.0000000000  0.6560288  0.9734906  0.04258065
##   0.0001714384  0.6560288  0.9734906  0.04258065
##   0.0003428767  0.6560288  0.9734906  0.04258065
##   0.0005143151  0.6530779  0.9734906  0.04258065
##   0.0006857535  0.6527347  0.9736964  0.04258065
##   0.0008571918  0.6583681  0.9747235  0.04258065
##   0.0010286302  0.6548404  0.9763674  0.04258065
##   0.0012000686  0.6548404  0.9763674  0.04258065
##   0.0013715069  0.6666735  0.9773958  0.04258065
##   0.0015429453  0.6666735  0.9773958  0.04258065
##   0.0017143837  0.6587505  0.9780118  0.03602151
##   0.0018858220  0.6565074  0.9812998  0.03602151
##   0.0020572604  0.6565074  0.9812998  0.03602151
##   0.0022286988  0.6523509  0.9812998  0.03602151
##   0.0024001372  0.6523509  0.9812998  0.03602151
##   0.0025715755  0.6540987  0.9821220  0.03602151
##   0.0027430139  0.6540987  0.9821220  0.03602151
##   0.0029144523  0.6548507  0.9825331  0.03602151
##   0.0030858906  0.6548507  0.9825331  0.03602151
##   0.0032573290  0.6541354  0.9831491  0.03602151
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.001542945.

# Plot model accuracy vs different values of cp (complexity parameter)
plot(model_rpart)

# Best Model Cp (Complexity Parameter)
model_rpart$bestTune

##             cp
## 10 0.001542945

# Structure of final model selected
model_rpart$finalModel

## n= 5173 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##    1) root 5173 307 X0 (0.94065339 0.05934661)  
##      2) age< 55.5 2553  56 X0 (0.97806502 0.02193498) *
##      3) age>=55.5 2620 251 X0 (0.90419847 0.09580153)  
##        6) upr_nega>=0.5 2110 176 X0 (0.91658768 0.08341232)  
##         12) age< 62.5 761  38 X0 (0.95006570 0.04993430) *
##         13) age>=62.5 1349 138 X0 (0.89770200 0.10229800)  
##           26) pulse_mean< 87.5 1259 118 X0 (0.90627482 0.09372518)  
##             52) id>=821.5 1242 111 X0 (0.91062802 0.08937198)  
##              104) hg< 14.05 760  51 X0 (0.93289474 0.06710526)  
##                208) waist< 100.5 732  44 X0 (0.93989071 0.06010929)  
##                  416) ca>=8.85 667  33 X0 (0.95052474 0.04947526) *
##                  417) ca< 8.85 65  11 X0 (0.83076923 0.16923077)  
##                    834) sbp>=108 55   5 X0 (0.90909091 0.09090909) *
##                    835) sbp< 108 10   4 X1 (0.40000000 0.60000000) *
##                209) waist>=100.5 28   7 X0 (0.75000000 0.25000000)  
##                  418) hg< 13.45 18   1 X0 (0.94444444 0.05555556) *
##                  419) hg>=13.45 10   4 X1 (0.40000000 0.60000000) *
##              105) hg>=14.05 482  60 X0 (0.87551867 0.12448133)  
##                210) elbow>=6.1335 399  40 X0 (0.89974937 0.10025063)  
##                  420) frct< 308.25 379  33 X0 (0.91292876 0.08707124)  
##                    840) id>=51006.5 210   9 X0 (0.95714286 0.04285714) *
##                    841) id< 51006.5 169  24 X0 (0.85798817 0.14201183)  
##                     1682) ua>=5.75 95   7 X0 (0.92631579 0.07368421)  
##                       3364) hg< 15.75 73   2 X0 (0.97260274 0.02739726) *
##                       3365) hg>=15.75 22   5 X0 (0.77272727 0.22727273)  
##                         6730) dbp< 83.5 15   1 X0 (0.93333333 0.06666667) *
##                         6731) dbp>=83.5 7   3 X1 (0.42857143 0.57142857) *
##                     1683) ua< 5.75 74  17 X0 (0.77027027 0.22972973)  
##                       3366) eGFR>=71.07033 33   3 X0 (0.90909091 0.09090909) *
##                       3367) eGFR< 71.07033 41  14 X0 (0.65853659 0.34146341)  
##                         6734) wbc< 6.915 32   8 X0 (0.75000000 0.25000000) *
##                         6735) wbc>=6.915 9   3 X1 (0.33333333 0.66666667) *
##                  421) frct>=308.25 20   7 X0 (0.65000000 0.35000000)  
##                    842) bmi>=23.64297 10   0 X0 (1.00000000 0.00000000) *
##                    843) bmi< 23.64297 10   3 X1 (0.30000000 0.70000000) *
##                211) elbow< 6.1335 83  20 X0 (0.75903614 0.24096386)  
##                  422) bmi< 22.64269 52   7 X0 (0.86538462 0.13461538)  
##                    844) wbc>=3.965 45   3 X0 (0.93333333 0.06666667) *
##                    845) wbc< 3.965 7   3 X1 (0.42857143 0.57142857) *
##                  423) bmi>=22.64269 31  13 X0 (0.58064516 0.41935484)  
##                    846) bf>=23.7 22   6 X0 (0.72727273 0.27272727)  
##                     1692) bs< 98 11   0 X0 (1.00000000 0.00000000) *
##                     1693) bs>=98 11   5 X1 (0.45454545 0.54545455) *
##                    847) bf< 23.7 9   2 X1 (0.22222222 0.77777778) *
##             53) id< 821.5 17   7 X0 (0.58823529 0.41176471) *
##           27) pulse_mean>=87.5 90  20 X0 (0.77777778 0.22222222)  
##             54) sbp>=150.5 31   1 X0 (0.96774194 0.03225806) *
##             55) sbp< 150.5 59  19 X0 (0.67796610 0.32203390)  
##              110) wt20>=51.925 19   1 X0 (0.94736842 0.05263158) *
##              111) wt20< 51.925 40  18 X0 (0.55000000 0.45000000)  
##                222) age< 73.5 30  10 X0 (0.66666667 0.33333333)  
##                  444) ztt>=11.85 9   0 X0 (1.00000000 0.00000000) *
##                  445) ztt< 11.85 21  10 X0 (0.52380952 0.47619048)  
##                    890) ztt< 8.65 13   3 X0 (0.76923077 0.23076923) *
##                    891) ztt>=8.65 8   1 X1 (0.12500000 0.87500000) *
##                223) age>=73.5 10   2 X1 (0.20000000 0.80000000) *
##        7) upr_nega< 0.5 510  75 X0 (0.85294118 0.14705882)  
##         14) sbp< 156.5 396  43 X0 (0.89141414 0.10858586) *
##         15) sbp>=156.5 114  32 X0 (0.71929825 0.28070175)  
##           30) eGFR< 79.96992 97  22 X0 (0.77319588 0.22680412)  
##             60) eGFR>=48.67081 78  13 X0 (0.83333333 0.16666667)  
##              120) non_hdl< 159.5 32   1 X0 (0.96875000 0.03125000) *
##              121) non_hdl>=159.5 46  12 X0 (0.73913043 0.26086957)  
##                242) id< 14772 18   1 X0 (0.94444444 0.05555556) *
##                243) id>=14772 28  11 X0 (0.60714286 0.39285714)  
##                  486) got< 22.5 18   4 X0 (0.77777778 0.22222222) *
##                  487) got>=22.5 10   3 X1 (0.30000000 0.70000000) *
##             61) eGFR< 48.67081 19   9 X0 (0.52631579 0.47368421) *
##           31) eGFR>=79.96992 17   7 X1 (0.41176471 0.58823529) *

# Should have refactored diabetes to make pos as the primary factor level to get straight forward decision tree
rpart.plot::rpart.plot(model_rpart$finalModel, type = 2, fallen.leaves = T, extra = 2, cex = 0.70)

# Training Data set - Model Comparision by ROC value
model_list <- list(Random_Forest = model_forest, XGBoost = model_xgb, KNN = model_knn, Logistic_Regression = model_glm, Rpart_DT = model_rpart)
resamples <- resamples(model_list)

#box plot
bwplot(resamples, metric="ROC")

#dot plot
dotplot(resamples, metric="ROC")

6. Predition on Test Data Set

6.1 Random Forest

# prediction on Test data set
pred_rf <- predict(model_forest, test_set)
# Confusion Matrix 
cm_rf <- confusionMatrix(pred_rf, test_set$stroke, positive="X1")

# Prediction Probabilities
pred_prob_rf <- predict(model_forest, test_set, type="prob")

# ROC value
library(pROC) #ROC

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

roc_rf <- roc(test_set$stroke, pred_prob_rf$X1)

## Setting levels: control = X0, case = X1

## Setting direction: controls < cases

# Confusion Matrix for Random Forest Model
cm_rf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 2085  131
##         X1    0    0
##                                           
##                Accuracy : 0.9409          
##                  95% CI : (0.9302, 0.9503)
##     No Information Rate : 0.9409          
##     P-Value [Acc > NIR] : 0.5232          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.94088         
##              Prevalence : 0.05912         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : X1              
##

# ROC Value for Random Forest
roc_rf

## 
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_rf$X1)
## 
## Data: pred_prob_rf$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.6946

# AUC - Area under the curve
caTools::colAUC(pred_prob_rf$X1, test_set$stroke, plotROC = T)

##                [,1]
## X0 vs. X1 0.6945668

6.2 XGBOOST - eXtreme Gradient BOOSTing

# prediction on Test data set
pred_xgb <- predict(model_xgb, test_set)
# Confusion Matrix 
cm_xgb <- confusionMatrix(pred_xgb, test_set$stroke, positive="X1")

# Prediction Probabilities
pred_prob_xgb <- predict(model_xgb, test_set, type="prob")
# ROC value
roc_xgb <- roc(test_set$stroke, pred_prob_xgb$X1)

## Setting levels: control = X0, case = X1

## Setting direction: controls < cases

# Confusion matrix 
cm_xgb

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 2085  131
##         X1    0    0
##                                           
##                Accuracy : 0.9409          
##                  95% CI : (0.9302, 0.9503)
##     No Information Rate : 0.9409          
##     P-Value [Acc > NIR] : 0.5232          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.94088         
##              Prevalence : 0.05912         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : X1              
##

# ROC Value for for XGBoost
roc_xgb

## 
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_xgb$X1)
## 
## Data: pred_prob_xgb$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.7061

# AUC - Area under the curve
caTools::colAUC(pred_prob_xgb$X1, test_set$stroke, plotROC = T)

##                [,1]
## X0 vs. X1 0.7060812

6.3 KNN - K Nearest Neighbours

# prediction on Test data set
pred_knn <- predict(model_knn, test_set)
# Confusion Matrix 
cm_knn <- confusionMatrix(pred_knn, test_set$stroke, positive="X1")

# Prediction Probabilities
pred_prob_knn <- predict(model_knn, test_set, type="prob")
# ROC value
roc_knn <- roc(test_set$stroke, pred_prob_knn$X1)

## Setting levels: control = X0, case = X1

## Setting direction: controls < cases

# Confusion matrix 
cm_knn

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 2085  131
##         X1    0    0
##                                           
##                Accuracy : 0.9409          
##                  95% CI : (0.9302, 0.9503)
##     No Information Rate : 0.9409          
##     P-Value [Acc > NIR] : 0.5232          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.94088         
##              Prevalence : 0.05912         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : X1              
##

# ROC Value for for XGBoost
roc_knn

## 
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_knn$X1)
## 
## Data: pred_prob_knn$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.6078

# AUC - Area under the curve
caTools::colAUC(pred_prob_knn$X1, test_set$stroke, plotROC = T)

##               [,1]
## X0 vs. X1 0.607824

6.4 Logistic Regression

# prediction on Test data set
pred_glm <- predict(model_glm, test_set)
# Confusion Matrix 
cm_glm <- confusionMatrix(pred_glm, test_set$stroke, positive="X1")

# Prediction Probabilities
pred_prob_glm <- predict(model_glm, test_set, type="prob")
# ROC value
roc_glm <- roc(test_set$stroke, pred_prob_glm$X1)

## Setting levels: control = X0, case = X1

## Setting direction: controls < cases

# Confusion matrix 
cm_glm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 2085  131
##         X1    0    0
##                                           
##                Accuracy : 0.9409          
##                  95% CI : (0.9302, 0.9503)
##     No Information Rate : 0.9409          
##     P-Value [Acc > NIR] : 0.5232          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.94088         
##              Prevalence : 0.05912         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : X1              
##

roc_glm

## 
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_glm$X1)
## 
## Data: pred_prob_glm$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.7354

# AUC - Area under the curve
caTools::colAUC(pred_prob_glm$X1, test_set$stroke, plotROC = T)

##                [,1]
## X0 vs. X1 0.7353873

6.5 Rpart CART - classification and Regression Trees

# prediction on Test data set
pred_rpart <- predict(model_rpart, test_set)
# Confusion Matrix 
cm_rpart <- confusionMatrix(pred_rpart, test_set$stroke, positive="X1")

# Prediction Probabilities
pred_prob_rpart <- predict(model_rpart, test_set, type="prob")
# ROC value
roc_rpart <- roc(test_set$stroke, pred_prob_rpart$X1)

## Setting levels: control = X0, case = X1

## Setting direction: controls < cases

# Confusion matrix 
cm_rpart

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 2042  127
##         X1   43    4
##                                          
##                Accuracy : 0.9233         
##                  95% CI : (0.9114, 0.934)
##     No Information Rate : 0.9409         
##     P-Value [Acc > NIR] : 0.9997         
##                                          
##                   Kappa : 0.0142         
##                                          
##  Mcnemar's Test P-Value : 1.943e-10      
##                                          
##             Sensitivity : 0.030534       
##             Specificity : 0.979376       
##          Pos Pred Value : 0.085106       
##          Neg Pred Value : 0.941448       
##              Prevalence : 0.059116       
##          Detection Rate : 0.001805       
##    Detection Prevalence : 0.021209       
##       Balanced Accuracy : 0.504955       
##                                          
##        'Positive' Class : X1             
##

# ROC Value for for XGBoost
roc_rpart

## 
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_rpart$X1)
## 
## Data: pred_prob_rpart$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.6461

# AUC - Area under the curve
caTools::colAUC(pred_prob_rpart$X1, test_set$stroke, plotROC = T)

##                [,1]
## X0 vs. X1 0.6461164

7. Test Set Results Comparision

result_rf <- c(cm_rf$byClass['Sensitivity'], cm_rf$byClass['Specificity'], cm_rf$byClass['Precision'], 
               cm_rf$byClass['Recall'], cm_rf$byClass['F1'], roc_rf$auc)

result_xgb <- c(cm_xgb$byClass['Sensitivity'], cm_xgb$byClass['Specificity'], cm_xgb$byClass['Precision'], 
                cm_xgb$byClass['Recall'], cm_xgb$byClass['F1'], roc_xgb$auc)

result_knn <- c(cm_knn$byClass['Sensitivity'], cm_knn$byClass['Specificity'], cm_knn$byClass['Precision'], 
                cm_knn$byClass['Recall'], cm_knn$byClass['F1'], roc_knn$auc)

result_glm <- c(cm_glm$byClass['Sensitivity'], cm_glm$byClass['Specificity'], cm_glm$byClass['Precision'], 
                cm_glm$byClass['Recall'], cm_glm$byClass['F1'], roc_glm$auc)

result_rpart <- c(cm_rpart$byClass['Sensitivity'], cm_rpart$byClass['Specificity'], cm_rpart$byClass['Precision'], 
                  cm_rpart$byClass['Recall'], cm_rpart$byClass['F1'], roc_rpart$auc)


all_results <- data.frame(rbind(result_rf, result_xgb, result_knn, result_glm, result_rpart))
names(all_results) <- c("Sensitivity", "Specificity", "Precision", "Recall", "F1", "AUC")
all_results

##              Sensitivity Specificity  Precision     Recall         F1       AUC
## result_rf     0.00000000   1.0000000         NA 0.00000000         NA 0.6945668
## result_xgb    0.00000000   1.0000000         NA 0.00000000         NA 0.7060812
## result_knn    0.00000000   1.0000000         NA 0.00000000         NA 0.6078240
## result_glm    0.00000000   1.0000000         NA 0.00000000         NA 0.7353873
## result_rpart  0.03053435   0.9793765 0.08510638 0.03053435 0.04494382 0.6461164

8. Visualization to compare accuracy of ML models

col <- c("#ed3b3b", "#0099ff")

graphics::fourfoldplot(cm_rf$table, color = col, conf.level = 0.95, margin = 1, 
                       main = paste("Random Forest Accuracy(",round(cm_rf$overall[1]*100),"%)", sep = ""))

graphics::fourfoldplot(cm_xgb$table, color = col, conf.level = 0.95, margin = 1, 
                       main = paste("XGB Accuracy(",round(cm_xgb$overall[1]*100),"%)", sep = ""))

graphics::fourfoldplot(cm_knn$table, color = col, conf.level = 0.95, margin = 1, 
                       main = paste("KNN Accuracy(",round(cm_knn$overall[1]*100),"%)", sep = ""))

graphics::fourfoldplot(cm_glm$table, color = col, conf.level = 0.95, margin = 1, 
                       main = paste("Logistic Regression Accuracy(",round(cm_glm$overall[1]*100),"%)", sep = ""))

graphics::fourfoldplot(cm_rpart$table, color = col, conf.level = 0.95, margin = 1, 
                       main = paste("Rpart DT Accuracy(",round(cm_rpart$overall[1]*100),"%)", sep = ""))

https://rpubs.com/Mayank7j_2020/PimaIndiansDiabetesData

Machine Learning for Su data set

Vu Thien

2023-03-14

1. Import data

2. Rename by pipeline

3. Convert to factor

4. Exploratory data analysis

4.1 Stroke Distribution

4.2 Correlation Matrix

4.3 Univariate Analysis

5. Machine Learning preparation

5.1 Random Forest Model

5.2 XGBOOST - eXtreme Gradient BOOSTing

5.3 KNN - K Nearest Neighbours

5.4 Logistic Regression

5.5 Rpart CART - classification and Regression Trees

6. Predition on Test Data Set

6.1 Random Forest

6.2 XGBOOST - eXtreme Gradient BOOSTing

6.3 KNN - K Nearest Neighbours

6.4 Logistic Regression

6.5 Rpart CART - classification and Regression Trees

7. Test Set Results Comparision

8. Visualization to compare accuracy of ML models

https://rpubs.com/Mayank7j_2020/PimaIndiansDiabetesData