library(readxl)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(ggplot2)
df <- read_excel("~/Desktop/Misc/Clean data.xlsx")
table1(~ age + sex + bmi + sbp_mean + dbp_mean + tob + exer + kaidan + bs + non_hdl + hyper +
diabetes_over_boundary + MetS | target_st, df)
## Warning in table1.formula(~age + sex + bmi + sbp_mean + dbp_mean + tob + : Terms
## to the right of '|' in formula 'x' define table columns and are expected to be
## factors with meaningful labels.
| 0 (N=6951) |
1 (N=438) |
Overall (N=7389) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 54.6 (13.1) | 64.4 (9.78) | 55.1 (13.1) |
| Median [Min, Max] | 55.0 [30.0, 84.0] | 66.0 [32.0, 80.0] | 56.0 [30.0, 84.0] |
| sex | |||
| Mean (SD) | 0.548 (0.498) | 0.466 (0.499) | 0.543 (0.498) |
| Median [Min, Max] | 1.00 [0, 1.00] | 0 [0, 1.00] | 1.00 [0, 1.00] |
| bmi | |||
| Mean (SD) | 22.5 (3.00) | 23.1 (3.21) | 22.5 (3.01) |
| Median [Min, Max] | 22.3 [14.2, 30.5] | 23.0 [14.2, 30.5] | 22.3 [14.2, 30.5] |
| sbp_mean | |||
| Mean (SD) | 126 (20.5) | 138 (21.9) | 126 (20.8) |
| Median [Min, Max] | 123 [78.0, 180] | 137 [86.0, 180] | 124 [78.0, 180] |
| dbp_mean | |||
| Mean (SD) | 77.4 (11.7) | 81.0 (12.5) | 77.6 (11.8) |
| Median [Min, Max] | 77.0 [47.5, 108] | 81.0 [47.5, 108] | 77.0 [47.5, 108] |
| tob | |||
| Mean (SD) | 2.27 (0.879) | 2.16 (0.882) | 2.26 (0.879) |
| Median [Min, Max] | 3.00 [1.00, 3.00] | 2.00 [1.00, 3.00] | 3.00 [1.00, 3.00] |
| exer | |||
| Mean (SD) | 1.61 (0.488) | 1.60 (0.491) | 1.61 (0.488) |
| Median [Min, Max] | 2.00 [1.00, 2.00] | 2.00 [1.00, 2.00] | 2.00 [1.00, 2.00] |
| kaidan | |||
| Mean (SD) | 3.35 (1.26) | 3.39 (1.27) | 3.35 (1.26) |
| Median [Min, Max] | 3.00 [1.00, 5.00] | 3.00 [1.00, 5.00] | 3.00 [1.00, 5.00] |
| bs | |||
| Mean (SD) | 96.1 (9.78) | 100 (11.0) | 96.4 (9.90) |
| Median [Min, Max] | 95.0 [72.0, 120] | 98.0 [72.0, 120] | 95.0 [72.0, 120] |
| non_hdl | |||
| Mean (SD) | 152 (36.1) | 159 (36.1) | 152 (36.1) |
| Median [Min, Max] | 150 [53.5, 250] | 155 [78.0, 250] | 150 [53.5, 250] |
| hyper | |||
| Mean (SD) | 0.295 (0.456) | 0.550 (0.498) | 0.311 (0.463) |
| Median [Min, Max] | 0 [0, 1.00] | 1.00 [0, 1.00] | 0 [0, 1.00] |
| diabetes_over_boundary | |||
| Mean (SD) | 0.115 (0.319) | 0.228 (0.420) | 0.122 (0.327) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| MetS | |||
| Mean (SD) | 0.234 (0.424) | 0.413 (0.493) | 0.245 (0.430) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ purrr 0.3.4
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
df2 <- df %>% rename(stroke = target_st, sbp = sbp_mean, dbp = dbp_mean,
eGFR = egfr_ckd_epi, smsts = tob, hypertension = hyper,
DM = diabetes_over_boundary)
df2$sex = as.factor(df2$sex)
df2$smsts = as.factor(df2$smsts)
df2$exer = as.factor(df2$exer)
df2$kaidan = as.factor(df2$kaidan)
df2$hypertension = as.factor(df2$hypertension)
df2$DM = as.factor(df2$DM)
df2$MetS = as.factor(df2$MetS)
df2$stroke = as.factor(df2$stroke)
ggplot(df2, aes(stroke, fill = stroke)) +
geom_bar() +
theme_bw() +
labs(title = "Stroke Classification", x = "Stroke") +
theme(plot.title = element_text(hjust = 0.5))
library(ggthemes) # Or
ggplot(data = df2, aes(x = stroke, fill =stroke)) + geom_bar() +
theme_economist()+ labs(x = "Stroke") + guides(fill = FALSE) # remove the legend
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
# Selecting numerical variables
sel = df2[, c("age", "sbp", "dbp", "eGFR", "non_hdl", "waist", "wt20", "bs", "ua",
"rbc", "wbc", "tg", "cpk", "alb", "bmi", "bf", "smsts", "DM",
"hypertension", "MetS", "sleep_conv", "stroke", "fh1_yn", "fh5_yn", "fh246_yn")]
# Let’s now see the correlation between all the numerical variables in the dataset.
# Correlation Matrix can be ploted as follows.
cor_data <- cor(sel[,setdiff(names(sel), c("smsts", "DM",
"hypertension", "MetS", "sleep_conv", "stroke", "fh1_yn", "fh5_yn", "fh246_yn"))])
# Numerical Correlation Matrix
cor_data
## age sbp dbp eGFR non_hdl
## age 1.00000000 0.45366897 0.18612989 -0.579516831 0.19953932
## sbp 0.45366897 1.00000000 0.71805544 -0.263306152 0.17256589
## dbp 0.18612989 0.71805544 1.00000000 -0.103267334 0.18020565
## eGFR -0.57951683 -0.26330615 -0.10326733 1.000000000 -0.18807449
## non_hdl 0.19953932 0.17256589 0.18020565 -0.188074491 1.00000000
## waist 0.21651029 0.29534598 0.32680903 -0.096791475 0.26398393
## wt20 -0.05240172 0.04441302 0.13741367 -0.028136056 -0.06793941
## bs 0.21440422 0.25841835 0.20235615 -0.148171238 0.15430376
## ua 0.05595851 0.15541011 0.20913662 -0.256085361 0.12815722
## rbc -0.20778253 0.03968653 0.22601528 0.096637557 0.13546604
## wbc -0.02313799 0.05461379 0.08902853 0.025087005 0.10310387
## tg 0.13471841 0.20629392 0.22859959 -0.115454102 0.44128322
## cpk 0.03765889 0.08233121 0.10791172 -0.125058245 0.07561372
## alb -0.15991809 0.04486632 0.12655206 0.040265944 0.18012810
## bmi 0.04024757 0.24439570 0.32318478 -0.055370209 0.27467909
## bf 0.01213702 0.08623060 0.08001366 0.001783862 0.27634262
## waist wt20 bs ua rbc wbc
## age 0.21651029 -0.05240172 0.21440422 0.05595851 -0.20778253 -0.02313799
## sbp 0.29534598 0.04441302 0.25841835 0.15541011 0.03968653 0.05461379
## dbp 0.32680903 0.13741367 0.20235615 0.20913662 0.22601528 0.08902853
## eGFR -0.09679147 -0.02813606 -0.14817124 -0.25608536 0.09663756 0.02508701
## non_hdl 0.26398393 -0.06793941 0.15430376 0.12815722 0.13546604 0.10310387
## waist 1.00000000 0.34386258 0.27248381 0.30856125 0.27768115 0.16982907
## wt20 0.34386258 1.00000000 0.11590218 0.35687646 0.33807759 0.12891520
## bs 0.27248381 0.11590218 1.00000000 0.16394421 0.17797831 0.12684171
## ua 0.30856125 0.35687646 0.16394421 1.00000000 0.33443117 0.18045482
## rbc 0.27768115 0.33807759 0.17797831 0.33443117 1.00000000 0.25557041
## wbc 0.16982907 0.12891520 0.12684171 0.18045482 0.25557041 1.00000000
## tg 0.38905550 0.14433729 0.21925995 0.31238166 0.25782755 0.25416920
## cpk 0.12534234 0.19506513 0.03473554 0.18168435 0.06588585 -0.02645865
## alb 0.05391083 0.01783037 0.08891826 0.13676795 0.29780954 0.08792599
## bmi 0.78994573 0.29863394 0.23133388 0.25821915 0.26084257 0.13673400
## bf 0.31383574 -0.28783607 0.01572509 -0.12479430 -0.09784038 -0.02406466
## tg cpk alb bmi bf
## age 0.13471841 0.03765889 -0.15991809 0.04024757 0.012137023
## sbp 0.20629392 0.08233121 0.04486632 0.24439570 0.086230598
## dbp 0.22859959 0.10791172 0.12655206 0.32318478 0.080013662
## eGFR -0.11545410 -0.12505824 0.04026594 -0.05537021 0.001783862
## non_hdl 0.44128322 0.07561372 0.18012810 0.27467909 0.276342620
## waist 0.38905550 0.12534234 0.05391083 0.78994573 0.313835741
## wt20 0.14433729 0.19506513 0.01783037 0.29863394 -0.287836068
## bs 0.21925995 0.03473554 0.08891826 0.23133388 0.015725091
## ua 0.31238166 0.18168435 0.13676795 0.25821915 -0.124794302
## rbc 0.25782755 0.06588585 0.29780954 0.26084257 -0.097840382
## wbc 0.25416920 -0.02645865 0.08792599 0.13673400 -0.024064664
## tg 1.00000000 0.02314921 0.12003293 0.34314220 0.085053440
## cpk 0.02314921 1.00000000 0.07703153 0.14132311 -0.152825939
## alb 0.12003293 0.07703153 1.00000000 0.09603130 0.100237271
## bmi 0.34314220 0.14132311 0.09603130 1.00000000 0.448686121
## bf 0.08505344 -0.15282594 0.10023727 0.44868612 1.000000000
# Correlation matrix plots
corrplot::corrplot(cor_data)
corrplot::corrplot(cor_data, type = "upper", method = "number")
corrplot::corrplot(cor_data, type = "upper", method = "pie")
#
sel2 = df2[, c("age", "sbp", "dbp", "eGFR", "non_hdl", "waist", "wt20", "bs", "ua",
"rbc", "wbc", "tg", "cpk", "alb", "bmi", "bf", "stroke")]
library(gridExtra) #Multiple plot in single grip space
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
sel2 = as.data.frame(sel2)
univar_graph <- function(univar_name, univar, data, output_var) {
g_1 <- ggplot(data, aes(x=univar)) +
geom_density() +
xlab(univar_name) +
theme_bw()
g_2 <- ggplot(data, aes(x=univar, fill=output_var)) +
geom_density(alpha=0.4) +
xlab(univar_name) +
theme_bw()
gridExtra::grid.arrange(g_1, g_2, ncol=2, top = paste(univar_name,"variable", "/ [ Skew:",timeDate::skewness(univar),"]"))
}
for (x in 1:(ncol(sel2)-1)) {
univar_graph(univar_name = names(sel2)[x], univar = sel2[,x], data = sel2, output_var = sel2[,'stroke'])
}
# Splitting dataset
partition <- caret::createDataPartition(y = df2$stroke, times = 1, p = 0.7, list = FALSE)
# create training data set
train_set <- df2[partition,]
# create testing data set, subtracting the rows partition to get remaining 30% of the data
test_set <- df2[-partition,]
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# K-folds method
# Convert target variable to factor with two levels
train_set$stroke <- factor(train_set$stroke, levels = c("0", "1"))
test_set$stroke <- factor(test_set$stroke, levels = c("0", "1"))
# Convert level names to valid R variable names
levels(train_set$stroke) <- make.names(levels(train_set$stroke))
levels(test_set$stroke) <- make.names(levels(test_set$stroke))
model_forest <- caret::train(stroke ~., data = train_set,
method = "ranger",
metric = "ROC",
trControl = trainControl(method = "cv", number = 10,
classProbs = T, summaryFunction = twoClassSummary),
preProcess = c("center","scale","pca"))
model_forest
## Random Forest
##
## 5173 samples
## 56 predictor
## 2 classes: 'X0', 'X1'
##
## Pre-processing: centered (60), scaled (60), principal component
## signal extraction (60)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4655, 4656, 4656, 4656, 4655, 4656, ...
## Resampling results across tuning parameters:
##
## mtry splitrule ROC Sens Spec
## 2 gini 0.6791999 1.0000000 0
## 2 extratrees 0.7011567 1.0000000 0
## 24 gini 0.6707406 0.9993827 0
## 24 extratrees 0.6968904 1.0000000 0
## 46 gini 0.6667594 0.9993831 0
## 46 extratrees 0.6931750 1.0000000 0
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 2, splitrule = extratrees
## and min.node.size = 1.
# final ROC Value
model_forest$results[6,4]
## [1] 0.693175
xgb_grid_1 <- expand.grid(
nrounds = 50,
eta = c(0.03),
max_depth = 1,
gamma = 0,
colsample_bytree = 0.6,
min_child_weight = 1,
subsample = 0.5
)
model_xgb <- caret::train(stroke ~., data = train_set,
method = "xgbTree",
metric = "ROC",
tuneGrid=xgb_grid_1,
trControl = trainControl(method = "cv", number = 10,
classProbs = T, summaryFunction = twoClassSummary),
preProcess = c("center","scale","pca"))
model_xgb
## eXtreme Gradient Boosting
##
## 5173 samples
## 56 predictor
## 2 classes: 'X0', 'X1'
##
## Pre-processing: centered (60), scaled (60), principal component
## signal extraction (60)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4655, 4655, 4656, 4657, 4655, 4656, ...
## Resampling results:
##
## ROC Sens Spec
## 0.6955346 1 0
##
## Tuning parameter 'nrounds' was held constant at a value of 50
## Tuning
## held constant at a value of 1
## Tuning parameter 'subsample' was held
## constant at a value of 0.5
# final ROC value
model_xgb$results["ROC"]
## ROC
## 1 0.6955346
model_knn <- caret::train(stroke ~., data = train_set,
method = "knn",
metric = "ROC",
tuneGrid = expand.grid(.k = c(3:10)),
trControl = trainControl(method = "cv", number = 10,
classProbs = T, summaryFunction = twoClassSummary),
preProcess = c("center","scale","pca"))
model_knn
## k-Nearest Neighbors
##
## 5173 samples
## 56 predictor
## 2 classes: 'X0', 'X1'
##
## Pre-processing: centered (60), scaled (60), principal component
## signal extraction (60)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4656, 4655, 4656, 4657, 4655, 4655, ...
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 3 0.5353313 0.9907517 0.019677419
## 4 0.5493395 0.9903402 0.016451613
## 5 0.5587449 0.9977387 0.000000000
## 6 0.5670182 0.9975342 0.003333333
## 7 0.5773121 0.9985622 0.000000000
## 8 0.5891223 0.9993836 0.003225806
## 9 0.5911604 0.9995889 0.000000000
## 10 0.5967023 0.9991778 0.000000000
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 10.
#final ROC value
model_knn$results[8,2]
## [1] 0.5967023
model_glm <- caret::train(stroke ~., data = train_set,
method = "glm",
metric = "ROC",
tuneLength = 10,
trControl = trainControl(method = "cv", number = 10,
classProbs = T, summaryFunction = twoClassSummary),
preProcess = c("center","scale","pca"))
model_glm
## Generalized Linear Model
##
## 5173 samples
## 56 predictor
## 2 classes: 'X0', 'X1'
##
## Pre-processing: centered (60), scaled (60), principal component
## signal extraction (60)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4656, 4657, 4656, 4656, 4655, 4656, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7343427 1 0.003225806
#final ROC Value
model_glm$results[2]
## ROC
## 1 0.7343427
library(rpart.plot) #CART Decision Tree
## Loading required package: rpart
model_rpart <- caret::train(stroke ~., data = train_set,
method = "rpart",
metric = "ROC",
tuneLength = 20,
trControl = trainControl(method = "cv", number = 10,
classProbs = T, summaryFunction = twoClassSummary))
model_rpart
## CART
##
## 5173 samples
## 56 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4655, 4655, 4655, 4657, 4657, 4656, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.0000000000 0.6560288 0.9734906 0.04258065
## 0.0001714384 0.6560288 0.9734906 0.04258065
## 0.0003428767 0.6560288 0.9734906 0.04258065
## 0.0005143151 0.6530779 0.9734906 0.04258065
## 0.0006857535 0.6527347 0.9736964 0.04258065
## 0.0008571918 0.6583681 0.9747235 0.04258065
## 0.0010286302 0.6548404 0.9763674 0.04258065
## 0.0012000686 0.6548404 0.9763674 0.04258065
## 0.0013715069 0.6666735 0.9773958 0.04258065
## 0.0015429453 0.6666735 0.9773958 0.04258065
## 0.0017143837 0.6587505 0.9780118 0.03602151
## 0.0018858220 0.6565074 0.9812998 0.03602151
## 0.0020572604 0.6565074 0.9812998 0.03602151
## 0.0022286988 0.6523509 0.9812998 0.03602151
## 0.0024001372 0.6523509 0.9812998 0.03602151
## 0.0025715755 0.6540987 0.9821220 0.03602151
## 0.0027430139 0.6540987 0.9821220 0.03602151
## 0.0029144523 0.6548507 0.9825331 0.03602151
## 0.0030858906 0.6548507 0.9825331 0.03602151
## 0.0032573290 0.6541354 0.9831491 0.03602151
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.001542945.
# Plot model accuracy vs different values of cp (complexity parameter)
plot(model_rpart)
# Best Model Cp (Complexity Parameter)
model_rpart$bestTune
## cp
## 10 0.001542945
# Structure of final model selected
model_rpart$finalModel
## n= 5173
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5173 307 X0 (0.94065339 0.05934661)
## 2) age< 55.5 2553 56 X0 (0.97806502 0.02193498) *
## 3) age>=55.5 2620 251 X0 (0.90419847 0.09580153)
## 6) upr_nega>=0.5 2110 176 X0 (0.91658768 0.08341232)
## 12) age< 62.5 761 38 X0 (0.95006570 0.04993430) *
## 13) age>=62.5 1349 138 X0 (0.89770200 0.10229800)
## 26) pulse_mean< 87.5 1259 118 X0 (0.90627482 0.09372518)
## 52) id>=821.5 1242 111 X0 (0.91062802 0.08937198)
## 104) hg< 14.05 760 51 X0 (0.93289474 0.06710526)
## 208) waist< 100.5 732 44 X0 (0.93989071 0.06010929)
## 416) ca>=8.85 667 33 X0 (0.95052474 0.04947526) *
## 417) ca< 8.85 65 11 X0 (0.83076923 0.16923077)
## 834) sbp>=108 55 5 X0 (0.90909091 0.09090909) *
## 835) sbp< 108 10 4 X1 (0.40000000 0.60000000) *
## 209) waist>=100.5 28 7 X0 (0.75000000 0.25000000)
## 418) hg< 13.45 18 1 X0 (0.94444444 0.05555556) *
## 419) hg>=13.45 10 4 X1 (0.40000000 0.60000000) *
## 105) hg>=14.05 482 60 X0 (0.87551867 0.12448133)
## 210) elbow>=6.1335 399 40 X0 (0.89974937 0.10025063)
## 420) frct< 308.25 379 33 X0 (0.91292876 0.08707124)
## 840) id>=51006.5 210 9 X0 (0.95714286 0.04285714) *
## 841) id< 51006.5 169 24 X0 (0.85798817 0.14201183)
## 1682) ua>=5.75 95 7 X0 (0.92631579 0.07368421)
## 3364) hg< 15.75 73 2 X0 (0.97260274 0.02739726) *
## 3365) hg>=15.75 22 5 X0 (0.77272727 0.22727273)
## 6730) dbp< 83.5 15 1 X0 (0.93333333 0.06666667) *
## 6731) dbp>=83.5 7 3 X1 (0.42857143 0.57142857) *
## 1683) ua< 5.75 74 17 X0 (0.77027027 0.22972973)
## 3366) eGFR>=71.07033 33 3 X0 (0.90909091 0.09090909) *
## 3367) eGFR< 71.07033 41 14 X0 (0.65853659 0.34146341)
## 6734) wbc< 6.915 32 8 X0 (0.75000000 0.25000000) *
## 6735) wbc>=6.915 9 3 X1 (0.33333333 0.66666667) *
## 421) frct>=308.25 20 7 X0 (0.65000000 0.35000000)
## 842) bmi>=23.64297 10 0 X0 (1.00000000 0.00000000) *
## 843) bmi< 23.64297 10 3 X1 (0.30000000 0.70000000) *
## 211) elbow< 6.1335 83 20 X0 (0.75903614 0.24096386)
## 422) bmi< 22.64269 52 7 X0 (0.86538462 0.13461538)
## 844) wbc>=3.965 45 3 X0 (0.93333333 0.06666667) *
## 845) wbc< 3.965 7 3 X1 (0.42857143 0.57142857) *
## 423) bmi>=22.64269 31 13 X0 (0.58064516 0.41935484)
## 846) bf>=23.7 22 6 X0 (0.72727273 0.27272727)
## 1692) bs< 98 11 0 X0 (1.00000000 0.00000000) *
## 1693) bs>=98 11 5 X1 (0.45454545 0.54545455) *
## 847) bf< 23.7 9 2 X1 (0.22222222 0.77777778) *
## 53) id< 821.5 17 7 X0 (0.58823529 0.41176471) *
## 27) pulse_mean>=87.5 90 20 X0 (0.77777778 0.22222222)
## 54) sbp>=150.5 31 1 X0 (0.96774194 0.03225806) *
## 55) sbp< 150.5 59 19 X0 (0.67796610 0.32203390)
## 110) wt20>=51.925 19 1 X0 (0.94736842 0.05263158) *
## 111) wt20< 51.925 40 18 X0 (0.55000000 0.45000000)
## 222) age< 73.5 30 10 X0 (0.66666667 0.33333333)
## 444) ztt>=11.85 9 0 X0 (1.00000000 0.00000000) *
## 445) ztt< 11.85 21 10 X0 (0.52380952 0.47619048)
## 890) ztt< 8.65 13 3 X0 (0.76923077 0.23076923) *
## 891) ztt>=8.65 8 1 X1 (0.12500000 0.87500000) *
## 223) age>=73.5 10 2 X1 (0.20000000 0.80000000) *
## 7) upr_nega< 0.5 510 75 X0 (0.85294118 0.14705882)
## 14) sbp< 156.5 396 43 X0 (0.89141414 0.10858586) *
## 15) sbp>=156.5 114 32 X0 (0.71929825 0.28070175)
## 30) eGFR< 79.96992 97 22 X0 (0.77319588 0.22680412)
## 60) eGFR>=48.67081 78 13 X0 (0.83333333 0.16666667)
## 120) non_hdl< 159.5 32 1 X0 (0.96875000 0.03125000) *
## 121) non_hdl>=159.5 46 12 X0 (0.73913043 0.26086957)
## 242) id< 14772 18 1 X0 (0.94444444 0.05555556) *
## 243) id>=14772 28 11 X0 (0.60714286 0.39285714)
## 486) got< 22.5 18 4 X0 (0.77777778 0.22222222) *
## 487) got>=22.5 10 3 X1 (0.30000000 0.70000000) *
## 61) eGFR< 48.67081 19 9 X0 (0.52631579 0.47368421) *
## 31) eGFR>=79.96992 17 7 X1 (0.41176471 0.58823529) *
# Should have refactored diabetes to make pos as the primary factor level to get straight forward decision tree
rpart.plot::rpart.plot(model_rpart$finalModel, type = 2, fallen.leaves = T, extra = 2, cex = 0.70)
# Training Data set - Model Comparision by ROC value
model_list <- list(Random_Forest = model_forest, XGBoost = model_xgb, KNN = model_knn, Logistic_Regression = model_glm, Rpart_DT = model_rpart)
resamples <- resamples(model_list)
#box plot
bwplot(resamples, metric="ROC")
#dot plot
dotplot(resamples, metric="ROC")
# prediction on Test data set
pred_rf <- predict(model_forest, test_set)
# Confusion Matrix
cm_rf <- confusionMatrix(pred_rf, test_set$stroke, positive="X1")
# Prediction Probabilities
pred_prob_rf <- predict(model_forest, test_set, type="prob")
# ROC value
library(pROC) #ROC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc_rf <- roc(test_set$stroke, pred_prob_rf$X1)
## Setting levels: control = X0, case = X1
## Setting direction: controls < cases
# Confusion Matrix for Random Forest Model
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 2085 131
## X1 0 0
##
## Accuracy : 0.9409
## 95% CI : (0.9302, 0.9503)
## No Information Rate : 0.9409
## P-Value [Acc > NIR] : 0.5232
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.94088
## Prevalence : 0.05912
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : X1
##
# ROC Value for Random Forest
roc_rf
##
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_rf$X1)
##
## Data: pred_prob_rf$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.6946
# AUC - Area under the curve
caTools::colAUC(pred_prob_rf$X1, test_set$stroke, plotROC = T)
## [,1]
## X0 vs. X1 0.6945668
# prediction on Test data set
pred_xgb <- predict(model_xgb, test_set)
# Confusion Matrix
cm_xgb <- confusionMatrix(pred_xgb, test_set$stroke, positive="X1")
# Prediction Probabilities
pred_prob_xgb <- predict(model_xgb, test_set, type="prob")
# ROC value
roc_xgb <- roc(test_set$stroke, pred_prob_xgb$X1)
## Setting levels: control = X0, case = X1
## Setting direction: controls < cases
# Confusion matrix
cm_xgb
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 2085 131
## X1 0 0
##
## Accuracy : 0.9409
## 95% CI : (0.9302, 0.9503)
## No Information Rate : 0.9409
## P-Value [Acc > NIR] : 0.5232
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.94088
## Prevalence : 0.05912
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : X1
##
# ROC Value for for XGBoost
roc_xgb
##
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_xgb$X1)
##
## Data: pred_prob_xgb$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.7061
# AUC - Area under the curve
caTools::colAUC(pred_prob_xgb$X1, test_set$stroke, plotROC = T)
## [,1]
## X0 vs. X1 0.7060812
# prediction on Test data set
pred_knn <- predict(model_knn, test_set)
# Confusion Matrix
cm_knn <- confusionMatrix(pred_knn, test_set$stroke, positive="X1")
# Prediction Probabilities
pred_prob_knn <- predict(model_knn, test_set, type="prob")
# ROC value
roc_knn <- roc(test_set$stroke, pred_prob_knn$X1)
## Setting levels: control = X0, case = X1
## Setting direction: controls < cases
# Confusion matrix
cm_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 2085 131
## X1 0 0
##
## Accuracy : 0.9409
## 95% CI : (0.9302, 0.9503)
## No Information Rate : 0.9409
## P-Value [Acc > NIR] : 0.5232
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.94088
## Prevalence : 0.05912
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : X1
##
# ROC Value for for XGBoost
roc_knn
##
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_knn$X1)
##
## Data: pred_prob_knn$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.6078
# AUC - Area under the curve
caTools::colAUC(pred_prob_knn$X1, test_set$stroke, plotROC = T)
## [,1]
## X0 vs. X1 0.607824
# prediction on Test data set
pred_glm <- predict(model_glm, test_set)
# Confusion Matrix
cm_glm <- confusionMatrix(pred_glm, test_set$stroke, positive="X1")
# Prediction Probabilities
pred_prob_glm <- predict(model_glm, test_set, type="prob")
# ROC value
roc_glm <- roc(test_set$stroke, pred_prob_glm$X1)
## Setting levels: control = X0, case = X1
## Setting direction: controls < cases
# Confusion matrix
cm_glm
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 2085 131
## X1 0 0
##
## Accuracy : 0.9409
## 95% CI : (0.9302, 0.9503)
## No Information Rate : 0.9409
## P-Value [Acc > NIR] : 0.5232
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.94088
## Prevalence : 0.05912
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : X1
##
roc_glm
##
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_glm$X1)
##
## Data: pred_prob_glm$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.7354
# AUC - Area under the curve
caTools::colAUC(pred_prob_glm$X1, test_set$stroke, plotROC = T)
## [,1]
## X0 vs. X1 0.7353873
# prediction on Test data set
pred_rpart <- predict(model_rpart, test_set)
# Confusion Matrix
cm_rpart <- confusionMatrix(pred_rpart, test_set$stroke, positive="X1")
# Prediction Probabilities
pred_prob_rpart <- predict(model_rpart, test_set, type="prob")
# ROC value
roc_rpart <- roc(test_set$stroke, pred_prob_rpart$X1)
## Setting levels: control = X0, case = X1
## Setting direction: controls < cases
# Confusion matrix
cm_rpart
## Confusion Matrix and Statistics
##
## Reference
## Prediction X0 X1
## X0 2042 127
## X1 43 4
##
## Accuracy : 0.9233
## 95% CI : (0.9114, 0.934)
## No Information Rate : 0.9409
## P-Value [Acc > NIR] : 0.9997
##
## Kappa : 0.0142
##
## Mcnemar's Test P-Value : 1.943e-10
##
## Sensitivity : 0.030534
## Specificity : 0.979376
## Pos Pred Value : 0.085106
## Neg Pred Value : 0.941448
## Prevalence : 0.059116
## Detection Rate : 0.001805
## Detection Prevalence : 0.021209
## Balanced Accuracy : 0.504955
##
## 'Positive' Class : X1
##
# ROC Value for for XGBoost
roc_rpart
##
## Call:
## roc.default(response = test_set$stroke, predictor = pred_prob_rpart$X1)
##
## Data: pred_prob_rpart$X1 in 2085 controls (test_set$stroke X0) < 131 cases (test_set$stroke X1).
## Area under the curve: 0.6461
# AUC - Area under the curve
caTools::colAUC(pred_prob_rpart$X1, test_set$stroke, plotROC = T)
## [,1]
## X0 vs. X1 0.6461164
result_rf <- c(cm_rf$byClass['Sensitivity'], cm_rf$byClass['Specificity'], cm_rf$byClass['Precision'],
cm_rf$byClass['Recall'], cm_rf$byClass['F1'], roc_rf$auc)
result_xgb <- c(cm_xgb$byClass['Sensitivity'], cm_xgb$byClass['Specificity'], cm_xgb$byClass['Precision'],
cm_xgb$byClass['Recall'], cm_xgb$byClass['F1'], roc_xgb$auc)
result_knn <- c(cm_knn$byClass['Sensitivity'], cm_knn$byClass['Specificity'], cm_knn$byClass['Precision'],
cm_knn$byClass['Recall'], cm_knn$byClass['F1'], roc_knn$auc)
result_glm <- c(cm_glm$byClass['Sensitivity'], cm_glm$byClass['Specificity'], cm_glm$byClass['Precision'],
cm_glm$byClass['Recall'], cm_glm$byClass['F1'], roc_glm$auc)
result_rpart <- c(cm_rpart$byClass['Sensitivity'], cm_rpart$byClass['Specificity'], cm_rpart$byClass['Precision'],
cm_rpart$byClass['Recall'], cm_rpart$byClass['F1'], roc_rpart$auc)
all_results <- data.frame(rbind(result_rf, result_xgb, result_knn, result_glm, result_rpart))
names(all_results) <- c("Sensitivity", "Specificity", "Precision", "Recall", "F1", "AUC")
all_results
## Sensitivity Specificity Precision Recall F1 AUC
## result_rf 0.00000000 1.0000000 NA 0.00000000 NA 0.6945668
## result_xgb 0.00000000 1.0000000 NA 0.00000000 NA 0.7060812
## result_knn 0.00000000 1.0000000 NA 0.00000000 NA 0.6078240
## result_glm 0.00000000 1.0000000 NA 0.00000000 NA 0.7353873
## result_rpart 0.03053435 0.9793765 0.08510638 0.03053435 0.04494382 0.6461164
col <- c("#ed3b3b", "#0099ff")
graphics::fourfoldplot(cm_rf$table, color = col, conf.level = 0.95, margin = 1,
main = paste("Random Forest Accuracy(",round(cm_rf$overall[1]*100),"%)", sep = ""))
graphics::fourfoldplot(cm_xgb$table, color = col, conf.level = 0.95, margin = 1,
main = paste("XGB Accuracy(",round(cm_xgb$overall[1]*100),"%)", sep = ""))
graphics::fourfoldplot(cm_knn$table, color = col, conf.level = 0.95, margin = 1,
main = paste("KNN Accuracy(",round(cm_knn$overall[1]*100),"%)", sep = ""))
graphics::fourfoldplot(cm_glm$table, color = col, conf.level = 0.95, margin = 1,
main = paste("Logistic Regression Accuracy(",round(cm_glm$overall[1]*100),"%)", sep = ""))
graphics::fourfoldplot(cm_rpart$table, color = col, conf.level = 0.95, margin = 1,
main = paste("Rpart DT Accuracy(",round(cm_rpart$overall[1]*100),"%)", sep = ""))