1. Import dataset: anackd. This is the baseline Sui study dataset.

dat = read.csv("C:/Users/thien/OneDrive/Desktop/anackd.csv")

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.4.0     v purrr   0.3.4
## v tibble  3.1.8     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
dat1 = dat %>% select(age, sex, sbp, dbp, pulse, tc, hdl, nonhdlc, tg, bs, 
                      rbc, hg, hct, tob, alc, dx_ht, dx_af, dx_ckd, dx_mi, dx_st, 
                      pyst, exer, kaidan, stress, BMI, BMIg, diabetes, 
                      hypertension, metSg, eGFR, eGFRg, uprg, fast, dx_dm)

write.csv(dat1, "data.csv") # row.names = TRUE by default, so read.csv() will add an index column "X" on re-import
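Before moving on, it can help to see how much missingness each selected column carries, since columns with missing values are dropped later in step 4. A minimal sketch, not part of the original run:

sort(colSums(is.na(dat1)), decreasing = TRUE)[1:10] # the ten columns with the most NAs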

2. Use the selected data

data <- read.csv("data.csv")
table(data$dx_st)
## 
##    0    1 
## 6910  432
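The outcome dx_st is heavily imbalanced (432 events versus 6,910 non-events, roughly 6%), which is what motivates the down-sampling in the next step. A quick way to view this as proportions (sketch):

prop.table(table(data$dx_st)) # share of each outcome class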

#3. Down-sampling

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(tidyverse)
data <- data %>% select(-pyst)
data$dx_st <- as.factor(data$dx_st)


dat2 = downSample(x = data %>% select(-dx_st), y = data$dx_st) # balance classes by sampling the majority class down
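downSample() removes rows from the majority class at random until both classes have the same size, and returns the outcome in a new column named Class. A quick sanity check (sketch):

table(dat2$Class) # both classes should now have 432 rows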

#4. Splitting the data set

dat3 = dat2[, apply(is.na(dat2), 2, sum) == 0] # keep only columns with no missing values; TODO: consider imputation instead (see sketch below)
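As the TODO notes, imputation would be an alternative to dropping incomplete columns. One option, sketched here but not used in this analysis, is median imputation of the numeric predictors with caret's preProcess():

pp <- preProcess(dat2 %>% select(-Class), method = "medianImpute") # learn column medians
dat2_imp <- predict(pp, dat2 %>% select(-Class))                   # fill NAs with those medians
dat2_imp$Class <- dat2$Class                                       # reattach the outcome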

sample = createDataPartition(dat3$Class, p = 0.7, list = FALSE)

trainset = dat3[sample,]
testset = dat3[-sample,]
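createDataPartition() samples at random (stratified on Class), so the 70/30 split above is not reproducible unless a seed is set first. A reproducible version of the same split (sketch; the seed value is arbitrary):

set.seed(2023) # arbitrary seed, for reproducibility only
sample = createDataPartition(dat3$Class, p = 0.7, list = FALSE)
trainset = dat3[sample,]
testset = dat3[-sample,]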

#5. Random Forest

model = train(Class ~ ., data = trainset, method = "rf") # "rf" is caret's default method, stated explicitly here
predict(model, trainset) # predicted classes on the training data
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
## [408] 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [445] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [482] 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
## [519] 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [556] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
## [593] 1 1 1 1 1 1 0 1 1 1 1 1 1 1
## Levels: 0 1
# Confusion matrix on the training set
confusionMatrix(predict(model, trainset), dat3$Class[sample], positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 303  12
##          1   0 291
##                                           
##                Accuracy : 0.9802          
##                  95% CI : (0.9657, 0.9897)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9604          
##                                           
##  Mcnemar's Test P-Value : 0.001496        
##                                           
##             Sensitivity : 0.9604          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9619          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4802          
##    Detection Prevalence : 0.4802          
##       Balanced Accuracy : 0.9802          
##                                           
##        'Positive' Class : 1               
## 
# Confusion matrix on the test set
confusionMatrix(predict(model, testset), dat3$Class[-sample], positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 87 42
##          1 42 87
##                                           
##                Accuracy : 0.6744          
##                  95% CI : (0.6135, 0.7312)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 1.094e-08       
##                                           
##                   Kappa : 0.3488          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.6744          
##             Specificity : 0.6744          
##          Pos Pred Value : 0.6744          
##          Neg Pred Value : 0.6744          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3372          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.6744          
##                                           
##        'Positive' Class : 1               
## 
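The gap between training accuracy (0.98) and test accuracy (0.67) suggests the default fit is overfitting. One standard remedy, sketched here but not part of the original run, is to let caret tune mtry with cross-validation via trainControl():

ctrl = trainControl(method = "cv", number = 5) # 5-fold cross-validation
model_cv = train(Class ~ ., data = trainset, method = "rf",
                 trControl = ctrl, tuneLength = 3) # try a few mtry values
model_cv$results # resampled accuracy per mtry value
confusionMatrix(predict(model_cv, testset), testset$Class, positive = "1")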

#6. Variable importance (varImp)

varImp(model)
## rf variable importance
## 
##   only 20 most important variables shown (out of 24)
## 
##                Overall
## age            100.000
## eGFR            86.357
## sbp             71.722
## X               64.895
## tc              62.664
## tg              60.854
## rbc             57.695
## bs              54.884
## hct             53.091
## dbp             53.086
## hg              51.087
## hypertension    21.603
## dx_ht           19.351
## uprg            18.421
## metSg           12.843
## diabetes        12.343
## eGFRg           12.104
## BMIgOverweight  10.644
## sex             10.563
## dx_dm            9.527
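Note that X, the fourth most important variable above, is the row-index column added by the write.csv()/read.csv() round trip in steps 1 and 2, not a study variable, so its apparent importance most likely reflects how rows are ordered in the source file. A sketch of dropping it before any refit, and of plotting the ranked importances:

dat3 = dat3 %>% select(-X)    # drop the row-index artifact
plot(varImp(model), top = 15) # dotplot of the 15 highest-ranked predictors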

#7. Correlation coefficients

library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(data.matrix(dat3))) # data.matrix() coerces factor and character columns to integer codes so cor() can run on every column

plot(trainset$Class, trainset$eGFR) # factor on the x-axis, so this draws boxplots of eGFR by class
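A ggplot2 equivalent of this class-wise eGFR comparison (a sketch, not part of the original run):

ggplot(trainset, aes(x = Class, y = eGFR)) +
  geom_boxplot() +
  labs(x = "Class (dx_st)", y = "eGFR")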