# Load the raw data set into `dat`.
# NOTE(review): the path is absolute and specific to one Windows user profile;
# the script will only run unchanged on that machine.
dat <- read.csv(file = "C:/Users/thien/OneDrive/Desktop/anackd.csv")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.4.0 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Keep only the variables used in the analysis.
# `nonhdlc` was listed twice in the original column list; select() returns a
# column once no matter how often it is named, so the duplicate is removed
# without changing the result.
dat1 <- dat %>%
  select(
    age, sex, sbp, dbp, pulse, tc, hdl, nonhdlc, tg, bs,
    rbc, hg, hct, tob, alc, dx_ht, dx_af, dx_ckd, dx_mi, dx_st,
    pyst, exer, kaidan, stress, BMI, BMIg, diabetes,
    hypertension, metSg, eGFR, eGFRg, uprg, fast, dx_dm
  )

# Persist the reduced table and read it back.
# row.names = FALSE is required here: by default write.csv() also writes the
# row names as an unnamed first column, which read.csv() then returns as a
# column called `X`. That row index would be carried into the modelling data
# and used as a predictor (it shows up as `X` in the variable-importance
# output further down).
write.csv(dat1, "data.csv", row.names = FALSE)
data <- read.csv("data.csv")

# Class counts of dx_st, the variable used below as the class label.
table(data$dx_st)
##
## 0 1
## 6910 432
#3. Down Sampling
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(tidyverse)
# Remove pyst from the candidate predictors.
data <- data %>% select(-pyst)

# downSample() needs the outcome as a factor.
data$dx_st <- as.factor(data$dx_st)

# downSample() draws rows at random, so fix the RNG state first. Without a
# seed the balanced subset (and every result computed from it when the script
# is run top to bottom) differs from run to run.
set.seed(123)

# Balance the two dx_st classes by randomly sub-sampling the larger class.
# The outcome is returned as a column named `Class` (the default `yname`),
# alongside the predictor columns passed as `x`.
dat2 <- downSample(x = data %>% select(-dx_st), y = data$dx_st)
#4. Splitting data set
# Keep only the columns that contain no missing values.
# colSums() counts the NAs per column directly (no apply() needed), and
# drop = FALSE guarantees the result stays a data frame even if a single
# column survives the filter.
# TODO: consider imputing missing values instead of discarding whole columns.
dat3 <- dat2[, colSums(is.na(dat2)) == 0, drop = FALSE]

# Row indices for a 70% training partition, sampled within the levels of Class.
# list = FALSE returns the indices as a matrix rather than a list.
# The name `sample` is kept because later statements index with it.
sample <- createDataPartition(dat3$Class, p = 0.7, list = FALSE)

# Training rows are the selected indices; the test set is everything else.
trainset <- dat3[sample, ]
testset <- dat3[-sample, ]
#5. Random Forest
# Fit a classifier for Class using every other column of the training set as
# a predictor. No `method` is given, so caret's default is used (the
# variable-importance output below reports it as "rf").
model <- train(Class ~ ., data = trainset)

# Show the fitted model's predicted labels for the training rows.
predict(model, newdata = trainset)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
## [408] 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [445] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [482] 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
## [519] 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [556] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
## [593] 1 1 1 1 1 1 0 1 1 1 1 1 1 1
## Levels: 0 1
# For trainset
# In-sample performance: predictions for the training rows are compared with
# their observed labels, with "1" treated as the positive class.
# trainset was built as dat3[sample, ], so trainset$Class is the same vector
# as dat3$Class[sample].
confusionMatrix(
  data = predict(model, newdata = trainset),
  reference = trainset$Class,
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 303 12
## 1 0 291
##
## Accuracy : 0.9802
## 95% CI : (0.9657, 0.9897)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9604
##
## Mcnemar's Test P-Value : 0.001496
##
## Sensitivity : 0.9604
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9619
## Prevalence : 0.5000
## Detection Rate : 0.4802
## Detection Prevalence : 0.4802
## Balanced Accuracy : 0.9802
##
## 'Positive' Class : 1
##
# For testset
# Hold-out performance: predictions for the test rows are compared with their
# observed labels, with "1" treated as the positive class.
# testset was built as dat3[-sample, ], so testset$Class is the same vector
# as dat3$Class[-sample].
confusionMatrix(
  data = predict(model, newdata = testset),
  reference = testset$Class,
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 87 42
## 1 42 87
##
## Accuracy : 0.6744
## 95% CI : (0.6135, 0.7312)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1.094e-08
##
## Kappa : 0.3488
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.6744
## Specificity : 0.6744
## Pos Pred Value : 0.6744
## Neg Pred Value : 0.6744
## Prevalence : 0.5000
## Detection Rate : 0.3372
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.6744
##
## 'Positive' Class : 1
##
#5. varImp
# Print the variable-importance scores of the fitted model.
varImp(object = model)
## rf variable importance
##
## only 20 most important variables shown (out of 24)
##
## Overall
## age 100.000
## eGFR 86.357
## sbp 71.722
## X 64.895
## tc 62.664
## tg 60.854
## rbc 57.695
## bs 54.884
## hct 53.091
## dbp 53.086
## hg 51.087
## hypertension 21.603
## dx_ht 19.351
## uprg 18.421
## metSg 12.843
## diabetes 12.343
## eGFRg 12.104
## BMIgOverweight 10.644
## sex 10.563
## dx_dm 9.527
#6. Correlation coefficients
library(corrplot)
## corrplot 0.92 loaded
# Correlation matrix of all columns in dat3. data.matrix() converts the data
# frame to a numeric matrix first (non-numeric columns become integer codes),
# so cor() can be applied to every column, including Class.
corr_mat <- cor(data.matrix(dat3))
corrplot(corr_mat)

# eGFR by outcome class in the training set; plotting a numeric vector
# against a factor draws one box per factor level.
plot(x = trainset$Class, y = trainset$eGFR)