\(~\)
\(~\)
NHANES
datalibrary('tidyverse')
## -- Attaching packages ------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts --------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(NHANES)
NHANES_DATA_12 <- NHANES %>%
filter(!is.na(Depressed))
Depressed
is has 3 potiential classesNHANES_DATA_12 %>%
select(Depressed) %>%
distinct()
## # A tibble: 3 x 1
## Depressed
## <fct>
## 1 Several
## 2 None
## 3 Most
\(~\)
\(~\)
\(~\)
\(~\)
SumNa <- function(col){sum(is.na(col))}
data.sum <- NHANES_DATA_12 %>%
summarise_all(SumNa) %>%
tidyr::gather(key='feature', value='SumNa') %>%
arrange(-SumNa) %>%
mutate(PctNa = SumNa/nrow(NHANES_DATA_12))
data.sum2 <- data.sum %>%
filter(! (feature %in% c('ID','Depressed'))) %>%
filter(PctNa < .85)
data.sum2$feature
## [1] "UrineFlow2" "UrineVol2" "AgeRegMarij" "PregnantNow"
## [5] "Age1stBaby" "nBabies" "nPregnancies" "SmokeAge"
## [9] "AgeFirstMarij" "SmokeNow" "Testosterone" "AgeMonths"
## [13] "TVHrsDay" "Race3" "CompHrsDay" "PhysActiveDays"
## [17] "SexOrientation" "AlcoholDay" "SexNumPartYear" "Marijuana"
## [21] "RegularMarij" "SexAge" "SexNumPartnLife" "HardDrugs"
## [25] "SexEver" "SameSex" "AlcoholYear" "HHIncome"
## [29] "HHIncomeMid" "Poverty" "UrineFlow1" "BPSys1"
## [33] "BPDia1" "AgeDecade" "DirectChol" "TotChol"
## [37] "BPSys2" "BPDia2" "Education" "BPSys3"
## [41] "BPDia3" "MaritalStatus" "Smoke100" "Smoke100n"
## [45] "Alcohol12PlusYr" "BPSysAve" "BPDiaAve" "Pulse"
## [49] "BMI_WHO" "BMI" "Weight" "HomeRooms"
## [53] "Height" "HomeOwn" "UrineVol1" "SleepHrsNight"
## [57] "LittleInterest" "DaysPhysHlthBad" "DaysMentHlthBad" "Diabetes"
## [61] "Work" "SurveyYr" "Gender" "Age"
## [65] "Race1" "HealthGen" "SleepTrouble" "PhysActive"
data_F <- NHANES_DATA_12 %>%
select(ID, Depressed, data.sum2$feature) %>%
filter(!is.na(Depressed))
data_F
still has missing valuesAmelia::missmap(as.data.frame(data_F))
\(~\)
\(~\)
\(~\)
\(~\)
rfImpute
library('randomForest')
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
data_F.imputed <- rfImpute(Depressed ~ . ,
data_F,
iter=2,
ntree=300)
## ntree OOB 1 2 3
## 300: 8.15% 0.88% 36.47% 31.10%
## ntree OOB 1 2 3
## 300: 8.57% 1.16% 37.17% 32.54%
Amelia::missmap(as.data.frame(data_F.imputed))
\(~\)
\(~\)
\(~\)
\(~\)
library('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(8576309)
trainIndex <- createDataPartition(data_F.imputed$Depressed,
p = .6,
list = FALSE,
times = 1)
TRAIN <- data_F.imputed[trainIndex, ]
TEST <- data_F.imputed[-trainIndex, ]
\(~\)
\(~\)
\(~\)
\(~\)
train_ctrl <- trainControl(method="cv", # type of resampling in this case Cross-Validated
number=3, # number of folds
search = "random", # we are performing a "random
)
toc <- Sys.time()
model_rf <- train(Depressed ~ .,
data = TRAIN,
method = "rf", # this will use the randomForest::randomForest function
metric = "Accuracy", # which metric should be optimized for
trControl = train_ctrl,
# options to be passed to randomForest
ntree = 741,
keep.forest=TRUE,
importance=TRUE)
tic <- Sys.time()
tic - toc
## Time difference of 9.858868 mins
model_rf
## Random Forest
##
## 4005 samples
## 69 predictor
## 3 classes: 'None', 'Several', 'Most'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 2670, 2669, 2671
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 11 0.8784034 0.5961449
## 63 0.8858945 0.6478456
## 75 0.8804002 0.6338604
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 63.
randomForest::varImpPlot(model_rf$finalModel)
\(~\)
\(~\)
\(~\)
\(~\)
probs <- predict(model_rf, TEST, 'prob')
class <- predict(model_rf, TEST, 'raw')
TEST.scored <- cbind(TEST, probs, class)
yardstick
for Model Metricslibrary('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## precision, recall
## The following object is masked from 'package:readr':
##
## spec
cm <- conf_mat(TEST.scored, truth = Depressed, class)
cm
## Truth
## Prediction None Several Most
## None 2032 166 23
## Several 53 213 22
## Most 13 24 122
summary(cm)
## # A tibble: 13 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.887
## 2 kap multiclass 0.653
## 3 sens macro 0.743
## 4 spec macro 0.874
## 5 ppv macro 0.807
## 6 npv macro 0.918
## 7 mcc multiclass 0.660
## 8 j_index macro 0.616
## 9 bal_accuracy macro 0.808
## 10 detection_prevalence macro 0.333
## 11 precision macro 0.807
## 12 recall macro 0.743
## 13 f_meas macro 0.769
library('ggplot2')
ggplot(summary(cm), aes(x=.metric, y=.estimate)) +
geom_bar(stat="identity") +
coord_flip()
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
knitr::opts_chunk$set(echo = TRUE)
library('tidyverse')
library(NHANES)
NHANES_DATA_12 <- NHANES %>%
filter(!is.na(Depressed))
NHANES_DATA_12 %>%
select(Depressed) %>%
distinct()
SumNa <- function(col){sum(is.na(col))}
data.sum <- NHANES_DATA_12 %>%
summarise_all(SumNa) %>%
tidyr::gather(key='feature', value='SumNa') %>%
arrange(-SumNa) %>%
mutate(PctNa = SumNa/nrow(NHANES_DATA_12))
data.sum2 <- data.sum %>%
filter(! (feature %in% c('ID','Depressed'))) %>%
filter(PctNa < .85)
data.sum2$feature
data_F <- NHANES_DATA_12 %>%
select(ID, Depressed, data.sum2$feature) %>%
filter(!is.na(Depressed))
Amelia::missmap(as.data.frame(data_F))
library('randomForest')
data_F.imputed <- rfImpute(Depressed ~ . ,
data_F,
iter=2,
ntree=300)
Amelia::missmap(as.data.frame(data_F.imputed))
library('caret')
set.seed(8576309)
trainIndex <- createDataPartition(data_F.imputed$Depressed,
p = .6,
list = FALSE,
times = 1)
TRAIN <- data_F.imputed[trainIndex, ]
TEST <- data_F.imputed[-trainIndex, ]
train_ctrl <- trainControl(method="cv", # type of resampling in this case Cross-Validated
number=3, # number of folds
search = "random", # we are performing a "random
)
toc <- Sys.time()
model_rf <- train(Depressed ~ .,
data = TRAIN,
method = "rf", # this will use the randomForest::randomForest function
metric = "Accuracy", # which metric should be optimized for
trControl = train_ctrl,
# options to be passed to randomForest
ntree = 741,
keep.forest=TRUE,
importance=TRUE)
tic <- Sys.time()
tic - toc
model_rf
randomForest::varImpPlot(model_rf$finalModel)
probs <- predict(model_rf, TEST, 'prob')
class <- predict(model_rf, TEST, 'raw')
TEST.scored <- cbind(TEST, probs, class)
library('yardstick')
cm <- conf_mat(TEST.scored, truth = Depressed, class)
cm
summary(cm)
library('ggplot2')
ggplot(summary(cm), aes(x=.metric, y=.estimate)) +
geom_bar(stat="identity") +
coord_flip()