\(~\)

\(~\)

1 NHANES data

library('tidyverse')
## -- Attaching packages ------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts --------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(NHANES)

NHANES_DATA_12 <- NHANES %>%
  filter(!is.na(Depressed))

1.1 Note that Depressed is has 3 potiential classes

NHANES_DATA_12 %>% 
  select(Depressed) %>%
  distinct()
## # A tibble: 3 x 1
##   Depressed
##   <fct>    
## 1 Several  
## 2 None     
## 3 Most

\(~\)

\(~\)


\(~\)

\(~\)

2 Some data prep

SumNa <- function(col){sum(is.na(col))}

data.sum <- NHANES_DATA_12 %>% 
  summarise_all(SumNa) %>%
  tidyr::gather(key='feature', value='SumNa') %>%
  arrange(-SumNa) %>%
  mutate(PctNa = SumNa/nrow(NHANES_DATA_12))

data.sum2 <- data.sum %>% 
  filter(! (feature %in% c('ID','Depressed'))) %>%
  filter(PctNa < .85)

data.sum2$feature
##  [1] "UrineFlow2"      "UrineVol2"       "AgeRegMarij"     "PregnantNow"    
##  [5] "Age1stBaby"      "nBabies"         "nPregnancies"    "SmokeAge"       
##  [9] "AgeFirstMarij"   "SmokeNow"        "Testosterone"    "AgeMonths"      
## [13] "TVHrsDay"        "Race3"           "CompHrsDay"      "PhysActiveDays" 
## [17] "SexOrientation"  "AlcoholDay"      "SexNumPartYear"  "Marijuana"      
## [21] "RegularMarij"    "SexAge"          "SexNumPartnLife" "HardDrugs"      
## [25] "SexEver"         "SameSex"         "AlcoholYear"     "HHIncome"       
## [29] "HHIncomeMid"     "Poverty"         "UrineFlow1"      "BPSys1"         
## [33] "BPDia1"          "AgeDecade"       "DirectChol"      "TotChol"        
## [37] "BPSys2"          "BPDia2"          "Education"       "BPSys3"         
## [41] "BPDia3"          "MaritalStatus"   "Smoke100"        "Smoke100n"      
## [45] "Alcohol12PlusYr" "BPSysAve"        "BPDiaAve"        "Pulse"          
## [49] "BMI_WHO"         "BMI"             "Weight"          "HomeRooms"      
## [53] "Height"          "HomeOwn"         "UrineVol1"       "SleepHrsNight"  
## [57] "LittleInterest"  "DaysPhysHlthBad" "DaysMentHlthBad" "Diabetes"       
## [61] "Work"            "SurveyYr"        "Gender"          "Age"            
## [65] "Race1"           "HealthGen"       "SleepTrouble"    "PhysActive"
data_F <- NHANES_DATA_12 %>% 
  select(ID, Depressed, data.sum2$feature) %>%
  filter(!is.na(Depressed))

2.1 note that data_F still has missing values

Amelia::missmap(as.data.frame(data_F))

\(~\)

\(~\)


\(~\)

\(~\)

3 Random Forest Impute with rfImpute

library('randomForest')
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
data_F.imputed <- rfImpute(Depressed ~ . , 
                           data_F, 
                           iter=2, 
                           ntree=300)
## ntree      OOB      1      2      3
##   300:   8.15%  0.88% 36.47% 31.10%
## ntree      OOB      1      2      3
##   300:   8.57%  1.16% 37.17% 32.54%

3.0.1 Note we no longer have missing data

Amelia::missmap(as.data.frame(data_F.imputed))

\(~\)

\(~\)


\(~\)

\(~\)

4 Split Data

library('caret')
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(8576309)

trainIndex <- createDataPartition(data_F.imputed$Depressed, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)

TRAIN <- data_F.imputed[trainIndex, ]
TEST <- data_F.imputed[-trainIndex, ]

\(~\)

\(~\)


\(~\)

\(~\)

5 Train model

train_ctrl <- trainControl(method="cv", # type of resampling in this case Cross-Validated
                           number=3, # number of folds
                           search = "random", # we are performing a "random
                           )

toc <- Sys.time()
model_rf <- train(Depressed ~ .,
                       data = TRAIN,
                       method = "rf", # this will use the randomForest::randomForest function
                       metric = "Accuracy", # which metric should be optimized for 
                       trControl = train_ctrl,
                       # options to be passed to randomForest
                       ntree = 741,
                       keep.forest=TRUE,
                       importance=TRUE) 
tic <- Sys.time()

tic - toc
## Time difference of 9.858868 mins

5.1 Model output

model_rf
## Random Forest 
## 
## 4005 samples
##   69 predictor
##    3 classes: 'None', 'Several', 'Most' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 2670, 2669, 2671 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   11    0.8784034  0.5961449
##   63    0.8858945  0.6478456
##   75    0.8804002  0.6338604
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 63.
randomForest::varImpPlot(model_rf$finalModel)

\(~\)

\(~\)


\(~\)

\(~\)

6 Score Test Data

probs <- predict(model_rf, TEST, 'prob')
class <- predict(model_rf, TEST, 'raw')


TEST.scored <- cbind(TEST, probs, class)

6.1 Use yardstick for Model Metrics

library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.
## 
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
## The following object is masked from 'package:readr':
## 
##     spec
cm <- conf_mat(TEST.scored, truth = Depressed, class)

cm
##           Truth
## Prediction None Several Most
##    None    2032     166   23
##    Several   53     213   22
##    Most      13      24  122
summary(cm)
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             multiclass     0.887
##  2 kap                  multiclass     0.653
##  3 sens                 macro          0.743
##  4 spec                 macro          0.874
##  5 ppv                  macro          0.807
##  6 npv                  macro          0.918
##  7 mcc                  multiclass     0.660
##  8 j_index              macro          0.616
##  9 bal_accuracy         macro          0.808
## 10 detection_prevalence macro          0.333
## 11 precision            macro          0.807
## 12 recall               macro          0.743
## 13 f_meas               macro          0.769
library('ggplot2')

ggplot(summary(cm), aes(x=.metric, y=.estimate)) + 
  geom_bar(stat="identity") + 
  coord_flip()

\(~\)

\(~\)


\(~\)

\(~\)

7 Code Appendix

\(~\)

knitr::opts_chunk$set(echo = TRUE)
library('tidyverse')
library(NHANES)

NHANES_DATA_12 <- NHANES %>%
  filter(!is.na(Depressed))

NHANES_DATA_12 %>% 
  select(Depressed) %>%
  distinct()
SumNa <- function(col){sum(is.na(col))}

data.sum <- NHANES_DATA_12 %>% 
  summarise_all(SumNa) %>%
  tidyr::gather(key='feature', value='SumNa') %>%
  arrange(-SumNa) %>%
  mutate(PctNa = SumNa/nrow(NHANES_DATA_12))

data.sum2 <- data.sum %>% 
  filter(! (feature %in% c('ID','Depressed'))) %>%
  filter(PctNa < .85)

data.sum2$feature

data_F <- NHANES_DATA_12 %>% 
  select(ID, Depressed, data.sum2$feature) %>%
  filter(!is.na(Depressed))
Amelia::missmap(as.data.frame(data_F))
library('randomForest')
data_F.imputed <- rfImpute(Depressed ~ . , 
                           data_F, 
                           iter=2, 
                           ntree=300)
Amelia::missmap(as.data.frame(data_F.imputed))
library('caret')

set.seed(8576309)

trainIndex <- createDataPartition(data_F.imputed$Depressed, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)

TRAIN <- data_F.imputed[trainIndex, ]
TEST <- data_F.imputed[-trainIndex, ]
train_ctrl <- trainControl(method="cv", # type of resampling in this case Cross-Validated
                           number=3, # number of folds
                           search = "random", # we are performing a "random
                           )

toc <- Sys.time()
model_rf <- train(Depressed ~ .,
                       data = TRAIN,
                       method = "rf", # this will use the randomForest::randomForest function
                       metric = "Accuracy", # which metric should be optimized for 
                       trControl = train_ctrl,
                       # options to be passed to randomForest
                       ntree = 741,
                       keep.forest=TRUE,
                       importance=TRUE) 
tic <- Sys.time()

tic - toc
model_rf

randomForest::varImpPlot(model_rf$finalModel)

probs <- predict(model_rf, TEST, 'prob')
class <- predict(model_rf, TEST, 'raw')


TEST.scored <- cbind(TEST, probs, class)
library('yardstick')

cm <- conf_mat(TEST.scored, truth = Depressed, class)

cm

summary(cm)

library('ggplot2')

ggplot(summary(cm), aes(x=.metric, y=.estimate)) + 
  geom_bar(stat="identity") + 
  coord_flip()