\(~\)

\(~\)

1 NHANES data

# 'svmLinear3' is a caret method name rather than a package; the backend it needs is LiblineaR
#install.packages('LiblineaR')

library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(NHANES)

NHANES_DATA_12 <- NHANES %>%
  filter(!is.na(Depressed))

1.1 Note that Depressed has 3 potential classes

NHANES_DATA_12 %>% 
  select(Depressed) %>%
  distinct()
## # A tibble: 3 x 1
##   Depressed
##   <fct>    
## 1 Several  
## 2 None     
## 3 Most
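
Since class balance will matter later when interpreting accuracy, it is worth tallying how often each level occurs. A quick check (not part of the original output) could look like this:

NHANES_DATA_12 %>% 
  count(Depressed)   # frequency of each Depressed level; 'None' is by far the most common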

\(~\)

\(~\)


\(~\)

\(~\)

2 Some data prep

SumNa <- function(col){sum(is.na(col))}

data.sum <- NHANES_DATA_12 %>% 
  summarise_all(SumNa) %>%
  tidyr::gather(key='feature', value='SumNa') %>%
  arrange(-SumNa) %>%
  mutate(PctNa = SumNa/nrow(NHANES_DATA_12))

data.sum2 <- data.sum %>% 
  filter(! (feature %in% c('ID','Depressed'))) %>%
  filter(PctNa < .85)

data.sum2$feature
##  [1] "UrineFlow2"      "UrineVol2"       "AgeRegMarij"     "PregnantNow"    
##  [5] "Age1stBaby"      "nBabies"         "nPregnancies"    "SmokeAge"       
##  [9] "AgeFirstMarij"   "SmokeNow"        "Testosterone"    "AgeMonths"      
## [13] "TVHrsDay"        "Race3"           "CompHrsDay"      "PhysActiveDays" 
## [17] "SexOrientation"  "AlcoholDay"      "SexNumPartYear"  "Marijuana"      
## [21] "RegularMarij"    "SexAge"          "SexNumPartnLife" "HardDrugs"      
## [25] "SexEver"         "SameSex"         "AlcoholYear"     "HHIncome"       
## [29] "HHIncomeMid"     "Poverty"         "UrineFlow1"      "BPSys1"         
## [33] "BPDia1"          "AgeDecade"       "DirectChol"      "TotChol"        
## [37] "BPSys2"          "BPDia2"          "Education"       "BPSys3"         
## [41] "BPDia3"          "MaritalStatus"   "Smoke100"        "Smoke100n"      
## [45] "Alcohol12PlusYr" "BPSysAve"        "BPDiaAve"        "Pulse"          
## [49] "BMI_WHO"         "BMI"             "Weight"          "HomeRooms"      
## [53] "Height"          "HomeOwn"         "UrineVol1"       "SleepHrsNight"  
## [57] "LittleInterest"  "DaysPhysHlthBad" "DaysMentHlthBad" "Diabetes"       
## [61] "Work"            "SurveyYr"        "Gender"          "Age"            
## [65] "Race1"           "HealthGen"       "SleepTrouble"    "PhysActive"
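
The retained features still carry a fair amount of missingness; a quick bar chart of PctNa shows how much each one contributes (a sketch, not part of the original analysis):

data.sum2 %>%
  ggplot(aes(x = reorder(feature, PctNa), y = PctNa)) +
  geom_col() +
  coord_flip() +
  labs(x = 'feature', y = 'fraction of rows missing')
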
data_F <- NHANES_DATA_12 %>% 
  select(ID, Depressed, all_of(data.sum2$feature)) %>%
  filter(!is.na(Depressed))

2.1 Note that data_F still has missing values

Amelia::missmap(as.data.frame(data_F))

\(~\)

\(~\)


\(~\)

\(~\)

3 Random Forest Impute with rfImpute

library('randomForest')
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
data_F.imputed <- rfImpute(Depressed ~ . , 
                           data_F, 
                           iter=2, 
                           ntree=300)
## ntree      OOB      1      2      3
##   300:   8.11%  0.74% 36.47% 32.06%
## ntree      OOB      1      2      3
##   300:   8.54%  1.20% 36.27% 33.73%
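
rfImpute first fills the NAs with a rough guess (column medians for numeric variables, the most frequent level for factors), then grows a random forest and uses the proximity matrix to refine the imputed values; the two OOB lines above report the out-of-bag classification error after each of the two iterations. One quick sanity check is to compare a variable's distribution before and after imputation (a sketch; SleepHrsNight is just an example column):

summary(data_F$SleepHrsNight)           # original column, NAs included
summary(data_F.imputed$SleepHrsNight)   # same column after random forest imputation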

3.0.1 Note that we no longer have missing data

Amelia::missmap(as.data.frame(data_F.imputed))

\(~\)

\(~\)


\(~\)

\(~\)

4 Split Data

library('caret')
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(8576309)

trainIndex <- createDataPartition(data_F.imputed$Depressed, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)

TRAIN <- data_F.imputed[trainIndex, ]
TEST <- data_F.imputed[-trainIndex, ]
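
createDataPartition samples within each level of Depressed, so the class proportions in TRAIN and TEST should be nearly identical to the full data. A quick verification (not shown in the original output):

round(prop.table(table(TRAIN$Depressed)), 3)   # class shares in the training split
round(prop.table(table(TEST$Depressed)), 3)    # class shares in the held-out split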

\(~\)

\(~\)


\(~\)

\(~\)

5 Train model

For the full list of models compatible with caret and their tunable hyperparameters, see: https://topepo.github.io/caret/available-models.html
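
caret can also report the tuning parameters for a given method directly; for example (caret is already loaded above):

modelLookup("svmLinear3")   # lists the tunable parameters for this method: cost and Loss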

SVMGrid <- expand.grid(cost = c(1,2,5), Loss = c(1,5,6))

train_ctrl <- trainControl(method="cv", # resampling method: k-fold cross-validation
                           number=3)    # number of folds

tic <- Sys.time()  # start timing the training run
model_svm <- train(Depressed ~ .,
                   data = TRAIN,
                   method = "svmLinear3",  # caret's svmLinear3 method, backed by the LiblineaR package
                   metric = "Accuracy",    # metric used to pick the best tuning parameters
                   trControl = train_ctrl,
                   tuneGrid = SVMGrid)
toc <- Sys.time()  # stop timing

toc - tic
## Time difference of 5.958181 mins
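
Most of those six minutes are the 3-fold cross-validation over nine parameter combinations. caret can run the resampling in parallel through any registered foreach backend; a minimal sketch using the doParallel package (not used in the original run) would be:

#install.packages('doParallel')
library(doParallel)

cl <- makePSOCKcluster(4)   # 4 worker processes; adjust to your machine
registerDoParallel(cl)      # train() will now spread the CV folds across the workers

# ... call train() here exactly as above ...

stopCluster(cl)             # shut the workers down when finished
registerDoSEQ()             # return to sequential processing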

5.1 Model output

model_svm
## L2 Regularized Support Vector Machine (dual) with Linear Kernel 
## 
## 4005 samples
##   69 predictor
##    3 classes: 'None', 'Several', 'Most' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 2670, 2669, 2671 
## Resampling results across tuning parameters:
## 
##   cost  Loss  Accuracy   Kappa      
##   1     1     0.5346848  0.029585510
##   1     5     0.5709959  0.007226782
##   1     6     0.7437887  0.028501308
##   2     1     0.7860176  0.000000000
##   2     5     0.7410419  0.033046429
##   2     6     0.7860176  0.000000000
##   5     1     0.7860176  0.000000000
##   5     5     0.4763128  0.059009727
##   5     6     0.7860176  0.000000000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were cost = 2 and Loss = 1.
varImp(model_svm)
## ROC curve variable importance
## 
##   variables are sorted by maximum importance across the classes
##   only 20 most important variables shown (out of 69)
## 
##                   None Several   Most
## DaysMentHlthBad 100.00   70.34 100.00
## LittleInterest   90.65   57.67  90.65
## Age1stBaby       76.93   49.47  76.93
## SmokeNow         57.47   36.67  57.47
## HealthGen        51.08   30.53  51.08
## HHIncome         44.77   26.09  44.77
## Poverty          43.57   25.38  43.57
## nPregnancies     41.20   27.79  41.20
## HHIncomeMid      39.49   23.65  39.49
## Education        38.66   25.36  38.66
## DaysPhysHlthBad  34.68   19.16  34.68
## SleepTrouble     33.63   19.78  33.63
## Work             33.07   23.61  33.07
## TVHrsDay         32.73   20.04  32.73
## SleepHrsNight    32.09   17.46  32.09
## Testosterone     31.44   22.91  31.44
## HomeRooms        31.43   17.13  31.43
## PhysActive       29.67   15.33  29.67
## PregnantNow      28.31   22.63  28.31
## AgeMonths        28.25   28.25  21.14
plot(varImp(model_svm), top = 20)
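
A Kappa of 0 alongside 0.786 accuracy is what you would see if the model predicted 'None' (the majority class) for essentially every observation, so the resampled accuracy of the selected model (cost = 2, Loss = 1) mostly reflects class imbalance. Since TEST was held out earlier but never scored, a natural follow-up is to evaluate the final model on it (a sketch; the exact numbers will depend on your run):

test_pred <- predict(model_svm, newdata = TEST)   # class predictions on the held-out split

confusionMatrix(test_pred, TEST$Depressed)        # accuracy, Kappa, and per-class statistics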

6 Code Appendix

\(~\)

knitr::opts_chunk$set(echo = TRUE)

# 'svmLinear3' is a caret method name rather than a package; the backend it needs is LiblineaR
#install.packages('LiblineaR')

library('tidyverse')
library(NHANES)

NHANES_DATA_12 <- NHANES %>%
  filter(!is.na(Depressed))

NHANES_DATA_12 %>% 
  select(Depressed) %>%
  distinct()
SumNa <- function(col){sum(is.na(col))}

data.sum <- NHANES_DATA_12 %>% 
  summarise_all(SumNa) %>%
  tidyr::gather(key='feature', value='SumNa') %>%
  arrange(-SumNa) %>%
  mutate(PctNa = SumNa/nrow(NHANES_DATA_12))

data.sum2 <- data.sum %>% 
  filter(! (feature %in% c('ID','Depressed'))) %>%
  filter(PctNa < .85)

data.sum2$feature

data_F <- NHANES_DATA_12 %>% 
  select(ID, Depressed, all_of(data.sum2$feature)) %>%
  filter(!is.na(Depressed))
Amelia::missmap(as.data.frame(data_F))
library('randomForest')
data_F.imputed <- rfImpute(Depressed ~ . , 
                           data_F, 
                           iter=2, 
                           ntree=300)
Amelia::missmap(as.data.frame(data_F.imputed))
library('caret')

set.seed(8576309)

trainIndex <- createDataPartition(data_F.imputed$Depressed, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)

TRAIN <- data_F.imputed[trainIndex, ]
TEST <- data_F.imputed[-trainIndex, ]

SVMGrid <- expand.grid(cost = c(1,2,5), Loss = c(1,5,6))

train_ctrl <- trainControl(method="cv", # resampling method: k-fold cross-validation
                           number=3)    # number of folds

tic <- Sys.time()  # start timing the training run
model_svm <- train(Depressed ~ .,
                   data = TRAIN,
                   method = "svmLinear3",  # caret's svmLinear3 method, backed by the LiblineaR package
                   metric = "Accuracy",    # metric used to pick the best tuning parameters
                   trControl = train_ctrl,
                   tuneGrid = SVMGrid)
toc <- Sys.time()  # stop timing

toc - tic
model_svm


varImp(model_svm)

plot(varImp(model_svm), top = 20)