\(~\)

1 Read in the Data

library('tidyverse')

## -- Attaching packages ----------------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts -------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(NHANES)

NHANES_DATA_12 <- NHANES %>% 
  select(-DiabetesAge) %>%
  filter(SurveyYr =='2011_12') 

SumNa <- function(col){sum(is.na(col))}

data.sum <- NHANES_DATA_12 %>% 
  summarise_all(SumNa) %>%
  tidyr::gather(key='feature', value='SumNa') %>%
  arrange(-SumNa) %>%
  mutate(PctNa = SumNa/nrow(NHANES_DATA_12))

data.sum2 <- data.sum %>% 
  filter(! (feature %in% c('ID','Diabetes'))) %>%
  filter(PctNa < .55)

data.sum2$feature

##  [1] "PhysActiveDays"  "SexOrientation"  "SexNumPartYear"  "Marijuana"      
##  [5] "RegularMarij"    "AlcoholDay"      "SexAge"          "SexNumPartnLife"
##  [9] "HardDrugs"       "SexEver"         "SameSex"         "AlcoholYear"    
## [13] "LittleInterest"  "Depressed"       "Alcohol12PlusYr" "Education"      
## [17] "MaritalStatus"   "Smoke100"        "Smoke100n"       "DaysPhysHlthBad"
## [21] "DaysMentHlthBad" "HealthGen"       "SleepHrsNight"   "Work"           
## [25] "SleepTrouble"    "BPSys1"          "BPDia1"          "Testosterone"   
## [29] "PhysActive"      "BPSys2"          "BPDia2"          "UrineFlow1"     
## [33] "BPSys3"          "BPDia3"          "DirectChol"      "TotChol"        
## [37] "BPSysAve"        "BPDiaAve"        "Pulse"           "UrineVol1"      
## [41] "HHIncome"        "HHIncomeMid"     "Poverty"         "BMI_WHO"        
## [45] "AgeDecade"       "BMI"             "Height"          "TVHrsDay"       
## [49] "CompHrsDay"      "Weight"          "HomeRooms"       "HomeOwn"        
## [53] "SurveyYr"        "Gender"          "Age"             "Race1"          
## [57] "Race3"

data_F <- NHANES_DATA_12 %>% 
  select(ID, Diabetes, data.sum2$feature) %>%
  filter(!is.na(Diabetes)) %>%
  na.omit()

\(~\)

1.0.1 Install if not Function

install_if_not <- function( list.of.packages ) {
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}

\(~\)

2 Getting Started with `caret`

The caret package is short for Classification And REgression Training.

It contains many useful functions for modeling.

For starters we can split data into training and testing sets with the caret::createDataPartition function:

library('caret')

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

# this will ensure our results are the same every run, to randomize you may use: set.seed(Sys.time())
set.seed(8675309)


# The createDataPartition function is used to create training and test sets
trainIndex <- createDataPartition(data_F$Diabetes, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)

TRAIN <- data_F[trainIndex, ]
TEST <- data_F[-trainIndex, ]

\(~\)

3 Grid Search with `caret`

The primary use of the caret package is to aide with performing resampling methods such as cross-validation, boosting.

For a list of models available to caret you can reference http://topepo.github.io/caret/available-models.html this is a list of models that are compatable with caret and corresponding list of available tuning parameters for the model.

For instance, the randomForest package only has one available tuning parameter, mtry.

What caret will enable us to do is to test the different values of mtry.

Here, we will use a 5-fold cross-validated model over the grid of available options for mtry and see which performs best:

library(caret)

grid_rf <- expand.grid( mtry=c(1:20) )

grid_rf

dim(grid_rf)

## [1] 20  1

train_ctrl <- trainControl(method="cv", # type of resampling in this case Cross-Validated
                           number=5, # number of folds
                           search = "grid" # we are performing a "grid-search"
                           )


model_cv_grid <- train(Diabetes ~ .,
                       data = TRAIN,
                       method = "rf", # this will use the randomForest::randomForest function
                       metric = "Accuracy", # which metric should be optimized for 
                       trControl = train_ctrl, 
                       tuneGrid = grid_rf,
                       # options to be passed to randomForest
                       ntree = 741,
                       keep.forest=TRUE,
                       importance=TRUE
                       ) 

model_cv_grid

## Random Forest 
## 
## 396 samples
##  58 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 317, 316, 317, 317, 317 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    1    0.9469937  0.0000000
##    2    0.9469937  0.0000000
##    3    0.9621519  0.4033579
##    4    0.9621519  0.4033579
##    5    0.9621519  0.4033579
##    6    0.9621519  0.4033579
##    7    0.9621519  0.4033579
##    8    0.9621519  0.4033579
##    9    0.9621519  0.4033579
##   10    0.9621519  0.4033579
##   11    0.9621519  0.4033579
##   12    0.9621519  0.4033579
##   13    0.9621519  0.4033579
##   14    0.9621519  0.4033579
##   15    0.9621519  0.4033579
##   16    0.9621519  0.4033579
##   17    0.9621519  0.4033579
##   18    0.9621519  0.4033579
##   19    0.9621519  0.4033579
##   20    0.9621519  0.4033579
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.

plot(model_cv_grid)

str(model_cv_grid,1)

## List of 24
##  $ method      : chr "rf"
##  $ modelInfo   :List of 15
##  $ modelType   : chr "Classification"
##  $ results     :'data.frame':    20 obs. of  5 variables:
##  $ pred        : NULL
##  $ bestTune    :'data.frame':    1 obs. of  1 variable:
##  $ call        : language train.formula(form = Diabetes ~ ., data = TRAIN, method = "rf", metric = "Accuracy",      trControl = train_ctrl,| __truncated__ ...
##  $ dots        :List of 3
##  $ metric      : chr "Accuracy"
##  $ control     :List of 27
##  $ finalModel  :List of 23
##   ..- attr(*, "class")= chr "randomForest"
##  $ preProcess  : NULL
##  $ trainingData:Classes 'tbl_df', 'tbl' and 'data.frame':    396 obs. of  59 variables:
##   ..- attr(*, "na.action")= 'omit' Named int [1:4276] 1 3 4 5 6 7 8 9 10 12 ...
##   .. ..- attr(*, "names")= chr [1:4276] "1" "3" "4" "5" ...
##  $ resample    :'data.frame':    5 obs. of  3 variables:
##  $ resampledCM :'data.frame':    100 obs. of  6 variables:
##  $ perfNames   : chr [1:2] "Accuracy" "Kappa"
##  $ maximize    : logi TRUE
##  $ yLimits     : NULL
##  $ times       :List of 3
##  $ levels      : chr [1:2] "No" "Yes"
##   ..- attr(*, "ordered")= logi FALSE
##  $ terms       :Classes 'terms', 'formula'  language Diabetes ~ ID + PhysActiveDays + SexOrientation + SexNumPartYear + Marijuana +      RegularMarij + AlcoholDay + S| __truncated__ ...
##   .. ..- attr(*, "variables")= language list(Diabetes, ID, PhysActiveDays, SexOrientation, SexNumPartYear, Marijuana,      RegularMarij, AlcoholDay, SexA| __truncated__ ...
##   .. ..- attr(*, "factors")= int [1:59, 1:58] 0 1 0 0 0 0 0 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. ..- attr(*, "term.labels")= chr [1:58] "ID" "PhysActiveDays" "SexOrientation" "SexNumPartYear" ...
##   .. ..- attr(*, "order")= int [1:58] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(Diabetes, ID, PhysActiveDays, SexOrientation, SexNumPartYear, Marijuana,      RegularMarij, AlcoholDay, SexA| __truncated__ ...
##   .. ..- attr(*, "dataClasses")= Named chr [1:59] "factor" "numeric" "numeric" "factor" ...
##   .. .. ..- attr(*, "names")= chr [1:59] "Diabetes" "ID" "PhysActiveDays" "SexOrientation" ...
##  $ coefnames   : chr [1:108] "ID" "PhysActiveDays" "SexOrientationHeterosexual" "SexOrientationHomosexual" ...
##  $ contrasts   :List of 27
##  $ xlevels     :List of 27
##  - attr(*, "class")= chr [1:2] "train" "train.formula"

model_cv_grid$bestTune$mtry

## [1] 3

model_cv_grid$results

##    mtry  Accuracy     Kappa  AccuracySD   KappaSD
## 1     1 0.9469937 0.0000000 0.005307123 0.0000000
## 2     2 0.9469937 0.0000000 0.005307123 0.0000000
## 3     3 0.9621519 0.4033579 0.012502203 0.2723539
## 4     4 0.9621519 0.4033579 0.012502203 0.2723539
## 5     5 0.9621519 0.4033579 0.012502203 0.2723539
## 6     6 0.9621519 0.4033579 0.012502203 0.2723539
## 7     7 0.9621519 0.4033579 0.012502203 0.2723539
## 8     8 0.9621519 0.4033579 0.012502203 0.2723539
## 9     9 0.9621519 0.4033579 0.012502203 0.2723539
## 10   10 0.9621519 0.4033579 0.012502203 0.2723539
## 11   11 0.9621519 0.4033579 0.012502203 0.2723539
## 12   12 0.9621519 0.4033579 0.012502203 0.2723539
## 13   13 0.9621519 0.4033579 0.012502203 0.2723539
## 14   14 0.9621519 0.4033579 0.012502203 0.2723539
## 15   15 0.9621519 0.4033579 0.012502203 0.2723539
## 16   16 0.9621519 0.4033579 0.012502203 0.2723539
## 17   17 0.9621519 0.4033579 0.012502203 0.2723539
## 18   18 0.9621519 0.4033579 0.012502203 0.2723539
## 19   19 0.9621519 0.4033579 0.012502203 0.2723539
## 20   20 0.9621519 0.4033579 0.012502203 0.2723539

model_cv_grid$results %>%
  select(mtry, Accuracy, Kappa) %>%
  gather(-mtry, key="metric", value="Value") %>%
  ggplot(aes(x=mtry, y=Value, color = metric, shape=metric ) ) + 
     geom_point() + 
     geom_line()

model_cv_grid$finalModel

## 
## Call:
##  randomForest(x = x, y = y, ntree = 741, mtry = param$mtry, importance = TRUE,      keep.forest = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 741
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 3.79%
## Confusion matrix:
##      No Yes class.error
## No  375   0   0.0000000
## Yes  15   6   0.7142857

3.1 Feature Importances

varImp(model_cv_grid)

## rf variable importance
## 
##   only 20 most important variables shown (out of 108)
## 
##                 Importance
## SexNumPartnLife     100.00
## UrineFlow1           89.84
## BMI                  88.83
## BPDiaAve             87.60
## AgeDecade 50-59      85.30
## BPDia2               84.98
## Age                  84.57
## Height               82.30
## BPDia3               81.90
## BPDia1               78.03
## Weight               77.97
## Pulse                77.88
## BPSys2               77.55
## BPSys3               75.83
## BPSys1               74.15
## TotChol              72.19
## ID                   71.92
## AlcoholDay           71.77
## SexAge               71.25
## SleepHrsNight        70.71

model_cv_grid$finalModel$importance

##                                       No           Yes MeanDecreaseAccuracy
## ID                          8.341262e-04  9.954437e-03         1.296629e-03
## PhysActiveDays              1.913270e-04  5.194659e-03         4.409630e-04
## SexOrientationHeterosexual  4.078457e-05  0.000000e+00         3.872420e-05
## SexOrientationHomosexual    2.014220e-05  0.000000e+00         1.874344e-05
## SexNumPartYear              4.043881e-04  4.405439e-03         5.987866e-04
## MarijuanaYes                5.694672e-04  1.999609e-03         6.194468e-04
## RegularMarijYes             2.395877e-04  4.021971e-03         4.165889e-04
## AlcoholDay                  5.510716e-04  1.508019e-02         1.301528e-03
## SexAge                      5.670584e-04  1.286708e-02         1.134064e-03
## SexNumPartnLife             1.502080e-03  1.973325e-02         2.440264e-03
## HardDrugsYes                4.022662e-04  7.202589e-03         7.416562e-04
## SexEverYes                  0.000000e+00  0.000000e+00         0.000000e+00
## SameSexYes                  1.180457e-04  1.103721e-03         1.579279e-04
## AlcoholYear                 7.051510e-04  4.343673e-03         8.746739e-04
## LittleInterestSeveral       3.007011e-04  1.881890e-03         3.774726e-04
## LittleInterestMost          6.030607e-05  0.000000e+00         5.729313e-05
## DepressedSeveral            1.507222e-04 -2.211726e-04         1.433751e-04
## DepressedMost               5.918036e-05  3.748688e-04         7.489524e-05
## Alcohol12PlusYrYes          8.051732e-05  0.000000e+00         7.605402e-05
## Education9 - 11th Grade    -9.127885e-05  5.735493e-04        -5.573739e-05
## EducationHigh School        5.210412e-04  5.042536e-03         7.195073e-04
## EducationSome College       2.309968e-04  3.854771e-03         3.972307e-04
## EducationCollege Grad       3.334475e-04  4.666045e-03         5.840761e-04
## MaritalStatusLivePartner    2.086221e-04  2.472041e-03         3.166085e-04
## MaritalStatusMarried        1.044386e-04  4.013822e-03         3.004816e-04
## MaritalStatusNeverMarried   1.272860e-04  4.006325e-03         3.330597e-04
## MaritalStatusSeparated     -9.528178e-05 -5.087018e-04        -1.179922e-04
## MaritalStatusWidowed        0.000000e+00  0.000000e+00         0.000000e+00
## Smoke100Yes                 5.676440e-04  2.452859e-03         6.503419e-04
## Smoke100nSmoker             2.826617e-04  3.094323e-03         4.274327e-04
## DaysPhysHlthBad             3.288388e-04  5.964844e-03         6.405180e-04
## DaysMentHlthBad             1.149341e-04  3.819797e-03         3.250520e-04
## HealthGenVgood              2.249076e-04  2.806696e-03         3.646914e-04
## HealthGenGood               2.669887e-04  2.589872e-03         3.925381e-04
## HealthGenFair               1.324826e-05  4.174529e-03         2.112347e-04
## HealthGenPoor              -3.874664e-05 -1.499475e-04        -4.537051e-05
## SleepHrsNight               7.283011e-04  7.420762e-03         1.062441e-03
## WorkNotWorking              2.582079e-05  3.484591e-03         2.099249e-04
## WorkWorking                 2.493450e-04  5.431946e-03         4.293637e-04
## SleepTroubleYes             6.963944e-04  3.567680e-03         8.229261e-04
## BPSys1                      1.649495e-03  7.107561e-03         1.910941e-03
## BPDia1                      1.739677e-03  1.109493e-02         2.209947e-03
## Testosterone                6.469916e-04  7.214112e-03         1.001948e-03
## PhysActiveYes               6.315495e-04  3.636763e-03         7.891283e-04
## BPSys2                      2.287912e-03  5.732830e-03         2.459610e-03
## BPDia2                      1.898107e-03  8.555772e-03         2.240932e-03
## UrineFlow1                  1.469049e-03  1.503415e-02         2.171559e-03
## BPSys3                      2.615702e-03  6.064321e-03         2.823036e-03
## BPDia3                      1.550011e-03  1.079846e-02         2.037198e-03
## DirectChol                  6.715118e-04  1.007073e-02         1.141932e-03
## TotChol                     9.181682e-04  8.394900e-03         1.318147e-03
## BPSysAve                    1.782187e-03  5.352883e-03         1.995548e-03
## BPDiaAve                    2.572784e-03  1.294737e-02         2.974411e-03
## Pulse                       9.866097e-04  1.253665e-02         1.591740e-03
## UrineVol1                   6.783902e-04  6.627759e-03         1.014965e-03
## HHIncome 5000-9999          1.078202e-04  0.000000e+00         1.019838e-04
## HHIncome10000-14999         4.953798e-05  6.185335e-04         6.543739e-05
## HHIncome15000-19999         5.005108e-05  0.000000e+00         4.727516e-05
## HHIncome20000-24999         4.106650e-05 -2.174239e-04         2.946806e-05
## HHIncome25000-34999         8.992583e-05 -3.003332e-04         7.675864e-05
## HHIncome35000-44999         1.980914e-04  5.050505e-04         2.070784e-04
## HHIncome45000-54999         6.082102e-05 -3.063363e-04         2.668529e-05
## HHIncome55000-64999        -4.554397e-06  3.213161e-05        -7.062565e-06
## HHIncome65000-74999         5.804912e-05  7.625902e-04         9.082183e-05
## HHIncome75000-99999         4.761815e-04  1.082462e-02         9.346937e-04
## HHIncomemore 99999          6.094297e-04  4.870665e-03         8.381542e-04
## HHIncomeMid                 1.004284e-03  3.775562e-03         1.182149e-03
## Poverty                     1.390839e-03  3.966258e-03         1.523497e-03
## BMI_WHO18.5_to_24.9         7.001008e-04  1.006255e-03         7.143621e-04
## BMI_WHO25.0_to_29.9         6.224625e-04  1.947127e-03         6.934010e-04
## BMI_WHO30.0_plus            1.158913e-03  7.343208e-03         1.479369e-03
## AgeDecade 10-19             0.000000e+00  0.000000e+00         0.000000e+00
## AgeDecade 20-29             2.453518e-04  2.460210e-03         3.635919e-04
## AgeDecade 30-39             2.579051e-04  2.407826e-03         3.642544e-04
## AgeDecade 40-49             2.130089e-04  4.451883e-03         3.939345e-04
## AgeDecade 50-59             9.069346e-04  1.360311e-02         1.506555e-03
## AgeDecade 60-69             0.000000e+00  0.000000e+00         0.000000e+00
## AgeDecade 70+               0.000000e+00  0.000000e+00         0.000000e+00
## BMI                         1.783865e-03  1.450536e-02         2.446608e-03
## Height                      7.517342e-04  1.269575e-02         1.355632e-03
## TVHrsDay0_to_1_hr           1.085984e-04  9.757936e-04         1.745445e-04
## TVHrsDay1_hr                7.074222e-05  7.111310e-04         1.032728e-04
## TVHrsDay2_hr                1.481450e-04  2.233196e-03         2.381488e-04
## TVHrsDay3_hr                3.775136e-04  4.323308e-03         5.826369e-04
## TVHrsDay4_hr               -8.533808e-05  1.959541e-04        -7.175324e-05
## TVHrsDayMore_4_hr           2.392996e-04  1.053995e-02         7.992061e-04
## CompHrsDay0_to_1_hr         3.836207e-04  1.587852e-03         4.514624e-04
## CompHrsDay1_hr              6.332322e-04  3.875218e-03         8.066848e-04
## CompHrsDay2_hr              1.084903e-04  2.238469e-03         2.375355e-04
## CompHrsDay3_hr             -9.844473e-05  0.000000e+00        -9.401935e-05
## CompHrsDay4_hr              3.895439e-05  2.249213e-04         4.631112e-05
## CompHrsDayMore_4_hr         3.726164e-05 -6.871296e-04         7.395513e-06
## Weight                      2.410584e-03  1.251572e-02         2.977781e-03
## HomeRooms                   7.544334e-04  1.011076e-02         1.135259e-03
## HomeOwnRent                 2.751140e-04  3.962412e-03         4.348997e-04
## HomeOwnOther               -7.803824e-06  0.000000e+00        -8.160400e-06
## SurveyYr2011_12             0.000000e+00  0.000000e+00         0.000000e+00
## Gendermale                  5.215084e-04  1.343116e-03         5.569063e-04
## Age                         1.241782e-03  1.458257e-02         1.939388e-03
## Race1Hispanic               1.920406e-05  0.000000e+00         1.827632e-05
## Race1Mexican                5.982145e-05  6.313862e-04         8.658560e-05
## Race1White                  2.662379e-04  1.772643e-03         3.344663e-04
## Race1Other                 -1.942956e-05  0.000000e+00        -1.844590e-05
## Race3Black                  1.725566e-04 -6.632294e-05         1.562583e-04
## Race3Hispanic               4.918663e-05  2.249213e-04         5.570617e-05
## Race3Mexican                6.099103e-05  2.913753e-04         7.729055e-05
## Race3White                  1.020431e-04  1.772010e-03         1.819081e-04
## Race3Other                 -9.243340e-06  0.000000e+00        -8.937269e-06
##                            MeanDecreaseGini
## ID                              0.953575113
## PhysActiveDays                  0.436855738
## SexOrientationHeterosexual      0.012191179
## SexOrientationHomosexual        0.016146206
## SexNumPartYear                  0.458146997
## MarijuanaYes                    0.237697524
## RegularMarijYes                 0.188362481
## AlcoholDay                      0.732289183
## SexAge                          0.900136529
## SexNumPartnLife                 0.986234958
## HardDrugsYes                    0.232671098
## SexEverYes                      0.000000000
## SameSexYes                      0.043806337
## AlcoholYear                     0.849765078
## LittleInterestSeveral           0.178885366
## LittleInterestMost              0.029787773
## DepressedSeveral                0.125153985
## DepressedMost                   0.028570997
## Alcohol12PlusYrYes              0.039811904
## Education9 - 11th Grade         0.135532238
## EducationHigh School            0.243880297
## EducationSome College           0.266807105
## EducationCollege Grad           0.182457410
## MaritalStatusLivePartner        0.126895581
## MaritalStatusMarried            0.190186388
## MaritalStatusNeverMarried       0.142570966
## MaritalStatusSeparated          0.172002388
## MaritalStatusWidowed            0.005332356
## Smoke100Yes                     0.171877158
## Smoke100nSmoker                 0.222814481
## DaysPhysHlthBad                 0.480694433
## DaysMentHlthBad                 0.393839839
## HealthGenVgood                  0.231288451
## HealthGenGood                   0.206952285
## HealthGenFair                   0.153793434
## HealthGenPoor                   0.087875710
## SleepHrsNight                   0.597496812
## WorkNotWorking                  0.138626351
## WorkWorking                     0.128207176
## SleepTroubleYes                 0.264568515
## BPSys1                          0.750600927
## BPDia1                          0.777318096
## Testosterone                    1.060240631
## PhysActiveYes                   0.252457009
## BPSys2                          0.752997829
## BPDia2                          0.674020159
## UrineFlow1                      1.305087310
## BPSys3                          0.713326620
## BPDia3                          0.789175336
## DirectChol                      1.144484933
## TotChol                         1.018048867
## BPSysAve                        0.719732353
## BPDiaAve                        0.854062199
## Pulse                           0.877995067
## UrineVol1                       0.890773667
## HHIncome 5000-9999              0.107402832
## HHIncome10000-14999             0.022403055
## HHIncome15000-19999             0.071829397
## HHIncome20000-24999             0.071087188
## HHIncome25000-34999             0.113894004
## HHIncome35000-44999             0.083768362
## HHIncome45000-54999             0.161515382
## HHIncome55000-64999             0.169190284
## HHIncome65000-74999             0.029874562
## HHIncome75000-99999             0.299612021
## HHIncomemore 99999              0.204972647
## HHIncomeMid                     0.547676943
## Poverty                         0.761119466
## BMI_WHO18.5_to_24.9             0.168181127
## BMI_WHO25.0_to_29.9             0.214158783
## BMI_WHO30.0_plus                0.344434830
## AgeDecade 10-19                 0.000000000
## AgeDecade 20-29                 0.115816946
## AgeDecade 30-39                 0.140955059
## AgeDecade 40-49                 0.306516967
## AgeDecade 50-59                 0.409172252
## AgeDecade 60-69                 0.000000000
## AgeDecade 70+                   0.000000000
## BMI                             1.165174605
## Height                          1.297876977
## TVHrsDay0_to_1_hr               0.167720305
## TVHrsDay1_hr                    0.125606432
## TVHrsDay2_hr                    0.135858129
## TVHrsDay3_hr                    0.216013652
## TVHrsDay4_hr                    0.141444888
## TVHrsDayMore_4_hr               0.254191216
## CompHrsDay0_to_1_hr             0.190603248
## CompHrsDay1_hr                  0.279361243
## CompHrsDay2_hr                  0.117651584
## CompHrsDay3_hr                  0.013406943
## CompHrsDay4_hr                  0.032263125
## CompHrsDayMore_4_hr             0.157265678
## Weight                          1.154469690
## HomeRooms                       0.680144963
## HomeOwnRent                     0.256173655
## HomeOwnOther                    0.077218701
## SurveyYr2011_12                 0.000000000
## Gendermale                      0.218110356
## Age                             0.939226525
## Race1Hispanic                   0.006655825
## Race1Mexican                    0.097950546
## Race1White                      0.144245730
## Race1Other                      0.013926300
## Race3Black                      0.088555704
## Race3Hispanic                   0.015826232
## Race3Mexican                    0.123686632
## Race3White                      0.157132074
## Race3Other                      0.006479537

randomForest::varImpPlot(model_cv_grid$finalModel)

as_tibble(model_cv_grid$finalModel$importance, rownames='Feature') %>%
  arrange(-MeanDecreaseAccuracy)

## # A tibble: 108 x 5
##    Feature              No     Yes MeanDecreaseAccuracy MeanDecreaseGini
##    <chr>             <dbl>   <dbl>                <dbl>            <dbl>
##  1 Weight          0.00241 0.0125               0.00298            1.15 
##  2 BPDiaAve        0.00257 0.0129               0.00297            0.854
##  3 BPSys3          0.00262 0.00606              0.00282            0.713
##  4 BPSys2          0.00229 0.00573              0.00246            0.753
##  5 BMI             0.00178 0.0145               0.00245            1.17 
##  6 SexNumPartnLife 0.00150 0.0197               0.00244            0.986
##  7 BPDia2          0.00190 0.00856              0.00224            0.674
##  8 BPDia1          0.00174 0.0111               0.00221            0.777
##  9 UrineFlow1      0.00147 0.0150               0.00217            1.31 
## 10 BPDia3          0.00155 0.0108               0.00204            0.789
## # ... with 98 more rows

as_tibble(model_cv_grid$finalModel$importance, rownames='Feature') %>%
  arrange(-MeanDecreaseGini)

## # A tibble: 108 x 5
##    Feature               No     Yes MeanDecreaseAccuracy MeanDecreaseGini
##    <chr>              <dbl>   <dbl>                <dbl>            <dbl>
##  1 UrineFlow1      0.00147  0.0150               0.00217            1.31 
##  2 Height          0.000752 0.0127               0.00136            1.30 
##  3 BMI             0.00178  0.0145               0.00245            1.17 
##  4 Weight          0.00241  0.0125               0.00298            1.15 
##  5 DirectChol      0.000672 0.0101               0.00114            1.14 
##  6 Testosterone    0.000647 0.00721              0.00100            1.06 
##  7 TotChol         0.000918 0.00839              0.00132            1.02 
##  8 SexNumPartnLife 0.00150  0.0197               0.00244            0.986
##  9 ID              0.000834 0.00995              0.00130            0.954
## 10 Age             0.00124  0.0146               0.00194            0.939
## # ... with 98 more rows

3.2 Score Testing Set

probs <- predict(model_cv_grid , TEST,"prob")
class <- predict(model_cv_grid , TEST,"raw")

TEST.scored <- cbind(TEST , probs, class )
glimpse(TEST.scored)

## Observations: 264
## Variables: 62
## $ ID              <int> 62172, 62199, 62231, 62340, 62444, 62460, 62481, 62...
## $ Diabetes        <fct> No, No, No, No, No, No, No, No, No, No, No, No, No,...
## $ PhysActiveDays  <int> 2, 7, 3, 5, 6, 3, 3, 7, 1, 1, 2, 1, 7, 3, 5, 4, 3, ...
## $ SexOrientation  <fct> Heterosexual, Homosexual, Heterosexual, Heterosexua...
## $ SexNumPartYear  <int> 2, 0, 1, 1, 2, 1, 2, 1, 1, 2, 2, 10, 1, 1, 0, 1, 1,...
## $ Marijuana       <fct> Yes, Yes, No, No, Yes, Yes, No, Yes, No, No, No, Ye...
## $ RegularMarij    <fct> No, No, No, No, No, No, No, No, No, No, No, Yes, No...
## $ AlcoholDay      <int> 3, 1, 2, 1, 4, 3, 2, 1, 1, 6, 6, 15, 1, 2, 1, 1, 1,...
## $ SexAge          <int> 17, 19, 14, 19, 17, 16, 20, 16, 24, 17, 17, 17, 18,...
## $ SexNumPartnLife <int> 4, 6, 1, 5, 3, 50, 25, 10, 7, 7, 7, 300, 4, 5, 5, 1...
## $ HardDrugs       <fct> No, Yes, No, No, No, Yes, No, No, No, No, No, Yes, ...
## $ SexEver         <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Y...
## $ SameSex         <fct> No, Yes, No, No, No, No, No, No, No, No, No, No, No...
## $ AlcoholYear     <int> 104, 260, 3, 12, 52, 104, 6, 300, 24, 52, 52, 104, ...
## $ LittleInterest  <fct> Several, None, None, Most, None, Several, None, Non...
## $ Depressed       <fct> Most, None, None, Most, None, Several, None, None, ...
## $ Alcohol12PlusYr <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Y...
## $ Education       <fct> High School, College Grad, High School, Some Colleg...
## $ MaritalStatus   <fct> NeverMarried, LivePartner, Married, Married, NeverM...
## $ Smoke100        <fct> Yes, Yes, No, No, No, Yes, No, Yes, No, Yes, Yes, Y...
## $ Smoke100n       <fct> Smoker, Smoker, Non-Smoker, Non-Smoker, Non-Smoker,...
## $ DaysPhysHlthBad <int> 2, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 7, 0, 2, 0, 0, 0, ...
## $ DaysMentHlthBad <int> 10, 1, 0, 26, 0, 3, 30, 5, 0, 0, 0, 14, 0, 2, 0, 0,...
## $ HealthGen       <fct> Good, Vgood, Good, Vgood, Fair, Good, Vgood, Vgood,...
## $ SleepHrsNight   <int> 8, 8, 4, 6, 5, 6, 6, 8, 7, 6, 6, 6, 8, 6, 9, 7, 7, ...
## $ Work            <fct> NotWorking, Working, Working, Working, Working, Wor...
## $ SleepTrouble    <fct> No, No, No, No, No, Yes, No, No, No, No, No, Yes, N...
## $ BPSys1          <int> 100, 112, 120, 106, 112, 120, 98, 118, 102, 132, 13...
## $ BPDia1          <int> 70, 70, 70, 80, 54, 90, 56, 76, 62, 70, 70, 86, 76,...
## $ Testosterone    <dbl> 47.53, 269.24, 14.90, 296.66, 19.76, 299.19, 48.93,...
## $ PhysActive      <fct> No, Yes, No, No, Yes, Yes, No, Yes, Yes, Yes, Yes, ...
## $ BPSys2          <int> 102, 108, 116, 106, 116, 124, 96, 108, 100, 124, 12...
## $ BPDia2          <int> 68, 64, 70, 82, 54, 88, 62, 80, 68, 66, 66, 72, 74,...
## $ UrineFlow1      <dbl> 0.645, 0.380, 0.196, 0.623, 0.356, 0.498, 1.300, 1....
## $ BPSys3          <int> 104, 112, 112, 106, 112, 130, 94, 120, 102, 126, 12...
## $ BPDia3          <int> 76, 66, 68, 80, 56, 90, 54, 76, 60, 70, 70, 70, 76,...
## $ DirectChol      <dbl> 1.89, 0.91, 1.53, 0.98, 1.16, 1.37, 1.19, 1.81, 1.4...
## $ TotChol         <dbl> 4.37, 4.42, 4.71, 4.16, 4.34, 4.65, 3.83, 4.89, 4.3...
## $ BPSysAve        <int> 103, 110, 114, 106, 114, 127, 95, 114, 101, 125, 12...
## $ BPDiaAve        <int> 72, 65, 69, 81, 55, 89, 58, 78, 64, 68, 68, 71, 75,...
## $ Pulse           <int> 80, 84, 64, 68, 68, 68, 78, 84, 72, 80, 80, 78, 58,...
## $ UrineVol1       <int> 107, 65, 19, 96, 26, 118, 282, 72, 276, 106, 106, 9...
## $ HHIncome        <fct> 20000-24999, more 99999, more 99999, 25000-34999,  ...
## $ HHIncomeMid     <int> 22500, 100000, 100000, 30000, 2500, 100000, 70000, ...
## $ Poverty         <dbl> 2.02, 5.00, 3.92, 1.28, 0.00, 4.34, 3.13, 4.07, 3.0...
## $ BMI_WHO         <fct> 30.0_plus, 25.0_to_29.9, 30.0_plus, 25.0_to_29.9, 3...
## $ AgeDecade       <fct>  40-49,  50-59,  40-49,  40-49,  20-29,  40-49,  30...
## $ BMI             <dbl> 33.3, 28.0, 33.2, 25.9, 33.6, 21.9, 27.7, 22.5, 27....
## $ Height          <dbl> 172.0, 186.0, 164.7, 173.2, 169.3, 188.1, 170.5, 16...
## $ TVHrsDay        <fct> More_4_hr, 1_hr, 2_hr, 3_hr, 4_hr, 1_hr, 1_hr, 2_hr...
## $ CompHrsDay      <fct> More_4_hr, 1_hr, 0_to_1_hr, 1_hr, 1_hr, 0_to_1_hr, ...
## $ Weight          <dbl> 98.6, 96.9, 90.1, 77.6, 96.4, 77.5, 80.5, 58.2, 87....
## $ HomeRooms       <int> 4, 4, 6, 6, 5, 11, 5, 7, 10, 1, 1, 8, 8, 13, 5, 8, ...
## $ HomeOwn         <fct> Rent, Rent, Own, Own, Own, Own, Rent, Own, Own, Ren...
## $ SurveyYr        <fct> 2011_12, 2011_12, 2011_12, 2011_12, 2011_12, 2011_1...
## $ Gender          <fct> female, male, female, male, female, male, female, f...
## $ Age             <int> 43, 57, 48, 44, 23, 41, 39, 41, 32, 21, 21, 35, 57,...
## $ Race1           <fct> Black, White, Mexican, White, Black, White, Hispani...
## $ Race3           <fct> Black, White, Mexican, White, Black, White, Hispani...
## $ No              <dbl> 0.9460189, 0.9716599, 0.9230769, 0.9919028, 0.95276...
## $ Yes             <dbl> 0.053981107, 0.028340081, 0.076923077, 0.008097166,...
## $ class           <fct> No, No, No, No, No, No, No, No, No, No, No, No, No,...

4 Random Search with `caret`

Another option, is to have caret randomly sample the tuning parameters for you:

train_ctrl_rand <- trainControl(method="boot", # type of resampling, in this case bootstrap
                                number = 13, # number of resamplings     
                                search = "random" # we are performing a "random" search
                                )
                           
model_boot_rand <- train(Diabetes ~ .,
                         data = TRAIN,
                         method = "rf", # this will use the randomForest::randomForest function
                         metric = "Accuracy", # which metric should be optimized for 
                         trControl = train_ctrl_rand, 
                         # options to be passed to randomForest
                         ntree = 741 )

model_boot_rand

## Random Forest 
## 
## 396 samples
##  58 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Bootstrapped (13 reps) 
## Summary of sample sizes: 396, 396, 396, 396, 396, 396, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   17    0.9582423  0.3002562
##   21    0.9582423  0.3002562
##   78    0.9529188  0.2676311
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 17.

4.1 Score Testing Set

probs <- predict(model_boot_rand , TEST,"prob")
class <- predict(model_boot_rand , TEST,"raw")

TEST.scored.boot <- cbind(TEST , probs, class )

5 Compare Grid-Search to Boosted

colnames(TEST.scored)

##  [1] "ID"              "Diabetes"        "PhysActiveDays"  "SexOrientation" 
##  [5] "SexNumPartYear"  "Marijuana"       "RegularMarij"    "AlcoholDay"     
##  [9] "SexAge"          "SexNumPartnLife" "HardDrugs"       "SexEver"        
## [13] "SameSex"         "AlcoholYear"     "LittleInterest"  "Depressed"      
## [17] "Alcohol12PlusYr" "Education"       "MaritalStatus"   "Smoke100"       
## [21] "Smoke100n"       "DaysPhysHlthBad" "DaysMentHlthBad" "HealthGen"      
## [25] "SleepHrsNight"   "Work"            "SleepTrouble"    "BPSys1"         
## [29] "BPDia1"          "Testosterone"    "PhysActive"      "BPSys2"         
## [33] "BPDia2"          "UrineFlow1"      "BPSys3"          "BPDia3"         
## [37] "DirectChol"      "TotChol"         "BPSysAve"        "BPDiaAve"       
## [41] "Pulse"           "UrineVol1"       "HHIncome"        "HHIncomeMid"    
## [45] "Poverty"         "BMI_WHO"         "AgeDecade"       "BMI"            
## [49] "Height"          "TVHrsDay"        "CompHrsDay"      "Weight"         
## [53] "HomeRooms"       "HomeOwn"         "SurveyYr"        "Gender"         
## [57] "Age"             "Race1"           "Race3"           "No"             
## [61] "Yes"             "class"

colnames(TEST.scored.boot)

##  [1] "ID"              "Diabetes"        "PhysActiveDays"  "SexOrientation" 
##  [5] "SexNumPartYear"  "Marijuana"       "RegularMarij"    "AlcoholDay"     
##  [9] "SexAge"          "SexNumPartnLife" "HardDrugs"       "SexEver"        
## [13] "SameSex"         "AlcoholYear"     "LittleInterest"  "Depressed"      
## [17] "Alcohol12PlusYr" "Education"       "MaritalStatus"   "Smoke100"       
## [21] "Smoke100n"       "DaysPhysHlthBad" "DaysMentHlthBad" "HealthGen"      
## [25] "SleepHrsNight"   "Work"            "SleepTrouble"    "BPSys1"         
## [29] "BPDia1"          "Testosterone"    "PhysActive"      "BPSys2"         
## [33] "BPDia2"          "UrineFlow1"      "BPSys3"          "BPDia3"         
## [37] "DirectChol"      "TotChol"         "BPSysAve"        "BPDiaAve"       
## [41] "Pulse"           "UrineVol1"       "HHIncome"        "HHIncomeMid"    
## [45] "Poverty"         "BMI_WHO"         "AgeDecade"       "BMI"            
## [49] "Height"          "TVHrsDay"        "CompHrsDay"      "Weight"         
## [53] "HomeRooms"       "HomeOwn"         "SurveyYr"        "Gender"         
## [57] "Age"             "Race1"           "Race3"           "No"             
## [61] "Yes"             "class"

test.model_cv_grid.score <- TEST.scored %>% 
  mutate(model="cv") 

test.model_boot_rand.score <- TEST.scored.boot %>%
  mutate(model='boot') 
  
stacked_df <- rbind(test.model_cv_grid.score, test.model_boot_rand.score)

library('yardstick')

## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.

## 
## Attaching package: 'yardstick'

## The following objects are masked from 'package:caret':
## 
##     precision, recall

## The following object is masked from 'package:readr':
## 
##     spec

roc_aucs <- stacked_df %>%
  group_by(model) %>%
  roc_auc(truth = Diabetes, Yes)

roc_aucs

## # A tibble: 2 x 4
##   model .metric .estimator .estimate
##   <chr> <chr>   <chr>          <dbl>
## 1 boot  roc_auc binary         0.84 
## 2 cv    roc_auc binary         0.850

boot_auc <- (roc_aucs %>% filter(model=='boot'))$.estimate
cv_auc <- (roc_aucs %>% filter(model=='cv'))$.estimate

roc_curves <- stacked_df %>%
  group_by(model) %>%
  roc_curve(truth = Diabetes, Yes)

autoplot(roc_curves) + 
  labs(caption = paste0("Bootstrapped ROC AUC: ", round(boot_auc,4),
                      "\nCross-Validated ROC AUC: ", round(cv_auc,4)))

confusion_matrices <- stacked_df %>%
  group_by(model) %>%
  conf_mat(truth =  Diabetes, class)

(confusion_matrices %>% filter(model == 'boot'))$conf_mat[[1]]

##           Truth
## Prediction  No Yes
##        No  250   8
##        Yes   0   6

(confusion_matrices %>% filter(model == 'cv'))$conf_mat[[1]]

##           Truth
## Prediction  No Yes
##        No  250   8
##        Yes   0   6

metrics_boots <- summary((confusion_matrices %>% filter(model == 'boot'))$conf_mat[[1]])  

metrics_cv <- summary((confusion_matrices %>% filter(model == 'cv'))$conf_mat[[1]])  

metrics_compare <- metrics_boots %>%
  left_join(metrics_cv, 
            by=c('.metric','.estimator'),
            suffix = c("_boot","_cv")) %>%
  gather(-.metric,-.estimator,key="model_type",value = Value)

ggplot(metrics_compare, aes(x=.metric, y=Value, fill=model_type)) +
  geom_bar(stat="identity", position=position_dodge()) + 
  coord_flip()

For more information on the caret package please see: http://topepo.github.io/caret/index.html

\(~\)

6 Code Appendix

\(~\)

knitr::opts_chunk$set(echo = TRUE)
library('tidyverse')
library(NHANES)

NHANES_DATA_12 <- NHANES %>% 
  select(-DiabetesAge) %>%
  filter(SurveyYr =='2011_12') 

SumNa <- function(col){sum(is.na(col))}

data.sum <- NHANES_DATA_12 %>% 
  summarise_all(SumNa) %>%
  tidyr::gather(key='feature', value='SumNa') %>%
  arrange(-SumNa) %>%
  mutate(PctNa = SumNa/nrow(NHANES_DATA_12))

data.sum2 <- data.sum %>% 
  filter(! (feature %in% c('ID','Diabetes'))) %>%
  filter(PctNa < .55)

data.sum2$feature

data_F <- NHANES_DATA_12 %>% 
  select(ID, Diabetes, data.sum2$feature) %>%
  filter(!is.na(Diabetes)) %>%
  na.omit()
install_if_not <- function( list.of.packages ) {
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}


library('caret')
# this will ensure our results are the same every run, to randomize you may use: set.seed(Sys.time())
set.seed(8675309)


# The createDataPartition function is used to create training and test sets
trainIndex <- createDataPartition(data_F$Diabetes, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)

TRAIN <- data_F[trainIndex, ]
TEST <- data_F[-trainIndex, ]

library(caret)

grid_rf <- expand.grid( mtry=c(1:20) )

grid_rf

dim(grid_rf)


train_ctrl <- trainControl(method="cv", # type of resampling in this case Cross-Validated
                           number=5, # number of folds
                           search = "grid" # we are performing a "grid-search"
                           )


model_cv_grid <- train(Diabetes ~ .,
                       data = TRAIN,
                       method = "rf", # this will use the randomForest::randomForest function
                       metric = "Accuracy", # which metric should be optimized for 
                       trControl = train_ctrl, 
                       tuneGrid = grid_rf,
                       # options to be passed to randomForest
                       ntree = 741,
                       keep.forest=TRUE,
                       importance=TRUE
                       ) 

model_cv_grid

plot(model_cv_grid)

str(model_cv_grid,1)

model_cv_grid$bestTune$mtry 

model_cv_grid$results

model_cv_grid$results %>%
  select(mtry, Accuracy, Kappa) %>%
  gather(-mtry, key="metric", value="Value") %>%
  ggplot(aes(x=mtry, y=Value, color = metric, shape=metric ) ) + 
     geom_point() + 
     geom_line()

model_cv_grid$finalModel
varImp(model_cv_grid)

model_cv_grid$finalModel$importance
randomForest::varImpPlot(model_cv_grid$finalModel)

as_tibble(model_cv_grid$finalModel$importance, rownames='Feature') %>%
  arrange(-MeanDecreaseAccuracy)

as_tibble(model_cv_grid$finalModel$importance, rownames='Feature') %>%
  arrange(-MeanDecreaseGini)
probs <- predict(model_cv_grid , TEST,"prob")
class <- predict(model_cv_grid , TEST,"raw")

TEST.scored <- cbind(TEST , probs, class )
glimpse(TEST.scored)
train_ctrl_rand <- trainControl(method="boot", # type of resampling, in this case bootstrap
                                number = 13, # number of resamplings     
                                search = "random" # we are performing a "random" search
                                )
                           
model_boot_rand <- train(Diabetes ~ .,
                         data = TRAIN,
                         method = "rf", # this will use the randomForest::randomForest function
                         metric = "Accuracy", # which metric should be optimized for 
                         trControl = train_ctrl_rand, 
                         # options to be passed to randomForest
                         ntree = 741 )

model_boot_rand                         

probs <- predict(model_boot_rand , TEST,"prob")
class <- predict(model_boot_rand , TEST,"raw")

TEST.scored.boot <- cbind(TEST , probs, class )

colnames(TEST.scored)
colnames(TEST.scored.boot)

test.model_cv_grid.score <- TEST.scored %>% 
  mutate(model="cv") 

test.model_boot_rand.score <- TEST.scored.boot %>%
  mutate(model='boot') 
  
stacked_df <- rbind(test.model_cv_grid.score, test.model_boot_rand.score)

library('yardstick')

roc_aucs <- stacked_df %>%
  group_by(model) %>%
  roc_auc(truth = Diabetes, Yes)

roc_aucs

boot_auc <- (roc_aucs %>% filter(model=='boot'))$.estimate
cv_auc <- (roc_aucs %>% filter(model=='cv'))$.estimate

roc_curves <- stacked_df %>%
  group_by(model) %>%
  roc_curve(truth = Diabetes, Yes)

autoplot(roc_curves) + 
  labs(caption = paste0("Bootstrapped ROC AUC: ", round(boot_auc,4),
                      "\nCross-Validated ROC AUC: ", round(cv_auc,4)))
                      
                      
confusion_matrices <- stacked_df %>%
  group_by(model) %>%
  conf_mat(truth =  Diabetes, class)

(confusion_matrices %>% filter(model == 'boot'))$conf_mat[[1]]
(confusion_matrices %>% filter(model == 'cv'))$conf_mat[[1]]

metrics_boots <- summary((confusion_matrices %>% filter(model == 'boot'))$conf_mat[[1]])  

metrics_cv <- summary((confusion_matrices %>% filter(model == 'cv'))$conf_mat[[1]])  

metrics_compare <- metrics_boots %>%
  left_join(metrics_cv, 
            by=c('.metric','.estimator'),
            suffix = c("_boot","_cv")) %>%
  gather(-.metric,-.estimator,key="model_type",value = Value)

ggplot(metrics_compare, aes(x=.metric, y=Value, fill=model_type)) +
  geom_bar(stat="identity", position=position_dodge()) + 
  coord_flip()

RandomForest with `caret`

J Kyle Armstrong, PhD

08 April 2020

1 Read in the Data

1.0.1 Install if not Function

2 Getting Started with `caret`

3 Grid Search with `caret`

3.1 Feature Importances

3.2 Score Testing Set

4 Random Search with `caret`

4.1 Score Testing Set

5 Compare Grid-Search to Boosted

6 Code Appendix

RandomForest with caret

J Kyle Armstrong, PhD

08 April 2020

1 Read in the Data

1.0.1 Install if not Function

2 Getting Started with caret

3 Grid Search with caret

3.1 Feature Importances

3.2 Score Testing Set

4 Random Search with caret

4.1 Score Testing Set

5 Compare Grid-Search to Boosted

6 Code Appendix

RandomForest with `caret`

2 Getting Started with `caret`

3 Grid Search with `caret`

4 Random Search with `caret`