\(~\)

1 Read in the Data

install_if_not <- function( list.of.packages ) {
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}

library('tidyverse')

## -- Attaching packages ----------------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts -------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(NHANES)

NHANES_DATA_12 <- NHANES %>% 
  select(-DiabetesAge) %>%
  filter(SurveyYr =='2011_12') 

SumNa <- function(col){sum(is.na(col))}

data.sum <- NHANES_DATA_12 %>% 
  summarise_all(SumNa) %>%
  tidyr::gather(key='feature', value='SumNa') %>%
  arrange(-SumNa) %>%
  mutate(PctNa = SumNa/nrow(NHANES_DATA_12))

data.sum2 <- data.sum %>% 
  filter(! (feature %in% c('ID','Diabetes'))) %>%
  filter(PctNa < .55)

data.sum2$feature

##  [1] "PhysActiveDays"  "SexOrientation"  "SexNumPartYear"  "Marijuana"      
##  [5] "RegularMarij"    "AlcoholDay"      "SexAge"          "SexNumPartnLife"
##  [9] "HardDrugs"       "SexEver"         "SameSex"         "AlcoholYear"    
## [13] "LittleInterest"  "Depressed"       "Alcohol12PlusYr" "Education"      
## [17] "MaritalStatus"   "Smoke100"        "Smoke100n"       "DaysPhysHlthBad"
## [21] "DaysMentHlthBad" "HealthGen"       "SleepHrsNight"   "Work"           
## [25] "SleepTrouble"    "BPSys1"          "BPDia1"          "Testosterone"   
## [29] "PhysActive"      "BPSys2"         
##  [ reached getOption("max.print") -- omitted 27 entries ]

data_F <- NHANES_DATA_12 %>% 
  select(ID, Diabetes, data.sum2$feature) %>%
  filter(!is.na(Diabetes)) %>%
  na.omit()

\(~\)

2 Split Data into Training and Test Sets.

diab_pop.no_na_vals <- data_F


library('caret')

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

# this will ensure our results are the same every run, to randomize you may use: set.seed(Sys.time())
set.seed(8675309)


# The createDataPartition function is used to create training and test sets
trainIndex <- createDataPartition(diab_pop.no_na_vals$SexNumPartnLife, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)

TRAIN <- diab_pop.no_na_vals[trainIndex, ]
TEST <- diab_pop.no_na_vals[-trainIndex, ]

\(~\)

3 Train `randomForest` model

install_if_not('randomForest')

## [1] "the package 'randomForest' is already installed"

library('randomForest')

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

rf_model <- randomForest(SexNumPartnLife ~ . ,
                         data = TRAIN,
                         ntree= 550,
                         mtry=3,
                         keep.forest=TRUE,
                         importance=TRUE)

rf_model

## 
## Call:
##  randomForest(formula = SexNumPartnLife ~ ., data = TRAIN, ntree = 550,      mtry = 3, keep.forest = TRUE, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 550
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 1793.489
##                     % Var explained: 56.18

\(~\)

3.1 Random Forest is a Collection of Decision Trees

str(rf_model, 1)

## List of 18
##  $ call           : language randomForest(formula = SexNumPartnLife ~ ., data = TRAIN, ntree = 550,      mtry = 3, keep.forest = TRUE, importance = TRUE)
##  $ type           : chr "regression"
##  $ predicted      : Named num [1:397] 20.06 27.64 10.17 13.01 6.45 ...
##   ..- attr(*, "names")= chr [1:397] "1" "2" "3" "4" ...
##  $ mse            : num [1:550] 850 1211 2423 2432 2195 ...
##  $ rsq            : num [1:550] 0.792 0.704 0.408 0.406 0.464 ...
##  $ oob.times      : int [1:397] 201 199 203 210 210 201 217 209 209 211 ...
##  $ importance     : num [1:58, 1:2] 239.114 1.18 -6.598 -0.108 327.648 ...
##   ..- attr(*, "dimnames")=List of 2
##  $ importanceSD   : Named num [1:58] 65.95 0.682 11.491 0.213 87.597 ...
##   ..- attr(*, "names")= chr [1:58] "ID" "Diabetes" "PhysActiveDays" "SexOrientation" ...
##  $ localImportance: NULL
##  $ proximity      : NULL
##  $ ntree          : num 550
##  $ mtry           : num 3
##  $ forest         :List of 11
##  $ coefs          : NULL
##  $ y              : Named num [1:397] 6 4 20 1 5 5 5 15 8 3 ...
##   ..- attr(*, "names")= chr [1:397] "1" "2" "3" "4" ...
##  $ test           : NULL
##  $ inbag          : NULL
##  $ terms          :Classes 'terms', 'formula'  language SexNumPartnLife ~ ID + Diabetes + PhysActiveDays + SexOrientation + SexNumPartYear +      Marijuana + RegularMari| __truncated__ ...
##   .. ..- attr(*, "variables")= language list(SexNumPartnLife, ID, Diabetes, PhysActiveDays, SexOrientation, SexNumPartYear,      Marijuana, RegularMarij,| __truncated__ ...
##   .. ..- attr(*, "factors")= int [1:59, 1:58] 0 1 0 0 0 0 0 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. ..- attr(*, "term.labels")= chr [1:58] "ID" "Diabetes" "PhysActiveDays" "SexOrientation" ...
##   .. ..- attr(*, "order")= int [1:58] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "intercept")= num 0
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(SexNumPartnLife, ID, Diabetes, PhysActiveDays, SexOrientation, SexNumPartYear,      Marijuana, RegularMarij,| __truncated__ ...
##   .. ..- attr(*, "dataClasses")= Named chr [1:59] "numeric" "numeric" "factor" "numeric" ...
##   .. .. ..- attr(*, "names")= chr [1:59] "SexNumPartnLife" "ID" "Diabetes" "PhysActiveDays" ...
##  - attr(*, "class")= chr [1:2] "randomForest.formula" "randomForest"

getTree(rf_model, 1)

##     left daughter right daughter split var split point status prediction
## 1               2              3        20      1.0000     -3  21.657431
## 2               4              5        50    119.0000     -3  11.840517
## 3               6              7        24      6.5000     -3  35.460606
## 4               8              9        36      0.8050     -3   8.873096
## 5              10             11        57     29.0000     -3  28.542857
##  [ reached getOption("max.print") -- omitted 254 rows ]

reprtree:::plot.getTree(rf_model,k=1)

## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli

## Registered S3 method overwritten by 'reprtree':
##   method    from
##   text.tree tree

## Loading required package: plotrix

reprtree:::plot.getTree(rf_model,k=4)

reprtree:::plot.getTree(rf_model,k=500)

\(~\)

3.2 Feature Importances

varImp(rf_model)

##                    Overall
## ID               3.6257077
## Diabetes         1.7314275
## PhysActiveDays  -0.5741940
## SexOrientation  -0.5066696
## SexNumPartYear   3.7403828
## Marijuana        2.4565804
## RegularMarij     4.6999550
## AlcoholDay       5.1591547
## SexAge           3.8067884
## HardDrugs        2.8429804
## SexEver          0.0000000
## SameSex          0.9616397
## AlcoholYear      1.8518106
## LittleInterest   1.9113983
## Depressed        2.3629126
## Alcohol12PlusYr  1.3638525
## Education        2.1438324
## MaritalStatus    3.0361751
## Smoke100         2.2221353
## Smoke100n        2.8860931
## DaysPhysHlthBad  0.2909286
## DaysMentHlthBad  0.6231740
## HealthGen        2.8874822
## SleepHrsNight    3.1977870
## Work             0.9840099
## SleepTrouble     2.0832615
## BPSys1           3.9430683
## BPDia1           3.2744681
## Testosterone     3.3497603
## PhysActive       0.8386805
##  [ reached 'max' / getOption("max.print") -- omitted 28 rows ]

importance(rf_model)

##                    %IncMSE IncNodePurity
## ID               3.6257077   46471.57400
## Diabetes         1.7314275     312.78004
## PhysActiveDays  -0.5741940    7817.43567
## SexOrientation  -0.5066696     315.21416
## SexNumPartYear   3.7403828   74065.13220
## Marijuana        2.4565804    2426.49283
## RegularMarij     4.6999550   17813.88634
## AlcoholDay       5.1591547   72433.98476
## SexAge           3.8067884   35679.11104
## HardDrugs        2.8429804   29474.31404
## SexEver          0.0000000       0.00000
## SameSex          0.9616397     391.03106
## AlcoholYear      1.8518106   51286.67381
## LittleInterest   1.9113983     429.16386
## Depressed        2.3629126   30898.63660
##  [ reached getOption("max.print") -- omitted 43 rows ]

varImpPlot(rf_model)

varImpPlot(rf_model, type=1)

varImpPlot(rf_model, type=2)

\(~\)

3.2.1 Score Training Set

y_hat <- predict(rf_model, TRAIN)

TRAIN.rf_scored <- as_tibble(cbind(TRAIN, y_hat))
glimpse(TRAIN.rf_scored)

## Observations: 397
## Variables: 60
## $ ID              <int> 62199, 62205, 62220, 62231, 62340, 62340, 62340, 62...
## $ Diabetes        <fct> No, No, No, No, No, No, No, No, No, No, No, No, No,...
## $ PhysActiveDays  <int> 2, 1, 3, 3, 3, 5, 3, 2, 5, 6, 3, 7, 3, 1, 1, 2, 1, ...
## $ SexOrientation  <fct> Homosexual, Heterosexual, Heterosexual, Heterosexua...
## $ SexNumPartYear  <int> 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 0, 1, 2, 2, 10,...
## $ Marijuana       <fct> Yes, Yes, No, No, No, No, No, Yes, Yes, Yes, No, No...
## $ RegularMarij    <fct> No, No, No, No, No, No, No, Yes, No, No, No, No, No...
## $ AlcoholDay      <int> 1, 3, 3, 2, 1, 1, 1, 4, 7, 4, 2, 1, 2, 1, 6, 6, 15,...
## $ SexAge          <int> 19, 14, 18, 14, 19, 19, 19, 16, 18, 17, 20, 17, 22,...
## $ SexNumPartnLife <int> 6, 4, 20, 1, 5, 5, 5, 15, 8, 3, 25, 4, 1, 7, 7, 7, ...
## $ HardDrugs       <fct> Yes, No, No, No, No, No, No, Yes, Yes, No, No, No, ...
## $ SexEver         <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Y...
## $ SameSex         <fct> Yes, No, No, No, No, No, No, No, Yes, No, No, No, N...
## $ AlcoholYear     <int> 260, 52, 36, 3, 12, 12, 12, 104, 260, 52, 6, 5, 104...
## $ LittleInterest  <fct> None, None, None, None, Most, Most, Most, None, Non...
## $ Depressed       <fct> None, None, Several, None, Most, Most, Most, None, ...
## $ Alcohol12PlusYr <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Y...
## $ Education       <fct> College Grad, College Grad, College Grad, High Scho...
## $ MaritalStatus   <fct> LivePartner, NeverMarried, NeverMarried, Married, M...
## $ Smoke100        <fct> Yes, No, No, No, No, No, No, Yes, Yes, No, No, No, ...
## $ Smoke100n       <fct> Smoker, Non-Smoker, Non-Smoker, Non-Smoker, Non-Smo...
## $ DaysPhysHlthBad <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 7, ...
## $ DaysMentHlthBad <int> 1, 0, 3, 0, 26, 26, 26, 10, 3, 0, 30, 10, 4, 0, 0, ...
## $ HealthGen       <fct> Vgood, Good, Good, Good, Vgood, Vgood, Vgood, Excel...
## $ SleepHrsNight   <int> 8, 6, 6, 4, 6, 6, 6, 6, 8, 5, 6, 7, 8, 7, 6, 6, 6, ...
## $ Work            <fct> Working, Working, Working, Working, Working, Workin...
## $ SleepTrouble    <fct> No, Yes, Yes, No, No, No, No, No, No, No, No, No, N...
## $ BPSys1          <int> 112, 116, 118, 120, 106, 106, 106, 112, 104, 112, 9...
## $ BPDia1          <int> 70, 86, 70, 70, 80, 80, 80, 66, 80, 54, 56, 68, 52,...
## $ Testosterone    <dbl> 269.24, 466.11, 35.95, 14.90, 296.66, 296.66, 296.6...
## $ PhysActive      <fct> Yes, Yes, Yes, No, No, No, No, Yes, Yes, Yes, No, N...
## $ BPSys2          <int> 108, 122, 122, 116, 106, 106, 106, 112, 110, 116, 9...
## $ BPDia2          <int> 64, 88, 72, 70, 82, 82, 82, 64, 84, 54, 62, 64, 56,...
## $ UrineFlow1      <dbl> 0.380, 0.355, 0.624, 0.196, 0.623, 0.623, 0.623, 0....
## $ BPSys3          <int> 112, 122, 118, 112, 106, 106, 106, 118, 110, 112, 9...
## $ BPDia3          <int> 66, 86, 70, 68, 80, 80, 80, 70, 80, 56, 54, 68, 58,...
## $ DirectChol      <dbl> 0.91, 1.03, 1.37, 1.53, 0.98, 0.98, 0.98, 1.86, 1.1...
## $ TotChol         <dbl> 4.42, 5.46, 5.09, 4.71, 4.16, 4.16, 4.16, 4.47, 5.2...
## $ BPSysAve        <int> 110, 122, 120, 114, 106, 106, 106, 115, 110, 114, 9...
## $ BPDiaAve        <int> 65, 87, 71, 69, 81, 81, 81, 67, 82, 55, 58, 66, 57,...
## $ Pulse           <int> 84, 70, 62, 64, 68, 68, 68, 70, 80, 68, 78, 76, 54,...
## $ UrineVol1       <int> 65, 22, 121, 19, 96, 96, 96, 44, 193, 26, 282, 93, ...
## $ HHIncome        <fct> more 99999, more 99999, 55000-64999, more 99999, 25...
## $ HHIncomeMid     <int> 100000, 100000, 60000, 100000, 30000, 30000, 30000,...
## $ Poverty         <dbl> 5.00, 5.00, 4.92, 3.92, 1.28, 1.28, 1.28, 2.72, 5.0...
## $ BMI_WHO         <fct> 25.0_to_29.9, 25.0_to_29.9, 30.0_plus, 30.0_plus, 2...
## $ AgeDecade       <fct>  50-59,  20-29,  30-39,  40-49,  40-49,  40-49,  40...
## $ BMI             <dbl> 28.0, 28.9, 40.4, 33.2, 25.9, 25.9, 25.9, 22.3, 25....
## $ Height          <dbl> 186.0, 171.4, 167.3, 164.7, 173.2, 173.2, 173.2, 16...
## $ TVHrsDay        <fct> 1_hr, 1_hr, 2_hr, 2_hr, 3_hr, 3_hr, 3_hr, 2_hr, 0_t...
## $ CompHrsDay      <fct> 1_hr, 3_hr, 1_hr, 0_to_1_hr, 1_hr, 1_hr, 1_hr, 2_hr...
## $ Weight          <dbl> 96.9, 84.8, 113.1, 90.1, 77.6, 77.6, 77.6, 58.7, 91...
## $ HomeRooms       <int> 4, 12, 4, 6, 6, 6, 6, 5, 9, 5, 5, 9, 6, 10, 1, 1, 8...
## $ HomeOwn         <fct> Rent, Own, Own, Own, Own, Own, Own, Rent, Own, Own,...
## $ SurveyYr        <fct> 2011_12, 2011_12, 2011_12, 2011_12, 2011_12, 2011_1...
## $ Gender          <fct> male, male, female, female, male, male, male, femal...
## $ Age             <int> 57, 28, 31, 48, 44, 44, 44, 43, 36, 23, 39, 30, 23,...
## $ Race1           <fct> White, White, Black, Mexican, White, White, White, ...
## $ Race3           <fct> White, White, Black, Mexican, White, White, White, ...
## $ y_hat           <dbl> 13.628471, 14.382737, 15.506667, 6.655474, 5.760368...

\(~\)

3.2.1.1 Confusion Matrix on Training Set

RMSE_rf_Train <- yardstick::rmse(TRAIN.rf_scored, truth=SexNumPartnLife, estimate=y_hat)

RMSE_rf_Train

## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        22.5

\(~\)

4 Score Test Set

y_hat <- predict(rf_model, TEST)

#predict(diab_pop.no_na_vals.train.rf_model, diab_pop.no_na_vals.test, type ="prob")

test.rf_scored <- as_tibble(cbind(TEST, y_hat))

glimpse(test.rf_scored)

## Observations: 263
## Variables: 60
## $ ID              <int> 62172, 62199, 62222, 62291, 62297, 62342, 62342, 62...
## $ Diabetes        <fct> No, No, No, No, No, No, No, No, No, No, Yes, No, No...
## $ PhysActiveDays  <int> 2, 7, 2, 7, 3, 2, 7, 1, 7, 3, 5, 7, 7, 5, 5, 3, 7, ...
## $ SexOrientation  <fct> Heterosexual, Homosexual, Heterosexual, Heterosexua...
## $ SexNumPartYear  <int> 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, ...
## $ Marijuana       <fct> Yes, Yes, No, No, Yes, No, No, No, No, Yes, Yes, Ye...
## $ RegularMarij    <fct> No, No, No, No, No, No, No, No, No, No, Yes, No, No...
## $ AlcoholDay      <int> 3, 1, 2, 3, 4, 2, 2, 2, 6, 3, 1, 1, 1, 2, 1, 1, 1, ...
## $ SexAge          <int> 17, 19, 20, 20, 22, 19, 19, 19, 17, 16, 18, 16, 16,...
## $ SexNumPartnLife <int> 4, 6, 1, 3, 25, 3, 3, 1, 3, 50, 5, 10, 10, 15, 1, 6...
## $ HardDrugs       <fct> No, Yes, No, No, Yes, No, No, No, No, Yes, No, No, ...
## $ SexEver         <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Y...
## $ SameSex         <fct> No, Yes, No, No, No, No, No, Yes, No, No, No, No, N...
## $ AlcoholYear     <int> 104, 260, 52, 3, 156, 36, 36, 2, 104, 104, 2, 300, ...
## $ LittleInterest  <fct> Several, None, None, Several, Most, None, None, Sev...
## $ Depressed       <fct> Most, None, None, None, None, None, None, Several, ...
## $ Alcohol12PlusYr <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Y...
## $ Education       <fct> High School, College Grad, College Grad, Some Colle...
## $ MaritalStatus   <fct> NeverMarried, LivePartner, Married, Married, Marrie...
## $ Smoke100        <fct> Yes, Yes, No, No, Yes, No, No, No, No, Yes, Yes, Ye...
## $ Smoke100n       <fct> Smoker, Smoker, Non-Smoker, Non-Smoker, Smoker, Non...
## $ DaysPhysHlthBad <int> 2, 0, 0, 21, 0, 0, 0, 20, 0, 0, 0, 0, 0, 5, 14, 0, ...
## $ DaysMentHlthBad <int> 10, 1, 0, 0, 0, 0, 0, 7, 0, 3, 0, 5, 5, 0, 20, 0, 0...
## $ HealthGen       <fct> Good, Vgood, Good, Fair, Vgood, Good, Good, Fair, G...
## $ SleepHrsNight   <int> 8, 8, 7, 7, 7, 6, 6, 8, 9, 6, 7, 8, 8, 8, 5, 6, 8, ...
## $ Work            <fct> NotWorking, Working, Working, Working, Working, Wor...
## $ SleepTrouble    <fct> No, No, Yes, Yes, No, Yes, Yes, No, No, Yes, Yes, N...
## $ BPSys1          <int> 100, 112, 106, 122, 126, 126, 126, 116, 122, 120, 1...
## $ BPDia1          <int> 70, 70, 76, 70, 76, 84, 84, 68, 80, 90, 72, 76, 76,...
## $ Testosterone    <dbl> 47.53, 269.24, 343.14, 257.94, 619.40, 36.68, 36.68...
## $ PhysActive      <fct> No, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No,...
## $ BPSys2          <int> 102, 108, 108, 120, 122, 114, 114, 106, 114, 124, 1...
## $ BPDia2          <int> 68, 64, 70, 68, 72, 74, 74, 70, 78, 88, 76, 80, 80,...
## $ UrineFlow1      <dbl> 0.645, 0.380, 1.297, 0.809, 0.448, 0.647, 0.647, 2....
## $ BPSys3          <int> 104, 112, 100, 116, 122, 110, 110, 112, 116, 130, 1...
## $ BPDia3          <int> 76, 66, 76, 62, 76, 70, 70, 70, 70, 90, 78, 76, 76,...
## $ DirectChol      <dbl> 1.89, 0.91, 1.32, 1.11, 1.55, 1.27, 1.27, 1.14, 0.8...
## $ TotChol         <dbl> 4.37, 4.42, 5.53, 3.44, 5.59, 5.12, 5.12, 3.26, 6.0...
## $ BPSysAve        <int> 103, 110, 104, 118, 122, 112, 112, 109, 115, 127, 1...
## $ BPDiaAve        <int> 72, 65, 73, 65, 74, 72, 72, 70, 74, 89, 77, 78, 78,...
## $ Pulse           <int> 80, 84, 78, 60, 60, 78, 78, 90, 76, 68, 84, 84, 84,...
## $ UrineVol1       <int> 107, 65, 83, 178, 13, 66, 66, 240, 212, 118, 57, 72...
## $ HHIncome        <fct> 20000-24999, more 99999, more 99999, 75000-99999, 6...
## $ HHIncomeMid     <int> 22500, 100000, 100000, 87500, 70000, 50000, 50000, ...
## $ Poverty         <dbl> 2.02, 5.00, 5.00, 5.00, 3.67, 1.85, 1.85, 1.50, 1.3...
## $ BMI_WHO         <fct> 30.0_plus, 25.0_to_29.9, 25.0_to_29.9, 25.0_to_29.9...
## $ AgeDecade       <fct>  40-49,  50-59,  30-39,  50-59,  40-49,  30-39,  30...
## $ BMI             <dbl> 33.3, 28.0, 25.0, 29.9, 31.1, 32.9, 32.9, 38.0, 30....
## $ Height          <dbl> 172.0, 186.0, 179.0, 175.4, 167.9, 166.6, 166.6, 16...
## $ TVHrsDay        <fct> More_4_hr, 1_hr, 2_hr, 3_hr, 3_hr, 1_hr, 1_hr, 2_hr...
## $ CompHrsDay      <fct> More_4_hr, 1_hr, 0_to_1_hr, 3_hr, 2_hr, More_4_hr, ...
## $ Weight          <dbl> 98.6, 96.9, 80.1, 91.9, 87.8, 91.3, 91.3, 106.0, 96...
## $ HomeRooms       <int> 4, 4, 3, 6, 5, 6, 6, 6, 4, 11, 6, 7, 7, 7, 6, 7, 8,...
## $ HomeOwn         <fct> Rent, Rent, Rent, Own, Own, Own, Own, Rent, Own, Ow...
## $ SurveyYr        <fct> 2011_12, 2011_12, 2011_12, 2011_12, 2011_12, 2011_1...
## $ Gender          <fct> female, male, male, male, male, female, female, fem...
## $ Age             <int> 43, 57, 32, 56, 43, 38, 38, 23, 22, 41, 51, 41, 41,...
## $ Race1           <fct> Black, White, White, Other, Black, White, White, Wh...
## $ Race3           <fct> Black, White, White, Other, Black, White, White, Wh...
## $ y_hat           <dbl> 13.639121, 14.451091, 8.212610, 15.450294, 19.12722...

\(~\)

4.1 Testing Set Confusion Matrix

RMSE_rf_TEST <- yardstick::rmse(test.rf_scored, truth=SexNumPartnLife, estimate=y_hat)

RMSE_rf_TEST

## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        19.3

\(~\)

5 Code Appendix

\(~\)

knitr::opts_chunk$set(echo = TRUE)
options(max.print=30) 
install_if_not <- function( list.of.packages ) {
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}

library('tidyverse')
library(NHANES)

NHANES_DATA_12 <- NHANES %>% 
  select(-DiabetesAge) %>%
  filter(SurveyYr =='2011_12') 

SumNa <- function(col){sum(is.na(col))}

data.sum <- NHANES_DATA_12 %>% 
  summarise_all(SumNa) %>%
  tidyr::gather(key='feature', value='SumNa') %>%
  arrange(-SumNa) %>%
  mutate(PctNa = SumNa/nrow(NHANES_DATA_12))

data.sum2 <- data.sum %>% 
  filter(! (feature %in% c('ID','Diabetes'))) %>%
  filter(PctNa < .55)

data.sum2$feature

data_F <- NHANES_DATA_12 %>% 
  select(ID, Diabetes, data.sum2$feature) %>%
  filter(!is.na(Diabetes)) %>%
  na.omit()




diab_pop.no_na_vals <- data_F


library('caret')
# this will ensure our results are the same every run, to randomize you may use: set.seed(Sys.time())
set.seed(8675309)


# The createDataPartition function is used to create training and test sets
trainIndex <- createDataPartition(diab_pop.no_na_vals$SexNumPartnLife, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)

TRAIN <- diab_pop.no_na_vals[trainIndex, ]
TEST <- diab_pop.no_na_vals[-trainIndex, ]
install_if_not('randomForest')
library('randomForest')


rf_model <- randomForest(SexNumPartnLife ~ . ,
                         data = TRAIN,
                         ntree= 550,
                         mtry=3,
                         keep.forest=TRUE,
                         importance=TRUE)

rf_model
str(rf_model, 1)

getTree(rf_model, 1)


reprtree:::plot.getTree(rf_model,k=1)
reprtree:::plot.getTree(rf_model,k=4)
reprtree:::plot.getTree(rf_model,k=500)
varImp(rf_model)

importance(rf_model)

varImpPlot(rf_model)

varImpPlot(rf_model, type=1)
varImpPlot(rf_model, type=2)

y_hat <- predict(rf_model, TRAIN)

TRAIN.rf_scored <- as_tibble(cbind(TRAIN, y_hat))
glimpse(TRAIN.rf_scored)
RMSE_rf_Train <- yardstick::rmse(TRAIN.rf_scored, truth=SexNumPartnLife, estimate=y_hat)

RMSE_rf_Train
y_hat <- predict(rf_model, TEST)

#predict(diab_pop.no_na_vals.train.rf_model, diab_pop.no_na_vals.test, type ="prob")

test.rf_scored <- as_tibble(cbind(TEST, y_hat))

glimpse(test.rf_scored)

RMSE_rf_TEST <- yardstick::rmse(test.rf_scored, truth=SexNumPartnLife, estimate=y_hat)

RMSE_rf_TEST

Random Forest Regression

J Kyle Armstrong, PhD

26 August 2020

1 Read in the Data

2 Split Data into Training and Test Sets.

3 Train `randomForest` model

3.1 Random Forest is a Collection of Decision Trees

3.2 Feature Importances

3.2.1 Score Training Set

3.2.1.1 Confusion Matrix on Training Set

4 Score Test Set

4.1 Testing Set Confusion Matrix

5 Code Appendix

Random Forest Regression

J Kyle Armstrong, PhD

26 August 2020

1 Read in the Data

2 Split Data into Training and Test Sets.

3 Train randomForest model

3.1 Random Forest is a Collection of Decision Trees

3.2 Feature Importances

3.2.1 Score Training Set

3.2.1.1 Confusion Matrix on Training Set

4 Score Test Set

4.1 Testing Set Confusion Matrix

5 Code Appendix

3 Train `randomForest` model