1 Setup

library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

glimpse(diab_pop)
## Observations: 5,719
## Variables: 10
## $ seqn     <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83...
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Ma...
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White...
## $ dmdeduc2 <fct> College grad or above, High school graduate/GED, High scho...
## $ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced,...
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "...
## $ bmxbmi   <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6...
## $ diq010   <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes,...
## $ lbxglu   <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284...

1.0.1 Let’s try to predict lbxglu:

df <- diab_pop %>% 
  na.omit()

my_factor_vars <- df %>% select_if(is.factor) %>% colnames()

df_as_nums <- df %>%
  mutate_at(vars(my_factor_vars), as.integer) %>%
  mutate_at(vars(my_factor_vars), as.factor)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(my_factor_vars)` instead of `my_factor_vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
glimpse(df_as_nums)
## Observations: 1,876
## Variables: 10
## $ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83...
## $ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4...
## $ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3...
## $ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5...
## $ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3...
## $ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010   <fct> 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...
library('caret')
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
dV.df <- dummyVars(lbxglu ~ . , data = df_as_nums, fullRank=TRUE)

df_dV <- as_tibble(predict(dV.df,df_as_nums)) 
df_dV$lbxglu <- df$lbxglu 

\(~\)


\(~\)

2 Split data

library(rsample)

set.seed(8675309)

train_test <- initial_split(df_dV, prop = .6)
TRAIN <- training(train_test)
TEST <- testing(train_test)

\(~\)


\(~\)

3 Create features and outcome

X <- TRAIN %>% select(-lbxglu) 

y <- TRAIN$lbxglu

\(~\)


\(~\)

4 Recursive Feature Elimination

4.1 The rfeControl function

ctrl <- rfeControl(functions = lmFuncs,
                   method = "repeatedcv",
                   number = 10,
                   repeats = 10,
                   verbose = FALSE)

4.2 The rfe function

lmProfile <- rfe(X, y,
                 sizes = c(1:ncol(X)),
                 rfeControl = ctrl)

\(~\)


\(~\)

5 Results

lmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 10 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          1 31.73   0.3568 17.52  6.297    0.09818 2.240         
##          2 31.90   0.3502 17.67  6.249    0.09575 2.232         
##          3 31.89   0.3507 17.74  6.186    0.09453 2.221         
##          4 31.80   0.3542 17.73  6.123    0.09405 2.184         
##          5 31.75   0.3559 17.71  6.106    0.09361 2.193         
##          6 31.79   0.3546 17.77  6.072    0.09289 2.189         
##          7 31.83   0.3532 17.81  6.064    0.09265 2.185         
##          8 31.85   0.3523 17.84  6.056    0.09170 2.188         
##          9 31.87   0.3515 17.86  6.052    0.09156 2.165         
##         10 31.89   0.3504 17.85  6.079    0.09120 2.176         
##         11 31.92   0.3492 17.88  6.071    0.08974 2.170         
##         12 31.94   0.3486 17.89  6.065    0.08938 2.172         
##         13 31.94   0.3485 17.90  6.046    0.08964 2.162         
##         14 31.97   0.3473 17.93  6.020    0.08946 2.158         
##         15 31.98   0.3473 17.92  6.019    0.08943 2.174         
##         16 31.99   0.3467 17.92  6.012    0.08870 2.166         
##         17 31.99   0.3467 17.93  6.004    0.08872 2.167         
##         18 31.99   0.3467 17.94  5.995    0.08765 2.149         
##         19 31.99   0.3470 17.95  5.982    0.08706 2.149         
##         20 31.97   0.3475 17.94  5.973    0.08675 2.137         
##         21 31.96   0.3481 17.93  5.982    0.08715 2.145         
##         22 31.96   0.3479 17.93  5.999    0.08669 2.156         
##         23 31.95   0.3487 17.92  5.993    0.08689 2.156         
##         24 31.93   0.3497 17.90  5.977    0.08736 2.144         
##         25 31.89   0.3517 17.83  5.988    0.08762 2.101         
##         26 31.80   0.3554 17.76  6.047    0.08843 2.105         
##         27 31.74   0.3579 17.67  6.071    0.08797 2.129         
##         28 31.72   0.3592 17.51  6.102    0.08803 2.130        *
##         29 31.75   0.3581 17.56  6.092    0.08764 2.121         
## 
## The top 5 variables (out of 28):
##    diq010.2, indhhin2.5, dmdmartl.4, indhhin2.4, indhhin2.12
plot(lmProfile, type = c("g", "o"))

lmProfile$optVariables
##  [1] "diq010.2"    "indhhin2.5"  "dmdmartl.4"  "indhhin2.4"  "indhhin2.12"
##  [6] "indhhin2.13" "indhhin2.3"  "ridreth1.4"  "indhhin2.6"  "indhhin2.2" 
## [11] "dmdmartl.3"  "indhhin2.8"  "indhhin2.10" "indhhin2.14" "ridreth1.3" 
## [16] "riagendr.2"  "dmdmartl.5"  "indhhin2.11" "ridreth1.2"  "dmdeduc2.3" 
## [21] "ridreth1.5"  "dmdeduc2.2"  "dmdmartl.2"  "dmdmartl.6"  "dmdeduc2.4" 
## [26] "dmdeduc2.5"  "bmxbmi"      "ridageyr"
lmProfile$fit
## 
## Call:
## lm(formula = y ~ ., data = tmp)
## 
## Coefficients:
## (Intercept)     diq010.2   indhhin2.5   dmdmartl.4   indhhin2.4  indhhin2.12  
##   142.66467    -62.07477     13.57685     12.23171     11.54773      9.70199  
## indhhin2.13   indhhin2.3   ridreth1.4   indhhin2.6   indhhin2.2   dmdmartl.3  
##     7.51897      6.49780     -6.64457      5.70919      4.90936      5.17152  
##  indhhin2.8  indhhin2.10  indhhin2.14   ridreth1.3   riagendr.2   dmdmartl.5  
##     4.32807      3.40585      3.53175     -3.76940     -3.82057     -3.07822  
## indhhin2.11   ridreth1.2   dmdeduc2.3   ridreth1.5   dmdeduc2.2   dmdmartl.2  
##     2.03963      2.08708      2.06865     -2.23775     -1.80767      0.97266  
##  dmdmartl.6   dmdeduc2.4   dmdeduc2.5       bmxbmi     ridageyr  
##    -0.89894     -0.23991     -0.06898      0.53070      0.11944
y_hat <- predict(lmProfile$fit, TEST)

TEST.scored <- cbind(TEST,y_hat)

yardstick::rmse(TEST.scored, lbxglu, y_hat)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        37.9

\(~\)

\(~\)

6 Code Appendix

\(~\)

library(tidyverse)

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

glimpse(diab_pop)
df <- diab_pop %>% 
  na.omit()

my_factor_vars <- df %>% select_if(is.factor) %>% colnames()

df_as_nums <- df %>%
  mutate_at(vars(my_factor_vars), as.integer) %>%
  mutate_at(vars(my_factor_vars), as.factor)

glimpse(df_as_nums)

library('caret')

dV.df <- dummyVars(lbxglu ~ . , data = df_as_nums, fullRank=TRUE)

df_dV <- as_tibble(predict(dV.df,df_as_nums)) 
df_dV$lbxglu <- df$lbxglu 

library(rsample)

set.seed(8675309)

train_test <- initial_split(df_dV, prop = .6)
TRAIN <- training(train_test)
TEST <- testing(train_test)
X <- TRAIN %>% select(-lbxglu) 

y <- TRAIN$lbxglu
ctrl <- rfeControl(functions = lmFuncs,
                   method = "repeatedcv",
                   number = 10,
                   repeats = 10,
                   verbose = FALSE)
lmProfile <- rfe(X, y,
                 sizes = c(1:ncol(X)),
                 rfeControl = ctrl)
lmProfile

plot(lmProfile, type = c("g", "o"))

lmProfile$optVariables

lmProfile$fit

y_hat <- predict(lmProfile$fit, TEST)

TEST.scored <- cbind(TEST,y_hat)

yardstick::rmse(TEST.scored, lbxglu, y_hat)