0.1 install_if_not function

install_if_not <- function( list.of.packages ) {
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}

install_if_not("gam") 
## [1] "the package 'gam' is already installed"

\(~\)

\(~\)


\(~\)

\(~\)

1 Read in the data

library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

glimpse(diab_pop)
## Observations: 5,719
## Variables: 10
## $ seqn     <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83...
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Ma...
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White...
## $ dmdeduc2 <fct> College grad or above, High school graduate/GED, High scho...
## $ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced,...
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "...
## $ bmxbmi   <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6...
## $ diq010   <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes,...
## $ lbxglu   <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284...

1.0.1 Let’s try to predict lbxglu:

1.1 omit nas

df <- diab_pop %>% 
  na.omit()

1.2 select factors and transform to numeric

my_factor_vars <- df %>% select_if(is.factor) %>% colnames()

df_as_nums <- df %>%
  mutate_at(vars(my_factor_vars), as.integer) %>%
  mutate_at(vars(my_factor_vars), as.factor)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(my_factor_vars)` instead of `my_factor_vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
glimpse(df_as_nums)
## Observations: 1,876
## Variables: 10
## $ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83...
## $ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4...
## $ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3...
## $ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5...
## $ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3...
## $ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010   <fct> 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...

\(~\)

\(~\)


\(~\)

\(~\)

2 Load caret

library('caret')
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

2.1 dummyVars

dV.df <- dummyVars(lbxglu ~ . , data = df_as_nums, fullRank=TRUE)

df_dV <- as_tibble(predict(dV.df,df_as_nums)) 
df_dV$lbxglu <- df$lbxglu 

2.2 split data

set.seed(8675309)

trainIndex <- createDataPartition(df_dV$lbxglu, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)
TRAIN <- df_dV[trainIndex, ]  
TEST <-  df_dV[-trainIndex, ] 

2.3 features versus target

X <- TRAIN %>% select(-lbxglu) 

dim(X)
## [1] 1127   29
y <- TRAIN$lbxglu

\(~\)

\(~\)


\(~\)

\(~\)

3 sbfControl function

ctrl <- sbfControl(functions = lmSBF,
                   method = "repeatedcv",
                   number = 7,
                   repeats = 5,
                   verbose = FALSE)

4 sbf function

lmProfile <- sbf(X, y, 
                 sbfControl  = ctrl)

\(~\)

\(~\)


\(~\)

\(~\)

5 Results

lmProfile
## 
## Selection By Filter
## 
## Outer resampling method: Cross-Validated (7 fold, repeated 5 times) 
## 
## Resampling performance:
## 
##   RMSE Rsquared   MAE RMSESD RsquaredSD MAESD
##  34.44   0.3292 17.85  7.101    0.09527 2.506
## 
## Using the training set, 8 variables were selected:
##    ridageyr, ridreth1.3, dmdmartl.2, dmdmartl.4, dmdmartl.5...
## 
## During resampling, the top 5 selected variables (out of a possible 16):
##    bmxbmi (100%), diq010.2 (100%), dmdmartl.5 (100%), ridageyr (100%), ridreth1.3 (100%)
## 
## On average, 7.3 variables were selected (min = 5, max = 11)

5.1 Optimal Variables

lmProfile$optVariables
## [1] "ridageyr"    "ridreth1.3"  "dmdmartl.2"  "dmdmartl.4"  "dmdmartl.5" 
## [6] "indhhin2.14" "bmxbmi"      "diq010.2"

5.2 Best Fit Model

lmProfile$fit
## 
## Call:
## lm(formula = y ~ ., data = tmp)
## 
## Coefficients:
## (Intercept)     ridageyr   ridreth1.3   dmdmartl.2   dmdmartl.4   dmdmartl.5  
##   150.57672      0.08535     -2.50795      2.07176      8.96023     -1.90338  
## indhhin2.14       bmxbmi     diq010.2  
##    -2.63139      0.43097    -62.46384

5.3 Score Test Data

y_hat <- predict(lmProfile$fit, TEST)

TEST.scored <- cbind(TEST, y_hat)

5.4 yardstick

yardstick::rmse(TEST.scored, lbxglu, y_hat)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        33.9

\(~\)

\(~\)

6 Code Appendix

\(~\)

\(~\)

install_if_not <- function( list.of.packages ) {
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}

install_if_not("gam") 
library(tidyverse)

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

glimpse(diab_pop)
df <- diab_pop %>% 
  na.omit()
my_factor_vars <- df %>% select_if(is.factor) %>% colnames()

df_as_nums <- df %>%
  mutate_at(vars(my_factor_vars), as.integer) %>%
  mutate_at(vars(my_factor_vars), as.factor)

glimpse(df_as_nums)
library('caret')
dV.df <- dummyVars(lbxglu ~ . , data = df_as_nums, fullRank=TRUE)

df_dV <- as_tibble(predict(dV.df,df_as_nums)) 
df_dV$lbxglu <- df$lbxglu 
set.seed(8675309)

trainIndex <- createDataPartition(df_dV$lbxglu, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)
TRAIN <- df_dV[trainIndex, ]  
TEST <-  df_dV[-trainIndex, ] 
X <- TRAIN %>% select(-lbxglu) 

dim(X)

y <- TRAIN$lbxglu
ctrl <- sbfControl(functions = lmSBF,
                   method = "repeatedcv",
                   number = 7,
                   repeats = 5,
                   verbose = FALSE)
lmProfile <- sbf(X, y, 
                 sbfControl  = ctrl)
lmProfile
lmProfile$optVariables
lmProfile$fit
y_hat <- predict(lmProfile$fit, TEST)

TEST.scored <- cbind(TEST, y_hat)
yardstick::rmse(TEST.scored, lbxglu, y_hat)