install_if_not
functioninstall_if_not <- function( list.of.packages ) {
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}
install_if_not("gam")
## [1] "the package 'gam' is already installed"
\(~\)
\(~\)
\(~\)
\(~\)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
glimpse(diab_pop)
## Observations: 5,719
## Variables: 10
## $ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83...
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Ma...
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White...
## $ dmdeduc2 <fct> College grad or above, High school graduate/GED, High scho...
## $ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced,...
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "...
## $ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6...
## $ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes,...
## $ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284...
lbxglu
:df <- diab_pop %>%
na.omit()
my_factor_vars <- df %>% select_if(is.factor) %>% colnames()
df_as_nums <- df %>%
mutate_at(vars(my_factor_vars), as.integer) %>%
mutate_at(vars(my_factor_vars), as.factor)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(my_factor_vars)` instead of `my_factor_vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
glimpse(df_as_nums)
## Observations: 1,876
## Variables: 10
## $ seqn <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83...
## $ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4...
## $ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3...
## $ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5...
## $ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3...
## $ bmxbmi <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010 <fct> 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ lbxglu <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...
\(~\)
\(~\)
\(~\)
\(~\)
caret
library('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
dummyVars
dV.df <- dummyVars(lbxglu ~ . , data = df_as_nums, fullRank=TRUE)
df_dV <- as_tibble(predict(dV.df,df_as_nums))
df_dV$lbxglu <- df$lbxglu
set.seed(8675309)
trainIndex <- createDataPartition(df_dV$lbxglu,
p = .6,
list = FALSE,
times = 1)
TRAIN <- df_dV[trainIndex, ]
TEST <- df_dV[-trainIndex, ]
X <- TRAIN %>% select(-lbxglu)
dim(X)
## [1] 1127 29
y <- TRAIN$lbxglu
\(~\)
\(~\)
\(~\)
\(~\)
sbfControl
functionctrl <- sbfControl(functions = lmSBF,
method = "repeatedcv",
number = 7,
repeats = 5,
verbose = FALSE)
sbf
functionlmProfile <- sbf(X, y,
sbfControl = ctrl)
\(~\)
\(~\)
\(~\)
\(~\)
lmProfile
##
## Selection By Filter
##
## Outer resampling method: Cross-Validated (7 fold, repeated 5 times)
##
## Resampling performance:
##
## RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 34.44 0.3292 17.85 7.101 0.09527 2.506
##
## Using the training set, 8 variables were selected:
## ridageyr, ridreth1.3, dmdmartl.2, dmdmartl.4, dmdmartl.5...
##
## During resampling, the top 5 selected variables (out of a possible 16):
## bmxbmi (100%), diq010.2 (100%), dmdmartl.5 (100%), ridageyr (100%), ridreth1.3 (100%)
##
## On average, 7.3 variables were selected (min = 5, max = 11)
lmProfile$optVariables
## [1] "ridageyr" "ridreth1.3" "dmdmartl.2" "dmdmartl.4" "dmdmartl.5"
## [6] "indhhin2.14" "bmxbmi" "diq010.2"
lmProfile$fit
##
## Call:
## lm(formula = y ~ ., data = tmp)
##
## Coefficients:
## (Intercept) ridageyr ridreth1.3 dmdmartl.2 dmdmartl.4 dmdmartl.5
## 150.57672 0.08535 -2.50795 2.07176 8.96023 -1.90338
## indhhin2.14 bmxbmi diq010.2
## -2.63139 0.43097 -62.46384
y_hat <- predict(lmProfile$fit, TEST)
TEST.scored <- cbind(TEST, y_hat)
yardstick
yardstick::rmse(TEST.scored, lbxglu, y_hat)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 33.9
\(~\)
\(~\)
\(~\)
\(~\)
install_if_not <- function( list.of.packages ) {
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}
install_if_not("gam")
library(tidyverse)
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
glimpse(diab_pop)
df <- diab_pop %>%
na.omit()
my_factor_vars <- df %>% select_if(is.factor) %>% colnames()
df_as_nums <- df %>%
mutate_at(vars(my_factor_vars), as.integer) %>%
mutate_at(vars(my_factor_vars), as.factor)
glimpse(df_as_nums)
library('caret')
dV.df <- dummyVars(lbxglu ~ . , data = df_as_nums, fullRank=TRUE)
df_dV <- as_tibble(predict(dV.df,df_as_nums))
df_dV$lbxglu <- df$lbxglu
set.seed(8675309)
trainIndex <- createDataPartition(df_dV$lbxglu,
p = .6,
list = FALSE,
times = 1)
TRAIN <- df_dV[trainIndex, ]
TEST <- df_dV[-trainIndex, ]
X <- TRAIN %>% select(-lbxglu)
dim(X)
y <- TRAIN$lbxglu
ctrl <- sbfControl(functions = lmSBF,
method = "repeatedcv",
number = 7,
repeats = 5,
verbose = FALSE)
lmProfile <- sbf(X, y,
sbfControl = ctrl)
lmProfile
lmProfile$optVariables
lmProfile$fit
y_hat <- predict(lmProfile$fit, TEST)
TEST.scored <- cbind(TEST, y_hat)
yardstick::rmse(TEST.scored, lbxglu, y_hat)