library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
glimpse(diab_pop)
## Observations: 5,719
## Variables: 10
## $ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83...
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Ma...
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White...
## $ dmdeduc2 <fct> College grad or above, High school graduate/GED, High scho...
## $ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced,...
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "...
## $ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6...
## $ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes,...
## $ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284...
lbxglu
:df <- diab_pop %>%
na.omit()
my_factor_vars <- df %>% select_if(is.factor) %>% colnames()
df_as_nums <- df %>%
mutate_at(vars(my_factor_vars), as.integer) %>%
mutate_at(vars(my_factor_vars), as.factor)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(my_factor_vars)` instead of `my_factor_vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
glimpse(df_as_nums)
## Observations: 1,876
## Variables: 10
## $ seqn <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83...
## $ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4...
## $ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3...
## $ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5...
## $ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3...
## $ bmxbmi <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010 <fct> 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ lbxglu <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...
library('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
dV.df <- dummyVars(lbxglu ~ . , data = df_as_nums, fullRank=TRUE)
df_dV <- as_tibble(predict(dV.df,df_as_nums))
df_dV$lbxglu <- df$lbxglu
\(~\)
\(~\)
library(rsample)
set.seed(8675309)
train_test <- initial_split(df_dV, prop = .6)
TRAIN <- training(train_test)
TEST <- testing(train_test)
\(~\)
\(~\)
X <- TRAIN %>% select(-lbxglu)
y <- TRAIN$lbxglu
\(~\)
\(~\)
rfeControl
functionctrl <- rfeControl(functions = lmFuncs,
method = "repeatedcv",
number = 10,
repeats = 10,
verbose = FALSE)
rfe
functionlmProfile <- rfe(X, y,
sizes = c(1:ncol(X)),
rfeControl = ctrl)
\(~\)
\(~\)
lmProfile
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold, repeated 10 times)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 1 31.73 0.3568 17.52 6.297 0.09818 2.240
## 2 31.90 0.3502 17.67 6.249 0.09575 2.232
## 3 31.89 0.3507 17.74 6.186 0.09453 2.221
## 4 31.80 0.3542 17.73 6.123 0.09405 2.184
## 5 31.75 0.3559 17.71 6.106 0.09361 2.193
## 6 31.79 0.3546 17.77 6.072 0.09289 2.189
## 7 31.83 0.3532 17.81 6.064 0.09265 2.185
## 8 31.85 0.3523 17.84 6.056 0.09170 2.188
## 9 31.87 0.3515 17.86 6.052 0.09156 2.165
## 10 31.89 0.3504 17.85 6.079 0.09120 2.176
## 11 31.92 0.3492 17.88 6.071 0.08974 2.170
## 12 31.94 0.3486 17.89 6.065 0.08938 2.172
## 13 31.94 0.3485 17.90 6.046 0.08964 2.162
## 14 31.97 0.3473 17.93 6.020 0.08946 2.158
## 15 31.98 0.3473 17.92 6.019 0.08943 2.174
## 16 31.99 0.3467 17.92 6.012 0.08870 2.166
## 17 31.99 0.3467 17.93 6.004 0.08872 2.167
## 18 31.99 0.3467 17.94 5.995 0.08765 2.149
## 19 31.99 0.3470 17.95 5.982 0.08706 2.149
## 20 31.97 0.3475 17.94 5.973 0.08675 2.137
## 21 31.96 0.3481 17.93 5.982 0.08715 2.145
## 22 31.96 0.3479 17.93 5.999 0.08669 2.156
## 23 31.95 0.3487 17.92 5.993 0.08689 2.156
## 24 31.93 0.3497 17.90 5.977 0.08736 2.144
## 25 31.89 0.3517 17.83 5.988 0.08762 2.101
## 26 31.80 0.3554 17.76 6.047 0.08843 2.105
## 27 31.74 0.3579 17.67 6.071 0.08797 2.129
## 28 31.72 0.3592 17.51 6.102 0.08803 2.130 *
## 29 31.75 0.3581 17.56 6.092 0.08764 2.121
##
## The top 5 variables (out of 28):
## diq010.2, indhhin2.5, dmdmartl.4, indhhin2.4, indhhin2.12
plot(lmProfile, type = c("g", "o"))
lmProfile$optVariables
## [1] "diq010.2" "indhhin2.5" "dmdmartl.4" "indhhin2.4" "indhhin2.12"
## [6] "indhhin2.13" "indhhin2.3" "ridreth1.4" "indhhin2.6" "indhhin2.2"
## [11] "dmdmartl.3" "indhhin2.8" "indhhin2.10" "indhhin2.14" "ridreth1.3"
## [16] "riagendr.2" "dmdmartl.5" "indhhin2.11" "ridreth1.2" "dmdeduc2.3"
## [21] "ridreth1.5" "dmdeduc2.2" "dmdmartl.2" "dmdmartl.6" "dmdeduc2.4"
## [26] "dmdeduc2.5" "bmxbmi" "ridageyr"
lmProfile$fit
##
## Call:
## lm(formula = y ~ ., data = tmp)
##
## Coefficients:
## (Intercept) diq010.2 indhhin2.5 dmdmartl.4 indhhin2.4 indhhin2.12
## 142.66467 -62.07477 13.57685 12.23171 11.54773 9.70199
## indhhin2.13 indhhin2.3 ridreth1.4 indhhin2.6 indhhin2.2 dmdmartl.3
## 7.51897 6.49780 -6.64457 5.70919 4.90936 5.17152
## indhhin2.8 indhhin2.10 indhhin2.14 ridreth1.3 riagendr.2 dmdmartl.5
## 4.32807 3.40585 3.53175 -3.76940 -3.82057 -3.07822
## indhhin2.11 ridreth1.2 dmdeduc2.3 ridreth1.5 dmdeduc2.2 dmdmartl.2
## 2.03963 2.08708 2.06865 -2.23775 -1.80767 0.97266
## dmdmartl.6 dmdeduc2.4 dmdeduc2.5 bmxbmi ridageyr
## -0.89894 -0.23991 -0.06898 0.53070 0.11944
y_hat <- predict(lmProfile$fit, TEST)
TEST.scored <- cbind(TEST,y_hat)
yardstick::rmse(TEST.scored, lbxglu, y_hat)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 37.9
\(~\)
\(~\)
\(~\)
library(tidyverse)
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
glimpse(diab_pop)
df <- diab_pop %>%
na.omit()
my_factor_vars <- df %>% select_if(is.factor) %>% colnames()
df_as_nums <- df %>%
mutate_at(vars(my_factor_vars), as.integer) %>%
mutate_at(vars(my_factor_vars), as.factor)
glimpse(df_as_nums)
library('caret')
dV.df <- dummyVars(lbxglu ~ . , data = df_as_nums, fullRank=TRUE)
df_dV <- as_tibble(predict(dV.df,df_as_nums))
df_dV$lbxglu <- df$lbxglu
library(rsample)
set.seed(8675309)
train_test <- initial_split(df_dV, prop = .6)
TRAIN <- training(train_test)
TEST <- testing(train_test)
X <- TRAIN %>% select(-lbxglu)
y <- TRAIN$lbxglu
ctrl <- rfeControl(functions = lmFuncs,
method = "repeatedcv",
number = 10,
repeats = 10,
verbose = FALSE)
lmProfile <- rfe(X, y,
sizes = c(1:ncol(X)),
rfeControl = ctrl)
lmProfile
plot(lmProfile, type = c("g", "o"))
lmProfile$optVariables
lmProfile$fit
y_hat <- predict(lmProfile$fit, TEST)
TEST.scored <- cbind(TEST,y_hat)
yardstick::rmse(TEST.scored, lbxglu, y_hat)