set.seed(1701)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
select(-seqn) %>%
mutate(diq010 = fct_relevel(diq010, c('No Diabetes','Diabetes')))
glimpse(diab_pop)
## Rows: 5,719
## Columns: 9
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,~
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8~
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N~
## $ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school ~
## $ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se~
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65~
## $ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2~
## $ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No~
## $ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3~
diq010
:df <- diab_pop %>%
na.omit()
my_factor_vars_1 <- df %>% select_if(is.factor) %>% colnames()
my_factor_vars <- setdiff(my_factor_vars_1, 'diq010')
df_as_nums <- df %>%
mutate_at(all_of(my_factor_vars), as.integer) %>%
mutate_at(all_of(my_factor_vars), as.factor)
glimpse(df_as_nums)
## Rows: 1,876
## Columns: 9
## $ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1~
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80, 7~
## $ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4, 2~
## $ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3, 4~
## $ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5, 1~
## $ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3, 4~
## $ bmxbmi <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0, 2~
## $ diq010 <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabetes,~
## $ lbxglu <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 94, ~
preProcess
pP <- preProcess(df_as_nums, c('center','scale'))
df_as_nums <- predict(pP,df_as_nums)
glimpse(df_as_nums)
## Rows: 1,876
## Columns: 9
## $ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1~
## $ ridageyr <dbl> 0.15617810, 1.59749118, 1.25157604, -0.30504208, 0.96331342, ~
## $ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4, 2~
## $ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3, 4~
## $ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5, 1~
## $ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3, 4~
## $ bmxbmi <dbl> 0.20545760, -0.08208648, -0.11084088, -0.75781505, 2.06011687~
## $ diq010 <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabetes,~
## $ lbxglu <dbl> -0.30006288, -0.70905532, -0.15571260, -0.70905532, 0.3976301~
dV.df <- dummyVars( ~ . ,
data = df_as_nums,
fullRank=TRUE)
df_dV <- as_tibble(predict(dV.df,df_as_nums)) %>%
mutate(diq010.Diabetes = as.factor(diq010.Diabetes))
target <- 'diq010.Diabetes'
features <- colnames(df_dV)[!colnames(df_dV) %in% c('seqn' , 'diq010.Diabetes')]
length(features)
## [1] 28
We have {r} length(features)
features.
sample_features <- sample(features, 4, replace = FALSE)
curent_formula <- paste0(target, ' ~ ', paste0(sample_features, collapse = " + "))
as.formula(curent_formula)
## diq010.Diabetes ~ dmdeduc2.3 + dmdmartl.6 + indhhin2.11 + dmdeduc2.2
make_model_order_num
This function will return a formula with a provided number of selected features selected at random
make_model_order_num <- function(num_features){
set.seed(NULL)
sample_features <- sample(features, num_features, replace = FALSE)
curent_formula <- paste0(target, ' ~ ', paste0(sample_features, collapse = " + "))
return(as.formula(curent_formula))
}
make_model_order_num(3)
## diq010.Diabetes ~ dmdeduc2.4 + indhhin2.6 + indhhin2.13
## <environment: 0x000000002db60e68>
make_model_order_num(3)
## diq010.Diabetes ~ ridreth1.4 + dmdeduc2.2 + indhhin2.8
## <environment: 0x000000002dcac938>
make_model_order_num(6)
## diq010.Diabetes ~ ridreth1.5 + dmdeduc2.3 + indhhin2.11 + dmdeduc2.5 +
## indhhin2.6 + dmdmartl.2
## <environment: 0x000000002e054508>
df_model
This is a dataframe to hold the model options
df_model <- tribble(
~model_name, ~model_id,
# "zero", make_model(0)
"Fold1", 1,
"Fold2", 2,
"Fold3", 3,
"Fold4", 4,
"Fold5", 5,
"Fold6", 6,
"Fold7", 7,
"Fold8", 8,
)
df_model
## # A tibble: 8 x 2
## model_name model_id
## <chr> <dbl>
## 1 Fold1 1
## 2 Fold2 2
## 3 Fold3 3
## 4 Fold4 4
## 5 Fold5 5
## 6 Fold6 6
## 7 Fold7 7
## 8 Fold8 8
map(df_model,4)$model_creator
## NULL
\(~\)
\(~\)
library(rsample)
train_test <- initial_split(df_dV, prop = .6)
TRAIN <- training(train_test)
TEST <- testing(train_test)
TRAIN.v_fold <- vfold_cv(TRAIN, v = 8,
repeats = 321)
glimpse(TRAIN.v_fold)
## Rows: 2,568
## Columns: 3
## $ splits <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x 14~
## $ id <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat001"~
## $ id2 <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7", ~
TRAIN.v_fold %>% select(id2) %>% distinct()
## # A tibble: 8 x 1
## id2
## <chr>
## 1 Fold1
## 2 Fold2
## 3 Fold3
## 4 Fold4
## 5 Fold5
## 6 Fold6
## 7 Fold7
## 8 Fold8
TRAIN.v_fold %>%
filter(id2=='Fold6') %>%
glimpse()
## Rows: 321
## Columns: 3
## $ splits <list> [<vfold_split[985 x 140 x 1125 x 29]>], [<vfold_split[985 x 14~
## $ id <chr> "Repeat001", "Repeat002", "Repeat003", "Repeat004", "Repeat005"~
## $ id2 <chr> "Fold6", "Fold6", "Fold6", "Fold6", "Fold6", "Fold6", "Fold6", ~
TRAIN.56.6 <- (TRAIN.v_fold %>%
filter(id2=='Fold6'))$splits[[56]] %>%
analysis()
TRAIN.56.6
## # A tibble: 985 x 29
## riagendr.2 ridageyr ridreth1.2 ridreth1.3 ridreth1.4 ridreth1.5 dmdeduc2.2
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1.02 1 0 0 0 0
## 2 1 -0.997 0 1 0 0 0
## 3 0 1.14 0 1 0 0 0
## 4 0 0.214 0 0 1 0 0
## 5 0 -0.247 0 0 0 0 1
## 6 1 1.42 0 1 0 0 0
## 7 0 -0.132 0 0 0 1 0
## 8 0 -0.593 0 0 1 0 0
## 9 1 -1.63 1 0 0 0 0
## 10 0 0.560 0 0 0 0 0
## # ... with 975 more rows, and 22 more variables: dmdeduc2.3 <dbl>,
## # dmdeduc2.4 <dbl>, dmdeduc2.5 <dbl>, dmdmartl.2 <dbl>, dmdmartl.3 <dbl>,
## # dmdmartl.4 <dbl>, dmdmartl.5 <dbl>, dmdmartl.6 <dbl>, indhhin2.2 <dbl>,
## # indhhin2.3 <dbl>, indhhin2.4 <dbl>, indhhin2.5 <dbl>, indhhin2.6 <dbl>,
## # indhhin2.8 <dbl>, indhhin2.10 <dbl>, indhhin2.11 <dbl>, indhhin2.12 <dbl>,
## # indhhin2.13 <dbl>, indhhin2.14 <dbl>, bmxbmi <dbl>, diq010.Diabetes <fct>,
## # lbxglu <dbl>
TEST.56.6 <- (TRAIN.v_fold %>%
filter(id2=='Fold6'))$splits[[56]] %>%
assessment()
TEST.56.6
## # A tibble: 140 x 29
## riagendr.2 ridageyr ridreth1.2 ridreth1.3 ridreth1.4 ridreth1.5 dmdeduc2.2
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.156 0 0 1 0 0
## 2 0 -0.478 0 0 0 0 0
## 3 1 0.156 0 1 0 0 0
## 4 0 -0.0168 0 0 0 0 0
## 5 1 -0.766 0 1 0 0 0
## 6 1 -1.52 0 0 1 0 1
## 7 0 0.675 0 0 1 0 0
## 8 0 -0.882 0 0 0 0 1
## 9 0 -1.11 0 1 0 0 0
## 10 0 1.71 0 1 0 0 0
## # ... with 130 more rows, and 22 more variables: dmdeduc2.3 <dbl>,
## # dmdeduc2.4 <dbl>, dmdeduc2.5 <dbl>, dmdmartl.2 <dbl>, dmdmartl.3 <dbl>,
## # dmdmartl.4 <dbl>, dmdmartl.5 <dbl>, dmdmartl.6 <dbl>, indhhin2.2 <dbl>,
## # indhhin2.3 <dbl>, indhhin2.4 <dbl>, indhhin2.5 <dbl>, indhhin2.6 <dbl>,
## # indhhin2.8 <dbl>, indhhin2.10 <dbl>, indhhin2.11 <dbl>, indhhin2.12 <dbl>,
## # indhhin2.13 <dbl>, indhhin2.14 <dbl>, bmxbmi <dbl>, diq010.Diabetes <fct>,
## # lbxglu <dbl>
\(~\)
\(~\)
df_model
to foldsWe’re joining the model tuning grid to the folds
glimpse(TRAIN.v_fold)
## Rows: 2,568
## Columns: 3
## $ splits <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x 14~
## $ id <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat001"~
## $ id2 <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7", ~
glimpse(df_model)
## Rows: 8
## Columns: 2
## $ model_name <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold~
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8
df_model <- TRAIN.v_fold %>%
left_join(df_model, by = c('id2'="model_name"))
glimpse(df_model)
## Rows: 2,568
## Columns: 4
## $ splits <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x ~
## $ id <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat00~
## $ id2 <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7"~
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5~
\(~\)
\(~\)
The lm_model
function
below will take in:
rsample::analysis
data set from the sample...
:From those inputs it is instructed to only return Adjusted R2 which is a numerical value.
lm_model <- function(splits, ...){
LM <- glm(... , analysis(splits), family = 'binomial')
holdout <- assessment(splits)
holdout$estimate <- predict(LM , holdout)
yardstick::rsq(holdout,
truth=as.numeric(diq010.Diabetes), estimate)$.estimate
}
glimpse(df_model)
## Rows: 2,568
## Columns: 4
## $ splits <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x ~
## $ id <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat00~
## $ id2 <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7"~
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5~
The purrr
library in R
is mysterious and powerful; here, map2_dbl
is going to look to return a numeric value, a double
from the computation.
If you run ?map2_dbl
it will display:
map2_dbl(.x, .y, .f, ...)
In the context of our current status:
df_model$spilts
gives us a list of data - .x
map()
is a function who returns a list , .y
map
is mapping the values of df_model$model_id
into the function make_model_order_num
lm_model
is a function .f
data
and ...
Adj_R2
from lm_model
We will now run all the models and store the results in Adj_R2
:
toc <- Sys.time()
df_model$Adj_R2 <- map2_dbl(
df_model$splits,
map(df_model$model_id, make_model_order_num),
lm_model
)
## Warning: A correlation computation is required, but `estimate` is constant
## and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be
## returned.
tic <- Sys.time()
print(paste0("Adj R2 estimates in ", round(tic - toc , 4 ) , " seconds " ))
## [1] "Adj R2 estimates in 32.8164 seconds "
We just ran a bunch of models and computed R2 for each of them!:
glimpse(df_model)
## Rows: 2,568
## Columns: 5
## $ splits <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x ~
## $ id <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat00~
## $ id2 <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7"~
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5~
## $ Adj_R2 <dbl> 4.095442e-03, 2.586026e-03, 3.911438e-02, 3.933703e-04, 5.628~
df_model %>%
ggplot(aes(x=id,
y=Adj_R2,
fill = Adj_R2)) +
geom_bar(stat = 'identity') +
scale_fill_gradient(low = "yellow", high = "red", na.value = NA) +
coord_flip() +
facet_wrap( ~ model_id)
## Warning: Removed 1 rows containing missing values (position_stack).
\(~\)
\(~\)
To compute RMSE we need to know how the model performs it’s holdout set:
holdout_results <- function(splits, ...) {
mod <- glm(..., data = analysis(splits), family ='binomial')
holdout <- assessment(splits)
holdout$estimate <- predict(mod,holdout)
yardstick::roc_auc(holdout,
truth=diq010.Diabetes, estimate)$.estimate
}
toc <- Sys.time()
df_model$roc_auc <- map2_dbl(
df_model$splits,
map(df_model$model_id, make_model_order_num),
holdout_results
)
tic <- Sys.time()
print(paste0("roc_auc estimates in ", round(tic - toc , 4 ) , " seconds " ))
## [1] "roc_auc estimates in 43.3728 seconds "
\(~\)
\(~\)
glimpse(df_model)
## Rows: 2,568
## Columns: 6
## $ splits <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x ~
## $ id <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat00~
## $ id2 <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7"~
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5~
## $ Adj_R2 <dbl> 4.095442e-03, 2.586026e-03, 3.911438e-02, 3.933703e-04, 5.628~
## $ roc_auc <dbl> 0.4668803, 0.4564280, 0.4915876, 0.4476190, 0.3150794, 0.4306~
While we see that models with higher number of features have larger Adjusted R2 There appears to be little correlation between Adjusted R2 and model performance:
df_model %>%
ggplot(aes(x=Adj_R2,
y=roc_auc,
color=id)) +
geom_point() +
facet_wrap(~model_id) +
theme(legend.position = "none")
## Warning: Removed 1 rows containing missing values (geom_point).
# Normalize RMSE and R2 , remove outliers
COR <-df_model %>%
mutate_at(vars(Adj_R2,roc_auc),scale) %>%
filter(abs(Adj_R2) <2 ) %>%
filter(abs(roc_auc) <2 )
COR %>%
ggplot(aes(x= roc_auc,
y= Adj_R2 ,
color=id)) +
geom_point() +
facet_wrap(~model_id) +
theme(legend.position = "none")
cor.test(COR$Adj_R2, COR$roc_auc, method=c("pearson"))
##
## Pearson's product-moment correlation
##
## data: COR$Adj_R2 and COR$roc_auc
## t = -2.0378, df = 2255, p-value = 0.04168
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08398542 -0.00161775
## sample estimates:
## cor
## -0.04287444
t.test(COR$Adj_R2, COR$roc_auc,paired=TRUE)
##
## Paired t-test
##
## data: COR$Adj_R2 and COR$roc_auc
## t = -15.996, df = 2256, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3901790 -0.3049597
## sample estimates:
## mean of the differences
## -0.3475693
(df_model %>%
arrange(-roc_auc) %>%
filter(row_number() < 5))$roc_auc
## [1] 0.7299006 0.7141933 0.6740696 0.6587931
# Top_5_RMSE_formulas <- (df_model %>%
# arrange(RMSE) %>%
# filter(row_number() < 5))$model_creator
#
# Top_5_RMSE_formulas
(df_model %>%
arrange(-Adj_R2) %>%
filter(row_number() < 5))$Adj_R2 %>%
signif(4)
## [1] 0.5591 0.5532 0.5397 0.5357
# Top_5_AdjR2_formulas <- (df_model %>%
# arrange(Adj_R2) %>%
# filter(row_number() < 5))$model_creator
#
# Top_5_AdjR2_formulas
\(~\)
\(~\)
\(~\)
set.seed(1701)
library(tidyverse)
library(caret)
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
select(-seqn) %>%
mutate(diq010 = fct_relevel(diq010, c('No Diabetes','Diabetes')))
glimpse(diab_pop)
df <- diab_pop %>%
na.omit()
my_factor_vars_1 <- df %>% select_if(is.factor) %>% colnames()
my_factor_vars <- setdiff(my_factor_vars_1, 'diq010')
df_as_nums <- df %>%
mutate_at(all_of(my_factor_vars), as.integer) %>%
mutate_at(all_of(my_factor_vars), as.factor)
glimpse(df_as_nums)
pP <- preProcess(df_as_nums, c('center','scale'))
df_as_nums <- predict(pP,df_as_nums)
glimpse(df_as_nums)
dV.df <- dummyVars( ~ . ,
data = df_as_nums,
fullRank=TRUE)
df_dV <- as_tibble(predict(dV.df,df_as_nums)) %>%
mutate(diq010.Diabetes = as.factor(diq010.Diabetes))
target <- 'diq010.Diabetes'
features <- colnames(df_dV)[!colnames(df_dV) %in% c('seqn' , 'diq010.Diabetes')]
length(features)
sample_features <- sample(features, 4, replace = FALSE)
curent_formula <- paste0(target, ' ~ ', paste0(sample_features, collapse = " + "))
as.formula(curent_formula)
make_model_order_num <- function(num_features){
set.seed(NULL)
sample_features <- sample(features, num_features, replace = FALSE)
curent_formula <- paste0(target, ' ~ ', paste0(sample_features, collapse = " + "))
return(as.formula(curent_formula))
}
make_model_order_num(3)
make_model_order_num(3)
make_model_order_num(6)
df_model <- tribble(
~model_name, ~model_id,
# "zero", make_model(0)
"Fold1", 1,
"Fold2", 2,
"Fold3", 3,
"Fold4", 4,
"Fold5", 5,
"Fold6", 6,
"Fold7", 7,
"Fold8", 8,
)
df_model
map(df_model,4)$model_creator
library(rsample)
train_test <- initial_split(df_dV, prop = .6)
TRAIN <- training(train_test)
TEST <- testing(train_test)
TRAIN.v_fold <- vfold_cv(TRAIN, v = 8,
repeats = 321)
glimpse(TRAIN.v_fold)
TRAIN.v_fold %>% select(id2) %>% distinct()
TRAIN.v_fold %>%
filter(id2=='Fold6') %>%
glimpse()
TRAIN.56.6 <- (TRAIN.v_fold %>%
filter(id2=='Fold6'))$splits[[56]] %>%
analysis()
TRAIN.56.6
TEST.56.6 <- (TRAIN.v_fold %>%
filter(id2=='Fold6'))$splits[[56]] %>%
assessment()
TEST.56.6
glimpse(TRAIN.v_fold)
glimpse(df_model)
df_model <- TRAIN.v_fold %>%
left_join(df_model, by = c('id2'="model_name"))
glimpse(df_model)
lm_model <- function(splits, ...){
LM <- glm(... , analysis(splits), family = 'binomial')
holdout <- assessment(splits)
holdout$estimate <- predict(LM , holdout)
yardstick::rsq(holdout,
truth=as.numeric(diq010.Diabetes), estimate)$.estimate
}
glimpse(df_model)
toc <- Sys.time()
df_model$Adj_R2 <- map2_dbl(
df_model$splits,
map(df_model$model_id, make_model_order_num),
lm_model
)
tic <- Sys.time()
print(paste0("Adj R2 estimates in ", round(tic - toc , 4 ) , " seconds " ))
glimpse(df_model)
df_model %>%
ggplot(aes(x=id,
y=Adj_R2,
fill = Adj_R2)) +
geom_bar(stat = 'identity') +
scale_fill_gradient(low = "yellow", high = "red", na.value = NA) +
coord_flip() +
facet_wrap( ~ model_id)
holdout_results <- function(splits, ...) {
mod <- glm(..., data = analysis(splits), family ='binomial')
holdout <- assessment(splits)
holdout$estimate <- predict(mod,holdout)
yardstick::roc_auc(holdout,
truth=diq010.Diabetes, estimate)$.estimate
}
toc <- Sys.time()
df_model$roc_auc <- map2_dbl(
df_model$splits,
map(df_model$model_id, make_model_order_num),
holdout_results
)
tic <- Sys.time()
print(paste0("roc_auc estimates in ", round(tic - toc , 4 ) , " seconds " ))
glimpse(df_model)
df_model %>%
ggplot(aes(x=Adj_R2,
y=roc_auc,
color=id)) +
geom_point() +
facet_wrap(~model_id) +
theme(legend.position = "none")
# Normalize RMSE and R2 , remove outliers
COR <-df_model %>%
mutate_at(vars(Adj_R2,roc_auc),scale) %>%
filter(abs(Adj_R2) <2 ) %>%
filter(abs(roc_auc) <2 )
COR %>%
ggplot(aes(x= roc_auc,
y= Adj_R2 ,
color=id)) +
geom_point() +
facet_wrap(~model_id) +
theme(legend.position = "none")
cor.test(COR$Adj_R2, COR$roc_auc, method=c("pearson"))
t.test(COR$Adj_R2, COR$roc_auc,paired=TRUE)
(df_model %>%
arrange(-roc_auc) %>%
filter(row_number() < 5))$roc_auc
# Top_5_RMSE_formulas <- (df_model %>%
# arrange(RMSE) %>%
# filter(row_number() < 5))$model_creator
#
# Top_5_RMSE_formulas
(df_model %>%
arrange(-Adj_R2) %>%
filter(row_number() < 5))$Adj_R2 %>%
signif(4)
# Top_5_AdjR2_formulas <- (df_model %>%
# arrange(Adj_R2) %>%
# filter(row_number() < 5))$model_creator
#
# Top_5_AdjR2_formulas