1 Setup

set.seed(1701)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
  select(-seqn) %>%
  mutate(diq010 = fct_relevel(diq010, c('No Diabetes','Diabetes')))


glimpse(diab_pop)

## Rows: 5,719
## Columns: 9
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,~
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8~
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N~
## $ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school ~
## $ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se~
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65~
## $ bmxbmi   <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2~
## $ diq010   <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No~
## $ lbxglu   <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3~

1.0.1 Let’s try to predict `diq010`:

df <- diab_pop %>% 
  na.omit()

my_factor_vars_1 <- df %>% select_if(is.factor) %>% colnames()

my_factor_vars <- setdiff(my_factor_vars_1, 'diq010')

df_as_nums <- df %>%
  mutate_at(all_of(my_factor_vars), as.integer) %>%
  mutate_at(all_of(my_factor_vars), as.factor)

glimpse(df_as_nums)

## Rows: 1,876
## Columns: 9
## $ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1~
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80, 7~
## $ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4, 2~
## $ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3, 4~
## $ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5, 1~
## $ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3, 4~
## $ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0, 2~
## $ diq010   <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabetes,~
## $ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 94, ~

1.0.2 `preProcess`

pP <- preProcess(df_as_nums, c('center','scale')) 

df_as_nums <- predict(pP,df_as_nums) 

glimpse(df_as_nums)

## Rows: 1,876
## Columns: 9
## $ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1~
## $ ridageyr <dbl> 0.15617810, 1.59749118, 1.25157604, -0.30504208, 0.96331342, ~
## $ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4, 2~
## $ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3, 4~
## $ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5, 1~
## $ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3, 4~
## $ bmxbmi   <dbl> 0.20545760, -0.08208648, -0.11084088, -0.75781505, 2.06011687~
## $ diq010   <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabetes,~
## $ lbxglu   <dbl> -0.30006288, -0.70905532, -0.15571260, -0.70905532, 0.3976301~

dV.df <- dummyVars( ~ . , 
                   data = df_as_nums, 
                   fullRank=TRUE)

df_dV <- as_tibble(predict(dV.df,df_as_nums)) %>%
  mutate(diq010.Diabetes = as.factor(diq010.Diabetes))

target <- 'diq010.Diabetes'

features <- colnames(df_dV)[!colnames(df_dV) %in% c('seqn' , 'diq010.Diabetes')]

length(features)

## [1] 28

We have {r} length(features) features.

1.0.3 test sampling function

sample_features <- sample(features, 4, replace = FALSE)

curent_formula <- paste0(target, ' ~ ', paste0(sample_features, collapse = " + "))

as.formula(curent_formula)

## diq010.Diabetes ~ dmdeduc2.3 + dmdmartl.6 + indhhin2.11 + dmdeduc2.2

2 `make_model_order_num`

This function will return a formula with a provided number of selected features selected at random

make_model_order_num <- function(num_features){
  
  set.seed(NULL)
  
  sample_features <- sample(features, num_features, replace = FALSE)

  curent_formula <- paste0(target,  ' ~ ', paste0(sample_features, collapse = " + "))

return(as.formula(curent_formula))
}

2.1 test

make_model_order_num(3)

## diq010.Diabetes ~ dmdeduc2.4 + indhhin2.6 + indhhin2.13
## <environment: 0x000000002db60e68>

make_model_order_num(3)

## diq010.Diabetes ~ ridreth1.4 + dmdeduc2.2 + indhhin2.8
## <environment: 0x000000002dcac938>

make_model_order_num(6)

## diq010.Diabetes ~ ridreth1.5 + dmdeduc2.3 + indhhin2.11 + dmdeduc2.5 + 
##     indhhin2.6 + dmdmartl.2
## <environment: 0x000000002e054508>

3 `df_model`

This is a dataframe to hold the model options

df_model <- tribble(
    ~model_name, ~model_id,
#    "zero", make_model(0)
    "Fold1", 1,
    "Fold2", 2,
    "Fold3", 3,
    "Fold4", 4,
    "Fold5", 5,
    "Fold6", 6,
    "Fold7", 7,
    "Fold8", 8,
  )

df_model

## # A tibble: 8 x 2
##   model_name model_id
##   <chr>         <dbl>
## 1 Fold1             1
## 2 Fold2             2
## 3 Fold3             3
## 4 Fold4             4
## 5 Fold5             5
## 6 Fold6             6
## 7 Fold7             7
## 8 Fold8             8

map(df_model,4)$model_creator

## NULL

$~$

4 Split data

library(rsample)

train_test <- initial_split(df_dV, prop = .6)
TRAIN <- training(train_test)
TEST <- testing(train_test)

TRAIN.v_fold <- vfold_cv(TRAIN, v = 8, 
                         repeats = 321)

glimpse(TRAIN.v_fold)

## Rows: 2,568
## Columns: 3
## $ splits <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x 14~
## $ id     <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat001"~
## $ id2    <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7", ~

TRAIN.v_fold %>% select(id2) %>% distinct()

## # A tibble: 8 x 1
##   id2  
##   <chr>
## 1 Fold1
## 2 Fold2
## 3 Fold3
## 4 Fold4
## 5 Fold5
## 6 Fold6
## 7 Fold7
## 8 Fold8

4.1 Let’s Take a look at the 6th fold

TRAIN.v_fold %>%
  filter(id2=='Fold6') %>%
  glimpse()

## Rows: 321
## Columns: 3
## $ splits <list> [<vfold_split[985 x 140 x 1125 x 29]>], [<vfold_split[985 x 14~
## $ id     <chr> "Repeat001", "Repeat002", "Repeat003", "Repeat004", "Repeat005"~
## $ id2    <chr> "Fold6", "Fold6", "Fold6", "Fold6", "Fold6", "Fold6", "Fold6", ~

4.1.1 This is the TRAINing data from the 56th sample of the 6th Fold

TRAIN.56.6 <- (TRAIN.v_fold %>%
  filter(id2=='Fold6'))$splits[[56]] %>% 
  analysis()

TRAIN.56.6

## # A tibble: 985 x 29
##    riagendr.2 ridageyr ridreth1.2 ridreth1.3 ridreth1.4 ridreth1.5 dmdeduc2.2
##         <dbl>    <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
##  1          1    1.02           1          0          0          0          0
##  2          1   -0.997          0          1          0          0          0
##  3          0    1.14           0          1          0          0          0
##  4          0    0.214          0          0          1          0          0
##  5          0   -0.247          0          0          0          0          1
##  6          1    1.42           0          1          0          0          0
##  7          0   -0.132          0          0          0          1          0
##  8          0   -0.593          0          0          1          0          0
##  9          1   -1.63           1          0          0          0          0
## 10          0    0.560          0          0          0          0          0
## # ... with 975 more rows, and 22 more variables: dmdeduc2.3 <dbl>,
## #   dmdeduc2.4 <dbl>, dmdeduc2.5 <dbl>, dmdmartl.2 <dbl>, dmdmartl.3 <dbl>,
## #   dmdmartl.4 <dbl>, dmdmartl.5 <dbl>, dmdmartl.6 <dbl>, indhhin2.2 <dbl>,
## #   indhhin2.3 <dbl>, indhhin2.4 <dbl>, indhhin2.5 <dbl>, indhhin2.6 <dbl>,
## #   indhhin2.8 <dbl>, indhhin2.10 <dbl>, indhhin2.11 <dbl>, indhhin2.12 <dbl>,
## #   indhhin2.13 <dbl>, indhhin2.14 <dbl>, bmxbmi <dbl>, diq010.Diabetes <fct>,
## #   lbxglu <dbl>

4.1.2 This is the TESTing data from the 56th sample of the 6th Fold

TEST.56.6 <- (TRAIN.v_fold %>%
  filter(id2=='Fold6'))$splits[[56]] %>% 
  assessment()

TEST.56.6

## # A tibble: 140 x 29
##    riagendr.2 ridageyr ridreth1.2 ridreth1.3 ridreth1.4 ridreth1.5 dmdeduc2.2
##         <dbl>    <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
##  1          1   0.156           0          0          1          0          0
##  2          0  -0.478           0          0          0          0          0
##  3          1   0.156           0          1          0          0          0
##  4          0  -0.0168          0          0          0          0          0
##  5          1  -0.766           0          1          0          0          0
##  6          1  -1.52            0          0          1          0          1
##  7          0   0.675           0          0          1          0          0
##  8          0  -0.882           0          0          0          0          1
##  9          0  -1.11            0          1          0          0          0
## 10          0   1.71            0          1          0          0          0
## # ... with 130 more rows, and 22 more variables: dmdeduc2.3 <dbl>,
## #   dmdeduc2.4 <dbl>, dmdeduc2.5 <dbl>, dmdmartl.2 <dbl>, dmdmartl.3 <dbl>,
## #   dmdmartl.4 <dbl>, dmdmartl.5 <dbl>, dmdmartl.6 <dbl>, indhhin2.2 <dbl>,
## #   indhhin2.3 <dbl>, indhhin2.4 <dbl>, indhhin2.5 <dbl>, indhhin2.6 <dbl>,
## #   indhhin2.8 <dbl>, indhhin2.10 <dbl>, indhhin2.11 <dbl>, indhhin2.12 <dbl>,
## #   indhhin2.13 <dbl>, indhhin2.14 <dbl>, bmxbmi <dbl>, diq010.Diabetes <fct>,
## #   lbxglu <dbl>

$~$

5 Join `df_model` to folds

We’re joining the model tuning grid to the folds

glimpse(TRAIN.v_fold)

## Rows: 2,568
## Columns: 3
## $ splits <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x 14~
## $ id     <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat001"~
## $ id2    <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7", ~

glimpse(df_model)

## Rows: 8
## Columns: 2
## $ model_name <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold~
## $ model_id   <dbl> 1, 2, 3, 4, 5, 6, 7, 8

df_model <- TRAIN.v_fold %>% 
  left_join(df_model, by = c('id2'="model_name")) 

glimpse(df_model)

## Rows: 2,568
## Columns: 4
## $ splits   <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x ~
## $ id       <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat00~
## $ id2      <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7"~
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5~

$~$

6 Adjusted R²

The lm_model function below will take in:

an rsample::analysis data set from the sample
a formula that we will pass in later with ...:

From those inputs it is instructed to only return Adjusted R² which is a numerical value.

lm_model <- function(splits, ...){
  
  LM <- glm(... , analysis(splits), family = 'binomial')
  
  holdout <- assessment(splits)
  
  holdout$estimate <- predict(LM , holdout)
  
  yardstick::rsq(holdout,
                     truth=as.numeric(diq010.Diabetes), estimate)$.estimate
}

glimpse(df_model)

## Rows: 2,568
## Columns: 4
## $ splits   <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x ~
## $ id       <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat00~
## $ id2      <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7"~
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5~

6.1 Get Estimates

The purrr library in R is mysterious and powerful; here, map2_dbl is going to look to return a numeric value, a double from the computation.

If you run ?map2_dbl it will display:

map2_dbl(.x, .y, .f, ...)

In the context of our current status:

df_model$spilts gives us a list of data - .x
map() is a function who returns a list , .y
- map is mapping the values of df_model$model_id into the function make_model_order_num
lm_model is a function .f
- this function takes in two values data and ...
The value it will return from us will be the Adj_R2 from lm_model

We will now run all the models and store the results in Adj_R2:

toc <- Sys.time()

df_model$Adj_R2 <- map2_dbl(
  df_model$splits,
  map(df_model$model_id, make_model_order_num),
  lm_model
)

## Warning: A correlation computation is required, but `estimate` is constant
## and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be
## returned.

tic <- Sys.time()

print(paste0("Adj R2 estimates in ", round(tic - toc , 4 ) , " seconds " ))

## [1] "Adj R2 estimates in 32.8164 seconds "

We just ran a bunch of models and computed R² for each of them!:

glimpse(df_model)

## Rows: 2,568
## Columns: 5
## $ splits   <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x ~
## $ id       <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat00~
## $ id2      <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7"~
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5~
## $ Adj_R2   <dbl> 4.095442e-03, 2.586026e-03, 3.911438e-02, 3.933703e-04, 5.628~

6.1.1 Display Results

df_model %>%
  ggplot(aes(x=id, 
         y=Adj_R2,
         fill = Adj_R2)) +
  geom_bar(stat = 'identity') +
  scale_fill_gradient(low = "yellow", high = "red", na.value = NA) +
  coord_flip() +
  facet_wrap( ~ model_id)

## Warning: Removed 1 rows containing missing values (position_stack).

$~$

7 ROC_AUC

To compute RMSE we need to know how the model performs it’s holdout set:

holdout_results <- function(splits, ...) {
  
  mod <- glm(..., data = analysis(splits), family ='binomial')
  
  holdout <- assessment(splits)
  
  holdout$estimate <- predict(mod,holdout)
  
  yardstick::roc_auc(holdout,
                     truth=diq010.Diabetes, estimate)$.estimate
}

7.1 Compute Errors

toc <- Sys.time()

df_model$roc_auc <- map2_dbl(
  df_model$splits,
  map(df_model$model_id, make_model_order_num),
  holdout_results
)

tic <- Sys.time()

print(paste0("roc_auc estimates in ", round(tic - toc , 4 ) , " seconds " ))

## [1] "roc_auc estimates in 43.3728 seconds "

$~$

8 Compare Results

glimpse(df_model)

## Rows: 2,568
## Columns: 6
## $ splits   <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x ~
## $ id       <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat00~
## $ id2      <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7"~
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5~
## $ Adj_R2   <dbl> 4.095442e-03, 2.586026e-03, 3.911438e-02, 3.933703e-04, 5.628~
## $ roc_auc  <dbl> 0.4668803, 0.4564280, 0.4915876, 0.4476190, 0.3150794, 0.4306~

8.1 Graphs

While we see that models with higher number of features have larger Adjusted R² There appears to be little correlation between Adjusted R² and model performance:

df_model %>%
  ggplot(aes(x=Adj_R2,
             y=roc_auc,
             color=id)) +
  geom_point() +
  facet_wrap(~model_id) + 
  theme(legend.position = "none")

## Warning: Removed 1 rows containing missing values (geom_point).

# Normalize RMSE and R2 , remove outliers 
COR <-df_model %>%
  mutate_at(vars(Adj_R2,roc_auc),scale) %>%
  filter(abs(Adj_R2) <2 ) %>%
  filter(abs(roc_auc) <2 )


COR %>%
  ggplot(aes(x= roc_auc,
             y= Adj_R2 ,
             color=id)) +
  geom_point() +
  facet_wrap(~model_id) + 
  theme(legend.position = "none")

cor.test(COR$Adj_R2, COR$roc_auc, method=c("pearson"))

## 
##  Pearson's product-moment correlation
## 
## data:  COR$Adj_R2 and COR$roc_auc
## t = -2.0378, df = 2255, p-value = 0.04168
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08398542 -0.00161775
## sample estimates:
##         cor 
## -0.04287444

t.test(COR$Adj_R2, COR$roc_auc,paired=TRUE)

## 
##  Paired t-test
## 
## data:  COR$Adj_R2 and COR$roc_auc
## t = -15.996, df = 2256, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3901790 -0.3049597
## sample estimates:
## mean of the differences 
##              -0.3475693

9 Top 5 Formulas

9.1 RMSE

(df_model %>%
  arrange(-roc_auc) %>%
  filter(row_number() < 5))$roc_auc

## [1] 0.7299006 0.7141933 0.6740696 0.6587931

# Top_5_RMSE_formulas <- (df_model %>%
#   arrange(RMSE) %>%
#   filter(row_number() < 5))$model_creator
# 
# Top_5_RMSE_formulas

9.2 Adj_R2

(df_model %>%
  arrange(-Adj_R2) %>%
  filter(row_number() < 5))$Adj_R2 %>%
  signif(4)

## [1] 0.5591 0.5532 0.5397 0.5357

# Top_5_AdjR2_formulas <- (df_model %>%
#   arrange(Adj_R2) %>%
#   filter(row_number() < 5))$model_creator
# 
# Top_5_AdjR2_formulas

$~$

10 Code Appendix

$~$

set.seed(1701)
library(tidyverse)
library(caret)

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
  select(-seqn) %>%
  mutate(diq010 = fct_relevel(diq010, c('No Diabetes','Diabetes')))


glimpse(diab_pop)
df <- diab_pop %>% 
  na.omit()

my_factor_vars_1 <- df %>% select_if(is.factor) %>% colnames()

my_factor_vars <- setdiff(my_factor_vars_1, 'diq010')

df_as_nums <- df %>%
  mutate_at(all_of(my_factor_vars), as.integer) %>%
  mutate_at(all_of(my_factor_vars), as.factor)

glimpse(df_as_nums)
pP <- preProcess(df_as_nums, c('center','scale')) 

df_as_nums <- predict(pP,df_as_nums) 

glimpse(df_as_nums)


dV.df <- dummyVars( ~ . , 
                   data = df_as_nums, 
                   fullRank=TRUE)

df_dV <- as_tibble(predict(dV.df,df_as_nums)) %>%
  mutate(diq010.Diabetes = as.factor(diq010.Diabetes))

target <- 'diq010.Diabetes'

features <- colnames(df_dV)[!colnames(df_dV) %in% c('seqn' , 'diq010.Diabetes')]

length(features)

sample_features <- sample(features, 4, replace = FALSE)

curent_formula <- paste0(target, ' ~ ', paste0(sample_features, collapse = " + "))

as.formula(curent_formula)


make_model_order_num <- function(num_features){
  
  set.seed(NULL)
  
  sample_features <- sample(features, num_features, replace = FALSE)

  curent_formula <- paste0(target,  ' ~ ', paste0(sample_features, collapse = " + "))

return(as.formula(curent_formula))
}
make_model_order_num(3)
make_model_order_num(3)
make_model_order_num(6)

df_model <- tribble(
    ~model_name, ~model_id,
#    "zero", make_model(0)
    "Fold1", 1,
    "Fold2", 2,
    "Fold3", 3,
    "Fold4", 4,
    "Fold5", 5,
    "Fold6", 6,
    "Fold7", 7,
    "Fold8", 8,
  )

df_model
map(df_model,4)$model_creator


library(rsample)

train_test <- initial_split(df_dV, prop = .6)
TRAIN <- training(train_test)
TEST <- testing(train_test)

TRAIN.v_fold <- vfold_cv(TRAIN, v = 8, 
                         repeats = 321)

glimpse(TRAIN.v_fold)

TRAIN.v_fold %>% select(id2) %>% distinct()
TRAIN.v_fold %>%
  filter(id2=='Fold6') %>%
  glimpse()

TRAIN.56.6 <- (TRAIN.v_fold %>%
  filter(id2=='Fold6'))$splits[[56]] %>% 
  analysis()

TRAIN.56.6

TEST.56.6 <- (TRAIN.v_fold %>%
  filter(id2=='Fold6'))$splits[[56]] %>% 
  assessment()

TEST.56.6
glimpse(TRAIN.v_fold)
glimpse(df_model)
df_model <- TRAIN.v_fold %>% 
  left_join(df_model, by = c('id2'="model_name")) 

glimpse(df_model)
lm_model <- function(splits, ...){
  
  LM <- glm(... , analysis(splits), family = 'binomial')
  
  holdout <- assessment(splits)
  
  holdout$estimate <- predict(LM , holdout)
  
  yardstick::rsq(holdout,
                     truth=as.numeric(diq010.Diabetes), estimate)$.estimate
}
glimpse(df_model)
toc <- Sys.time()

df_model$Adj_R2 <- map2_dbl(
  df_model$splits,
  map(df_model$model_id, make_model_order_num),
  lm_model
)

tic <- Sys.time()

print(paste0("Adj R2 estimates in ", round(tic - toc , 4 ) , " seconds " ))

glimpse(df_model)
df_model %>%
  ggplot(aes(x=id, 
         y=Adj_R2,
         fill = Adj_R2)) +
  geom_bar(stat = 'identity') +
  scale_fill_gradient(low = "yellow", high = "red", na.value = NA) +
  coord_flip() +
  facet_wrap( ~ model_id) 
holdout_results <- function(splits, ...) {
  
  mod <- glm(..., data = analysis(splits), family ='binomial')
  
  holdout <- assessment(splits)
  
  holdout$estimate <- predict(mod,holdout)
  
  yardstick::roc_auc(holdout,
                     truth=diq010.Diabetes, estimate)$.estimate
}

toc <- Sys.time()

df_model$roc_auc <- map2_dbl(
  df_model$splits,
  map(df_model$model_id, make_model_order_num),
  holdout_results
)

tic <- Sys.time()

print(paste0("roc_auc estimates in ", round(tic - toc , 4 ) , " seconds " ))



glimpse(df_model)



df_model %>%
  ggplot(aes(x=Adj_R2,
             y=roc_auc,
             color=id)) +
  geom_point() +
  facet_wrap(~model_id) + 
  theme(legend.position = "none")

# Normalize RMSE and R2 , remove outliers 
COR <-df_model %>%
  mutate_at(vars(Adj_R2,roc_auc),scale) %>%
  filter(abs(Adj_R2) <2 ) %>%
  filter(abs(roc_auc) <2 )


COR %>%
  ggplot(aes(x= roc_auc,
             y= Adj_R2 ,
             color=id)) +
  geom_point() +
  facet_wrap(~model_id) + 
  theme(legend.position = "none")

cor.test(COR$Adj_R2, COR$roc_auc, method=c("pearson"))
t.test(COR$Adj_R2, COR$roc_auc,paired=TRUE)
(df_model %>%
  arrange(-roc_auc) %>%
  filter(row_number() < 5))$roc_auc

# Top_5_RMSE_formulas <- (df_model %>%
#   arrange(RMSE) %>%
#   filter(row_number() < 5))$model_creator
# 
# Top_5_RMSE_formulas

(df_model %>%
  arrange(-Adj_R2) %>%
  filter(row_number() < 5))$Adj_R2 %>%
  signif(4)

# Top_5_AdjR2_formulas <- (df_model %>%
#   arrange(Adj_R2) %>%
#   filter(row_number() < 5))$model_creator
# 
# Top_5_AdjR2_formulas

R² VS Area Under ROC Curve - formula on resamples

J Kyle Armstrong, PhD

20 January 2022

1 Setup

1.0.1 Let’s try to predict `diq010`:

1.0.2 `preProcess`

1.0.3 test sampling function

2 `make_model_order_num`

2.1 test

3 `df_model`

4 Split data

4.1 Let’s Take a look at the 6th fold

4.1.1 This is the TRAINing data from the 56th sample of the 6th Fold

4.1.2 This is the TESTing data from the 56th sample of the 6th Fold

5 Join `df_model` to folds

6 Adjusted R²

6.1 Get Estimates

6.1.1 Display Results

7 ROC_AUC

7.1 Compute Errors

8 Compare Results

8.1 Graphs

9 Top 5 Formulas

9.1 RMSE

9.2 Adj_R2

10 Code Appendix

R2 VS Area Under ROC Curve - formula on resamples

J Kyle Armstrong, PhD

20 January 2022

1 Setup

1.0.1 Let’s try to predict diq010:

1.0.2 preProcess

1.0.3 test sampling function

2 make_model_order_num

2.1 test

3 df_model

4 Split data

4.1 Let’s Take a look at the 6th fold

4.1.1 This is the TRAINing data from the 56th sample of the 6th Fold

4.1.2 This is the TESTing data from the 56th sample of the 6th Fold

5 Join df_model to folds

6 Adjusted R2

6.1 Get Estimates

6.1.1 Display Results

7 ROC_AUC

7.1 Compute Errors

8 Compare Results

8.1 Graphs

9 Top 5 Formulas

9.1 RMSE

9.2 Adj_R2

10 Code Appendix

R² VS Area Under ROC Curve - formula on resamples

1.0.1 Let’s try to predict `diq010`:

1.0.2 `preProcess`

2 `make_model_order_num`

3 `df_model`

5 Join `df_model` to folds

6 Adjusted R²