set.seed(1701)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
select(-seqn)
glimpse(diab_pop)
## Rows: 5,719
## Columns: 9
## $ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Ma...
## $ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White...
## $ dmdeduc2 <fct> College grad or above, High school graduate/GED, High scho...
## $ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced,...
## $ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "...
## $ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6...
## $ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes,...
## $ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284...
lbxglu
:target <- 'lbxglu'
df <- diab_pop %>%
na.omit()
my_factor_vars <- df %>% select_if(is.factor) %>% colnames()
df_as_nums <- df %>%
mutate_at(vars(my_factor_vars), as.integer) %>%
mutate_at(vars(my_factor_vars), as.factor)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(my_factor_vars)` instead of `my_factor_vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
glimpse(df_as_nums)
## Rows: 1,876
## Columns: 9
## $ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4...
## $ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3...
## $ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5...
## $ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3...
## $ bmxbmi <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010 <fct> 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ lbxglu <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...
preProcess
pP <- preProcess(df_as_nums, c('center','scale'))
df_as_nums <- predict(pP,df_as_nums)
glimpse(df_as_nums)
## Rows: 1,876
## Columns: 9
## $ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1...
## $ ridageyr <dbl> 0.15617810, 1.59749118, 1.25157604, -0.30504208, 0.9633134...
## $ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4...
## $ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3...
## $ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5...
## $ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3...
## $ bmxbmi <dbl> 0.20545760, -0.08208648, -0.11084088, -0.75781505, 2.06011...
## $ diq010 <fct> 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ lbxglu <dbl> -0.30006288, -0.70905532, -0.15571260, -0.70905532, 0.3976...
dV.df <- dummyVars( ~ . ,
data = df_as_nums,
fullRank=TRUE)
df_dV <- as_tibble(predict(dV.df,df_as_nums))
features <- colnames(df_dV)[!colnames(df_dV) %in% c('seqn' , target)]
length(features)
## [1] 28
We have length(features)
features.
sample_features <- sample(features, 4, replace = FALSE)
curent_formula <- paste0( target ,' ~ ', paste0(sample_features, collapse = " + "))
as.formula(curent_formula)
## lbxglu ~ dmdeduc2.3 + dmdmartl.6 + indhhin2.11 + dmdeduc2.2
make_model_order_num
This function will return a formula with a provided number of selected features selected at random
make_model_order_num <- function(num_features){
set.seed(NULL)
sample_features <- sample(features, num_features, replace = FALSE)
curent_formula <- paste0(target, ' ~ ', paste0(sample_features, collapse = " + "))
return(as.formula(curent_formula))
}
make_model_order_num(3)
## lbxglu ~ diq010.2 + indhhin2.5 + indhhin2.10
## <environment: 0x000000001cee7d20>
make_model_order_num(3)
## lbxglu ~ dmdmartl.3 + bmxbmi + indhhin2.2
## <environment: 0x000000001cab0a50>
make_model_order_num(6)
## lbxglu ~ dmdeduc2.2 + dmdmartl.4 + dmdmartl.6 + indhhin2.4 +
## dmdeduc2.5 + indhhin2.3
## <environment: 0x000000001c6535e0>
df_model
This is a dataframe to hold the model options
df_model <- tribble(
~model_name, ~model_creator, ~model_id,
# "zero", make_model(0)
"Fold1", make_model_order_num(1), 1,
"Fold2", make_model_order_num(2), 2,
"Fold3", make_model_order_num(3), 3,
"Fold4", make_model_order_num(4), 4,
"Fold5", make_model_order_num(5), 5,
"Fold6" , make_model_order_num(6), 6,
"Fold7", make_model_order_num(7), 7,
"Fold8", make_model_order_num(8), 8,
)
df_model
## # A tibble: 8 x 3
## model_name model_creator model_id
## <chr> <list> <dbl>
## 1 Fold1 <formula> 1
## 2 Fold2 <formula> 2
## 3 Fold3 <formula> 3
## 4 Fold4 <formula> 4
## 5 Fold5 <formula> 5
## 6 Fold6 <formula> 6
## 7 Fold7 <formula> 7
## 8 Fold8 <formula> 8
map(df_model,4)$model_creator
## lbxglu ~ dmdmartl.5 + indhhin2.10 + indhhin2.11 + indhhin2.12
## <environment: 0x000000001c089000>
\(~\)
\(~\)
library(rsample)
train_test <- initial_split(df_dV, prop = .6)
TRAIN <- training(train_test)
TEST <- testing(train_test)
TRAIN.v_fold <- vfold_cv(TRAIN, v = 8,
repeats = 321)
glimpse(TRAIN.v_fold)
## Rows: 2,568
## Columns: 3
## $ splits <list> [<rsplit[985 x 141 x 1126 x 29]>, <rsplit[985 x 141 x 1126 ...
## $ id <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat0...
## $ id2 <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7...
TRAIN.v_fold %>% select(id2) %>% distinct()
## # A tibble: 8 x 1
## id2
## <chr>
## 1 Fold1
## 2 Fold2
## 3 Fold3
## 4 Fold4
## 5 Fold5
## 6 Fold6
## 7 Fold7
## 8 Fold8
TRAIN.v_fold %>%
filter(id2=='Fold6') %>%
glimpse()
## Rows: 321
## Columns: 3
## $ splits <list> [<rsplit[985 x 141 x 1126 x 29]>, <rsplit[985 x 141 x 1126 ...
## $ id <chr> "Repeat001", "Repeat002", "Repeat003", "Repeat004", "Repeat0...
## $ id2 <chr> "Fold6", "Fold6", "Fold6", "Fold6", "Fold6", "Fold6", "Fold6...
TRAIN.56.6 <- (TRAIN.v_fold %>%
filter(id2=='Fold6'))$splits[[56]] %>%
analysis()
TRAIN.56.6
## # A tibble: 985 x 29
## riagendr.2 ridageyr ridreth1.2 ridreth1.3 ridreth1.4 ridreth1.5 dmdeduc2.2
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.156 0 1 0 0 0
## 2 1 1.25 0 0 0 0 1
## 3 0 -0.305 0 0 0 1 1
## 4 1 -1.52 0 0 0 1 0
## 5 1 1.02 0 0 0 0 0
## 6 0 0.906 0 1 0 0 0
## 7 1 -0.766 1 0 0 0 0
## 8 1 -1.75 0 0 1 0 0
## 9 1 -0.651 0 0 0 0 0
## 10 0 1.08 0 0 1 0 0
## # ... with 975 more rows, and 22 more variables: dmdeduc2.3 <dbl>,
## # dmdeduc2.4 <dbl>, dmdeduc2.5 <dbl>, dmdmartl.2 <dbl>, dmdmartl.3 <dbl>,
## # dmdmartl.4 <dbl>, dmdmartl.5 <dbl>, dmdmartl.6 <dbl>, indhhin2.2 <dbl>,
## # indhhin2.3 <dbl>, indhhin2.4 <dbl>, indhhin2.5 <dbl>, indhhin2.6 <dbl>,
## # indhhin2.8 <dbl>, indhhin2.10 <dbl>, indhhin2.11 <dbl>, indhhin2.12 <dbl>,
## # indhhin2.13 <dbl>, indhhin2.14 <dbl>, bmxbmi <dbl>, diq010.2 <dbl>,
## # lbxglu <dbl>
lm.TRAIN.56.6 <- lm(curent_formula , TRAIN.56.6)
TRAIN.56.6$estimate <- predict(lm.TRAIN.56.6 , TRAIN.56.6)
RMSE.TRAIN.56.6 <- yardstick::rmse(TRAIN.56.6,
truth = lbxglu, estimate)$.estimate
RMSE.TRAIN.56.6
## [1] 0.9554868
R2.TRAIN.56.6 <- yardstick::rsq(TRAIN.56.6,
truth = lbxglu, estimate)$.estimate
R2.TRAIN.56.6
## [1] 0.002785922
SS_tot
:mean_lbx_glu <- mean(TRAIN.56.6$lbxglu)
# compute SS_tot :
(TRAIN.56.6 %>%
mutate(e = lbxglu - mean_lbx_glu) %>%
mutate(e2 = e^2 ) %>%
summarise(SS_tot = sum(e2)))$SS_tot -> SS_tot
SS_tot
## [1] 901.7731
R2_by_formula
will be pretty close to R2.TRAIN.56.6
RMSE_to_R2 <- function(RMSE, n_rows, SS_tot){
1 - ( (RMSE^2 * n_rows) / SS_tot )
}
R2_by_formula <- RMSE_to_R2(RMSE.TRAIN.56.6 , nrow(TRAIN.56.6) , (SS_tot))
R2_by_formula
## [1] 0.002785922
sprintf("%.50f", R2.TRAIN.56.6 - R2_by_formula)
## [1] "-0.00000000000000001734723475976807094411924481391907"
TEST.56.6 <- (TRAIN.v_fold %>%
filter(id2=='Fold6'))$splits[[56]] %>%
assessment()
TEST.56.6
## # A tibble: 141 x 29
## riagendr.2 ridageyr ridreth1.2 ridreth1.3 ridreth1.4 ridreth1.5 dmdeduc2.2
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 -0.593 0 0 1 0 0
## 2 0 0.733 0 1 0 0 0
## 3 0 0.271 0 1 0 0 0
## 4 0 1.02 0 0 0 0 0
## 5 1 0.963 0 1 0 0 0
## 6 1 -0.651 0 1 0 0 0
## 7 0 -0.997 1 0 0 0 0
## 8 1 -0.132 0 0 1 0 0
## 9 1 1.37 1 0 0 0 0
## 10 0 0.214 0 1 0 0 0
## # ... with 131 more rows, and 22 more variables: dmdeduc2.3 <dbl>,
## # dmdeduc2.4 <dbl>, dmdeduc2.5 <dbl>, dmdmartl.2 <dbl>, dmdmartl.3 <dbl>,
## # dmdmartl.4 <dbl>, dmdmartl.5 <dbl>, dmdmartl.6 <dbl>, indhhin2.2 <dbl>,
## # indhhin2.3 <dbl>, indhhin2.4 <dbl>, indhhin2.5 <dbl>, indhhin2.6 <dbl>,
## # indhhin2.8 <dbl>, indhhin2.10 <dbl>, indhhin2.11 <dbl>, indhhin2.12 <dbl>,
## # indhhin2.13 <dbl>, indhhin2.14 <dbl>, bmxbmi <dbl>, diq010.2 <dbl>,
## # lbxglu <dbl>
\(~\)
\(~\)
df_model
to foldsWe’re joining the model tuning grid to the folds
glimpse(TRAIN.v_fold)
## Rows: 2,568
## Columns: 3
## $ splits <list> [<rsplit[985 x 141 x 1126 x 29]>, <rsplit[985 x 141 x 1126 ...
## $ id <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat0...
## $ id2 <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7...
glimpse(df_model)
## Rows: 8
## Columns: 3
## $ model_name <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6",...
## $ model_creator <list> [lbxglu ~ dmdmartl.5, lbxglu ~ dmdmartl.5 + indhhin2...
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8
df_model <- TRAIN.v_fold %>%
left_join(df_model, by = c('id2'="model_name"))
glimpse(df_model)
## Rows: 2,568
## Columns: 5
## $ splits <list> [<rsplit[985 x 141 x 1126 x 29]>, <rsplit[985 x 141 ...
## $ id <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "...
## $ id2 <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6",...
## $ model_creator <list> [lbxglu ~ dmdmartl.5, lbxglu ~ dmdmartl.5 + indhhin2...
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2,...
\(~\)
\(~\)
The lm_model
function
below will take in:
rsample::analysis
data set from the sample...
:From those inputs it is instructed to only return Adjusted R2 which is a numerical value.
lm_model <- function(data, ...){
LM <- lm(... , analysis(data))
R2 <- summary(LM)$adj.r.squared
return(R2)
}
curent_formula
## [1] "lbxglu ~ dmdeduc2.3 + dmdmartl.6 + indhhin2.11 + dmdeduc2.2"
Adj_R2.TRAIN.56.6 <- lm_model((TRAIN.v_fold %>%
filter(id2=='Fold6'))$splits[[56]] , curent_formula)
Adj_R2.TRAIN.56.6 == summary(lm.TRAIN.56.6)$adj.r.squared
## [1] TRUE
est_Adj_r2 <- function(r2_est,
num_features,
size_data){
X <- 1 - r2_est
Y <- size_data -1
Z <- size_data - num_features -1
S <- 1 - (X)*(Y/Z)
return(S)
}
num_features_TRAIN.56.6 <- str_count(curent_formula," [./+] ") + 1
est_Adj_r2(R2.TRAIN.56.6, num_features_TRAIN.56.6, nrow(TRAIN.56.6))
## [1] -0.00128434
sprintf("%.50f", Adj_R2.TRAIN.56.6 - est_Adj_r2(R2.TRAIN.56.6, num_features_TRAIN.56.6, nrow(TRAIN.56.6)))
## [1] "0.00000000000000000000000000000000000000000000000000"
The purrr
library in R
is mysterious and powerful; here, map2_dbl
is going to look to return a numeric value, a double
from the computation.
If you run ?map2_dbl
it will display:
map2_dbl(.x, .y, .f, ...)
In the context of our current status:
df_model$spilts
gives us a list of data - .x
df_model$model_creator
gives us a formula - .y
lm_model
is a function .f
data
and ...
Adj_R2
from lm_model
We will now run all the models and store the results in Adj_R2
:
toc <- Sys.time()
df_model$Adj_R2 <- map2_dbl(
df_model$splits,
df_model$model_creator,
lm_model
)
tic <- Sys.time()
print(paste0("Adj R2 estimates in ", round(tic - toc , 4 ) , " seconds " ))
## [1] "Adj R2 estimates in 10.1142 seconds "
We just ran a bunch of models and computed R2 for each of them!:
glimpse(df_model)
## Rows: 2,568
## Columns: 6
## $ splits <list> [<rsplit[985 x 141 x 1126 x 29]>, <rsplit[985 x 141 ...
## $ id <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "...
## $ id2 <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6",...
## $ model_creator <list> [lbxglu ~ dmdmartl.5, lbxglu ~ dmdmartl.5 + indhhin2...
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2,...
## $ Adj_R2 <dbl> 0.007497634, 0.015957576, 0.013516686, 0.016351878, 0...
df_model %>%
ggplot(aes(x=id,
y=Adj_R2,
fill = Adj_R2)) +
geom_bar(stat = 'identity') +
scale_fill_gradient(low = "yellow", high = "red", na.value = NA) +
coord_flip() +
facet_wrap( ~ model_id)
\(~\)
\(~\)
To compute RMSE we need to know how the model performs it’s holdout set:
holdout_results <- function(splits, ...) {
mod <- lm(..., data = analysis(splits))
holdout <- assessment(splits)
holdout$estimate <- predict(mod,holdout)
yardstick::rmse(holdout,
truth=lbxglu, estimate)$.estimate
}
toc <- Sys.time()
df_model$RMSE <- map2_dbl(
df_model$splits,
df_model$model_creator,
holdout_results
)
tic <- Sys.time()
print(paste0("RMSE estimates in ", round(tic - toc , 4 ) , " seconds " ))
## [1] "RMSE estimates in 31.3433 seconds "
\(~\)
\(~\)
glimpse(df_model)
## Rows: 2,568
## Columns: 7
## $ splits <list> [<rsplit[985 x 141 x 1126 x 29]>, <rsplit[985 x 141 ...
## $ id <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "...
## $ id2 <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6",...
## $ model_creator <list> [lbxglu ~ dmdmartl.5, lbxglu ~ dmdmartl.5 + indhhin2...
## $ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2,...
## $ Adj_R2 <dbl> 0.007497634, 0.015957576, 0.013516686, 0.016351878, 0...
## $ RMSE <dbl> 0.8675786, 1.3447705, 0.9409805, 0.9220485, 0.9517088...
While we see that models with higher number of features have larger Adjusted R2 There appears to be little correlation between Adjusted R2 and model performance:
df_model %>%
ggplot(aes(x=Adj_R2,
y=RMSE,
color=id)) +
geom_point() +
facet_wrap(~model_id) +
theme(legend.position = "none")
# Normalize RMSE and R2 , remove outliers
COR <-df_model %>%
mutate_at(vars(Adj_R2,RMSE),scale) %>%
filter(abs(Adj_R2) <2 ) %>%
filter(abs(RMSE) <2 )
COR %>%
ggplot(aes(x=Adj_R2,
y=RMSE,
color=id)) +
geom_point() +
facet_wrap(~model_id) +
theme(legend.position = "none")
COR %>%
ggplot(aes(x=Adj_R2,
y=RMSE,
color=id,
shape = as.factor(model_id))) +
geom_point() +
scale_shape_manual(values=seq(0:8)) +
theme(legend.position = "none")
COR %>%
ggplot(aes(x=Adj_R2,
y=RMSE,
color=as.factor(model_id))
) +
geom_point()
COR %>%
filter(Adj_R2 > 1) %>%
ggplot(aes(x=Adj_R2,
y=RMSE,
color=as.factor(model_id))
) +
geom_point()
cor.test(COR$Adj_R2, COR$RMSE, method=c("pearson"))
##
## Pearson's product-moment correlation
##
## data: COR$Adj_R2 and COR$RMSE
## t = -2.6893, df = 2369, p-value = 0.00721
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09521299 -0.01494715
## sample estimates:
## cor
## -0.0551692
t.test(COR$Adj_R2, COR$RMSE,paired=TRUE)
##
## Paired t-test
##
## data: COR$Adj_R2 and COR$RMSE
## t = 0.86927, df = 2370, p-value = 0.3848
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02897810 0.07512586
## sample estimates:
## mean of the differences
## 0.02307388
What might the relationship between RMSE and Adjusted R2 look like:
Random_RMSEs <- runif(100000 , min = 1 , max = 25*100000)
R2.Random_RMSEs <- map_dbl(Random_RMSEs, RMSE_to_R2 , nrow(TRAIN.56.6) , SS_tot)
Adj_R2.Random_RMSEs <- map_dbl(R2.Random_RMSEs,
est_Adj_r2,
num_features_TRAIN.56.6,
nrow(TRAIN.56.6))
Randm_compare <- tibble(Random_RMSEs, Adj_R2.Random_RMSEs)
Randm_compare %>%
ggplot(aes(x=Adj_R2.Random_RMSEs, y=Random_RMSEs)) +
geom_point()
Randm_compare %>%
mutate_at(vars(Adj_R2.Random_RMSEs,Random_RMSEs), scale) %>%
ggplot(aes(x=Adj_R2.Random_RMSEs, y=Random_RMSEs)) +
geom_point()
N
changes shouldn’t SS_tot
also change :RMSE_to_Adj_R2 <- function(RMSE, n_rows, SS_tot, n_features ){
R2 <- RMSE_to_R2(RMSE, n_rows, SS_tot)
Adj_R2 <- est_Adj_r2(R2, n_features , n_rows)
return(Adj_R2)
}
Adj_R2.double_rows <- map_dbl(Random_RMSEs,
RMSE_to_Adj_R2 ,
nrow(TRAIN.56.6)*2 ,
SS_tot = SS_tot,
n_features = num_features_TRAIN.56.6)
Randm_compare.Adj_R2.double_rows <- tibble(Random_RMSEs,
Adj_R2.double_rows)
Adj_R2.tripple_ss_tot <- map_dbl(Random_RMSEs,
RMSE_to_Adj_R2 ,
nrow(TRAIN.56.6) ,
SS_tot = SS_tot*3,
n_features = num_features_TRAIN.56.6)
Randm_compare.Adj_R2.tripple_ss_tot <- tibble(Random_RMSEs,
Adj_R2.tripple_ss_tot)
Adj_R2.double_rows_tripple_ss_tot <- map_dbl(Random_RMSEs,
RMSE_to_Adj_R2 ,
nrow(TRAIN.56.6)*2 ,
SS_tot = SS_tot*3,
n_features = num_features_TRAIN.56.6)
Randm_compare.Adj_R2.double_rows_tripple_ss_tot <- tibble(Random_RMSEs,
Adj_R2.double_rows_tripple_ss_tot,
)
Each of those might have a variation on the number of features :
Adj_R2.double_rows.double_features <- map_dbl(Random_RMSEs,
RMSE_to_Adj_R2 ,
nrow(TRAIN.56.6)*2 ,
SS_tot = SS_tot,
n_features = num_features_TRAIN.56.6*2)
Randm_compare.Adj_R2.double_rows.double_features <- tibble(Random_RMSEs,
Adj_R2.double_rows.double_features)
Adj_R2.tripple_ss_tot.double_features <- map_dbl(Random_RMSEs,
RMSE_to_Adj_R2 ,
nrow(TRAIN.56.6) ,
SS_tot = SS_tot*3,
n_features = num_features_TRAIN.56.6*2)
Randm_compare.Adj_R2.tripple_ss_tot.double_feature <- tibble(Random_RMSEs,
Adj_R2.tripple_ss_tot.double_features)
Adj_R2.double_rows_tripple_ss_tot.double_features <- map_dbl(Random_RMSEs,
RMSE_to_Adj_R2 ,
nrow(TRAIN.56.6)*2 ,
SS_tot = SS_tot*3,
n_features = num_features_TRAIN.56.6*2)
Randm_compare.Adj_R2.double_rows_tripple_ss_tot.double_features <- tibble(Random_RMSEs,
Adj_R2.double_rows_tripple_ss_tot.double_features,
)
compare_round2 <- Randm_compare %>%
left_join(Randm_compare.Adj_R2.double_rows) %>%
left_join(Randm_compare.Adj_R2.tripple_ss_tot) %>%
left_join(Randm_compare.Adj_R2.double_rows_tripple_ss_tot) %>%
left_join(Randm_compare.Adj_R2.double_rows.double_features) %>%
left_join(Randm_compare.Adj_R2.tripple_ss_tot.double_feature) %>%
left_join(Randm_compare.Adj_R2.double_rows_tripple_ss_tot.double_features)
## Joining, by = "Random_RMSEs"
## Joining, by = "Random_RMSEs"
## Joining, by = "Random_RMSEs"
## Joining, by = "Random_RMSEs"
## Joining, by = "Random_RMSEs"
## Joining, by = "Random_RMSEs"
compare_round2 %>%
pivot_longer(!Random_RMSEs,
names_to = "transformation",
values_to = "Adj_R2"
) -> compare_joins.long2
glimpse(compare_joins.long2)
## Rows: 700,882
## Columns: 3
## $ Random_RMSEs <dbl> 2440790, 2440790, 2440790, 2440790, 2440790, 2440790...
## $ transformation <chr> "Adj_R2.Random_RMSEs", "Adj_R2.double_rows", "Adj_R2...
## $ Adj_R2 <dbl> -6.533846e+12, -1.304106e+13, -2.177949e+12, -4.3470...
compare_joins.long2 %>%
ggplot(aes(x= Adj_R2,
y=Random_RMSEs,
color = transformation )) +
geom_point() +
facet_wrap(~transformation)
compare_joins.long2 %>%
ggplot(aes(x= Adj_R2,
y=Random_RMSEs,
color = transformation )) +
geom_point()
When you take into account the other factors that goes into computing it - the same RMSE-value could be any number of Adjusted R2 values:
compare_joins.long2 %>%
ggplot(aes(x= Random_RMSEs ,
y= Adj_R2 ,
color = transformation )) +
geom_point()
glimpse(compare_joins.long2)
## Rows: 700,882
## Columns: 3
## $ Random_RMSEs <dbl> 2440790, 2440790, 2440790, 2440790, 2440790, 2440790...
## $ transformation <chr> "Adj_R2.Random_RMSEs", "Adj_R2.double_rows", "Adj_R2...
## $ Adj_R2 <dbl> -6.533846e+12, -1.304106e+13, -2.177949e+12, -4.3470...
compare_joins.long2 %>%
select(transformation) %>%
distinct()
## # A tibble: 7 x 1
## transformation
## <chr>
## 1 Adj_R2.Random_RMSEs
## 2 Adj_R2.double_rows
## 3 Adj_R2.tripple_ss_tot
## 4 Adj_R2.double_rows_tripple_ss_tot
## 5 Adj_R2.double_rows.double_features
## 6 Adj_R2.tripple_ss_tot.double_features
## 7 Adj_R2.double_rows_tripple_ss_tot.double_features
compare_joins.long2 %>%
group_by(transformation) %>%
summarise(mean_adj_r2 = mean(Adj_R2))
## # A tibble: 7 x 2
## transformation mean_adj_r2
## * <chr> <dbl>
## 1 Adj_R2.double_rows -4.55e12
## 2 Adj_R2.double_rows.double_features -4.56e12
## 3 Adj_R2.double_rows_tripple_ss_tot -1.52e12
## 4 Adj_R2.double_rows_tripple_ss_tot.double_features -1.52e12
## 5 Adj_R2.Random_RMSEs -2.28e12
## 6 Adj_R2.tripple_ss_tot -7.60e11
## 7 Adj_R2.tripple_ss_tot.double_features -7.63e11
rand_sort <- runif(nrow(compare_joins.long2.filter))
glimpse(rand_sort)
## num [1:400504] 0.692 0.908 0.34 0.809 0.553 ...
compare_joins.long3 <- cbind(compare_joins.long2.filter, rand_sort)
glimpse(compare_joins.long3)
## Rows: 400,504
## Columns: 4
## $ Random_RMSEs <dbl> 2440790, 2440790, 2440790, 2440790, 2193509, 2193509...
## $ transformation <chr> "Adj_R2.Random_RMSEs", "Adj_R2.tripple_ss_tot", "Adj...
## $ Adj_R2 <dbl> -6.533846e+12, -2.177949e+12, -4.347021e+12, -1.3067...
## $ rand_sort <dbl> 0.6922980, 0.9080481, 0.3395490, 0.8092567, 0.552940...
compare_joins.long4 <- compare_joins.long3 %>%
arrange(rand_sort) %>%
group_by(Random_RMSEs) %>%
mutate(rn = row_number()) %>%
filter(rn == 1) %>%
ungroup() %>%
select(-rn)
compare_joins.long4
## # A tibble: 99,999 x 4
## Random_RMSEs transformation Adj_R2 rand_sort
## <dbl> <chr> <dbl> <dbl>
## 1 2055522. Adj_R2.double_rows_tripple_ss_tot -3.08e12 0.00000226
## 2 852694. Adj_R2.Random_RMSEs -7.97e11 0.0000104
## 3 1672910. Adj_R2.double_rows_tripple_ss_tot -2.04e12 0.0000104
## 4 1197120. Adj_R2.tripple_ss_tot -5.24e11 0.0000105
## 5 649792. Adj_R2.double_rows_tripple_ss_tot -3.08e11 0.0000130
## 6 414406. Adj_R2.Random_RMSEs -1.88e11 0.0000245
## 7 240247. Adj_R2.Random_RMSEs -6.33e10 0.0000332
## 8 1844988. Adj_R2.double_rows_tripple_ss_tot -2.48e12 0.0000333
## 9 145725. Adj_R2.Random_RMSEs -2.33e10 0.0000368
## 10 1398102. Adj_R2.double_rows.double_features -4.29e12 0.0000372
## # ... with 99,989 more rows
And we’ll sample some of the RMSE at random too:
compare_joins.long4 %>%
mutate_at(vars(Adj_R2,Random_RMSEs), scale) %>%
filter(rand_sort > .90 ) %>%
ggplot(aes(x= Adj_R2 ,
y= Random_RMSEs,
color = as.factor(transformation))) +
geom_point() +
theme(legend.position = "none")
Distribution is very difficult to tell even with colored points - keep in mind that each RMSE is getting associated with a different Adjusted R2 value for each different fold and repeat as they each have different N and number of features.
(df_model %>%
arrange(RMSE) %>%
filter(row_number() < 5))$RMSE
## [1] 0.4489166 0.4571849 0.5036834 0.5044152
Top_5_RMSE_formulas <- (df_model %>%
arrange(RMSE) %>%
filter(row_number() < 5))$model_creator
Top_5_RMSE_formulas
## [[1]]
## lbxglu ~ dmdmartl.5
## <environment: 0x000000001c091cc0>
##
## [[2]]
## lbxglu ~ dmdmartl.5 + indhhin2.10 + indhhin2.11 + indhhin2.12
## <environment: 0x000000001c089000>
##
## [[3]]
## lbxglu ~ dmdmartl.5
## <environment: 0x000000001c091cc0>
##
## [[4]]
## lbxglu ~ dmdmartl.5
## <environment: 0x000000001c091cc0>
(df_model %>%
arrange(-Adj_R2) %>%
filter(row_number() < 5))$Adj_R2
## [1] 0.03053928 0.03052720 0.02988576 0.02985333
Top_5_AdjR2_formulas <- (df_model %>%
arrange(-Adj_R2) %>%
filter(row_number() < 5))$model_creator
Top_5_AdjR2_formulas
## [[1]]
## lbxglu ~ dmdmartl.5 + indhhin2.10 + indhhin2.11 + indhhin2.12
## <environment: 0x000000001c089000>
##
## [[2]]
## lbxglu ~ dmdmartl.5 + indhhin2.10 + indhhin2.11 + indhhin2.12
## <environment: 0x000000001c089000>
##
## [[3]]
## lbxglu ~ dmdmartl.5 + indhhin2.10 + indhhin2.11 + indhhin2.12 +
## dmdmartl.2 + indhhin2.2
## <environment: 0x000000001c07ba20>
##
## [[4]]
## lbxglu ~ dmdmartl.5 + indhhin2.10 + indhhin2.11 + indhhin2.12 +
## dmdmartl.2
## <environment: 0x000000001c07f9e8>
Best_RMSE_formula <- Top_5_RMSE_formulas[[1]]
Best_AdjR2_formula <- Top_5_AdjR2_formulas[[1]]
if(Best_RMSE_formula == Best_AdjR2_formula){
Best_RMSE_formula <- Top_5_RMSE_formulas[[1]]
Best_AdjR2_formula <- Top_5_AdjR2_formulas[[2]]
}
Best_RMSE_formula
VS Best_AdjR2_formula
Final.Glm_function <- function(formula, data){
lm(formula, data)
}
Final.Glm.Adj_R2 <- Final.Glm_function(Best_AdjR2_formula , TRAIN)
Final.Glm.RMSE <- Final.Glm_function(Best_RMSE_formula, TRAIN)
TEST.scored <- TEST
TEST.scored$estimate <- predict(Final.Glm.Adj_R2, TEST)
TEST.Adj_R2 <- TEST.scored %>%
mutate(model = "Adj_R2")
TEST.scored$estimate <- predict(Final.Glm.RMSE, TEST)
TEST.RMSE <- TEST.scored %>%
mutate(model = "RMSE")
TEST_stacked <- bind_rows(TEST.Adj_R2, TEST.RMSE)
TEST_stacked %>%
group_by(model) %>%
yardstick::rmse(truth=lbxglu, estimate)
## # A tibble: 2 x 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 Adj_R2 rmse standard 1.07
## 2 RMSE rmse standard 1.06
TEST_stacked %>%
group_by(model) %>%
mutate(error = estimate - lbxglu) %>%
arrange(error) %>%
mutate(order_n = row_number()) %>%
ggplot(aes(x=order_n,
y=error,
color=model)) +
geom_point() +
geom_line() +
facet_wrap(~model)
TEST_stacked
## # A tibble: 1,500 x 31
## riagendr.2 ridageyr ridreth1.2 ridreth1.3 ridreth1.4 ridreth1.5 dmdeduc2.2
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1.60 0 1 0 0 0
## 2 1 0.963 1 0 0 0 0
## 3 0 0.963 0 0 1 0 0
## 4 1 0.387 1 0 0 0 0
## 5 0 0.329 0 1 0 0 0
## 6 1 -1.75 0 0 1 0 0
## 7 0 -1.52 0 1 0 0 0
## 8 1 1.71 1 0 0 0 0
## 9 0 1.14 0 1 0 0 0
## 10 1 -1.23 0 0 0 0 0
## # ... with 1,490 more rows, and 24 more variables: dmdeduc2.3 <dbl>,
## # dmdeduc2.4 <dbl>, dmdeduc2.5 <dbl>, dmdmartl.2 <dbl>, dmdmartl.3 <dbl>,
## # dmdmartl.4 <dbl>, dmdmartl.5 <dbl>, dmdmartl.6 <dbl>, indhhin2.2 <dbl>,
## # indhhin2.3 <dbl>, indhhin2.4 <dbl>, indhhin2.5 <dbl>, indhhin2.6 <dbl>,
## # indhhin2.8 <dbl>, indhhin2.10 <dbl>, indhhin2.11 <dbl>, indhhin2.12 <dbl>,
## # indhhin2.13 <dbl>, indhhin2.14 <dbl>, bmxbmi <dbl>, diq010.2 <dbl>,
## # lbxglu <dbl>, estimate <dbl>, model <chr>
TEST_stacked.RMSE.byGroup <- TEST_stacked %>%
group_by(model, riagendr.2, ridreth1.2, dmdeduc2.2, dmdmartl.2, indhhin2.2, diq010.2) %>%
yardstick::rmse(truth=lbxglu, estimate)
TEST_stacked.RMSE.byGroup %>% glimpse()
## Rows: 80
## Columns: 10
## $ model <chr> "Adj_R2", "Adj_R2", "Adj_R2", "Adj_R2", "Adj_R2", "Adj_R...
## $ riagendr.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ ridreth1.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,...
## $ dmdeduc2.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,...
## $ dmdmartl.2 <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ indhhin2.2 <dbl> 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,...
## $ diq010.2 <dbl> 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,...
## $ .metric <chr> "rmse", "rmse", "rmse", "rmse", "rmse", "rmse", "rmse", ...
## $ .estimator <chr> "standard", "standard", "standard", "standard", "standar...
## $ .estimate <dbl> 2.3660675, 0.6226157, 0.9916426, 0.2835908, 2.1961608, 0...
TEST_stacked.RMSE.byGroup %>%
group_by(model) %>%
ggplot(aes(x=riagendr.2,
y=.estimate,
fill= model )) +
geom_bar(stat='identity', position = 'dodge') +
facet_wrap(~ridreth1.2+diq010.2)
TEST_stacked %>%
group_by(model, riagendr.2, ridreth1.2, dmdeduc2.2, dmdmartl.2, indhhin2.2, diq010.2) %>%
mutate(error = estimate - lbxglu) %>%
arrange(error) %>%
mutate(order_n = row_number()) %>%
ggplot(aes(x=order_n,
y=error,
color=model)) +
geom_point() +
geom_line() +
facet_wrap(~ridreth1.2+diq010.2)
TEST_stacked %>%
group_by(model) %>%
yardstick::rsq(truth=lbxglu, estimate)
## # A tibble: 2 x 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 Adj_R2 rsq standard 0.00590
## 2 RMSE rsq standard 0.0102
\(~\)
\(~\)
\(~\)
set.seed(1701)
library(tidyverse)
library(caret)
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
select(-seqn)
glimpse(diab_pop)
target <- 'lbxglu'
df <- diab_pop %>%
na.omit()
my_factor_vars <- df %>% select_if(is.factor) %>% colnames()
df_as_nums <- df %>%
mutate_at(vars(my_factor_vars), as.integer) %>%
mutate_at(vars(my_factor_vars), as.factor)
glimpse(df_as_nums)
pP <- preProcess(df_as_nums, c('center','scale'))
df_as_nums <- predict(pP,df_as_nums)
glimpse(df_as_nums)
dV.df <- dummyVars( ~ . ,
data = df_as_nums,
fullRank=TRUE)
df_dV <- as_tibble(predict(dV.df,df_as_nums))
features <- colnames(df_dV)[!colnames(df_dV) %in% c('seqn' , target)]
length(features)
sample_features <- sample(features, 4, replace = FALSE)
curent_formula <- paste0( target ,' ~ ', paste0(sample_features, collapse = " + "))
as.formula(curent_formula)
make_model_order_num <- function(num_features){
set.seed(NULL)
sample_features <- sample(features, num_features, replace = FALSE)
curent_formula <- paste0(target, ' ~ ', paste0(sample_features, collapse = " + "))
return(as.formula(curent_formula))
}
make_model_order_num(3)
make_model_order_num(3)
make_model_order_num(6)
df_model <- tribble(
~model_name, ~model_creator, ~model_id,
# "zero", make_model(0)
"Fold1", make_model_order_num(1), 1,
"Fold2", make_model_order_num(2), 2,
"Fold3", make_model_order_num(3), 3,
"Fold4", make_model_order_num(4), 4,
"Fold5", make_model_order_num(5), 5,
"Fold6" , make_model_order_num(6), 6,
"Fold7", make_model_order_num(7), 7,
"Fold8", make_model_order_num(8), 8,
)
df_model
map(df_model,4)$model_creator
library(rsample)
train_test <- initial_split(df_dV, prop = .6)
TRAIN <- training(train_test)
TEST <- testing(train_test)
TRAIN.v_fold <- vfold_cv(TRAIN, v = 8,
repeats = 321)
glimpse(TRAIN.v_fold)
TRAIN.v_fold %>% select(id2) %>% distinct()
TRAIN.v_fold %>%
filter(id2=='Fold6') %>%
glimpse()
TRAIN.56.6 <- (TRAIN.v_fold %>%
filter(id2=='Fold6'))$splits[[56]] %>%
analysis()
TRAIN.56.6
lm.TRAIN.56.6 <- lm(curent_formula , TRAIN.56.6)
TRAIN.56.6$estimate <- predict(lm.TRAIN.56.6 , TRAIN.56.6)
RMSE.TRAIN.56.6 <- yardstick::rmse(TRAIN.56.6,
truth = lbxglu, estimate)$.estimate
RMSE.TRAIN.56.6
R2.TRAIN.56.6 <- yardstick::rsq(TRAIN.56.6,
truth = lbxglu, estimate)$.estimate
R2.TRAIN.56.6
mean_lbx_glu <- mean(TRAIN.56.6$lbxglu)
# compute SS_tot :
(TRAIN.56.6 %>%
mutate(e = lbxglu - mean_lbx_glu) %>%
mutate(e2 = e^2 ) %>%
summarise(SS_tot = sum(e2)))$SS_tot -> SS_tot
SS_tot
RMSE_to_R2 <- function(RMSE, n_rows, SS_tot){
1 - ( (RMSE^2 * n_rows) / SS_tot )
}
R2_by_formula <- RMSE_to_R2(RMSE.TRAIN.56.6 , nrow(TRAIN.56.6) , (SS_tot))
R2_by_formula
sprintf("%.50f", R2.TRAIN.56.6 - R2_by_formula)
TEST.56.6 <- (TRAIN.v_fold %>%
filter(id2=='Fold6'))$splits[[56]] %>%
assessment()
TEST.56.6
glimpse(TRAIN.v_fold)
glimpse(df_model)
df_model <- TRAIN.v_fold %>%
left_join(df_model, by = c('id2'="model_name"))
glimpse(df_model)
lm_model <- function(data, ...){
LM <- lm(... , analysis(data))
R2 <- summary(LM)$adj.r.squared
return(R2)
}
curent_formula
Adj_R2.TRAIN.56.6 <- lm_model((TRAIN.v_fold %>%
filter(id2=='Fold6'))$splits[[56]] , curent_formula)
Adj_R2.TRAIN.56.6 == summary(lm.TRAIN.56.6)$adj.r.squared
est_Adj_r2 <- function(r2_est,
num_features,
size_data){
X <- 1 - r2_est
Y <- size_data -1
Z <- size_data - num_features -1
S <- 1 - (X)*(Y/Z)
return(S)
}
num_features_TRAIN.56.6 <- str_count(curent_formula," [./+] ") + 1
est_Adj_r2(R2.TRAIN.56.6, num_features_TRAIN.56.6, nrow(TRAIN.56.6))
sprintf("%.50f", Adj_R2.TRAIN.56.6 - est_Adj_r2(R2.TRAIN.56.6, num_features_TRAIN.56.6, nrow(TRAIN.56.6)))
toc <- Sys.time()
df_model$Adj_R2 <- map2_dbl(
df_model$splits,
df_model$model_creator,
lm_model
)
tic <- Sys.time()
print(paste0("Adj R2 estimates in ", round(tic - toc , 4 ) , " seconds " ))
glimpse(df_model)
df_model %>%
ggplot(aes(x=id,
y=Adj_R2,
fill = Adj_R2)) +
geom_bar(stat = 'identity') +
scale_fill_gradient(low = "yellow", high = "red", na.value = NA) +
coord_flip() +
facet_wrap( ~ model_id)
holdout_results <- function(splits, ...) {
mod <- lm(..., data = analysis(splits))
holdout <- assessment(splits)
holdout$estimate <- predict(mod,holdout)
yardstick::rmse(holdout,
truth=lbxglu, estimate)$.estimate
}
toc <- Sys.time()
df_model$RMSE <- map2_dbl(
df_model$splits,
df_model$model_creator,
holdout_results
)
tic <- Sys.time()
print(paste0("RMSE estimates in ", round(tic - toc , 4 ) , " seconds " ))
glimpse(df_model)
df_model %>%
ggplot(aes(x=Adj_R2,
y=RMSE,
color=id)) +
geom_point() +
facet_wrap(~model_id) +
theme(legend.position = "none")
# Normalize RMSE and R2 , remove outliers
COR <-df_model %>%
mutate_at(vars(Adj_R2,RMSE),scale) %>%
filter(abs(Adj_R2) <2 ) %>%
filter(abs(RMSE) <2 )
COR %>%
ggplot(aes(x=Adj_R2,
y=RMSE,
color=id)) +
geom_point() +
facet_wrap(~model_id) +
theme(legend.position = "none")
COR %>%
ggplot(aes(x=Adj_R2,
y=RMSE,
color=id,
shape = as.factor(model_id))) +
geom_point() +
scale_shape_manual(values=seq(0:8)) +
theme(legend.position = "none")
COR %>%
ggplot(aes(x=Adj_R2,
y=RMSE,
color=as.factor(model_id))
) +
geom_point()
COR %>%
filter(Adj_R2 > 1) %>%
ggplot(aes(x=Adj_R2,
y=RMSE,
color=as.factor(model_id))
) +
geom_point()
cor.test(COR$Adj_R2, COR$RMSE, method=c("pearson"))
t.test(COR$Adj_R2, COR$RMSE,paired=TRUE)
Random_RMSEs <- runif(100000 , min = 1 , max = 25*100000)
R2.Random_RMSEs <- map_dbl(Random_RMSEs, RMSE_to_R2 , nrow(TRAIN.56.6) , SS_tot)
Adj_R2.Random_RMSEs <- map_dbl(R2.Random_RMSEs,
est_Adj_r2,
num_features_TRAIN.56.6,
nrow(TRAIN.56.6))
Randm_compare <- tibble(Random_RMSEs, Adj_R2.Random_RMSEs)
Randm_compare %>%
ggplot(aes(x=Adj_R2.Random_RMSEs, y=Random_RMSEs)) +
geom_point()
Randm_compare %>%
mutate_at(vars(Adj_R2.Random_RMSEs,Random_RMSEs), scale) %>%
ggplot(aes(x=Adj_R2.Random_RMSEs, y=Random_RMSEs)) +
geom_point()
RMSE_to_Adj_R2 <- function(RMSE, n_rows, SS_tot, n_features ){
R2 <- RMSE_to_R2(RMSE, n_rows, SS_tot)
Adj_R2 <- est_Adj_r2(R2, n_features , n_rows)
return(Adj_R2)
}
Adj_R2.double_rows <- map_dbl(Random_RMSEs,
RMSE_to_Adj_R2 ,
nrow(TRAIN.56.6)*2 ,
SS_tot = SS_tot,
n_features = num_features_TRAIN.56.6)
Randm_compare.Adj_R2.double_rows <- tibble(Random_RMSEs,
Adj_R2.double_rows)
Adj_R2.tripple_ss_tot <- map_dbl(Random_RMSEs,
RMSE_to_Adj_R2 ,
nrow(TRAIN.56.6) ,
SS_tot = SS_tot*3,
n_features = num_features_TRAIN.56.6)
Randm_compare.Adj_R2.tripple_ss_tot <- tibble(Random_RMSEs,
Adj_R2.tripple_ss_tot)
Adj_R2.double_rows_tripple_ss_tot <- map_dbl(Random_RMSEs,
RMSE_to_Adj_R2 ,
nrow(TRAIN.56.6)*2 ,
SS_tot = SS_tot*3,
n_features = num_features_TRAIN.56.6)
Randm_compare.Adj_R2.double_rows_tripple_ss_tot <- tibble(Random_RMSEs,
Adj_R2.double_rows_tripple_ss_tot,
)
Adj_R2.double_rows.double_features <- map_dbl(Random_RMSEs,
RMSE_to_Adj_R2 ,
nrow(TRAIN.56.6)*2 ,
SS_tot = SS_tot,
n_features = num_features_TRAIN.56.6*2)
Randm_compare.Adj_R2.double_rows.double_features <- tibble(Random_RMSEs,
Adj_R2.double_rows.double_features)
Adj_R2.tripple_ss_tot.double_features <- map_dbl(Random_RMSEs,
RMSE_to_Adj_R2 ,
nrow(TRAIN.56.6) ,
SS_tot = SS_tot*3,
n_features = num_features_TRAIN.56.6*2)
Randm_compare.Adj_R2.tripple_ss_tot.double_feature <- tibble(Random_RMSEs,
Adj_R2.tripple_ss_tot.double_features)
Adj_R2.double_rows_tripple_ss_tot.double_features <- map_dbl(Random_RMSEs,
RMSE_to_Adj_R2 ,
nrow(TRAIN.56.6)*2 ,
SS_tot = SS_tot*3,
n_features = num_features_TRAIN.56.6*2)
Randm_compare.Adj_R2.double_rows_tripple_ss_tot.double_features <- tibble(Random_RMSEs,
Adj_R2.double_rows_tripple_ss_tot.double_features,
)
compare_round2 <- Randm_compare %>%
left_join(Randm_compare.Adj_R2.double_rows) %>%
left_join(Randm_compare.Adj_R2.tripple_ss_tot) %>%
left_join(Randm_compare.Adj_R2.double_rows_tripple_ss_tot) %>%
left_join(Randm_compare.Adj_R2.double_rows.double_features) %>%
left_join(Randm_compare.Adj_R2.tripple_ss_tot.double_feature) %>%
left_join(Randm_compare.Adj_R2.double_rows_tripple_ss_tot.double_features)
compare_round2 %>%
pivot_longer(!Random_RMSEs,
names_to = "transformation",
values_to = "Adj_R2"
) -> compare_joins.long2
glimpse(compare_joins.long2)
compare_joins.long2 %>%
ggplot(aes(x= Adj_R2,
y=Random_RMSEs,
color = transformation )) +
geom_point() +
facet_wrap(~transformation)
compare_joins.long2 %>%
ggplot(aes(x= Adj_R2,
y=Random_RMSEs,
color = transformation )) +
geom_point()
compare_joins.long2 %>%
ggplot(aes(x= Random_RMSEs ,
y= Adj_R2 ,
color = transformation )) +
geom_point()
glimpse(compare_joins.long2)
compare_joins.long2 %>%
select(transformation) %>%
distinct()
compare_joins.long2 %>%
group_by(transformation) %>%
summarise(mean_adj_r2 = mean(Adj_R2))
compare_joins.long2.filter <- compare_joins.long2 %>%
filter(transformation %in% c('Adj_R2.Random_RMSEs',
'Adj_R2.double_rows.double_features',
'Adj_R2.double_rows_tripple_ss_tot',
'Adj_R2.tripple_ss_tot')
)
rand_sort <- runif(nrow(compare_joins.long2.filter))
glimpse(rand_sort)
compare_joins.long3 <- cbind(compare_joins.long2.filter, rand_sort)
glimpse(compare_joins.long3)
compare_joins.long4 <- compare_joins.long3 %>%
arrange(rand_sort) %>%
group_by(Random_RMSEs) %>%
mutate(rn = row_number()) %>%
filter(rn == 1) %>%
ungroup() %>%
select(-rn)
compare_joins.long4
compare_joins.long4 %>%
mutate_at(vars(Adj_R2,Random_RMSEs), scale) %>%
filter(rand_sort > .90 ) %>%
ggplot(aes(x= Adj_R2 ,
y= Random_RMSEs,
color = as.factor(transformation))) +
geom_point() +
theme(legend.position = "none")
(df_model %>%
arrange(RMSE) %>%
filter(row_number() < 5))$RMSE
Top_5_RMSE_formulas <- (df_model %>%
arrange(RMSE) %>%
filter(row_number() < 5))$model_creator
Top_5_RMSE_formulas
(df_model %>%
arrange(-Adj_R2) %>%
filter(row_number() < 5))$Adj_R2
Top_5_AdjR2_formulas <- (df_model %>%
arrange(-Adj_R2) %>%
filter(row_number() < 5))$model_creator
Top_5_AdjR2_formulas
Best_RMSE_formula <- Top_5_RMSE_formulas[[1]]
Best_AdjR2_formula <- Top_5_AdjR2_formulas[[1]]
if(Best_RMSE_formula == Best_AdjR2_formula){
Best_RMSE_formula <- Top_5_RMSE_formulas[[1]]
Best_AdjR2_formula <- Top_5_AdjR2_formulas[[2]]
}
Final.Glm_function <- function(formula, data){
lm(formula, data)
}
Final.Glm.Adj_R2 <- Final.Glm_function(Best_AdjR2_formula , TRAIN)
Final.Glm.RMSE <- Final.Glm_function(Best_RMSE_formula, TRAIN)
TEST.scored <- TEST
TEST.scored$estimate <- predict(Final.Glm.Adj_R2, TEST)
TEST.Adj_R2 <- TEST.scored %>%
mutate(model = "Adj_R2")
TEST.scored$estimate <- predict(Final.Glm.RMSE, TEST)
TEST.RMSE <- TEST.scored %>%
mutate(model = "RMSE")
TEST_stacked <- bind_rows(TEST.Adj_R2, TEST.RMSE)
TEST_stacked %>%
group_by(model) %>%
yardstick::rmse(truth=lbxglu, estimate)
TEST_stacked %>%
group_by(model) %>%
mutate(error = estimate - lbxglu) %>%
arrange(error) %>%
mutate(order_n = row_number()) %>%
ggplot(aes(x=order_n,
y=error,
color=model)) +
geom_point() +
geom_line() +
facet_wrap(~model)
TEST_stacked
TEST_stacked.RMSE.byGroup <- TEST_stacked %>%
group_by(model, riagendr.2, ridreth1.2, dmdeduc2.2, dmdmartl.2, indhhin2.2, diq010.2) %>%
yardstick::rmse(truth=lbxglu, estimate)
TEST_stacked.RMSE.byGroup %>% glimpse()
TEST_stacked.RMSE.byGroup %>%
group_by(model) %>%
ggplot(aes(x=riagendr.2,
y=.estimate,
fill= model )) +
geom_bar(stat='identity', position = 'dodge') +
facet_wrap(~ridreth1.2+diq010.2)
TEST_stacked %>%
group_by(model, riagendr.2, ridreth1.2, dmdeduc2.2, dmdmartl.2, indhhin2.2, diq010.2) %>%
mutate(error = estimate - lbxglu) %>%
arrange(error) %>%
mutate(order_n = row_number()) %>%
ggplot(aes(x=order_n,
y=error,
color=model)) +
geom_point() +
geom_line() +
facet_wrap(~ridreth1.2+diq010.2)
TEST_stacked %>%
group_by(model) %>%
yardstick::rsq(truth=lbxglu, estimate)