\(~\)
\(~\)
\(~\)
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## precision, recall
## The following object is masked from 'package:readr':
##
## spec
library('ggplot2')
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
na.omit()
glimpse(diab_pop)
## Rows: 1,876
## Columns: 10
## $ seqn <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83...
## $ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Fe...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, O...
## $ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades...
## $ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Wido...
## $ indhhin2 <fct> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "...
## $ bmxbmi <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010 <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabet...
## $ lbxglu <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...
levels(diab_pop$indhhin2)
## [1] "$0-$4,999" "$5,000-$9,999" "$10,000-$14,999"
## [4] "$15,000-$19,999" "$20,000-$24,999" "$25,000-$34,999"
## [7] "$35,000-$44,999" "$45,000-$54,999" "$55,000-$64,999"
## [10] "$65,000-$74,999" "20,000+" "less than $20,000"
## [13] "$75,000-$99,999" "$100,000+"
income_levels <- levels(diab_pop$indhhin2)
levels <- c("$0-$4,999",
"$5,000-$9,999",
"$10,000-$14,999",
"$15,000-$19,999",
"less than $20,000",
"20,000+",
"$20,000-$24,999",
"$25,000-$34,999",
"$35,000-$44,999",
"$45,000-$54,999",
"$55,000-$64,999",
"$65,000-$74,999",
"$75,000-$99,999",
"$100,000+"
)
setdiff(income_levels, levels)
## character(0)
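setdiff() only checks one direction; a quick sketch of the reverse check (using the same two vectors) confirms the hand-written vector adds nothing that is not already a level. Both directions coming back empty means the two vectors contain exactly the same levels, only in a different order.
setdiff(levels, income_levels)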
diab_pop$indhhin2 <- factor(diab_pop$indhhin2 ,
levels=levels,
ordered = TRUE)
ordered_levels <- levels(diab_pop$indhhin2)
glimpse(diab_pop)
## Rows: 1,876
## Columns: 10
## $ seqn <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83...
## $ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Fe...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, O...
## $ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades...
## $ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Wido...
## $ indhhin2 <ord> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "...
## $ bmxbmi <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010 <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabet...
## $ lbxglu <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...
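As a quick sanity check (a sketch; nothing later depends on it), min() and max() are defined for ordered factors, so the lowest and highest income bands present should come back in the intended order:
min(diab_pop$indhhin2)
max(diab_pop$indhhin2)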
feature_names <- c('riagendr' , 'ridreth1' , 'dmdeduc2' , 'dmdmartl' , 'indhhin2' , 'lbxglu', 'diq010')
feature_names_plus <- paste(feature_names, collapse = ' + ' )
feature_names_plus
## [1] "riagendr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + lbxglu + diq010"
formula_1 <- as.formula(paste0('bmxbmi ~ ',feature_names_plus))
formula_1
## bmxbmi ~ riagendr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 +
## lbxglu + diq010
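For reference, base R's reformulate() builds the same kind of formula directly from a character vector of term labels; a minimal sketch, assuming the same feature_names as above:
formula_alt <- reformulate(termlabels = feature_names, response = 'bmxbmi')
formula_alt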
# THIS IS NOT A GREAT IDEA in general
options(warn=-1)
# I have this on because there is an expected warning,
##   "prediction from a rank-deficient fit may be misleading",
## and without this option the output is very difficult to read
# caret Train glm Function
Train_Glm_Iteration <- function(data){
TrainInd <- createDataPartition(data$bmxbmi,
p =.7,
list=FALSE)
TRAIN <- data[TrainInd, ]
gml_control <- trainControl(
method = 'cv',
number = 22,
preProcOptions = c("zv","corr",'center','scale',"conditionalX")
)
gml.model <- train(as.formula(formula_1) ,
method='glm',
data =TRAIN,
trControl=gml_control,
family = "gaussian"
)
CoEff <- as_tibble(gml.model$finalModel$coefficients, rownames="feature") %>%
rename(coeff = value)
TEST <- data[-TrainInd,]
estimate <- as_tibble(predict(gml.model, TEST,'raw')) %>%
rename(estimate= value)
TEST.scored <- cbind(TEST, estimate)
RMSE <- TEST.scored %>%
rmse(truth=bmxbmi , estimate)
return(list(Training_Data = TRAIN,
gml.model = gml.model,
CoEff = CoEff,
TEST.scored =TEST.scored,
RMSE_TEST = RMSE))
}
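The function's test-set metric is yardstick's rmse(), the usual root mean squared error over the held-out rows:
\[
\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2}
\]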
Id <- sample(diab_pop$seqn, nrow(diab_pop)*.3, replace=F)
length(Id)
## [1] 562
t1 <- diab_pop %>%
filter(seqn %in% Id)
dim(t1)
## [1] 562 10
X1 <- Train_Glm_Iteration(t1)
str(X1,1)
## List of 5
## $ Training_Data:'data.frame': 394 obs. of 10 variables:
## ..- attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
## .. ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...
## $ gml.model :List of 24
## ..- attr(*, "class")= chr [1:2] "train" "train.formula"
## $ CoEff : tibble [30 x 2] (S3: tbl_df/tbl/data.frame)
## $ TEST.scored :'data.frame': 168 obs. of 11 variables:
## $ RMSE_TEST : tibble [1 x 3] (S3: tbl_df/tbl/data.frame)
nrow(X1$Training_Data) + nrow(X1$TEST.scored) == nrow(t1)
## [1] TRUE
arsenal::comparedf(X1$Training_Data, X1$TEST.scored, by=c('seqn'))
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = X1$Training_Data, y = X1$TEST.scored,
## by = c("seqn"))
##
## Shared: 9 non-by variables and 0 observations.
## Not shared: 1 variables and 562 observations.
##
## Differences found in 0/9 variables compared.
## 0 variables compared have non-identical attributes.
X1.comparedf <- arsenal::comparedf(X1$Training_Data, X1$TEST.scored, by=c('seqn'))
sum.X1.comparedf <- summary(X1.comparedf)
sum.X1.comparedf$comparison.summary.table
## statistic value
## 1 Number of by-variables 1
## 2 Number of non-by variables in common 9
## 3 Number of variables compared 9
## 4 Number of variables in x but not y 0
## 5 Number of variables in y but not x 1
## 6 Number of variables compared with some values unequal 0
## 7 Number of variables compared with all values equal 9
## 8 Number of observations in common 0
## 9 Number of observations in x but not y 394
## 10 Number of observations in y but not x 168
## 11 Number of observations with some compared variables unequal 0
## 12 Number of observations with all compared variables equal 0
## 13 Number of values unequal 0
rsq(X1$TEST.scored,
truth =bmxbmi , estimate)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0713
rsq_trad(X1$TEST.scored,
truth =bmxbmi , estimate)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq_trad standard 0.0503
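The two metrics are not the same quantity: rsq() is the squared correlation between truth and estimate, while rsq_trad() is the traditional sum-of-squares form, which is why the two test-set numbers above differ:
\[
R^2_{\mathrm{rsq}} = \operatorname{cor}(y, \hat{y})^2,
\qquad
R^2_{\mathrm{trad}} = 1 - \frac{\sum_i \left(y_i - \hat{y}_i\right)^2}{\sum_i \left(y_i - \bar{y}\right)^2}
\]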
summary(lm(formula_1,
X1$Training_Data %>%
mutate_if(is.numeric ,scale)
))$r.squared
## [1] 0.1492866
Adj_R2_est <- function(r2_est,
num_features,
num_records){
n <- num_records
p <- num_features
X <- 1-r2_est
Y <- (n-1)
Z <- (n-p-1)
adj_r2_est = 1 - X*(Y/Z)
return(adj_r2_est)
}
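The helper implements the standard adjusted R-squared correction for n records and p features:
\[
R^2_{\mathrm{adj}} = 1 - \left(1 - R^2\right)\frac{n - 1}{n - p - 1}
\]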
r2_est <- (rsq(X1$Training_Data,
truth =bmxbmi , predict(X1$gml.model,X1$Training_Data)))$.estimate
r2_est
## [1] 0.1492866
Adj_R2_est(r2_est,
length(feature_names),
nrow(X1$Training_Data))
## [1] 0.1338591
summary(lm(formula_1,
X1$Training_Data %>%
mutate_if(is.numeric ,scale)
))$adj.r.squared
## [1] 0.08652905
X1.glm <- glm(formula_1,
X1$Training_Data %>% mutate_if(is.numeric ,scale),
family = "gaussian")
#install.packages('rsq')
rsq::rsq(X1.glm, adj=TRUE)
## [1] 0.08652905
rsq::rsq(X1.glm, adj=FALSE)
## [1] 0.1492866
Id2 <- sample(diab_pop$seqn, nrow(diab_pop)*.5, replace=F)
t2 <- diab_pop %>%
filter(seqn %in% Id2)
X2 <- Train_Glm_Iteration(t2)
Swan <- diab_pop %>%
filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' & ridreth1 == 'Non-Hispanic White')
Id3 <- sample(Swan$seqn, nrow(Swan)*.8, replace=F)
t3 <- diab_pop %>%
filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' & ridreth1 == 'Non-Hispanic White') %>%
filter(seqn %in% Id3)
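Since Id3 was sampled from Swan, the same subset can also be built from Swan directly; an equivalent sketch:
t3_alt <- Swan %>%
  filter(seqn %in% Id3)
dim(t3_alt) # should match dim(t3)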
t3 %>%
group_by(indhhin2,riagendr) %>%
summary(cnt=n_distinct(seqn))
## seqn riagendr ridageyr ridreth1
## Min. :84166 Male : 0 Min. :21.00 MexicanAmerican : 0
## 1st Qu.:85920 Female:29 1st Qu.:41.00 Other Hispanic : 0
## Median :88903 Median :50.00 Non-Hispanic White:29
## Mean :88496 Mean :52.07 Non-Hispanic Black: 0
## 3rd Qu.:90932 3rd Qu.:64.00 Other : 0
## Max. :93258 Max. :80.00
##
## dmdeduc2 dmdmartl
## Less than 9th grade : 2 Married :18
## Grades 9-11th : 1 Widowed : 4
## High school graduate/GED : 4 Divorced : 2
## Some college or AA degrees:12 Separated : 0
## College grad or above :10 Never married : 1
## Living with partner: 4
##
## indhhin2 bmxbmi diq010 lbxglu
## $75,000-$99,999 :29 Min. :16.70 Diabetes : 2 Min. : 80.0
## $0-$4,999 : 0 1st Qu.:25.30 No Diabetes:27 1st Qu.: 93.0
## $5,000-$9,999 : 0 Median :28.10 Median :100.0
## $10,000-$14,999 : 0 Mean :30.64 Mean :100.9
## $15,000-$19,999 : 0 3rd Qu.:36.30 3rd Qu.:105.0
## less than $20,000: 0 Max. :63.60 Max. :134.0
## (Other) : 0
X3 <- Train_Glm_Iteration(t3)
Id4 <- sample(diab_pop$seqn, nrow(diab_pop)*.9, replace=F)
t4 <- diab_pop %>%
filter(seqn %in% Id4)
X4 <- Train_Glm_Iteration(t4)
M_union <- union(Id2,Id3)
Id5 <- setdiff(diab_pop$seqn, M_union)
t5 <- diab_pop %>%
filter(seqn %in% Id5)
X5 <- Train_Glm_Iteration(t5)
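Once the five subsets exist, the five model runs could also be collected in one named list rather than five separate objects; a compact sketch using purrr (loaded with the tidyverse). Note that each call draws a fresh random partition, so the numbers would differ from X1 through X5:
subsets <- list(t1 = t1, t2 = t2, t3 = t3, t4 = t4, t5 = t5)
fits <- purrr::map(subsets, Train_Glm_Iteration)
# e.g. fits$t3$RMSE_TEST has the same structure as X3$RMSE_TEST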
str(X2$Training_Data)
## 'data.frame': 659 obs. of 10 variables:
## $ seqn : num 83733 83750 83789 83790 83799 ...
## $ riagendr: Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 1 2 ...
## $ ridageyr: num 53 45 66 56 37 80 29 37 41 38 ...
## $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 5 3 3 2 2 1 3 4 5 ...
## $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 3 2 5 1 4 1 1 3 4 4 ...
## $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 3 5 6 1 1 2 5 1 1 1 ...
## $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 4 12 12 4 13 3 3 10 14 13 ...
## $ bmxbmi : num 30.8 24.1 34 24.4 25.5 28.5 29.7 35.3 40.7 21.8 ...
## $ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 2 2 2 2 2 2 2 2 ...
## $ lbxglu : num 101 84 113 397 100 119 102 79 110 89 ...
## - attr(*, "na.action")= 'omit' Named int 1 4 5 7 8 9 10 12 16 18 ...
## ..- attr(*, "names")= chr "1" "4" "5" "7" ...
str(X3$Training_Data)
## 'data.frame': 22 obs. of 10 variables:
## $ seqn : num 84166 84511 85093 85443 85920 ...
## $ riagendr: Factor w/ 2 levels "Male","Female": 2 2 2 2 2 2 2 2 2 2 ...
## $ ridageyr: num 67 78 61 48 61 35 40 64 73 22 ...
## $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 5 5 2 3 5 5 5 5 5 5 ...
## $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 1 2 3 1 1 1 1 1 1 6 ...
## $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 13 13 13 13 13 13 13 13 13 13 ...
## $ bmxbmi : num 26.1 23.1 36.2 27.1 42.7 28.9 23.5 39.3 28.3 16.7 ...
## $ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 2 2 2 2 2 1 2 2 ...
## $ lbxglu : num 134 99 92 97 108 100 104 128 105 84 ...
## - attr(*, "na.action")= 'omit' Named int 1 4 5 7 8 9 10 12 16 18 ...
## ..- attr(*, "names")= chr "1" "4" "5" "7" ...
str(X5$Training_Data)
## 'data.frame': 648 obs. of 10 variables:
## $ seqn : num 83754 83755 83757 83787 83809 ...
## $ riagendr: Factor w/ 2 levels "Male","Female": 2 1 2 2 2 1 2 2 1 1 ...
## $ ridageyr: num 67 67 57 68 20 70 39 49 35 40 ...
## $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 2 4 2 1 4 3 1 3 1 4 ...
## $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 5 5 1 1 3 5 3 3 3 4 ...
## $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 1 2 4 3 5 6 1 1 1 5 ...
## $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 8 7 7 4 13 12 4 14 13 13 ...
## $ bmxbmi : num 43.7 28.8 35.4 33.5 26.2 27 27.2 27.4 31.1 30.7 ...
## $ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 2 1 1 2 2 2 2 2 2 2 ...
## $ lbxglu : num 130 284 398 111 94 94 101 126 97 90 ...
## - attr(*, "na.action")= 'omit' Named int 1 4 5 7 8 9 10 12 16 18 ...
## ..- attr(*, "names")= chr "1" "4" "5" "7" ...
arsenal::comparedf(X3$Training_Data,
X5$Training_Data)
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = X3$Training_Data, y = X5$Training_Data)
##
## Shared: 10 non-by variables and 22 observations.
## Not shared: 0 variables and 626 observations.
##
## Differences found in 10/10 variables compared.
## 0 variables compared have non-identical attributes.
CoEff_compare <- bind_rows(X1$CoEff %>% mutate(strat = 't1'),
X2$CoEff %>% mutate(strat = 't2'),
X3$CoEff %>% mutate(strat = 't3'),
X4$CoEff %>% mutate(strat = 't4'),
X5$CoEff %>% mutate(strat = 't5'))
glimpse(CoEff_compare)
## Rows: 150
## Columns: 3
## $ feature <chr> "(Intercept)", "riagendrFemale", "`ridreth1Other Hispanic`"...
## $ coeff <dbl> 27.02949669, 1.44742261, -2.10550076, -1.35430170, 0.477308...
## $ strat <chr> "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1",...
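A small follow-up sketch: the per-feature spread of the coefficients across the five runs, which is the same information the boxplot below shows graphically (rank-deficient fits can leave some coefficients NA, hence na.rm = TRUE):
CoEff_compare %>%
  group_by(feature) %>%
  summarise(mean_coeff = mean(coeff, na.rm = TRUE),
            sd_coeff = sd(coeff, na.rm = TRUE)) %>%
  arrange(desc(sd_coeff))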
library('ggplot2')
CoEff_compare %>%
group_by(strat) %>%
ggplot(aes(x=feature, y=coeff)) +
geom_point() +
coord_flip() +
facet_wrap(.~strat)
CoEff_compare %>%
ggplot(aes(x=feature, y=coeff)) +
geom_boxplot() +
coord_flip()
RMSE <- bind_rows(X1$RMSE_TEST %>% mutate(strat = 't1'),
X2$RMSE_TEST %>% mutate(strat = 't2'),
X3$RMSE_TEST %>% mutate(strat = 't3'),
X4$RMSE_TEST %>% mutate(strat = 't4'),
X5$RMSE_TEST %>% mutate(strat = 't5'))
RMSE
## # A tibble: 5 x 4
## .metric .estimator .estimate strat
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 6.67 t1
## 2 rmse standard 6.44 t2
## 3 rmse standard 11.8 t3
## 4 rmse standard 7.02 t4
## 5 rmse standard 6.57 t5
mean(RMSE$.estimate)
## [1] 7.69929
var(RMSE$.estimate)
## [1] 5.257416
f2 <- diab_pop %>%
anti_join(t2 %>% select(seqn))
## Joining, by = "seqn"
nrow(diab_pop) #1876
## [1] 1876
nrow(t2) #938
## [1] 938
nrow(f2) #938
## [1] 938
arsenal::comparedf(t2,f2,by='seqn')
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = t2, y = f2, by = "seqn")
##
## Shared: 9 non-by variables and 0 observations.
## Not shared: 0 variables and 1876 observations.
##
## Differences found in 0/9 variables compared.
## 0 variables compared have non-identical attributes.
Test2.estimate <- predict(X2$gml.model, f2)
Test2.Scored <- cbind(f2,Test2.estimate)
Test2.Scored %>%
rmse(truth=bmxbmi, Test2.estimate)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 6.71
Test2.Scored %>%
group_by(indhhin2) %>%
rmse(truth=bmxbmi, Test2.estimate)
## # A tibble: 12 x 4
## indhhin2 .metric .estimator .estimate
## <ord> <chr> <chr> <dbl>
## 1 $0-$4,999 rmse standard 8.64
## 2 $5,000-$9,999 rmse standard 5.89
## 3 $10,000-$14,999 rmse standard 6.32
## 4 $15,000-$19,999 rmse standard 6.44
## 5 less than $20,000 rmse standard 8.18
## 6 20,000+ rmse standard 6.31
## 7 $20,000-$24,999 rmse standard 6.80
## 8 $25,000-$34,999 rmse standard 7.22
## 9 $45,000-$54,999 rmse standard 7.17
## 10 $65,000-$74,999 rmse standard 5.90
## 11 $75,000-$99,999 rmse standard 7.23
## 12 $100,000+ rmse standard 5.79
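The same grouped pattern works for any other yardstick metric, for example mean absolute error; a sketch:
Test2.Scored %>%
  group_by(indhhin2) %>%
  mae(truth = bmxbmi, Test2.estimate)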
f5 <- diab_pop %>%
anti_join(t5 %>% select(seqn))
## Joining, by = "seqn"
nrow(diab_pop) #1876
## [1] 1876
nrow(t5)
## [1] 924
nrow(f5)
## [1] 952
arsenal::comparedf(t5,f5,by='seqn')
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = t5, y = f5, by = "seqn")
##
## Shared: 9 non-by variables and 0 observations.
## Not shared: 0 variables and 1876 observations.
##
## Differences found in 0/9 variables compared.
## 0 variables compared have non-identical attributes.
Test3.estimate <- predict(X3$gml.model, f5)
Test3.Scored <- cbind(f5, Test3.estimate)
arsenal::comparedf(t2,f5,by='seqn')
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = t2, y = f5, by = "seqn")
##
## Shared: 9 non-by variables and 938 observations.
## Not shared: 0 variables and 14 observations.
##
## Differences found in 0/9 variables compared.
## 0 variables compared have non-identical attributes.
Test3.estimate <- predict(X2$gml.model, f5)
Test3.Scored.2 <- cbind(f5, Test3.estimate)
Test3.Scored.Stack <- rbind(Test3.Scored %>% mutate(strat=3),
Test3.Scored.2 %>% mutate(strat=2))
Test3.Scored.Stack %>%
group_by(strat) %>%
rmse(truth=bmxbmi, Test3.estimate)
## # A tibble: 2 x 4
## strat .metric .estimator .estimate
## <dbl> <chr> <chr> <dbl>
## 1 2 rmse standard 6.65
## 2 3 rmse standard 20.7
Error_Rates_by_model_by_class <- Test3.Scored.Stack %>%
mutate(strat = case_when(
strat ==3 ~ "black_swan",
strat ==2 ~ "random"
)) %>%
group_by(strat, indhhin2, diq010, riagendr) %>%
rmse(truth=bmxbmi, Test3.estimate) %>%
arrange(desc(.estimate)) %>%
rename(RMSE_est = .estimate)
Error_Rates_by_model_by_class
## # A tibble: 96 x 7
## strat indhhin2 diq010 riagendr .metric .estimator RMSE_est
## <chr> <ord> <fct> <fct> <chr> <chr> <dbl>
## 1 black_swan less than $20,000 Diabetes Male rmse standard 111.
## 2 black_swan $75,000-$99,999 Diabetes Female rmse standard 50.7
## 3 black_swan $25,000-$34,999 Diabetes Female rmse standard 47.8
## 4 black_swan $25,000-$34,999 Diabetes Male rmse standard 47.7
## 5 black_swan $15,000-$19,999 Diabetes Male rmse standard 47.4
## 6 black_swan $65,000-$74,999 Diabetes Female rmse standard 45.7
## 7 black_swan $20,000-$24,999 Diabetes Female rmse standard 44.3
## 8 black_swan $20,000-$24,999 Diabetes Male rmse standard 40.5
## 9 black_swan $100,000+ Diabetes Male rmse standard 38.4
## 10 black_swan $45,000-$54,999 Diabetes Female rmse standard 37.9
## # ... with 86 more rows
summary(Error_Rates_by_model_by_class$RMSE_est)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.291 6.086 11.131 16.275 20.717 110.952
options(warn=0)
Error_Rates_by_model_by_class %>%
group_by(strat, diq010, indhhin2, riagendr) %>%
ggplot(aes(x=strat,
y=RMSE_est,
fill=indhhin2,
label=diq010)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip() +
facet_wrap( ~ diq010 + riagendr)
Test3.Scored.Stack %>%
mutate(strat = case_when(
strat ==3 ~ "black_swan",
strat ==2 ~ "random"
)) %>%
group_by(strat, indhhin2, diq010, ridreth1) %>%
rmse(truth=bmxbmi, Test3.estimate) %>%
arrange(desc(.estimate)) %>%
rename(RMSE_est = .estimate) %>%
group_by(strat, diq010, indhhin2, ridreth1) %>%
ggplot(aes(x=strat,
y=RMSE_est,
fill=indhhin2,
label=diq010)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip() +
facet_wrap( ~ ridreth1 + diq010)
\(~\)
\(~\)
library('tidyverse')
library('caret')
library('yardstick')
library('ggplot2')
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
na.omit()
glimpse(diab_pop)
levels(diab_pop$indhhin2)
income_levels <- levels(diab_pop$indhhin2)
levels <- c("$0-$4,999",
"$5,000-$9,999",
"$10,000-$14,999",
"$15,000-$19,999",
"less than $20,000",
"20,000+",
"$20,000-$24,999",
"$25,000-$34,999",
"$35,000-$44,999",
"$45,000-$54,999",
"$55,000-$64,999",
"$65,000-$74,999",
"$75,000-$99,999",
"$100,000+"
)
setdiff(income_levels, levels)
diab_pop$indhhin2 <- factor(diab_pop$indhhin2 ,
levels=levels,
ordered = TRUE)
ordered_levels <- levels(diab_pop$indhhin2)
glimpse(diab_pop)
feature_names <- c('riagendr' , 'ridreth1' , 'dmdeduc2' , 'dmdmartl' , 'indhhin2' , 'lbxglu', 'diq010')
feature_names_plus <- paste(feature_names, collapse = ' + ' )
feature_names_plus
formula_1 <- as.formula(paste0('bmxbmi ~ ',feature_names_plus))
formula_1
# THIS IS NOT A GREAT IDEA in general
options(warn=-1)
# I have this on because there is an expected warning,
##   "prediction from a rank-deficient fit may be misleading",
## and without this option the output is very difficult to read
Train_Glm_Iteration <- function(data){
TrainInd <- createDataPartition(data$bmxbmi,
p =.7,
list=FALSE)
TRAIN <- data[TrainInd, ]
gml_control <- trainControl(
method = 'cv',
number = 22,
preProcOptions = c("zv","corr",'center','scale',"conditionalX")
)
gml.model <- train(as.formula(formula_1) ,
method='glm',
data =TRAIN,
trControl=gml_control,
family = "gaussian"
)
CoEff <- as_tibble(gml.model$finalModel$coefficients, rownames="feature") %>%
rename(coeff = value)
TEST <- data[-TrainInd,]
estimate <- as_tibble(predict(gml.model, TEST,'raw')) %>%
rename(estimate= value)
TEST.scored <- cbind(TEST, estimate)
RMSE <- TEST.scored %>%
rmse(truth=bmxbmi , estimate)
return(list(Training_Data = TRAIN,
gml.model = gml.model,
CoEff = CoEff,
TEST.scored =TEST.scored,
RMSE_TEST = RMSE))
}
Id <- sample(diab_pop$seqn, nrow(diab_pop)*.3, replace=F)
length(Id)
t1 <- diab_pop %>%
filter(seqn %in% Id)
dim(t1)
X1 <- Train_Glm_Iteration(t1)
str(X1,1)
nrow(X1$Training_Data) + nrow(X1$TEST.scored) == nrow(t1)
arsenal::comparedf(X1$Training_Data, X1$TEST.scored, by=c('seqn'))
X1.comparedf <- arsenal::comparedf(X1$Training_Data, X1$TEST.scored, by=c('seqn'))
sum.X1.comparedf <- summary(X1.comparedf)
sum.X1.comparedf$comparison.summary.table
rsq(X1$TEST.scored,
truth =bmxbmi , estimate)
rsq_trad(X1$TEST.scored,
truth =bmxbmi , estimate)
summary(lm(formula_1,
X1$Training_Data %>%
mutate_if(is.numeric ,scale)
))$r.squared
Adj_R2_est <- function(r2_est,
num_features,
num_records){
n <- num_records
p <- num_features
X <- 1-r2_est
Y <- (n-1)
Z <- (n-p-1)
adj_r2_est = 1 - X*(Y/Z)
return(adj_r2_est)
}
r2_est <- (rsq(X1$Training_Data,
truth =bmxbmi , predict(X1$gml.model,X1$Training_Data)))$.estimate
r2_est
Adj_R2_est(r2_est,
length(feature_names),
nrow(X1$Training_Data))
summary(lm(formula_1,
X1$Training_Data %>%
mutate_if(is.numeric ,scale)
))$adj.r.squared
X1.glm <- glm(formula_1,
X1$Training_Data %>% mutate_if(is.numeric ,scale),
family = "gaussian")
#install.packages('rsq')
rsq::rsq(X1.glm, adj=TRUE)
rsq::rsq(X1.glm, adj=FALSE)
Id2 <- sample(diab_pop$seqn, nrow(diab_pop)*.5, replace=F)
t2 <- diab_pop %>%
filter(seqn %in% Id2)
X2 <- Train_Glm_Iteration(t2)
Swan <- diab_pop %>%
filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' & ridreth1 == 'Non-Hispanic White')
Id3 <- sample(Swan$seqn, nrow(Swan)*.8, replace=F)
t3 <- diab_pop %>%
filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' & ridreth1 == 'Non-Hispanic White') %>%
filter(seqn %in% Id3)
t3 %>%
group_by(indhhin2,riagendr) %>%
summary(cnt=n_distinct(seqn))
X3 <- Train_Glm_Iteration(t3)
Id4 <- sample(diab_pop$seqn, nrow(diab_pop)*.9, replace=F)
t4 <- diab_pop %>%
filter(seqn %in% Id4)
X4 <- Train_Glm_Iteration(t4)
M_union <- union(Id2,Id3)
Id5 <- setdiff(diab_pop$seqn, M_union)
t5 <- diab_pop %>%
filter(seqn %in% Id5)
X5 <- Train_Glm_Iteration(t5)
str(X2$Training_Data)
str(X3$Training_Data)
str(X5$Training_Data)
arsenal::comparedf(X3$Training_Data,
X5$Training_Data)
CoEff_compare <- bind_rows(X1$CoEff %>% mutate(strat = 't1'),
X2$CoEff %>% mutate(strat = 't2'),
X3$CoEff %>% mutate(strat = 't3'),
X4$CoEff %>% mutate(strat = 't4'),
X5$CoEff %>% mutate(strat = 't5'))
glimpse(CoEff_compare)
library('ggplot2')
CoEff_compare %>%
group_by(strat) %>%
ggplot(aes(x=feature, y=coeff)) +
geom_point() +
coord_flip() +
facet_wrap(.~strat)
CoEff_compare %>%
ggplot(aes(x=feature, y=coeff)) +
geom_boxplot() +
coord_flip()
RMSE <- bind_rows(X1$RMSE_TEST %>% mutate(strat = 't1'),
X2$RMSE_TEST %>% mutate(strat = 't2'),
X3$RMSE_TEST %>% mutate(strat = 't3'),
X4$RMSE_TEST %>% mutate(strat = 't4'),
X5$RMSE_TEST %>% mutate(strat = 't5'))
RMSE
mean(RMSE$.estimate)
var(RMSE$.estimate)
f2 <- diab_pop %>%
anti_join(t2 %>% select(seqn))
nrow(diab_pop) #1876
nrow(t2) #938
nrow(f2) #938
arsenal::comparedf(t2,f2,by='seqn')
Test2.estimate <- predict(X2$gml.model, f2)
Test2.Scored <- cbind(f2,Test2.estimate)
Test2.Scored %>%
rmse(truth=bmxbmi, Test2.estimate)
Test2.Scored %>%
group_by(indhhin2) %>%
rmse(truth=bmxbmi, Test2.estimate)
f5 <- diab_pop %>%
anti_join(t5 %>% select(seqn))
nrow(diab_pop) #1876
nrow(t5)
nrow(f5)
arsenal::comparedf(t5,f5,by='seqn')
Test3.estimate <- predict(X3$gml.model, f5)
Test3.Scored <- cbind(f5, Test3.estimate)
arsenal::comparedf(t2,f5,by='seqn')
Test3.estimate <- predict(X2$gml.model, f5)
Test3.Scored.2 <- cbind(f5, Test3.estimate)
Test3.Scored.Stack <- rbind(Test3.Scored %>% mutate(strat=3),
Test3.Scored.2 %>% mutate(strat=2))
Test3.Scored.Stack %>%
group_by(strat) %>%
rmse(truth=bmxbmi, Test3.estimate)
Error_Rates_by_model_by_class <- Test3.Scored.Stack %>%
mutate(strat = case_when(
strat ==3 ~ "black_swan",
strat ==2 ~ "random"
)) %>%
group_by(strat, indhhin2, diq010, riagendr) %>%
rmse(truth=bmxbmi, Test3.estimate) %>%
arrange(desc(.estimate)) %>%
rename(RMSE_est = .estimate)
Error_Rates_by_model_by_class
summary(Error_Rates_by_model_by_class$RMSE_est)
options(warn=0)
Error_Rates_by_model_by_class %>%
group_by(strat, diq010, indhhin2, riagendr) %>%
ggplot(aes(x=strat,
y=RMSE_est,
fill=indhhin2,
label=diq010)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip() +
facet_wrap( ~ diq010 + riagendr)
Test3.Scored.Stack %>%
mutate(strat = case_when(
strat ==3 ~ "black_swan",
strat ==2 ~ "random"
)) %>%
group_by(strat, indhhin2, diq010, ridreth1) %>%
rmse(truth=bmxbmi, Test3.estimate) %>%
arrange(desc(.estimate)) %>%
rename(RMSE_est = .estimate) %>%
group_by(strat, diq010, indhhin2, ridreth1) %>%
ggplot(aes(x=strat,
y=RMSE_est,
fill=indhhin2,
label=diq010)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip() +
facet_wrap( ~ ridreth1 + diq010)