caret
glm
logit\(~\)
\(~\)
\(~\)
set.seed(1701)
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## precision, recall
## The following object is masked from 'package:readr':
##
## spec
library('ggplot2')
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
na.omit()
glimpse(diab_pop)
## Rows: 1,876
## Columns: 10
## $ seqn <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83...
## $ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Fe...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, O...
## $ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades...
## $ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Wido...
## $ indhhin2 <fct> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "...
## $ bmxbmi <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010 <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabet...
## $ lbxglu <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...
levels(diab_pop$indhhin2)
## [1] "$0-$4,999" "$5,000-$9,999" "$10,000-$14,999"
## [4] "$15,000-$19,999" "$20,000-$24,999" "$25,000-$34,999"
## [7] "$35,000-$44,999" "$45,000-$54,999" "$55,000-$64,999"
## [10] "$65,000-$74,999" "20,000+" "less than $20,000"
## [13] "$75,000-$99,999" "$100,000+"
income_levels <- levels(diab_pop$indhhin2)
levels = c("$0-$4,999",
"$5,000-$9,999",
"$10,000-$14,999",
"$15,000-$19,999",
"less than $20,000",
"20,000+",
"$20,000-$24,999",
"$25,000-$34,999",
"$35,000-$44,999",
"$45,000-$54,999",
"$55,000-$64,999",
"$65,000-$74,999",
"$75,000-$99,999",
"$100,000+"
)
setdiff(income_levels, levels)
## character(0)
diab_pop$indhhin2 <- factor(diab_pop$indhhin2 ,
levels=levels,
ordered = TRUE)
odered_levels <- levels(diab_pop$indhhin2)
glimpse(diab_pop)
## Rows: 1,876
## Columns: 10
## $ seqn <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83...
## $ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Fe...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, O...
## $ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades...
## $ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Wido...
## $ indhhin2 <ord> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "...
## $ bmxbmi <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010 <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabet...
## $ lbxglu <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...
feature_names <- c('riagendr' , 'ridreth1' , 'dmdeduc2' , 'dmdmartl' , 'indhhin2' , 'lbxglu','bmxbmi')
feature_names_plus <- paste(feature_names, collapse = ' + ' )
feature_names_plus
## [1] "riagendr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + lbxglu + bmxbmi"
formula_1 <- as.formula(paste0('diq010 ~ ',feature_names_plus))
formula_1
## diq010 ~ riagendr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 +
## lbxglu + bmxbmi
# THIS IS NOT A GREAT IDEA
options(warn=-1)
# I have this on, there is an expected warning
## "prediction from a rank-deficient fit may be misleading"
## without this option on the output is very difficult to read
caret
glm
logit train
function
Train_Glm_Iteration <- function(data){
TrainInd <- createDataPartition(data$diq010,
p =.7,
list=FALSE)
TRAIN <- data[TrainInd, ]
bootstrap <- trainControl(method="boot", number=42)
gml.model <- train(as.formula(formula_1),
method='glm',
data =TRAIN,
family='binomial',
preProcess = c('center','scale'),
trControl=bootstrap
)
CoEff <- as_tibble(gml.model$finalModel$coefficients, rownames="feature") %>%
rename(coeff = value)
TEST <- data[-TrainInd,]
estimate <- predict(gml.model, TEST,'raw')
prob <- predict(gml.model, TEST,'prob')
TEST.scored <- cbind(TEST, estimate, prob)
return(list(Training_Data = TRAIN,
gml.model = gml.model,
CoEff = CoEff,
TEST.scored =TEST.scored)
)
}
Id <- sample(diab_pop$seqn, nrow(diab_pop)*.3, replace=F)
length(Id)
## [1] 562
t1 <- diab_pop %>%
filter(seqn %in% Id)
dim(t1)
## [1] 562 10
X1 <- Train_Glm_Iteration(t1)
str(X1,1)
## List of 4
## $ Training_Data:'data.frame': 394 obs. of 10 variables:
## ..- attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
## .. ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...
## $ gml.model :List of 24
## ..- attr(*, "class")= chr [1:2] "train" "train.formula"
## $ CoEff : tibble [30 x 2] (S3: tbl_df/tbl/data.frame)
## $ TEST.scored :'data.frame': 168 obs. of 13 variables:
X1$TEST.scored %>%
roc_auc(truth= diq010 , Diabetes)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.739
X1$TEST.scored %>%
roc_curve(truth= diq010 , Diabetes) %>%
autoplot()
conf_matX1 <- X1$TEST.scored %>%
conf_mat(truth= diq010 , estimate)
conf_matX1
## Truth
## Prediction Diabetes No Diabetes
## Diabetes 11 8
## No Diabetes 15 134
sum.conf_matX1 <- summary(conf_matX1)
sum.conf_matX1
## # A tibble: 13 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.863
## 2 kap binary 0.412
## 3 sens binary 0.423
## 4 spec binary 0.944
## 5 ppv binary 0.579
## 6 npv binary 0.899
## 7 mcc binary 0.419
## 8 j_index binary 0.367
## 9 bal_accuracy binary 0.683
## 10 detection_prevalence binary 0.113
## 11 precision binary 0.579
## 12 recall binary 0.423
## 13 f_meas binary 0.489
nrow(X1$Training_Data) + nrow(X1$TEST.scored) == nrow(t1)
## [1] TRUE
X1.comparedf <- arsenal::comparedf(X1$Training_Data, X1$TEST.scored, by=c('seqn'))
sum.X1.comparedf <- summary(X1.comparedf)
sum.X1.comparedf$comparison.summary.table
## statistic value
## 1 Number of by-variables 1
## 2 Number of non-by variables in common 9
## 3 Number of variables compared 9
## 4 Number of variables in x but not y 0
## 5 Number of variables in y but not x 3
## 6 Number of variables compared with some values unequal 0
## 7 Number of variables compared with all values equal 9
## 8 Number of observations in common 0
## 9 Number of observations in x but not y 394
## 10 Number of observations in y but not x 168
## 11 Number of observations with some compared variables unequal 0
## 12 Number of observations with all compared variables equal 0
## 13 Number of values unequal 0
Id2 <- sample(diab_pop$seqn, nrow(diab_pop)*.5, replace=F)
t2 <- diab_pop %>%
filter(seqn %in% Id2)
X2 <- Train_Glm_Iteration(t2)
Swan <- diab_pop %>%
filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' & ridreth1 == 'Non-Hispanic White')
Id3 <- sample(Swan$seqn, nrow(Swan)*.8, replace=F)
t3 <- diab_pop %>%
filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' & ridreth1 == 'Non-Hispanic White') %>%
filter(seqn %in% Id3)
t3 %>% summary()
## seqn riagendr ridageyr ridreth1
## Min. :84166 Male : 0 Min. :21.00 MexicanAmerican : 0
## 1st Qu.:85920 Female:29 1st Qu.:32.00 Other Hispanic : 0
## Median :89041 Median :48.00 Non-Hispanic White:29
## Mean :88741 Mean :50.14 Non-Hispanic Black: 0
## 3rd Qu.:91064 3rd Qu.:61.00 Other : 0
## Max. :92970 Max. :80.00
##
## dmdeduc2 dmdmartl
## Less than 9th grade : 0 Married :19
## Grades 9-11th : 1 Widowed : 3
## High school graduate/GED : 4 Divorced : 2
## Some college or AA degrees:14 Separated : 0
## College grad or above :10 Never married : 1
## Living with partner: 4
##
## indhhin2 bmxbmi diq010 lbxglu
## $75,000-$99,999 :29 Min. :16.70 Diabetes : 1 Min. : 80.0
## $0-$4,999 : 0 1st Qu.:23.70 No Diabetes:28 1st Qu.: 92.0
## $5,000-$9,999 : 0 Median :26.80 Median : 99.0
## $10,000-$14,999 : 0 Mean :30.05 Mean :104.3
## $15,000-$19,999 : 0 3rd Qu.:33.30 3rd Qu.:105.0
## less than $20,000: 0 Max. :63.60 Max. :207.0
## (Other) : 0
X3 <- Train_Glm_Iteration(t3)
Id4 <- sample(diab_pop$seqn, nrow(diab_pop)*.9, replace=F)
t4 <- diab_pop %>%
filter(seqn %in% Id4)
X4 <- Train_Glm_Iteration(t4)
M_union <- union(Id2,Id3)
Id5 <- setdiff(diab_pop$seqn, M_union)
t5 <- diab_pop %>%
filter(seqn %in% Id5)
X5 <- Train_Glm_Iteration(t5)
str(X2$Training_Data)
## 'data.frame': 657 obs. of 10 variables:
## $ seqn : num 83757 83809 83813 83834 83851 ...
## $ riagendr: Factor w/ 2 levels "Male","Female": 2 2 1 1 2 1 2 2 1 2 ...
## $ ridageyr: num 57 20 24 69 37 74 80 80 75 33 ...
## $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 2 4 3 4 3 3 3 3 4 3 ...
## $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 1 3 4 3 3 5 3 4 4 4 ...
## $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 4 5 3 5 1 1 2 1 6 1 ...
## $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 7 13 8 3 10 8 4 10 4 12 ...
## $ bmxbmi : num 35.4 26.2 26.9 28.2 35.3 27.2 23.5 26.9 30.8 25.9 ...
## $ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 1 2 2 2 2 1 2 2 1 2 ...
## $ lbxglu : num 398 94 105 105 79 123 137 110 145 83 ...
## - attr(*, "na.action")= 'omit' Named int 1 4 5 7 8 9 10 12 16 18 ...
## ..- attr(*, "names")= chr "1" "4" "5" "7" ...
str(X3$Training_Data)
## 'data.frame': 21 obs. of 10 variables:
## $ seqn : num 84166 84511 84517 84786 84816 ...
## $ riagendr: Factor w/ 2 levels "Male","Female": 2 2 2 2 2 2 2 2 2 2 ...
## $ ridageyr: num 67 78 80 50 28 61 61 40 73 68 ...
## $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 5 5 4 4 4 2 5 5 5 4 ...
## $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 1 2 2 1 1 3 1 1 1 6 ...
## $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 13 13 13 13 13 13 13 13 13 13 ...
## $ bmxbmi : num 26.1 23.1 26.6 28.9 18.4 36.2 42.7 23.5 28.3 29.6 ...
## $ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 2 2 2 2 2 2 2 2 ...
## $ lbxglu : num 134 99 83 87 93 92 108 104 105 107 ...
## - attr(*, "na.action")= 'omit' Named int 1 4 5 7 8 9 10 12 16 18 ...
## ..- attr(*, "names")= chr "1" "4" "5" "7" ...
str(X5$Training_Data)
## 'data.frame': 647 obs. of 10 variables:
## $ seqn : num 83733 83737 83754 83755 83761 ...
## $ riagendr: Factor w/ 2 levels "Male","Female": 1 2 2 1 2 2 2 1 2 2 ...
## $ ridageyr: num 53 72 67 67 24 68 37 70 20 39 ...
## $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 1 2 4 5 1 2 3 4 1 ...
## $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 3 2 5 5 5 1 4 5 4 3 ...
## $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 3 4 1 2 5 3 1 6 5 1 ...
## $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 4 13 8 7 1 4 13 12 8 4 ...
## $ bmxbmi : num 30.8 28.6 43.7 28.8 25.3 33.5 25.5 27 22.2 27.2 ...
## $ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 2 1 2 2 2 2 2 2 ...
## $ lbxglu : num 101 107 130 284 95 111 100 94 80 101 ...
## - attr(*, "na.action")= 'omit' Named int 1 4 5 7 8 9 10 12 16 18 ...
## ..- attr(*, "names")= chr "1" "4" "5" "7" ...
arsenal::comparedf(X3$Training_Data,
X5$Training_Data)
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = X3$Training_Data, y = X5$Training_Data)
##
## Shared: 10 non-by variables and 21 observations.
## Not shared: 0 variables and 626 observations.
##
## Differences found in 10/10 variables compared.
## 0 variables compared have non-identical attributes.
X1$CoEff
## # A tibble: 30 x 2
## feature coeff
## <chr> <dbl>
## 1 (Intercept) 2.68
## 2 riagendrFemale 0.223
## 3 `ridreth1Other Hispanic` -0.451
## 4 `ridreth1Non-Hispanic White` -0.0250
## 5 `ridreth1Non-Hispanic Black` -0.483
## 6 ridreth1Other -0.337
## 7 `dmdeduc2Grades 9-11th` 0.420
## 8 `dmdeduc2High school graduate/GED` 0.207
## 9 `dmdeduc2Some college or AA degrees` 0.630
## 10 `dmdeduc2College grad or above` 0.980
## # ... with 20 more rows
CoEff_compare <- bind_rows(X1$CoEff %>% mutate(strat = 't1'),
X2$CoEff %>% mutate(strat = 't2'),
X3$CoEff %>% mutate(strat = 't3'),
X4$CoEff %>% mutate(strat = 't4'),
X5$CoEff %>% mutate(strat = 't5'))
glimpse(CoEff_compare)
## Rows: 150
## Columns: 3
## $ feature <chr> "(Intercept)", "riagendrFemale", "`ridreth1Other Hispanic`"...
## $ coeff <dbl> 2.67925964, 0.22277284, -0.45124437, -0.02501180, -0.482570...
## $ strat <chr> "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1",...
CoEff_compare %>%
group_by(strat) %>%
ggplot(aes(x=feature, y=coeff)) +
geom_point() +
coord_flip() +
facet_wrap(.~strat)
CoEff_compare %>%
ggplot(aes(x=feature, y=coeff)) +
geom_boxplot() +
coord_flip()
\(~\)
\(~\)
str(X1,1)
## List of 4
## $ Training_Data:'data.frame': 394 obs. of 10 variables:
## ..- attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
## .. ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...
## $ gml.model :List of 24
## ..- attr(*, "class")= chr [1:2] "train" "train.formula"
## $ CoEff : tibble [30 x 2] (S3: tbl_df/tbl/data.frame)
## $ TEST.scored :'data.frame': 168 obs. of 13 variables:
glimpse(X1$TEST.scored)
## Rows: 168
## Columns: 13
## $ seqn <dbl> 83734, 83755, 83790, 83849, 83908, 83947, 83963, 8415...
## $ riagendr <fct> Male, Male, Male, Male, Male, Female, Female, Female,...
## $ ridageyr <dbl> 78, 67, 56, 71, 51, 33, 44, 53, 44, 34, 64, 55, 23, 4...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic Black, Non-Hispanic ...
## $ dmdeduc2 <fct> High school graduate/GED, College grad or above, Less...
## $ dmdmartl <fct> Married, Widowed, Married, Married, Married, Married,...
## $ indhhin2 <ord> "$20,000-$24,999", "$20,000-$24,999", "$15,000-$19,99...
## $ bmxbmi <dbl> 28.8, 28.8, 24.4, 27.6, 24.7, 25.9, 52.1, 25.3, 32.6,...
## $ diq010 <fct> Diabetes, Diabetes, No Diabetes, Diabetes, No Diabete...
## $ lbxglu <dbl> 84, 284, 397, 76, 102, 83, 109, 86, 102, 98, 134, 116...
## $ estimate <fct> No Diabetes, Diabetes, Diabetes, No Diabetes, No Diab...
## $ Diabetes <dbl> 0.040939395, 0.998934295, 0.999998287, 0.019392257, 0...
## $ `No Diabetes` <dbl> 9.590606e-01, 1.065705e-03, 1.713443e-06, 9.806077e-0...
look <- as_tibble(X1$TEST.scored)
look %>%
roc_auc(truth=diq010, Diabetes)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.739
look %>%
roc_curve(truth=diq010, Diabetes) %>%
autoplot()
look %>%
conf_mat(truth=diq010, estimate)
## Truth
## Prediction Diabetes No Diabetes
## Diabetes 11 8
## No Diabetes 15 134
look %>%
conf_mat(truth=diq010, estimate) %>%
summary()
## # A tibble: 13 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.863
## 2 kap binary 0.412
## 3 sens binary 0.423
## 4 spec binary 0.944
## 5 ppv binary 0.579
## 6 npv binary 0.899
## 7 mcc binary 0.419
## 8 j_index binary 0.367
## 9 bal_accuracy binary 0.683
## 10 detection_prevalence binary 0.113
## 11 precision binary 0.579
## 12 recall binary 0.423
## 13 f_meas binary 0.489
Get_Errors_Function
Get_Errors_Function <- function(data){
look <- data
AUC <- look %>%
roc_auc(truth=diq010, Diabetes)
ROC_CURVE <- look %>%
roc_curve(truth=diq010, Diabetes)
CONF_MAT <- look %>%
conf_mat(truth=diq010, estimate)
SUM.CONF_MAT <- look %>%
conf_mat(truth=diq010, estimate) %>%
summary()
output = list(
AUC = AUC,
ROC_CURVE = ROC_CURVE,
CONF_MAT = CONF_MAT,
SUM.CONF_MAT = SUM.CONF_MAT
)
return(output)
}
\(~\)
f3 <- diab_pop %>%
anti_join(t3 %>% select(seqn))
## Joining, by = "seqn"
glimpse(f3)
## Rows: 1,847
## Columns: 10
## $ seqn <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83...
## $ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Fe...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, O...
## $ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades...
## $ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Wido...
## $ indhhin2 <ord> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "...
## $ bmxbmi <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010 <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabet...
## $ lbxglu <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...
nrow(diab_pop) #1876
## [1] 1876
nrow(t3) #
## [1] 29
nrow(f3) #
## [1] 1847
nrow(f3) + nrow(t3) == nrow(diab_pop)
## [1] TRUE
arsenal::comparedf(t3,f3,by=c('seqn'))
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = t3, y = f3, by = c("seqn"))
##
## Shared: 9 non-by variables and 0 observations.
## Not shared: 0 variables and 1876 observations.
##
## Differences found in 0/9 variables compared.
## 0 variables compared have non-identical attributes.
compare_df_obj <- arsenal::comparedf(t3,f3,by=c('seqn'))
summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table
## statistic value
## 1 Number of by-variables 1
## 2 Number of non-by variables in common 9
## 3 Number of variables compared 9
## 4 Number of variables in x but not y 0
## 5 Number of variables in y but not x 0
## 6 Number of variables compared with some values unequal 0
## 7 Number of variables compared with all values equal 9
## 8 Number of observations in common 0
## 9 Number of observations in x but not y 29
## 10 Number of observations in y but not x 1847
## 11 Number of observations with some compared variables unequal 0
## 12 Number of observations with all compared variables equal 0
## 13 Number of values unequal 0
nrow(f3) + ( nrow(X3$TEST.scored) + nrow(X3$Training_Data) ) == nrow(diab_pop)
## [1] TRUE
arsenal::comparedf(X3$TEST.scored,f3,by=c('seqn'))
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = X3$TEST.scored, y = f3, by = c("seqn"))
##
## Shared: 9 non-by variables and 0 observations.
## Not shared: 3 variables and 1855 observations.
##
## Differences found in 0/9 variables compared.
## 0 variables compared have non-identical attributes.
compare_df_obj <- arsenal::comparedf(X3$TEST.scored,f3,by=c('seqn'))
summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table
## statistic value
## 1 Number of by-variables 1
## 2 Number of non-by variables in common 9
## 3 Number of variables compared 9
## 4 Number of variables in x but not y 3
## 5 Number of variables in y but not x 0
## 6 Number of variables compared with some values unequal 0
## 7 Number of variables compared with all values equal 9
## 8 Number of observations in common 0
## 9 Number of observations in x but not y 8
## 10 Number of observations in y but not x 1847
## 11 Number of observations with some compared variables unequal 0
## 12 Number of observations with all compared variables equal 0
## 13 Number of values unequal 0
f3 <- bind_rows(X3$TEST.scored %>% select(colnames(diab_pop)),
f3)
str(predict(X3$gml.model, f3,'prob'),1)
## 'data.frame': 1855 obs. of 2 variables:
## $ Diabetes : num 7.88e-12 7.89e-12 7.88e-12 7.88e-12 7.88e-12 ...
## $ No Diabetes: num 1 1 1 1 1 ...
f3$Diabetes <- predict(X3$gml.model, f3,'prob')$Diabetes
glimpse(f3)
## Rows: 1,855
## Columns: 11
## $ seqn <dbl> 85443, 87007, 87510, 90121, 90384, 90814, 92358, 92970, 83...
## $ riagendr <fct> Female, Female, Female, Female, Female, Female, Female, Fe...
## $ ridageyr <dbl> 48, 45, 29, 44, 47, 80, 21, 25, 53, 78, 72, 45, 67, 67, 57...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White...
## $ dmdeduc2 <fct> High school graduate/GED, Some college or AA degrees, Coll...
## $ dmdmartl <fct> Married, Married, Married, Married, Living with partner, M...
## $ indhhin2 <ord> "$75,000-$99,999", "$75,000-$99,999", "$75,000-$99,999", "...
## $ bmxbmi <dbl> 27.1, 39.4, 22.6, 45.2, 28.1, 33.3, 21.8, 26.8, 30.8, 28.8...
## $ diq010 <fct> No Diabetes, No Diabetes, No Diabetes, No Diabetes, No Dia...
## $ lbxglu <dbl> 97, 207, 97, 104, 107, 102, 80, 92, 101, 84, 107, 84, 130,...
## $ Diabetes <dbl> 7.884915e-12, 7.885026e-12, 7.884915e-12, 7.884915e-12, 7....
str(predict(X3$gml.model, f3,'raw'),1)
## Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 2 2 2 2 2 2 2 2 ...
f3$estimate <- predict(X3$gml.model, f3,'raw')
glimpse(f3)
## Rows: 1,855
## Columns: 12
## $ seqn <dbl> 85443, 87007, 87510, 90121, 90384, 90814, 92358, 92970, 83...
## $ riagendr <fct> Female, Female, Female, Female, Female, Female, Female, Fe...
## $ ridageyr <dbl> 48, 45, 29, 44, 47, 80, 21, 25, 53, 78, 72, 45, 67, 67, 57...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White...
## $ dmdeduc2 <fct> High school graduate/GED, Some college or AA degrees, Coll...
## $ dmdmartl <fct> Married, Married, Married, Married, Living with partner, M...
## $ indhhin2 <ord> "$75,000-$99,999", "$75,000-$99,999", "$75,000-$99,999", "...
## $ bmxbmi <dbl> 27.1, 39.4, 22.6, 45.2, 28.1, 33.3, 21.8, 26.8, 30.8, 28.8...
## $ diq010 <fct> No Diabetes, No Diabetes, No Diabetes, No Diabetes, No Dia...
## $ lbxglu <dbl> 97, 207, 97, 104, 107, 102, 80, 92, 101, 84, 107, 84, 130,...
## $ Diabetes <dbl> 7.884915e-12, 7.885026e-12, 7.884915e-12, 7.884915e-12, 7....
## $ estimate <fct> No Diabetes, No Diabetes, No Diabetes, No Diabetes, No Dia...
Get_Errors_Function(f3)
## $AUC
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.548
##
## $ROC_CURVE
## # A tibble: 8 x 3
## .threshold specificity sensitivity
## <dbl> <dbl> <dbl>
## 1 -Inf 0 1
## 2 7.88e-12 0 1
## 3 7.89e-12 0.799 0.343
## 4 7.89e-12 0.806 0.104
## 5 1.00e+ 0 0.806 0.0964
## 6 1.00e+ 0 0.806 0.0964
## 7 1.00e+ 0 0.999 0.00714
## 8 Inf 1 0
##
## $CONF_MAT
## Truth
## Prediction Diabetes No Diabetes
## Diabetes 27 306
## No Diabetes 253 1269
##
## $SUM.CONF_MAT
## # A tibble: 13 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.699
## 2 kap binary -0.0908
## 3 sens binary 0.0964
## 4 spec binary 0.806
## 5 ppv binary 0.0811
## 6 npv binary 0.834
## 7 mcc binary -0.0913
## 8 j_index binary -0.0979
## 9 bal_accuracy binary 0.451
## 10 detection_prevalence binary 0.180
## 11 precision binary 0.0811
## 12 recall binary 0.0964
## 13 f_meas binary 0.0881
SAMPLE_3.ERRORS <- Get_Errors_Function(f3)
str(SAMPLE_3.ERRORS,1)
## List of 4
## $ AUC : tibble [1 x 3] (S3: tbl_df/tbl/data.frame)
## $ ROC_CURVE : roc_df [8 x 3] (S3: roc_df/tbl_df/tbl/data.frame)
## $ CONF_MAT :List of 2
## ..- attr(*, "class")= chr "conf_mat"
## $ SUM.CONF_MAT: tibble [13 x 3] (S3: tbl_df/tbl/data.frame)
SAMPLE_3.ERRORS$AUC
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.548
Get_Errors_Function
(f3 %>%
group_by(riagendr) %>%
Get_Errors_Function())$AUC
## # A tibble: 2 x 4
## riagendr .metric .estimator .estimate
## <fct> <chr> <chr> <dbl>
## 1 Male roc_auc binary 0.538
## 2 Female roc_auc binary 0.561
(f3 %>%
group_by(riagendr) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(riagendr) %>%
ggplot(aes(x=riagendr,
y=AUC_EST,
fill=riagendr)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip()
(f3 %>%
group_by(riagendr , indhhin2) %>%
Get_Errors_Function())$AUC
## # A tibble: 24 x 5
## riagendr indhhin2 .metric .estimator .estimate
## <fct> <ord> <chr> <chr> <dbl>
## 1 Male $0-$4,999 roc_auc binary 0.479
## 2 Male $5,000-$9,999 roc_auc binary 0.449
## 3 Male $10,000-$14,999 roc_auc binary 0.511
## 4 Male $15,000-$19,999 roc_auc binary 0.508
## 5 Male less than $20,000 roc_auc binary 0.643
## 6 Male 20,000+ roc_auc binary 0.370
## 7 Male $20,000-$24,999 roc_auc binary 0.555
## 8 Male $25,000-$34,999 roc_auc binary 0.554
## 9 Male $45,000-$54,999 roc_auc binary 0.5
## 10 Male $65,000-$74,999 roc_auc binary 0.504
## # ... with 14 more rows
(f3 %>%
group_by(riagendr, indhhin2) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(riagendr) %>%
ggplot(aes(x=indhhin2,
y=AUC_EST,
fill=riagendr)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip()
(f3 %>%
group_by(riagendr, indhhin2) %>%
Get_Errors_Function())$ROC_CURVE %>%
autoplot() +
labs( title = "ROC Curves by Sex and Income")
(f3 %>%
group_by(riagendr, ridreth1) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(riagendr) %>%
ggplot(aes(x=ridreth1,
y=AUC_EST,
fill=riagendr)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip()
(f3 %>%
group_by(riagendr, indhhin2, ridreth1) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(riagendr) %>%
ggplot(aes(x=indhhin2,
y=AUC_EST,
fill=riagendr)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip() +
facet_wrap( ~ ridreth1)
## Error: Problem with `summarise()` input `.estimate`.
## x No case observation.
## i Input `.estimate` is `metric_fn(...)`.
## i The error occurred in group 5: riagendr = "Male", indhhin2 = "$0-$4,999", ridreth1 = "Other".
(f3 %>%
group_by(riagendr, indhhin2) %>%
Get_Errors_Function())$CONF_MAT
## # A tibble: 24 x 3
## # Groups: riagendr [2]
## riagendr indhhin2 conf_mat
## <fct> <ord> <list>
## 1 Male $0-$4,999 <conf_mat>
## 2 Male $5,000-$9,999 <conf_mat>
## 3 Male $10,000-$14,999 <conf_mat>
## 4 Male $15,000-$19,999 <conf_mat>
## 5 Male less than $20,000 <conf_mat>
## 6 Male 20,000+ <conf_mat>
## 7 Male $20,000-$24,999 <conf_mat>
## 8 Male $25,000-$34,999 <conf_mat>
## 9 Male $45,000-$54,999 <conf_mat>
## 10 Male $65,000-$74,999 <conf_mat>
## # ... with 14 more rows
Sum_Conf_T3_Example <- (f3 %>%
group_by(riagendr, indhhin2) %>%
Get_Errors_Function())$SUM.CONF_MAT
Sum_Conf_T3_Example
## riagendr indhhin2
## Male :12 $0-$4,999 : 2
## Female:12 $5,000-$9,999 : 2
## $10,000-$14,999 : 2
## $15,000-$19,999 : 2
## less than $20,000: 2
## 20,000+ : 2
## (Other) :12
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
## conf_mat.Length conf_mat.Class conf_mat.Mode
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
## 2 conf_mat list
f2 <- diab_pop %>%
filter(!seqn %in% Id2)
nrow(diab_pop) #1876
## [1] 1876
nrow(t2) #
## [1] 938
nrow(f2) #
## [1] 938
nrow(f2) + nrow(t2) == nrow(diab_pop)
## [1] TRUE
arsenal::comparedf(t2,f2,by=c('seqn'))
## Compare Object
##
## Function Call:
## arsenal::comparedf(x = t2, y = f2, by = c("seqn"))
##
## Shared: 9 non-by variables and 0 observations.
## Not shared: 0 variables and 1876 observations.
##
## Differences found in 0/9 variables compared.
## 0 variables compared have non-identical attributes.
compare_df_obj <- arsenal::comparedf(t2,f2,by=c('seqn'))
summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table
## statistic value
## 1 Number of by-variables 1
## 2 Number of non-by variables in common 9
## 3 Number of variables compared 9
## 4 Number of variables in x but not y 0
## 5 Number of variables in y but not x 0
## 6 Number of variables compared with some values unequal 0
## 7 Number of variables compared with all values equal 9
## 8 Number of observations in common 0
## 9 Number of observations in x but not y 938
## 10 Number of observations in y but not x 938
## 11 Number of observations with some compared variables unequal 0
## 12 Number of observations with all compared variables equal 0
## 13 Number of values unequal 0
f2$estimate <- predict(X2$gml.model, f2,'raw')
f2$Diabetes <- predict(X2$gml.model, f2,'prob')$Diabetes
TEST.Scored_stacked <- bind_rows(
f2 %>% mutate(model = 'random'),
f3 %>% mutate(model = 'black_swan')
)
(TEST.Scored_stacked %>%
group_by(model) %>%
Get_Errors_Function())$AUC
## # A tibble: 2 x 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 black_swan roc_auc binary 0.548
## 2 random roc_auc binary 0.835
(TEST.Scored_stacked %>%
group_by(model, riagendr) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(model, riagendr) %>%
ggplot(aes(x=riagendr,
y=AUC_EST,
fill=model)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip()
(TEST.Scored_stacked %>%
group_by(model, riagendr,indhhin2) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(model, riagendr, indhhin2) %>%
ggplot(aes(x=riagendr,
y=AUC_EST,
fill=indhhin2)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip() +
facet_wrap(model~.)
(TEST.Scored_stacked %>%
group_by(model, riagendr, ridreth1) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(model, riagendr, ridreth1) %>%
ggplot(aes(x=riagendr,
y=AUC_EST,
fill=ridreth1)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip() +
facet_wrap(model~.)
(TEST.Scored_stacked %>%
group_by(model, riagendr, ridreth1) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(model, riagendr, ridreth1) %>%
ggplot(aes(x=riagendr,
y=AUC_EST,
fill=model)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip() +
facet_wrap(ridreth1~.)
\(~\)
\(~\)
set.seed(1701)
library('tidyverse')
library('caret')
library('yardstick')
library('ggplot2')
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
na.omit()
glimpse(diab_pop)
levels(diab_pop$indhhin2)
income_levels <- levels(diab_pop$indhhin2)
levels = c("$0-$4,999",
"$5,000-$9,999",
"$10,000-$14,999",
"$15,000-$19,999",
"less than $20,000",
"20,000+",
"$20,000-$24,999",
"$25,000-$34,999",
"$35,000-$44,999",
"$45,000-$54,999",
"$55,000-$64,999",
"$65,000-$74,999",
"$75,000-$99,999",
"$100,000+"
)
setdiff(income_levels, levels)
diab_pop$indhhin2 <- factor(diab_pop$indhhin2 ,
levels=levels,
ordered = TRUE)
odered_levels <- levels(diab_pop$indhhin2)
glimpse(diab_pop)
feature_names <- c('riagendr' , 'ridreth1' , 'dmdeduc2' , 'dmdmartl' , 'indhhin2' , 'lbxglu','bmxbmi')
feature_names_plus <- paste(feature_names, collapse = ' + ' )
feature_names_plus
formula_1 <- as.formula(paste0('diq010 ~ ',feature_names_plus))
formula_1
# THIS IS NOT A GREAT IDEA
options(warn=-1)
# I have this on, there is an expected warning
## "prediction from a rank-deficient fit may be misleading"
## without this option on the output is very difficult to read
Train_Glm_Iteration <- function(data){
TrainInd <- createDataPartition(data$diq010,
p =.7,
list=FALSE)
TRAIN <- data[TrainInd, ]
bootstrap <- trainControl(method="boot", number=42)
gml.model <- train(as.formula(formula_1),
method='glm',
data =TRAIN,
family='binomial',
preProcess = c('center','scale'),
trControl=bootstrap
)
CoEff <- as_tibble(gml.model$finalModel$coefficients, rownames="feature") %>%
rename(coeff = value)
TEST <- data[-TrainInd,]
estimate <- predict(gml.model, TEST,'raw')
prob <- predict(gml.model, TEST,'prob')
TEST.scored <- cbind(TEST, estimate, prob)
return(list(Training_Data = TRAIN,
gml.model = gml.model,
CoEff = CoEff,
TEST.scored =TEST.scored)
)
}
Id <- sample(diab_pop$seqn, nrow(diab_pop)*.3, replace=F)
length(Id)
t1 <- diab_pop %>%
filter(seqn %in% Id)
dim(t1)
X1 <- Train_Glm_Iteration(t1)
str(X1,1)
X1$TEST.scored %>%
roc_auc(truth= diq010 , Diabetes)
X1$TEST.scored %>%
roc_curve(truth= diq010 , Diabetes) %>%
autoplot()
conf_matX1 <- X1$TEST.scored %>%
conf_mat(truth= diq010 , estimate)
conf_matX1
sum.conf_matX1 <- summary(conf_matX1)
sum.conf_matX1
nrow(X1$Training_Data) + nrow(X1$TEST.scored) == nrow(t1)
X1.comparedf <- arsenal::comparedf(X1$Training_Data, X1$TEST.scored, by=c('seqn'))
sum.X1.comparedf <- summary(X1.comparedf)
sum.X1.comparedf$comparison.summary.table
Id2 <- sample(diab_pop$seqn, nrow(diab_pop)*.5, replace=F)
t2 <- diab_pop %>%
filter(seqn %in% Id2)
X2 <- Train_Glm_Iteration(t2)
Swan <- diab_pop %>%
filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' & ridreth1 == 'Non-Hispanic White')
Id3 <- sample(Swan$seqn, nrow(Swan)*.8, replace=F)
t3 <- diab_pop %>%
filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' & ridreth1 == 'Non-Hispanic White') %>%
filter(seqn %in% Id3)
t3 %>% summary()
X3 <- Train_Glm_Iteration(t3)
Id4 <- sample(diab_pop$seqn, nrow(diab_pop)*.9, replace=F)
t4 <- diab_pop %>%
filter(seqn %in% Id4)
X4 <- Train_Glm_Iteration(t4)
M_union <- union(Id2,Id3)
Id5 <- setdiff(diab_pop$seqn, M_union)
t5 <- diab_pop %>%
filter(seqn %in% Id5)
X5 <- Train_Glm_Iteration(t5)
str(X2$Training_Data)
str(X3$Training_Data)
str(X5$Training_Data)
arsenal::comparedf(X3$Training_Data,
X5$Training_Data)
X1$CoEff
CoEff_compare <- bind_rows(X1$CoEff %>% mutate(strat = 't1'),
X2$CoEff %>% mutate(strat = 't2'),
X3$CoEff %>% mutate(strat = 't3'),
X4$CoEff %>% mutate(strat = 't4'),
X5$CoEff %>% mutate(strat = 't5'))
glimpse(CoEff_compare)
CoEff_compare %>%
group_by(strat) %>%
ggplot(aes(x=feature, y=coeff)) +
geom_point() +
coord_flip() +
facet_wrap(.~strat)
CoEff_compare %>%
ggplot(aes(x=feature, y=coeff)) +
geom_boxplot() +
coord_flip()
str(X1,1)
glimpse(X1$TEST.scored)
look <- as_tibble(X1$TEST.scored)
look %>%
roc_auc(truth=diq010, Diabetes)
look %>%
roc_curve(truth=diq010, Diabetes) %>%
autoplot()
look %>%
conf_mat(truth=diq010, estimate)
look %>%
conf_mat(truth=diq010, estimate) %>%
summary()
Get_Errors_Function <- function(data){
look <- data
AUC <- look %>%
roc_auc(truth=diq010, Diabetes)
ROC_CURVE <- look %>%
roc_curve(truth=diq010, Diabetes)
CONF_MAT <- look %>%
conf_mat(truth=diq010, estimate)
SUM.CONF_MAT <- look %>%
conf_mat(truth=diq010, estimate) %>%
summary()
output = list(
AUC = AUC,
ROC_CURVE = ROC_CURVE,
CONF_MAT = CONF_MAT,
SUM.CONF_MAT = SUM.CONF_MAT
)
return(output)
}
f3 <- diab_pop %>%
anti_join(t3 %>% select(seqn))
glimpse(f3)
nrow(diab_pop) #1876
nrow(t3) #
nrow(f3) #
nrow(f3) + nrow(t3) == nrow(diab_pop)
arsenal::comparedf(t3,f3,by=c('seqn'))
compare_df_obj <- arsenal::comparedf(t3,f3,by=c('seqn'))
summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table
nrow(f3) + ( nrow(X3$TEST.scored) + nrow(X3$Training_Data) ) == nrow(diab_pop)
arsenal::comparedf(X3$TEST.scored,f3,by=c('seqn'))
compare_df_obj <- arsenal::comparedf(X3$TEST.scored,f3,by=c('seqn'))
summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table
f3 <- bind_rows(X3$TEST.scored %>% select(colnames(diab_pop)),
f3)
str(predict(X3$gml.model, f3,'prob'),1)
f3$Diabetes <- predict(X3$gml.model, f3,'prob')$Diabetes
glimpse(f3)
str(predict(X3$gml.model, f3,'raw'),1)
f3$estimate <- predict(X3$gml.model, f3,'raw')
glimpse(f3)
Get_Errors_Function(f3)
SAMPLE_3.ERRORS <- Get_Errors_Function(f3)
str(SAMPLE_3.ERRORS,1)
SAMPLE_3.ERRORS$AUC
(f3 %>%
group_by(riagendr) %>%
Get_Errors_Function())$AUC
(f3 %>%
group_by(riagendr) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(riagendr) %>%
ggplot(aes(x=riagendr,
y=AUC_EST,
fill=riagendr)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip()
(f3 %>%
group_by(riagendr , indhhin2) %>%
Get_Errors_Function())$AUC
(f3 %>%
group_by(riagendr, indhhin2) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(riagendr) %>%
ggplot(aes(x=indhhin2,
y=AUC_EST,
fill=riagendr)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip()
(f3 %>%
group_by(riagendr, indhhin2) %>%
Get_Errors_Function())$ROC_CURVE %>%
autoplot() +
labs( title = "ROC Curves by Sex and Income")
(f3 %>%
group_by(riagendr, ridreth1) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(riagendr) %>%
ggplot(aes(x=ridreth1,
y=AUC_EST,
fill=riagendr)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip()
(f3 %>%
group_by(riagendr, indhhin2, ridreth1) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(riagendr) %>%
ggplot(aes(x=indhhin2,
y=AUC_EST,
fill=riagendr)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip() +
facet_wrap( ~ ridreth1)
(f3 %>%
group_by(riagendr, indhhin2) %>%
Get_Errors_Function())$CONF_MAT
Sum_Conf_T3_Example <- (f3 %>%
group_by(riagendr, indhhin2) %>%
Get_Errors_Function())$SUM.CONF_MAT
Sum_Conf_T3_Example
f2 <- diab_pop %>%
filter(!seqn %in% Id2)
nrow(diab_pop) #1876
nrow(t2) #
nrow(f2) #
nrow(f2) + nrow(t2) == nrow(diab_pop)
arsenal::comparedf(t2,f2,by=c('seqn'))
compare_df_obj <- arsenal::comparedf(t2,f2,by=c('seqn'))
summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table
f2$estimate <- predict(X2$gml.model, f2,'raw')
f2$Diabetes <- predict(X2$gml.model, f2,'prob')$Diabetes
TEST.Scored_stacked <- bind_rows(
f2 %>% mutate(model = 'random'),
f3 %>% mutate(model = 'black_swan')
)
(TEST.Scored_stacked %>%
group_by(model) %>%
Get_Errors_Function())$AUC
(TEST.Scored_stacked %>%
group_by(model, riagendr) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(model, riagendr) %>%
ggplot(aes(x=riagendr,
y=AUC_EST,
fill=model)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip()
(TEST.Scored_stacked %>%
group_by(model, riagendr,indhhin2) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(model, riagendr, indhhin2) %>%
ggplot(aes(x=riagendr,
y=AUC_EST,
fill=indhhin2)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip() +
facet_wrap(model~.)
(TEST.Scored_stacked %>%
group_by(model, riagendr, ridreth1) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(model, riagendr, ridreth1) %>%
ggplot(aes(x=riagendr,
y=AUC_EST,
fill=ridreth1)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip() +
facet_wrap(model~.)
(TEST.Scored_stacked %>%
group_by(model, riagendr, ridreth1) %>%
Get_Errors_Function())$AUC %>%
rename(AUC_EST = .estimate) %>%
group_by(model, riagendr, ridreth1) %>%
ggplot(aes(x=riagendr,
y=AUC_EST,
fill=model)) +
geom_bar(stat = "identity",
position = "dodge") +
coord_flip() +
facet_wrap(ridreth1~.)