$~$

1 STEP UP

set.seed(1701)

library('tidyverse')

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library('caret')

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library('yardstick')

## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.

## 
## Attaching package: 'yardstick'

## The following objects are masked from 'package:caret':
## 
##     precision, recall

## The following object is masked from 'package:readr':
## 
##     spec

library('ggplot2')


diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
  na.omit() 

glimpse(diab_pop)

## Rows: 1,876
## Columns: 10
## $ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83...
## $ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Fe...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, O...
## $ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades...
## $ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Wido...
## $ indhhin2 <fct> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "...
## $ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010   <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabet...
## $ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...

levels(diab_pop$indhhin2)

##  [1] "$0-$4,999"         "$5,000-$9,999"     "$10,000-$14,999"  
##  [4] "$15,000-$19,999"   "$20,000-$24,999"   "$25,000-$34,999"  
##  [7] "$35,000-$44,999"   "$45,000-$54,999"   "$55,000-$64,999"  
## [10] "$65,000-$74,999"   "20,000+"           "less than $20,000"
## [13] "$75,000-$99,999"   "$100,000+"

income_levels <- levels(diab_pop$indhhin2)


levels = c("$0-$4,999", 
           "$5,000-$9,999", 
           "$10,000-$14,999",
           "$15,000-$19,999",
           "less than $20,000",
           "20,000+", 
           "$20,000-$24,999",
           "$25,000-$34,999",
           "$35,000-$44,999",
           "$45,000-$54,999",
           "$55,000-$64,999",
           "$65,000-$74,999",
           "$75,000-$99,999",
           "$100,000+"
            )

setdiff(income_levels, levels)

## character(0)

diab_pop$indhhin2 <- factor(diab_pop$indhhin2 ,
                            levels=levels,
                            ordered = TRUE)

odered_levels <- levels(diab_pop$indhhin2)

glimpse(diab_pop)

## Rows: 1,876
## Columns: 10
## $ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83...
## $ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Fe...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, O...
## $ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades...
## $ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Wido...
## $ indhhin2 <ord> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "...
## $ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010   <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabet...
## $ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...

feature_names <- c('riagendr' , 'ridreth1' , 'dmdeduc2' , 'dmdmartl' , 'indhhin2' , 'lbxglu','bmxbmi')

feature_names_plus <- paste(feature_names, collapse = ' + ' )

feature_names_plus

## [1] "riagendr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + lbxglu + bmxbmi"

formula_1 <- as.formula(paste0('diq010 ~ ',feature_names_plus))

formula_1

## diq010 ~ riagendr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + 
##     lbxglu + bmxbmi

1.1 WARNING - THIS IS A BAD OPTION

# THIS IS NOT A GREAT IDEA 

options(warn=-1)

# I have this on, there is an expected warning 
## "prediction from a rank-deficient fit may be misleading"
## without this option on the output is very difficult to read

2 `caret` `glm` logit `train` `function`

Train_Glm_Iteration <- function(data){
  
  TrainInd <- createDataPartition(data$diq010,
                                  p =.7,
                                  list=FALSE)

  TRAIN <- data[TrainInd, ] 
  
 bootstrap <- trainControl(method="boot", number=42)
  
  gml.model <- train(as.formula(formula_1),
    method='glm',
    data =TRAIN,
    family='binomial',
    preProcess = c('center','scale'),
    trControl=bootstrap
    )
  

  
  CoEff <-  as_tibble(gml.model$finalModel$coefficients, rownames="feature") %>%
    rename(coeff = value)
  
  TEST <- data[-TrainInd,]
  
  estimate <- predict(gml.model, TEST,'raw') 
  
  prob <- predict(gml.model, TEST,'prob')
  
  TEST.scored <- cbind(TEST, estimate, prob)
  
  return(list(Training_Data = TRAIN,
              gml.model = gml.model,
              CoEff = CoEff,
              TEST.scored =TEST.scored)
         )
  
}

3 Make Samples

3.1 SAMPLE 1

Id <- sample(diab_pop$seqn, nrow(diab_pop)*.3, replace=F)
length(Id)

## [1] 562

t1 <- diab_pop %>% 
  filter(seqn %in% Id)

dim(t1)

## [1] 562  10

X1 <- Train_Glm_Iteration(t1)

str(X1,1)

## List of 4
##  $ Training_Data:'data.frame':   394 obs. of  10 variables:
##   ..- attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
##   .. ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...
##  $ gml.model    :List of 24
##   ..- attr(*, "class")= chr [1:2] "train" "train.formula"
##  $ CoEff        : tibble [30 x 2] (S3: tbl_df/tbl/data.frame)
##  $ TEST.scored  :'data.frame':   168 obs. of  13 variables:

X1$TEST.scored %>%
  roc_auc(truth= diq010 , Diabetes)

## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.739

X1$TEST.scored %>%
  roc_curve(truth= diq010 , Diabetes) %>%
  autoplot()

conf_matX1 <- X1$TEST.scored %>%
  conf_mat(truth= diq010 , estimate)

conf_matX1

##              Truth
## Prediction    Diabetes No Diabetes
##   Diabetes          11           8
##   No Diabetes       15         134

sum.conf_matX1  <- summary(conf_matX1)

sum.conf_matX1

## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary         0.863
##  2 kap                  binary         0.412
##  3 sens                 binary         0.423
##  4 spec                 binary         0.944
##  5 ppv                  binary         0.579
##  6 npv                  binary         0.899
##  7 mcc                  binary         0.419
##  8 j_index              binary         0.367
##  9 bal_accuracy         binary         0.683
## 10 detection_prevalence binary         0.113
## 11 precision            binary         0.579
## 12 recall               binary         0.423
## 13 f_meas               binary         0.489

nrow(X1$Training_Data) + nrow(X1$TEST.scored) == nrow(t1)

## [1] TRUE

X1.comparedf <- arsenal::comparedf(X1$Training_Data, X1$TEST.scored, by=c('seqn')) 

sum.X1.comparedf <- summary(X1.comparedf)

sum.X1.comparedf$comparison.summary.table

##                                                      statistic value
## 1                                       Number of by-variables     1
## 2                         Number of non-by variables in common     9
## 3                                 Number of variables compared     9
## 4                           Number of variables in x but not y     0
## 5                           Number of variables in y but not x     3
## 6        Number of variables compared with some values unequal     0
## 7           Number of variables compared with all values equal     9
## 8                             Number of observations in common     0
## 9                        Number of observations in x but not y   394
## 10                       Number of observations in y but not x   168
## 11 Number of observations with some compared variables unequal     0
## 12    Number of observations with all compared variables equal     0
## 13                                    Number of values unequal     0

3.2 SAMPLE 2

Id2 <- sample(diab_pop$seqn, nrow(diab_pop)*.5, replace=F)

t2 <- diab_pop %>% 
  filter(seqn %in% Id2)

X2 <- Train_Glm_Iteration(t2)

3.3 SAMPLE 3 - “black swan”

3.3.1 Income == ‘$75,000-$99,999’ & Gender == ‘Female’ & ridreth1 == ‘Non-Hispanic White’

Swan <- diab_pop %>% 
  filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' &  ridreth1 == 'Non-Hispanic White') 

Id3 <- sample(Swan$seqn, nrow(Swan)*.8, replace=F)

t3 <- diab_pop %>% 
  filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' &  ridreth1 == 'Non-Hispanic White') %>%
  filter(seqn %in% Id3)

t3 %>% summary()

##       seqn         riagendr     ridageyr                   ridreth1 
##  Min.   :84166   Male  : 0   Min.   :21.00   MexicanAmerican   : 0  
##  1st Qu.:85920   Female:29   1st Qu.:32.00   Other Hispanic    : 0  
##  Median :89041               Median :48.00   Non-Hispanic White:29  
##  Mean   :88741               Mean   :50.14   Non-Hispanic Black: 0  
##  3rd Qu.:91064               3rd Qu.:61.00   Other             : 0  
##  Max.   :92970               Max.   :80.00                          
##                                                                     
##                        dmdeduc2                 dmdmartl 
##  Less than 9th grade       : 0   Married            :19  
##  Grades 9-11th             : 1   Widowed            : 3  
##  High school graduate/GED  : 4   Divorced           : 2  
##  Some college or AA degrees:14   Separated          : 0  
##  College grad or above     :10   Never married      : 1  
##                                  Living with partner: 4  
##                                                          
##               indhhin2      bmxbmi              diq010       lbxglu     
##  $75,000-$99,999  :29   Min.   :16.70   Diabetes   : 1   Min.   : 80.0  
##  $0-$4,999        : 0   1st Qu.:23.70   No Diabetes:28   1st Qu.: 92.0  
##  $5,000-$9,999    : 0   Median :26.80                    Median : 99.0  
##  $10,000-$14,999  : 0   Mean   :30.05                    Mean   :104.3  
##  $15,000-$19,999  : 0   3rd Qu.:33.30                    3rd Qu.:105.0  
##  less than $20,000: 0   Max.   :63.60                    Max.   :207.0  
##  (Other)          : 0

X3 <- Train_Glm_Iteration(t3)

3.4 SAMPLE 4

Id4 <- sample(diab_pop$seqn, nrow(diab_pop)*.9, replace=F)

t4 <- diab_pop %>% 
  filter(seqn %in% Id4)

X4 <- Train_Glm_Iteration(t4)

3.5 SAMPLE 5

M_union <- union(Id2,Id3)

Id5 <- setdiff(diab_pop$seqn, M_union)


t5 <- diab_pop %>% 
  filter(seqn %in% Id5)


X5 <- Train_Glm_Iteration(t5)

3.5.1 Compare SAMPLE 1 to SAMPLE 5

str(X2$Training_Data)

## 'data.frame':    657 obs. of  10 variables:
##  $ seqn    : num  83757 83809 83813 83834 83851 ...
##  $ riagendr: Factor w/ 2 levels "Male","Female": 2 2 1 1 2 1 2 2 1 2 ...
##  $ ridageyr: num  57 20 24 69 37 74 80 80 75 33 ...
##  $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 2 4 3 4 3 3 3 3 4 3 ...
##  $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 1 3 4 3 3 5 3 4 4 4 ...
##  $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 4 5 3 5 1 1 2 1 6 1 ...
##  $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 7 13 8 3 10 8 4 10 4 12 ...
##  $ bmxbmi  : num  35.4 26.2 26.9 28.2 35.3 27.2 23.5 26.9 30.8 25.9 ...
##  $ diq010  : Factor w/ 2 levels "Diabetes","No Diabetes": 1 2 2 2 2 1 2 2 1 2 ...
##  $ lbxglu  : num  398 94 105 105 79 123 137 110 145 83 ...
##  - attr(*, "na.action")= 'omit' Named int  1 4 5 7 8 9 10 12 16 18 ...
##   ..- attr(*, "names")= chr  "1" "4" "5" "7" ...

str(X3$Training_Data)

## 'data.frame':    21 obs. of  10 variables:
##  $ seqn    : num  84166 84511 84517 84786 84816 ...
##  $ riagendr: Factor w/ 2 levels "Male","Female": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ridageyr: num  67 78 80 50 28 61 61 40 73 68 ...
##  $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 5 5 4 4 4 2 5 5 5 4 ...
##  $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 1 2 2 1 1 3 1 1 1 6 ...
##  $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 13 13 13 13 13 13 13 13 13 13 ...
##  $ bmxbmi  : num  26.1 23.1 26.6 28.9 18.4 36.2 42.7 23.5 28.3 29.6 ...
##  $ diq010  : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lbxglu  : num  134 99 83 87 93 92 108 104 105 107 ...
##  - attr(*, "na.action")= 'omit' Named int  1 4 5 7 8 9 10 12 16 18 ...
##   ..- attr(*, "names")= chr  "1" "4" "5" "7" ...

str(X5$Training_Data)

## 'data.frame':    647 obs. of  10 variables:
##  $ seqn    : num  83733 83737 83754 83755 83761 ...
##  $ riagendr: Factor w/ 2 levels "Male","Female": 1 2 2 1 2 2 2 1 2 2 ...
##  $ ridageyr: num  53 72 67 67 24 68 37 70 20 39 ...
##  $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 1 2 4 5 1 2 3 4 1 ...
##  $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 3 2 5 5 5 1 4 5 4 3 ...
##  $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 3 4 1 2 5 3 1 6 5 1 ...
##  $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 4 13 8 7 1 4 13 12 8 4 ...
##  $ bmxbmi  : num  30.8 28.6 43.7 28.8 25.3 33.5 25.5 27 22.2 27.2 ...
##  $ diq010  : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 2 1 2 2 2 2 2 2 ...
##  $ lbxglu  : num  101 107 130 284 95 111 100 94 80 101 ...
##  - attr(*, "na.action")= 'omit' Named int  1 4 5 7 8 9 10 12 16 18 ...
##   ..- attr(*, "names")= chr  "1" "4" "5" "7" ...

arsenal::comparedf(X3$Training_Data,
                   X5$Training_Data)

## Compare Object
## 
## Function Call: 
## arsenal::comparedf(x = X3$Training_Data, y = X5$Training_Data)
## 
## Shared: 10 non-by variables and 21 observations.
## Not shared: 0 variables and 626 observations.
## 
## Differences found in 10/10 variables compared.
## 0 variables compared have non-identical attributes.

4 Compare Coefficents across all samples

X1$CoEff

## # A tibble: 30 x 2
##    feature                                coeff
##    <chr>                                  <dbl>
##  1 (Intercept)                           2.68  
##  2 riagendrFemale                        0.223 
##  3 `ridreth1Other Hispanic`             -0.451 
##  4 `ridreth1Non-Hispanic White`         -0.0250
##  5 `ridreth1Non-Hispanic Black`         -0.483 
##  6 ridreth1Other                        -0.337 
##  7 `dmdeduc2Grades 9-11th`               0.420 
##  8 `dmdeduc2High school graduate/GED`    0.207 
##  9 `dmdeduc2Some college or AA degrees`  0.630 
## 10 `dmdeduc2College grad or above`       0.980 
## # ... with 20 more rows

CoEff_compare <- bind_rows(X1$CoEff %>% mutate(strat = 't1'),
          X2$CoEff %>% mutate(strat = 't2'),
          X3$CoEff %>% mutate(strat = 't3'),
          X4$CoEff %>% mutate(strat = 't4'),
          X5$CoEff %>% mutate(strat = 't5'))


glimpse(CoEff_compare)

## Rows: 150
## Columns: 3
## $ feature <chr> "(Intercept)", "riagendrFemale", "`ridreth1Other Hispanic`"...
## $ coeff   <dbl> 2.67925964, 0.22277284, -0.45124437, -0.02501180, -0.482570...
## $ strat   <chr> "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1",...

CoEff_compare %>%
  group_by(strat) %>%
  ggplot(aes(x=feature, y=coeff)) +
  geom_point() + 
  coord_flip() +
  facet_wrap(.~strat)

CoEff_compare %>%
  ggplot(aes(x=feature, y=coeff)) +
  geom_boxplot() + 
  coord_flip()

$~$

5 Review Ouput

str(X1,1)

## List of 4
##  $ Training_Data:'data.frame':   394 obs. of  10 variables:
##   ..- attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
##   .. ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...
##  $ gml.model    :List of 24
##   ..- attr(*, "class")= chr [1:2] "train" "train.formula"
##  $ CoEff        : tibble [30 x 2] (S3: tbl_df/tbl/data.frame)
##  $ TEST.scored  :'data.frame':   168 obs. of  13 variables:

glimpse(X1$TEST.scored)

## Rows: 168
## Columns: 13
## $ seqn          <dbl> 83734, 83755, 83790, 83849, 83908, 83947, 83963, 8415...
## $ riagendr      <fct> Male, Male, Male, Male, Male, Female, Female, Female,...
## $ ridageyr      <dbl> 78, 67, 56, 71, 51, 33, 44, 53, 44, 34, 64, 55, 23, 4...
## $ ridreth1      <fct> Non-Hispanic White, Non-Hispanic Black, Non-Hispanic ...
## $ dmdeduc2      <fct> High school graduate/GED, College grad or above, Less...
## $ dmdmartl      <fct> Married, Widowed, Married, Married, Married, Married,...
## $ indhhin2      <ord> "$20,000-$24,999", "$20,000-$24,999", "$15,000-$19,99...
## $ bmxbmi        <dbl> 28.8, 28.8, 24.4, 27.6, 24.7, 25.9, 52.1, 25.3, 32.6,...
## $ diq010        <fct> Diabetes, Diabetes, No Diabetes, Diabetes, No Diabete...
## $ lbxglu        <dbl> 84, 284, 397, 76, 102, 83, 109, 86, 102, 98, 134, 116...
## $ estimate      <fct> No Diabetes, Diabetes, Diabetes, No Diabetes, No Diab...
## $ Diabetes      <dbl> 0.040939395, 0.998934295, 0.999998287, 0.019392257, 0...
## $ `No Diabetes` <dbl> 9.590606e-01, 1.065705e-03, 1.713443e-06, 9.806077e-0...

look <- as_tibble(X1$TEST.scored)

look %>%
  roc_auc(truth=diq010, Diabetes)

## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.739

look %>%
  roc_curve(truth=diq010, Diabetes) %>%
  autoplot()

look %>%
  conf_mat(truth=diq010, estimate)

##              Truth
## Prediction    Diabetes No Diabetes
##   Diabetes          11           8
##   No Diabetes       15         134

look %>%
  conf_mat(truth=diq010, estimate) %>%
  summary()

## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary         0.863
##  2 kap                  binary         0.412
##  3 sens                 binary         0.423
##  4 spec                 binary         0.944
##  5 ppv                  binary         0.579
##  6 npv                  binary         0.899
##  7 mcc                  binary         0.419
##  8 j_index              binary         0.367
##  9 bal_accuracy         binary         0.683
## 10 detection_prevalence binary         0.113
## 11 precision            binary         0.579
## 12 recall               binary         0.423
## 13 f_meas               binary         0.489

6 create `Get_Errors_Function`

Get_Errors_Function <- function(data){

  look <- data
  
  AUC <- look %>%
    roc_auc(truth=diq010, Diabetes)
  
  ROC_CURVE <- look %>%
    roc_curve(truth=diq010, Diabetes) 
  
  CONF_MAT <- look %>%
    conf_mat(truth=diq010, estimate)
  
  SUM.CONF_MAT <- look %>%
    conf_mat(truth=diq010, estimate) %>%
    summary()
  
  output = list(
    AUC = AUC,
    ROC_CURVE = ROC_CURVE,
    CONF_MAT = CONF_MAT,
    SUM.CONF_MAT = SUM.CONF_MAT
  )
  
  return(output)
}

$~$

7 Test on Sample 3

f3 <- diab_pop %>% 
  anti_join(t3 %>% select(seqn))

## Joining, by = "seqn"

glimpse(f3)

## Rows: 1,847
## Columns: 10
## $ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83...
## $ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Fe...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, O...
## $ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades...
## $ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Wido...
## $ indhhin2 <ord> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "...
## $ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0...
## $ diq010   <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabet...
## $ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 9...

nrow(diab_pop) #1876

## [1] 1876

nrow(t3) #

## [1] 29

nrow(f3) #

## [1] 1847

nrow(f3) + nrow(t3) == nrow(diab_pop)

## [1] TRUE

arsenal::comparedf(t3,f3,by=c('seqn'))

## Compare Object
## 
## Function Call: 
## arsenal::comparedf(x = t3, y = f3, by = c("seqn"))
## 
## Shared: 9 non-by variables and 0 observations.
## Not shared: 0 variables and 1876 observations.
## 
## Differences found in 0/9 variables compared.
## 0 variables compared have non-identical attributes.

compare_df_obj <- arsenal::comparedf(t3,f3,by=c('seqn'))

summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table

##                                                      statistic value
## 1                                       Number of by-variables     1
## 2                         Number of non-by variables in common     9
## 3                                 Number of variables compared     9
## 4                           Number of variables in x but not y     0
## 5                           Number of variables in y but not x     0
## 6        Number of variables compared with some values unequal     0
## 7           Number of variables compared with all values equal     9
## 8                             Number of observations in common     0
## 9                        Number of observations in x but not y    29
## 10                       Number of observations in y but not x  1847
## 11 Number of observations with some compared variables unequal     0
## 12    Number of observations with all compared variables equal     0
## 13                                    Number of values unequal     0

nrow(f3) + ( nrow(X3$TEST.scored) + nrow(X3$Training_Data) ) == nrow(diab_pop)

## [1] TRUE

arsenal::comparedf(X3$TEST.scored,f3,by=c('seqn'))

## Compare Object
## 
## Function Call: 
## arsenal::comparedf(x = X3$TEST.scored, y = f3, by = c("seqn"))
## 
## Shared: 9 non-by variables and 0 observations.
## Not shared: 3 variables and 1855 observations.
## 
## Differences found in 0/9 variables compared.
## 0 variables compared have non-identical attributes.

compare_df_obj <- arsenal::comparedf(X3$TEST.scored,f3,by=c('seqn'))

summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table

##                                                      statistic value
## 1                                       Number of by-variables     1
## 2                         Number of non-by variables in common     9
## 3                                 Number of variables compared     9
## 4                           Number of variables in x but not y     3
## 5                           Number of variables in y but not x     0
## 6        Number of variables compared with some values unequal     0
## 7           Number of variables compared with all values equal     9
## 8                             Number of observations in common     0
## 9                        Number of observations in x but not y     8
## 10                       Number of observations in y but not x  1847
## 11 Number of observations with some compared variables unequal     0
## 12    Number of observations with all compared variables equal     0
## 13                                    Number of values unequal     0

f3 <- bind_rows(X3$TEST.scored %>% select(colnames(diab_pop)),
                f3)

7.1 Predict

7.1.1 Probs

str(predict(X3$gml.model, f3,'prob'),1)

## 'data.frame':    1855 obs. of  2 variables:
##  $ Diabetes   : num  7.88e-12 7.89e-12 7.88e-12 7.88e-12 7.88e-12 ...
##  $ No Diabetes: num  1 1 1 1 1 ...

f3$Diabetes <- predict(X3$gml.model, f3,'prob')$Diabetes

glimpse(f3)

## Rows: 1,855
## Columns: 11
## $ seqn     <dbl> 85443, 87007, 87510, 90121, 90384, 90814, 92358, 92970, 83...
## $ riagendr <fct> Female, Female, Female, Female, Female, Female, Female, Fe...
## $ ridageyr <dbl> 48, 45, 29, 44, 47, 80, 21, 25, 53, 78, 72, 45, 67, 67, 57...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White...
## $ dmdeduc2 <fct> High school graduate/GED, Some college or AA degrees, Coll...
## $ dmdmartl <fct> Married, Married, Married, Married, Living with partner, M...
## $ indhhin2 <ord> "$75,000-$99,999", "$75,000-$99,999", "$75,000-$99,999", "...
## $ bmxbmi   <dbl> 27.1, 39.4, 22.6, 45.2, 28.1, 33.3, 21.8, 26.8, 30.8, 28.8...
## $ diq010   <fct> No Diabetes, No Diabetes, No Diabetes, No Diabetes, No Dia...
## $ lbxglu   <dbl> 97, 207, 97, 104, 107, 102, 80, 92, 101, 84, 107, 84, 130,...
## $ Diabetes <dbl> 7.884915e-12, 7.885026e-12, 7.884915e-12, 7.884915e-12, 7....

7.1.2 Predict Class

str(predict(X3$gml.model, f3,'raw'),1)

##  Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 2 2 2 2 2 2 2 2 ...

f3$estimate <- predict(X3$gml.model, f3,'raw')

glimpse(f3)

## Rows: 1,855
## Columns: 12
## $ seqn     <dbl> 85443, 87007, 87510, 90121, 90384, 90814, 92358, 92970, 83...
## $ riagendr <fct> Female, Female, Female, Female, Female, Female, Female, Fe...
## $ ridageyr <dbl> 48, 45, 29, 44, 47, 80, 21, 25, 53, 78, 72, 45, 67, 67, 57...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White...
## $ dmdeduc2 <fct> High school graduate/GED, Some college or AA degrees, Coll...
## $ dmdmartl <fct> Married, Married, Married, Married, Living with partner, M...
## $ indhhin2 <ord> "$75,000-$99,999", "$75,000-$99,999", "$75,000-$99,999", "...
## $ bmxbmi   <dbl> 27.1, 39.4, 22.6, 45.2, 28.1, 33.3, 21.8, 26.8, 30.8, 28.8...
## $ diq010   <fct> No Diabetes, No Diabetes, No Diabetes, No Diabetes, No Dia...
## $ lbxglu   <dbl> 97, 207, 97, 104, 107, 102, 80, 92, 101, 84, 107, 84, 130,...
## $ Diabetes <dbl> 7.884915e-12, 7.885026e-12, 7.884915e-12, 7.884915e-12, 7....
## $ estimate <fct> No Diabetes, No Diabetes, No Diabetes, No Diabetes, No Dia...

7.2 Test Function

7.2.1 Get AUC

Get_Errors_Function(f3)

## $AUC
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.548
## 
## $ROC_CURVE
## # A tibble: 8 x 3
##    .threshold specificity sensitivity
##         <dbl>       <dbl>       <dbl>
## 1 -Inf              0         1      
## 2    7.88e-12       0         1      
## 3    7.89e-12       0.799     0.343  
## 4    7.89e-12       0.806     0.104  
## 5    1.00e+ 0       0.806     0.0964 
## 6    1.00e+ 0       0.806     0.0964 
## 7    1.00e+ 0       0.999     0.00714
## 8  Inf              1         0      
## 
## $CONF_MAT
##              Truth
## Prediction    Diabetes No Diabetes
##   Diabetes          27         306
##   No Diabetes      253        1269
## 
## $SUM.CONF_MAT
## # A tibble: 13 x 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary        0.699 
##  2 kap                  binary       -0.0908
##  3 sens                 binary        0.0964
##  4 spec                 binary        0.806 
##  5 ppv                  binary        0.0811
##  6 npv                  binary        0.834 
##  7 mcc                  binary       -0.0913
##  8 j_index              binary       -0.0979
##  9 bal_accuracy         binary        0.451 
## 10 detection_prevalence binary        0.180 
## 11 precision            binary        0.0811
## 12 recall               binary        0.0964
## 13 f_meas               binary        0.0881

SAMPLE_3.ERRORS <- Get_Errors_Function(f3) 

str(SAMPLE_3.ERRORS,1)

## List of 4
##  $ AUC         : tibble [1 x 3] (S3: tbl_df/tbl/data.frame)
##  $ ROC_CURVE   : roc_df [8 x 3] (S3: roc_df/tbl_df/tbl/data.frame)
##  $ CONF_MAT    :List of 2
##   ..- attr(*, "class")= chr "conf_mat"
##  $ SUM.CONF_MAT: tibble [13 x 3] (S3: tbl_df/tbl/data.frame)

SAMPLE_3.ERRORS$AUC

## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.548

8 Apply `Get_Errors_Function`

8.1 Model 3 Error By Classes

(f3 %>% 
  group_by(riagendr) %>%
  Get_Errors_Function())$AUC

## # A tibble: 2 x 4
##   riagendr .metric .estimator .estimate
##   <fct>    <chr>   <chr>          <dbl>
## 1 Male     roc_auc binary         0.538
## 2 Female   roc_auc binary         0.561

8.1.1 Sex

  (f3 %>% 
  group_by(riagendr) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(riagendr) %>%
  ggplot(aes(x=riagendr,
             y=AUC_EST,
             fill=riagendr)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip()

8.1.2 Sex and Income

(f3 %>% 
  group_by(riagendr , indhhin2) %>%
  Get_Errors_Function())$AUC

## # A tibble: 24 x 5
##    riagendr indhhin2          .metric .estimator .estimate
##    <fct>    <ord>             <chr>   <chr>          <dbl>
##  1 Male     $0-$4,999         roc_auc binary         0.479
##  2 Male     $5,000-$9,999     roc_auc binary         0.449
##  3 Male     $10,000-$14,999   roc_auc binary         0.511
##  4 Male     $15,000-$19,999   roc_auc binary         0.508
##  5 Male     less than $20,000 roc_auc binary         0.643
##  6 Male     20,000+           roc_auc binary         0.370
##  7 Male     $20,000-$24,999   roc_auc binary         0.555
##  8 Male     $25,000-$34,999   roc_auc binary         0.554
##  9 Male     $45,000-$54,999   roc_auc binary         0.5  
## 10 Male     $65,000-$74,999   roc_auc binary         0.504
## # ... with 14 more rows

(f3 %>% 
  group_by(riagendr, indhhin2) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(riagendr) %>%
  ggplot(aes(x=indhhin2,
             y=AUC_EST,
             fill=riagendr)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip()

(f3 %>% 
  group_by(riagendr, indhhin2) %>%
  Get_Errors_Function())$ROC_CURVE %>%
  autoplot() +
  labs( title = "ROC Curves by Sex and Income")

8.1.3 Income and Ethnicity

(f3 %>% 
  group_by(riagendr, ridreth1) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(riagendr) %>%
  ggplot(aes(x=ridreth1,
             y=AUC_EST,
             fill=riagendr)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip()

8.1.3.1 Not all levels may be available

(f3 %>% 
  group_by(riagendr, indhhin2, ridreth1) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(riagendr) %>%
  ggplot(aes(x=indhhin2,
             y=AUC_EST,
             fill=riagendr)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip() +
  facet_wrap( ~ ridreth1)

## Error: Problem with `summarise()` input `.estimate`.
## x No case observation.
## i Input `.estimate` is `metric_fn(...)`.
## i The error occurred in group 5: riagendr = "Male", indhhin2 = "$0-$4,999", ridreth1 = "Other".

8.1.4 Confusion Matricies

(f3 %>% 
  group_by(riagendr, indhhin2) %>%
  Get_Errors_Function())$CONF_MAT

## # A tibble: 24 x 3
## # Groups:   riagendr [2]
##    riagendr indhhin2          conf_mat  
##    <fct>    <ord>             <list>    
##  1 Male     $0-$4,999         <conf_mat>
##  2 Male     $5,000-$9,999     <conf_mat>
##  3 Male     $10,000-$14,999   <conf_mat>
##  4 Male     $15,000-$19,999   <conf_mat>
##  5 Male     less than $20,000 <conf_mat>
##  6 Male     20,000+           <conf_mat>
##  7 Male     $20,000-$24,999   <conf_mat>
##  8 Male     $25,000-$34,999   <conf_mat>
##  9 Male     $45,000-$54,999   <conf_mat>
## 10 Male     $65,000-$74,999   <conf_mat>
## # ... with 14 more rows

Sum_Conf_T3_Example <- (f3 %>% 
  group_by(riagendr, indhhin2) %>%
  Get_Errors_Function())$SUM.CONF_MAT

Sum_Conf_T3_Example

##    riagendr               indhhin2 
##  Male  :12   $0-$4,999        : 2  
##  Female:12   $5,000-$9,999    : 2  
##              $10,000-$14,999  : 2  
##              $15,000-$19,999  : 2  
##              less than $20,000: 2  
##              20,000+          : 2  
##              (Other)          :12  
##                                    
##                                    
##                                    
##                                    
##                                    
##                                    
##                                    
##                                    
##                                    
##                                    
##                                    
##                                    
##                                    
##                                    
##                                    
##                                    
##                                    
##  conf_mat.Length  conf_mat.Class  conf_mat.Mode
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list                      
##  2         conf_mat  list

9 Random Error Model

f2 <- diab_pop %>% 
  filter(!seqn %in% Id2)


nrow(diab_pop) #1876

## [1] 1876

nrow(t2) #

## [1] 938

nrow(f2) #

## [1] 938

nrow(f2) + nrow(t2) == nrow(diab_pop)

## [1] TRUE

arsenal::comparedf(t2,f2,by=c('seqn'))

## Compare Object
## 
## Function Call: 
## arsenal::comparedf(x = t2, y = f2, by = c("seqn"))
## 
## Shared: 9 non-by variables and 0 observations.
## Not shared: 0 variables and 1876 observations.
## 
## Differences found in 0/9 variables compared.
## 0 variables compared have non-identical attributes.

compare_df_obj <- arsenal::comparedf(t2,f2,by=c('seqn'))

summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table

##                                                      statistic value
## 1                                       Number of by-variables     1
## 2                         Number of non-by variables in common     9
## 3                                 Number of variables compared     9
## 4                           Number of variables in x but not y     0
## 5                           Number of variables in y but not x     0
## 6        Number of variables compared with some values unequal     0
## 7           Number of variables compared with all values equal     9
## 8                             Number of observations in common     0
## 9                        Number of observations in x but not y   938
## 10                       Number of observations in y but not x   938
## 11 Number of observations with some compared variables unequal     0
## 12    Number of observations with all compared variables equal     0
## 13                                    Number of values unequal     0

f2$estimate <- predict(X2$gml.model, f2,'raw')
f2$Diabetes <- predict(X2$gml.model, f2,'prob')$Diabetes

10 Compare Random to Swan

TEST.Scored_stacked <- bind_rows(
  f2 %>% mutate(model = 'random'),
  f3 %>% mutate(model = 'black_swan')
)


(TEST.Scored_stacked %>%
  group_by(model) %>%
  Get_Errors_Function())$AUC

## # A tibble: 2 x 4
##   model      .metric .estimator .estimate
##   <chr>      <chr>   <chr>          <dbl>
## 1 black_swan roc_auc binary         0.548
## 2 random     roc_auc binary         0.835

10.0.1 By Model by Sex

  (TEST.Scored_stacked %>% 
  group_by(model, riagendr) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(model, riagendr) %>%
  ggplot(aes(x=riagendr,
             y=AUC_EST,
             fill=model)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip()

10.0.2 By Model by Sex by Income

  (TEST.Scored_stacked %>% 
  group_by(model, riagendr,indhhin2) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(model, riagendr, indhhin2) %>%
  ggplot(aes(x=riagendr,
             y=AUC_EST,
             fill=indhhin2)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip() +
  facet_wrap(model~.)

10.0.3 By Model, Sex, & Ethnicity

  (TEST.Scored_stacked %>% 
  group_by(model, riagendr, ridreth1) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(model, riagendr, ridreth1) %>%
  ggplot(aes(x=riagendr,
             y=AUC_EST,
             fill=ridreth1)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip() +
  facet_wrap(model~.)

  (TEST.Scored_stacked %>% 
  group_by(model, riagendr, ridreth1) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(model, riagendr, ridreth1) %>%
  ggplot(aes(x=riagendr,
             y=AUC_EST,
             fill=model)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip() +
  facet_wrap(ridreth1~.)

11 Code Appendix

$~$

set.seed(1701)

library('tidyverse')
library('caret')
library('yardstick')
library('ggplot2')


diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
  na.omit() 

glimpse(diab_pop)

levels(diab_pop$indhhin2)

income_levels <- levels(diab_pop$indhhin2)


levels = c("$0-$4,999", 
           "$5,000-$9,999", 
           "$10,000-$14,999",
           "$15,000-$19,999",
           "less than $20,000",
           "20,000+", 
           "$20,000-$24,999",
           "$25,000-$34,999",
           "$35,000-$44,999",
           "$45,000-$54,999",
           "$55,000-$64,999",
           "$65,000-$74,999",
           "$75,000-$99,999",
           "$100,000+"
            )

setdiff(income_levels, levels)

diab_pop$indhhin2 <- factor(diab_pop$indhhin2 ,
                            levels=levels,
                            ordered = TRUE)

odered_levels <- levels(diab_pop$indhhin2)

glimpse(diab_pop) 

feature_names <- c('riagendr' , 'ridreth1' , 'dmdeduc2' , 'dmdmartl' , 'indhhin2' , 'lbxglu','bmxbmi')

feature_names_plus <- paste(feature_names, collapse = ' + ' )

feature_names_plus

formula_1 <- as.formula(paste0('diq010 ~ ',feature_names_plus))

formula_1

# THIS IS NOT A GREAT IDEA 

options(warn=-1)

# I have this on, there is an expected warning 
## "prediction from a rank-deficient fit may be misleading"
## without this option on the output is very difficult to read

Train_Glm_Iteration <- function(data){
  
  TrainInd <- createDataPartition(data$diq010,
                                  p =.7,
                                  list=FALSE)

  TRAIN <- data[TrainInd, ] 
  
 bootstrap <- trainControl(method="boot", number=42)
  
  gml.model <- train(as.formula(formula_1),
    method='glm',
    data =TRAIN,
    family='binomial',
    preProcess = c('center','scale'),
    trControl=bootstrap
    )
  

  
  CoEff <-  as_tibble(gml.model$finalModel$coefficients, rownames="feature") %>%
    rename(coeff = value)
  
  TEST <- data[-TrainInd,]
  
  estimate <- predict(gml.model, TEST,'raw') 
  
  prob <- predict(gml.model, TEST,'prob')
  
  TEST.scored <- cbind(TEST, estimate, prob)
  
  return(list(Training_Data = TRAIN,
              gml.model = gml.model,
              CoEff = CoEff,
              TEST.scored =TEST.scored)
         )
  
}
Id <- sample(diab_pop$seqn, nrow(diab_pop)*.3, replace=F)
length(Id)

t1 <- diab_pop %>% 
  filter(seqn %in% Id)

dim(t1)

X1 <- Train_Glm_Iteration(t1)

str(X1,1)

X1$TEST.scored %>%
  roc_auc(truth= diq010 , Diabetes)

X1$TEST.scored %>%
  roc_curve(truth= diq010 , Diabetes) %>%
  autoplot()

conf_matX1 <- X1$TEST.scored %>%
  conf_mat(truth= diq010 , estimate)

conf_matX1

sum.conf_matX1  <- summary(conf_matX1)

sum.conf_matX1


nrow(X1$Training_Data) + nrow(X1$TEST.scored) == nrow(t1)

X1.comparedf <- arsenal::comparedf(X1$Training_Data, X1$TEST.scored, by=c('seqn')) 

sum.X1.comparedf <- summary(X1.comparedf)

sum.X1.comparedf$comparison.summary.table
Id2 <- sample(diab_pop$seqn, nrow(diab_pop)*.5, replace=F)

t2 <- diab_pop %>% 
  filter(seqn %in% Id2)

X2 <- Train_Glm_Iteration(t2)
Swan <- diab_pop %>% 
  filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' &  ridreth1 == 'Non-Hispanic White') 

Id3 <- sample(Swan$seqn, nrow(Swan)*.8, replace=F)

t3 <- diab_pop %>% 
  filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' &  ridreth1 == 'Non-Hispanic White') %>%
  filter(seqn %in% Id3)

t3 %>% summary()

X3 <- Train_Glm_Iteration(t3)
Id4 <- sample(diab_pop$seqn, nrow(diab_pop)*.9, replace=F)

t4 <- diab_pop %>% 
  filter(seqn %in% Id4)

X4 <- Train_Glm_Iteration(t4)
M_union <- union(Id2,Id3)

Id5 <- setdiff(diab_pop$seqn, M_union)


t5 <- diab_pop %>% 
  filter(seqn %in% Id5)


X5 <- Train_Glm_Iteration(t5)
str(X2$Training_Data)
str(X3$Training_Data)
str(X5$Training_Data)


arsenal::comparedf(X3$Training_Data,
                   X5$Training_Data)
X1$CoEff

CoEff_compare <- bind_rows(X1$CoEff %>% mutate(strat = 't1'),
          X2$CoEff %>% mutate(strat = 't2'),
          X3$CoEff %>% mutate(strat = 't3'),
          X4$CoEff %>% mutate(strat = 't4'),
          X5$CoEff %>% mutate(strat = 't5'))


glimpse(CoEff_compare)
CoEff_compare %>%
  group_by(strat) %>%
  ggplot(aes(x=feature, y=coeff)) +
  geom_point() + 
  coord_flip() +
  facet_wrap(.~strat)
CoEff_compare %>%
  ggplot(aes(x=feature, y=coeff)) +
  geom_boxplot() + 
  coord_flip()

str(X1,1)

glimpse(X1$TEST.scored)

look <- as_tibble(X1$TEST.scored)

look %>%
  roc_auc(truth=diq010, Diabetes)

look %>%
  roc_curve(truth=diq010, Diabetes) %>%
  autoplot()

look %>%
  conf_mat(truth=diq010, estimate)

look %>%
  conf_mat(truth=diq010, estimate) %>%
  summary()

Get_Errors_Function <- function(data){

  look <- data
  
  AUC <- look %>%
    roc_auc(truth=diq010, Diabetes)
  
  ROC_CURVE <- look %>%
    roc_curve(truth=diq010, Diabetes) 
  
  CONF_MAT <- look %>%
    conf_mat(truth=diq010, estimate)
  
  SUM.CONF_MAT <- look %>%
    conf_mat(truth=diq010, estimate) %>%
    summary()
  
  output = list(
    AUC = AUC,
    ROC_CURVE = ROC_CURVE,
    CONF_MAT = CONF_MAT,
    SUM.CONF_MAT = SUM.CONF_MAT
  )
  
  return(output)
}

f3 <- diab_pop %>% 
  anti_join(t3 %>% select(seqn))

glimpse(f3)
nrow(diab_pop) #1876
nrow(t3) # 
nrow(f3) #

nrow(f3) + nrow(t3) == nrow(diab_pop)

arsenal::comparedf(t3,f3,by=c('seqn'))

compare_df_obj <- arsenal::comparedf(t3,f3,by=c('seqn'))

summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table
nrow(f3) + ( nrow(X3$TEST.scored) + nrow(X3$Training_Data) ) == nrow(diab_pop)

arsenal::comparedf(X3$TEST.scored,f3,by=c('seqn'))

compare_df_obj <- arsenal::comparedf(X3$TEST.scored,f3,by=c('seqn'))

summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table

f3 <- bind_rows(X3$TEST.scored %>% select(colnames(diab_pop)),
                f3)
str(predict(X3$gml.model, f3,'prob'),1)

f3$Diabetes <- predict(X3$gml.model, f3,'prob')$Diabetes

glimpse(f3)

str(predict(X3$gml.model, f3,'raw'),1)

f3$estimate <- predict(X3$gml.model, f3,'raw')

glimpse(f3)


Get_Errors_Function(f3)

SAMPLE_3.ERRORS <- Get_Errors_Function(f3) 

str(SAMPLE_3.ERRORS,1)

SAMPLE_3.ERRORS$AUC
(f3 %>% 
  group_by(riagendr) %>%
  Get_Errors_Function())$AUC
  (f3 %>% 
  group_by(riagendr) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(riagendr) %>%
  ggplot(aes(x=riagendr,
             y=AUC_EST,
             fill=riagendr)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip() 


(f3 %>% 
  group_by(riagendr , indhhin2) %>%
  Get_Errors_Function())$AUC

(f3 %>% 
  group_by(riagendr, indhhin2) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(riagendr) %>%
  ggplot(aes(x=indhhin2,
             y=AUC_EST,
             fill=riagendr)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip()

(f3 %>% 
  group_by(riagendr, indhhin2) %>%
  Get_Errors_Function())$ROC_CURVE %>%
  autoplot() +
  labs( title = "ROC Curves by Sex and Income")


(f3 %>% 
  group_by(riagendr, ridreth1) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(riagendr) %>%
  ggplot(aes(x=ridreth1,
             y=AUC_EST,
             fill=riagendr)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip()

(f3 %>% 
  group_by(riagendr, indhhin2, ridreth1) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(riagendr) %>%
  ggplot(aes(x=indhhin2,
             y=AUC_EST,
             fill=riagendr)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip() +
  facet_wrap( ~ ridreth1)
(f3 %>% 
  group_by(riagendr, indhhin2) %>%
  Get_Errors_Function())$CONF_MAT 


Sum_Conf_T3_Example <- (f3 %>% 
  group_by(riagendr, indhhin2) %>%
  Get_Errors_Function())$SUM.CONF_MAT

Sum_Conf_T3_Example

f2 <- diab_pop %>% 
  filter(!seqn %in% Id2)


nrow(diab_pop) #1876
nrow(t2) # 
nrow(f2) #

nrow(f2) + nrow(t2) == nrow(diab_pop)

arsenal::comparedf(t2,f2,by=c('seqn'))

compare_df_obj <- arsenal::comparedf(t2,f2,by=c('seqn'))

summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table


f2$estimate <- predict(X2$gml.model, f2,'raw')
f2$Diabetes <- predict(X2$gml.model, f2,'prob')$Diabetes


TEST.Scored_stacked <- bind_rows(
  f2 %>% mutate(model = 'random'),
  f3 %>% mutate(model = 'black_swan')
)


(TEST.Scored_stacked %>%
  group_by(model) %>%
  Get_Errors_Function())$AUC


  (TEST.Scored_stacked %>% 
  group_by(model, riagendr) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(model, riagendr) %>%
  ggplot(aes(x=riagendr,
             y=AUC_EST,
             fill=model)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip() 

  (TEST.Scored_stacked %>% 
  group_by(model, riagendr,indhhin2) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(model, riagendr, indhhin2) %>%
  ggplot(aes(x=riagendr,
             y=AUC_EST,
             fill=indhhin2)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip() +
  facet_wrap(model~.)


  (TEST.Scored_stacked %>% 
  group_by(model, riagendr, ridreth1) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(model, riagendr, ridreth1) %>%
  ggplot(aes(x=riagendr,
             y=AUC_EST,
             fill=ridreth1)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip() +
  facet_wrap(model~.)


  (TEST.Scored_stacked %>% 
  group_by(model, riagendr, ridreth1) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(model, riagendr, ridreth1) %>%
  ggplot(aes(x=riagendr,
             y=AUC_EST,
             fill=model)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip() +
  facet_wrap(ridreth1~.)

Resampling Samples - Classification with `caret` `glm` logit

J Kyle Armstrong, PhD

04 February 2021

1 STEP UP

1.1 WARNING - THIS IS A BAD OPTION

2 `caret` `glm` logit `train` `function`

3 Make Samples

3.1 SAMPLE 1

3.2 SAMPLE 2

3.3 SAMPLE 3 - “black swan”

3.3.1 Income == ‘$75,000-$99,999’ & Gender == ‘Female’ & ridreth1 == ‘Non-Hispanic White’

3.4 SAMPLE 4

3.5 SAMPLE 5

3.5.1 Compare SAMPLE 1 to SAMPLE 5

4 Compare Coefficents across all samples

5 Review Ouput

6 create `Get_Errors_Function`

7 Test on Sample 3

7.1 Predict

7.1.1 Probs

7.1.2 Predict Class

7.2 Test Function

7.2.1 Get AUC

8 Apply `Get_Errors_Function`

8.1 Model 3 Error By Classes

8.1.1 Sex

8.1.2 Sex and Income

8.1.3 Income and Ethnicity

8.1.3.1 Not all levels may be available

8.1.4 Confusion Matricies

9 Random Error Model

10 Compare Random to Swan

10.0.1 By Model by Sex

10.0.2 By Model by Sex by Income

10.0.3 By Model, Sex, & Ethnicity

11 Code Appendix

Resampling Samples - Classification with caret glm logit

J Kyle Armstrong, PhD

04 February 2021

1 STEP UP

1.1 WARNING - THIS IS A BAD OPTION

2 caret glm logit train function

3 Make Samples

3.1 SAMPLE 1

3.2 SAMPLE 2

3.3 SAMPLE 3 - “black swan”

3.3.1 Income == ‘$75,000-$99,999’ & Gender == ‘Female’ & ridreth1 == ‘Non-Hispanic White’

3.4 SAMPLE 4

3.5 SAMPLE 5

3.5.1 Compare SAMPLE 1 to SAMPLE 5

4 Compare Coefficents across all samples

5 Review Ouput

6 create Get_Errors_Function

7 Test on Sample 3

7.1 Predict

7.1.1 Probs

7.1.2 Predict Class

7.2 Test Function

7.2.1 Get AUC

8 Apply Get_Errors_Function

8.1 Model 3 Error By Classes

8.1.1 Sex

8.1.2 Sex and Income

8.1.3 Income and Ethnicity

8.1.3.1 Not all levels may be available

8.1.4 Confusion Matricies

9 Random Error Model

10 Compare Random to Swan

10.0.1 By Model by Sex

10.0.2 By Model by Sex by Income

10.0.3 By Model, Sex, & Ethnicity

11 Code Appendix

Resampling Samples - Classification with `caret` `glm` logit

2 `caret` `glm` logit `train` `function`

6 create `Get_Errors_Function`

8 Apply `Get_Errors_Function`