\(~\)

\(~\)


\(~\)

\(~\)

1 Install if not Function

install_if_not <- function( list.of.packages ) {
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}
install_if_not("NHANES")
## [1] "the package 'NHANES' is already installed"
install_if_not("RANN")
## [1] "the package 'RANN' is already installed"

2 NHANES Dataset

library("tidyverse")
## -- Attaching packages -------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ----------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("NHANES")
## Warning: package 'NHANES' was built under R version 3.6.3
data <- NHANES %>% select(-DiabetesAge)

dim(data)
## [1] 10000    75
glimpse(data)
## Observations: 10,000
## Variables: 75
## $ ID               <int> 51624, 51624, 51624, 51625, 51630, 51638, 51646, 5...
## $ SurveyYr         <fct> 2009_10, 2009_10, 2009_10, 2009_10, 2009_10, 2009_...
## $ Gender           <fct> male, male, male, male, female, male, male, female...
## $ Age              <int> 34, 34, 34, 4, 49, 9, 8, 45, 45, 45, 66, 58, 54, 1...
## $ AgeDecade        <fct>  30-39,  30-39,  30-39,  0-9,  40-49,  0-9,  0-9, ...
## $ AgeMonths        <int> 409, 409, 409, 49, 596, 115, 101, 541, 541, 541, 7...
## $ Race1            <fct> White, White, White, Other, White, White, White, W...
## $ Race3            <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Education        <fct> High School, High School, High School, NA, Some Co...
## $ MaritalStatus    <fct> Married, Married, Married, NA, LivePartner, NA, NA...
## $ HHIncome         <fct> 25000-34999, 25000-34999, 25000-34999, 20000-24999...
## $ HHIncomeMid      <int> 30000, 30000, 30000, 22500, 40000, 87500, 60000, 8...
## $ Poverty          <dbl> 1.36, 1.36, 1.36, 1.07, 1.91, 1.84, 2.33, 5.00, 5....
## $ HomeRooms        <int> 6, 6, 6, 9, 5, 6, 7, 6, 6, 6, 5, 10, 6, 10, 10, 4,...
## $ HomeOwn          <fct> Own, Own, Own, Own, Rent, Rent, Own, Own, Own, Own...
## $ Work             <fct> NotWorking, NotWorking, NotWorking, NA, NotWorking...
## $ Weight           <dbl> 87.4, 87.4, 87.4, 17.0, 86.7, 29.8, 35.2, 75.7, 75...
## $ Length           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ HeadCirc         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Height           <dbl> 164.7, 164.7, 164.7, 105.4, 168.4, 133.1, 130.6, 1...
## $ BMI              <dbl> 32.22, 32.22, 32.22, 15.30, 30.57, 16.82, 20.64, 2...
## $ BMICatUnder20yrs <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ BMI_WHO          <fct> 30.0_plus, 30.0_plus, 30.0_plus, 12.0_18.5, 30.0_p...
## $ Pulse            <int> 70, 70, 70, NA, 86, 82, 72, 62, 62, 62, 60, 62, 76...
## $ BPSysAve         <int> 113, 113, 113, NA, 112, 86, 107, 118, 118, 118, 11...
## $ BPDiaAve         <int> 85, 85, 85, NA, 75, 47, 37, 64, 64, 64, 63, 74, 85...
## $ BPSys1           <int> 114, 114, 114, NA, 118, 84, 114, 106, 106, 106, 12...
## $ BPDia1           <int> 88, 88, 88, NA, 82, 50, 46, 62, 62, 62, 64, 76, 86...
## $ BPSys2           <int> 114, 114, 114, NA, 108, 84, 108, 118, 118, 118, 10...
## $ BPDia2           <int> 88, 88, 88, NA, 74, 50, 36, 68, 68, 68, 62, 72, 88...
## $ BPSys3           <int> 112, 112, 112, NA, 116, 88, 106, 118, 118, 118, 11...
## $ BPDia3           <int> 82, 82, 82, NA, 76, 44, 38, 60, 60, 60, 64, 76, 82...
## $ Testosterone     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ DirectChol       <dbl> 1.29, 1.29, 1.29, NA, 1.16, 1.34, 1.55, 2.12, 2.12...
## $ TotChol          <dbl> 3.49, 3.49, 3.49, NA, 6.70, 4.86, 4.09, 5.82, 5.82...
## $ UrineVol1        <int> 352, 352, 352, NA, 77, 123, 238, 106, 106, 106, 11...
## $ UrineFlow1       <dbl> NA, NA, NA, NA, 0.094, 1.538, 1.322, 1.116, 1.116,...
## $ UrineVol2        <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ UrineFlow2       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Diabetes         <fct> No, No, No, No, No, No, No, No, No, No, No, No, No...
## $ HealthGen        <fct> Good, Good, Good, NA, Good, NA, NA, Vgood, Vgood, ...
## $ DaysPhysHlthBad  <int> 0, 0, 0, NA, 0, NA, NA, 0, 0, 0, 10, 0, 4, NA, NA,...
## $ DaysMentHlthBad  <int> 15, 15, 15, NA, 10, NA, NA, 3, 3, 3, 0, 0, 0, NA, ...
## $ LittleInterest   <fct> Most, Most, Most, NA, Several, NA, NA, None, None,...
## $ Depressed        <fct> Several, Several, Several, NA, Several, NA, NA, No...
## $ nPregnancies     <int> NA, NA, NA, NA, 2, NA, NA, 1, 1, 1, NA, NA, NA, NA...
## $ nBabies          <int> NA, NA, NA, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ Age1stBaby       <int> NA, NA, NA, NA, 27, NA, NA, NA, NA, NA, NA, NA, NA...
## $ SleepHrsNight    <int> 4, 4, 4, NA, 8, NA, NA, 8, 8, 8, 7, 5, 4, NA, 5, 7...
## $ SleepTrouble     <fct> Yes, Yes, Yes, NA, Yes, NA, NA, No, No, No, No, No...
## $ PhysActive       <fct> No, No, No, NA, No, NA, NA, Yes, Yes, Yes, Yes, Ye...
## $ PhysActiveDays   <int> NA, NA, NA, NA, NA, NA, NA, 5, 5, 5, 7, 5, 1, NA, ...
## $ TVHrsDay         <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ CompHrsDay       <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ TVHrsDayChild    <int> NA, NA, NA, 4, NA, 5, 1, NA, NA, NA, NA, NA, NA, 4...
## $ CompHrsDayChild  <int> NA, NA, NA, 1, NA, 0, 6, NA, NA, NA, NA, NA, NA, 3...
## $ Alcohol12PlusYr  <fct> Yes, Yes, Yes, NA, Yes, NA, NA, Yes, Yes, Yes, Yes...
## $ AlcoholDay       <int> NA, NA, NA, NA, 2, NA, NA, 3, 3, 3, 1, 2, 6, NA, N...
## $ AlcoholYear      <int> 0, 0, 0, NA, 20, NA, NA, 52, 52, 52, 100, 104, 364...
## $ SmokeNow         <fct> No, No, No, NA, Yes, NA, NA, NA, NA, NA, No, NA, N...
## $ Smoke100         <fct> Yes, Yes, Yes, NA, Yes, NA, NA, No, No, No, Yes, N...
## $ Smoke100n        <fct> Smoker, Smoker, Smoker, NA, Smoker, NA, NA, Non-Sm...
## $ SmokeAge         <int> 18, 18, 18, NA, 38, NA, NA, NA, NA, NA, 13, NA, NA...
## $ Marijuana        <fct> Yes, Yes, Yes, NA, Yes, NA, NA, Yes, Yes, Yes, NA,...
## $ AgeFirstMarij    <int> 17, 17, 17, NA, 18, NA, NA, 13, 13, 13, NA, 19, 15...
## $ RegularMarij     <fct> No, No, No, NA, No, NA, NA, No, No, No, NA, Yes, Y...
## $ AgeRegMarij      <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 20, 15...
## $ HardDrugs        <fct> Yes, Yes, Yes, NA, Yes, NA, NA, No, No, No, No, Ye...
## $ SexEver          <fct> Yes, Yes, Yes, NA, Yes, NA, NA, Yes, Yes, Yes, Yes...
## $ SexAge           <int> 16, 16, 16, NA, 12, NA, NA, 13, 13, 13, 17, 22, 12...
## $ SexNumPartnLife  <int> 8, 8, 8, NA, 10, NA, NA, 20, 20, 20, 15, 7, 100, N...
## $ SexNumPartYear   <int> 1, 1, 1, NA, 1, NA, NA, 0, 0, 0, NA, 1, 1, NA, NA,...
## $ SameSex          <fct> No, No, No, NA, Yes, NA, NA, Yes, Yes, Yes, No, No...
## $ SexOrientation   <fct> Heterosexual, Heterosexual, Heterosexual, NA, Hete...
## $ PregnantNow      <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...

2.1 Missing data

Amelia::missmap(data)
## Warning: Unknown or uninitialised column: 'arguments'.

## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'imputations'.

SumNa <- function(col){sum(is.na(col))}

data.sum <- data %>% 
    summarise_all(SumNa) %>%
    tidyr::gather(key='feature', value='SumNa') %>%
    arrange(-SumNa) %>%
    mutate(PctNa = SumNa/nrow(data))

data.sum
## # A tibble: 75 x 3
##    feature          SumNa PctNa
##    <chr>            <int> <dbl>
##  1 HeadCirc          9912 0.991
##  2 Length            9457 0.946
##  3 TVHrsDayChild     9347 0.935
##  4 CompHrsDayChild   9347 0.935
##  5 BMICatUnder20yrs  8726 0.873
##  6 AgeRegMarij       8634 0.863
##  7 UrineFlow2        8524 0.852
##  8 UrineVol2         8522 0.852
##  9 PregnantNow       8304 0.830
## 10 Age1stBaby        8116 0.812
## # ... with 65 more rows
data.sum2 <- data.sum %>% 
  filter(! (feature %in% c('ID','Diabetes'))) %>%
  filter(PctNa < .45)


data.sum2$feature
##  [1] "SexAge"          "SexNumPartnLife" "HardDrugs"       "SexEver"        
##  [5] "SameSex"         "AlcoholYear"     "Alcohol12PlusYr" "LittleInterest" 
##  [9] "Depressed"       "Education"       "MaritalStatus"   "Smoke100"       
## [13] "Smoke100n"       "DaysPhysHlthBad" "DaysMentHlthBad" "HealthGen"      
## [17] "SleepHrsNight"   "Work"            "SleepTrouble"    "BPSys1"         
##  [ reached getOption("max.print") -- omitted 27 entries ]
data_F <- data %>% 
  select(ID, Diabetes, data.sum2$feature) %>%
  filter(!is.na(Diabetes))

Amelia::missmap(data_F)
## Warning: Unknown or uninitialised column: 'arguments'.

## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'imputations'.

3 Split Data

library('rsample')

set.seed(8675309)

train_test_split <- initial_split(data_F, prop = 3/4) 

TRAIN <- training(train_test_split)
TEST <- testing(train_test_split)

4 caret

library('caret')
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

4.1 dummyVars

dV <- dummyVars(Diabetes ~ .,  TRAIN)

dV
## Dummy Variable Object
## 
## Formula: Diabetes ~ .
## 49 variables, 22 factors
## Variables and levels will be separated by '.'
## A less than full rank encoding is used
dV.TRAIN <- predict(dV, TRAIN)
## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev =
## object$lvls): variable 'Diabetes' is not a factor
dV.TRAIN
##         ID SexAge SexNumPartnLife HardDrugs.No HardDrugs.Yes SexEver.No
##      SexEver.Yes SameSex.No SameSex.Yes AlcoholYear Alcohol12PlusYr.No
##      Alcohol12PlusYr.Yes LittleInterest.None LittleInterest.Several
##      LittleInterest.Most Depressed.None Depressed.Several Depressed.Most
##      Education.8th Grade Education.9 - 11th Grade Education.High School
##      Education.Some College Education.College Grad MaritalStatus.Divorced
##      MaritalStatus.LivePartner MaritalStatus.Married MaritalStatus.NeverMarried
##      MaritalStatus.Separated MaritalStatus.Widowed Smoke100.No Smoke100.Yes
##      Smoke100n.Non-Smoker Smoke100n.Smoker DaysPhysHlthBad DaysMentHlthBad
##      HealthGen.Excellent HealthGen.Vgood HealthGen.Good HealthGen.Fair
##      HealthGen.Poor SleepHrsNight Work.Looking Work.NotWorking Work.Working
##      SleepTrouble.No SleepTrouble.Yes BPSys1 BPDia1 PhysActive.No
##      PhysActive.Yes BPSys2 BPDia2 BPSys3 BPDia3 UrineFlow1 DirectChol TotChol
##      BPSysAve BPDiaAve Pulse UrineVol1 HHIncome. 0-4999 HHIncome. 5000-9999
##      HHIncome.10000-14999 HHIncome.15000-19999 HHIncome.20000-24999
##      HHIncome.25000-34999 HHIncome.35000-44999 HHIncome.45000-54999
##      HHIncome.55000-64999 HHIncome.65000-74999 HHIncome.75000-99999
##      HHIncome.more 99999 HHIncomeMid Poverty BMI_WHO.12.0_18.5
##      BMI_WHO.18.5_to_24.9 BMI_WHO.25.0_to_29.9 BMI_WHO.30.0_plus   BMI Height
##      AgeDecade. 0-9 AgeDecade. 10-19 AgeDecade. 20-29 AgeDecade. 30-39
##      AgeDecade. 40-49 AgeDecade. 50-59 AgeDecade. 60-69 AgeDecade. 70+ Weight
##      HomeRooms HomeOwn.Own HomeOwn.Rent HomeOwn.Other SurveyYr.2009_10
##      SurveyYr.2011_12 Gender.female Gender.male Age Race1.Black Race1.Hispanic
##      Race1.Mexican Race1.White Race1.Other
##  [ reached getOption("max.print") -- omitted 7394 rows ]

4.2 impute, center, & scale

pP <- preProcess(dV.TRAIN, method = c('knnImpute', 'center', 'scale'))
pP
## Created from 2885 samples and 104 variables
## 
## Pre-processing:
##   - centered (104)
##   - ignored (0)
##   - 5 nearest neighbor imputation (104)
##   - scaled (104)
pP.dV.TRAIN <- predict(pP, dV.TRAIN)

4.3 train model

trControl <- trainControl(method = 'repeatedcv',
                          number = 5,
                          repeats =  5,
                          search = 'random')

logit.CV <- train(x= pP.dV.TRAIN , y= TRAIN$Diabetes, 
                  method = 'glmnet',
                  trControl = trControl,
                  family = 'binomial' )
logit.CV
## glmnet 
## 
## 7394 samples
##  104 predictor
##    2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 5915, 5915, 5915, 5915, 5916, 5915, ... 
## Resampling results across tuning parameters:
## 
##   alpha      lambda       Accuracy   Kappa    
##   0.1517191  2.886218362  0.9242629  0.0000000
##   0.3508348  4.571764116  0.9242629  0.0000000
##   0.7249268  0.007131488  0.9261291  0.1226199
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.7249268 and lambda
##  = 0.007131488.
plot(logit.CV)

4.4 score test & train data

4.4.1 score test

# score TEST
dV.TEST <- predict(dV, TEST)
## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev =
## object$lvls): variable 'Diabetes' is not a factor
pP.dV.TEST <- predict(pP, dV.TEST)

class <- predict(logit.CV, pP.dV.TEST)
probs <- predict(logit.CV, pP.dV.TEST,'prob')

TEST.scored <- cbind(TEST,class,probs) %>% mutate(data = "TEST")

4.4.2 score train

# score TRAIN
class <- predict(logit.CV, pP.dV.TRAIN)
probs <- predict(logit.CV, pP.dV.TRAIN,'prob')

TRAIN.scored <- cbind(TRAIN,class,probs) %>% mutate(data = "TRAIN")


TRAIN_TEST_scored <- rbind(TRAIN.scored, TEST.scored)

5 yardstick

library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Set the global option `yardstick.event_first` to `FALSE` to change this.
## 
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
## The following object is masked from 'package:readr':
## 
##     spec
TRAIN_TEST_scored %>%
  group_by(data) %>%
  roc_auc(truth=Diabetes,Yes)
## # A tibble: 2 x 4
##   data  .metric .estimator .estimate
##   <chr> <chr>   <chr>          <dbl>
## 1 TEST  roc_auc binary         0.879
## 2 TRAIN roc_auc binary         0.873
TRAIN_TEST_scored %>%
  group_by(data) %>%
  roc_curve(truth=Diabetes,Yes) %>%
  autoplot()

conf_mat.TRAIN_TEST_scored <- TRAIN_TEST_scored %>%
  group_by(data) %>%
  conf_mat(truth=Diabetes,class)

conf_mat.TRAIN_TEST_scored
## # A tibble: 2 x 2
##   data  conf_mat  
##   <chr> <list>    
## 1 TEST  <conf_mat>
## 2 TRAIN <conf_mat>
conf_mat.TRAIN_TEST_scored$conf_mat
## [[1]]
##           Truth
## Prediction   No  Yes
##        No  2249  186
##        Yes   15   14
## 
## [[2]]
##           Truth
## Prediction   No  Yes
##        No  6810  513
##        Yes   24   47
metrics.TRAIN_TEST_scored <- map_dfr(conf_mat.TRAIN_TEST_scored$conf_mat,
                                     summary,
                                     .id="data") %>%
                              mutate(data_char = case_when(data == 1 ~ "TEST",
                                                           data == 2 ~ "TRAIN") )

library('ggplot2')

metrics.TRAIN_TEST_scored %>%
  ggplot(aes(x = .metric, y=.estimate, fill= data_char)) +
  geom_bar(stat="identity", position=position_dodge()) +
  coord_flip()


\(~\)

\(~\)

6 Code Appendix

\(~\)

install_if_not <- function( list.of.packages ) {
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}

install_if_not("NHANES")
install_if_not("RANN")
library("tidyverse")
library("NHANES")

data <- NHANES %>% select(-DiabetesAge)

dim(data)

glimpse(data)
Amelia::missmap(data)

SumNa <- function(col){sum(is.na(col))}

data.sum <- data %>% 
    summarise_all(SumNa) %>%
    tidyr::gather(key='feature', value='SumNa') %>%
    arrange(-SumNa) %>%
    mutate(PctNa = SumNa/nrow(data))

data.sum
data.sum2 <- data.sum %>% 
  filter(! (feature %in% c('ID','Diabetes'))) %>%
  filter(PctNa < .45)


data.sum2$feature
data_F <- data %>% 
  select(ID, Diabetes, data.sum2$feature) %>%
  filter(!is.na(Diabetes))

Amelia::missmap(data_F)
library('rsample')

set.seed(8675309)

train_test_split <- initial_split(data_F, prop = 3/4) 

TRAIN <- training(train_test_split)
TEST <- testing(train_test_split)
library('caret')

dV <- dummyVars(Diabetes ~ .,  TRAIN)

dV
dV.TRAIN <- predict(dV, TRAIN)

dV.TRAIN

pP <- preProcess(dV.TRAIN, method = c('knnImpute', 'center', 'scale'))
pP


pP.dV.TRAIN <- predict(pP, dV.TRAIN)


trControl <- trainControl(method = 'repeatedcv',
                          number = 5,
                          repeats =  5,
                          search = 'random')

logit.CV <- train(x= pP.dV.TRAIN , y= TRAIN$Diabetes, 
                  method = 'glmnet',
                  trControl = trControl,
                  family = 'binomial' )


logit.CV

plot(logit.CV)


# score TEST
dV.TEST <- predict(dV, TEST)

pP.dV.TEST <- predict(pP, dV.TEST)

class <- predict(logit.CV, pP.dV.TEST)
probs <- predict(logit.CV, pP.dV.TEST,'prob')

TEST.scored <- cbind(TEST,class,probs) %>% mutate(data = "TEST")

# score TRAIN
class <- predict(logit.CV, pP.dV.TRAIN)
probs <- predict(logit.CV, pP.dV.TRAIN,'prob')

TRAIN.scored <- cbind(TRAIN,class,probs) %>% mutate(data = "TRAIN")


TRAIN_TEST_scored <- rbind(TRAIN.scored, TEST.scored)

library('yardstick')

TRAIN_TEST_scored %>%
  group_by(data) %>%
  roc_auc(truth=Diabetes,Yes)


TRAIN_TEST_scored %>%
  group_by(data) %>%
  roc_curve(truth=Diabetes,Yes) %>%
  autoplot()

conf_mat.TRAIN_TEST_scored <- TRAIN_TEST_scored %>%
  group_by(data) %>%
  conf_mat(truth=Diabetes,class)

conf_mat.TRAIN_TEST_scored

conf_mat.TRAIN_TEST_scored$conf_mat

metrics.TRAIN_TEST_scored <- map_dfr(conf_mat.TRAIN_TEST_scored$conf_mat,
                                     summary,
                                     .id="data") %>%
                              mutate(data_char = case_when(data == 1 ~ "TEST",
                                                           data == 2 ~ "TRAIN") )

library('ggplot2')

metrics.TRAIN_TEST_scored %>%
  ggplot(aes(x = .metric, y=.estimate, fill= data_char)) +
  geom_bar(stat="identity", position=position_dodge()) +
  coord_flip()