library(data.table)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(ggbiplot)
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: grid
library('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
na.omit()
numeric_features <- diab_pop %>%
select_if(is.numeric) %>%
select(-seqn) %>%
colnames()
numeric_features
## [1] "ridageyr" "bmxbmi" "lbxglu"
TrainInd <- createDataPartition(diab_pop$diq010,
p=.7,
list=FALSE)
TRAIN <- diab_pop[TrainInd,]
pP.TRAIN <- preProcess(TRAIN, c('center','scale'))
TRAIN_scaled <- predict(pP.TRAIN, TRAIN)
TRAIN.PCA_Model <- prcomp(TRAIN_scaled %>% select(bmxbmi, lbxglu, ridageyr))
summary(TRAIN.PCA_Model)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.1229 1.0040 0.8550
## Proportion of Variance 0.4203 0.3360 0.2437
## Cumulative Proportion 0.4203 0.7563 1.0000
ggbiplot(TRAIN.PCA_Model)
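As an aside, `prcomp()` can do the centering and scaling itself; fitting it directly on the raw numeric columns should give a rotation equivalent to the preProcess-then-prcomp approach above. A minimal sketch (the `TRAIN.PCA_alt` name is just for illustration, and this is not run here):
# Equivalent fit, letting prcomp handle centering and scaling
TRAIN.PCA_alt <- prcomp(TRAIN %>% select(bmxbmi, lbxglu, ridageyr),
                        center = TRUE,
                        scale. = TRUE)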
# How can we determine the number of principal components?
We aim to keep the components with the maximum variance so that we retain as much information about the original dataset as possible. To decide how many principal components to retain, we compute the proportion of variance explained by each component and plot the cumulative proportion in a scree plot; the point where the curve flattens out suggests how many components are worth keeping.
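Concretely, if \(\lambda_k\) is the variance of the \(k\)-th component (the square of its standard deviation reported by `prcomp`), the proportion of variance explained by that component is

\[ \text{PVE}_k = \frac{\lambda_k}{\sum_{j} \lambda_j}, \]

and the cumulative proportion through the first \(m\) components is \(\sum_{k=1}^{m} \text{PVE}_k\). This is what the code below computes.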
# Pull SD
sd.TRAIN <- TRAIN.PCA_Model$sdev
# Compute variance
var.TRAIN <- sd.TRAIN^2
# Proportion of variance explained
prop.TRAIN <- var.TRAIN/sum(var.TRAIN)
# Plot the cumulative proportion of variance explained
plot(cumsum(prop.TRAIN),
xlab='Principal components',
ylab='Cumulative proportion of variance explained',
type='b')
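If we prefer an automatic rule to eyeballing the scree plot, a common heuristic is to keep the smallest number of components whose cumulative proportion of variance exceeds a chosen threshold; the 90% cutoff below is an arbitrary choice for illustration.
# Smallest number of components explaining at least 90% of the variance
n_components <- which(cumsum(prop.TRAIN) >= 0.90)[1]
n_components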
TRAIN.PCA <- cbind(TRAIN, predict(TRAIN.PCA_Model, predict(pP.TRAIN, TRAIN)))
glimpse(TRAIN.PCA)
## Rows: 1,314
## Columns: 13
## $ seqn <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83787, 83...
## $ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Ma...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 68, 66, 56, 20, 24, 29, 71, 49...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, O...
## $ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades...
## $ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Wido...
## $ indhhin2 <fct> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "...
## $ bmxbmi <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 33.5, 34.0, 24.4...
## $ diq010 <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabet...
## $ lbxglu <dbl> 101, 84, 107, 84, 130, 284, 398, 111, 113, 397, 94, 105, 1...
## $ PC1 <dbl> 0.048832304, -0.441774174, -0.631194933, 0.953266726, -1.5...
## $ PC2 <dbl> 0.09984689, -0.85121540, -0.69957999, -0.53185607, 1.35308...
## $ PC3 <dbl> -0.368516359, -1.450441326, -0.821917670, -0.042857600, -1...
features <- colnames(TRAIN.PCA)[!colnames(TRAIN.PCA) %in% c('seqn', 'diq010', 'PC3')]
features
## [1] "riagendr" "ridageyr" "ridreth1" "dmdeduc2" "dmdmartl" "indhhin2"
## [7] "bmxbmi" "lbxglu" "PC1" "PC2"
paste0('diq010 ~ ', paste( features , collapse = " + " ))
## [1] "diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu + PC1 + PC2"
formula1 <- as.formula('diq010 ~ PC1 + PC2')
formula2 <- as.formula('diq010 ~ bmxbmi + lbxglu + ridageyr')
formula3 <- as.formula(paste0('diq010 ~ ', paste( features , collapse = " + " )))
train_ctrl <- trainControl(method="boot", # type of resampling, in this case bootstrap
number = 5, # number of resamplings
search = "random" # we are performing a "random" search
)
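The same `trainControl()` interface supports other resampling schemes; for example, 5-fold cross-validation would be set up as below (a sketch, not used in what follows).
# Example: 5-fold cross-validation instead of bootstrap resampling
train_ctrl_cv <- trainControl(method = "cv", number = 5)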
model_boot_glm_1 <- train(formula1,
data = TRAIN.PCA,
method = "glm",
trControl = train_ctrl,
# options to be passed to glm
family = 'binomial',
preProcess = c('center','scale')
)
model_boot_glm_2 <- train(formula2,
data = TRAIN.PCA,
method = "glm",
trControl = train_ctrl,
# options to be passed to glm
family = 'binomial',
preProcess = c('center','scale'))
model_boot_glm_3 <- train(formula3,
data = TRAIN.PCA,
method = "glm",
trControl = train_ctrl,
# options to be passed to glm
family = 'binomial',
preProcess = c('center','scale'))
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
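The zero-variance warnings come from the full model's dummy-encoded `indhhin2` levels: some income brackets appear to be rare enough that they are constant within a bootstrap resample, and the resulting rank-deficient fit triggers the prediction warnings. If we wanted to flag such predictors up front, applying caret's `nearZeroVar()` to the design matrix is one way to do it; a minimal sketch (the `X_full` and `nzv` names are just for illustration):
# Inspect zero- and near-zero-variance columns in the full-model design matrix
X_full <- model.matrix(formula3, data = TRAIN.PCA)
nzv <- nearZeroVar(X_full, saveMetrics = TRUE)
nzv[nzv$zeroVar | nzv$nzv, ]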
saveRDS(model_boot_glm_1,'model_boot_glm_1.RDS')
saveRDS(pP.TRAIN,'pP.TRAIN.RDS')
saveRDS(TRAIN.PCA_Model,'TRAIN.PCA_Model.RDS')
rm(model_boot_glm_1)
rm(pP.TRAIN)
rm(TRAIN.PCA_Model)
model_boot_glm_1
## Error in eval(expr, envir, enclos): object 'model_boot_glm_1' not found
model_boot_glm_1 <- readRDS('model_boot_glm_1.RDS')
pP.TRAIN <- readRDS('pP.TRAIN.RDS')
TRAIN.PCA_Model <- readRDS('TRAIN.PCA_Model.RDS')
TEST <- diab_pop[-TrainInd,]
# Correct order: apply the centering/scaling (pP.TRAIN) first, then the PCA rotation
str( predict(TRAIN.PCA_Model, predict(pP.TRAIN, TEST)) , 1 )
## num [1:562, 1:3] 1.447 0.885 -1.126 -0.252 2.004 ...
## - attr(*, "dimnames")=List of 2
str( TEST %>% predict(pP.TRAIN, .) %>% predict(TRAIN.PCA_Model, . ) , 1)
## num [1:562, 1:3] 1.447 0.885 -1.126 -0.252 2.004 ...
## - attr(*, "dimnames")=List of 2
# Wrong order: pP.TRAIN expects the original columns, not principal-component
# scores, so scaling after the PCA rotation fails
predict(pP.TRAIN, predict(TRAIN.PCA_Model, TEST))
## Error in newdata[, object$method$center, drop = FALSE]: subscript out of bounds
TEST.PCA <- cbind(TEST , predict(TRAIN.PCA_Model, predict(pP.TRAIN, TEST)))
glimpse(TEST.PCA)
## Rows: 562
## Columns: 13
## $ seqn <dbl> 83761, 83799, 83818, 83820, 83822, 83828, 83834, 83851, 83...
## $ riagendr <fct> Female, Female, Female, Male, Female, Female, Male, Female...
## $ ridageyr <dbl> 24, 37, 80, 70, 20, 39, 69, 37, 41, 54, 80, 27, 33, 55, 68...
## $ ridreth1 <fct> Other, Other Hispanic, Other Hispanic, Non-Hispanic White,...
## $ dmdeduc2 <fct> College grad or above, Some college or AA degrees, Less th...
## $ dmdmartl <fct> Never married, Married, Widowed, Living with partner, Neve...
## $ indhhin2 <fct> "$0-$4,999", "$75,000-$99,999", "$10,000-$14,999", "$65,00...
## $ bmxbmi <dbl> 25.3, 25.5, 28.5, 27.0, 22.2, 27.2, 28.2, 35.3, 40.7, 30.2...
## $ diq010 <fct> No Diabetes, No Diabetes, No Diabetes, No Diabetes, No Dia...
## $ lbxglu <dbl> 95, 100, 119, 94, 80, 101, 105, 79, 110, 99, 137, 89, 83, ...
## $ PC1 <dbl> 1.44674529, 0.88522732, -1.12620195, -0.25176729, 2.003719...
## $ PC2 <dbl> 0.204951996, -0.126635241, -0.927462780, -0.852259376, -0....
## $ PC3 <dbl> 0.83261212, 0.45091448, -0.88784185, -0.89883531, 0.867879...
TEST_1 <- TEST.PCA
TEST_1$estimate <- predict(model_boot_glm_1, TEST.PCA, 'raw')
TEST_1$Diabetes <- predict(model_boot_glm_1, TEST.PCA, 'prob')$Diabetes
TEST_2 <- TEST.PCA
TEST_2$estimate <- predict(model_boot_glm_2, TEST.PCA, 'raw')
TEST_2$Diabetes <- predict(model_boot_glm_2, TEST.PCA, 'prob')$Diabetes
TEST_3 <- TEST.PCA
TEST_3$estimate <- predict(model_boot_glm_3, TEST.PCA, 'raw')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
TEST_3$Diabetes <- predict(model_boot_glm_3, TEST.PCA, 'prob')$Diabetes
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
TEST.stacked <- bind_rows(TEST_1 %>% mutate(model = 'w_PCAs_only'),
TEST_2 %>% mutate(model = 'w_nums'),
TEST_3 %>% mutate(model = "full_model")
)
library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:readr':
##
## spec
library('glue')
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
Aucs <- TEST.stacked %>%
group_by(model) %>%
roc_auc(truth = diq010, Diabetes) %>%
arrange(-.estimate) %>%
glue_data("{model} AUC : {round(.estimate, 3)}")
TEST.stacked %>%
group_by(model) %>%
roc_curve(truth = diq010, Diabetes) %>%
autoplot() +
labs(caption = paste(Aucs, collapse = " \n "))
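Since the class predictions are already stored in `estimate`, we could also summarise any one of the models with a confusion matrix; for example, for the PCA-only model, using caret's `confusionMatrix()` (a sketch, output not shown):
# Confusion matrix for the PCA-only model on the test set
confusionMatrix(data = TEST_1$estimate,
                reference = TEST_1$diq010,
                positive = 'Diabetes')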