library(data.table)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(ggbiplot)
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: grid
library('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
na.omit()
numeric_features <- diab_pop %>%
select_if(is.numeric) %>%
select(-seqn) %>%
colnames()
numeric_features
## [1] "ridageyr" "bmxbmi" "lbxglu"
TrainInd <- createDataPartition(diab_pop$diq010,
p=.7,
list=FALSE)
TRAIN <- diab_pop[TrainInd,]
pP.TRAIN <- preProcess(TRAIN, c('center','scale'))
TRAIN_scaled <- predict(pP.TRAIN, TRAIN)
TRAIN.PCA_Model <- prcomp(TRAIN_scaled %>% select(bmxbmi, lbxglu, ridageyr))
summary(TRAIN.PCA_Model)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.1229 1.0040 0.8550
## Proportion of Variance 0.4203 0.3360 0.2437
## Cumulative Proportion 0.4203 0.7563 1.0000
ggbiplot(TRAIN.PCA_Model)
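As an aside, `prcomp()` can do the centering and scaling itself; fitting it directly on the raw numeric columns should give a rotation equivalent to the preProcess-then-prcomp approach above. A minimal sketch (the `TRAIN.PCA_alt` name is just for illustration, and this is not run here):
# Equivalent fit, letting prcomp handle centering and scaling
TRAIN.PCA_alt <- prcomp(TRAIN %>% select(bmxbmi, lbxglu, ridageyr),
                        center = TRUE,
                        scale. = TRUE)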
# How can we determine the number of principal components?
We aim to keep the components with the maximum variance so that we retain as much information about the original dataset as possible. To decide how many principal components to retain, we compute the proportion of variance explained by each component and plot the cumulative proportion in a scree plot; the point where the curve flattens out suggests how many components are worth keeping.
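Concretely, if \(\lambda_k\) is the variance of the \(k\)-th component (the square of its standard deviation reported by `prcomp`), the proportion of variance explained by that component is

\[ \text{PVE}_k = \frac{\lambda_k}{\sum_{j} \lambda_j}, \]

and the cumulative proportion through the first \(m\) components is \(\sum_{k=1}^{m} \text{PVE}_k\). This is what the code below computes.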
# Pull SD
sd.TRAIN <- TRAIN.PCA_Model$sdev
# Compute variance
var.TRAIN <- sd.TRAIN^2
# Proportion of variance explained
prop.TRAIN <- var.TRAIN/sum(var.TRAIN)
# Plot the cumulative proportion of variance explained
plot(cumsum(prop.TRAIN),
xlab='Principal components',
ylab='Cumulative proportion of variance explained',
type='b')
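If we prefer an automatic rule to eyeballing the scree plot, a common heuristic is to keep the smallest number of components whose cumulative proportion of variance exceeds a chosen threshold; the 90% cutoff below is an arbitrary choice for illustration.
# Smallest number of components explaining at least 90% of the variance
n_components <- which(cumsum(prop.TRAIN) >= 0.90)[1]
n_components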
TRAIN.PCA <- cbind(TRAIN, predict(TRAIN.PCA_Model, predict(pP.TRAIN, TRAIN)))
glimpse(TRAIN.PCA)
## Rows: 1,314
## Columns: 13
## $ seqn <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83787, 83...
## $ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Ma...
## $ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 68, 66, 56, 20, 24, 29, 71, 49...
## $ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, O...
## $ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades...
## $ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Wido...
## $ indhhin2 <fct> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "...
## $ bmxbmi <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 33.5, 34.0, 24.4...
## $ diq010 <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabet...
## $ lbxglu <dbl> 101, 84, 107, 84, 130, 284, 398, 111, 113, 397, 94, 105, 1...
## $ PC1 <dbl> 0.048832304, -0.441774174, -0.631194933, 0.953266726, -1.5...
## $ PC2 <dbl> 0.09984689, -0.85121540, -0.69957999, -0.53185607, 1.35308...
## $ PC3 <dbl> -0.368516359, -1.450441326, -0.821917670, -0.042857600, -1...
features <- colnames(TRAIN.PCA)[!colnames(TRAIN.PCA) %in% c('seqn', 'diq010', 'PC3')]
features
## [1] "riagendr" "ridageyr" "ridreth1" "dmdeduc2" "dmdmartl" "indhhin2"
## [7] "bmxbmi" "lbxglu" "PC1" "PC2"
paste0('diq010 ~ ', paste( features , collapse = " + " ))
## [1] "diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu + PC1 + PC2"
formula1 <- as.formula('diq010 ~ PC1 + PC2')
formula2 <- as.formula('diq010 ~ bmxbmi + lbxglu + ridageyr')
formula3 <- as.formula(paste0('diq010 ~ ', paste( features , collapse = " + " )))
train_ctrl <- trainControl(method="boot", # type of resampling, in this case bootstrap
number = 5, # number of resamplings
search = "random" # we are performing a "random" search
)
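The same `trainControl()` interface supports other resampling schemes; for example, 5-fold cross-validation would be set up as below (a sketch, not used in what follows).
# Example: 5-fold cross-validation instead of bootstrap resampling
train_ctrl_cv <- trainControl(method = "cv", number = 5)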
model_boot_glm_1 <- train(formula1,
data = TRAIN.PCA,
method = "glm",
trControl = train_ctrl,
# options to be passed to glm
family = 'binomial',
preProcess = c('center','scale')
)
model_boot_glm_2 <- train(formula2,
data = TRAIN.PCA,
method = "glm",
trControl = train_ctrl,
# options to be passed to glm
family = 'binomial',
preProcess = c('center','scale'))
model_boot_glm_3 <- train(formula3,
data = TRAIN.PCA,
method = "glm",
trControl = train_ctrl,
# options to be passed to glm
family = 'binomial',
preProcess = c('center','scale'))
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
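The zero-variance warnings come from the full model's dummy-encoded `indhhin2` levels: some income brackets appear to be rare enough that they are constant within a bootstrap resample, and the resulting rank-deficient fit triggers the prediction warnings. If we wanted to flag such predictors up front, applying caret's `nearZeroVar()` to the design matrix is one way to do it; a minimal sketch (the `X_full` and `nzv` names are just for illustration):
# Inspect zero- and near-zero-variance columns in the full-model design matrix
X_full <- model.matrix(formula3, data = TRAIN.PCA)
nzv <- nearZeroVar(X_full, saveMetrics = TRUE)
nzv[nzv$zeroVar | nzv$nzv, ]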
saveRDS(model_boot_glm_1,'model_boot_glm_1.RDS')
saveRDS(pP.TRAIN,'pP.TRAIN.RDS')
saveRDS(TRAIN.PCA_Model,'TRAIN.PCA_Model.RDS')
rm(model_boot_glm_1)
rm(pP.TRAIN)
rm(TRAIN.PCA_Model)
model_boot_glm_1
## Error in eval(expr, envir, enclos): object 'model_boot_glm_1' not found
model_boot_glm_1 <- readRDS('model_boot_glm_1.RDS')
pP.TRAIN <- readRDS('pP.TRAIN.RDS')
TRAIN.PCA_Model <- readRDS('TRAIN.PCA_Model.RDS')
TEST <- diab_pop[-TrainInd,]
# Correct order: apply the centering/scaling (pP.TRAIN) first, then the PCA rotation
str( predict(TRAIN.PCA_Model, predict(pP.TRAIN, TEST)) , 1 )
## num [1:562, 1:3] 1.447 0.885 -1.126 -0.252 2.004 ...
## - attr(*, "dimnames")=List of 2
str( TEST %>% predict(pP.TRAIN, .) %>% predict(TRAIN.PCA_Model, . ) , 1)
## num [1:562, 1:3] 1.447 0.885 -1.126 -0.252 2.004 ...
## - attr(*, "dimnames")=List of 2
# Wrong order: pP.TRAIN expects the original columns, not principal-component
# scores, so scaling after the PCA rotation fails
predict(pP.TRAIN, predict(TRAIN.PCA_Model, TEST))
## Error in newdata[, object$method$center, drop = FALSE]: subscript out of bounds
TEST.PCA <- cbind(TEST , predict(TRAIN.PCA_Model, predict(pP.TRAIN, TEST)))
glimpse(TEST.PCA)
## Rows: 562
## Columns: 13
## $ seqn <dbl> 83761, 83799, 83818, 83820, 83822, 83828, 83834, 83851, 83...
## $ riagendr <fct> Female, Female, Female, Male, Female, Female, Male, Female...
## $ ridageyr <dbl> 24, 37, 80, 70, 20, 39, 69, 37, 41, 54, 80, 27, 33, 55, 68...
## $ ridreth1 <fct> Other, Other Hispanic, Other Hispanic, Non-Hispanic White,...
## $ dmdeduc2 <fct> College grad or above, Some college or AA degrees, Less th...
## $ dmdmartl <fct> Never married, Married, Widowed, Living with partner, Neve...
## $ indhhin2 <fct> "$0-$4,999", "$75,000-$99,999", "$10,000-$14,999", "$65,00...
## $ bmxbmi <dbl> 25.3, 25.5, 28.5, 27.0, 22.2, 27.2, 28.2, 35.3, 40.7, 30.2...
## $ diq010 <fct> No Diabetes, No Diabetes, No Diabetes, No Diabetes, No Dia...
## $ lbxglu <dbl> 95, 100, 119, 94, 80, 101, 105, 79, 110, 99, 137, 89, 83, ...
## $ PC1 <dbl> 1.44674529, 0.88522732, -1.12620195, -0.25176729, 2.003719...
## $ PC2 <dbl> 0.204951996, -0.126635241, -0.927462780, -0.852259376, -0....
## $ PC3 <dbl> 0.83261212, 0.45091448, -0.88784185, -0.89883531, 0.867879...
TEST_1 <- TEST.PCA
TEST_1$estimate <- predict(model_boot_glm_1, TEST.PCA, 'raw')
TEST_1$Diabetes <- predict(model_boot_glm_1, TEST.PCA, 'prob')$Diabetes
TEST_2 <- TEST.PCA
TEST_2$estimate <- predict(model_boot_glm_2, TEST.PCA, 'raw')
TEST_2$Diabetes <- predict(model_boot_glm_2, TEST.PCA, 'prob')$Diabetes
TEST_3 <- TEST.PCA
TEST_3$estimate <- predict(model_boot_glm_3, TEST.PCA, 'raw')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
TEST_3$Diabetes <- predict(model_boot_glm_3, TEST.PCA, 'prob')$Diabetes
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
TEST.stacked <- bind_rows(TEST_1 %>% mutate(model = 'w_PCAs_only'),
TEST_2 %>% mutate(model = 'w_nums'),
TEST_3 %>% mutate(model = "full_model")
)
library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:readr':
##
## spec
library('glue')
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
Aucs <- TEST.stacked %>%
group_by(model) %>%
roc_auc(truth = diq010, Diabetes) %>%
arrange(-.estimate) %>%
glue_data("{model} AUC : {round(.estimate, 3)}")
TEST.stacked %>%
group_by(model) %>%
roc_curve(truth = diq010, Diabetes) %>%
autoplot() +
labs(caption = paste(Aucs, collapse = " \n "))
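Since the class predictions are already stored in `estimate`, we could also summarise any one of the models with a confusion matrix; for example, for the PCA-only model, using caret's `confusionMatrix()` (a sketch, output not shown):
# Confusion matrix for the PCA-only model on the test set
confusionMatrix(data = TEST_1$estimate,
                reference = TEST_1$diq010,
                positive = 'Diabetes')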