library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
na.omit()
numeric_features <- diab_pop %>%
select_if(is.numeric) %>%
select(-seqn) %>%
colnames()
TrainInd <- createDataPartition(diab_pop$diq010,
p=.7,
list=FALSE)
TRAIN <- diab_pop[TrainInd,]
fviz_nbclust(TRAIN %>%
select(numeric_features) %>%
scale(),
kmeans,
method='wss')
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(numeric_features)` instead of `numeric_features` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Set seed for reproducible results
set.seed(1984)
pP.TRAIN <- preProcess(TRAIN, c('center','scale'))
# Run k-means algorithm
# Nstart = # random starting positions; I chose 523
km <- kmeans(predict(pP.TRAIN, TRAIN) %>% select(numeric_features),
centers = 5,
nstart = 523)
glimpse(km)
## List of 9
## $ cluster : Named int [1:1314] 4 4 4 3 5 2 3 4 2 3 ...
## ..- attr(*, "names")= chr [1:1314] "2" "3" "6" "11" ...
## $ centers : num [1:5, 1:3] 0.624 0.243 -0.903 0.938 -0.137 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:5] "1" "2" "3" "4" ...
## .. ..$ : chr [1:3] "ridageyr" "bmxbmi" "lbxglu"
## $ totss : num 3939
## $ withinss : num [1:5] 127.4 80.2 349.5 348.7 332.9
## $ tot.withinss: num 1239
## $ betweenss : num 2700
## $ size : int [1:5] 84 23 511 464 232
## $ iter : int 4
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
TRAIN.cluster <- cbind(TRAIN, cluster=km$cluster)
head(TRAIN.cluster)
## seqn riagendr ridageyr ridreth1 dmdeduc2
## 2 83733 Male 53 Non-Hispanic White High school graduate/GED
## 3 83734 Male 78 Non-Hispanic White High school graduate/GED
## 6 83737 Female 72 MexicanAmerican Grades 9-11th
## 11 83750 Male 45 Other Grades 9-11th
## 13 83754 Female 67 Other Hispanic College grad or above
## 14 83755 Male 67 Non-Hispanic Black College grad or above
## dmdmartl indhhin2 bmxbmi diq010 lbxglu cluster
## 2 Divorced $15,000-$19,999 30.8 No Diabetes 101 4
## 3 Married $20,000-$24,999 28.8 Diabetes 84 4
## 6 Separated $75,000-$99,999 28.6 No Diabetes 107 4
## 11 Never married $65,000-$74,999 24.1 No Diabetes 84 3
## 13 Married $25,000-$34,999 43.7 No Diabetes 130 5
## 14 Widowed $20,000-$24,999 28.8 Diabetes 284 2
fviz_cluster(km,
data = TRAIN.cluster %>%
select(numeric_features, cluster) %>%
scale()
)
km$centers
## ridageyr bmxbmi lbxglu
## 1 0.6236755 0.4956378 1.9491289
## 2 0.2429273 0.1149841 5.4684576
## 3 -0.9027526 -0.4753420 -0.3604815
## 4 0.9376235 -0.3461239 -0.1395293
## 5 -0.1367535 1.5483751 -0.1748004
library(ggplot2)
TRAIN.cluster <- TRAIN.cluster %>%
mutate(cluster= as.factor(cluster))
TRAIN.cluster %>%
ggplot(aes(x=ridageyr,
y=lbxglu,
color=cluster)) +
geom_point()
Plot_Clusters_by_Facet = function(dat, x, y, c, z) {
ggplot(dat,
aes(x = .data[[x]],
y = .data[[y]],
color=.data[[c]])
) +
geom_point() +
facet_wrap(.data[[z]] ~ .)
}
Plot_Clusters_by_Facet(TRAIN.cluster,
x='ridageyr',
y='lbxglu',
c='cluster',
z='diq010')
Plot_Clusters_by_Facet(TRAIN.cluster,
x='bmxbmi',
y='lbxglu',
c='diq010',
z='cluster')
library('arsenal')
library('knitr')
library('kableExtra')
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
table_arsenal <- tableby(diq010 ~ cluster ,
data = TRAIN.cluster)
table_arsenal
## tableby Object
##
## Function Call:
## tableby(formula = diq010 ~ cluster, data = TRAIN.cluster)
##
## Variable(s):
## diq010 ~ cluster
summary(table_arsenal)
##
##
## | | Diabetes (N=197) | No Diabetes (N=1117) | Total (N=1314) | p value|
## |:-------------------|:----------------:|:--------------------:|:--------------:|-------:|
## |**cluster** | | | | < 0.001|
## | 1 | 66 (33.5%) | 18 (1.6%) | 84 (6.4%) | |
## | 2 | 21 (10.7%) | 2 (0.2%) | 23 (1.8%) | |
## | 3 | 10 (5.1%) | 501 (44.9%) | 511 (38.9%) | |
## | 4 | 71 (36.0%) | 393 (35.2%) | 464 (35.3%) | |
## | 5 | 29 (14.7%) | 203 (18.2%) | 232 (17.7%) | |
sum.table_arsenal <- summary(table_arsenal)
sum.table_arsenal
##
##
## | | Diabetes (N=197) | No Diabetes (N=1117) | Total (N=1314) | p value|
## |:-------------------|:----------------:|:--------------------:|:--------------:|-------:|
## |**cluster** | | | | < 0.001|
## | 1 | 66 (33.5%) | 18 (1.6%) | 84 (6.4%) | |
## | 2 | 21 (10.7%) | 2 (0.2%) | 23 (1.8%) | |
## | 3 | 10 (5.1%) | 501 (44.9%) | 511 (38.9%) | |
## | 4 | 71 (36.0%) | 393 (35.2%) | 464 (35.3%) | |
## | 5 | 29 (14.7%) | 203 (18.2%) | 232 (17.7%) | |
model1 <- train(as.formula('diq010 ~ ridageyr + bmxbmi + lbxglu'),
TRAIN.cluster,
preProcess = c('center','scale'),
method = 'glm',
family = 'binomial')
model2 <- train(as.formula('diq010 ~ cluster'),
TRAIN.cluster,
preProcess = c('center','scale'),
method = 'glm',
family = 'binomial')
model3 <- train(as.formula('diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu + cluster'),
TRAIN.cluster,
preProcess = c('center','scale'),
method = 'glm',
family = 'binomial')
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999
TEST <- diab_pop[-TrainInd,]
library('clue')
cluster_predict <- clue::cl_predict(km,
predict(pP.TRAIN, TEST) %>% select(ridageyr, bmxbmi, lbxglu)
)
str(cluster_predict,1)
## 'cl_class_ids' int [1:562] 2 4 3 5 3 3 4 4 3 4 ...
TEST$cluster <- clue::cl_predict(km,
predict(pP.TRAIN, TEST) %>% select(ridageyr, bmxbmi, lbxglu)
)
TEST <- TEST %>%
mutate(cluster = as.factor(cluster))
TEST_1 <- TEST
TEST_1$estimate <- predict(model1, TEST, 'raw')
TEST_1$Diabetes <- predict(model1, TEST, 'prob')$Diabetes
TEST_2 <- TEST
TEST_2$estimate <- predict(model2, TEST, 'raw')
TEST_2$Diabetes <- predict(model2, TEST, 'prob')$Diabetes
TEST_3 <- TEST
TEST_3$estimate <- predict(model3, TEST, 'raw')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
TEST_3$Diabetes <- predict(model3, TEST, 'prob')$Diabetes
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
TEST.stacked <- bind_rows(TEST_1 %>% mutate(model = 'w_nums_only'),
TEST_2 %>% mutate(model = 'w_cluster'),
TEST_3 %>% mutate(model = "full_model")
)
library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:readr':
##
## spec
library('glue')
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
Aucs <- TEST.stacked %>%
group_by(model) %>%
roc_auc(truth = diq010, Diabetes) %>%
glue_data("{model} AUC : {round(.estimate, 3)}")
TEST.stacked %>%
group_by(model) %>%
roc_curve(truth = diq010, Diabetes) %>%
autoplot() +
labs(caption = paste(Aucs, collapse = " \n "))
\(~\)
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(factoextra)
library('caret')
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
na.omit()
numeric_features <- diab_pop %>%
select_if(is.numeric) %>%
select(-seqn) %>%
colnames()
TrainInd <- createDataPartition(diab_pop$diq010,
p=.7,
list=FALSE)
TRAIN <- diab_pop[TrainInd,]
fviz_nbclust(TRAIN %>%
select(numeric_features) %>%
scale(),
kmeans,
method='wss')
# Set seed for reproducible results
set.seed(1984)
pP.TRAIN <- preProcess(TRAIN, c('center','scale'))
# Run k-means algorithm
# Nstart = # random starting positions; I chose 523
km <- kmeans(predict(pP.TRAIN, TRAIN) %>% select(numeric_features),
centers = 5,
nstart = 523)
glimpse(km)
TRAIN.cluster <- cbind(TRAIN, cluster=km$cluster)
head(TRAIN.cluster)
fviz_cluster(km,
data = TRAIN.cluster %>%
select(numeric_features, cluster) %>%
scale()
)
km$centers
library(ggplot2)
TRAIN.cluster <- TRAIN.cluster %>%
mutate(cluster= as.factor(cluster))
TRAIN.cluster %>%
ggplot(aes(x=ridageyr,
y=lbxglu,
color=cluster)) +
geom_point()
Plot_Clusters_by_Facet = function(dat, x, y, c, z) {
ggplot(dat,
aes(x = .data[[x]],
y = .data[[y]],
color=.data[[c]])
) +
geom_point() +
facet_wrap(.data[[z]] ~ .)
}
Plot_Clusters_by_Facet(TRAIN.cluster,
x='ridageyr',
y='lbxglu',
c='cluster',
z='diq010')
Plot_Clusters_by_Facet(TRAIN.cluster,
x='bmxbmi',
y='lbxglu',
c='diq010',
z='cluster')
library('arsenal')
library('knitr')
library('kableExtra')
table_arsenal <- tableby(diq010 ~ cluster ,
data = TRAIN.cluster)
table_arsenal
summary(table_arsenal)
sum.table_arsenal <- summary(table_arsenal)
sum.table_arsenal
model1 <- train(as.formula('diq010 ~ ridageyr + bmxbmi + lbxglu'),
TRAIN.cluster,
preProcess = c('center','scale'),
method = 'glm',
family = 'binomial')
model2 <- train(as.formula('diq010 ~ cluster'),
TRAIN.cluster,
preProcess = c('center','scale'),
method = 'glm',
family = 'binomial')
model3 <- train(as.formula('diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu + cluster'),
TRAIN.cluster,
preProcess = c('center','scale'),
method = 'glm',
family = 'binomial')
TEST <- diab_pop[-TrainInd,]
library('clue')
cluster_predict <- clue::cl_predict(km,
predict(pP.TRAIN, TEST) %>% select(ridageyr, bmxbmi, lbxglu)
)
str(cluster_predict,1)
TEST$cluster <- clue::cl_predict(km,
predict(pP.TRAIN, TEST) %>% select(ridageyr, bmxbmi, lbxglu)
)
TEST <- TEST %>%
mutate(cluster = as.factor(cluster))
TEST_1 <- TEST
TEST_1$estimate <- predict(model1, TEST, 'raw')
TEST_1$Diabetes <- predict(model1, TEST, 'prob')$Diabetes
TEST_2 <- TEST
TEST_2$estimate <- predict(model2, TEST, 'raw')
TEST_2$Diabetes <- predict(model2, TEST, 'prob')$Diabetes
TEST_3 <- TEST
TEST_3$estimate <- predict(model3, TEST, 'raw')
TEST_3$Diabetes <- predict(model3, TEST, 'prob')$Diabetes
TEST.stacked <- bind_rows(TEST_1 %>% mutate(model = 'w_nums_only'),
TEST_2 %>% mutate(model = 'w_cluster'),
TEST_3 %>% mutate(model = "full_model")
)
library('yardstick')
library('glue')
Aucs <- TEST.stacked %>%
group_by(model) %>%
roc_auc(truth = diq010, Diabetes) %>%
glue_data("{model} AUC : {round(.estimate, 3)}")
TEST.stacked %>%
group_by(model) %>%
roc_curve(truth = diq010, Diabetes) %>%
autoplot() +
labs(caption = paste(Aucs, collapse = " \n "))