1 SET UP

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library('caret')

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
  na.omit() 

numeric_features <- diab_pop %>%
  select_if(is.numeric) %>%
  select(-seqn) %>%
  colnames()

2 Split Data

TrainInd <- createDataPartition(diab_pop$diq010,
                                p=.7,
                                list=FALSE)

TRAIN <- diab_pop[TrainInd,]

2.1 How many Clusters should we make ?

fviz_nbclust(TRAIN %>%
               select(numeric_features) %>%
               scale(),
              kmeans, 
              method='wss')

## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(numeric_features)` instead of `numeric_features` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

3 Train Clustering Model

# Set seed for reproducible results
set.seed(1984)


pP.TRAIN <- preProcess(TRAIN, c('center','scale'))

# Run k-means algorithm
# Nstart = # random starting positions; I chose 523
km <- kmeans(predict(pP.TRAIN, TRAIN) %>% select(numeric_features), 
             centers = 5, 
             nstart = 523)

glimpse(km)

## List of 9
##  $ cluster     : Named int [1:1314] 4 4 4 3 5 2 3 4 2 3 ...
##   ..- attr(*, "names")= chr [1:1314] "2" "3" "6" "11" ...
##  $ centers     : num [1:5, 1:3] 0.624 0.243 -0.903 0.938 -0.137 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:5] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:3] "ridageyr" "bmxbmi" "lbxglu"
##  $ totss       : num 3939
##  $ withinss    : num [1:5] 127.4 80.2 349.5 348.7 332.9
##  $ tot.withinss: num 1239
##  $ betweenss   : num 2700
##  $ size        : int [1:5] 84 23 511 464 232
##  $ iter        : int 4
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

TRAIN.cluster <- cbind(TRAIN, cluster=km$cluster)

head(TRAIN.cluster)

##     seqn riagendr ridageyr           ridreth1                 dmdeduc2
## 2  83733     Male       53 Non-Hispanic White High school graduate/GED
## 3  83734     Male       78 Non-Hispanic White High school graduate/GED
## 6  83737   Female       72    MexicanAmerican            Grades 9-11th
## 11 83750     Male       45              Other            Grades 9-11th
## 13 83754   Female       67     Other Hispanic    College grad or above
## 14 83755     Male       67 Non-Hispanic Black    College grad or above
##         dmdmartl        indhhin2 bmxbmi      diq010 lbxglu cluster
## 2       Divorced $15,000-$19,999   30.8 No Diabetes    101       4
## 3        Married $20,000-$24,999   28.8    Diabetes     84       4
## 6      Separated $75,000-$99,999   28.6 No Diabetes    107       4
## 11 Never married $65,000-$74,999   24.1 No Diabetes     84       3
## 13       Married $25,000-$34,999   43.7 No Diabetes    130       5
## 14       Widowed $20,000-$24,999   28.8    Diabetes    284       2

fviz_cluster(km, 
             data = TRAIN.cluster %>%
               select(numeric_features, cluster) %>%
               scale()
             )

km$centers

##     ridageyr     bmxbmi     lbxglu
## 1  0.6236755  0.4956378  1.9491289
## 2  0.2429273  0.1149841  5.4684576
## 3 -0.9027526 -0.4753420 -0.3604815
## 4  0.9376235 -0.3461239 -0.1395293
## 5 -0.1367535  1.5483751 -0.1748004

library(ggplot2)

TRAIN.cluster <- TRAIN.cluster %>%
  mutate(cluster= as.factor(cluster))

TRAIN.cluster %>%
  ggplot(aes(x=ridageyr,
             y=lbxglu,
             color=cluster)) +
  geom_point()

Plot_Clusters_by_Facet = function(dat, x, y, c, z) {
     ggplot(dat, 
            aes(x = .data[[x]], 
                y = .data[[y]],
                color=.data[[c]])
            ) +
          geom_point() +
          facet_wrap(.data[[z]] ~ .)
}


Plot_Clusters_by_Facet(TRAIN.cluster, 
            x='ridageyr',
            y='lbxglu',
            c='cluster',
            z='diq010')

Plot_Clusters_by_Facet(TRAIN.cluster, 
            x='bmxbmi',
            y='lbxglu',
            c='diq010',
            z='cluster')

library('arsenal')
library('knitr')
library('kableExtra')

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

table_arsenal <- tableby(diq010 ~ cluster , 
                         data = TRAIN.cluster)

table_arsenal

## tableby Object
## 
## Function Call:
## tableby(formula = diq010 ~ cluster, data = TRAIN.cluster)
## 
## Variable(s):
## diq010 ~ cluster

summary(table_arsenal)

## 
## 
## |                    | Diabetes (N=197) | No Diabetes (N=1117) | Total (N=1314) | p value|
## |:-------------------|:----------------:|:--------------------:|:--------------:|-------:|
## |**cluster**         |                  |                      |                | < 0.001|
## |&nbsp;&nbsp;&nbsp;1 |    66 (33.5%)    |      18 (1.6%)       |   84 (6.4%)    |        |
## |&nbsp;&nbsp;&nbsp;2 |    21 (10.7%)    |       2 (0.2%)       |   23 (1.8%)    |        |
## |&nbsp;&nbsp;&nbsp;3 |    10 (5.1%)     |     501 (44.9%)      |  511 (38.9%)   |        |
## |&nbsp;&nbsp;&nbsp;4 |    71 (36.0%)    |     393 (35.2%)      |  464 (35.3%)   |        |
## |&nbsp;&nbsp;&nbsp;5 |    29 (14.7%)    |     203 (18.2%)      |  232 (17.7%)   |        |

sum.table_arsenal <- summary(table_arsenal)

sum.table_arsenal

## 
## 
## |                    | Diabetes (N=197) | No Diabetes (N=1117) | Total (N=1314) | p value|
## |:-------------------|:----------------:|:--------------------:|:--------------:|-------:|
## |**cluster**         |                  |                      |                | < 0.001|
## |&nbsp;&nbsp;&nbsp;1 |    66 (33.5%)    |      18 (1.6%)       |   84 (6.4%)    |        |
## |&nbsp;&nbsp;&nbsp;2 |    21 (10.7%)    |       2 (0.2%)       |   23 (1.8%)    |        |
## |&nbsp;&nbsp;&nbsp;3 |    10 (5.1%)     |     501 (44.9%)      |  511 (38.9%)   |        |
## |&nbsp;&nbsp;&nbsp;4 |    71 (36.0%)    |     393 (35.2%)      |  464 (35.3%)   |        |
## |&nbsp;&nbsp;&nbsp;5 |    29 (14.7%)    |     203 (18.2%)      |  232 (17.7%)   |        |

4 Train logit models

model1 <- train(as.formula('diq010 ~ ridageyr + bmxbmi + lbxglu'),
                TRAIN.cluster,
                preProcess = c('center','scale'),
                method = 'glm',
                family = 'binomial')

model2 <- train(as.formula('diq010 ~ cluster'),
                TRAIN.cluster,
                preProcess = c('center','scale'),
                method = 'glm',
                family = 'binomial')


model3 <- train(as.formula('diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu + cluster'),
                TRAIN.cluster,
                preProcess = c('center','scale'),
                method = 'glm',
                family = 'binomial')

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: indhhin2$35,000-$44,999,
## indhhin2$55,000-$64,999

5 Dealing with the Test data

TEST <- diab_pop[-TrainInd,]

library('clue')

cluster_predict <- clue::cl_predict(km, 
                                    predict(pP.TRAIN, TEST) %>% select(ridageyr, bmxbmi, lbxglu) 
                                    )

str(cluster_predict,1)

##  'cl_class_ids' int [1:562] 2 4 3 5 3 3 4 4 3 4 ...

TEST$cluster <- clue::cl_predict(km, 
                                 predict(pP.TRAIN, TEST) %>% select(ridageyr, bmxbmi, lbxglu) 
                                 )

5.1 Predicting on the TEST data

TEST <- TEST %>%
  mutate(cluster = as.factor(cluster))

TEST_1 <- TEST

TEST_1$estimate <- predict(model1, TEST, 'raw')
TEST_1$Diabetes <- predict(model1, TEST, 'prob')$Diabetes

TEST_2 <- TEST

TEST_2$estimate <- predict(model2, TEST, 'raw')
TEST_2$Diabetes <- predict(model2, TEST, 'prob')$Diabetes

TEST_3 <- TEST

TEST_3$estimate <- predict(model3, TEST, 'raw')

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

TEST_3$Diabetes <- predict(model3, TEST, 'prob')$Diabetes

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

TEST.stacked <- bind_rows(TEST_1 %>% mutate(model = 'w_nums_only'),
                          TEST_2 %>% mutate(model = 'w_cluster'),
                          TEST_3 %>% mutate(model = "full_model")
                          )
library('yardstick')

## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.

## 
## Attaching package: 'yardstick'

## The following objects are masked from 'package:caret':
## 
##     precision, recall, sensitivity, specificity

## The following object is masked from 'package:readr':
## 
##     spec

library('glue')

## 
## Attaching package: 'glue'

## The following object is masked from 'package:dplyr':
## 
##     collapse

Aucs <- TEST.stacked %>%
  group_by(model) %>%
  roc_auc(truth = diq010, Diabetes) %>%
  glue_data("{model} AUC : {round(.estimate, 3)}")


TEST.stacked %>%
  group_by(model) %>%
  roc_curve(truth = diq010, Diabetes) %>%
  autoplot() +
  labs(caption  = paste(Aucs, collapse = " \n "))

6 Code Appendix

\(~\)

knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(factoextra)
library('caret')

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
  na.omit() 

numeric_features <- diab_pop %>%
  select_if(is.numeric) %>%
  select(-seqn) %>%
  colnames()


TrainInd <- createDataPartition(diab_pop$diq010,
                                p=.7,
                                list=FALSE)

TRAIN <- diab_pop[TrainInd,]



fviz_nbclust(TRAIN %>%
               select(numeric_features) %>%
               scale(),
              kmeans, 
              method='wss')


# Set seed for reproducible results
set.seed(1984)


pP.TRAIN <- preProcess(TRAIN, c('center','scale'))

# Run k-means algorithm
# Nstart = # random starting positions; I chose 523
km <- kmeans(predict(pP.TRAIN, TRAIN) %>% select(numeric_features), 
             centers = 5, 
             nstart = 523)

glimpse(km)
TRAIN.cluster <- cbind(TRAIN, cluster=km$cluster)

head(TRAIN.cluster)

fviz_cluster(km, 
             data = TRAIN.cluster %>%
               select(numeric_features, cluster) %>%
               scale()
             )

km$centers

library(ggplot2)

TRAIN.cluster <- TRAIN.cluster %>%
  mutate(cluster= as.factor(cluster))

TRAIN.cluster %>%
  ggplot(aes(x=ridageyr,
             y=lbxglu,
             color=cluster)) +
  geom_point()


Plot_Clusters_by_Facet = function(dat, x, y, c, z) {
     ggplot(dat, 
            aes(x = .data[[x]], 
                y = .data[[y]],
                color=.data[[c]])
            ) +
          geom_point() +
          facet_wrap(.data[[z]] ~ .)
}


Plot_Clusters_by_Facet(TRAIN.cluster, 
            x='ridageyr',
            y='lbxglu',
            c='cluster',
            z='diq010')


Plot_Clusters_by_Facet(TRAIN.cluster, 
            x='bmxbmi',
            y='lbxglu',
            c='diq010',
            z='cluster')

library('arsenal')
library('knitr')
library('kableExtra')

table_arsenal <- tableby(diq010 ~ cluster , 
                         data = TRAIN.cluster)

table_arsenal

summary(table_arsenal)

sum.table_arsenal <- summary(table_arsenal)

sum.table_arsenal

model1 <- train(as.formula('diq010 ~ ridageyr + bmxbmi + lbxglu'),
                TRAIN.cluster,
                preProcess = c('center','scale'),
                method = 'glm',
                family = 'binomial')

model2 <- train(as.formula('diq010 ~ cluster'),
                TRAIN.cluster,
                preProcess = c('center','scale'),
                method = 'glm',
                family = 'binomial')


model3 <- train(as.formula('diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu + cluster'),
                TRAIN.cluster,
                preProcess = c('center','scale'),
                method = 'glm',
                family = 'binomial')

TEST <- diab_pop[-TrainInd,]

library('clue')

cluster_predict <- clue::cl_predict(km, 
                                    predict(pP.TRAIN, TEST) %>% select(ridageyr, bmxbmi, lbxglu) 
                                    )

str(cluster_predict,1)


TEST$cluster <- clue::cl_predict(km, 
                                 predict(pP.TRAIN, TEST) %>% select(ridageyr, bmxbmi, lbxglu) 
                                 )

TEST <- TEST %>%
  mutate(cluster = as.factor(cluster))

TEST_1 <- TEST

TEST_1$estimate <- predict(model1, TEST, 'raw')
TEST_1$Diabetes <- predict(model1, TEST, 'prob')$Diabetes

TEST_2 <- TEST

TEST_2$estimate <- predict(model2, TEST, 'raw')
TEST_2$Diabetes <- predict(model2, TEST, 'prob')$Diabetes

TEST_3 <- TEST

TEST_3$estimate <- predict(model3, TEST, 'raw')
TEST_3$Diabetes <- predict(model3, TEST, 'prob')$Diabetes



TEST.stacked <- bind_rows(TEST_1 %>% mutate(model = 'w_nums_only'),
                          TEST_2 %>% mutate(model = 'w_cluster'),
                          TEST_3 %>% mutate(model = "full_model")
                          )
library('yardstick')
library('glue')


Aucs <- TEST.stacked %>%
  group_by(model) %>%
  roc_auc(truth = diq010, Diabetes) %>%
  glue_data("{model} AUC : {round(.estimate, 3)}")


TEST.stacked %>%
  group_by(model) %>%
  roc_curve(truth = diq010, Diabetes) %>%
  autoplot() +
  labs(caption  = paste(Aucs, collapse = " \n "))

Modeling with k-means clustering - diab_pop

J Kyle Armstrong, PhD

11 February 2021

1 SET UP

2 Split Data

2.1 How many Clusters should we make ?

3 Train Clustering Model

4 Train logit models

5 Dealing with the Test data

5.1 Predicting on the TEST data

6 Code Appendix