---
title: "Machine Learning Final Project: Predicting Diabetes"
author: "Nicole Branch"
output:
  bookdown::html_document2:
    code_download: true
    split_by: none
    toc: yes
    toc_depth: 3
    toc_float:
      toc_collapsed: true
    number_sections: false
---

```{r setup}
# Load data and required packages
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
```

This project was completed as the final project for a Statistical Machine Learning course. The [50/50 Diabetes Health Indicators Dataset](https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset?resource=download&select=diabetes_binary_5050split_health_indicators_BRFSS2015.csv), built from the CDC's Behavioral Risk Factor Surveillance System (BRFSS), was imported from Kaggle. It contains 70,692 cases and 21 predictor variables plus the outcome variable Diabetes_binary (0 for no diabetes, 1 for diabetes). I used a variety of the statistical machine learning methods we learned throughout the course.

In the supervised learning section, I ask which health factors are most important for predicting diabetes. The models used in this section are LASSO logistic regression, KNN, a decision tree, and bagging/random forests.

In the unsupervised learning section, I used k-medoids (PAM) and hierarchical clustering to examine what types of diagnoses a doctor might come to when considering variables such as HighBP, HighChol, and BMI.

---

```{r}
library(dplyr)
library(tidyverse)
library(readr)
library(ggplot2)
library(tidymodels)
library(probably)
library(rpart.plot)
library(ranger)
library(vip)
library(cluster)
library(purrr)
tidymodels_prefer()
```


```{r}
diabetes <- read.csv("diabetes_binary_5050split_health_indicators_BRFSS2015.csv")
```

```{r}
# Recode the numeric BRFSS age categories into readable age-range labels
diabetes <- diabetes %>%
  mutate(Age = case_when(
    Age == 1  ~ "18-24",
    Age == 2  ~ "25-29",
    Age == 3  ~ "30-34",
    Age == 4  ~ "35-39",
    Age == 5  ~ "40-44",
    Age == 6  ~ "45-49",
    Age == 7  ~ "50-54",
    Age == 8  ~ "55-59",
    Age == 9  ~ "60-64",
    Age == 10 ~ "65-69",
    Age == 11 ~ "70-74",
    Age == 12 ~ "75-79",
    TRUE      ~ "80+"
  ))


diabetes <- diabetes %>%
    mutate(Diabetes_binary = relevel(factor(Diabetes_binary), ref="0"),
           HighBP = relevel(factor(HighBP), ref="0"),
           HighChol = relevel(factor(HighChol), ref="0"),
           CholCheck = relevel(factor(CholCheck), ref="0"),
           Smoker = relevel(factor(Smoker), ref="0"),
           Stroke = relevel(factor(Stroke), ref="0"),
           HeartDiseaseorAttack= relevel(factor(HeartDiseaseorAttack), ref="0"),
           PhysActivity = relevel(factor(PhysActivity), ref="0"),
           Fruits = relevel(factor(Fruits), ref="0"),
           Veggies = relevel(factor(Veggies), ref="0"),
           HvyAlcoholConsump = relevel(factor(HvyAlcoholConsump), ref="0"),
           AnyHealthcare = relevel(factor(AnyHealthcare), ref="0"),
           NoDocbcCost = relevel(factor(NoDocbcCost), ref="0"),
           GenHlth = relevel(factor(GenHlth), ref="1"),
           DiffWalk = relevel(factor(DiffWalk), ref="0"),
           Sex = relevel(factor(Sex), ref="0"),
           Age = relevel(factor(Age), ref="18-24"),
           Education = relevel(factor(Education), ref="1"),
           Income = relevel(factor(Income), ref="1"))
```
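
As a side note, the long `relevel()` block above could be written more compactly with `across()`. Below is a sketch of an equivalent (but unused) version, assuming the same column names and reference levels:

```{r, eval=FALSE}
# Sketch (not run): relevel all of the 0/1 indicator columns in one step
binary_vars <- c("Diabetes_binary", "HighBP", "HighChol", "CholCheck", "Smoker",
                 "Stroke", "HeartDiseaseorAttack", "PhysActivity", "Fruits",
                 "Veggies", "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost",
                 "DiffWalk", "Sex")

diabetes %>%
  mutate(
    across(all_of(binary_vars), ~ relevel(factor(.x), ref = "0")),
    GenHlth   = relevel(factor(GenHlth), ref = "1"),
    Age       = relevel(factor(Age), ref = "18-24"),
    Education = relevel(factor(Education), ref = "1"),
    Income    = relevel(factor(Income), ref = "1")
  )
```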

# Supervised Learning Question

>What are the most important health factors for predicting diabetes?


## Exploratory Analyses


```{r}
diabetes %>% count(Diabetes_binary)
```

+ The data contains a 50-50 split of individuals without diabetes (0) and with diabetes (1).


### Predictors vs Outcome Plots

```{r}
ggplot(diabetes, aes(fill = Age, x = Diabetes_binary)) +
  geom_bar()

ggplot(diabetes) +
  geom_bar(aes(x = Diabetes_binary, fill = HighBP))

ggplot(diabetes) +
  geom_bar(aes(x = Diabetes_binary, fill = HighChol))

ggplot(diabetes) +
  geom_boxplot(aes(x = Diabetes_binary, y = BMI))
```

+ Individuals with diabetes (1) tend to be older than individuals with no diabetes (0).

+ More individuals with diabetes have high blood pressure compared to individuals with no diabetes.

+ More individuals with diabetes/prediabetes have high cholesterol compared to individuals with no diabetes.

+ Individuals with diabetes/prediabetes have a slightly higher BMI on average compared to individuals with no diabetes.

---



## LASSO logistic regression

### Fit LASSO Model

```{r}
set.seed(123)


# Set up CV folds
data_cv <- vfold_cv(diabetes, v = 12)

# LASSO logistic regression model specification
logistic_lasso_spec <- logistic_reg() %>%
    set_engine("glmnet") %>%
    set_args(mixture = 1, penalty = tune()) %>% # go through each penalty
    set_mode("classification")

# Recipe
logistic_lasso_rec <- recipe(Diabetes_binary ~ ., data = diabetes) %>%
    step_normalize(all_numeric_predictors()) %>% 
    step_dummy(all_nominal_predictors())

# Workflow (Recipe + Model)
log_lasso_wf <- workflow() %>%
    add_model(logistic_lasso_spec) %>%
    add_recipe(logistic_lasso_rec)

# Tune model: specify grid of parameters and tune
penalty_grid <- grid_regular(
    penalty(range = c(-5,3)), # log10 scale, lambda seq 10^-5 to 10^3
    levels = 100
)

tune_output <- tune_grid(
    log_lasso_wf,
    resamples = data_cv,
    metrics = metric_set(roc_auc, accuracy),
    grid = penalty_grid
)
```

### Select and Fit Best LASSO Model

```{r}
autoplot(tune_output) + theme_classic()
```


```{r}
# Select "best" penalty by one standard error
best_se_penalty <- select_by_one_std_err(tune_output, metric = "roc_auc", desc(penalty))

# Define workflow with "best" penalty value
final_wf <- finalize_workflow(log_lasso_wf, best_se_penalty)

# Use final_wf to fit final model with "best" penalty value
final_fit_se <- fit(final_wf, data = diabetes)
```
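
For intuition, the one-standard-error rule applied by `select_by_one_std_err()` can be reproduced approximately by hand from the collected CV metrics. A sketch, assuming `tune_output` from above (not evaluated here):

```{r, eval=FALSE}
# Sketch (not run): pick the largest penalty whose mean CV ROC AUC is within
# one standard error of the best mean CV ROC AUC
cv_roc <- tune_output %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc")

best <- cv_roc %>% slice_max(mean, n = 1, with_ties = FALSE)

cv_roc %>%
  filter(mean >= best$mean - best$std_err) %>%  # within one SE of the best
  slice_max(penalty, n = 1)                     # most regularized such penalty
```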

### Variable Importance

```{r}
glmnet_output <- final_fit_se %>% extract_fit_parsnip() %>% pluck("fit") # get the original glmnet output

plot(glmnet_output, xvar = "lambda", label = TRUE, col = rainbow(20))
```

Least important variables:

```{r}
final_fit_se %>% tidy() %>% filter(estimate == 0) # Unimportant variables
```

Most important variables:

```{r}
final_fit_se %>% tidy() %>% filter(estimate != 0) %>% 
  arrange(desc(abs(estimate))) # Important variables
```
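
Because the numeric predictors were normalized and the categorical ones dummy-coded, these coefficients are on the (standardized) log-odds scale; exponentiating them gives approximate odds ratios. A sketch, assuming `final_fit_se` from above:

```{r, eval=FALSE}
# Sketch (not run): express the non-zero LASSO coefficients as odds ratios
final_fit_se %>%
  tidy() %>%
  filter(term != "(Intercept)", estimate != 0) %>%
  mutate(odds_ratio = exp(estimate)) %>%
  arrange(desc(abs(estimate)))
```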

### Model Evaluation

Accuracy and ROC AUC measures

```{r}
tune_output %>%
    collect_metrics() %>%
    filter(penalty == best_se_penalty %>% pull(penalty)) # CV estimates (mean and std. error across the 12 folds) at the chosen penalty
```

No information rate

```{r}
35346/(35346+35346)
```
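
Equivalently, the no-information rate is just the largest class proportion and could be computed from the data rather than typed by hand; a one-line sketch:

```{r, eval=FALSE}
# Sketch (not run): no-information rate = proportion of the most common class
diabetes %>%
  count(Diabetes_binary) %>%
  summarise(nir = max(n) / sum(n))
```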


### Misclassification Plots

```{r}
lasso_mod_output <- predict(final_fit_se, new_data = diabetes) %>%
    bind_cols(diabetes)

pred_labels <- c("0" = "Actual 0", "1"  = "Actual 1")

# Plots
ggplot(lasso_mod_output, aes(x = BMI, fill = .pred_class)) +
  geom_histogram() +
  facet_wrap(~Diabetes_binary) +
  labs(fill = "Predictions from LASSO", title = "BMI Predictions") +
  theme_classic()
 
ggplot(lasso_mod_output, aes(x = HighBP, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from LASSO", title = "HighBP Predictions") +
  theme_classic()

ggplot(lasso_mod_output, aes(x = HighChol, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from LASSO", title = "HighChol Predictions") +
  theme_classic()

ggplot(lasso_mod_output, aes(x = Age, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from LASSO", title = "Age Predictions") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

ggplot(lasso_mod_output, aes(x = GenHlth, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from LASSO", title = "GenHlth Predictions") +
  theme_classic()
```
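
To put numbers behind these plots, an in-sample confusion matrix for the LASSO predictions (analogous to the one computed for KNN later) could be obtained as follows — a sketch, assuming `lasso_mod_output` from above:

```{r, eval=FALSE}
# Sketch (not run): in-sample confusion matrix for the LASSO fit
conf_mat(
  data = lasso_mod_output,
  truth = Diabetes_binary,
  estimate = .pred_class
)
```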


### LASSO Summary
  
The best value of lambda, chosen by selecting the best penalty within one standard error, is 0.003199267. The coefficient signs and magnitudes for the LASSO model make sense, as the terms with the largest impact are commonly known risk factors for, or medical complications of, diabetes.

With the set of penalty values that I used, several coefficients were shrunk to 0; in particular, PhysHlth, Smoker, Fruits, AnyHealthcare, and NoDocbcCost were not particularly important for predicting whether an individual has diabetes, after accounting for the other features in the model. The variables with the greatest importance included GenHlth, HighBP, BMI, HighChol, and Age, which lines up with well-known risk factors and complications of diabetes. Of the unimportant variables, I was most surprised by the smoking variable, since the CDC reports that smokers have a greater risk of developing type 2 diabetes than nonsmokers.

The model's CV accuracy (74.9%) and ROC AUC (82.5%) are higher than the no-information rate of 50%, so the accuracy isn't purely driven by an overwhelming majority in one class.

This LASSO model is good at predicting the outcome when an individual's risk-factor values line up with that outcome, but less good when they don't. For example, it predicted diabetes (1) well for individuals with high blood pressure (HighBP = 1), but had a harder time for individuals without high blood pressure (HighBP = 0).


---


## K-nearest neighbors

### Fit KNN Model

```{r}
knn_spec <- nearest_neighbor() %>% # General model type
    set_args(neighbors = tune()) %>% # tuning parameter
    set_engine(engine = "kknn") %>% # Engine name
    set_mode("classification")

diabetes_cv <- vfold_cv(diabetes, v = 6) # Supply dataset and # of folds

diabetes_rec <- recipe(Diabetes_binary ~ HighBP + BMI + HighChol + DiffWalk + GenHlth + HeartDiseaseorAttack + Income + CholCheck + HvyAlcoholConsump + Age + Education + Sex + Stroke + PhysActivity + Veggies + MentHlth, data = diabetes) %>% 
    step_dummy(all_nominal_predictors()) %>%
    step_normalize(all_numeric_predictors())

diabetes_wf <- workflow() %>%
    add_model(knn_spec) %>% # Model specification object
    add_recipe(diabetes_rec) # Data preprocessing recipe object

tuning_param_grid <- grid_regular(
    neighbors(range = c(1, 100)), # min and max of values for neighbors
    levels = 5 # number of neighbors values
)

knn_tune_output <- tune_grid(
    diabetes_wf,
    resamples = diabetes_cv,
    metrics = metric_set(roc_auc, accuracy),
    grid = tuning_param_grid
)

```

### Select and Fit Best Number of Neighbors

```{r}
autoplot(knn_tune_output) + theme_classic()
```

```{r}
## Choose the largest neighbors value (the simplest, smoothest model) whose performance is within 1 std. err. of the best
knn_tune_output %>% 
    select_by_one_std_err(metric = "roc_auc", desc(neighbors)) ## desc(neighbors) sorts from the highest to the lowest number of neighbors (simplest -> most complex)
knn_tune_output %>% 
    select_by_one_std_err(metric = "accuracy", desc(neighbors))
```


```{r}
best_se_neighbors <- select_by_one_std_err(knn_tune_output, metric = "roc_auc", desc(neighbors))
final_wf <- finalize_workflow(diabetes_wf, best_se_neighbors) 
final_fit <- fit(final_wf, data = diabetes)
```
  
### Model Evaluation

```{r}
# Show evaluation metrics for different values of neighbors, ordered
knn_tune_output %>% show_best(metric = "roc_auc")
knn_tune_output %>% show_best(metric = "accuracy")
```

### Misclassification Plots

```{r}
# Use the best model to make predictions
knn_mod_out <- predict(final_fit, new_data = diabetes) %>%
    bind_cols(diabetes)

pred_labels <- c("0" = "Actual 0", "1"  = "Actual 1")

# Plots
ggplot(knn_mod_out, aes(x = BMI, fill = .pred_class)) +
  geom_histogram() +
  labs(y = "Count", fill = "Predictions from KNN") +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  theme_classic()
 
ggplot(knn_mod_out, aes(x = HighBP, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from KNN") +
  theme_classic()

ggplot(knn_mod_out, aes(x = HighChol, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from KNN") +
  theme_classic()

ggplot(knn_mod_out, aes(x = Age, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from KNN") +
  theme_classic()

ggplot(knn_mod_out, aes(x = GenHlth, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from KNN") +
  theme_classic()
```


```{r}
conf_mat( # confusion matrix
    data = knn_mod_out,
    truth = Diabetes_binary,
    estimate = .pred_class
)
```


### KNN Summary

The best value for the number of neighbors was 100, chosen by selecting the largest number of neighbors within one standard error of the best so as to get a well-performing model with the least chance of overfitting. Similarly to the LASSO model, my KNN model is good at predicting diabetes when the risk factors line up with the outcome, but struggles more when a risk factor points the other way. For example, when HighChol = 1 and diabetes was not present, the model's accuracy was only around 60%, in contrast with its higher accuracy when HighChol and diabetes were both present.

With 100 neighbors, the cross-validated accuracy is about 73.4% (standard error: 0.0011) and the ROC AUC is about 80.7% (standard error: 0.0011).
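
The group-specific behavior described above can be quantified directly from the in-sample predictions; a sketch, assuming `knn_mod_out`:

```{r, eval=FALSE}
# Sketch (not run): in-sample accuracy broken down by HighChol and true diabetes status
knn_mod_out %>%
  group_by(HighChol, Diabetes_binary) %>%
  summarise(accuracy = mean(.pred_class == Diabetes_binary), .groups = "drop")
```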


---


## Decision Tree

### Fit Decision Tree

```{r}
ct_spec <- decision_tree() %>%
  set_engine(engine = "rpart") %>%
  set_args(
      cost_complexity = tune(),
      min_n = 2,
      tree_depth = NULL
    ) %>%
  set_mode("classification")


data_cv <- vfold_cv(diabetes, v = 6)

data_rec <- recipe(Diabetes_binary ~ ., data = diabetes)


data_wf <- workflow() %>% 
  add_model(ct_spec) %>%
  add_recipe(data_rec)

param_grid <- grid_regular(
  cost_complexity(range = c(-5, -1)),
  levels = 10)

tune_res <- tune_grid(
  data_wf, 
  resamples = data_cv, 
  grid = param_grid, 
  metrics = metric_set(accuracy, roc_auc) 
)
```

### Select and Fit Best Tree

```{r}
autoplot(tune_res) + theme_classic()
```

```{r}
best_complexity <- select_by_one_std_err(tune_res, metric = 'roc_auc', desc(cost_complexity))
data_wf_final <- finalize_workflow(data_wf, best_complexity)

final_fit <- fit(data_wf_final, data = diabetes)
```

### Visualize Tree

```{r, warning=FALSE}
# Plot the tree (rpart.plot package)
final_fit %>%
    extract_fit_engine() %>%
    rpart.plot()
```

### Variable Importance

```{r}
# Variable importance metrics 
# Sum of the goodness of split measures (impurity reduction) for each split for which it was the primary variable.
final_fit %>%
    extract_fit_engine() %>%
    pluck('variable.importance')

# Predictions and Exploring Error
diabetes_preds <- diabetes %>%
    mutate(
        pred_prob = predict(final_fit, new_data = diabetes, type = "prob"),
        pred_class = predict(final_fit, new_data = diabetes, type = "class")
    )
diabetes_preds1 <- diabetes_preds %>% mutate(misclassified = Diabetes_binary!=pred_class$.pred_class)
```

### Model Evaluation

```{r}
tune_res %>% 
    select_by_one_std_err(metric = "roc_auc", desc(cost_complexity))

tune_res %>% 
    select_by_one_std_err(metric = "accuracy", desc(cost_complexity))
```
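
For comparability with the KNN and random forest sections, an in-sample confusion matrix for the pruned tree could be computed from the predictions created above — a sketch, assuming `diabetes_preds1`:

```{r, eval=FALSE}
# Sketch (not run): in-sample confusion matrix for the pruned decision tree
diabetes_preds1 %>%
  mutate(tree_pred = pred_class$.pred_class) %>%  # flatten the prediction data-frame column
  conf_mat(truth = Diabetes_binary, estimate = tree_pred)
```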

### Misclassification Plots

```{r}
ggplot(diabetes_preds1, aes(x = misclassified, fill = HighBP)) +
  geom_bar() +
  labs(title = "High BP vs Predictions")

ggplot(diabetes_preds1, aes(x = misclassified, fill = GenHlth)) +
  geom_bar(position = "fill") +
  labs(title = "GenHlth vs Predictions", y = "%")

ggplot(diabetes_preds1, aes(x = misclassified, fill = Age)) +
  geom_bar(position = "fill")  +
  labs(title = "Age vs Predictions", y = "%")

ggplot(diabetes_preds1, aes(y = BMI, x = misclassified)) +
  geom_boxplot() +
  labs(title = "BMI vs Predictions")

ggplot(diabetes_preds1, aes(x = misclassified, fill = HighChol)) +
  geom_bar() +
  labs(title = "HighChol vs Predictions")
```

### Decision Tree Summary

The decision tree gives a slightly different variable importance ordering than LASSO: the top five again contain HighBP, GenHlth, and BMI, but also include Age and HighChol. The least important variables differ more, with the five least important predictors including Veggies, Fruits, AnyHealthcare, Stroke, and CholCheck.

For most of the predictors, the misclassification rate looks similar across groups, illustrating a strength of the decision tree model. For HighBP and HighChol, the proportion of individuals with each condition was roughly the same among cases the model classified correctly and cases it did not, so there doesn't seem to be a bias where the model simply attaches a predictor to having diabetes. The exception is that there appears to be a higher misclassification rate among individuals with a GenHlth rating of "Average" (3) and among the older Age categories.

The cross-validated accuracy of this decision tree is around 74% (standard error: 0.003), and the ROC AUC is around 80% (standard error: 0.001).


## Bagging and Random Forests

### Fit Random Forest Model
```{r}
# Model Specification
rf_spec <- rand_forest() %>%
    set_engine(engine = "ranger") %>% 
    set_args(
        mtry = NULL, # size of the random predictor subset at each split (NULL = engine default, about sqrt(# predictors))
        trees = 1000, # Number of trees
        min_n = 2,
        probability = FALSE, # FALSE: get hard predictions
        importance = "impurity"
    ) %>%
    set_mode("classification")

# Recipe
data_rec <- recipe(Diabetes_binary ~ ., data = diabetes)

# Workflows
data_wf <- workflow() %>%
    add_model(rf_spec) %>%
    add_recipe(data_rec)

# No tune_grid() or vfold_cv(): out-of-bag (OOB) error is used for evaluation instead
rf_fit <- fit(data_wf, data = diabetes)

```

### Variable Importance

```{r}
# Plot of the variable importance information
rf_fit %>% 
    extract_fit_engine() %>% 
    vip(num_features = 30) + theme_classic()

# Extract the numerical information on variable importance and display the most and least important predictors
rf_var_imp <- rf_fit %>% 
    extract_fit_engine() %>%
    vip::vi()
head(rf_var_imp)
tail(rf_var_imp)
```

### Model Evaluation

```{r}
rf_fit
```
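
The OOB prediction error printed above can also be extracted programmatically and turned into an OOB accuracy; a small sketch, assuming the fitted `rf_fit` (ranger stores the error as `prediction.error`):

```{r, eval=FALSE}
# Sketch (not run): OOB accuracy = 1 - OOB prediction error reported by ranger
1 - (rf_fit %>% extract_fit_engine() %>% pluck("prediction.error"))
```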


### Misclassification Plots

```{r}
# OOB confusion matrix
rf_output <- diabetes %>%
    mutate(OOB_pred_diabetes = rf_fit %>% extract_fit_engine() %>% pluck("predictions")) # extracts OOB predictions

conf_mat( # confusion matrix
    data = rf_output,
    truth = Diabetes_binary,
    estimate = OOB_pred_diabetes
)

rf_output <- rf_output %>%
    mutate(is_misclass = Diabetes_binary!=OOB_pred_diabetes)
# Plots
ggplot(rf_output, aes(x = is_misclass, y = BMI)) +
  geom_boxplot() + 
  labs(title = "BMI vs Prediction Misclassification", x = "misclassified")

ggplot(rf_output, aes(x = is_misclass, fill = HighBP)) +
  geom_bar() +
  labs(title = "HighBP vs Prediction Misclassification", x = "misclassified")

ggplot(rf_output, aes(x = is_misclass, fill = HighChol)) +
  geom_bar() +
  labs(title = "HighChol vs Prediction Misclassification", x = "misclassified")

ggplot(rf_output, aes(x = is_misclass, fill = GenHlth)) +
  geom_bar(position = "fill") +
  labs(title = "GenHlth vs Prediction Misclassification", y = "%", x = "misclassified")

ggplot(rf_output, aes(x = is_misclass, fill = Age)) +
  geom_bar(position = "fill") +
  labs(title = "Age vs Prediction Misclassification", y = "%", x = "misclassified")
```

### Bagging and Random Forest Summary

Variable importance for the random forest is very similar to that of the decision tree model, except that Income enters the top five most important variables in place of HighChol. The least important variables also remained mostly the same, except that HvyAlcoholConsump replaced the Fruits variable.

Similarly to the decision tree, for most of the predictors the misclassification rate looks similar across groups, illustrating a strength of the bagging/random forest approach. The exception is again GenHlth, where there appears to be a higher misclassification rate among individuals with a rating of "Average" (3), and the older categories of the Age variable.

The out-of-bag prediction error rate for this model is 25.26%, which puts the OOB accuracy at around 75%. Using the OOB confusion matrix, I calculated the sensitivity and specificity of the model, which were about 79% and 70%, respectively (treating diabetes, level 1, as the positive class).
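
Rather than reading sensitivity and specificity off the confusion matrix by hand, they can be computed with yardstick; a sketch, assuming `rf_output` and treating diabetes (the second factor level, 1) as the event of interest:

```{r, eval=FALSE}
# Sketch (not run): OOB sensitivity and specificity with diabetes (level "1") as the event
sens(rf_output, truth = Diabetes_binary, estimate = OOB_pred_diabetes, event_level = "second")
spec(rf_output, truth = Diabetes_binary, estimate = OOB_pred_diabetes, event_level = "second")
```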

## Supervised Learning Answer

Across the supervised learning models, the most consistently important variables included HighBP, BMI, GenHlth, and Age, and the most consistently unimportant variables included Stroke, Veggies, CholCheck, AnyHealthcare, and Fruits. These results come from models with cross-validated (or out-of-bag) accuracies of roughly 73-75% and ROC AUCs of roughly 80-83%.
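
For a side-by-side view, the estimates reported in the model summaries above (rounded) can be gathered into one small table — a sketch with hand-transcribed values, not computed from the fits:

```{r, eval=FALSE}
# Sketch (not run): cross-validated / OOB estimates transcribed from the sections above
tibble::tribble(
  ~model,                        ~accuracy, ~roc_auc,
  "LASSO logistic (12-fold CV)",     0.749,    0.825,
  "KNN, k = 100 (6-fold CV)",        0.734,    0.807,
  "Decision tree (6-fold CV)",       0.740,    0.802,
  "Random forest (OOB)",             0.747,       NA
)
```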


# Unsupervised Learning Question

> What types of diagnoses might a doctor come to when considering variables such as HighBP, HighChol, and BMI?


## K-Medoids (PAM) Clustering

```{r}
# Random subsample of 100 cases for clustering (seed set for reproducibility)
set.seed(253)
diabetes_clust <- diabetes %>% 
  select(HighBP, HighChol, BMI, Diabetes_binary) %>% 
  slice_sample(n = 100)


# Choosing an appropriate number of clusters
# Create storage vector for the total within-cluster dissimilarity
tot_wc_ss <- rep(0, 20) 

# Loop over candidate numbers of clusters
for (k in 1:20) {
    # Perform PAM (k-medoids) clustering on the pairwise dissimilarities
    pam_out <- pam(daisy(diabetes_clust), k = k)

    # Store the total within-cluster dissimilarity (average dissimilarity x cluster size)
    tot_wc_ss[k] <- sum(pam_out$clusinfo[,"av_diss"]*pam_out$clusinfo[,"size"])
}

plot(1:20, tot_wc_ss, xlab = "Number of clusters", ylab = "Total within-cluster dissimilarity")
```
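
Average silhouette width is another common guide for choosing the number of clusters and is returned by `pam()` directly; a sketch using the same subsample (`purrr` is already loaded):

```{r, eval=FALSE}
# Sketch (not run): average silhouette width for k = 2, ..., 10 clusters
diss <- daisy(diabetes_clust)
avg_sil <- purrr::map_dbl(2:10, ~ pam(diss, k = .x)$silinfo$avg.width)
plot(2:10, avg_sil, type = "b",
     xlab = "Number of clusters", ylab = "Average silhouette width")
```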


## Hierarchical Clustering
```{r}
# Hierarchical clustering on the same subsample of 100 cases drawn above
set.seed(253)


# Compute a dissimilarity matrix for the clustering variables (daisy handles the mix of numeric and binary variables)
dist_mat_scaled <- dist(daisy(diabetes_clust %>% select(BMI, HighBP, HighChol)))

# The dissimilarity matrix is the input to hclust()
# The method argument indicates the linkage type
hc_complete <- hclust(dist_mat_scaled, method = "complete")
hc_single <- hclust(dist_mat_scaled, method = "single")
hc_average <- hclust(dist_mat_scaled, method = "average")
hc_centroid <- hclust(dist_mat_scaled, method = "centroid")

# Plot dendrograms
plot(hc_complete, labels = diabetes_clust$Diabetes_binary)
plot(hc_single, labels = diabetes_clust$Diabetes_binary)
plot(hc_average, labels = diabetes_clust$Diabetes_binary)
plot(hc_centroid, labels = diabetes_clust$Diabetes_binary)

diabetes4 <- diabetes_clust %>%
    mutate(
        hclust_height1.5 = factor(cutree(hc_complete, h = 1.5)), # Cut the dendrogram at height h = 1.5
        hclust_num3 = factor(cutree(hc_complete, k = 3)) # Cut into k = 3 clusters
    )
```

## Cluster Plots

```{r}
ggplot(diabetes4, aes(x = hclust_num3, y = BMI)) +
    geom_boxplot() +
    labs(x = "Cluster", title = "BMI vs Clusters")

ggplot(diabetes4, aes(x = hclust_num3, fill = HighBP)) +
    geom_bar() +
    labs(x = "Cluster", title = "HighBP vs Clusters")

ggplot(diabetes4, aes(x = hclust_num3, fill = HighChol)) +
    geom_bar() +
    labs(x = "Cluster", title = "HighChol vs Clusters")
```


## Clustering Summary
    
The point at which there are no longer meaningful decreases in heterogeneity occurs somewhere between 3 and 6 clusters. Keeping in mind my goal of finding diagnosis types, I favored a smaller number of clusters and created plots for 3 clusters:

+ Cluster 1: individuals who generally have high blood pressure, mixed cholesterol levels, and a high BMI.

+ Cluster 2: individuals who generally have non-high blood pressure, high cholesterol, and a medium BMI.

+ Cluster 3: individuals who generally have non-high blood pressure, non-high cholesterol, and a low BMI.

To answer the unsupervised learning question, I would hypothesize that cluster 1 could correspond to a diagnosis of diabetes, cluster 2 to a diagnosis of prediabetes, and cluster 3 to a diagnosis of no diabetes.
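
These qualitative descriptions can be checked with a quick numerical profile of each cluster — a sketch, assuming `diabetes4` from above:

```{r, eval=FALSE}
# Sketch (not run): per-cluster summaries of the clustering variables and the (held-out) outcome
diabetes4 %>%
  group_by(hclust_num3) %>%
  summarise(
    n = n(),
    mean_BMI = mean(BMI),
    prop_HighBP = mean(HighBP == "1"),
    prop_HighChol = mean(HighChol == "1"),
    prop_diabetes = mean(Diabetes_binary == "1")
  )
```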

# Cautions and Limitations

Looking back at these models, I would use them as diagnostic tools only very cautiously, given the modest accuracy and ROC AUC evaluation metrics (roughly 73% to 83%). It is also important to note that predictors are not the same as causes and should not be treated as such when interpreting these models. As for limitations, the data were collected through phone surveys and may therefore be prone to the biases that come with that method. Another limitation was the time constraint for this project: the dataset is large enough that many of the models take significant computational time to run.

