---
title: "Machine Learning Final Project: Predicting Diabetes"
author: "Nicole Branch"
output:
  bookdown::html_document2:
    code_download: true
    split_by: none
    toc: yes
    toc_depth: 3
    toc_float:
      toc_collapsed: true
    number_sections: false
---

```{r setup}
# Global knitr chunk options (suppress warnings and messages in the rendered report)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
```

This project was completed as the final project for a Statistical Machine Learning course. The [50/50 Diabetes Health Indicators Dataset](https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset?resource=download&select=diabetes_binary_5050split_health_indicators_BRFSS2015.csv), built from the CDC's Behavioral Risk Factor Surveillance System (BRFSS), was imported from Kaggle. It contains 70,692 cases and 21 predictor variables, plus the binary outcome Diabetes_binary (0 for no diabetes, 1 for diabetes). I applied a variety of statistical machine learning methods covered in the course.

In the supervised learning section, I asked which health factors are most important for predicting diabetes. The models used in that section are LASSO logistic regression, K-nearest neighbors (KNN), a decision tree, and bagging/random forests.

In the unsupervised learning section, I used partitioning (PAM/k-medoids) and hierarchical clustering to examine what types of diagnoses a doctor might reach when considering variables such as HighBP, HighChol, and BMI.

---

```{r}
library(dplyr)
library(tidyverse)
library(readr)
library(ggplot2)
library(tidymodels)
library(probably)
library(rpart.plot)
library(ranger)
library(vip)
library(cluster)
library(purrr)
tidymodels_prefer()
```


```{r}
diabetes <- read.csv("diabetes_binary_5050split_health_indicators_BRFSS2015.csv")
```

```{r}
# Recode the BRFSS age categories (1-13) into readable age ranges
diabetes <- diabetes %>% mutate(Age = case_when(
    Age == 1 ~ "18-24",
    Age == 2 ~ "25-29",
    Age == 3 ~ "30-34",
    Age == 4 ~ "35-39",
    Age == 5 ~ "40-44",
    Age == 6 ~ "45-49",
    Age == 7 ~ "50-54",
    Age == 8 ~ "55-59",
    Age == 9 ~ "60-64",
    Age == 10 ~ "65-69",
    Age == 11 ~ "70-74",
    Age == 12 ~ "75-79",
    TRUE ~ "80+"
))


# Convert the categorical variables to factors, with "0" (or the lowest category) as the reference level
diabetes <- diabetes %>%
    mutate(Diabetes_binary = relevel(factor(Diabetes_binary), ref="0"),
           HighBP = relevel(factor(HighBP), ref="0"),
           HighChol = relevel(factor(HighChol), ref="0"),
           CholCheck = relevel(factor(CholCheck), ref="0"),
           Smoker = relevel(factor(Smoker), ref="0"),
           Stroke = relevel(factor(Stroke), ref="0"),
           HeartDiseaseorAttack= relevel(factor(HeartDiseaseorAttack), ref="0"),
           PhysActivity = relevel(factor(PhysActivity), ref="0"),
           Fruits = relevel(factor(Fruits), ref="0"),
           Veggies = relevel(factor(Veggies), ref="0"),
           HvyAlcoholConsump = relevel(factor(HvyAlcoholConsump), ref="0"),
           AnyHealthcare = relevel(factor(AnyHealthcare), ref="0"),
           NoDocbcCost = relevel(factor(NoDocbcCost), ref="0"),
           GenHlth = relevel(factor(GenHlth), ref="1"),
           DiffWalk = relevel(factor(DiffWalk), ref="0"),
           Sex = relevel(factor(Sex), ref="0"),
           Age = relevel(factor(Age), ref="18-24"),
           Education = relevel(factor(Education), ref="1"),
           Income = relevel(factor(Income), ref="1"))
```

# Supervised Learning Question

>What are the most important health factors for predicting diabetes?


## Exploratory Analyses


```{r}
diabetes %>% count(Diabetes_binary)
```

+ The data contains an even 50-50 split: 35,346 individuals without diabetes (0) and 35,346 with diabetes (1).


### Predictors vs Outcome Plots

```{r}
ggplot(diabetes, aes(fill = Age, x = Diabetes_binary)) +
  geom_bar()

ggplot(diabetes) +
  geom_bar(aes(x = Diabetes_binary, fill = HighBP))

ggplot(diabetes) +
  geom_bar(aes(x = Diabetes_binary, fill = HighChol))

ggplot(diabetes) +
  geom_boxplot(aes(x = Diabetes_binary, y = BMI))
```

+ Individuals with diabetes (1) tend to be older than individuals with no diabetes (0).

+ More individuals with diabetes have high blood pressure compared to individuals with no diabetes.

+ More individuals with diabetes/prediabetes have high cholesterol compared to individuals with no diabetes.

+ Individuals with diabetes/prediabetes have a slightly higher BMI on average compared to individuals with no diabetes (see the quick numeric check below).
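
A quick numeric check of these impressions (a sketch; the predictors are already coded as factors above):

```{r}
# Proportion with high blood pressure / high cholesterol and mean BMI, by outcome
diabetes %>%
    group_by(Diabetes_binary) %>%
    summarize(prop_HighBP = mean(HighBP == "1"),
              prop_HighChol = mean(HighChol == "1"),
              mean_BMI = mean(BMI))
```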




## LASSO Logistic Regression

### Fit LASSO Model

```{r}
set.seed(123)


# Set up CV folds
data_cv <- vfold_cv(diabetes, v = 12)

# LASSO logistic regression model specification
logistic_lasso_spec <- logistic_reg() %>%
    set_engine("glmnet") %>%
    set_args(mixture = 1, penalty = tune()) %>% # go through each penalty
    set_mode("classification")

# Recipe
logistic_lasso_rec <- recipe(Diabetes_binary ~ ., data = diabetes) %>%
    step_normalize(all_numeric_predictors()) %>% 
    step_dummy(all_nominal_predictors())

# Workflow (Recipe + Model)
log_lasso_wf <- workflow() %>%
    add_model(logistic_lasso_spec) %>%
    add_recipe(logistic_lasso_rec)

# Tune model: specify grid of parameters and tune
penalty_grid <- grid_regular(
    penalty(range = c(-5,3)), # log10 scale, lambda seq 10^-5 to 10^3
    levels = 100
)

tune_output <- tune_grid(
    log_lasso_wf,
    resamples = data_cv,
    metrics = metric_set(roc_auc, accuracy),
    grid = penalty_grid
)
```

### Select and Fit Best LASSO Model

```{r}
autoplot(tune_output) + theme_classic()
```


```{r}
# Select "best" penalty by one standard error
best_se_penalty <- select_by_one_std_err(tune_output, metric = "roc_auc", desc(penalty))

# Define workflow with "best" penalty value
final_wf <- finalize_workflow(log_lasso_wf, best_se_penalty)

# Use final_wf to fit final model with "best" penalty value
final_fit_se <- fit(final_wf, data = diabetes)
```

### Variable Importance

```{r}
glmnet_output <- final_fit_se %>% extract_fit_parsnip() %>% pluck("fit") # get the original glmnet output

plot(glmnet_output, xvar = "lambda", label = TRUE, col = rainbow(20))
```

Least important variables:

```{r}
final_fit_se %>% tidy() %>% filter(estimate == 0) # Unimportant variables
```

Most important variables:

```{r}
final_fit_se %>% tidy() %>% filter(estimate != 0) %>% 
  arrange(desc(abs(estimate))) # Important variables
```
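
To connect the dummy-coded terms back to their original variables (and to count how many variables were dropped entirely), here is a small helper sketch, assuming `step_dummy()`'s default `_X` naming of the created terms:

```{r}
# Per original variable: number of terms and how many of them were shrunk to exactly 0
final_fit_se %>% tidy() %>%
    filter(term != "(Intercept)") %>%
    mutate(variable = str_remove(term, "_X.*$")) %>% # strip the dummy-level suffix
    group_by(variable) %>%
    summarize(n_terms = n(), n_zeroed = sum(estimate == 0)) %>%
    arrange(desc(n_zeroed))
```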

### Model Evaluation

Accuracy and ROC AUC measures

```{r}
tune_output %>%
    collect_metrics() %>%
    filter(penalty == best_se_penalty %>% pull(penalty)) # get standard error of the mean (variability across multiple samples of pop'n)
```

No information rate

```{r}
35346/(35346+35346)
```
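
The same quantity can also be computed from the class counts rather than typed by hand (a small convenience sketch):

```{r}
diabetes %>%
    count(Diabetes_binary) %>%
    mutate(prop = n / sum(n)) %>%
    summarize(no_information_rate = max(prop))
```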


### Misclassification Plots

```{r}
lasso_mod_output <- predict(final_fit_se, new_data = diabetes) %>%
    bind_cols(diabetes)

pred_labels <- c("0" = "Actual 0", "1"  = "Actual 1")

# Plots
ggplot(lasso_mod_output, aes(x = BMI, fill = .pred_class)) +
  geom_histogram() +
  facet_wrap(~Diabetes_binary) +
  labs(fill = "Predictions from LASSO", title = "BMI Predictions") +
  theme_classic()
 
ggplot(lasso_mod_output, aes(x = HighBP, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from LASSO", title = "HighBP Predictions") +
  theme_classic()

ggplot(lasso_mod_output, aes(x = HighChol, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from LASSO", title = "HighChol Predictions") +
  theme_classic()

ggplot(lasso_mod_output, aes(x = Age, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from LASSO", title = "Age Predictions") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

ggplot(lasso_mod_output, aes(x = GenHlth, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from LASSO", title = "GenHlth Predictions") +
  theme_classic()
```
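
As a numeric companion to these plots, the LASSO predictions can be summarized in a confusion matrix, in the same way as for the later models (sketch):

```{r}
conf_mat(
    data = lasso_mod_output,
    truth = Diabetes_binary,
    estimate = .pred_class
)
```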


### LASSO Summary
  
The best value of lambda, chosen by the one-standard-error rule, is 0.003199267 (about 0.0032). The coefficient signs and magnitudes for the LASSO model make sense: the terms that most strongly affect the predictions correspond to commonly known risk factors for, and medical complications of, diabetes.

With the set of penalty values I used, five variables had all of their coefficients set to 0: PhysHlth, Smoker, Fruits, AnyHealthcare, and NoDocbcCost were not particularly important for predicting whether an individual has diabetes, after accounting for the other features in the model (a few individual levels of Age, Education, and Income were also dropped). The largest coefficient magnitudes belonged to levels of GenHlth, CholCheck, Age, HighBP, and HvyAlcoholConsump, which lines up with well-known risk factors and complications of diabetes. Of the unimportant variables, I was most surprised by Smoker, since the CDC reports that smokers have a greater risk of developing type 2 diabetes than non-smokers.

The model's cross-validated accuracy (74.9%) and ROC AUC (82.5%) are well above the no-information rate of 50%, so the accuracy is not driven by an overwhelming majority in one class.

This LASSO model does well when a risk factor points in the same direction as the outcome, but worse when it does not. For example, it readily predicted diabetes (1) for individuals with high blood pressure (HighBP = 1), but misclassified more of the cases where a predictor and the outcome disagree (for example, diabetes with HighBP = 0).




## K-nearest neighbors

### Fit KNN Model

```{r}
knn_spec <- nearest_neighbor() %>% # General model type
    set_args(neighbors = tune()) %>% # tuning parameter
    set_engine(engine = "kknn") %>% # Engine name
    set_mode("classification")

diabetes_cv <- vfold_cv(diabetes, v = 6) # Supply dataset and # of folds

diabetes_rec <- recipe(Diabetes_binary ~ HighBP + BMI + HighChol + DiffWalk + GenHlth + HeartDiseaseorAttack + Income + CholCheck + HvyAlcoholConsump + Age + Education + Sex + Stroke + PhysActivity + Veggies + MentHlth, data = diabetes) %>% 
    step_dummy(all_nominal_predictors()) %>%
    step_normalize(all_numeric_predictors())

diabetes_wf <- workflow() %>%
    add_model(knn_spec) %>% # Model specification object
    add_recipe(diabetes_rec) # Data preprocessing recipe object

tuning_param_grid <- grid_regular(
    neighbors(range = c(1, 100)), # min and max of values for neighbors
    levels = 5 # number of neighbors values
)

knn_tune_output <- tune_grid(
    diabetes_wf,
    resamples = diabetes_cv,
    metrics = metric_set(roc_auc, accuracy),
    grid = tuning_param_grid
)

```

### Select and Fit Best Number of Neighbors

```{r}
autoplot(knn_tune_output) + theme_classic()
```

```{r}
## Choose the largest number of neighbors (simplest model) whose performance is within 1 std. err. of the best
knn_tune_output %>% 
    select_by_one_std_err(metric = "roc_auc", desc(neighbors)) ## The desc(neighbors) sorts the data from highest to lowest # of neighbors (most simple -> most complex)
knn_tune_output %>% 
    select_by_one_std_err(metric = "accuracy", desc(neighbors))
```


```{r}
best_se_neighbors <- select_by_one_std_err(knn_tune_output, metric = "roc_auc", desc(neighbors))
final_wf <- finalize_workflow(diabetes_wf, best_se_neighbors) 
final_fit <- fit(final_wf, data = diabetes)
```
  
### Model Evaluation

```{r}
# Show evaluation metrics for different values of neighbors, ordered
knn_tune_output %>% show_best(metric = "roc_auc")
knn_tune_output %>% show_best(metric = "accuracy")
```

### Misclassification Plots

```{r}
# Use the best model to make predictions
knn_mod_out <- predict(final_fit, new_data = diabetes) %>%
    bind_cols(diabetes)

pred_labels <- c("0" = "Actual 0", "1"  = "Actual 1")

# Plots
ggplot(knn_mod_out, aes(x = BMI, fill = .pred_class)) +
  geom_histogram() +
  labs(y = "BMI", fill = "Predictions from KNN") +
  facet_wrap(~Diabetes_binary) +
  theme_classic()
 
ggplot(knn_mod_out, aes(x = HighBP, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from KNN") +
  theme_classic()

ggplot(knn_mod_out, aes(x = HighChol, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from KNN") +
  theme_classic()

ggplot(knn_mod_out, aes(x = Age, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from KNN") +
  theme_classic()

ggplot(knn_mod_out, aes(x = GenHlth, fill = .pred_class)) +
  geom_bar() +
  facet_wrap(~Diabetes_binary, labeller = labeller(Diabetes_binary = pred_labels)) +
  labs(y = "Count", fill = "Predictions from KNN") +
  theme_classic()
```


```{r}
conf_mat( # confusion matrix
    data = knn_mod_out,
    truth = Diabetes_binary,
    estimate = .pred_class
)
```
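
To see where those errors concentrate, accuracy can also be broken down by predictor level; a small sketch using the objects above (HighChol is shown, but any predictor could be substituted):

```{r}
# In-sample accuracy within each combination of outcome and HighChol level
knn_mod_out %>%
    group_by(Diabetes_binary, HighChol) %>%
    summarize(accuracy = mean(.pred_class == Diabetes_binary), .groups = "drop")
```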


### KNN Summary

The best value for the number of neighbors is 100, chosen with the one-standard-error rule so that the selected model is the simplest (largest-neighborhood) one whose performance is within one standard error of the best, reducing the chance of overfitting. Similarly to the LASSO model, the KNN model is good at predicting diabetes when a risk factor is in line with the outcome, but struggles more when the risk factor contradicts the outcome. For example, when HighChol = 1 and diabetes was not present, the model's accuracy was only around 60%, in contrast with its higher accuracy when HighChol and diabetes were both present.

With 100 neighbors, the cross-validated accuracy is 73.4% (standard error: 0.0011) and the ROC AUC is 80.7% (standard error: 0.0011).




## Decision Tree

### Fit Decision Tree

```{r}
ct_spec <- decision_tree() %>%
  set_engine(engine = "rpart") %>%
  set_args(
      cost_complexity = tune(),
      min_n = 2,
      tree_depth = NULL
    ) %>%
  set_mode("classification")


data_cv <- vfold_cv(diabetes, v = 6)

data_rec <- recipe(Diabetes_binary ~ ., data = diabetes)


data_wf <- workflow() %>% 
  add_model(ct_spec) %>%
  add_recipe(data_rec)

param_grid <- grid_regular(
  cost_complexity(range = c(-5, -1)),
  levels = 10)

tune_res <- tune_grid(
  data_wf, 
  resamples = data_cv, 
  grid = param_grid, 
  metrics = metric_set(accuracy, roc_auc) 
)
```

### Select and Fit Best Tree

```{r}
autoplot(tune_res) + theme_classic()
```

```{r}
best_complexity <- select_by_one_std_err(tune_res, metric = 'roc_auc', desc(cost_complexity))
data_wf_final <- finalize_workflow(data_wf, best_complexity)

final_fit <- fit(data_wf_final, data = diabetes)
```

### Visualize Tree

```{r, warning=FALSE}
# Plot the tree (rpart.plot package)
final_fit %>%
    extract_fit_engine() %>%
    rpart.plot()
```

### Variable Importance

```{r}
# Variable importance metrics 
# Sum of the goodness of split measures (impurity reduction) for each split for which it was the primary variable.
final_fit %>%
    extract_fit_engine() %>%
    pluck('variable.importance')

# Predictions and Exploring Error
diabetes_preds <- diabetes %>%
    mutate(
        pred_prob = predict(final_fit, new_data = diabetes, type = "prob"),
        pred_class = predict(final_fit, new_data = diabetes, type = "class")
    )
diabetes_preds1 <- diabetes_preds %>% mutate(misclassified = Diabetes_binary!=pred_class$.pred_class)
```
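
For easier reading, the named importance vector can be reshaped into a tibble (a small sketch using `tibble::enframe()`):

```{r}
final_fit %>%
    extract_fit_engine() %>%
    pluck('variable.importance') %>%
    enframe(name = "variable", value = "importance") %>%
    arrange(desc(importance))
```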

### Model Evaluation

```{r}
tune_res %>% 
    select_by_one_std_err(metric = "roc_auc", desc(cost_complexity))

tune_res %>% 
    select_by_one_std_err(metric = "accuracy", desc(cost_complexity))
```

### Misclassification Plots

```{r}
ggplot(diabetes_preds1, aes(x = misclassified, fill = HighBP)) +
  geom_bar() +
  labs(title = "High BP vs Predictions")

ggplot(diabetes_preds1, aes(x = misclassified, fill = GenHlth)) +
  geom_bar(position = "fill") +
  labs(title = "GenHlth vs Predictions", y = "%")

ggplot(diabetes_preds1, aes(x = misclassified, fill = Age)) +
  geom_bar(position = "fill")  +
  labs(title = "Age vs Predictions", y = "%")

ggplot(diabetes_preds1, aes(y = BMI, x = misclassified)) +
  geom_boxplot() +
  labs(title = "BMI vs Predictions")

ggplot(diabetes_preds1, aes(x = misclassified, fill = HighChol)) +
  geom_bar() +
  labs(title = "HighChol vs Predictions")
```
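
The claim about general health in the summary below can be quantified directly (sketch):

```{r}
# In-sample misclassification rate within each GenHlth category
diabetes_preds1 %>%
    group_by(GenHlth) %>%
    summarize(misclass_rate = mean(misclassified))
```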

### Decision Tree Summary

The decision tree produces a slightly different variable importance ordering than LASSO: the top five (HighBP, GenHlth, Age, BMI, HighChol) are again dominated by blood pressure, general health, and age, with BMI and HighChol ranking higher here. The least important predictors were quite different, with the bottom of the list including Veggies, Fruits, Smoker, AnyHealthcare, and Stroke.

For most of the predictors, the distribution of values looks similar for correctly and incorrectly classified cases, which speaks to the tree's even performance. For HighBP and HighChol, the proportion with the condition was about the same whether the model was right or wrong, so the model does not appear to simply equate a single predictor with having diabetes. The exceptions are GenHlth, where misclassification is more common among individuals with a rating of 3 (the middle of the scale), and Age, where older individuals are misclassified somewhat more often.

The cross-validated accuracy of this decision tree is around 74% (standard error: 0.003), and the ROC AUC is around 80% (standard error: 0.001).


## Bagging and Random Forests

### Fit Random Forest Model
```{r}
# Model Specification
rf_spec <- rand_forest() %>%
    set_engine(engine = "ranger") %>% 
    set_args(
        mtry = NULL, # size of the random subset of variables tried at each split (NULL = ranger's default, about sqrt of the number of predictors)
        trees = 1000, # Number of trees
        min_n = 2,
        probability = FALSE, # FALSE: get hard predictions
        importance = "impurity"
    ) %>%
    set_mode("classification")

# Recipe
data_rec <- recipe(Diabetes_binary ~ ., data = diabetes)

# Workflows
data_wf <- workflow() %>%
    add_model(rf_spec) %>%
    add_recipe(data_rec)

# No tune_grid() or vfold_cv()
rf_fit <- fit(data_wf, data = diabetes)

```
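
With `mtry = NULL`, ranger falls back to its default of roughly the square root of the number of predictors, so the fit above is a random forest. Bagging is the special case where `mtry` equals the number of predictors; a sketch of that specification (not fit in this report, and assuming the 21 predictors in this dataset):

```{r, eval=FALSE}
# Bagging = a random forest whose splits consider every predictor
bag_spec <- rand_forest() %>%
    set_engine(engine = "ranger") %>%
    set_args(
        mtry = 21, # all predictors considered at each split
        trees = 1000,
        min_n = 2,
        probability = FALSE,
        importance = "impurity"
    ) %>%
    set_mode("classification")
```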

### Variable Importance

```{r}
# Plot of the variable importance information
rf_fit %>% 
    extract_fit_engine() %>% 
    vip(num_features = 30) + theme_classic()

# Extract the numerical information on variable importance and display the most and least important predictors
rf_var_imp <- rf_fit %>% 
    extract_fit_engine() %>%
    vip::vi()
head(rf_var_imp)
tail(rf_var_imp)
```

### Model Evaluation

```{r}
rf_fit
```


### Misclassification Plots

```{r}
# OOB confusion matrix
rf_output <- diabetes %>%
    mutate(OOB_pred_diabetes = rf_fit %>% extract_fit_engine() %>% pluck("predictions")) # extracts OOB predictions

conf_mat( # confusion matrix
    data = rf_output,
    truth = Diabetes_binary,
    estimate = OOB_pred_diabetes
)

rf_output <- rf_output %>%
    mutate(is_misclass = Diabetes_binary!=OOB_pred_diabetes)
# Plots
ggplot(rf_output, aes(x = is_misclass, y = BMI)) +
  geom_boxplot() + 
  labs(title = "BMI vs Prediction Misclassification", x = "misclassified")

ggplot(rf_output, aes(x = is_misclass, fill = HighBP)) +
  geom_bar() +
  labs(title = "HighBP vs Prediction Misclassification", x = "misclassified")

ggplot(rf_output, aes(x = is_misclass, fill = HighChol)) +
  geom_bar() +
  labs(title = "HighChol vs Prediction Misclassification", x = "misclassified")

ggplot(rf_output, aes(x = is_misclass, fill = GenHlth)) +
  geom_bar(position = "fill") +
  labs(title = "GenHlth vs Prediction Misclassification", y = "%", x = "misclassified")

ggplot(rf_output, aes(x = is_misclass, fill = Age)) +
  geom_bar(position = "fill") +
  labs(title = "Age vs Prediction Misclassification", y = "%", x = "misclassified")
```
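
The sensitivity and specificity cited in the summary below can be pulled directly from the OOB predictions with yardstick (sketch; the first factor level, 0, is treated as the event by default):

```{r}
rf_output %>% sens(truth = Diabetes_binary, estimate = OOB_pred_diabetes)
rf_output %>% spec(truth = Diabetes_binary, estimate = OOB_pred_diabetes)
```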

### Bagging and Random Forest Summary

Variable importance for the random forest is very similar to that of the single decision tree, except that Income enters the top five in place of HighChol. The least important variables also stayed mostly the same, with HvyAlcoholConsump and NoDocbcCost near the bottom in place of Fruits and Smoker.

As with the decision tree, the distribution of most predictors looks similar for correctly and incorrectly classified cases. The exceptions are again GenHlth, where misclassification is more common among individuals with a rating of 3, and the older categories of the Age variable.

The out-of-bag prediction error rate for this model is 25.26%, which corresponds to an accuracy of about 75%. From the OOB confusion matrix, I calculated the specificity and sensitivity of the model (treating 0, no diabetes, as the event), which were approximately 79% and 70%, respectively.

## Supervised Learning Answer

Across the supervised learning models, the most consistently important variables were HighBP, BMI, GenHlth, and Age, and the most consistently unimportant variables were Stroke, Veggies, CholCheck, AnyHealthcare, and Fruits. These conclusions come from models with cross-validated accuracies of roughly 73-75% and ROC AUCs of roughly 80-83%.


# Unsupervised Learning Question

> What types of diagnoses might a doctor come to when considering variables such as HighBP, HighChol, and BMI?


## Partitioning (PAM) Clustering

```{r}
set.seed(253) # for a reproducible subsample

# Random subsample of 100 cases, keeping the three health indicators and the outcome
diabetes_clust <- diabetes %>% 
  select(HighBP, HighChol, BMI, Diabetes_binary) %>% 
  slice_sample(n = 100)

# Gower dissimilarities on the three health indicators (the outcome is excluded from the clustering)
diss <- daisy(diabetes_clust %>% select(HighBP, HighChol, BMI))

# Choosing an appropriate number of clusters
# Create storage vector for the total within-cluster dissimilarity
tot_wc_diss <- rep(0, 20)

# Loop over candidate numbers of clusters
for (k in 1:20) {
    # Perform PAM (k-medoids) clustering
    pam_out <- pam(diss, k = k)

    # Store the total within-cluster dissimilarity (average dissimilarity x cluster size)
    tot_wc_diss[k] <- sum(pam_out$clusinfo[,"av_diss"]*pam_out$clusinfo[,"size"])
}

plot(1:20, tot_wc_diss, xlab = "Number of clusters", ylab = "Total within-cluster dissimilarity")
```
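
A complementary way to choose the number of clusters is the average silhouette width (larger is better); a small sketch reusing the Gower dissimilarities (`diss`) computed above:

```{r}
# pam() returns silhouette information for k >= 2
sil_width <- purrr::map_dbl(2:10, ~ pam(diss, k = .x)$silinfo$avg.width)
plot(2:10, sil_width, type = "b",
     xlab = "Number of clusters", ylab = "Average silhouette width")
```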


## Hierarchical Clustering
```{r}
# Hierarchical clustering on the same 100-case subsample from above

# Compute Gower dissimilarities on the three health indicators
# (daisy() standardizes BMI and handles the binary factors, and its output can be passed directly to hclust())
dist_mat_scaled <- daisy(diabetes_clust %>% select(BMI, HighBP, HighChol))

# The dissimilarity matrix is the input to hclust()
# The method argument indicates the linkage type
hc_complete <- hclust(dist_mat_scaled, method = "complete")
hc_single <- hclust(dist_mat_scaled, method = "single")
hc_average <- hclust(dist_mat_scaled, method = "average")
hc_centroid <- hclust(dist_mat_scaled, method = "centroid")

# Plot dendrograms
plot(hc_complete, labels = diabetes_clust$Diabetes_binary)
plot(hc_single, labels = diabetes_clust$Diabetes_binary)
plot(hc_average, labels = diabetes_clust$Diabetes_binary)
plot(hc_centroid, labels = diabetes_clust$Diabetes_binary)

diabetes4 <- diabetes_clust %>%
    mutate(
        hclust_height1.5 = factor(cutree(hc_complete, h = 1.5)), # Cut at height (h) 1.5
        hclust_num3 = factor(cutree(hc_complete, k = 3)) # Cut into 3 clusters (k)
    )
```

## Cluster Plots

```{r}
ggplot(diabetes4, aes(x = hclust_num3, y = BMI)) +
    geom_boxplot() +
    labs(x = "Cluster", title = "BMI vs Clusters")

ggplot(diabetes4, aes(x = hclust_num3, fill = HighBP)) +
    geom_bar() +
    labs(x = "Cluster", title = "HighBP vs Clusters")

ggplot(diabetes4, aes(x = hclust_num3, fill = HighChol)) +
    geom_bar() +
    labs(x = "Cluster", title = "HighChol vs Clusters")
```
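
The cluster descriptions in the summary below can also be checked against a quick numeric profile of each cluster (sketch):

```{r}
# Cluster size, mean BMI, and proportions with high BP / high cholesterol / diabetes
diabetes4 %>%
    group_by(hclust_num3) %>%
    summarize(n = n(),
              mean_BMI = mean(BMI),
              prop_HighBP = mean(HighBP == "1"),
              prop_HighChol = mean(HighChol == "1"),
              prop_diabetes = mean(Diabetes_binary == "1"))
```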


## Clustering Summary
    
The point at which there are no longer meaningful decreases in heterogeneity occurs somewhere between 3 and 6 clusters. Since my goal is to identify broad diagnosis types, I favored a smaller number of clusters and made the plots for a 3-cluster solution. Cluster 1 contains individuals who generally have high blood pressure, mixed cholesterol levels, and a high BMI; cluster 2 contains individuals who generally have normal blood pressure, high cholesterol, and a moderate BMI; and cluster 3 contains individuals who generally have normal blood pressure, normal cholesterol, and a low BMI. To answer the unsupervised learning question, I would hypothesize that cluster 1 corresponds to a diagnosis of diabetes, cluster 2 to prediabetes, and cluster 3 to no diabetes.

# Cautions and Limitations

Looking back at these models, I would use them as diagnostic tools only very cautiously, given the modest evaluation metrics (cross-validated accuracies around 73-75% and ROC AUCs around 80-83%). It is also important to note that predictors are not the same as causes and should not be treated as such when interpreting the models. As for limitations, the data were collected through phone surveys and may therefore be prone to the biases that come with that method, such as self-reporting bias. Another limitation was the time constraint on this project: the dataset is large enough that many of the models take significant computational time to run.

