# Load data, required packages, and functions
library(NHANES)
library(caret)
library(ggplot2)
library(dplyr)
library(splines)
library(broom)
library(gridExtra)
library(randomForest)
library(rpart.plot)
data(NHANES)
twoClassSummaryCustom <- function (data, lev = NULL, model = NULL) {
if (length(lev) > 2) {
stop(paste("Your outcome has", length(lev), "levels. The twoClassSummary() function isn't appropriate."))
}
caret:::requireNamespaceQuietStop("pROC")
if (!all(levels(data[, "pred"]) == lev)) {
stop("levels of observed and predicted data do not match")
}
rocObject <- try(pROC::roc(data$obs, data[, lev[1]], direction = ">",
quiet = TRUE), silent = TRUE)
rocAUC <- if (inherits(rocObject, "try-error"))
NA
else rocObject$auc
out <- c(rocAUC, sensitivity(data[, "pred"], data[, "obs"],
lev[1]), specificity(data[, "pred"], data[, "obs"], lev[2]))
out2 <- postResample(data[, "pred"], data[, "obs"])
out <- c(out, out2[1])
names(out) <- c("AUC", "Sens", "Spec", "Accuracy")
out
}
# Any code to clean the data
testNHANES <- NHANES %>%
select(Age, Gender, Weight, Height, Pulse, TotChol, PhysActive, BPSysAve) %>%
filter(!is.na(Age), !is.na(Gender), !is.na(Weight), !is.na(Height), !is.na(Pulse), !is.na(TotChol), !is.na(PhysActive), !is.na(BPSysAve))
testNHANES2 <- NHANES %>%
select(Age, Gender, Weight, Height, Pulse, TotChol, PhysActive, BPSysAve, Diabetes) %>%
filter(!is.na(Age), !is.na(Gender), !is.na(Weight), !is.na(Height), !is.na(Pulse), !is.na(TotChol), !is.na(PhysActive), !is.na(BPSysAve), !is.na(Diabetes))
NHANES is an observational study funded and conducted by the CDC, specifically its National Center for Health Statistics (NCHS) branch. The study, which began in the early 1960s, is an annual survey administered to a nationally representative sample of about 5,000 non-institutionalized persons from around the country. To ensure that its sample is as accurate as possible, NHANES uses a complex, multistage probability sampling design to select a cohort representative of the non-institutionalized population of the US. I will only be looking at the NHANES data provided for years between 2009-2012. In addition to being asked a variety of questions about demographic and socioeconomic information, participants undergo a health interview and examination. This involves taking health measurements in a specially designed and equipped mobile center that travels with the NCHS to locations throughout the country. In total, information about 76 unique variables is collected by a team of physicians, medical technicians, and dietary/health interviewers about a range of health bio-markers and behavioral tendencies. The data that is collected gives valuable insight into the distribution of health problems and risk factors in the U.S. population. This gives researchers important clues about the causes of disease and allows policymakers to detect the extent to which various health problems have changed temporally in the U.S. population (CDC, 2020).
Models Used:
To begin addressing my regression research questions, I used multiple different regression modeling methods that do not account for any nonlinearity in the relationship between predictors and the outcome. The first model I created used the ordinary least squares regression method. The second model I created used subset selection methods. Specifically, I used the backward stepwise subset selection method. The final model I built used LASSO linear regression methods. Collapse code to view how I fit these models.
# OLS Regression
set.seed(123)
OLS_mod2 <- train(
BPSysAve ~ .,
data = testNHANES,
method = "lm",
trControl = trainControl(method = "cv", number = 8, selectionFunction = "oneSE"),
metric = "MAE",
na.action = na.omit
)
# Subset Selection (Backward Stepwise)
set.seed(123)
Backward_Stepwise_mod <- train(
BPSysAve ~ .,
data = testNHANES,
method = "leapBackward",
tuneGrid = data.frame(nvmax = 1:7),
trControl = trainControl(method = "cv", number = 8, selectionFunction = "oneSE"),
metric = "MAE",
na.action = na.omit
)
# LASSO
set.seed(123)
LASSO_mod <- train(
BPSysAve ~ .,
data = testNHANES,
method = "glmnet",
trControl = trainControl(method = "cv", number = 8, selectionFunction = "oneSE"),
tuneGrid = data.frame(alpha = 1, lambda = seq(0, 10, length.out = 100)),
metric = "MAE",
na.action = na.omit
)
#OLS Evaluation
OLS_mod2$results
# Subset Selection Evaluation
plot(Backward_Stepwise_mod)
Backward_Stepwise_mod$results
mean(Backward_Stepwise_mod$resample$MAE)
## [1] 10.85571
#LASSO Evaluation
plot(LASSO_mod)
head(LASSO_mod$results, 8)
LASSO_mod$bestTune
ggplot(LASSO_mod$results, aes(x = lambda)) +
geom_pointrange(aes(y = MAE, ymin = MAE-MAESD, ymax = MAE+MAESD)) +
theme_classic() +
geom_vline(xintercept = 0.505, color = "red") +
labs(x = "lambda", y = "MAE +/- 1SD")
mean(LASSO_mod$resample$MAE)
## [1] 10.89853
By considering multiple methods of linear regression modeling, I was able to begin gathering a sense of how well my various predictor variables are able to actually predict blood pressure. I was also able to see if different modeling methods produced different results that could be important to consider. None of these models considered potential non-linear relationships between predictors and the outcome, so that is something I turn to next.
To check to see if any quantitative predictors might be better modeled with nonlinear relationships, I constructed residual plots to evaluate their relationships.
For each of the models I fit above, I created residual plots of each of the quantitative predictors. The various sets of plots seen below compare these different residual plots produced from the different models against each other. Also included in the set of graphs is a plot comparing the quantitative predictor to blood pressure from the training data. The final set of plots is an overall residual vs predicted graph for each of the models I fit in investigation 1.
# Residual Plot
set.seed(123)
OLS_mod_data <- testNHANES %>%
mutate(pred = predict(OLS_mod2, newdata = testNHANES),
resid = BPSysAve - pred)
p1 <- ggplot(OLS_mod_data, aes(x = Age, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
p2 <- ggplot(OLS_mod_data, aes(x = Weight, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
p3 <- ggplot(OLS_mod_data, aes(x = Height, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
p4 <- ggplot(OLS_mod_data, aes(x = Pulse, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
p5 <- ggplot(OLS_mod_data, aes(x = TotChol, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
p6 <- ggplot(OLS_mod_data, aes(x = pred, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
Backward_Stepwise_mod_data <- testNHANES %>%
mutate(pred = predict(Backward_Stepwise_mod, newdata = testNHANES),
resid = BPSysAve - pred)
q1 <- ggplot(Backward_Stepwise_mod_data, aes(x = Age, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
q2 <- ggplot(Backward_Stepwise_mod_data, aes(x = Weight, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
q3 <- ggplot(Backward_Stepwise_mod_data, aes(x = Height, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
q4 <- ggplot(Backward_Stepwise_mod_data, aes(x = Pulse, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
q5 <- ggplot(Backward_Stepwise_mod_data, aes(x = TotChol, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
q6 <- ggplot(Backward_Stepwise_mod_data, aes(x = pred, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
LASSO_mod_data <- testNHANES %>%
mutate(pred = predict(LASSO_mod, newdata = testNHANES),
resid = BPSysAve - pred)
r1 <- ggplot(LASSO_mod_data, aes(x = Age, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
r2 <- ggplot(LASSO_mod_data, aes(x = Weight, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
r3 <- ggplot(LASSO_mod_data, aes(x = Height, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
r4 <- ggplot(LASSO_mod_data, aes(x = Pulse, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
r5 <- ggplot(LASSO_mod_data, aes(x = TotChol, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
r6 <- ggplot(LASSO_mod_data, aes(x = pred, y = resid)) +
geom_point() +
geom_smooth() +
geom_hline(yintercept = 0, color = "red")
s1 <- testNHANES %>%
ggplot(aes(x = Age, y = BPSysAve)) +
geom_point() +
geom_smooth(color = "blue", se = FALSE) +
geom_smooth(method = "lm", color = "red", se = FALSE)
s2 <- testNHANES %>%
ggplot(aes(x = Weight, y = BPSysAve)) +
geom_point() +
geom_smooth(color = "blue", se = FALSE) +
geom_smooth(method = "lm", color = "red", se = FALSE)
s3 <- testNHANES %>%
ggplot(aes(x = Height, y = BPSysAve)) +
geom_point() +
geom_smooth(color = "blue", se = FALSE) +
geom_smooth(method = "lm", color = "red", se = FALSE)
s4 <- testNHANES %>%
ggplot(aes(x = Pulse, y = BPSysAve)) +
geom_point() +
geom_smooth(color = "blue", se = FALSE) +
geom_smooth(method = "lm", color = "red", se = FALSE)
s5 <- testNHANES %>%
ggplot(aes(x = TotChol, y = BPSysAve)) +
geom_point() +
geom_smooth(color = "blue", se = FALSE) +
geom_smooth(method = "lm", color = "red", se = FALSE)
grid.arrange(p1, q1, r1, s1, nrow = 2, ncol = 2)
grid.arrange(p2, q2, r2, s2, nrow = 2, ncol = 2)
grid.arrange(p3, q3, r3, s3, nrow = 2, ncol = 2)
grid.arrange(p4, q4, r4, s4, nrow = 2, ncol = 2)
grid.arrange(p5, q5, r5, s5, nrow = 2, ncol = 2)
grid.arrange(p6, q6, r6, nrow = 2, ncol = 2)
Creating Models with Natural Splines:
Even if it seems like none of the quantitative predictors would necessarily be better modeled with nonlinear relationships, I can confirm my assumptions and maybe learn more by fitting new, updated models of the subset selection and LASSO methods using natural splines. See code below to view the fitting process.
# Stepwise Selection
set.seed(123)
NS_Backward_Stepwise_mod <- train(
BPSysAve ~ Gender + PhysActive + ns(Age, 3) + ns(Weight, 3) + ns(Height, 3) + ns(Pulse, 3) + ns(TotChol, 3),
data = testNHANES,
method = "leapBackward",
tuneGrid = data.frame(nvmax = 1:7),
trControl = trainControl(method = "cv", number = 8, selectionFunction = "oneSE"),
metric = "MAE",
na.action = na.omit
)
# LASSO
set.seed(123)
NS_LASSO_mod <- train(
BPSysAve ~ Gender + PhysActive + ns(Age, 3) + ns(Weight, 3) + ns(Height, 3) + ns(Pulse, 3) + ns(TotChol, 3),
data = testNHANES,
method = "glmnet",
trControl = trainControl(method = "cv", number = 8, selectionFunction = "oneSE"),
tuneGrid = data.frame(alpha = 1, lambda = seq(0, 10, length.out = 100)),
metric = "MAE",
na.action = na.omit
)
NS_Backward_Stepwise_mod$bestTune
NS_Backward_Stepwise_mod$results
NS_LASSO_mod$bestTune
head(NS_LASSO_mod$results)
ggplot(NS_LASSO_mod$results, aes(x = lambda)) +
geom_pointrange(aes(y = MAE, ymin = MAE-MAESD, ymax = MAE+MAESD)) +
theme_classic() +
geom_vline(xintercept = 0.404 , color = "red") +
labs(x = "lambda", y = "MAE +/- 1SD")
To get a further sense of whether there is significant difference between the models, I can also consider how the variable importance compare among them.
First, let’s address just the models that didn’t consider nonlinearity.
# Stepwise Variable Importance
plot(Backward_Stepwise_mod)
summary(Backward_Stepwise_mod)
## Subset selection object
## 7 Variables (and intercept)
## Forced in Forced out
## Age FALSE FALSE
## Gendermale FALSE FALSE
## Weight FALSE FALSE
## Height FALSE FALSE
## Pulse FALSE FALSE
## TotChol FALSE FALSE
## PhysActiveYes FALSE FALSE
## 1 subsets of each size up to 3
## Selection Algorithm: backward
## Age Gendermale Weight Height Pulse TotChol PhysActiveYes
## 1 ( 1 ) "*" " " " " " " " " " " " "
## 2 ( 1 ) "*" "*" " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " "
# LASSO Variable Importance
plot(LASSO_mod$finalModel, xvar = "lambda", label = TRUE, col = rainbow(20))
# Create a boolean matrix (predictors x lambdas) of variable exclusion
bool_predictor_exclude <- LASSO_mod$finalModel$beta==0
# Loop over each variable
var_imp <- sapply(seq_len(nrow(bool_predictor_exclude)), function(row) {
# Extract coefficient path (sorted from highest to lowest lambda)
this_coeff_path <- bool_predictor_exclude[row,]
# Compute and return the # of lambdas until this variable is out forever
ncol(bool_predictor_exclude)-which.min(this_coeff_path)+1
})
# Create a dataset of this information and sort
var_imp_data <- tibble(
var_name = rownames(bool_predictor_exclude),
var_imp = var_imp
)
var_imp_data %>% arrange(desc(var_imp))
The variable importance analyses for both the linear stepwise and LASSO methods yielded similar results. While the variable importance order was not exactly the same between the two models, they still agreed about the top 3 being the same and the difference is likely negligible. This didn’t surprise me greatly as these are all biological indicators that I would’ve guessed with little background knowledge. I chose these predictor variables with blood pressure already in mind, and these results confirm my initial suspicions.
plot(NS_Backward_Stepwise_mod)
summary(NS_Backward_Stepwise_mod)
## Subset selection object
## 17 Variables (and intercept)
## Forced in Forced out
## Gendermale FALSE FALSE
## PhysActiveYes FALSE FALSE
## ns(Age, 3)1 FALSE FALSE
## ns(Age, 3)2 FALSE FALSE
## ns(Age, 3)3 FALSE FALSE
## ns(Weight, 3)1 FALSE FALSE
## ns(Weight, 3)2 FALSE FALSE
## ns(Weight, 3)3 FALSE FALSE
## ns(Height, 3)1 FALSE FALSE
## ns(Height, 3)2 FALSE FALSE
## ns(Height, 3)3 FALSE FALSE
## ns(Pulse, 3)1 FALSE FALSE
## ns(Pulse, 3)2 FALSE FALSE
## ns(Pulse, 3)3 FALSE FALSE
## ns(TotChol, 3)1 FALSE FALSE
## ns(TotChol, 3)2 FALSE FALSE
## ns(TotChol, 3)3 FALSE FALSE
## 1 subsets of each size up to 6
## Selection Algorithm: backward
## Gendermale PhysActiveYes ns(Age, 3)1 ns(Age, 3)2 ns(Age, 3)3
## 1 ( 1 ) " " " " " " " " "*"
## 2 ( 1 ) " " " " " " "*" "*"
## 3 ( 1 ) " " " " "*" "*" "*"
## 4 ( 1 ) "*" " " "*" "*" "*"
## 5 ( 1 ) "*" " " "*" "*" "*"
## 6 ( 1 ) "*" " " "*" "*" "*"
## ns(Weight, 3)1 ns(Weight, 3)2 ns(Weight, 3)3 ns(Height, 3)1
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) "*" " " " " " "
## 6 ( 1 ) "*" " " " " " "
## ns(Height, 3)2 ns(Height, 3)3 ns(Pulse, 3)1 ns(Pulse, 3)2
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " " " " " " "
## ns(Pulse, 3)3 ns(TotChol, 3)1 ns(TotChol, 3)2 ns(TotChol, 3)3
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " "*" " " " "
coef(NS_LASSO_mod$finalModel, NS_LASSO_mod$bestTune$lambda)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 109.200977
## Gendermale 3.095061
## PhysActiveYes .
## ns(Age, 3)1 6.334863
## ns(Age, 3)2 22.569763
## ns(Age, 3)3 23.931616
## ns(Weight, 3)1 8.265070
## ns(Weight, 3)2 1.990105
## ns(Weight, 3)3 .
## ns(Height, 3)1 .
## ns(Height, 3)2 .
## ns(Height, 3)3 .
## ns(Pulse, 3)1 .
## ns(Pulse, 3)2 .
## ns(Pulse, 3)3 2.784817
## ns(TotChol, 3)1 8.057782
## ns(TotChol, 3)2 .
## ns(TotChol, 3)3 .
plot(NS_LASSO_mod$finalModel, xvar = "lambda", label = TRUE, col = rainbow(20))
# Create a boolean matrix (predictors x lambdas) of variable exclusion
bool_predictor_exclude2 <- NS_LASSO_mod$finalModel$beta==0
# Loop over each variable
var_imp2 <- sapply(seq_len(nrow(bool_predictor_exclude2)), function(row) {
# Extract coefficient path (sorted from highest to lowest lambda)
this_coeff_path2 <- bool_predictor_exclude2[row,]
# Compute and return the # of lambdas until this variable is out forever
ncol(bool_predictor_exclude2)-which.min(this_coeff_path2)+1
})
# Create a dataset of this information and sort
var_imp_data2 <- tibble(
var_name = rownames(bool_predictor_exclude),
var_imp = var_imp
)
var_imp_data2 %>% arrange(desc(var_imp))
GAM LOESS Model
By calculating and noting all my models’ variable importance thus far, I’ve been able to see a pattern emerge. Three variables in particular are present across all linear and nonlinear models. These three variables are age, weight, and gender. I can use these three particular variables to fit a GAM using LOESS terms to further investigate how other modeling methods improve (or don’t improve) test accuracy. I fit this model below, refer to the code to see specifics of the fitting process.
# GAM LOESS
set.seed(123)
gam_mod <- train(
BPSysAve ~ Age + Weight + Gender,
data = testNHANES,
method = "gamLoess",
tuneGrid = data.frame(degree = 1, span = seq(0.1, 0.9, by = 0.1)),
trControl = trainControl(method = "cv", number = 8, selectionFunction = "oneSE"),
metric = "MAE",
na.action = na.omit
)
plot(gam_mod)
gam_mod$bestTune
gam_mod$results %>%
filter(span==gam_mod$bestTune$span)
par(mfrow = c(3,4)) # Sets up a grid of plots
plot(gam_mod$finalModel, se = TRUE) # Dashed lines are +/- 2 SEs
Model Comparison:
Now having fit a range of different models, I can compare them to determine which of them is ‘best’.
MAE and Uncertainty
Below are the MAE and MAE SD estimates for each of the methods’ “best” model– the simplest model with the lowest MAE.
OLS_mod2$results
Backward_Stepwise_mod$results %>%
filter(nvmax==Backward_Stepwise_mod$bestTune$nvmax)
LASSO_mod$results %>%
filter(lambda == LASSO_mod$bestTune$lambda)
NS_Backward_Stepwise_mod$results %>%
filter(nvmax == NS_Backward_Stepwise_mod$bestTune$nvmax)
NS_LASSO_mod$results %>%
filter(lambda == NS_LASSO_mod$bestTune$lambda)
gam_mod$results %>%
filter(span == gam_mod$bestTune$span)
We can see from the readouts above that across the various models, MAE ranges from as low as 10.666 to as high as 10.899. The uncertainty across these MAE estimates are all fairly close to each other as well. With that in mind, none of these test error estimates are all that far away from each other. Moreover, in the context of average systolic blood pressure, a mater of 0.2 mm Hg difference across models is negligible. Overall though, none of these models seem particularly useful in a practical sense. Systolic blood pressure predictions that are off, on average, by about 10 mm Hg are not as useful as they could be in a clinical setting. I can feel somewhat certain, though, that given the predictors I looked at, the estimates won’t change significantly across different regression methods.
We saw from the residual plots above that, across all the different linear regression models, the trend lines sat nicely along the x-axis. This implies that linear regression, as opposed to a non-linear method, was an appropriate choice in modeling method as there are no systematic biases present in the trend. Stated differently, we can infer from this observed pattern that the quanntitative predictors plotted have fairly linear relationships with average systolic blood pressure in this dataset. I further confirmed this by creating plots comparing each of the quantitative predictors to average systolic blood pressure– these were the bottom right plots in the series of graphs. I also comparde two best fit lines, one as a linear fit and the other as a local regression, for each of these plots. While there is some nonlinearity, the local regression line of best fit still follows fairly closely to the linear regression line. Cumulatively, this suggests to me that linear regression is a suitable option for this regression task.
Across all the models I fit, I was able to get a sense of which variables were most important to the models. As I discussed above, mostly the same variables appeared at the top of the list– age, weight, and gender. While the order of these variables shifted slightly occasionally, the consistence presence of the same predictors made me feel confident that these models were fairly similar. Contextually, this makes sense as biological studies have suggested causal relationships between blood pressure and these predictors. Knowing this is the case and seeing that estimates across different models were fairly similar, I’m able to weigh the benefits and disadvantages of these models to determine which is the best.
Overall Most Preferable Model:
To determine the ‘best’, most preferable model I’ve created so far, I’m focusing mainly on striking a balance between predictive accuracy and interpretability. Thinking first about predictive accuracy, the models I produced all had mean absolute errors that were close to each other. Their error SDs were all fairly uniform as well, so it doesn’t seem as if one model has more tightly grouped/variable estimates than another. Because of this, I’m inclined to believe the differences in test error are likely negligible. With this in mind, I would be more partial towards a method that is simple and easy to interpret. So despite the LOESS GAM producing the lowest MAE (10.677), we lose a lot of interpretability as a result. I think sacrificing interpretabililty for a (contextually) very small decrease in test error is not very worthwhile. For this reason in particular, I think the backward stepwise model is the best, most preferable model of the ones I created. Its few variables combined with its easy interpretation make it particularly appealing. It is slightly less accurate than the other models (its systolic blood pressure estimates were off, on average, by 10.855 mm Hg and the MAE SD is estimated to be 0.286), but still falls within one SD of any of the other models’ test errors. It also gives a fairly clear way of understanding variable importance which can be useful in thinking about interpretations. We can see from the residual plots below that this model doesn’t have any glaring systematic biases that aren’t being addressed. Trend lines fit nicely along the x-axis and quantitative predictors have linear relationships with average systolic blood pressure. Overall, the model performs well and it is simple and interpretable– just what I wanted.
grid.arrange(q1, q2, q3, q4, q5, q6, nrow = 3, ncol = 2)
Models Used:
I created two models using two different methods to address my classification research question. The first model I created used LASSO logistic regression. The second model I created used random forest/bagging methods. Collapse the code to see the fitting process for my LASSO logistic regression model.
# LASSO Logistic Regression
set.seed(253)
logistic_mod <- train(
Diabetes ~ Age + Gender + Weight + Height + Pulse + TotChol + PhysActive + BPSysAve,
data = testNHANES2,
method = "glmnet",
family = "binomial",
tuneGrid = data.frame(alpha = 1, lambda = seq(0, .1, length.out = 100)),
trControl = trainControl(method = "cv", number = 10, selectionFunction = "oneSE", classProbs = TRUE, summaryFunction = twoClassSummaryCustom),
metric = "AUC",
na.action = na.omit
)
# Bagging and Random Forests
set.seed(253)
rf_mod <- train(
Diabetes ~ Age + Gender + Weight + Height + Pulse + TotChol + PhysActive + BPSysAve,
data = testNHANES2,
method = "rf",
tuneGrid = data.frame(mtry = c(0:8)),
trControl = trainControl(method = "oob", selectionFunction = "best"),
metric = "AUC",
ntree = 1000,
na.action = na.omit
)
# Lambda vs. AUC
plot(logistic_mod)
# Best Tune Metrics
logistic_mod$results %>%
filter(lambda==logistic_mod$bestTune$lambda) # ARE SENS AND SPEC SWITCHED HERE?
# NIR Metric
testNHANES2 %>%
count(Diabetes)
6848/(6848+673)
## [1] 0.9105172
# LASSO Logistic Model Predicted Probability Boxplots
lasso_logistic_mod_data <- testNHANES2 %>%
mutate(prob_pred = predict(logistic_mod, newdata = testNHANES2, type = "prob"))
lasso_logistic_mod_data %>%
ggplot(aes(x = Diabetes, y = prob_pred$Yes)) +
geom_boxplot() +
geom_hline(yintercept = .5, color = 'red') +
theme_classic() +
labs(x = 'Diabetes Status', y = 'Predicted Probability of Being Diabetic (DiabetesYes)')
# Checking Predictions (see confusion matrix below)
lasso_logistic_mod_data %>%
mutate(PredictDiabetesYes = prob_pred$Yes > .5) %>% #if I change this to prob_pred$No > .5, the sensitivity and specificity measures match the caret results readout, but aren't we modeling the probability of someone having diabetes (when y=1)?
count(Diabetes, PredictDiabetesYes)
# Confirming Predictions with Training Logistic Model + Predicted Probability Boxplots
training_logistic_mod <- testNHANES2 %>%
with(glm(Diabetes ~ Age + Gender + Weight + Height + Pulse + TotChol + PhysActive + BPSysAve, family = binomial))
augment(training_logistic_mod, type.predict = 'response') %>%
ggplot(aes(x = factor(Diabetes), y = .fitted)) + #replace ... with outcome variable name
geom_boxplot() +
labs(x = 'Diabetic', y = 'Predicted Probability of Outcome') +
theme_classic()
logistic_mod_data_copy <- testNHANES2 %>%
mutate(pred = predict(logistic_mod, newdata = testNHANES2))
Confusion Matrix:
Actual + Actual -
Predict + 4 (TP) 1 (FP)
Predict - 669 (FN) 6847 (TN)
Notes on LASSO Logistic Model Accuracy Measures
Sensitivity = TP/(TP + FN) = 4/(4 + 669) = 0.005943536
Specificity = TN/(TN + FP) = 6847/(6847 + 1) = 0.999854
We can see that the calculations above (which are admittedly from the training data) that they are switched from what is reported in the caret ‘results’ readout. If I change it so that we are modeling the probability of someone not having diabetes, as opposed to the opposite, the sensitivity and specificity would match the caret results readout. Even if the measures above are from the training data, it still provides compelling evidence that the caret readout is switched so you should keep that in mind.
# Bagging and Random Forests Evaluation
# OOB Accuracy vs. No.Randomly Selected Predictors
plot(rf_mod)
# Accuracy Metrics
rf_mod$results
rf_mod$finalModel
##
## Call:
## randomForest(x = x, y = y, ntree = 1000, mtry = min(param$mtry, ncol(x)))
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.65%
## Confusion matrix:
## No Yes class.error
## No 6828 20 0.002920561
## Yes 330 343 0.490341753
These two methods were used as a way of gauging whether variables I believed to be potential indicators of diabetes were actually good predictors. The different methods allowed me to come at this question from slightly different angles and ultimately produced different results as will be discussed below.
# LASSO Logistic Regression
plot(logistic_mod$finalModel, xvar = "lambda", label = TRUE, col = rainbow(20))
rownames(logistic_mod$finalModel$beta)
## [1] "Age" "Gendermale" "Weight" "Height"
## [5] "Pulse" "TotChol" "PhysActiveYes" "BPSysAve"
# Create a boolean matrix (predictors x lambdas) of variable exclusion
bool_predictor_exclude3 <- logistic_mod$finalModel$beta==0
# Loop over each variable
var_imp3 <- sapply(seq_len(nrow(bool_predictor_exclude3)), function(row) {
# Extract coefficient path (sorted from highest to lowest lambda)
this_coeff_path3 <- bool_predictor_exclude3[row,]
# Compute and return the # of lambdas until this variable is out forever
ncol(bool_predictor_exclude3)-which.min(this_coeff_path3)+1
})
# Create a dataset of this information and sort
var_imp_data3 <- tibble(
var_name = rownames(bool_predictor_exclude3),
var_imp3 = var_imp3
)
var_imp_data3 %>% arrange(desc(var_imp3))
# Bagging and Random Forests
var_imp_rf <- randomForest::importance(rf_mod$finalModel)
var_imp_rf <- data.frame(
predictor = rownames(var_imp_rf),
MeanDecreaseGini = var_imp_rf[,"MeanDecreaseGini"]
) %>%
arrange(desc(MeanDecreaseGini))
head(var_imp_rf, 8)
Model Comparison:
Now having fit a range of different models, I can compare them to determine which of them is ‘best.’
Error Metrics and Accuracy: Below are error metrics and various accuracy estimates for each of the methods’ “best” model– the simplest model with the highest AUC.
logistic_mod$results %>%
filter(lambda==logistic_mod$bestTune$lambda)
rf_mod$results %>%
filter(mtry == 2)
rf_mod$finalModel
##
## Call:
## randomForest(x = x, y = y, ntree = 1000, mtry = min(param$mtry, ncol(x)))
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.65%
## Confusion matrix:
## No Yes class.error
## No 6828 20 0.002920561
## Yes 330 343 0.490341753
Let’s compare the models to each other best we can.
The LASSO logistic model produced an AUC of .806 and the AUC SD is reported as .035. Other corresponding measures of accuracy from this model include the overall accuracy, sensitivity, and specificity. The overall accuracy is reported to be 91% and the sensitivity and specificity are .999 and .006 respectively. Sensitivity SD measures .0004 and and specificity SD is estimated to be .0007, so there is very little variation in estimates and not a major concern for us. It should be noted that sensitivity and specificity are likely switched here, please see the discussion above for details. Assuming that sensitivity and specificity are indeed switched here, we can see that this model does a very poor job of predicting those with diabetes. Even despite only being able to accurately predict those with diabetes less than 1% of the time, the overall accuracy of the model is fairly high. Why is this? I have reason to believe that this model is still so accurate because of the no-information rate of the data. The NIR is 91%, which is very high. This would imply that if the model were to simply predict the majority classifier (in this case non diabetic) every time, the model would still be accurate 91% of the time. This is exactly what we see in our overall accuracy measure above.
Now to turn to the bagging/random forest model, this model cannot be evaluated in the exact same ways but we can get an overall sense of accuracy. The best tune of the random forest model was able to predict cases’ diabetes status 95% of the time. Compared to the LASSO logistic model, the random forest model was much better at predicting when cases actually had diabetes. This is only by relative means though. The random forest model still has an out of bag error estimate of about 50% when it comes to predicting when a case actually does have diabetes. The model does a much better job of predicting when people aren’t diabetic, though. The model produced a out of bag error of less than 1% when it comes to predicting cases that are actually non-diabetic. The model has an overall out of bag error estimate of 4.65%. So, in a broad sense, this model is very accurate at predicting non-diabetic when a case really isn’t diabetic but it is only able to accurately predict diabetes for people who are actually diabetic about 50% of the time.
Overall Most Preferable Model:
After having compared the LASSO logistic regression and bagging/random forest model, I am able to make an informed decision about which of these models is more preferable. I can base this decision on accuracy metrics, interpretability of the estimates, and the usefulness of these models in context of the data. With these aspects in mind, it is clear to me that the random forest model is the more preferable of the two. The random forest model’s overall accuracy (95%) is higher than the LASSO logistic regression model (91%). The random forest model also produces better sensitivity and specificity measures than the logistic model. The logistic model particularly fails at making good predictions for people who are diabetic. While the random forest model is still fairly bad at doing the same thing (it only is accurate predicting diabetes among those who are actually diabetic ~50% of the time), but it is significantly better than the logistic model. In the context of the data, I believe it is more important for a predictive model to be able to identify when a disease is actually present than it is for the model to identify when a disease isn’t. What good is the model if it can only confirm that people aren’t diabetic? It might have some benefits, but when specificity is so high, I’d hope for sensitivity to also be high. So, this aspect weighed heavily in my mind in reasoning which model is preferable. One potential advantage that I had to overlook for the logistic model is its interpretability. The coefficient estimates produced by the logistic model are very easily interpreted, and this has benefits in explaining how certain variables are related to the odds of having diabetes. Nonetheless, the random forest model can be interpreted in broad strokes and its overall higher accuracy measures make it an obvious choice. While I don’t know for certain, I’d imagine that an overall accuracy of 95% for a predictive model in a clinical setting would be somewhat worthwhile. Obviously it’d be ideal to have that number higher, but considering the limited number of predictors I used, I’m somewhat satisfied with seeing this figure as high as it is. As discussed above, the no-information rate is fairly high for this particular classification task, and being above that figure in overall accuracy is a good threshold to have passed. See the readouts below to confirm the error estimates.
rf_mod$finalModel
##
## Call:
## randomForest(x = x, y = y, ntree = 1000, mtry = min(param$mtry, ncol(x)))
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.65%
## Confusion matrix:
## No Yes class.error
## No 6828 20 0.002920561
## Yes 330 343 0.490341753
When performing statistical research such as this, it’s important to recognize if any harms may come from your analyses and/or how the data was collected. I find it worth mentioning in this report a few things worth your consideration. Namely, while NHANES’ sample collection methods seem fairly rigorous, one key issue I have with it is the exclusion of institutionalized populations from the data. The absence of this population means that health phenomena specific to them isn’t reflected in this data. And because NHANES informs public policy and funding, these individuals’ needs will likely go to the wayside. I also feel it’s important to contextualize that this data is about a decade old and this analysis reflects on this snapshot in time, and it shouldn’t be extrapolated to today. These are just a few examples, and there are surely more implications that are worth expanding on in a different setting, but hopefully this makes you think a bit more critically about the implications of this analysis.