library(rms) #Intended for restricted cubic splines (RCS), but not actively used in the final code (splines::ns() is used instead).
library(glmnet) #For fitting both elastic-net and lasso logistic regression models.
library(dplyr) #To filter, arrange, and manipulate the results dataframe
library(pROC) #ROC curve and AUC computations, not used in this code
library(rsample) #For creating 5-fold cross-validation splits with vfold_cv().
library(recipes) #For applying natural splines (step_ns) in a recipes workflow; the code below calls splines::ns() directly.
library(yardstick) #For computing precision and recall in the custom scoring function.
library(tidyr) #data reshaping, not actively used in the code below
library(purrr) #Used to apply the custom evaluation function over parameter combinations
library(tibble) #For creating and handling tibbles (tibble(...)), which are more robust than base data.frames.
library(splines) #For natural cubic splines via ns(), used throughout the code below.
library(ggplot2) #For the smoothed-relationship and partial-dependence plots.
library(tidyverse) #Attaches the core tidyverse (several components already loaded above).
library(gridExtra) #For arranging multiple ggplots with grid.arrange().
set.seed(12345)
n <- 500
data <- tibble(
  age = rnorm(n, 50, 10),
  systolic_bp = rnorm(n, 120, 15),
  cholesterol = rnorm(n, 200, 25),
  outcome = rbinom(n, 1, 0.3)
)
str(data)
## tibble [500 × 4] (S3: tbl_df/tbl/data.frame)
## $ age : num [1:500] 55.9 57.1 48.9 45.5 56.1 ...
## $ systolic_bp: num [1:500] 98.7 83 127.3 105.9 170 ...
## $ cholesterol: num [1:500] 242 202 179 181 190 ...
## $ outcome : int [1:500] 0 1 0 1 0 1 1 0 0 1 ...
ggplot(data, aes(x = age, y = outcome)) + geom_smooth(method = "loess") + ggtitle("Age vs Outcome (LOESS Curve)")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(data, aes(x = age, y = outcome)) +
  geom_point(alpha = 0.3, position = position_jitter(height = 0.05)) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = TRUE) +
  ggtitle("Age vs Outcome (Logistic Curve)")
## `geom_smooth()` using formula = 'y ~ x'
Choosing the number and placement of knots in spline modeling balances bias and variance. Splines outperform linear models when relationships are nonlinear, but using them on skewed data can cause instability, especially in the tails.
Best Practices
Use percentile-based knots for skewed or irregularly distributed variables (contrasted with equal spacing in the sketch just below).
Use equally spaced knots when the variable is uniformly or normally distributed.
Use log-transformed splines only if strong skew and nonlinearity justify it.
Use cross-validation to compare performance across spline methods.
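For intuition, here is a minimal sketch (on simulated lognormal data, not the study dataset) contrasting the first two schemes: percentile knots follow the dense region of a skewed variable, while equal spacing pushes knots into the sparse right tail.
# Illustrative only: compare knot-placement schemes on a right-skewed variable
set.seed(1)
x_skewed <- rlnorm(500, meanlog = 0, sdlog = 1)
percentile_knots <- quantile(x_skewed, probs = c(0.25, 0.5, 0.75))
equal_knots <- seq(min(x_skewed), max(x_skewed), length.out = 5)[2:4]
round(rbind(percentile = percentile_knots, equal = equal_knots), 2)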
AUC Results
Equal-spaced knots (AUC = 0.711) slightly outperformed others.
Percentile-based knots (AUC = 0.706) performed similarly.
Log-transformed splines (AUC = 0.664) underperformed, likely because the transformation distorted relationships that were already well behaved.
Conclusion: spline effectiveness depends on the structure of the data.
set.seed(12345)
n <- 500
df <- tibble(
  age = rnorm(n, 50, 10),
  systolic_bp = rnorm(n, 120, 15),
  cholesterol = rnorm(n, 200, 25),
  outcome = rbinom(n, 1, 0.3)
)
# Simulate a stronger signal (for testing): regenerate the outcome from a logistic model
df$outcome <- rbinom(n, 1, plogis(0.05 * df$age - 0.03 * df$systolic_bp + 0.02 * df$cholesterol))
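A quick optional sanity check on the regenerated outcome (illustrative only, not part of the original pipeline):
# Sanity check: empirical prevalence of the simulated outcome
mean(df$outcome)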
# Helper function to create spline matrix
create_spline_matrix <- function(df, transform = c("percentile", "equal", "log")) {
  transform <- match.arg(transform)
  if (transform == "percentile") {
    age_knots <- quantile(df$age, probs = c(0.25, 0.5, 0.75))
    bp_knots <- quantile(df$systolic_bp, probs = c(0.25, 0.5, 0.75))
    chol_knots <- quantile(df$cholesterol, probs = c(0.25, 0.5, 0.75))
    X <- cbind(
      ns(df$age, knots = age_knots),
      ns(df$systolic_bp, knots = bp_knots),
      ns(df$cholesterol, knots = chol_knots)
    )
  } else if (transform == "equal") {
    age_range <- range(df$age)
    bp_range <- range(df$systolic_bp)
    chol_range <- range(df$cholesterol)
    age_knots <- seq(age_range[1], age_range[2], length.out = 5)[2:4]
    bp_knots <- seq(bp_range[1], bp_range[2], length.out = 5)[2:4]
    chol_knots <- seq(chol_range[1], chol_range[2], length.out = 5)[2:4]
    X <- cbind(
      ns(df$age, knots = age_knots),
      ns(df$systolic_bp, knots = bp_knots),
      ns(df$cholesterol, knots = chol_knots)
    )
  } else if (transform == "log") {
    X <- cbind(
      ns(log(df$age), df = 4),
      ns(log(df$systolic_bp), df = 4),
      ns(log(df$cholesterol), df = 4)
    )
  }
  colnames(X) <- make.names(colnames(X))
  return(X)
}
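As a quick check, each ns() call with three interior knots contributes four basis columns, so the percentile design matrix should have twelve columns:
# Illustrative check of the design-matrix dimensions
X_check <- create_spline_matrix(df, "percentile")
dim(X_check) # expected: 500 rows, 12 columns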
# Fit model and extract cross-validated AUC
fit_and_evaluate <- function(X, y) {
  cv_model <- cv.glmnet(
    x = X,
    y = y,
    alpha = 0.5,
    family = "binomial",
    nfolds = 5,
    type.measure = "auc"
  )
  list(
    model = cv_model,
    auc = max(cv_model$cvm)
  )
}
# Prepare outcome and compare
y <- df$outcome
X_percentile <- create_spline_matrix(df, "percentile")
result_percentile <- fit_and_evaluate(X_percentile, y)
X_equal <- create_spline_matrix(df, "equal")
result_equal <- fit_and_evaluate(X_equal, y)
X_log <- create_spline_matrix(df, "log")
result_log <- fit_and_evaluate(X_log, y)
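Since type.measure = "auc" was used, max(cv_model$cvm) is the AUC at the best-performing lambda; the lambda itself can be recovered from any fitted object, for example:
# Lambda at which the cross-validated AUC peaks (percentile-knot model);
# this should coincide with result_percentile$model$lambda.min
cv_fit <- result_percentile$model
cv_fit$lambda[which.max(cv_fit$cvm)]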
# Compare results
cat("Cross-validated AUCs:\n")
## Cross-validated AUCs:
cat("Percentile-based knots:", round(result_percentile$auc, 3), "\n")
## Percentile-based knots: 0.706
cat("Equal-spaced knots:", round(result_equal$auc, 3), "\n")
## Equal-spaced knots: 0.711
cat("Log-transformed splines:", round(result_log$auc, 3), "\n")
## Log-transformed splines: 0.664
We’ll extract the predicted probabilities and plot them against each predictor while holding the others at their medians (a partial-dependence-like view). We’ll use the percentile-based knots, even though equal spacing gave the best AUC (0.711), because in practice covariate distributions are rarely so well behaved.
# Define spline knots
knots_list <- list(
  age = quantile(data$age, probs = c(0.25, 0.5, 0.75)),
  systolic_bp = quantile(data$systolic_bp, probs = c(0.25, 0.5, 0.75)),
  cholesterol = quantile(data$cholesterol, probs = c(0.25, 0.5, 0.75))
)
boundary_list <- list(
  age = range(data$age),
  systolic_bp = range(data$systolic_bp),
  cholesterol = range(data$cholesterol)
)
# Fit model using splines in formula
model <- glm(
  outcome ~
    ns(age, knots = knots_list$age, Boundary.knots = boundary_list$age) +
    ns(systolic_bp, knots = knots_list$systolic_bp, Boundary.knots = boundary_list$systolic_bp) +
    ns(cholesterol, knots = knots_list$cholesterol, Boundary.knots = boundary_list$cholesterol),
  data = data,
  family = binomial()
)
# Variable names
predictors <- c("age", "systolic_bp", "cholesterol")
# Partial dependence plotting function
plot_partial_dependence <- function(var, data, model, knots_list, boundary_list, grid_points = 100) {
  var_range <- range(data[[var]])
  grid <- tibble(!!var := seq(var_range[1], var_range[2], length.out = grid_points))
  # Fix the other variables at their medians
  other_vars <- setdiff(predictors, var)
  median_vals <- summarise(data, across(all_of(other_vars), median))
  new_data <- bind_cols(grid, median_vals[rep(1, grid_points), ])
  # Predict
  probs <- predict(model, newdata = new_data, type = "response")
  # Plot (using .data[[var]] instead of the deprecated aes_string(),
  # and linewidth instead of the deprecated size aesthetic for lines)
  new_data %>%
    mutate(prob = probs) %>%
    ggplot(aes(x = .data[[var]], y = prob)) +
    geom_line(color = "blue", linewidth = 1) +
    labs(
      title = paste("Partial Dependence-like Plot:", var),
      x = var,
      y = "Predicted Probability"
    ) +
    theme_minimal()
}
# Plot for all predictors
plots <- lapply(predictors, plot_partial_dependence, data = data, model = model,
knots_list = knots_list, boundary_list = boundary_list)
do.call(grid.arrange, c(plots, ncol = 1))
## Step 5b: Tune Number of Knots
results <- list()
for (k in 2:5) {
  # Equally spaced interior knots, computed separately for each variable
  # (the original code reused the age-based knots for all three predictors)
  make_knots <- function(x) seq(min(x), max(x), length.out = k + 2)[2:(k + 1)]
  X <- cbind(
    ns(df$age, knots = make_knots(df$age)),
    ns(df$systolic_bp, knots = make_knots(df$systolic_bp)),
    ns(df$cholesterol, knots = make_knots(df$cholesterol))
  )
  colnames(X) <- make.names(colnames(X))
  cv_model <- cv.glmnet(
    x = X, y = df$outcome,
    family = "binomial", alpha = 0.5,
    nfolds = 5, type.measure = "auc"
  )
  results[[paste0("knots_", k)]] <- max(cv_model$cvm)
}
# Show results
results
## $knots_2
## [1] 0.7292482
##
## $knots_3
## [1] 0.6847485
##
## $knots_4
## [1] 0.7161926
##
## $knots_5
## [1] 0.6796322
Two knots (AUC = 0.729) give the best predictive performance. This suggests that the relationship between predictors and outcome is nonlinear, but not complex. Too many knots may overfit noise or add unnecessary flexibility.
3–5 knots have lower AUCs, consistent with a bias-variance tradeoff: more knots give more flexibility (lower bias) but also higher variance. In this case, simpler splines generalize better.
age_knots <- quantile(data$age, probs = c(0.25, 0.5, 0.75))
bp_knots <- quantile(data$systolic_bp, probs = c(0.25, 0.5, 0.75))
chol_knots <- quantile(data$cholesterol, probs = c(0.25, 0.5, 0.75))
X <- cbind(
  ns(data$age, knots = age_knots),
  ns(data$systolic_bp, knots = bp_knots),
  ns(data$cholesterol, knots = chol_knots)
)
colnames(X) <- make.names(colnames(X)) # Clean names for modeling
y <- data$outcome
While logistic regression predicts the probability of a binary outcome (0 or 1) using the logistic (sigmoid) function, elastic-net logistic regression is its regularized version: it adds two types of penalties on the model coefficients to avoid overfitting.
\(\text{Loss} = \text{Logistic Loss} + \lambda\big[\alpha \sum_j |\beta_j| + (1 - \alpha) \sum_j \beta_j^{2}\big]\), where λ controls the overall penalty strength and α controls the mix between L1 (lasso) and L2 (ridge): α = 1 gives the lasso (feature selection), α = 0 gives ridge (coefficient shrinkage), and 0 < α < 1 gives the elastic net (a mix of both).
It works well when there are many correlated predictors, can perform automatic feature selection (via the L1 penalty), and helps reduce overfitting.
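Note that cv.glmnet tunes only λ; α must be chosen separately. A minimal sketch of comparing a few α values (illustrative only, using a shared foldid so all fits see identical folds):
# Illustrative alpha comparison with identical CV folds
set.seed(12345)
foldid <- sample(rep(1:5, length.out = nrow(X)))
sapply(c(ridge = 0, enet = 0.5, lasso = 1), function(a) {
  m <- cv.glmnet(x = X, y = y, alpha = a, family = "binomial",
                 foldid = foldid, type.measure = "auc")
  round(max(m$cvm), 3)
})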
We use cross-validation to choose the lambda that gives the best model performance.
# Fit elastic-net logistic regression
cv_model <- cv.glmnet(
  x = X,
  y = y,
  alpha = 0.5, # alpha = 0.5 for elastic net
  family = "binomial",
  nfolds = 5
)
# Best lambda (penalty strength)
best_lambda <- cv_model$lambda.min # lambda.min: lambda with the lowest cross-validated error; lambda.1se: largest lambda within 1 SE of that minimum
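For intuition about the two choices, compare how many coefficients survive at each; lambda.1se typically gives a sparser, more conservative model:
# Sparsity at the two standard lambda choices (intercept excluded)
c(
  nonzero_at_min = sum(coef(cv_model, s = "lambda.min") != 0) - 1,
  nonzero_at_1se = sum(coef(cv_model, s = "lambda.1se") != 0) - 1
)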
custom_score <- function(pred_probs, true_labels) {
  threshold <- 0.999999 # very high threshold, aiming for 100% precision
  predicted_labels <- ifelse(pred_probs >= threshold, 1, 0)
  TP <- sum(predicted_labels == 1 & true_labels == 1)
  FN <- sum(predicted_labels == 0 & true_labels == 1)
  FP <- sum(predicted_labels == 1 & true_labels == 0)
  precision <- ifelse(TP + FP == 0, 1, TP / (TP + FP))
  recall <- ifelse(TP + FN == 0, 0, TP / (TP + FN))
  # Score is recall, but only when precision is perfect; otherwise 0
  if (precision == 1) {
    return(recall)
  } else {
    return(0)
  }
}
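A usage sketch for custom_score, scoring the elastic-net model's in-sample predicted probabilities (held-out predictions would be preferable in practice):
# Illustrative only: in-sample scoring
pred_probs <- as.vector(predict(cv_model, newx = X, s = "lambda.min", type = "response"))
custom_score(pred_probs, y)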
# Fit lasso path
lasso_model <- glmnet(
  x = X,
  y = y,
  alpha = 1,
  family = "binomial"
)
# Extract number of nonzero variables at each lambda
nz <- apply(coef(lasso_model), 2, function(x) sum(x != 0) - 1) # exclude intercept
# View number of variables vs lambda
plot(nz, lasso_model$lambda, type = "b", log = "y", # log scale on lambda; nz includes 0, so log = "x" would drop a point
     xlab = "Number of Non-Zero Coefficients",
     ylab = "Lambda",
     main = "Lasso Variable Selection Path")
# Choose lambda for desired sparsity (e.g., 5 predictors)
target_size <- 5
best_index <- which.min(abs(nz - target_size))
selected_lambda <- lasso_model$lambda[best_index]
selected_coef <- coef(lasso_model, s = selected_lambda)
We can now pick a target model size (say, 5 variables), find the corresponding λ as above, and extract the selected coefficients using:
coef(lasso_model, s = selected_lambda)
## 13 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -1.0250583166
## X1 .
## X2 .
## X3 .
## X4 0.6463512656
## X1 0.3988588026
## X2 -0.5370579860
## X3 .
## X4 .
## X1 .
## X2 0.0002114156
## X3 -0.2449755723
## X4 .
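To list which predictors survive at the chosen lambda, pull the nonzero rows. Note that make.names() left duplicate column names (X1 through X4 repeat across the three spline blocks); applying make.unique() to the column names before fitting would make the coefficient-to-term mapping unambiguous.
# Names of predictors retained at selected_lambda (intercept row dropped);
# duplicate names could be avoided with make.unique(colnames(X)) before fitting
nz_rows <- which(as.vector(selected_coef) != 0)
rownames(selected_coef)[setdiff(nz_rows, 1)]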