Step 1: Load Required Packages

library(rms) #intended for restricted cubic splines (RCS), but not actively used in the final code (splines::ns() is used instead).

library(glmnet) #For fitting both elastic-net and lasso logistic regression models.

library(dplyr) #To filter, arrange, and manipulate the results dataframe

library(pROC) #ROC curve and AUC computations, not used in this code

library(rsample) #For creating cross-validation splits with vfold_cv(); not used in the final code (cv.glmnet handles its own folds).

library(recipes) #For applying natural splines (step_ns()); not actively used in the code below.

library(yardstick) #For precision and recall metrics; the custom scoring function below computes these manually.

library(tidyr) #data reshaping, not actively used in the code below

library(purrr) #For applying functions over parameter combinations; not actively used in the code below.

library(tibble) #For creating and handling tibbles (tibble(...)), which are more robust than base data.frames.


library(splines) #Provides ns() for the natural cubic spline bases used throughout.

library(ggplot2) #For all plots below.

library(tidyverse) #Loads the core tidyverse (dplyr, tidyr, purrr, tibble, ggplot2, ...).

library(gridExtra) #For arranging multiple ggplots with grid.arrange().

Step 2: Simulate Data

set.seed(12345)
n <- 500
data <- tibble(
  age = rnorm(n, 50, 10),
  systolic_bp = rnorm(n, 120, 15),
  cholesterol = rnorm(n, 200, 25),
  outcome = rbinom(n, 1, 0.3)   # independent of the predictors here; a real signal is simulated in Step 4
)

str(data)
## tibble [500 × 4] (S3: tbl_df/tbl/data.frame)
##  $ age        : num [1:500] 55.9 57.1 48.9 45.5 56.1 ...
##  $ systolic_bp: num [1:500] 98.7 83 127.3 105.9 170 ...
##  $ cholesterol: num [1:500] 242 202 179 181 190 ...
##  $ outcome    : int [1:500] 0 1 0 1 0 1 1 0 0 1 ...

Step 3: Visualise the Relationships (Important!)

ggplot(data, aes(x = age, y = outcome)) + geom_smooth(method = "loess") + ggtitle("Age vs Outcome (LOESS Curve)")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data, aes(x = age, y = outcome)) +
  geom_point(alpha = 0.3, position = position_jitter(height = 0.05)) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = TRUE) +
  ggtitle("age vs Outcome (Logistic Curve)")
## `geom_smooth()` using formula = 'y ~ x'

Step 4: Compare equal-spaced knots, percentile-based knots, and log-transformed splines using cross-validated AUC

Choosing the number and placement of knots in spline modeling balances bias and variance. Splines outperform linear models when relationships are nonlinear, but using them on skewed data can cause instability, especially in the tails. A small sketch contrasting the two knot-placement schemes follows the list below.

Best Practices

  1. Use percentile-based knots for skewed or irregularly distributed variables.

  2. Use equally spaced knots when the variable is uniformly or normally distributed.

  3. Use log-transformed splines only when strong skew and nonlinearity justify the transformation.

  4. Use cross-validation to compare performance across spline methods.
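
To make practices 1 and 2 concrete, here is a minimal sketch contrasting the two knot placements on a hypothetical right-skewed variable (not part of the simulated data below):

set.seed(1)
x_skewed <- rexp(500, rate = 0.1)  # hypothetical right-skewed variable

quantile(x_skewed, probs = c(0.25, 0.5, 0.75))          # percentile-based knots follow the data
seq(min(x_skewed), max(x_skewed), length.out = 5)[2:4]  # equal spacing pushes knots into the sparse tail

With skewed data, the equally spaced knots land deep in the thin right tail, where few observations constrain the spline fit, which is exactly the instability described above.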

AUC Results

Equal-spaced knots (AUC = 0.711) slightly outperformed others.

Percentile-based knots (AUC = 0.706) performed similarly.

Log-transformed splines (AUC = 0.664) underperformed, likely because the transformation distorted relationships in data that were not skewed to begin with.

Conclusion: spline effectiveness depends on the structure of the data.

set.seed(12345)
n <- 500
df <- tibble(
  age = rnorm(n, 50, 10),
  systolic_bp = rnorm(n, 120, 15),
  cholesterol = rnorm(n, 200, 25),
  outcome = rbinom(n, 1, 0.3)
)
df$outcome <- rbinom(n, 1, plogis(0.05 * df$age - 0.03 * df$systolic_bp + 0.02 * df$cholesterol)) #Simulate Stronger Signal (for Testing)
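
As a quick sanity check (an addition, not part of the original code), an unpenalized logistic fit should roughly recover the simulated signs: positive for age and cholesterol, negative for systolic_bp.

summary(glm(outcome ~ age + systolic_bp + cholesterol, data = df, family = binomial()))$coefficients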

# Helper function to create spline matrix
create_spline_matrix <- function(df, transform = c("percentile", "equal", "log")) {
  transform <- match.arg(transform)
  if (transform == "percentile") {
    age_knots <- quantile(df$age, probs = c(0.25, 0.5, 0.75))
    bp_knots <- quantile(df$systolic_bp, probs = c(0.25, 0.5, 0.75))
    chol_knots <- quantile(df$cholesterol, probs = c(0.25, 0.5, 0.75))
    
    X <- cbind(
      ns(df$age, knots = age_knots),
      ns(df$systolic_bp, knots = bp_knots),
      ns(df$cholesterol, knots = chol_knots)
    )
    
  } else if (transform == "equal") {
    age_range <- range(df$age)
    bp_range <- range(df$systolic_bp)
    chol_range <- range(df$cholesterol)
    
    age_knots <- seq(age_range[1], age_range[2], length.out = 5)[2:4]
    bp_knots <- seq(bp_range[1], bp_range[2], length.out = 5)[2:4]
    chol_knots <- seq(chol_range[1], chol_range[2], length.out = 5)[2:4]
    
    X <- cbind(
      ns(df$age, knots = age_knots),
      ns(df$systolic_bp, knots = bp_knots),
      ns(df$cholesterol, knots = chol_knots)
    )
    
  } else if (transform == "log") {
    X <- cbind(
      ns(log(df$age), df = 4),
      ns(log(df$systolic_bp), df = 4),
      ns(log(df$cholesterol), df = 4)
    )
  }
  
  # Give each basis column a unique, readable name (cbind() otherwise repeats "1".."4")
  colnames(X) <- paste0(rep(c("age_", "bp_", "chol_"), each = 4), 1:4)
  return(X)
}
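
For reference, each ns() call above yields four basis columns (three interior knots imply df = 4, and the log branch requests df = 4 directly), so every design matrix has 12 columns; a quick check:

X_check <- create_spline_matrix(df, "percentile")
dim(X_check)  # 500 rows, 12 spline basis columns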

# Fit model and extract cross-validated AUC
fit_and_evaluate <- function(X, y) {
  cv_model <- cv.glmnet(
    x = X,
    y = y,
    alpha = 0.5,
    family = "binomial",
    nfolds = 5,
    type.measure = "auc"
  )
  
  list(
    model = cv_model,
    auc = max(cv_model$cvm)  # cross-validated AUC at the best-performing lambda
  )
}

# Prepare outcome and compare
y <- df$outcome

X_percentile <- create_spline_matrix(df, "percentile")
result_percentile <- fit_and_evaluate(X_percentile, y)

X_equal <- create_spline_matrix(df, "equal")
result_equal <- fit_and_evaluate(X_equal, y)

X_log <- create_spline_matrix(df, "log")
result_log <- fit_and_evaluate(X_log, y)

# Compare results
cat("Cross-validated AUCs:\n")
## Cross-validated AUCs:
cat("Percentile-based knots:", round(result_percentile$auc, 3), "\n")
## Percentile-based knots: 0.706
cat("Equal-spaced knots:", round(result_equal$auc, 3), "\n")
## Equal-spaced knots: 0.711
cat("Log-transformed splines:", round(result_log$auc, 3), "\n")
## Log-transformed splines: 0.664

Step 5a: Visualize Spline Effects

We’ll extract the predicted probabilities and plot them against each predictor while holding the others at their medians (a partial-dependence-like plot). We use the percentile-based knots, even though equally spaced knots gave the best AUC (0.711), because in practice covariate distributions are rarely this well behaved.
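
As a quick check of that assumption (an addition to the original code), the covariate distributions can be inspected directly. For this simulated normal data the two knot schemes would agree closely; real covariates often will not:

par(mfrow = c(1, 3))
hist(data$age, breaks = 30, main = "age")
hist(data$systolic_bp, breaks = 30, main = "systolic_bp")
hist(data$cholesterol, breaks = 30, main = "cholesterol")
par(mfrow = c(1, 1))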

# Define spline knots
knots_list <- list(
  age = quantile(data$age, probs = c(0.25, 0.5, 0.75)),
  systolic_bp = quantile(data$systolic_bp, probs = c(0.25, 0.5, 0.75)),
  cholesterol = quantile(data$cholesterol, probs = c(0.25, 0.5, 0.75))
)
boundary_list <- list(
  age = range(data$age),
  systolic_bp = range(data$systolic_bp),
  cholesterol = range(data$cholesterol)
)

# Fit model using splines in formula
model <- glm(
  outcome ~ 
    ns(age, knots = knots_list$age, Boundary.knots = boundary_list$age) +
    ns(systolic_bp, knots = knots_list$systolic_bp, Boundary.knots = boundary_list$systolic_bp) +
    ns(cholesterol, knots = knots_list$cholesterol, Boundary.knots = boundary_list$cholesterol),
  data = data,
  family = binomial()
)

# Variable names
predictors <- c("age", "systolic_bp", "cholesterol")

# Partial dependence plotting function
plot_partial_dependence <- function(var, data, model, knots_list, boundary_list, grid_points = 100) {
  var_range <- range(data[[var]])
  grid <- tibble(!!var := seq(var_range[1], var_range[2], length.out = grid_points))
  
  # Fix other variables to median
  other_vars <- setdiff(predictors, var)
  median_vals <- summarise(data, across(all_of(other_vars), median))
  new_data <- bind_cols(grid, median_vals[rep(1, grid_points), ])

  # Predict
  probs <- predict(model, newdata = new_data, type = "response")

  # Plot
  new_data %>%
    mutate(prob = probs) %>%
    ggplot(aes(x = .data[[var]], y = prob)) +
    geom_line(color = "blue", linewidth = 1) +
    labs(
      title = paste("Partial Dependence-like Plot:", var),
      x = var,
      y = "Predicted Probability"
    ) +
    theme_minimal()
}

# Plot for all predictors
plots <- lapply(predictors, plot_partial_dependence, data = data, model = model,
                knots_list = knots_list, boundary_list = boundary_list)
do.call(grid.arrange, c(plots, ncol = 1))

Step 5b: Tune Number of Knots

results <- list()
for (k in 2:5) {
  # k equally spaced interior knots, computed separately for each variable
  eq_knots <- function(x) seq(min(x), max(x), length.out = k + 2)[2:(k + 1)]
  
  X <- cbind(
    ns(df$age, knots = eq_knots(df$age)),
    ns(df$systolic_bp, knots = eq_knots(df$systolic_bp)),
    ns(df$cholesterol, knots = eq_knots(df$cholesterol))
  )
  colnames(X) <- make.names(colnames(X), unique = TRUE)
  
  cv_model <- cv.glmnet(
    x = X, y = df$outcome,
    family = "binomial", alpha = 0.5,
    nfolds = 5, type.measure = "auc"
  )
  
  results[[paste0("knots_", k)]] <- max(cv_model$cvm)
}

# Show results
results
## $knots_2
## [1] 0.7292482
## 
## $knots_3
## [1] 0.6847485
## 
## $knots_4
## [1] 0.7161926
## 
## $knots_5
## [1] 0.6796322
  1. Two knots (AUC = 0.729) give the best predictive performance. This suggests the relationship between predictors and outcome is nonlinear, but not complex. More knots may be overfitting noise or adding unnecessary flexibility.

  2. Three to five knots have lower AUCs, consistent with a bias-variance tradeoff: more knots give more flexibility (lower bias) but also higher variance. Here, simpler splines generalize better.

Step 6: Generate the design matrix with spline-transformed variables using the chosen percentile-based knots

age_knots <- quantile(data$age, probs = c(0.25, 0.5, 0.75))
bp_knots <- quantile(data$systolic_bp, probs = c(0.25, 0.5, 0.75))
chol_knots <- quantile(data$cholesterol, probs = c(0.25, 0.5, 0.75))
X <- cbind(
  ns(data$age, knots = age_knots),
  ns(data$systolic_bp, knots = bp_knots),
  ns(data$cholesterol, knots = chol_knots)
)

colnames(X) <- paste0(rep(c("age_", "bp_", "chol_"), each = 4), 1:4)  # unique, readable names for modeling
y <- data$outcome

Step 7: Elastic-net logistic regression (on spline-transformed variables)

While logistic regression predicts the probability of a binary outcome (0 or 1) using the logistic (sigmoid) function, elastic-net logistic regression is its regularized version, which adds a weighted combination of two penalties on the model coefficients to avoid overfitting:

\(\text{Loss} = \text{Logistic Loss} + \lambda\big[\alpha \sum_j \lvert\beta_j\rvert + (1-\alpha)\sum_j \beta_j^{2}\big]\), where \(\lambda\) controls the overall penalty strength and \(\alpha\) controls the mix between the L1 (lasso) and L2 (ridge) penalties: \(\alpha = 1\) gives the lasso (feature selection), \(\alpha = 0\) gives ridge (coefficient shrinkage), and \(0 < \alpha < 1\) gives the elastic net (a mix of both).

Elastic net works well when there are many correlated predictors, can perform automatic feature selection (via the L1 penalty), and helps reduce overfitting.

We use cross-validation to choose the lambda that gives the best model performance.

# Fit elastic-net logistic regression
cv_model <- cv.glmnet(
  x = X,
  y = y,
  alpha = 0.5,            # alpha = 0.5 for elastic-net
  family = "binomial",
  nfolds = 5
)

# Best lambda (penalty strength)
best_lambda <- cv_model$lambda.min # lambda.min: lambda with the lowest cross-validated error; lambda.1se: largest lambda within 1 SE of that minimum
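
For reference, the two standard choices can be compared directly (a quick check, not in the original code); lambda.1se usually yields a sparser model at a small cost in cross-validated performance:

sum(coef(cv_model, s = "lambda.min") != 0) - 1  # nonzero predictors at lambda.min
sum(coef(cv_model, s = "lambda.1se") != 0) - 1  # nonzero predictors at lambda.1se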

Step 8: Custom Scoring Function to Maximize Sensitivity/Recall at 100% Precision

custom_score <- function(pred_probs, true_labels) {
  threshold <- 0.999999  # predict 1 only when nearly certain, aiming for 100% precision

  predicted_labels <- ifelse(pred_probs >= threshold, 1, 0)

  TP <- sum(predicted_labels == 1 & true_labels == 1)
  FN <- sum(predicted_labels == 0 & true_labels == 1)
  FP <- sum(predicted_labels == 1 & true_labels == 0)

  precision <- ifelse(TP + FP == 0, 1, TP / (TP + FP))
  recall <- ifelse(TP + FN == 0, 0, TP / (TP + FN))

  if (precision == 1) {
    return(recall)
  } else {
    return(0)
  }
}
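
The scoring function is not called anywhere below; here is a minimal usage sketch applying it, in-sample, to the elastic-net fit from Step 7 (so any recall estimate is optimistic):

pred_probs <- as.vector(predict(cv_model, newx = X, s = "lambda.min", type = "response"))
custom_score(pred_probs, y)  # recall at the near-certain threshold; 0 whenever precision falls below 1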

Step 9: Use Lasso for Final Variable Selection

# Fit lasso path
lasso_model <- glmnet(
  x = X,
  y = y,
  alpha = 1,
  family = "binomial"
)

# Extract number of nonzero variables at each lambda
nz <- apply(coef(lasso_model), 2, function(x) sum(x != 0) - 1)  # exclude intercept

# View number of variables vs lambda
plot(nz, lasso_model$lambda, type = "b", log = "y",
     xlab = "Number of Non-Zero Coefficients",
     ylab = "Lambda (log scale)",
     main = "Lasso Variable Selection Path")

# Choose lambda for desired sparsity (e.g., 5 predictors)
target_size <- 5
best_index <- which.min(abs(nz - target_size))
selected_lambda <- lasso_model$lambda[best_index]
selected_coef <- coef(lasso_model, s = selected_lambda)
  1. Top-left: a simple model, strong penalty, few variables
  2. Bottom-right: a complex model, weak penalty, many variables
  3. Any point: a specific model with a given lambda and number of active predictors

We could now pick a target model size (say 5 variables), find the corresponding λ, and extract the selected variables using:

coef(lasso_model, s = selected_lambda)
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -1.0250583166
## age_1        .           
## age_2        .           
## age_3        .           
## age_4        0.6463512656
## bp_1         0.3988588026
## bp_2        -0.5370579860
## bp_3         .           
## bp_4         .           
## chol_1       .           
## chol_2       0.0002114156
## chol_3      -0.2449755723
## chol_4       .
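
From here, the names of the selected (nonzero) spline terms can be extracted directly:

selected_vars <- rownames(selected_coef)[as.vector(selected_coef != 0)]
setdiff(selected_vars, "(Intercept)")  # the age/bp/chol basis terms retained by the lasso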