Performance Data Generation and Feature Selection using Lasso and PCA

Author

Julius Fenn

Global settings

Load packages and set seed.

library(glmnet)
library(dplyr)
library(ggplot2)
library(psych)

set.seed(123)

Introduction

This document describes a structured approach to generating performance data, evaluating the impact of irrelevant features, and applying feature selection techniques using Lasso and Principal Component Analysis (PCA). The methods aim to analyze the correlation between generated performance indicators and true scores while identifying the most relevant features.

Data Generation Process

The function generate_performance_data() creates a dataset simulating performance indicators for applicants.

Inputs:
- n_applicants: Number of applicants.
- n_items: Total number of items (features).
- n_relevant: Number of relevant items.
- binary: Whether performance indicators are binarized.
Process:
- Generates true_scores from a normal distribution.
- Assigns weights to relevant and irrelevant items.
- Constructs a performance matrix by introducing noise.
- Applies binary transformation if specified.
Output: A performance matrix and corresponding true scores.

generate_performance_data <- function(n_applicants, n_items, n_relevant, binary = TRUE) {

  # Ensure n_items is greater than or equal to n_relevant to avoid errors
  if (n_items < n_relevant) {
    stop("n_items must be greater than or equal to n_relevant")
  }

  # Generate true scores
  true_scores <- rnorm(n_applicants, mean = 0, sd = 1)

  # Define item weights (relevant vs. noise)
  relevant_weights <- runif(n_relevant, 1, 3)
  noise_weights <- if (n_items > n_relevant) runif(n_items - n_relevant, -0.3, 0.3) else numeric(0)
  item_weights <- c(relevant_weights, noise_weights)

  # Generate performance indicators
  performance_matrix <- outer(true_scores, item_weights, "*") +
    matrix(rnorm(n_applicants * n_items, 0, 1), nrow = n_applicants, ncol = n_items)

  # Apply binary transformation if requested
  if (binary) {
    # flip_prob <- rep(c(0, 0.3), times = c(n_relevant, n_items - n_relevant))
    # flip_mask <- matrix(rbinom(n_applicants * n_items, 1, flip_prob), nrow = n_applicants, ncol = n_items)
    performance_matrix <- ifelse(performance_matrix > 0, 1, 0)
    # performance_matrix <- abs(performance_matrix - flip_mask)
  }

  return(list(performance_matrix = performance_matrix, true_scores = true_scores))
}

apply function

# Parameters
n_applicants <- 200
n_relevant <- 5
n_items <- 20


# show data generation process
dat <- generate_performance_data(n_applicants = n_applicants, n_items = n_items, n_relevant = n_relevant, binary = TRUE)
dat <- as.data.frame(cbind(dat$performance_matrix, dat$true_scores))
colnames(dat) <- c(paste0("relI", 1:n_relevant), paste0("irelI", 1:(n_items - n_relevant)), "true score")
head(dat)

  relI1 relI2 relI3 relI4 relI5 irelI1 irelI2 irelI3 irelI4 irelI5 irelI6
1     0     0     0     0     1      1      0      1      1      0      1
2     0     0     0     0     1      0      1      1      0      1      1
3     1     1     1     1     1      1      0      1      1      1      0
4     0     0     1     1     1      0      1      0      0      1      1
5     0     0     1     1     1      0      1      0      0      0      1
6     1     0     1     1     1      1      1      0      0      1      0
  irelI7 irelI8 irelI9 irelI10 irelI11 irelI12 irelI13 irelI14 irelI15
1      0      0      0       1       1       1       0       0       1
2      0      0      0       0       1       1       1       0       1
3      1      0      1       0       1       1       0       1       0
4      1      0      1       1       1       0       0       0       0
5      0      1      1       0       1       0       0       1       1
6      0      1      0       1       0       1       0       0       1
   true score
1 -0.56047565
2 -0.23017749
3  1.55870831
4  0.07050839
5  0.12928774
6  1.71506499

cor.plot(r = cor(dat))

clearly one component is strongly correlated to the “true scores” (real preformance of LLMs)

Baseline Correlation Analysis

The function compute_baseline_correlation() assesses how adding irrelevant items affects the correlation between total performance scores and true scores.

compute_baseline_correlation <- function(n_applicants, n_relevant, n_total_items, binary = TRUE) {
  correlations <- data.frame(n_items = integer(), correlation = numeric())

  for (it_n_items in n_total_items) {
    data <- generate_performance_data(n_applicants = n_applicants, n_items = it_n_items, n_relevant = n_relevant, binary = TRUE)
    performance_matrix <- data$performance_matrix
    true_scores <- data$true_scores

    # Compute total score
    total_score <- rowSums(performance_matrix)

    # Compute correlation with true scores
    cor_value <- cor(total_score, true_scores)

    # Store results
    correlations <- rbind(correlations, data.frame(n_items = it_n_items, correlation = cor_value))
  }

  # Plot correlation vs. number of items
  ggplot(correlations, aes(x = n_items, y = correlation)) +
    geom_line(color = "blue") +
    geom_point(color = "red") +
    labs(title = "Baseline: Effect of Adding Irrelevant Items on Correlation between Total to True Scores",
         x = "Total Number of Items",
         y = "Correlation between Total and True Scores") +
    theme_minimal()
}

apply function

# Parameters
n_applicants <- 500
n_relevant <- 10
n_items <- 200
n_total_items <- seq(n_relevant, 200, by = 10)



# Compute and Plot Baseline Correlation (Turn Binary On/Off)
compute_baseline_correlation(n_applicants = n_applicants, n_relevant =  n_relevant, n_total_items = n_total_items, binary = TRUE)  # Set binary=FALSE for continuous

adding more and more irrelevant indicators shrinks the correlation between the “observed scores” and “true scores” (real preformance of LLMs)

Feature Selection via Lasso Regression

The function perform_lasso_feature_selection() applies Lasso (L1 regularization) to select relevant features.

perform_lasso_feature_selection <- function(n_applicants, n_items, n_relevant, binary = TRUE, lambda_threshold = 0.1) {

  # Generate dataset
  data <- generate_performance_data(n_applicants, n_items, n_relevant, binary)
  performance_matrix <- data$performance_matrix
  true_scores <- data$true_scores

  # Convert to DataFrame with proper naming
  dat <- as.data.frame(cbind(performance_matrix, true_scores))
  colnames(dat) <- c(paste0("relI", 1:n_relevant), paste0("irelI", 1:(n_items - n_relevant)), "true_score")

  # Standardize features
  x <- scale(as.matrix(performance_matrix))
  y <- scale(true_scores)

  # Fit Elastic Net
  lasso_model <- cv.glmnet(x, y, alpha = 0.5)

  # Use a custom lambda or optimize via cross-validation
  if (lambda_threshold == "auto") {
    lambda_threshold <- lasso_model$lambda[which.max(lasso_model$cvm)]
  }

  # Extract coefficients for chosen lambda
  coef_lasso <- coef(lasso_model, s = lambda_threshold)

  # Select nonzero coefficient items
  selected_items <- which(coef_lasso[-1] != 0)

  # Identify which selected features are relevant vs. irrelevant
  selected_colnames <- colnames(dat)[selected_items]
  n_relevant_selected <- sum(grepl("^relI", selected_colnames))  # Count relevant items
  n_irrelevant_selected <- sum(grepl("^irelI", selected_colnames))  # Count irrelevant items

  print(paste("Total selected features:", length(selected_items)))
  print(paste("Number of relevant items identified:", n_relevant_selected))
  print(paste("Number of irrelevant items identified:", n_irrelevant_selected))

  # Filter performance matrix based on selected features
  if (length(selected_items) > 0) {
    filtered_matrix <- performance_matrix[, selected_items]
    filtered_score <- rowSums(filtered_matrix)
  } else {
    filtered_score <- rep(0, length(true_scores))  # Avoid empty selection error
  }

  # Compute Pearson correlation after Lasso filtering
  cor_lasso <- cor(filtered_score, true_scores)
  print(paste("Pearson correlation after Lasso selection:", round(cor_lasso, 4)))

  # Compute Spearman rank correlation after Lasso filtering
  cor_rank_lasso <- cor(rank(filtered_score), rank(true_scores), method = "spearman")
  print(paste("Spearman rank correlation after Lasso selection:", round(cor_rank_lasso, 4)))


  # Plot results
  plot(filtered_score, true_scores, main = "Filtered Score vs. True Scores",
       xlab = "Filtered Score", ylab = "True Scores")

  return(list(selected_items = selected_items,
              correlation = cor_lasso,
              relevant_selected = n_relevant_selected,
              irrelevant_selected = n_irrelevant_selected))
}

apply function

# Parameters
n_applicants <- 500
n_relevant <- 10
n_items <- 200
n_total_items <- seq(n_relevant, 200, by = 10)



# different sets of lambda for
#> binary data
lasso_results <- perform_lasso_feature_selection(n_applicants, n_items, n_relevant, binary = TRUE, lambda_threshold = .1)  # Set binary=FALSE for continuous

[1] "Total selected features: 47"
[1] "Number of relevant items identified: 10"
[1] "Number of irrelevant items identified: 37"
[1] "Pearson correlation after Lasso selection: 0.7321"
[1] "Spearman rank correlation after Lasso selection: 0.7892"

lasso_results <- perform_lasso_feature_selection(n_applicants, n_items, n_relevant, binary = TRUE, lambda_threshold = .3)  # Set binary=FALSE for continuous

[1] "Total selected features: 10"
[1] "Number of relevant items identified: 10"
[1] "Number of irrelevant items identified: 0"
[1] "Pearson correlation after Lasso selection: 0.8926"
[1] "Spearman rank correlation after Lasso selection: 0.9507"

#> continuous data
lasso_results <- perform_lasso_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, lambda_threshold = .01)  # Set binary=FALSE for continuous

[1] "Total selected features: 83"
[1] "Number of relevant items identified: 10"
[1] "Number of irrelevant items identified: 73"
[1] "Pearson correlation after Lasso selection: 0.9365"
[1] "Spearman rank correlation after Lasso selection: 0.9298"

lasso_results <- perform_lasso_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, lambda_threshold = .1)  # Set binary=FALSE for continuous

[1] "Total selected features: 10"
[1] "Number of relevant items identified: 10"
[1] "Number of irrelevant items identified: 0"
[1] "Pearson correlation after Lasso selection: 0.9808"
[1] "Spearman rank correlation after Lasso selection: 0.9793"

Lasso / rigid regression will lead to better results if we use continous data and not code the answers of the LLMs are right or wrong (binary)

depending on the choosen lambda we can easily identify the subset of relevant indicators, e.g., to predict the rank of the LLM

perform PCA Dimensionality Reduction and Identify Important Features

perform_pca_feature_selection <- function(n_applicants, n_items, n_relevant, binary = TRUE, num_components = 10, top_n_features = 5) {

  # Generate dataset
  data <- generate_performance_data(n_applicants, n_items, n_relevant, binary)
  performance_matrix <- data$performance_matrix
  true_scores <- data$true_scores

  # Convert performance matrix to DataFrame with meaningful column names
  colnames(performance_matrix) <- c(paste0("relI", 1:n_relevant), paste0("irelI", 1:(n_items - n_relevant)))

  # Perform PCA
  pca_model <- prcomp(performance_matrix, center = TRUE, scale. = TRUE)

  # Handle case when num_components = 1
  if (num_components == 1) {
    pca_scores <- as.data.frame(predict(pca_model)[, 1, drop = FALSE])  # Ensure DataFrame format
    colnames(pca_scores) <- "PC1"
    pca_rank <- rank(pca_scores$PC1)
  } else {
    pca_scores <- as.data.frame(predict(pca_model)[, 1:num_components])  # Convert to DataFrame
    pca_rank <- rank(rowSums(pca_scores))
  }

  # Compute correlation between PCA scores and true scores
  cor_pca <- cor(pca_rank, rank(true_scores), method = "spearman")
  print(paste("Spearman correlation after PCA:", round(cor_pca, 4)))

  # Extract PCA loadings (rotation matrix)
  loadings <- abs(pca_model$rotation[, 1:num_components])  # Take absolute values

  # Identify top contributing items for each component
  if (num_components == 1) {
    # Single component: Extract top contributing features directly
    top_features <- names(sort(loadings, decreasing = TRUE))[1:min(top_n_features, length(loadings))]
    top_features_per_component <- list(PC1 = top_features)
  } else {
    # Multiple components: Extract for each PC
    top_features_per_component <- apply(loadings, 2, function(x) {
      names(sort(x, decreasing = TRUE))[1:min(top_n_features, length(x))]
    })
  }

  # Print the most important features per component
  print("Top Features per Component:")
  print(top_features_per_component)


  return(list(correlation = cor_pca,
              pca_model = pca_model,
              pca_scores = pca_scores,
              top_features = top_features_per_component))
}

apply function

# Parameters
n_applicants <- 500
n_relevant <- 10
n_items <- 200


# PCA Dimensionality Reduction and compute rank
cor_pca <- perform_pca_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, num_components = 1, top_n_features = 10)

[1] "Spearman correlation after PCA: 0.9776"
[1] "Top Features per Component:"
$PC1
 [1] "relI3"  "relI10" "relI2"  "relI4"  "relI8"  "relI1"  "relI5"  "relI6" 
 [9] "relI7"  "relI9"

cor_pca <- perform_pca_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, num_components = 2, top_n_features = 10)

[1] "Spearman correlation after PCA: -0.8938"
[1] "Top Features per Component:"
      PC1      PC2       
 [1,] "relI4"  "irelI177"
 [2,] "relI6"  "irelI38" 
 [3,] "relI3"  "irelI9"  
 [4,] "relI7"  "irelI169"
 [5,] "relI5"  "irelI96" 
 [6,] "relI9"  "irelI29" 
 [7,] "relI2"  "irelI155"
 [8,] "relI8"  "irelI113"
 [9,] "relI10" "irelI170"
[10,] "relI1"  "irelI60"

cor_pca <- perform_pca_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, num_components = 3, top_n_features = 10)

[1] "Spearman correlation after PCA: -0.8344"
[1] "Top Features per Component:"
      PC1      PC2        PC3       
 [1,] "relI8"  "irelI132" "irelI19" 
 [2,] "relI9"  "irelI152" "irelI65" 
 [3,] "relI5"  "irelI18"  "irelI7"  
 [4,] "relI1"  "irelI39"  "irelI54" 
 [5,] "relI10" "irelI33"  "irelI90" 
 [6,] "relI4"  "irelI149" "irelI124"
 [7,] "relI7"  "irelI28"  "irelI129"
 [8,] "relI3"  "irelI20"  "irelI159"
 [9,] "relI2"  "irelI94"  "irelI148"
[10,] "relI6"  "irelI176" "irelI41"

if number of relevant components (here 1) is unknown, including more components as necessary shrinks the correlation between the rank scores of the “observed scores” and “true scores” (real preformance of LLMs)