library(glmnet)
library(dplyr)
library(ggplot2)
library(psych)
set.seed(123)Performance Data Generation and Feature Selection using Lasso and PCA
Global settings
Load packages and set seed.
Introduction
This document describes a structured approach to generating performance data, evaluating the impact of irrelevant features, and applying feature selection techniques using Lasso and Principal Component Analysis (PCA). The methods aim to analyze the correlation between generated performance indicators and true scores while identifying the most relevant features.
Data Generation Process
The function generate_performance_data() creates a dataset simulating performance indicators for applicants.
- Inputs:
n_applicants: Number of applicants.n_items: Total number of items (features).n_relevant: Number of relevant items.binary: Whether performance indicators are binarized.
- Process:
- Generates
true_scoresfrom a normal distribution. - Assigns weights to relevant and irrelevant items.
- Constructs a performance matrix by introducing noise.
- Applies binary transformation if specified.
- Generates
- Output: A performance matrix and corresponding true scores.
generate_performance_data <- function(n_applicants, n_items, n_relevant, binary = TRUE) {
# Ensure n_items is greater than or equal to n_relevant to avoid errors
if (n_items < n_relevant) {
stop("n_items must be greater than or equal to n_relevant")
}
# Generate true scores
true_scores <- rnorm(n_applicants, mean = 0, sd = 1)
# Define item weights (relevant vs. noise)
relevant_weights <- runif(n_relevant, 1, 3)
noise_weights <- if (n_items > n_relevant) runif(n_items - n_relevant, -0.3, 0.3) else numeric(0)
item_weights <- c(relevant_weights, noise_weights)
# Generate performance indicators
performance_matrix <- outer(true_scores, item_weights, "*") +
matrix(rnorm(n_applicants * n_items, 0, 1), nrow = n_applicants, ncol = n_items)
# Apply binary transformation if requested
if (binary) {
# flip_prob <- rep(c(0, 0.3), times = c(n_relevant, n_items - n_relevant))
# flip_mask <- matrix(rbinom(n_applicants * n_items, 1, flip_prob), nrow = n_applicants, ncol = n_items)
performance_matrix <- ifelse(performance_matrix > 0, 1, 0)
# performance_matrix <- abs(performance_matrix - flip_mask)
}
return(list(performance_matrix = performance_matrix, true_scores = true_scores))
}apply function
# Parameters
n_applicants <- 200
n_relevant <- 5
n_items <- 20
# show data generation process
dat <- generate_performance_data(n_applicants = n_applicants, n_items = n_items, n_relevant = n_relevant, binary = TRUE)
dat <- as.data.frame(cbind(dat$performance_matrix, dat$true_scores))
colnames(dat) <- c(paste0("relI", 1:n_relevant), paste0("irelI", 1:(n_items - n_relevant)), "true score")
head(dat) relI1 relI2 relI3 relI4 relI5 irelI1 irelI2 irelI3 irelI4 irelI5 irelI6
1 0 0 0 0 1 1 0 1 1 0 1
2 0 0 0 0 1 0 1 1 0 1 1
3 1 1 1 1 1 1 0 1 1 1 0
4 0 0 1 1 1 0 1 0 0 1 1
5 0 0 1 1 1 0 1 0 0 0 1
6 1 0 1 1 1 1 1 0 0 1 0
irelI7 irelI8 irelI9 irelI10 irelI11 irelI12 irelI13 irelI14 irelI15
1 0 0 0 1 1 1 0 0 1
2 0 0 0 0 1 1 1 0 1
3 1 0 1 0 1 1 0 1 0
4 1 0 1 1 1 0 0 0 0
5 0 1 1 0 1 0 0 1 1
6 0 1 0 1 0 1 0 0 1
true score
1 -0.56047565
2 -0.23017749
3 1.55870831
4 0.07050839
5 0.12928774
6 1.71506499
cor.plot(r = cor(dat))clearly one component is strongly correlated to the “true scores” (real preformance of LLMs)
Baseline Correlation Analysis
The function compute_baseline_correlation() assesses how adding irrelevant items affects the correlation between total performance scores and true scores.
compute_baseline_correlation <- function(n_applicants, n_relevant, n_total_items, binary = TRUE) {
correlations <- data.frame(n_items = integer(), correlation = numeric())
for (it_n_items in n_total_items) {
data <- generate_performance_data(n_applicants = n_applicants, n_items = it_n_items, n_relevant = n_relevant, binary = TRUE)
performance_matrix <- data$performance_matrix
true_scores <- data$true_scores
# Compute total score
total_score <- rowSums(performance_matrix)
# Compute correlation with true scores
cor_value <- cor(total_score, true_scores)
# Store results
correlations <- rbind(correlations, data.frame(n_items = it_n_items, correlation = cor_value))
}
# Plot correlation vs. number of items
ggplot(correlations, aes(x = n_items, y = correlation)) +
geom_line(color = "blue") +
geom_point(color = "red") +
labs(title = "Baseline: Effect of Adding Irrelevant Items on Correlation between Total to True Scores",
x = "Total Number of Items",
y = "Correlation between Total and True Scores") +
theme_minimal()
}apply function
# Parameters
n_applicants <- 500
n_relevant <- 10
n_items <- 200
n_total_items <- seq(n_relevant, 200, by = 10)
# Compute and Plot Baseline Correlation (Turn Binary On/Off)
compute_baseline_correlation(n_applicants = n_applicants, n_relevant = n_relevant, n_total_items = n_total_items, binary = TRUE) # Set binary=FALSE for continuousadding more and more irrelevant indicators shrinks the correlation between the “observed scores” and “true scores” (real preformance of LLMs)
Feature Selection via Lasso Regression
The function perform_lasso_feature_selection() applies Lasso (L1 regularization) to select relevant features.
perform_lasso_feature_selection <- function(n_applicants, n_items, n_relevant, binary = TRUE, lambda_threshold = 0.1) {
# Generate dataset
data <- generate_performance_data(n_applicants, n_items, n_relevant, binary)
performance_matrix <- data$performance_matrix
true_scores <- data$true_scores
# Convert to DataFrame with proper naming
dat <- as.data.frame(cbind(performance_matrix, true_scores))
colnames(dat) <- c(paste0("relI", 1:n_relevant), paste0("irelI", 1:(n_items - n_relevant)), "true_score")
# Standardize features
x <- scale(as.matrix(performance_matrix))
y <- scale(true_scores)
# Fit Elastic Net
lasso_model <- cv.glmnet(x, y, alpha = 0.5)
# Use a custom lambda or optimize via cross-validation
if (lambda_threshold == "auto") {
lambda_threshold <- lasso_model$lambda[which.max(lasso_model$cvm)]
}
# Extract coefficients for chosen lambda
coef_lasso <- coef(lasso_model, s = lambda_threshold)
# Select nonzero coefficient items
selected_items <- which(coef_lasso[-1] != 0)
# Identify which selected features are relevant vs. irrelevant
selected_colnames <- colnames(dat)[selected_items]
n_relevant_selected <- sum(grepl("^relI", selected_colnames)) # Count relevant items
n_irrelevant_selected <- sum(grepl("^irelI", selected_colnames)) # Count irrelevant items
print(paste("Total selected features:", length(selected_items)))
print(paste("Number of relevant items identified:", n_relevant_selected))
print(paste("Number of irrelevant items identified:", n_irrelevant_selected))
# Filter performance matrix based on selected features
if (length(selected_items) > 0) {
filtered_matrix <- performance_matrix[, selected_items]
filtered_score <- rowSums(filtered_matrix)
} else {
filtered_score <- rep(0, length(true_scores)) # Avoid empty selection error
}
# Compute Pearson correlation after Lasso filtering
cor_lasso <- cor(filtered_score, true_scores)
print(paste("Pearson correlation after Lasso selection:", round(cor_lasso, 4)))
# Compute Spearman rank correlation after Lasso filtering
cor_rank_lasso <- cor(rank(filtered_score), rank(true_scores), method = "spearman")
print(paste("Spearman rank correlation after Lasso selection:", round(cor_rank_lasso, 4)))
# Plot results
plot(filtered_score, true_scores, main = "Filtered Score vs. True Scores",
xlab = "Filtered Score", ylab = "True Scores")
return(list(selected_items = selected_items,
correlation = cor_lasso,
relevant_selected = n_relevant_selected,
irrelevant_selected = n_irrelevant_selected))
}apply function
# Parameters
n_applicants <- 500
n_relevant <- 10
n_items <- 200
n_total_items <- seq(n_relevant, 200, by = 10)
# different sets of lambda for
#> binary data
lasso_results <- perform_lasso_feature_selection(n_applicants, n_items, n_relevant, binary = TRUE, lambda_threshold = .1) # Set binary=FALSE for continuous[1] "Total selected features: 47"
[1] "Number of relevant items identified: 10"
[1] "Number of irrelevant items identified: 37"
[1] "Pearson correlation after Lasso selection: 0.7321"
[1] "Spearman rank correlation after Lasso selection: 0.7892"
lasso_results <- perform_lasso_feature_selection(n_applicants, n_items, n_relevant, binary = TRUE, lambda_threshold = .3) # Set binary=FALSE for continuous[1] "Total selected features: 10"
[1] "Number of relevant items identified: 10"
[1] "Number of irrelevant items identified: 0"
[1] "Pearson correlation after Lasso selection: 0.8926"
[1] "Spearman rank correlation after Lasso selection: 0.9507"
#> continuous data
lasso_results <- perform_lasso_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, lambda_threshold = .01) # Set binary=FALSE for continuous[1] "Total selected features: 83"
[1] "Number of relevant items identified: 10"
[1] "Number of irrelevant items identified: 73"
[1] "Pearson correlation after Lasso selection: 0.9365"
[1] "Spearman rank correlation after Lasso selection: 0.9298"
lasso_results <- perform_lasso_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, lambda_threshold = .1) # Set binary=FALSE for continuous[1] "Total selected features: 10"
[1] "Number of relevant items identified: 10"
[1] "Number of irrelevant items identified: 0"
[1] "Pearson correlation after Lasso selection: 0.9808"
[1] "Spearman rank correlation after Lasso selection: 0.9793"
Lasso / rigid regression will lead to better results if we use continous data and not code the answers of the LLMs are right or wrong (binary)
depending on the choosen lambda we can easily identify the subset of relevant indicators, e.g., to predict the rank of the LLM
perform PCA Dimensionality Reduction and Identify Important Features
perform_pca_feature_selection <- function(n_applicants, n_items, n_relevant, binary = TRUE, num_components = 10, top_n_features = 5) {
# Generate dataset
data <- generate_performance_data(n_applicants, n_items, n_relevant, binary)
performance_matrix <- data$performance_matrix
true_scores <- data$true_scores
# Convert performance matrix to DataFrame with meaningful column names
colnames(performance_matrix) <- c(paste0("relI", 1:n_relevant), paste0("irelI", 1:(n_items - n_relevant)))
# Perform PCA
pca_model <- prcomp(performance_matrix, center = TRUE, scale. = TRUE)
# Handle case when num_components = 1
if (num_components == 1) {
pca_scores <- as.data.frame(predict(pca_model)[, 1, drop = FALSE]) # Ensure DataFrame format
colnames(pca_scores) <- "PC1"
pca_rank <- rank(pca_scores$PC1)
} else {
pca_scores <- as.data.frame(predict(pca_model)[, 1:num_components]) # Convert to DataFrame
pca_rank <- rank(rowSums(pca_scores))
}
# Compute correlation between PCA scores and true scores
cor_pca <- cor(pca_rank, rank(true_scores), method = "spearman")
print(paste("Spearman correlation after PCA:", round(cor_pca, 4)))
# Extract PCA loadings (rotation matrix)
loadings <- abs(pca_model$rotation[, 1:num_components]) # Take absolute values
# Identify top contributing items for each component
if (num_components == 1) {
# Single component: Extract top contributing features directly
top_features <- names(sort(loadings, decreasing = TRUE))[1:min(top_n_features, length(loadings))]
top_features_per_component <- list(PC1 = top_features)
} else {
# Multiple components: Extract for each PC
top_features_per_component <- apply(loadings, 2, function(x) {
names(sort(x, decreasing = TRUE))[1:min(top_n_features, length(x))]
})
}
# Print the most important features per component
print("Top Features per Component:")
print(top_features_per_component)
return(list(correlation = cor_pca,
pca_model = pca_model,
pca_scores = pca_scores,
top_features = top_features_per_component))
}apply function
# Parameters
n_applicants <- 500
n_relevant <- 10
n_items <- 200
# PCA Dimensionality Reduction and compute rank
cor_pca <- perform_pca_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, num_components = 1, top_n_features = 10)[1] "Spearman correlation after PCA: 0.9776"
[1] "Top Features per Component:"
$PC1
[1] "relI3" "relI10" "relI2" "relI4" "relI8" "relI1" "relI5" "relI6"
[9] "relI7" "relI9"
cor_pca <- perform_pca_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, num_components = 2, top_n_features = 10)[1] "Spearman correlation after PCA: -0.8938"
[1] "Top Features per Component:"
PC1 PC2
[1,] "relI4" "irelI177"
[2,] "relI6" "irelI38"
[3,] "relI3" "irelI9"
[4,] "relI7" "irelI169"
[5,] "relI5" "irelI96"
[6,] "relI9" "irelI29"
[7,] "relI2" "irelI155"
[8,] "relI8" "irelI113"
[9,] "relI10" "irelI170"
[10,] "relI1" "irelI60"
cor_pca <- perform_pca_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, num_components = 3, top_n_features = 10)[1] "Spearman correlation after PCA: -0.8344"
[1] "Top Features per Component:"
PC1 PC2 PC3
[1,] "relI8" "irelI132" "irelI19"
[2,] "relI9" "irelI152" "irelI65"
[3,] "relI5" "irelI18" "irelI7"
[4,] "relI1" "irelI39" "irelI54"
[5,] "relI10" "irelI33" "irelI90"
[6,] "relI4" "irelI149" "irelI124"
[7,] "relI7" "irelI28" "irelI129"
[8,] "relI3" "irelI20" "irelI159"
[9,] "relI2" "irelI94" "irelI148"
[10,] "relI6" "irelI176" "irelI41"
if number of relevant components (here 1) is unknown, including more components as necessary shrinks the correlation between the rank scores of the “observed scores” and “true scores” (real preformance of LLMs)