library(glmnet)
library(dplyr)
library(ggplot2)
library(psych)
set.seed(123)
Performance Data Generation and Feature Selection using Lasso and PCA
Global settings
Load packages and set seed.
Introduction
This document describes a structured approach to generating performance data, evaluating the impact of irrelevant features, and applying feature selection techniques using Lasso and Principal Component Analysis (PCA). The methods aim to analyze the correlation between generated performance indicators and true scores while identifying the most relevant features.
Data Generation Process
The function generate_performance_data()
creates a dataset simulating performance indicators for applicants.
- Inputs:
n_applicants
: Number of applicants.n_items
: Total number of items (features).n_relevant
: Number of relevant items.binary
: Whether performance indicators are binarized.
- Process:
- Generates
true_scores
from a normal distribution. - Assigns weights to relevant and irrelevant items.
- Constructs a performance matrix by introducing noise.
- Applies binary transformation if specified.
- Generates
- Output: A performance matrix and corresponding true scores.
<- function(n_applicants, n_items, n_relevant, binary = TRUE) {
generate_performance_data
# Ensure n_items is greater than or equal to n_relevant to avoid errors
if (n_items < n_relevant) {
stop("n_items must be greater than or equal to n_relevant")
}
# Generate true scores
<- rnorm(n_applicants, mean = 0, sd = 1)
true_scores
# Define item weights (relevant vs. noise)
<- runif(n_relevant, 1, 3)
relevant_weights <- if (n_items > n_relevant) runif(n_items - n_relevant, -0.3, 0.3) else numeric(0)
noise_weights <- c(relevant_weights, noise_weights)
item_weights
# Generate performance indicators
<- outer(true_scores, item_weights, "*") +
performance_matrix matrix(rnorm(n_applicants * n_items, 0, 1), nrow = n_applicants, ncol = n_items)
# Apply binary transformation if requested
if (binary) {
# flip_prob <- rep(c(0, 0.3), times = c(n_relevant, n_items - n_relevant))
# flip_mask <- matrix(rbinom(n_applicants * n_items, 1, flip_prob), nrow = n_applicants, ncol = n_items)
<- ifelse(performance_matrix > 0, 1, 0)
performance_matrix # performance_matrix <- abs(performance_matrix - flip_mask)
}
return(list(performance_matrix = performance_matrix, true_scores = true_scores))
}
apply function
# Parameters
<- 200
n_applicants <- 5
n_relevant <- 20
n_items
# show data generation process
<- generate_performance_data(n_applicants = n_applicants, n_items = n_items, n_relevant = n_relevant, binary = TRUE)
dat <- as.data.frame(cbind(dat$performance_matrix, dat$true_scores))
dat colnames(dat) <- c(paste0("relI", 1:n_relevant), paste0("irelI", 1:(n_items - n_relevant)), "true score")
head(dat)
relI1 relI2 relI3 relI4 relI5 irelI1 irelI2 irelI3 irelI4 irelI5 irelI6
1 0 0 0 0 1 1 0 1 1 0 1
2 0 0 0 0 1 0 1 1 0 1 1
3 1 1 1 1 1 1 0 1 1 1 0
4 0 0 1 1 1 0 1 0 0 1 1
5 0 0 1 1 1 0 1 0 0 0 1
6 1 0 1 1 1 1 1 0 0 1 0
irelI7 irelI8 irelI9 irelI10 irelI11 irelI12 irelI13 irelI14 irelI15
1 0 0 0 1 1 1 0 0 1
2 0 0 0 0 1 1 1 0 1
3 1 0 1 0 1 1 0 1 0
4 1 0 1 1 1 0 0 0 0
5 0 1 1 0 1 0 0 1 1
6 0 1 0 1 0 1 0 0 1
true score
1 -0.56047565
2 -0.23017749
3 1.55870831
4 0.07050839
5 0.12928774
6 1.71506499
cor.plot(r = cor(dat))
clearly one component is strongly correlated to the “true scores” (real preformance of LLMs)
Baseline Correlation Analysis
The function compute_baseline_correlation()
assesses how adding irrelevant items affects the correlation between total performance scores and true scores.
<- function(n_applicants, n_relevant, n_total_items, binary = TRUE) {
compute_baseline_correlation <- data.frame(n_items = integer(), correlation = numeric())
correlations
for (it_n_items in n_total_items) {
<- generate_performance_data(n_applicants = n_applicants, n_items = it_n_items, n_relevant = n_relevant, binary = TRUE)
data <- data$performance_matrix
performance_matrix <- data$true_scores
true_scores
# Compute total score
<- rowSums(performance_matrix)
total_score
# Compute correlation with true scores
<- cor(total_score, true_scores)
cor_value
# Store results
<- rbind(correlations, data.frame(n_items = it_n_items, correlation = cor_value))
correlations
}
# Plot correlation vs. number of items
ggplot(correlations, aes(x = n_items, y = correlation)) +
geom_line(color = "blue") +
geom_point(color = "red") +
labs(title = "Baseline: Effect of Adding Irrelevant Items on Correlation between Total to True Scores",
x = "Total Number of Items",
y = "Correlation between Total and True Scores") +
theme_minimal()
}
apply function
# Parameters
<- 500
n_applicants <- 10
n_relevant <- 200
n_items <- seq(n_relevant, 200, by = 10)
n_total_items
# Compute and Plot Baseline Correlation (Turn Binary On/Off)
compute_baseline_correlation(n_applicants = n_applicants, n_relevant = n_relevant, n_total_items = n_total_items, binary = TRUE) # Set binary=FALSE for continuous
adding more and more irrelevant indicators shrinks the correlation between the “observed scores” and “true scores” (real preformance of LLMs)
Feature Selection via Lasso Regression
The function perform_lasso_feature_selection()
applies Lasso (L1 regularization) to select relevant features.
<- function(n_applicants, n_items, n_relevant, binary = TRUE, lambda_threshold = 0.1) {
perform_lasso_feature_selection
# Generate dataset
<- generate_performance_data(n_applicants, n_items, n_relevant, binary)
data <- data$performance_matrix
performance_matrix <- data$true_scores
true_scores
# Convert to DataFrame with proper naming
<- as.data.frame(cbind(performance_matrix, true_scores))
dat colnames(dat) <- c(paste0("relI", 1:n_relevant), paste0("irelI", 1:(n_items - n_relevant)), "true_score")
# Standardize features
<- scale(as.matrix(performance_matrix))
x <- scale(true_scores)
y
# Fit Elastic Net
<- cv.glmnet(x, y, alpha = 0.5)
lasso_model
# Use a custom lambda or optimize via cross-validation
if (lambda_threshold == "auto") {
<- lasso_model$lambda[which.max(lasso_model$cvm)]
lambda_threshold
}
# Extract coefficients for chosen lambda
<- coef(lasso_model, s = lambda_threshold)
coef_lasso
# Select nonzero coefficient items
<- which(coef_lasso[-1] != 0)
selected_items
# Identify which selected features are relevant vs. irrelevant
<- colnames(dat)[selected_items]
selected_colnames <- sum(grepl("^relI", selected_colnames)) # Count relevant items
n_relevant_selected <- sum(grepl("^irelI", selected_colnames)) # Count irrelevant items
n_irrelevant_selected
print(paste("Total selected features:", length(selected_items)))
print(paste("Number of relevant items identified:", n_relevant_selected))
print(paste("Number of irrelevant items identified:", n_irrelevant_selected))
# Filter performance matrix based on selected features
if (length(selected_items) > 0) {
<- performance_matrix[, selected_items]
filtered_matrix <- rowSums(filtered_matrix)
filtered_score else {
} <- rep(0, length(true_scores)) # Avoid empty selection error
filtered_score
}
# Compute Pearson correlation after Lasso filtering
<- cor(filtered_score, true_scores)
cor_lasso print(paste("Pearson correlation after Lasso selection:", round(cor_lasso, 4)))
# Compute Spearman rank correlation after Lasso filtering
<- cor(rank(filtered_score), rank(true_scores), method = "spearman")
cor_rank_lasso print(paste("Spearman rank correlation after Lasso selection:", round(cor_rank_lasso, 4)))
# Plot results
plot(filtered_score, true_scores, main = "Filtered Score vs. True Scores",
xlab = "Filtered Score", ylab = "True Scores")
return(list(selected_items = selected_items,
correlation = cor_lasso,
relevant_selected = n_relevant_selected,
irrelevant_selected = n_irrelevant_selected))
}
apply function
# Parameters
<- 500
n_applicants <- 10
n_relevant <- 200
n_items <- seq(n_relevant, 200, by = 10)
n_total_items
# different sets of lambda for
#> binary data
<- perform_lasso_feature_selection(n_applicants, n_items, n_relevant, binary = TRUE, lambda_threshold = .1) # Set binary=FALSE for continuous lasso_results
[1] "Total selected features: 47"
[1] "Number of relevant items identified: 10"
[1] "Number of irrelevant items identified: 37"
[1] "Pearson correlation after Lasso selection: 0.7321"
[1] "Spearman rank correlation after Lasso selection: 0.7892"
<- perform_lasso_feature_selection(n_applicants, n_items, n_relevant, binary = TRUE, lambda_threshold = .3) # Set binary=FALSE for continuous lasso_results
[1] "Total selected features: 10"
[1] "Number of relevant items identified: 10"
[1] "Number of irrelevant items identified: 0"
[1] "Pearson correlation after Lasso selection: 0.8926"
[1] "Spearman rank correlation after Lasso selection: 0.9507"
#> continuous data
<- perform_lasso_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, lambda_threshold = .01) # Set binary=FALSE for continuous lasso_results
[1] "Total selected features: 83"
[1] "Number of relevant items identified: 10"
[1] "Number of irrelevant items identified: 73"
[1] "Pearson correlation after Lasso selection: 0.9365"
[1] "Spearman rank correlation after Lasso selection: 0.9298"
<- perform_lasso_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, lambda_threshold = .1) # Set binary=FALSE for continuous lasso_results
[1] "Total selected features: 10"
[1] "Number of relevant items identified: 10"
[1] "Number of irrelevant items identified: 0"
[1] "Pearson correlation after Lasso selection: 0.9808"
[1] "Spearman rank correlation after Lasso selection: 0.9793"
Lasso / rigid regression will lead to better results if we use continous data and not code the answers of the LLMs are right or wrong (binary)
depending on the choosen lambda we can easily identify the subset of relevant indicators, e.g., to predict the rank of the LLM
perform PCA Dimensionality Reduction and Identify Important Features
<- function(n_applicants, n_items, n_relevant, binary = TRUE, num_components = 10, top_n_features = 5) {
perform_pca_feature_selection
# Generate dataset
<- generate_performance_data(n_applicants, n_items, n_relevant, binary)
data <- data$performance_matrix
performance_matrix <- data$true_scores
true_scores
# Convert performance matrix to DataFrame with meaningful column names
colnames(performance_matrix) <- c(paste0("relI", 1:n_relevant), paste0("irelI", 1:(n_items - n_relevant)))
# Perform PCA
<- prcomp(performance_matrix, center = TRUE, scale. = TRUE)
pca_model
# Handle case when num_components = 1
if (num_components == 1) {
<- as.data.frame(predict(pca_model)[, 1, drop = FALSE]) # Ensure DataFrame format
pca_scores colnames(pca_scores) <- "PC1"
<- rank(pca_scores$PC1)
pca_rank else {
} <- as.data.frame(predict(pca_model)[, 1:num_components]) # Convert to DataFrame
pca_scores <- rank(rowSums(pca_scores))
pca_rank
}
# Compute correlation between PCA scores and true scores
<- cor(pca_rank, rank(true_scores), method = "spearman")
cor_pca print(paste("Spearman correlation after PCA:", round(cor_pca, 4)))
# Extract PCA loadings (rotation matrix)
<- abs(pca_model$rotation[, 1:num_components]) # Take absolute values
loadings
# Identify top contributing items for each component
if (num_components == 1) {
# Single component: Extract top contributing features directly
<- names(sort(loadings, decreasing = TRUE))[1:min(top_n_features, length(loadings))]
top_features <- list(PC1 = top_features)
top_features_per_component else {
} # Multiple components: Extract for each PC
<- apply(loadings, 2, function(x) {
top_features_per_component names(sort(x, decreasing = TRUE))[1:min(top_n_features, length(x))]
})
}
# Print the most important features per component
print("Top Features per Component:")
print(top_features_per_component)
return(list(correlation = cor_pca,
pca_model = pca_model,
pca_scores = pca_scores,
top_features = top_features_per_component))
}
apply function
# Parameters
<- 500
n_applicants <- 10
n_relevant <- 200
n_items
# PCA Dimensionality Reduction and compute rank
<- perform_pca_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, num_components = 1, top_n_features = 10) cor_pca
[1] "Spearman correlation after PCA: 0.9776"
[1] "Top Features per Component:"
$PC1
[1] "relI3" "relI10" "relI2" "relI4" "relI8" "relI1" "relI5" "relI6"
[9] "relI7" "relI9"
<- perform_pca_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, num_components = 2, top_n_features = 10) cor_pca
[1] "Spearman correlation after PCA: -0.8938"
[1] "Top Features per Component:"
PC1 PC2
[1,] "relI4" "irelI177"
[2,] "relI6" "irelI38"
[3,] "relI3" "irelI9"
[4,] "relI7" "irelI169"
[5,] "relI5" "irelI96"
[6,] "relI9" "irelI29"
[7,] "relI2" "irelI155"
[8,] "relI8" "irelI113"
[9,] "relI10" "irelI170"
[10,] "relI1" "irelI60"
<- perform_pca_feature_selection(n_applicants, n_items, n_relevant, binary = FALSE, num_components = 3, top_n_features = 10) cor_pca
[1] "Spearman correlation after PCA: -0.8344"
[1] "Top Features per Component:"
PC1 PC2 PC3
[1,] "relI8" "irelI132" "irelI19"
[2,] "relI9" "irelI152" "irelI65"
[3,] "relI5" "irelI18" "irelI7"
[4,] "relI1" "irelI39" "irelI54"
[5,] "relI10" "irelI33" "irelI90"
[6,] "relI4" "irelI149" "irelI124"
[7,] "relI7" "irelI28" "irelI129"
[8,] "relI3" "irelI20" "irelI159"
[9,] "relI2" "irelI94" "irelI148"
[10,] "relI6" "irelI176" "irelI41"
if number of relevant components (here 1) is unknown, including more components as necessary shrinks the correlation between the rank scores of the “observed scores” and “true scores” (real preformance of LLMs)