Project1

N-fold cross-validation

Name :- Grandhe Venkata Guna Sundhar

M Number :- M16552512

1. Load and Prepare the Data

  # Use the correct dataset
  file <- "TCGA_breast_cancer_ERpositive_vs_ERnegative_PAM50.tsv"
  # Load dataset
  data <- read.table(file, header=TRUE, sep="\t", stringsAsFactors=FALSE)
  # Extract ER status (First row, excluding 'id' column)
  er_status <- as.factor(as.character(unlist(data[1, -1])))

  # Extract Gene Expression Data
  gene_expression <- as.data.frame(t(data[-1, -1]))  
  colnames(gene_expression) <- data[-1, 1]  
  gene_expression[] <- lapply(gene_expression, as.numeric)

# Convert gene expression matrix to long format for ggplot
gene_data_long <- reshape2::melt(gene_expression, variable.name = "Gene", value.name = "Expression")

## No id variables; using all as measure variables

gene_data_long$Sample <- rep(rownames(gene_expression), times = ncol(gene_expression))  

# Create histogram
p1 <- ggplot(gene_data_long, aes(x = Expression)) +
  geom_histogram(binwidth = 1, fill = "blue", alpha = 0.7, color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Gene Expression Values", x = "Expression Level", y = "Frequency")

ggplotly(p1)  # Convert to interactive plot

2. Train Logistic Regression and KNN Models with Cross-Validation

# Define Cross-Validation Control
cv5 <- trainControl(method = "cv", number = 5)
cv10 <- trainControl(method = "cv", number = 10)

# Train Logistic Regression Model
log_reg_5 <- train(x = gene_expression, y = er_status, method = "glm", 
                   family = "binomial", trControl = cv5)

log_reg_10 <- train(x = gene_expression, y = er_status, method = "glm", 
                    family = "binomial", trControl = cv10)

# Train Nearest Neighbor Classifier (KNN)
knn_5 <- train(x = gene_expression, y = er_status, method = "knn", 
               tuneLength = 3, trControl = cv5)

knn_10 <- train(x = gene_expression, y = er_status, method = "knn", 
                tuneLength = 3, trControl = cv10)

3. Display Model Performance

# Store Results in a Table
results <- data.frame(
  Model = c("Logistic Regression", "Logistic Regression", "Nearest Neighbor", "Nearest Neighbor"),
  Cross_Validation = c("5-Fold", "10-Fold", "5-Fold", "10-Fold"),
  Mean_Accuracy = c(mean(log_reg_5$resample$Accuracy, na.rm=TRUE), 
                    mean(log_reg_10$resample$Accuracy, na.rm=TRUE),
                    mean(knn_5$resample$Accuracy, na.rm=TRUE), 
                    mean(knn_10$resample$Accuracy, na.rm=TRUE))
)

# Print Results Table
kable(results, caption = "Cross-Validation Accuracy for ER Prediction")

Cross-Validation Accuracy for ER Prediction
Model	Cross_Validation	Mean_Accuracy
Logistic Regression	5-Fold	0.9343785
Logistic Regression	10-Fold	0.9323922
Nearest Neighbor	5-Fold	0.9401616
Nearest Neighbor	10-Fold	0.9421005

### 3.1. Comparison Between Logistic Regression and NNC (2x2 Table)

# Generate predictions for both models (5-fold cross-validation)
logistic_pred_5 <- predict(log_reg_5, gene_expression)
knn_pred_5 <- predict(knn_5, gene_expression)

# Generate predictions for both models (10-fold cross-validation)
logistic_pred_10 <- predict(log_reg_10, gene_expression)
knn_pred_10 <- predict(knn_10, gene_expression)

# Compute confusion matrices for 5-fold models
conf_matrix_log_5 <- confusionMatrix(logistic_pred_5, er_status)
conf_matrix_knn_5 <- confusionMatrix(knn_pred_5, er_status)

# Compute confusion matrices for 10-fold models
conf_matrix_log_10 <- confusionMatrix(logistic_pred_10, er_status)
conf_matrix_knn_10 <- confusionMatrix(knn_pred_10, er_status)

# Create a 2x2 comparison table for 5-fold cross-validation
comparison_table_5 <- data.frame(
  Model = c("Logistic Regression (5-Fold)", "Nearest Neighbor (5-Fold)"),
  Accuracy = c(conf_matrix_log_5$overall['Accuracy'], conf_matrix_knn_5$overall['Accuracy']),
  Sensitivity = c(conf_matrix_log_5$byClass['Sensitivity'], conf_matrix_knn_5$byClass['Sensitivity']),
  Specificity = c(conf_matrix_log_5$byClass['Specificity'], conf_matrix_knn_5$byClass['Specificity'])
)

# Create a 2x2 comparison table for 10-fold cross-validation
comparison_table_10 <- data.frame(
  Model = c("Logistic Regression (10-Fold)", "Nearest Neighbor (10-Fold)"),
  Accuracy = c(conf_matrix_log_10$overall['Accuracy'], conf_matrix_knn_10$overall['Accuracy']),
  Sensitivity = c(conf_matrix_log_10$byClass['Sensitivity'], conf_matrix_knn_10$byClass['Sensitivity']),
  Specificity = c(conf_matrix_log_10$byClass['Specificity'], conf_matrix_knn_10$byClass['Specificity'])
)

# Display the 2x2 comparison tables
kable(comparison_table_5, caption = "Comparison of Logistic Regression and NNC (5-Fold CV)")

Comparison of Logistic Regression and NNC (5-Fold CV)
Model	Accuracy	Sensitivity	Specificity
Logistic Regression (5-Fold)	0.9498069	0.8808511	0.9700375
Nearest Neighbor (5-Fold)	0.9469112	0.8936170	0.9625468

kable(comparison_table_10, caption = "Comparison of Logistic Regression and NNC (10-Fold CV)")

Comparison of Logistic Regression and NNC (10-Fold CV)
Model	Accuracy	Sensitivity	Specificity
Logistic Regression (10-Fold)	0.9498069	0.8808511	0.9700375
Nearest Neighbor (10-Fold)	0.9440154	0.8851064	0.9612984

# Create dataframe for plotting
accuracy_data <- data.frame(
  Model = c("KNN 10-Fold", "KNN 5-Fold", "LogReg 10-Fold", "LogReg 5-Fold"),
  Accuracy = c(
    mean(knn_10$resample$Accuracy, na.rm=TRUE),
    mean(knn_5$resample$Accuracy, na.rm=TRUE),
    mean(log_reg_10$resample$Accuracy, na.rm=TRUE),
    mean(log_reg_5$resample$Accuracy, na.rm=TRUE)
  )
)

# Create interactive bar plot
p2 <- ggplot(accuracy_data, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity", alpha = 0.8, width = 0.6) + 
  geom_text(aes(label = round(Accuracy, 3)), vjust = -0.5, size = 4) +  
  theme_minimal() +
  labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1, vjust = 1))

ggplotly(p2)  # Convert to interactive plot

4. Feature Selection: Selecting Top 50 and 100 Genes

# Remove genes with standard deviation < 1
gene_sd <- apply(gene_expression, 2, sd, na.rm = TRUE)
filtered_genes <- gene_expression[, gene_sd >= 1]

# Perform t-test to select top 50 and top 100 genes
t_stat <- apply(filtered_genes, 2, function(x) t.test(x ~ er_status)$statistic)

# Select top genes based on t-test scores
top_50_genes <- intersect(names(sort(t_stat, decreasing = TRUE))[1:50], colnames(filtered_genes))
top_100_genes <- intersect(names(sort(t_stat, decreasing = TRUE))[1:100], colnames(filtered_genes))

# Subset dataset
X_top50 <- filtered_genes[, top_50_genes, drop = FALSE]
X_top100 <- filtered_genes[, top_100_genes, drop = FALSE]

5.Train Logistic Regression with Feature-Selected Genes

# Train Logistic Regression models with top 50 and 100 genes
logistic_5fold_50 <- train(x = X_top50, y = as.factor(er_status), method = "glm",
                           family = "binomial", trControl = cv5)

logistic_10fold_50 <- train(x = X_top50, y = as.factor(er_status), method = "glm",
                            family = "binomial", trControl = cv10)

logistic_5fold_100 <- train(x = X_top100, y = as.factor(er_status), method = "glm",
                            family = "binomial", trControl = cv5)

logistic_10fold_100 <- train(x = X_top100, y = as.factor(er_status), method = "glm",
                             family = "binomial", trControl = cv10)

# Store Feature Selection Results
results_feature_selection <- data.frame(
  Model = c("Logistic Regression", "Logistic Regression", "Logistic Regression", "Logistic Regression"),
  Top_Genes = c(50, 50, 100, 100),
  Cross_Validation = c("5-Fold", "10-Fold", "5-Fold", "10-Fold"),
  Mean_Accuracy = c(
    mean(logistic_5fold_50$resample$Accuracy),
    mean(logistic_10fold_50$resample$Accuracy),
    mean(logistic_5fold_100$resample$Accuracy),
    mean(logistic_10fold_100$resample$Accuracy)
  )
)

# Print Feature Selection Results
kable(results_feature_selection, caption = "Accuracy after Feature Selection")

Accuracy after Feature Selection
Model	Top_Genes	Cross_Validation	Mean_Accuracy
Logistic Regression	50	5-Fold	0.9362876
Logistic Regression	50	10-Fold	0.9295743
Logistic Regression	100	5-Fold	0.9372306
Logistic Regression	100	10-Fold	0.9343251

6. Train LASSO Regression for Extra Credit

# Train LASSO Regression Model
lasso_5fold_50 <- train(x = X_top50, y = as.factor(er_status), method = "glmnet",
                        trControl = cv5, tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01)))

lasso_10fold_50 <- train(x = X_top50, y = as.factor(er_status), method = "glmnet",
                         trControl = cv10, tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01)))

lasso_5fold_100 <- train(x = X_top100, y = as.factor(er_status), method = "glmnet",
                         trControl = cv5, tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01)))

lasso_10fold_100 <- train(x = X_top100, y = as.factor(er_status), method = "glmnet",
                          trControl = cv10, tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01)))

# Print LASSO Results
kable(data.frame(
  Model = "LASSO Regression",
  Cross_Validation = c("5-Fold", "10-Fold", "5-Fold", "10-Fold"),
  Mean_Accuracy = c(
    mean(lasso_5fold_50$resample$Accuracy),
    mean(lasso_10fold_50$resample$Accuracy),
    mean(lasso_5fold_100$resample$Accuracy),
    mean(lasso_10fold_100$resample$Accuracy)
  )
), caption = "LASSO Regression Results")

LASSO Regression Results
Model	Cross_Validation	Mean_Accuracy
LASSO Regression	5-Fold	0.9459402
LASSO Regression	10-Fold	0.9469380
LASSO Regression	5-Fold	0.9449833
LASSO Regression	10-Fold	0.9401232

# Ensure the LASSO model exists
if (!exists("lasso_10fold_50")) {
  stop("🚨 ERROR: LASSO model not found. Check training step before plotting.")
}

# Extract best lambda from trained LASSO model
best_lambda <- lasso_10fold_50$bestTune$lambda

# Get LASSO coefficients using the best lambda
lasso_coeff <- coef(lasso_10fold_50$finalModel, s = best_lambda)

# Convert coefficients to a dataframe
lasso_coeff <- as.matrix(lasso_coeff)
lasso_coeff <- data.frame(Gene = rownames(lasso_coeff), Coefficient = lasso_coeff[, 1])

# Remove intercept
lasso_coeff <- lasso_coeff[lasso_coeff$Gene != "(Intercept)", ]

# Remove tiny coefficients (less than 0.01 in absolute value)
lasso_coeff <- lasso_coeff[abs(lasso_coeff$Coefficient) > 0.01, ]

# Sort by absolute coefficient value (important features first)
lasso_coeff <- lasso_coeff[order(abs(lasso_coeff$Coefficient), decreasing = TRUE), ]

# Create interactive bar plot
p3 <- ggplot(lasso_coeff, aes(x = reorder(Gene, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
  geom_bar(stat = "identity", alpha = 0.8, width = 0.7) +
  coord_flip() +
  scale_fill_manual(values = c("red", "blue"), labels = c("Negative", "Positive")) +
  theme_minimal() +
  labs(title = "LASSO Regression: Top Important Features", 
       x = "Genes", 
       y = "Coefficient Value",
       fill = "Effect on Prediction") +
  theme(legend.position = "bottom")

ggplotly(p3)  # Convert to interactive plot

7. Discussion: Model Performance Analysis

This study evaluates Logistic Regression (GLM), K-Nearest Neighbors (KNN), and LASSO Regression for classifying ER-positive vs. ER-negative breast cancer samples. The performance of these models was assessed using 5-fold and 10-fold cross-validation, and further improved using feature selection and LASSO regression.

Key Observations:

Logistic Regression vs. KNN:
- Logistic Regression consistently performs well, achieving an accuracy of ~93%.
- KNN has slightly higher accuracy (~94%) in some cases but may suffer from increased computational complexity and sensitivity to noise.
Effect of Cross-Validation Strategy:
- The results indicate that 10-fold cross-validation provides slightly better accuracy compared to 5-fold cross-validation.
- The improvement is marginal but suggests that having more training data per fold leads to better generalization.
Feature Selection (Top 50 vs. Top 100 Genes):
- Using all genes may introduce noise and reduce model performance.
- After selecting the top 50 and top 100 genes, accuracy remains stable (~93%), confirming that many genes contribute little to classification.
- This means that reducing the number of features can maintain performance while improving model efficiency.
LASSO Regression Performance:
- LASSO regression achieves the highest accuracy (~95%), outperforming both Logistic Regression and KNN.
- The LASSO penalty eliminates less important genes, leading to better generalization and reduced overfitting.

Final Thoughts & Recommendations:

LASSO Regression is the best model for this classification task due to its ability to select the most informative genes and avoid overfitting.
Feature selection is beneficial—removing unimportant genes can maintain accuracy while improving computational efficiency.
10-fold cross-validation is recommended for model evaluation, as it provides better generalization over 5-fold cross-validation.
Future improvements could include experimenting with ensemble methods (Random Forest, XGBoost) or deep learning for more complex relationships.

🚀 Overall, LASSO regression combined with feature selection offers the best classification performance! 🚀