N-fold cross-validation
Name :- Grandhe Venkata Guna Sundhar
M Number :- M16552512
1. Load and Prepare the Data
# Use the correct dataset
file <- "TCGA_breast_cancer_ERpositive_vs_ERnegative_PAM50.tsv"
# Load dataset
data <- read.table(file, header=TRUE, sep="\t", stringsAsFactors=FALSE)
# Extract ER status (First row, excluding 'id' column)
er_status <- as.factor(as.character(unlist(data[1, -1])))
# Extract Gene Expression Data
gene_expression <- as.data.frame(t(data[-1, -1]))
colnames(gene_expression) <- data[-1, 1]
gene_expression[] <- lapply(gene_expression, as.numeric)
# Convert gene expression matrix to long format for ggplot
gene_data_long <- reshape2::melt(gene_expression, variable.name = "Gene", value.name = "Expression")
## No id variables; using all as measure variables
gene_data_long$Sample <- rep(rownames(gene_expression), times = ncol(gene_expression))
# Create histogram
p1 <- ggplot(gene_data_long, aes(x = Expression)) +
geom_histogram(binwidth = 1, fill = "blue", alpha = 0.7, color = "black") +
theme_minimal() +
labs(title = "Distribution of Gene Expression Values", x = "Expression Level", y = "Frequency")
ggplotly(p1) # Convert to interactive plot
2. Train Logistic Regression and KNN Models with
Cross-Validation
# Define Cross-Validation Control
cv5 <- trainControl(method = "cv", number = 5)
cv10 <- trainControl(method = "cv", number = 10)
# Train Logistic Regression Model
log_reg_5 <- train(x = gene_expression, y = er_status, method = "glm",
family = "binomial", trControl = cv5)
log_reg_10 <- train(x = gene_expression, y = er_status, method = "glm",
family = "binomial", trControl = cv10)
# Train Nearest Neighbor Classifier (KNN)
knn_5 <- train(x = gene_expression, y = er_status, method = "knn",
tuneLength = 3, trControl = cv5)
knn_10 <- train(x = gene_expression, y = er_status, method = "knn",
tuneLength = 3, trControl = cv10)
3. Display Model Performance
# Store Results in a Table
results <- data.frame(
Model = c("Logistic Regression", "Logistic Regression", "Nearest Neighbor", "Nearest Neighbor"),
Cross_Validation = c("5-Fold", "10-Fold", "5-Fold", "10-Fold"),
Mean_Accuracy = c(mean(log_reg_5$resample$Accuracy, na.rm=TRUE),
mean(log_reg_10$resample$Accuracy, na.rm=TRUE),
mean(knn_5$resample$Accuracy, na.rm=TRUE),
mean(knn_10$resample$Accuracy, na.rm=TRUE))
)
# Print Results Table
kable(results, caption = "Cross-Validation Accuracy for ER Prediction")
Cross-Validation Accuracy for ER Prediction
| Logistic Regression |
5-Fold |
0.9343785 |
| Logistic Regression |
10-Fold |
0.9323922 |
| Nearest Neighbor |
5-Fold |
0.9401616 |
| Nearest Neighbor |
10-Fold |
0.9421005 |
### 3.1. Comparison Between Logistic Regression and NNC (2x2 Table)
# Generate predictions for both models (5-fold cross-validation)
logistic_pred_5 <- predict(log_reg_5, gene_expression)
knn_pred_5 <- predict(knn_5, gene_expression)
# Generate predictions for both models (10-fold cross-validation)
logistic_pred_10 <- predict(log_reg_10, gene_expression)
knn_pred_10 <- predict(knn_10, gene_expression)
# Compute confusion matrices for 5-fold models
conf_matrix_log_5 <- confusionMatrix(logistic_pred_5, er_status)
conf_matrix_knn_5 <- confusionMatrix(knn_pred_5, er_status)
# Compute confusion matrices for 10-fold models
conf_matrix_log_10 <- confusionMatrix(logistic_pred_10, er_status)
conf_matrix_knn_10 <- confusionMatrix(knn_pred_10, er_status)
# Create a 2x2 comparison table for 5-fold cross-validation
comparison_table_5 <- data.frame(
Model = c("Logistic Regression (5-Fold)", "Nearest Neighbor (5-Fold)"),
Accuracy = c(conf_matrix_log_5$overall['Accuracy'], conf_matrix_knn_5$overall['Accuracy']),
Sensitivity = c(conf_matrix_log_5$byClass['Sensitivity'], conf_matrix_knn_5$byClass['Sensitivity']),
Specificity = c(conf_matrix_log_5$byClass['Specificity'], conf_matrix_knn_5$byClass['Specificity'])
)
# Create a 2x2 comparison table for 10-fold cross-validation
comparison_table_10 <- data.frame(
Model = c("Logistic Regression (10-Fold)", "Nearest Neighbor (10-Fold)"),
Accuracy = c(conf_matrix_log_10$overall['Accuracy'], conf_matrix_knn_10$overall['Accuracy']),
Sensitivity = c(conf_matrix_log_10$byClass['Sensitivity'], conf_matrix_knn_10$byClass['Sensitivity']),
Specificity = c(conf_matrix_log_10$byClass['Specificity'], conf_matrix_knn_10$byClass['Specificity'])
)
# Display the 2x2 comparison tables
kable(comparison_table_5, caption = "Comparison of Logistic Regression and NNC (5-Fold CV)")
Comparison of Logistic Regression and NNC (5-Fold CV)
| Logistic Regression (5-Fold) |
0.9498069 |
0.8808511 |
0.9700375 |
| Nearest Neighbor (5-Fold) |
0.9469112 |
0.8936170 |
0.9625468 |
kable(comparison_table_10, caption = "Comparison of Logistic Regression and NNC (10-Fold CV)")
Comparison of Logistic Regression and NNC (10-Fold
CV)
| Logistic Regression (10-Fold) |
0.9498069 |
0.8808511 |
0.9700375 |
| Nearest Neighbor (10-Fold) |
0.9440154 |
0.8851064 |
0.9612984 |
# Create dataframe for plotting
accuracy_data <- data.frame(
Model = c("KNN 10-Fold", "KNN 5-Fold", "LogReg 10-Fold", "LogReg 5-Fold"),
Accuracy = c(
mean(knn_10$resample$Accuracy, na.rm=TRUE),
mean(knn_5$resample$Accuracy, na.rm=TRUE),
mean(log_reg_10$resample$Accuracy, na.rm=TRUE),
mean(log_reg_5$resample$Accuracy, na.rm=TRUE)
)
)
# Create interactive bar plot
p2 <- ggplot(accuracy_data, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity", alpha = 0.8, width = 0.6) +
geom_text(aes(label = round(Accuracy, 3)), vjust = -0.5, size = 4) +
theme_minimal() +
labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
theme(axis.text.x = element_text(angle = 20, hjust = 1, vjust = 1))
ggplotly(p2) # Convert to interactive plot
4. Feature Selection: Selecting Top 50 and 100 Genes
# Remove genes with standard deviation < 1
gene_sd <- apply(gene_expression, 2, sd, na.rm = TRUE)
filtered_genes <- gene_expression[, gene_sd >= 1]
# Perform t-test to select top 50 and top 100 genes
t_stat <- apply(filtered_genes, 2, function(x) t.test(x ~ er_status)$statistic)
# Select top genes based on t-test scores
top_50_genes <- intersect(names(sort(t_stat, decreasing = TRUE))[1:50], colnames(filtered_genes))
top_100_genes <- intersect(names(sort(t_stat, decreasing = TRUE))[1:100], colnames(filtered_genes))
# Subset dataset
X_top50 <- filtered_genes[, top_50_genes, drop = FALSE]
X_top100 <- filtered_genes[, top_100_genes, drop = FALSE]
5.Train Logistic Regression with Feature-Selected Genes
# Train Logistic Regression models with top 50 and 100 genes
logistic_5fold_50 <- train(x = X_top50, y = as.factor(er_status), method = "glm",
family = "binomial", trControl = cv5)
logistic_10fold_50 <- train(x = X_top50, y = as.factor(er_status), method = "glm",
family = "binomial", trControl = cv10)
logistic_5fold_100 <- train(x = X_top100, y = as.factor(er_status), method = "glm",
family = "binomial", trControl = cv5)
logistic_10fold_100 <- train(x = X_top100, y = as.factor(er_status), method = "glm",
family = "binomial", trControl = cv10)
# Store Feature Selection Results
results_feature_selection <- data.frame(
Model = c("Logistic Regression", "Logistic Regression", "Logistic Regression", "Logistic Regression"),
Top_Genes = c(50, 50, 100, 100),
Cross_Validation = c("5-Fold", "10-Fold", "5-Fold", "10-Fold"),
Mean_Accuracy = c(
mean(logistic_5fold_50$resample$Accuracy),
mean(logistic_10fold_50$resample$Accuracy),
mean(logistic_5fold_100$resample$Accuracy),
mean(logistic_10fold_100$resample$Accuracy)
)
)
# Print Feature Selection Results
kable(results_feature_selection, caption = "Accuracy after Feature Selection")
Accuracy after Feature Selection
| Logistic Regression |
50 |
5-Fold |
0.9362876 |
| Logistic Regression |
50 |
10-Fold |
0.9295743 |
| Logistic Regression |
100 |
5-Fold |
0.9372306 |
| Logistic Regression |
100 |
10-Fold |
0.9343251 |