# ComBat + SVM inside CV loop
library(GEOquery)
## Loading required package: Biobase
## Warning: package 'Biobase' was built under R version 4.5.2
## Loading required package: BiocGenerics
## Warning: package 'BiocGenerics' was built under R version 4.5.2
## Loading required package: generics
##
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
##
## as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
## setequal, union
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
## unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(limma)
## Warning: package 'limma' was built under R version 4.5.2
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 4.5.2
## This is mgcv 1.9-4. For overview type '?mgcv'.
## Loading required package: genefilter
## Warning: package 'genefilter' was built under R version 4.5.2
## Loading required package: BiocParallel
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.2
##
## Attaching package: 'e1071'
## The following object is masked from 'package:generics':
##
## interpolate
library(caret)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
##
## element
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.5.2
##
## Attaching package: 'caret'
## The following object is masked from 'package:generics':
##
## train
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:BiocGenerics':
##
## var
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# 1. Download GEO datasets
gse1 <- getGEO("GSE63060", GSEMatrix = TRUE)[[1]]
## Found 1 file(s)
## GSE63060_series_matrix.txt.gz
gse2 <- getGEO("GSE63061", GSEMatrix = TRUE)[[1]]
## Found 1 file(s)
## GSE63061_series_matrix.txt.gz
expr1 <- exprs(gse1)
expr2 <- exprs(gse2)
status1 <- pData(gse1)[, "status:ch1"]
status2 <- pData(gse2)[, "status:ch1"]
# 2. Keep common probes
common_probes <- intersect(rownames(expr1), rownames(expr2))
expr_combined <- cbind(
expr1[common_probes, ],
expr2[common_probes, ]
)
status_combined <- c(
as.character(status1),
as.character(status2)
)
batch_combined <- c(
rep("GSE63060", ncol(expr1)),
rep("GSE63061", ncol(expr2))
)
cat("Common probes:", length(common_probes), "\n")
## Common probes: 25549
cat("Total samples:", ncol(expr_combined), "\n")
## Total samples: 717
print(table(status_combined))
## status_combined
## AD borderline MCI CTL CTL to AD MCI
## 284 3 238 1 189
## MCI to CTL OTHER
## 1 1
print(table(batch_combined))
## batch_combined
## GSE63060 GSE63061
## 329 388
# 3. Keep selected groups
# Example: AD vs CTL only
keep <- status_combined %in% c("AD", "CTL")
expr_clean <- expr_combined[, keep]
status_clean <- factor(status_combined[keep], levels = c("CTL", "AD"))
batch_clean <- factor(batch_combined[keep])
# 4. Annotation: probe -> gene symbol
gpl_v3 <- getGEO("GPL6947")
## Using locally cached version of GPL6947 found here:
## /var/folders/db/7cy83j0n49z52b9xb686gbhm0000gn/T//Rtmp834kSh/GPL6947.soft.gz
gpl_v4 <- getGEO("GPL10558")
## Using locally cached version of GPL10558 found here:
## /var/folders/db/7cy83j0n49z52b9xb686gbhm0000gn/T//Rtmp834kSh/GPL10558.soft.gz
anno_v3 <- Table(gpl_v3)[, c("ID", "Symbol")]
anno_v4 <- Table(gpl_v4)[, c("ID", "Symbol")]
anno_combined <- rbind(anno_v3, anno_v4)
anno_combined <- anno_combined[!duplicated(anno_combined$ID), ]
anno_combined <- anno_combined[anno_combined$Symbol != "", ]
valid_probes <- intersect(rownames(expr_clean), anno_combined$ID)
expr_clean <- expr_clean[valid_probes, ]
matched <- match(rownames(expr_clean), anno_combined$ID)
rownames(expr_clean) <- anno_combined$Symbol[matched]
expr_clean <- avereps(expr_clean)
cat("Final genes:", nrow(expr_clean), "\n")
## Final genes: 17386
cat("Final samples:", ncol(expr_clean), "\n")
## Final samples: 522
print(table(status_clean))
## status_clean
## CTL AD
## 238 284
print(table(batch_clean))
## batch_clean
## GSE63060 GSE63061
## 249 273
# 5. Cross-validation setup
set.seed(123)
folds <- createFolds(status_clean, k = 5, returnTrain = FALSE)
results <- data.frame(
fold = integer(),
accuracy = numeric(),
sensitivity = numeric(),
specificity = numeric(),
auc = numeric()
)
all_predictions <- data.frame()
# 6. CV loop
for (i in seq_along(folds)) {
cat("Running fold", i, "\n")
test_idx <- folds[[i]]
train_idx <- setdiff(seq_along(status_clean), test_idx)
x_train_raw <- expr_clean[, train_idx]
x_test_raw <- expr_clean[, test_idx]
y_train <- status_clean[train_idx]
y_test <- status_clean[test_idx]
batch_train <- batch_clean[train_idx]
batch_test <- batch_clean[test_idx]
# 6.1 Feature selection inside CV
design_train <- model.matrix(~ y_train)
fit <- lmFit(x_train_raw, design_train)
fit <- eBayes(fit)
de_table <- topTable(
fit,
coef = 2,
number = Inf,
sort.by = "P"
)
top_genes <- rownames(de_table)[1:min(100, nrow(de_table))]
x_train_fs <- x_train_raw[top_genes, ]
x_test_fs <- x_test_raw[top_genes, ]
# 6.2 ComBat inside CV
# train fold only
mod_train <- model.matrix(~ y_train)
x_train_combat <- ComBat(
dat = x_train_fs,
batch = batch_train,
mod = mod_train,
par.prior = TRUE,
prior.plots = FALSE
)
# 6.3 Apply train-based scaling to test
train_means <- rowMeans(x_train_combat)
train_sds <- apply(x_train_combat, 1, sd)
train_sds[train_sds == 0] <- 1
x_train_scaled <- scale(
t(x_train_combat),
center = train_means,
scale = train_sds
)
x_test_scaled <- scale(
t(x_test_fs),
center = train_means,
scale = train_sds
)
train_df <- data.frame(
diagnosis = y_train,
x_train_scaled
)
test_df <- data.frame(
diagnosis = y_test,
x_test_scaled
)
# 6.4 Train SVM
svm_model <- svm(
diagnosis ~ .,
data = train_df,
kernel = "linear",
probability = TRUE,
scale = FALSE
)
pred_class <- predict(
svm_model,
newdata = test_df,
probability = TRUE
)
pred_prob <- attr(pred_class, "probabilities")[, "AD"]
cm <- confusionMatrix(
pred_class,
y_test,
positive = "AD"
)
auc_value <- auc(
response = y_test,
predictor = pred_prob,
levels = c("CTL", "AD")
)
results <- rbind(
results,
data.frame(
fold = i,
accuracy = cm$overall["Accuracy"],
sensitivity = cm$byClass["Sensitivity"],
specificity = cm$byClass["Specificity"],
auc = as.numeric(auc_value)
)
)
all_predictions <- rbind(
all_predictions,
data.frame(
fold = i,
true_label = y_test,
predicted_label = pred_class,
prob_AD = pred_prob
)
)
}
## Running fold 1
## Found2batches
## Adjusting for1covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Setting direction: controls < cases
## Running fold 2
## Found2batches
## Adjusting for1covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Setting direction: controls < cases
## Running fold 3
## Found2batches
## Adjusting for1covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Setting direction: controls < cases
## Running fold 4
## Found2batches
## Adjusting for1covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Setting direction: controls < cases
## Running fold 5
## Found2batches
## Adjusting for1covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Setting direction: controls < cases
# 7. Final CV performance
print(results)
## fold accuracy sensitivity specificity auc
## Accuracy 1 0.5096154 0.5000000 0.5208333 0.6283482
## Accuracy1 2 0.5047619 0.4561404 0.5625000 0.6619152
## Accuracy2 3 0.5000000 0.4736842 0.5319149 0.6364315
## Accuracy3 4 0.6190476 0.6315789 0.6041667 0.7540205
## Accuracy4 5 0.5865385 0.5789474 0.5957447 0.7331094
cat("Mean Accuracy:", mean(results$accuracy), "\n")
## Mean Accuracy: 0.5439927
cat("Mean Sensitivity:", mean(results$sensitivity), "\n")
## Mean Sensitivity: 0.5280702
cat("Mean Specificity:", mean(results$specificity), "\n")
## Mean Specificity: 0.5630319
cat("Mean AUC:", mean(results$auc), "\n")
## Mean AUC: 0.682765
library(pROC)
roc_obj <- roc(all_predictions$true_label,
all_predictions$prob_AD,
levels = c("CTL", "AD"))
## Setting direction: controls < cases
plot(roc_obj, col="blue", main="ROC Curve")
auc(roc_obj)
## Area under the curve: 0.6523
table(all_predictions$true_label,
all_predictions$predicted_label)
##
## CTL AD
## CTL 134 104
## AD 134 150
boxplot(results$auc,
main="AUC across folds",
ylab="AUC")
pca <- prcomp(t(expr_clean))
plot(pca$x[,1:2],
col=batch_clean,
pch=16,
main="PCA before correction")
Gene expression data contains predictive signal for AD classification, but the performance is limited (AUC ≈ 0.68). The relatively low accuracy (~0.54) suggests that gene expression alone may not be sufficient for reliable diagnosis. Sensitivity (~0.53) is relatively low,which may limit the model’s usefulness in screening settings. The strong batch effect observed highlights the importance of proper batch correction within the modelling pipeline.