# ComBat + SVM inside CV loop

library(GEOquery)
## Loading required package: Biobase
## Warning: package 'Biobase' was built under R version 4.5.2
## Loading required package: BiocGenerics
## Warning: package 'BiocGenerics' was built under R version 4.5.2
## Loading required package: generics
## 
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
##     setequal, union
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
##     unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(limma)
## Warning: package 'limma' was built under R version 4.5.2
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 4.5.2
## This is mgcv 1.9-4. For overview type '?mgcv'.
## Loading required package: genefilter
## Warning: package 'genefilter' was built under R version 4.5.2
## Loading required package: BiocParallel
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.2
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:generics':
## 
##     interpolate
library(caret)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
## 
##     element
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.5.2
## 
## Attaching package: 'caret'
## The following object is masked from 'package:generics':
## 
##     train
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:BiocGenerics':
## 
##     var
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# 1. Download GEO datasets

gse1 <- getGEO("GSE63060", GSEMatrix = TRUE)[[1]]
## Found 1 file(s)
## GSE63060_series_matrix.txt.gz
gse2 <- getGEO("GSE63061", GSEMatrix = TRUE)[[1]]
## Found 1 file(s)
## GSE63061_series_matrix.txt.gz
expr1 <- exprs(gse1)
expr2 <- exprs(gse2)

status1 <- pData(gse1)[, "status:ch1"]
status2 <- pData(gse2)[, "status:ch1"]
# 2. Keep common probes
common_probes <- intersect(rownames(expr1), rownames(expr2))

expr_combined <- cbind(
  expr1[common_probes, ],
  expr2[common_probes, ]
)

status_combined <- c(
  as.character(status1),
  as.character(status2)
)

batch_combined <- c(
  rep("GSE63060", ncol(expr1)),
  rep("GSE63061", ncol(expr2))
)

cat("Common probes:", length(common_probes), "\n")
## Common probes: 25549
cat("Total samples:", ncol(expr_combined), "\n")
## Total samples: 717
print(table(status_combined))
## status_combined
##             AD borderline MCI            CTL      CTL to AD            MCI 
##            284              3            238              1            189 
##     MCI to CTL          OTHER 
##              1              1
print(table(batch_combined))
## batch_combined
## GSE63060 GSE63061 
##      329      388
# 3. Keep selected groups
# Example: AD vs CTL only

keep <- status_combined %in% c("AD", "CTL")

expr_clean <- expr_combined[, keep]
status_clean <- factor(status_combined[keep], levels = c("CTL", "AD"))
batch_clean <- factor(batch_combined[keep])
# 4. Annotation: probe -> gene symbol

gpl_v3 <- getGEO("GPL6947")
## Using locally cached version of GPL6947 found here:
## /var/folders/db/7cy83j0n49z52b9xb686gbhm0000gn/T//Rtmp834kSh/GPL6947.soft.gz
gpl_v4 <- getGEO("GPL10558")
## Using locally cached version of GPL10558 found here:
## /var/folders/db/7cy83j0n49z52b9xb686gbhm0000gn/T//Rtmp834kSh/GPL10558.soft.gz
anno_v3 <- Table(gpl_v3)[, c("ID", "Symbol")]
anno_v4 <- Table(gpl_v4)[, c("ID", "Symbol")]

anno_combined <- rbind(anno_v3, anno_v4)
anno_combined <- anno_combined[!duplicated(anno_combined$ID), ]
anno_combined <- anno_combined[anno_combined$Symbol != "", ]

valid_probes <- intersect(rownames(expr_clean), anno_combined$ID)

expr_clean <- expr_clean[valid_probes, ]

matched <- match(rownames(expr_clean), anno_combined$ID)
rownames(expr_clean) <- anno_combined$Symbol[matched]

expr_clean <- avereps(expr_clean)

cat("Final genes:", nrow(expr_clean), "\n")
## Final genes: 17386
cat("Final samples:", ncol(expr_clean), "\n")
## Final samples: 522
print(table(status_clean))
## status_clean
## CTL  AD 
## 238 284
print(table(batch_clean))
## batch_clean
## GSE63060 GSE63061 
##      249      273
# 5. Cross-validation setup

set.seed(123)

folds <- createFolds(status_clean, k = 5, returnTrain = FALSE)

results <- data.frame(
  fold = integer(),
  accuracy = numeric(),
  sensitivity = numeric(),
  specificity = numeric(),
  auc = numeric()
)

all_predictions <- data.frame()
# 6. CV loop

for (i in seq_along(folds)) {
  
  cat("Running fold", i, "\n")
  
  test_idx <- folds[[i]]
  train_idx <- setdiff(seq_along(status_clean), test_idx)
  
  x_train_raw <- expr_clean[, train_idx]
  x_test_raw  <- expr_clean[, test_idx]
  
  y_train <- status_clean[train_idx]
  y_test  <- status_clean[test_idx]
  
  batch_train <- batch_clean[train_idx]
  batch_test  <- batch_clean[test_idx]

  # 6.1 Feature selection inside CV
  
  design_train <- model.matrix(~ y_train)
  fit <- lmFit(x_train_raw, design_train)
  fit <- eBayes(fit)
  
  de_table <- topTable(
    fit,
    coef = 2,
    number = Inf,
    sort.by = "P"
  )
  
  top_genes <- rownames(de_table)[1:min(100, nrow(de_table))]
  
  x_train_fs <- x_train_raw[top_genes, ]
  x_test_fs  <- x_test_raw[top_genes, ]

  # 6.2 ComBat inside CV
  #  train fold only
 
  mod_train <- model.matrix(~ y_train)
  
  x_train_combat <- ComBat(
    dat = x_train_fs,
    batch = batch_train,
    mod = mod_train,
    par.prior = TRUE,
    prior.plots = FALSE
  )
  

  # 6.3 Apply train-based scaling to test

  
  train_means <- rowMeans(x_train_combat)
  train_sds <- apply(x_train_combat, 1, sd)
  train_sds[train_sds == 0] <- 1
  
  x_train_scaled <- scale(
    t(x_train_combat),
    center = train_means,
    scale = train_sds
  )
  
  x_test_scaled <- scale(
    t(x_test_fs),
    center = train_means,
    scale = train_sds
  )
  
  train_df <- data.frame(
    diagnosis = y_train,
    x_train_scaled
  )
  
  test_df <- data.frame(
    diagnosis = y_test,
    x_test_scaled
  )
  
  # 6.4 Train SVM
  
  svm_model <- svm(
    diagnosis ~ .,
    data = train_df,
    kernel = "linear",
    probability = TRUE,
    scale = FALSE
  )
  
  pred_class <- predict(
    svm_model,
    newdata = test_df,
    probability = TRUE
  )
  
  pred_prob <- attr(pred_class, "probabilities")[, "AD"]
  
  cm <- confusionMatrix(
    pred_class,
    y_test,
    positive = "AD"
  )
  
  auc_value <- auc(
    response = y_test,
    predictor = pred_prob,
    levels = c("CTL", "AD")
  )
  
  results <- rbind(
    results,
    data.frame(
      fold = i,
      accuracy = cm$overall["Accuracy"],
      sensitivity = cm$byClass["Sensitivity"],
      specificity = cm$byClass["Specificity"],
      auc = as.numeric(auc_value)
    )
  )
  
  all_predictions <- rbind(
    all_predictions,
    data.frame(
      fold = i,
      true_label = y_test,
      predicted_label = pred_class,
      prob_AD = pred_prob
    )
  )
}
## Running fold 1
## Found2batches
## Adjusting for1covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Setting direction: controls < cases
## Running fold 2
## Found2batches
## Adjusting for1covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Setting direction: controls < cases
## Running fold 3
## Found2batches
## Adjusting for1covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Setting direction: controls < cases
## Running fold 4
## Found2batches
## Adjusting for1covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Setting direction: controls < cases
## Running fold 5
## Found2batches
## Adjusting for1covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Setting direction: controls < cases
# 7. Final CV performance

print(results)
##           fold  accuracy sensitivity specificity       auc
## Accuracy     1 0.5096154   0.5000000   0.5208333 0.6283482
## Accuracy1    2 0.5047619   0.4561404   0.5625000 0.6619152
## Accuracy2    3 0.5000000   0.4736842   0.5319149 0.6364315
## Accuracy3    4 0.6190476   0.6315789   0.6041667 0.7540205
## Accuracy4    5 0.5865385   0.5789474   0.5957447 0.7331094
cat("Mean Accuracy:", mean(results$accuracy), "\n")
## Mean Accuracy: 0.5439927
cat("Mean Sensitivity:", mean(results$sensitivity), "\n")
## Mean Sensitivity: 0.5280702
cat("Mean Specificity:", mean(results$specificity), "\n")
## Mean Specificity: 0.5630319
cat("Mean AUC:", mean(results$auc), "\n")
## Mean AUC: 0.682765
library(pROC)

roc_obj <- roc(all_predictions$true_label,
               all_predictions$prob_AD,
               levels = c("CTL", "AD"))
## Setting direction: controls < cases
plot(roc_obj, col="blue", main="ROC Curve")

auc(roc_obj)
## Area under the curve: 0.6523
table(all_predictions$true_label,
      all_predictions$predicted_label)
##      
##       CTL  AD
##   CTL 134 104
##   AD  134 150
boxplot(results$auc,
        main="AUC across folds",
        ylab="AUC")

pca <- prcomp(t(expr_clean))

plot(pca$x[,1:2],
     col=batch_clean,
     pch=16,
     main="PCA before correction")

Gene expression data contains predictive signal for AD classification, but the performance is limited (AUC ≈ 0.68). The relatively low accuracy (~0.54) suggests that gene expression alone may not be sufficient for reliable diagnosis. Sensitivity (~0.53) is relatively low,which may limit the model’s usefulness in screening settings. The strong batch effect observed highlights the importance of proper batch correction within the modelling pipeline.