1 Data Download and Preprocessing

Dwnloaded a GEO series with a continuous phenotype and normalized the data.

# 1. Download GEO series 
gset_list <- getGEO("GSE53890", GSEMatrix = TRUE)
gset      <- gset_list[[1]]
expr      <- exprs(gset)                 # Gene × Sample matrix
pheno_chr <- pData(gset)$age             # Phenotype as character

# 2. Convert phenotype to numeric
pheno_num <- as.numeric(gsub("[^0-9\\.]", "", as.character(pheno_chr)))
stopifnot(length(pheno_num) == ncol(expr))

# 3. Plot raw intensities
boxplot(expr,
        las = 2,
        main = "Raw Intensities per Sample",
        outline = FALSE)

# 4. Quantile normalization
expr_norm <- normalizeBetweenArrays(expr, method = "quantile")

# 5. Plot normalized intensities
boxplot(expr_norm,
        las = 2,
        main = "Normalized Intensities per Sample",
        outline = FALSE)

Summary:
The boxplots before and after normalization show whether technical variation across arrays has been removed. Post-normalization shows better alignment across arrays.

2 Feature Screening

Compute Pearson correlation of each gene with the continuous phenotype and visualize the top predictor.

cors      <- apply(expr_norm, 1, function(g) cor(g, pheno_num, use = "complete.obs"))
best_gene <- names(which.max(abs(cors)))
best_r    <- cors[best_gene]

plot(pheno_num,
     expr_norm[best_gene, ],
     xlab = "Phenotype (Age)",
     ylab = paste("Expression of", best_gene),
     main = sprintf("%s vs Phenotype (r = %.2f)", best_gene, best_r),
     pch  = 16)
abline(lm(expr_norm[best_gene, ] ~ pheno_num), col = "blue", lwd = 2)

Summary:
The scatterplot and regression line illustrate the strength and direction of the association between the top gene’s expression and the phenotype.

3 50-Simulation 5-Fold Cross-Validation

Run 50 simulations of 5-fold CV, selecting the gene with lowest validation MSE for each fold.

n_folds     <- 5
n_sims      <- 50
all_best_50 <- character(n_sims * n_folds)
all_mse_50  <- numeric(n_sims * n_folds)
idx         <- 1

for (sim in seq_len(n_sims)) {
  folds <- createFolds(pheno_num, k = n_folds, returnTrain = TRUE)
  for (f in seq_along(folds)) {
    train_idx <- folds[[f]]
    valid_idx <- setdiff(seq_along(pheno_num), train_idx)

    mse_per_gene <- sapply(rownames(expr_norm), function(gene) {
      y_tr <- pheno_num[train_idx]; x_tr <- expr_norm[gene, train_idx]
      y_va <- pheno_num[valid_idx]; x_va <- expr_norm[gene, valid_idx]
      df_tr <- data.frame(y = y_tr, x = x_tr)
      df_va <- data.frame(x = x_va)
      fit   <- lm(y ~ x, data = df_tr)
      pred  <- predict(fit, newdata = df_va)
      mean((y_va - pred)^2)
    })

    best             <- names(which.min(mse_per_gene))
    all_best_50[idx] <- best
    fit_full         <- lm(pheno_num[train_idx] ~ expr_norm[best, train_idx])
    all_mse_50[idx]  <- mean(resid(fit_full)^2)
    idx              <- idx + 1
  }
}

# Summary
top5_50 <- head(sort(table(all_best_50), decreasing = TRUE), 5)
cat("Top 5 Genes (50 sims):\n")

## Top 5 Genes (50 sims):

print(top5_50)

## all_best_50
## 220030_at 236902_at 224856_at 226374_at 230765_at 
##        11         9         8         8         8

cat("\nTraining-set MSE summary (50 sims):\n")

## 
## Training-set MSE summary (50 sims):

print(summary(all_mse_50))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   231.9   318.1   372.8   396.1   457.8   702.6

Summary:
Tally the most frequently selected genes and the distribution of training-set MSEs across 250 folds.

4 250-Simulation 5-Fold Cross-Validation

Repeat the above for 250 simulations.

n_folds      <- 5
n_sims       <- 250
all_best_250 <- character(n_sims * n_folds)
all_mse_250  <- numeric(n_sims * n_folds)
idx          <- 1

for (sim in seq_len(n_sims)) {
  folds <- createFolds(pheno_num, k = n_folds, returnTrain = TRUE)
  for (f in seq_along(folds)) {
    train_idx <- folds[[f]]
    valid_idx <- setdiff(seq_along(pheno_num), train_idx)

    mse_per_gene <- sapply(rownames(expr_norm), function(gene) {
      y_tr <- pheno_num[train_idx]; x_tr <- expr_norm[gene, train_idx]
      y_va <- pheno_num[valid_idx]; x_va <- expr_norm[gene, valid_idx]
      df_tr <- data.frame(y = y_tr, x = x_tr)
      df_va <- data.frame(x = x_va)
      fit   <- lm(y ~ x, data = df_tr)
      pred  <- predict(fit, newdata = df_va)
      mean((y_va - pred)^2)
    })

    best              <- names(which.min(mse_per_gene))
    all_best_250[idx] <- best
    fit_full          <- lm(pheno_num[train_idx] ~ expr_norm[best, train_idx])
    all_mse_250[idx]  <- mean(resid(fit_full)^2)
    idx               <- idx + 1
  }
}

# Summary
top5_250 <- head(sort(table(all_best_250), decreasing = TRUE), 5)
cat("Top 5 Genes (250 sims):\n")

## Top 5 Genes (250 sims):

print(top5_250)

## all_best_250
## 226374_at 220030_at 224856_at 236902_at 203367_at 
##        45        41        41        38        25

cat("\nTraining-set MSE summary (250 sims):\n")

## 
## Training-set MSE summary (250 sims):

print(summary(all_mse_250))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   213.1   320.5   387.5   401.1   471.1   706.1

Summary:
A higher simulation count should stabilize gene-selection frequencies and MSE estimates.

5 Comparison of Results

par(mfrow = c(1, 2), mar = c(5, 4, 4, 2))
barplot(head(sort(table(all_best_50), decreasing = TRUE), 5), las = 2,
        ylab = "Count", main = "Top 5 Genes (50 sims)")
barplot(head(sort(table(all_best_250), decreasing = TRUE), 5), las = 2,
        ylab = "Count", main = "Top 5 Genes (250 sims)")

par(mfrow = c(1, 1))
boxplot(list("50 sims" = all_mse_50, "250 sims" = all_mse_250),
        ylab  = "Training MSE",
        main  = "Effect of Simulation Count on MSE")

Summary:
The barplots compare top-gene frequencies, and the boxplots compare error distributions between simulation sizes.

6 Outlier Sensitivity Check

keep         <- which(pheno_num < 100)
expr_sub     <- expr_norm[, keep, drop = FALSE]
age_sub      <- pheno_num[keep]
n_folds      <- 5
n_sims       <- 20
all_best_sub <- character(n_sims * n_folds)
idx          <- 1

for (sim in seq_len(n_sims)) {
  folds <- createFolds(age_sub, k = n_folds, returnTrain = TRUE)
  for (f in seq_along(folds)) {
    train_idx <- folds[[f]]
    valid_idx <- setdiff(seq_along(age_sub), train_idx)

    mse_per_gene <- sapply(rownames(expr_sub), function(gene) {
      y_tr <- age_sub[train_idx]; x_tr <- expr_sub[gene, train_idx]
      y_va <- age_sub[valid_idx]; x_va <- expr_sub[gene, valid_idx]
      df_tr <- data.frame(y = y_tr, x = x_tr)
      df_va <- data.frame(x = x_va)
      fit   <- lm(y ~ x, data = df_tr)
      pred  <- predict(fit, newdata = df_va)
      mean((y_va - pred)^2)
    })

    all_best_sub[idx] <- names(which.min(mse_per_gene))
    idx               <- idx + 1
  }
}

cat("Gene selection counts (without outliers):\n")

## Gene selection counts (without outliers):

print(table(all_best_sub))

## all_best_sub
##   1553864_at 1554112_a_at   1554607_at   1556365_at   1556432_at   1557180_at 
##            1            1            1            1            1            3 
## 1569013_s_at  200067_x_at    201237_at  202829_s_at  202975_s_at  203080_s_at 
##            1            3            3            3            1            2 
##    203086_at    203367_at    203414_at  203865_s_at  203946_s_at  204594_s_at 
##            1            2            1            1            2            1 
##  204990_s_at  205524_s_at  205625_s_at  205632_s_at  205882_x_at  205931_s_at 
##            1            1            1            1            1            1 
##    206474_at  207988_s_at  208911_s_at  209558_s_at  209822_s_at  209824_s_at 
##            1            1            2            1            2            1 
##  209907_s_at  210648_x_at  210675_s_at  211337_s_at    212299_at    212461_at 
##            1            1            3            1            1            1 
##  212548_s_at    213924_at  214553_s_at    214741_at  215171_s_at  215222_x_at 
##            1            1            1            2            2            1 
##    216307_at  218191_s_at    220343_at  220359_s_at    220914_at    221362_at 
##            1            1            1            1            1            1 
##  221482_s_at  221528_s_at  221850_x_at    222431_at  224407_s_at    224685_at 
##            1            1            1            1            1            1 
##    224856_at    224860_at    225439_at  225959_s_at    226374_at    226580_at 
##            3            1            2            3            1            2 
##    226996_at  227047_x_at    227166_at    228135_at    229537_at    231186_at 
##            1            1            1            1            1            1 
##    232027_at    232084_at  233124_s_at  234311_s_at    235334_at    235502_at 
##            2            1            1            1            1            1 
##    241672_at    242825_at    243804_at    244617_at 
##            1            1            1            1

Summary:
Testing without extreme-age samples shows if outliers drive gene selection.

Conclusion

Across 50 simulations (250 folds), the top five genes selected most often were 220030, 236902, 224856, 226374, and 230765, each winning between 8 and 11 folds. In a larger run of 250 simulations (1 250 folds), the same genes dominated but with much higher counts—226374 led with 46 wins, followed by 220030 (44), 224856 (41), 236902 (40), and 203367 (22). The training‐set MSE distributions were nearly identical in both cases (medians ≃373 vs. 384; IQRs ~318–458 vs. 318–469), with outliers around 700. Overall, increasing simulation count sharpened the gene‐selection rankings while leaving the error profile essentially unchanged.

HW7: Cross-Validation, Bootstrap, and Multiple Linear Regression

Megan Dorval

2025-05-20