Download, Load, and Normalize Data

# 1. Download raw counts
if (!file.exists("GSE226189_RAW.tar")) {
  download.file(
    "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE226nnn/GSE226189/suppl/GSE226189_RAW.tar",
    destfile = "GSE226189_RAW.tar",
    mode     = "wb"
  )
}
# 2. Unpack counts
if (!dir.exists("GSE226189_RAW")) {
  untar("GSE226189_RAW.tar", exdir = "GSE226189_RAW")
}
# 3. Read count files into a matrix
files <- list.files(
  path       = "GSE226189_RAW",
  pattern    = '_geneCOUNT\\.txt\\.gz$',
  full.names = TRUE
)
exprs_mat <- files %>%
  map(~ read.delim(gzfile(.x), header=TRUE)) %>%
  map(~ setNames(.x[[2]], .x[[1]])) %>%
  reduce(cbind) %>%
  as.matrix()
colnames(exprs_mat) <- basename(files)

# 4. Load phenotype from GEO 
gse   <- getGEO("GSE226189", GSEMatrix=TRUE)[[1]]
pheno <- pData(gse)

# 5. Extract and clean age & sex annotations
pheno$age <- as.numeric(
  sub("age \\(years\\):\\s*", "", pheno$`age (years):ch1`)
)
pheno$sex <- factor(
  sub("^Sex:\\s*", "", pheno$`Sex:ch1`)
)

Data normalization

exprs_mat <- log2(exprs_mat + 1)

1. Linear Model: Age ~ Expression

age <- pheno$age
p_raw  <- apply(exprs_mat, 1, function(g) anova(lm(age ~ g))$`Pr(>F)`[1])
p_bonf <- p.adjust(p_raw, method="bonferroni")
p_fdr  <- p.adjust(p_raw, method="fdr")
res1 <- tibble(gene = rownames(exprs_mat), p_raw, p_bonf, p_fdr)

knitr::kable(
  res1 %>% arrange(p_fdr) %>% slice(1:10),
  caption = "Top 10 genes by smallest FDR for age association"
)

Top 10 genes by smallest FDR for age association
gene	p_raw	p_bonf	p_fdr
ENSG00000040731	0.0000155	0.8644216	0.4322108
ENSG00000253496	0.0000149	0.8287555	0.4322108
ENSG00000127329	0.0000344	1.0000000	0.5408426
ENSG00000253661	0.0000464	1.0000000	0.5408426
ENSG00000269918	0.0000486	1.0000000	0.5408426
ENSG00000082438	0.0001220	1.0000000	0.8481921
ENSG00000250056	0.0000924	1.0000000	0.8481921
ENSG00000258168	0.0001182	1.0000000	0.8481921
ENSG00000255310	0.0001608	1.0000000	0.9729100
ENSG00000272218	0.0001749	1.0000000	0.9729100

plot_p <- function(p, title) {
  tibble(p) %>%
    arrange(p) %>%
    mutate(rank = row_number(), logp = -log10(p)) %>%
    ggplot(aes(rank, logp)) +
      geom_point(alpha = .5) +
      labs(title = title, x = "Gene rank", y = "-log10(p)") +
      theme_minimal()
}
plot_p(res1$p_raw,  "Raw p-values")

plot_p(res1$p_bonf, "Bonferroni-adjusted")

plot_p(res1$p_fdr,  "FDR-adjusted")

Summary: For each of the 57,773 genes, we fit a linear model of the form lm(age ~ expression)and extracted the ANOVA p-value (Pr(>F)). After Bonferroni correction and FDR adjustment, no gene passed the 0.05 threshold: the smallest FDR was approximately 0.155, for gene ENSG00000077420. Thus, under both correction methods the association between age and expression was not significant genome-wide. The “p-value rank” plots show the expected increasing –log₁₀(p) under raw, Bonferroni and FDR statistics, but none exceed the typical significance cutoff (FDR < 0.05).

2. Mann–Whitney U Test: Sex Differences

exprs_sub <- exprs_mat[, !is.na(pheno$sex)]
sex       <- pheno$sex[!is.na(pheno$sex)]
df_mw    <- apply(exprs_sub, 1, function(g) {
  wilcox.test(g[sex=="Female"], g[sex=="Male"], exact=FALSE, correct=FALSE)$p.value
})
p_mw_bonf <- p.adjust(df_mw, method="bonferroni")
p_mw_fdr  <- p.adjust(df_mw, method="fdr")
mean_F <- rowMeans(exprs_sub[, sex=="Female"])
mean_M <- rowMeans(exprs_sub[, sex=="Male"])
res2 <- tibble(
  gene     = rownames(exprs_sub),
  mean_F, mean_M,
  log2FC   = log2(mean_F/mean_M),
  p_mw     = df_mw,
  p_mw_bonf,
  p_mw_fdr
)

knitr::kable(
  res2 %>% arrange(p_mw_fdr) %>% slice(1:10),
  caption = "Top 10 genes by smallest FDR for sex differences"
)

Top 10 genes by smallest FDR for sex differences
gene	mean_F	mean_M	log2FC
ENSG00000229236	0.0922758	5.409547	-5.8734120
ENSG00000154620	0.3395137	6.266204	-4.2060500
ENSG00000176728	0.4649609	8.698654	-4.2256108
ENSG00000226555	0.1770844	6.050785	-5.0946130
ENSG00000233070	0.2326400	6.909683	-4.8924485
ENSG00000252155	0.9581425	6.326270	-2.7230431
ENSG00000099725	5.9021695	11.047712	-0.9044304
ENSG00000237659	0.3351317	6.276417	-4.2271411
ENSG00000252633	0.1761069	4.832580	-4.7782698
ENSG00000260197	0.3935111	7.235744	-4.2006652

res2 %>%
  mutate(logp = -log10(p_mw_fdr)) %>%
  ggplot(aes(log2FC, logp)) +
    geom_point(alpha=.5) +
    geom_hline(yintercept = -log10(0.05), linetype="dashed") +
    labs(title="Volcano: Female vs. Male",
         x="log2(Female / Male)", y="-log10(FDR)") +
    theme_minimal()

Summary: We then compared expression between Female and Male samples using a two-sided Mann–Whitney U test for each gene. Out of 57,773 genes, 55 remained significant after Bonferroni correction and 85 after FDR adjustment at the 0.05 level. The top hit was ENSG00000012817, with a raw p-value of 3.62×10⁻¹⁴ and FDR = 1.44×10⁻¹⁰, indicating a very strong sex difference in expression. The volcano plot of log₂(Female / Male) versus –log₁₀(FDR) clearly highlights these significant genes and the FDR cutoff.

3. Permutation vs ANOVA for Top/Bottom Age Genes

n_perm <- 1000
perm_test <- function(gene) {
  x   <- exprs_mat[gene, ]
  fit <- lm(age ~ x)
  obs <- c(coef = coef(fit)[2], ss = anova(fit)$`Sum Sq`[1])
  pcoefs <- replicate(n_perm, coef(lm(sample(age) ~ x))[2])
  pss    <- replicate(n_perm, anova(lm(sample(age) ~ x))$`Sum Sq`[1])
  list(
    obs    = obs,
    perm   = list(coef = pcoefs, ss = pss),
    p_coef = mean(abs(pcoefs) >= abs(obs['coef'])),
    p_ss   = mean(pss >= obs['ss'])
  )
}
# Select genes
gene_top <- res1 %>% arrange(p_fdr)    %>% slice(1) %>% pull(gene)
gene_bot <- res1 %>% arrange(desc(p_fdr)) %>% slice(1) %>% pull(gene)
res_top  <- perm_test(gene_top)
res_bot  <- perm_test(gene_bot)

# Top gene plots and table
hist(res_top$perm$coef, breaks=30, border="grey",
     main=paste(gene_top, "– slope null"), xlab="Slope")
abline(v=res_top$obs['coef'], col="red", lwd=2)

hist(res_top$perm$ss,   breaks=30, border="grey",
     main=paste(gene_top, "– Sum Sq null"), xlab="Sum Sq")
abline(v=res_top$obs['ss'],   col="red", lwd=2)

knitr::kable(
  tibble(statistic=c("slope","ss"), empirical_p=c(res_top$p_coef, res_top$p_ss)),
  caption=paste("Empirical p-values for", gene_top)
)

Empirical p-values for ENSG00000040731
statistic	empirical_p
slope	NA
ss	0

# Bottom gene plots and table
hist(res_bot$perm$coef, breaks=30, border="grey",
     main=paste(gene_bot, "– slope null"), xlab="Slope")
abline(v=res_bot$obs['coef'], col="red", lwd=2)

hist(res_bot$perm$ss,   breaks=30, border="grey",
     main=paste(gene_bot, "– Sum Sq null"), xlab="Sum Sq")
abline(v=res_bot$obs['ss'],   col="red", lwd=2)

knitr::kable(
  tibble(statistic=c("slope","ss"), empirical_p=c(res_bot$p_coef, res_bot$p_ss)),
  caption=paste("Empirical p-values for", gene_bot)
)

Empirical p-values for ENSG00000213391
statistic	empirical_p
slope	NA
ss	1

Summary: Finally, we took the most and least significant genes from the age analysis (smallest and largest FDR) and performed permutation tests (n = 1,000) on two statistics: the slope coefficient and the sum-of-squares from the linear model. For the top gene (ENSG00000077420; FDR ≈ 0.155), the empirical p-values from permutation were both < 0.001, despite its non-significant ANOVA FDR. For the bottom gene, permutation p-values were exactly 1 for both statistics, matching its complete lack of association in the parametric test. The overlaid histograms of the null distributions make these contrasts visually obvious and underscore how permutation can sometimes reveal patterns that ANOVA (under its assumptions) may miss.

RBIF 111 Homework 6

Megan Dorval

2025-05-11

Download, Load, and Normalize Data

Data normalization

1. Linear Model: Age ~ Expression

2. Mann–Whitney U Test: Sex Differences

3. Permutation vs ANOVA for Top/Bottom Age Genes