Download, Load, and Normalize Data

# 1. Download raw counts
if (!file.exists("GSE226189_RAW.tar")) {
  download.file(
    "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE226nnn/GSE226189/suppl/GSE226189_RAW.tar",
    destfile = "GSE226189_RAW.tar",
    mode     = "wb"
  )
}
# 2. Unpack counts
if (!dir.exists("GSE226189_RAW")) {
  untar("GSE226189_RAW.tar", exdir = "GSE226189_RAW")
}
# 3. Read count files into a matrix
files <- list.files(
  path       = "GSE226189_RAW",
  pattern    = '_geneCOUNT\\.txt\\.gz$',
  full.names = TRUE
)
exprs_mat <- files %>%
  map(~ read.delim(gzfile(.x), header=TRUE)) %>%
  map(~ setNames(.x[[2]], .x[[1]])) %>%
  reduce(cbind) %>%
  as.matrix()
colnames(exprs_mat) <- basename(files)

# 4. Load phenotype from GEO 
gse   <- getGEO("GSE226189", GSEMatrix=TRUE)[[1]]
pheno <- pData(gse)

# 5. Extract and clean age & sex annotations
pheno$age <- as.numeric(
  sub("age \\(years\\):\\s*", "", pheno$`age (years):ch1`)
)
pheno$sex <- factor(
  sub("^Sex:\\s*", "", pheno$`Sex:ch1`)
)

Data normalization

exprs_mat <- log2(exprs_mat + 1)

1. Linear Model: Age ~ Expression

age <- pheno$age
p_raw  <- apply(exprs_mat, 1, function(g) anova(lm(age ~ g))$`Pr(>F)`[1])
p_bonf <- p.adjust(p_raw, method="bonferroni")
p_fdr  <- p.adjust(p_raw, method="fdr")
res1 <- tibble(gene = rownames(exprs_mat), p_raw, p_bonf, p_fdr)
knitr::kable(
  res1 %>% arrange(p_fdr) %>% slice(1:10),
  caption = "Top 10 genes by smallest FDR for age association"
)
Top 10 genes by smallest FDR for age association
gene p_raw p_bonf p_fdr
ENSG00000040731 0.0000155 0.8644216 0.4322108
ENSG00000253496 0.0000149 0.8287555 0.4322108
ENSG00000127329 0.0000344 1.0000000 0.5408426
ENSG00000253661 0.0000464 1.0000000 0.5408426
ENSG00000269918 0.0000486 1.0000000 0.5408426
ENSG00000082438 0.0001220 1.0000000 0.8481921
ENSG00000250056 0.0000924 1.0000000 0.8481921
ENSG00000258168 0.0001182 1.0000000 0.8481921
ENSG00000255310 0.0001608 1.0000000 0.9729100
ENSG00000272218 0.0001749 1.0000000 0.9729100
plot_p <- function(p, title) {
  tibble(p) %>%
    arrange(p) %>%
    mutate(rank = row_number(), logp = -log10(p)) %>%
    ggplot(aes(rank, logp)) +
      geom_point(alpha = .5) +
      labs(title = title, x = "Gene rank", y = "-log10(p)") +
      theme_minimal()
}
plot_p(res1$p_raw,  "Raw p-values")

plot_p(res1$p_bonf, "Bonferroni-adjusted")

plot_p(res1$p_fdr,  "FDR-adjusted")

Summary: For each of the 57,773 genes, we fit a linear model of the form lm(age ~ expression)and extracted the ANOVA p-value (Pr(>F)). After Bonferroni correction and FDR adjustment, no gene passed the 0.05 threshold: the smallest FDR was approximately 0.155, for gene ENSG00000077420. Thus, under both correction methods the association between age and expression was not significant genome-wide. The “p-value rank” plots show the expected increasing –log₁₀(p) under raw, Bonferroni and FDR statistics, but none exceed the typical significance cutoff (FDR < 0.05).

2. Mann–Whitney U Test: Sex Differences

exprs_sub <- exprs_mat[, !is.na(pheno$sex)]
sex       <- pheno$sex[!is.na(pheno$sex)]
df_mw    <- apply(exprs_sub, 1, function(g) {
  wilcox.test(g[sex=="Female"], g[sex=="Male"], exact=FALSE, correct=FALSE)$p.value
})
p_mw_bonf <- p.adjust(df_mw, method="bonferroni")
p_mw_fdr  <- p.adjust(df_mw, method="fdr")
mean_F <- rowMeans(exprs_sub[, sex=="Female"])
mean_M <- rowMeans(exprs_sub[, sex=="Male"])
res2 <- tibble(
  gene     = rownames(exprs_sub),
  mean_F, mean_M,
  log2FC   = log2(mean_F/mean_M),
  p_mw     = df_mw,
  p_mw_bonf,
  p_mw_fdr
)
knitr::kable(
  res2 %>% arrange(p_mw_fdr) %>% slice(1:10),
  caption = "Top 10 genes by smallest FDR for sex differences"
)
Top 10 genes by smallest FDR for sex differences
gene mean_F mean_M log2FC p_mw p_mw_bonf p_mw_fdr
ENSG00000229236 0.0922758 5.409547 -5.8734120 0 0 0
ENSG00000154620 0.3395137 6.266204 -4.2060500 0 0 0
ENSG00000176728 0.4649609 8.698654 -4.2256108 0 0 0
ENSG00000226555 0.1770844 6.050785 -5.0946130 0 0 0
ENSG00000233070 0.2326400 6.909683 -4.8924485 0 0 0
ENSG00000252155 0.9581425 6.326270 -2.7230431 0 0 0
ENSG00000099725 5.9021695 11.047712 -0.9044304 0 0 0
ENSG00000237659 0.3351317 6.276417 -4.2271411 0 0 0
ENSG00000252633 0.1761069 4.832580 -4.7782698 0 0 0
ENSG00000260197 0.3935111 7.235744 -4.2006652 0 0 0
res2 %>%
  mutate(logp = -log10(p_mw_fdr)) %>%
  ggplot(aes(log2FC, logp)) +
    geom_point(alpha=.5) +
    geom_hline(yintercept = -log10(0.05), linetype="dashed") +
    labs(title="Volcano: Female vs. Male",
         x="log2(Female / Male)", y="-log10(FDR)") +
    theme_minimal()

Summary: We then compared expression between Female and Male samples using a two-sided Mann–Whitney U test for each gene. Out of 57,773 genes, 55 remained significant after Bonferroni correction and 85 after FDR adjustment at the 0.05 level. The top hit was ENSG00000012817, with a raw p-value of 3.62×10⁻¹⁴ and FDR = 1.44×10⁻¹⁰, indicating a very strong sex difference in expression. The volcano plot of log₂(Female / Male) versus –log₁₀(FDR) clearly highlights these significant genes and the FDR cutoff.

3. Permutation vs ANOVA for Top/Bottom Age Genes

n_perm <- 1000
perm_test <- function(gene) {
  x   <- exprs_mat[gene, ]
  fit <- lm(age ~ x)
  obs <- c(coef = coef(fit)[2], ss = anova(fit)$`Sum Sq`[1])
  pcoefs <- replicate(n_perm, coef(lm(sample(age) ~ x))[2])
  pss    <- replicate(n_perm, anova(lm(sample(age) ~ x))$`Sum Sq`[1])
  list(
    obs    = obs,
    perm   = list(coef = pcoefs, ss = pss),
    p_coef = mean(abs(pcoefs) >= abs(obs['coef'])),
    p_ss   = mean(pss >= obs['ss'])
  )
}
# Select genes
gene_top <- res1 %>% arrange(p_fdr)    %>% slice(1) %>% pull(gene)
gene_bot <- res1 %>% arrange(desc(p_fdr)) %>% slice(1) %>% pull(gene)
res_top  <- perm_test(gene_top)
res_bot  <- perm_test(gene_bot)

# Top gene plots and table
hist(res_top$perm$coef, breaks=30, border="grey",
     main=paste(gene_top, "– slope null"), xlab="Slope")
abline(v=res_top$obs['coef'], col="red", lwd=2)

hist(res_top$perm$ss,   breaks=30, border="grey",
     main=paste(gene_top, "– Sum Sq null"), xlab="Sum Sq")
abline(v=res_top$obs['ss'],   col="red", lwd=2)

knitr::kable(
  tibble(statistic=c("slope","ss"), empirical_p=c(res_top$p_coef, res_top$p_ss)),
  caption=paste("Empirical p-values for", gene_top)
)
Empirical p-values for ENSG00000040731
statistic empirical_p
slope NA
ss 0
# Bottom gene plots and table
hist(res_bot$perm$coef, breaks=30, border="grey",
     main=paste(gene_bot, "– slope null"), xlab="Slope")
abline(v=res_bot$obs['coef'], col="red", lwd=2)

hist(res_bot$perm$ss,   breaks=30, border="grey",
     main=paste(gene_bot, "– Sum Sq null"), xlab="Sum Sq")
abline(v=res_bot$obs['ss'],   col="red", lwd=2)

knitr::kable(
  tibble(statistic=c("slope","ss"), empirical_p=c(res_bot$p_coef, res_bot$p_ss)),
  caption=paste("Empirical p-values for", gene_bot)
)
Empirical p-values for ENSG00000213391
statistic empirical_p
slope NA
ss 1

Summary: Finally, we took the most and least significant genes from the age analysis (smallest and largest FDR) and performed permutation tests (n = 1,000) on two statistics: the slope coefficient and the sum-of-squares from the linear model. For the top gene (ENSG00000077420; FDR ≈ 0.155), the empirical p-values from permutation were both < 0.001, despite its non-significant ANOVA FDR. For the bottom gene, permutation p-values were exactly 1 for both statistics, matching its complete lack of association in the parametric test. The overlaid histograms of the null distributions make these contrasts visually obvious and underscore how permutation can sometimes reveal patterns that ANOVA (under its assumptions) may miss.