1 Introduction

This document implements the analysis plan for the CSE763 Advanced Bioinformatics assignment using RNA-Seq data from the GEO dataset GSE183947. The project uses FPKM-normalized expression values as the primary quantitative input (for data understanding, visualization, and FPKM-based differential expression), together with sample metadata and gene annotation. Although some raw read-count information exists for this dataset, it does not form a single, complete counts matrix covering all tumor and normal samples, and therefore is not used for the main differential expression analysis.

The main objectives are:

2 Data Loading and Understanding (Task 1)

2.1 Load FPKM Data

fpkm_path <- file.path("data", "GSE183947_fpkm.csv")

# Read FPKM file; first column has no header, so readr names it ...1 by default
fpkm_raw <- readr::read_csv(fpkm_path, show_col_types = FALSE)

# Rename the unnamed first column to match the annotation ID column
if (colnames(fpkm_raw)[1] == "...1") {
  colnames(fpkm_raw)[1] <- "Symbol"
}

cat("FPKM data dimensions (rows x columns):\n")
## FPKM data dimensions (rows x columns):
print(dim(fpkm_raw))
## [1] 20246    61
cat("\nFirst few columns:\n")
## 
## First few columns:
print(colnames(fpkm_raw)[1:min(10, ncol(fpkm_raw))])
##  [1] "Symbol"     "CA.102548"  "CA.104338"  "CA.105094"  "CA.109745" 
##  [6] "CA.1906415" "CA.1912627" "CA.1924346" "CA.1926760" "CA.1927842"
cat("\nPreview of the data:\n")
## 
## Preview of the data:
print(dplyr::slice_head(fpkm_raw, n = 5))
## # A tibble: 5 × 61
##   Symbol   CA.102548 CA.104338 CA.105094 CA.109745 CA.1906415 CA.1912627
##   <chr>        <dbl>     <dbl>     <dbl>     <dbl>      <dbl>      <dbl>
## 1 TSPAN6        0.93      1.97      0         5.45       4.52       4.75
## 2 TNMD          0         0         0         0          0          0   
## 3 DPM1          0         0.43      0         3.43       8.45       8.53
## 4 SCYL3         5.78      5.17      8.76      4.58       7.2        6.03
## 5 C1orf112      2.83      6.26      3.37      6.24       5.16      13.7 
## # ℹ 54 more variables: CA.1924346 <dbl>, CA.1926760 <dbl>, CA.1927842 <dbl>,
## #   CA.1933414 <dbl>, CA.1940640 <dbl>, CA.2004407 <dbl>, CA.2005288 <dbl>,
## #   CA.2006047 <dbl>, CA.2008260 <dbl>, CA.2009329 <dbl>, CA.2009381 <dbl>,
## #   CA.2009850 <dbl>, CA.2017611 <dbl>, CA.2039179 <dbl>, CA.2040686 <dbl>,
## #   CA.2045012 <dbl>, CA.2046297 <dbl>, CA.348981 <dbl>, CA.354300 <dbl>,
## #   CA.359448 <dbl>, CA.94377 <dbl>, CA.98389 <dbl>, CA.98475 <dbl>,
## #   CA.99145 <dbl>, CAP.102548 <dbl>, CAP.104338 <dbl>, CAP.105094 <dbl>, …

2.2 Build Expression Matrix and Sample Metadata

# Build expression matrix and colData
first_gene_id_col <- colnames(fpkm_raw)[1]

fpkm_long <- fpkm_raw
gene_ids <- fpkm_long[[1]]
expr_fpkm <- as.matrix(dplyr::select(fpkm_long, -1))
rownames(expr_fpkm) <- gene_ids

sample_ids <- colnames(expr_fpkm)

col_data <- tibble::tibble(
  sample_id = sample_ids
)

series_path <- file.path("data", "GSE183947_series_matrix.txt")
series_lines <- readr::read_lines(series_path)

extract_sample_row <- function(lines, prefix) {
  line <- lines[startsWith(lines, prefix)]
  if (length(line) == 0) return(NULL)
  line <- sub(paste0("^", prefix), "", line[1])
  parts <- strsplit(trimws(line), "\t")[[1]]
  parts <- parts[nzchar(parts)]
  parts <- gsub('^"|"$', "", parts)
}

sample_titles   <- extract_sample_row(series_lines, "!Sample_title")
sample_access   <- extract_sample_row(series_lines, "!Sample_geo_accession")
sample_descr    <- extract_sample_row(series_lines, "!Sample_description")

char_tissue <- series_lines[startsWith(series_lines, "!Sample_characteristics_ch1\ttissue:") |
                            grepl("tissue: breast tumor|tissue: normal breast tissue", series_lines, fixed = TRUE)]
if (length(char_tissue) == 0) {
  char_tissue_vals <- NULL
} else {
  char_tissue_vals <- extract_sample_row(series_lines, "!Sample_characteristics_ch1")
}

char_donor_line <- series_lines[grepl("!Sample_characteristics_ch1\tdonor:", series_lines) |
                                grepl("donor:", series_lines, fixed = TRUE)]
if (length(char_donor_line) == 0) {
  donor_vals <- NULL
} else {
  donor_vals <- extract_sample_row(series_lines, "!Sample_characteristics_ch1")
}

char_lines <- series_lines[startsWith(series_lines, "!Sample_characteristics_ch1")]

char_tissue_line <- char_lines[grepl("tissue:", char_lines)][1]
if (!is.na(char_tissue_line)) {
  tissue_vals <- strsplit(sub("^!Sample_characteristics_ch1", "", char_tissue_line), "\t")[[1]]
  tissue_vals <- tissue_vals[nzchar(tissue_vals)]
  tissue_vals <- gsub('^"|"$', "", tissue_vals)
} else {
  tissue_vals <- NULL
}

char_donor_line2 <- char_lines[grepl("donor:", char_lines)][1]
if (!is.na(char_donor_line2)) {
  donor_vals2 <- strsplit(sub("^!Sample_characteristics_ch1", "", char_donor_line2), "\t")[[1]]
  donor_vals2 <- donor_vals2[nzchar(donor_vals2)]
  donor_vals2 <- gsub('^"|"$', "", donor_vals2)
} else {
  donor_vals2 <- NULL
}

if (!is.null(sample_descr)) {
  meta_df <- tibble::tibble(
    sample_description = sample_descr
  )

  if (!is.null(tissue_vals) && length(tissue_vals) == nrow(meta_df)) {
    condition_raw <- tissue_vals
    condition <- dplyr::case_when(
      grepl("breast tumor", condition_raw) ~ "tumor",
      grepl("normal breast tissue", condition_raw) ~ "normal",
      TRUE ~ NA_character_
    )
    meta_df$condition <- factor(condition, levels = c("normal", "tumor"))
  }

  if (!is.null(donor_vals2) && length(donor_vals2) == nrow(meta_df)) {
    donor_clean <- sub("^donor:\\s*", "", donor_vals2)
    meta_df$patient_id <- as.character(donor_clean)
  }

  col_data <- col_data %>%
    dplyr::left_join(meta_df, by = c("sample_id" = "sample_description"))
}

cat("\nEnriched sample metadata (colData) preview:\n")
## 
## Enriched sample metadata (colData) preview:
print(col_data)
## # A tibble: 60 × 3
##    sample_id  condition patient_id
##    <chr>      <fct>     <chr>     
##  1 CA.102548  tumor     102548    
##  2 CA.104338  tumor     104338    
##  3 CA.105094  tumor     105094    
##  4 CA.109745  tumor     109745    
##  5 CA.1906415 tumor     1906415   
##  6 CA.1912627 tumor     1912627   
##  7 CA.1924346 tumor     1924346   
##  8 CA.1926760 tumor     1926760   
##  9 CA.1927842 tumor     1927842   
## 10 CA.1933414 tumor     1933414   
## # ℹ 50 more rows

2.3 Expression Characteristics and Quality Assessment

log_fpkm <- log2(expr_fpkm + 1)

tumor_samples <- col_data$sample_id[col_data$condition == "tumor"]
normal_samples <- col_data$sample_id[col_data$condition == "normal"]

overall_stats <- c(
  mean = mean(log_fpkm),
  median = median(log_fpkm),
  sd = sd(log_fpkm),
  min = min(log_fpkm),
  max = max(log_fpkm)
)

tumor_stats <- c(
  mean = mean(log_fpkm[, tumor_samples]),
  median = median(log_fpkm[, tumor_samples]),
  sd = sd(log_fpkm[, tumor_samples]),
  min = min(log_fpkm[, tumor_samples]),
  max = max(log_fpkm[, tumor_samples])
)

normal_stats <- c(
  mean = mean(log_fpkm[, normal_samples]),
  median = median(log_fpkm[, normal_samples]),
  sd = sd(log_fpkm[, normal_samples]),
  min = min(log_fpkm[, normal_samples]),
  max = max(log_fpkm[, normal_samples])
)

# Genes expressed (>1 FPKM in at least one sample)
genes_expressed_all <- sum(rowSums(expr_fpkm > 1) > 0)
genes_expressed_tumor <- sum(rowSums(expr_fpkm[, tumor_samples] > 1) > 0)
genes_expressed_normal <- sum(rowSums(expr_fpkm[, normal_samples] > 1) > 0)

expr_summary_table <- tibble::tibble(
  Statistic = c("Mean", "Median", "SD", "Min", "Max", "Genes expressed (>1 FPKM)"),
  Overall   = c(
    overall_stats["mean"],
    overall_stats["median"],
    overall_stats["sd"],
    overall_stats["min"],
    overall_stats["max"],
    genes_expressed_all
  ),
  Tumor     = c(
    tumor_stats["mean"],
    tumor_stats["median"],
    tumor_stats["sd"],
    tumor_stats["min"],
    tumor_stats["max"],
    genes_expressed_tumor
  ),
  Normal    = c(
    normal_stats["mean"],
    normal_stats["median"],
    normal_stats["sd"],
    normal_stats["min"],
    normal_stats["max"],
    genes_expressed_normal
  )
)
cat("\nExpression Summary\n")
## 
## Expression Summary
print(expr_summary_table)
## # A tibble: 6 × 4
##   Statistic                  Overall    Tumor   Normal
##   <chr>                        <dbl>    <dbl>    <dbl>
## 1 Mean                          2.85     2.83     2.86
## 2 Median                        2.89     2.85     2.91
## 3 SD                            2.12     2.16     2.09
## 4 Min                           0        0        0   
## 5 Max                          16.7     16.7     16.2 
## 6 Genes expressed (>1 FPKM) 20028    17977    19989

2.4 Exploratory Data Analysis (EDA)

cat("Number of genes:", nrow(expr_fpkm), "\n")
## Number of genes: 20246
cat("Number of samples:", ncol(expr_fpkm), "\n\n")
## Number of samples: 60
cat("Condition breakdown (tumor vs normal):\n")
## Condition breakdown (tumor vs normal):
print(table(col_data$condition, useNA = "ifany"))
## 
## normal  tumor 
##     30     30
cat("\nNumber of samples per patient (first few):\n")
## 
## Number of samples per patient (first few):
print(table(col_data$patient_id, useNA = "ifany")[1:10])
## 
##  102548  104338  105094  109745 1906415 1912627 1924346 1926760 1927842 1933414 
##       2       2       2       2       2       2       2       2       2       2
log_expr_fpkm <- log2(expr_fpkm + 1)

set.seed(123)

if (nrow(log_expr_fpkm) > 5000) {
  gene_idx <- sample(seq_len(nrow(log_expr_fpkm)), 5000)
} else {
  gene_idx <- seq_len(nrow(log_expr_fpkm))
}

log_expr_subset <- log_expr_fpkm[gene_idx, , drop = FALSE]

expr_long <- as.data.frame(log_expr_subset) %>%
  tibble::rownames_to_column(var = "gene_id") %>%
  tidyr::pivot_longer(
    cols = -gene_id,
    names_to = "sample_id",
    values_to = "log2_fpkm"
  ) %>%
  dplyr::left_join(col_data, by = "sample_id")

p_box <- ggplot(expr_long, aes(x = sample_id, y = log2_fpkm, fill = condition)) +
  geom_boxplot(outlier.size = 0.2, lwd = 0.2) +
  theme_bw(base_size = 8) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "bottom"
  ) +
  labs(
    title = "Per-sample distribution of log2(FPKM + 1)",
    x = "Sample",
    y = "log2(FPKM + 1)"
  )

# Save EDA boxplot to figures/
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)

ggplot2::ggsave(
  filename = file.path("figures", "eda_boxplot_log2fpkm_by_sample.png"),
  plot = p_box,
  width = 8,
  height = 4,
  dpi = 300
)

cat("\nEDA boxplot saved to figures/eda_boxplot_log2fpkm_by_sample.png\n")
## 
## EDA boxplot saved to figures/eda_boxplot_log2fpkm_by_sample.png
print(p_box)

var_genes <- apply(log_expr_fpkm, 1, var, na.rm = TRUE)
log_expr_pca <- log_expr_fpkm[var_genes > 0, , drop = FALSE]

pca_res <- prcomp(t(log_expr_pca), scale. = TRUE)

pca_df <- as.data.frame(pca_res$x[, 1:2]) %>%
  dplyr::mutate(sample_id = rownames(pca_res$x)) %>%
  dplyr::left_join(col_data, by = "sample_id")

var_explained <- (pca_res$sdev^2) / sum(pca_res$sdev^2)

p_pca <- ggplot(pca_df, aes(x = PC1, y = PC2, color = condition)) +
  geom_point(size = 2) +
  theme_bw() +
  labs(
    title = "PCA of samples based on log2(FPKM + 1)",
    x = paste0("PC1 (", round(var_explained[1] * 100, 1), "% var)"),
    y = paste0("PC2 (", round(var_explained[2] * 100, 1), "% var)")
  )

# Save PCA plot to figures/
ggplot2::ggsave(
  filename = file.path("figures", "eda_pca_log2fpkm_samples.png"),
  plot = p_pca,
  width = 6,
  height = 5,
  dpi = 300
)

cat("\nPCA plot saved to figures/eda_pca_log2fpkm_samples.png\n")
## 
## PCA plot saved to figures/eda_pca_log2fpkm_samples.png
print(p_pca)

2.5 Gene Annotation Data Structure

annot_path <- file.path("data", "Human.GRCh38.p13.annot.tsv")

annot_head <- readr::read_tsv(annot_path, n_max = 5, show_col_types = FALSE)

cat("Annotation (preview) dimensions (rows x columns):\n")
## Annotation (preview) dimensions (rows x columns):
print(dim(annot_head))
## [1]  5 18
cat("\nFirst few column names in annotation file:\n")
## 
## First few column names in annotation file:
print(colnames(annot_head))
##  [1] "GeneID"        "Symbol"        "Description"   "Synonyms"     
##  [5] "GeneType"      "EnsemblGeneID" "Status"        "ChrAcc"       
##  [9] "ChrStart"      "ChrStop"       "Orientation"   "Length"       
## [13] "GOFunctionID"  "GOProcessID"   "GOComponentID" "GOFunction"   
## [17] "GOProcess"     "GOComponent"
cat("\nPreview of annotation (first 5 rows):\n")
## 
## Preview of annotation (first 5 rows):
print(annot_head)
## # A tibble: 5 × 18
##      GeneID Symbol     Description Synonyms GeneType EnsemblGeneID Status ChrAcc
##       <dbl> <chr>      <chr>       <chr>    <chr>    <chr>         <chr>  <chr> 
## 1 100287102 DDX11L1    DEAD/H-box… <NA>     pseudo   ENSG00000290… active NC_00…
## 2    653635 WASH7P     WASP famil… FAM39F|… pseudo   <NA>          active NC_00…
## 3 102466751 MIR6859-1  microRNA 6… hsa-mir… ncRNA    ENSG00000278… active NC_00…
## 4 107985730 MIR1302-2… MIR1302-2 … <NA>     ncRNA    <NA>          active NC_00…
## 5 100302278 MIR1302-2  microRNA 1… MIRN130… ncRNA    ENSG00000284… active NC_00…
## # ℹ 10 more variables: ChrStart <dbl>, ChrStop <dbl>, Orientation <chr>,
## #   Length <dbl>, GOFunctionID <lgl>, GOProcessID <chr>, GOComponentID <lgl>,
## #   GOFunction <lgl>, GOProcess <chr>, GOComponent <lgl>
if (!"Symbol" %in% colnames(annot_head)) {
  stop("Expected a 'Symbol' column in Human.GRCh38.p13.annot.tsv but did not find one.")
}

annot_full <- readr::read_tsv(annot_path, show_col_types = FALSE)

symbol_ids <- annot_full[["Symbol"]]

if ("Synonyms" %in% colnames(annot_full)) {
  synonyms_raw <- annot_full[["Synonyms"]]
  synonyms_tokens <- strsplit(ifelse(is.na(synonyms_raw), "", synonyms_raw), "[;,|]")
  synonyms_tokens <- lapply(synonyms_tokens, function(x) trimws(x[nzchar(x)]))
} else {
  synonyms_tokens <- vector("list", length = nrow(annot_full))
}

fpkm_gene_ids <- rownames(expr_fpkm)

match_by_symbol <- fpkm_gene_ids %in% symbol_ids

match_by_synonym <- logical(length(fpkm_gene_ids))
if (length(synonyms_tokens) > 0) {
  all_synonyms <- unique(unlist(synonyms_tokens))
  all_synonyms <- all_synonyms[nzchar(all_synonyms)]
  synonym_set <- unique(all_synonyms)
  match_by_synonym[!match_by_symbol] <- fpkm_gene_ids[!match_by_symbol] %in% synonym_set
}

annot_matches_any <- match_by_symbol | match_by_synonym

cat("\nNumber of FPKM genes:", length(fpkm_gene_ids), "\n")
## 
## Number of FPKM genes: 20246
cat("Number of FPKM genes found by Symbol:", sum(match_by_symbol), "\n")
## Number of FPKM genes found by Symbol: 17855
cat("Number of additional FPKM genes found only via Synonyms:",
    sum(match_by_synonym & !match_by_symbol), "\n")
## Number of additional FPKM genes found only via Synonyms: 1309
cat("Total number of FPKM genes with annotation (Symbol or Synonyms):",
    sum(annot_matches_any), "\n")
## Total number of FPKM genes with annotation (Symbol or Synonyms): 19164
cat("Number of FPKM genes without annotation (neither Symbol nor Synonyms):",
    length(fpkm_gene_ids) - sum(annot_matches_any), "\n")
## Number of FPKM genes without annotation (neither Symbol nor Synonyms): 1082
cat("Proportion of FPKM genes with annotation (Symbol or Synonyms):",
    round(sum(annot_matches_any) / length(fpkm_gene_ids), 4), "\n")
## Proportion of FPKM genes with annotation (Symbol or Synonyms): 0.9466
missing_ids <- fpkm_gene_ids[!annot_matches_any]
if (length(missing_ids) > 0) {
  cat("\nExample FPKM gene IDs not found in annotation (via Symbol or Synonyms, up to 10 shown):\n")
  print(head(missing_ids, 10))
} else {
  cat("\nAll FPKM gene IDs (symbols) were found in the annotation table via Symbol or Synonyms.\n")
}
## 
## Example FPKM gene IDs not found in annotation (via Symbol or Synonyms, up to 10 shown):
##  [1] "AC004381.6"    "2-Mar"         "3-Sep"         "DKFZP761J1410"
##  [5] "4-Sep"         "AL021546.6"    "RP11-268J15.5" "RP11-408E5.4" 
##  [9] "7-Sep"         "KIAA1045"

3 Differential Expression Analysis (Task 2)

3.1 Design and Data Preparation (FPKM + limma)

# Set up design and filter genes for limma
stopifnot(exists("col_data"), exists("expr_fpkm"))

col_data_limma <- col_data

if (!is.null(col_data_limma$condition)) {
  col_data_limma$condition <- as.factor(col_data_limma$condition)
  if ("normal" %in% levels(col_data_limma$condition)) {
    col_data_limma$condition <- relevel(col_data_limma$condition, ref = "normal")
  }
}

if (!is.null(col_data_limma$patient_id)) {
  col_data_limma$patient_id <- as.factor(col_data_limma$patient_id)
}

cat("Condition breakdown in col_data_limma (tumor vs normal):\n")
## Condition breakdown in col_data_limma (tumor vs normal):
print(table(col_data_limma$condition, useNA = "ifany"))
## 
## normal  tumor 
##     30     30
cat("\nSamples per patient_id (first few):\n")
## 
## Samples per patient_id (first few):
print(head(table(col_data_limma$patient_id)))
## 
##  102548  104338  105094  109745 1906415 1912627 
##       2       2       2       2       2       2
log_expr_fpkm <- log2(expr_fpkm + 1)

cat("\nExpression matrix dimensions for limma (genes x samples):\n")
## 
## Expression matrix dimensions for limma (genes x samples):
print(dim(log_expr_fpkm))
## [1] 20246    60
min_log_expr <- 1

keep_genes_limma <- apply(log_expr_fpkm, 1, function(x) {
  sum(x >= min_log_expr, na.rm = TRUE) >= 2
})

cat("\nNumber of genes before filtering:", nrow(log_expr_fpkm), "\n")
## 
## Number of genes before filtering: 20246
cat("Number of genes after filtering (log2(FPKM+1) >=", min_log_expr, "in >= 2 samples):",
    sum(keep_genes_limma), "\n")
## Number of genes after filtering (log2(FPKM+1) >= 1 in >= 2 samples): 19872
log_expr_fpkm_filtered <- log_expr_fpkm[keep_genes_limma, , drop = FALSE]

# Align samples between expression matrix and colData
common_samples <- intersect(colnames(log_expr_fpkm_filtered), col_data_limma$sample_id)

if (length(common_samples) == 0) {
  stop("No overlapping samples between expression matrix and col_data_limma.")
}

log_expr_fpkm_filtered <- log_expr_fpkm_filtered[, common_samples, drop = FALSE]
col_data_limma <- col_data_limma[match(common_samples, col_data_limma$sample_id), , drop = FALSE]

cat("\nNumber of samples used in DE:", ncol(log_expr_fpkm_filtered), "\n")
## 
## Number of samples used in DE: 60
# Ensure rownames of col_data_limma are sample IDs for model.matrix
col_data_limma <- as.data.frame(col_data_limma)
rownames(col_data_limma) <- col_data_limma$sample_id

# Sanity checks for alignment before building design matrix
cat("\nSanity check – first few sample IDs in expression and col_data_limma:\n")
## 
## Sanity check – first few sample IDs in expression and col_data_limma:
print(head(colnames(log_expr_fpkm_filtered)))
## [1] "CA.102548"  "CA.104338"  "CA.105094"  "CA.109745"  "CA.1906415"
## [6] "CA.1912627"
print(head(col_data_limma$sample_id))
## [1] "CA.102548"  "CA.104338"  "CA.105094"  "CA.109745"  "CA.1906415"
## [6] "CA.1912627"
cat("\nCondition breakdown after alignment (should match earlier):\n")
## 
## Condition breakdown after alignment (should match earlier):
print(table(col_data_limma$condition, useNA = "ifany"))
## 
## normal  tumor 
##     30     30
cat("\nAll column names of expression match rownames of col_data_limma? ",
    all(colnames(log_expr_fpkm_filtered) == rownames(col_data_limma)), "\n")
## 
## All column names of expression match rownames of col_data_limma?  TRUE
use_paired <- !is.null(col_data_limma$patient_id) &&
  length(unique(col_data_limma$patient_id)) > 1 &&
  all(table(col_data_limma$patient_id) >= 2)

if (use_paired) {
  design_limma <- model.matrix(~ patient_id + condition, data = col_data_limma)
  cat("\nUsing paired design for limma: ~ patient_id + condition\n")
} else {
  design_limma <- model.matrix(~ condition, data = col_data_limma)
  cat("\nUsing unpaired design for limma: ~ condition\n")
}
## 
## Using unpaired design for limma: ~ condition
cat("Design matrix dimensions (samples x coefficients):\n")
## Design matrix dimensions (samples x coefficients):
print(dim(design_limma))
## [1] 60  2
# Final sanity check: rownames of design should match expression columns
stopifnot(identical(colnames(log_expr_fpkm_filtered), rownames(design_limma)))

colnames(design_limma)
## [1] "(Intercept)"    "conditiontumor"

3.2 Run DE with limma and Extract Results

# At this point, columns of log_expr_fpkm_filtered and rows of design_limma
# are aligned and share the same sample IDs

fit <- lmFit(log_expr_fpkm_filtered, design_limma)
fit_ebayes <- eBayes(fit)

coef_names <- colnames(design_limma)
coef_of_interest <- grep("conditiontumor", coef_names, value = TRUE)

if (length(coef_of_interest) != 1) {
  stop("Could not uniquely identify conditiontumor coefficient in design matrix.")
}

cat("\nUsing coefficient for DE results:", coef_of_interest, "\n")
## 
## Using coefficient for DE results: conditiontumor
res_limma <- topTable(
  fit_ebayes,
  coef = coef_of_interest,
  number = Inf,
  sort.by = "P"
)

res_limma_df <- res_limma %>%
  tibble::rownames_to_column(var = "gene_id")

cat("\nPreview of limma DE results (first 6 rows):\n")
## 
## Preview of limma DE results (first 6 rows):
print(head(res_limma_df))
##   gene_id     logFC  AveExpr         t      P.Value    adj.P.Val        B
## 1 DEFB130 -3.941893 2.762654 -26.13245 4.375131e-36 8.694261e-32 69.12741
## 2 CCDC177 -3.530750 2.783185 -24.28741 3.237243e-34 3.216525e-30 65.34287
## 3  UGT2A1 -2.229926 3.056062 -16.86054 2.320028e-25 1.536786e-21 46.66534
## 4    LCN6 -3.531396 4.148372 -15.60920 1.289529e-23 5.960418e-20 42.87248
## 5    KLK9 -3.420770 2.929151 -15.56331 1.499703e-23 5.960418e-20 42.72939
## 6 RNASE11 -3.419164 1.860627 -14.30615 1.039826e-21 3.031911e-18 38.69712

3.3 Post-processing, Annotation, and Export

alpha <- 0.05
logfc_cutoff <- 1

res_limma_df <- res_limma_df %>%
  dplyr::mutate(
    regulation = dplyr::case_when(
      adj.P.Val < alpha & logFC >= logfc_cutoff ~ "up",
      adj.P.Val < alpha & logFC <= -logfc_cutoff ~ "down",
      TRUE ~ "ns"
    )
  )

cat("\nSummary of regulation categories (up/down/ns):\n")
## 
## Summary of regulation categories (up/down/ns):
print(table(res_limma_df$regulation))
## 
##  down    ns    up 
##   881 18054   937
if (!dir.exists("results")) dir.create("results", recursive = TRUE)

readr::write_csv(res_limma_df, file.path("results", "de_results_limma_fpkm.csv"))

cat("\nSaved limma DE results to results/de_results_limma_fpkm.csv\n")
## 
## Saved limma DE results to results/de_results_limma_fpkm.csv
top_up <- res_limma_df %>%
  filter(regulation == "up") %>%
  arrange(adj.P.Val) %>%
  head(10) %>%
  select(gene_id, logFC, adj.P.Val, AveExpr)

# Get top 10 down-regulated (by adjusted p-value)
top_down <- res_limma_df %>%
  filter(regulation == "down") %>%
  arrange(adj.P.Val) %>%
  head(10) %>%
  select(gene_id, logFC, adj.P.Val, AveExpr)

cat("Top 10 UP-regulated genes:\n")
## Top 10 UP-regulated genes:
print(top_up)
##     gene_id    logFC    adj.P.Val  AveExpr
## 1     MYBL2 2.850007 1.239410e-14 2.865390
## 2      E2F1 2.128108 1.706047e-13 2.752566
## 3     MMP11 3.558343 2.920096e-13 5.046576
## 4  HIST2H3C 3.146257 7.951058e-13 4.203050
## 5  HIST2H3A 3.146257 7.951058e-13 4.203050
## 6       TK1 2.461874 9.144043e-12 3.883843
## 7     PITX1 2.433285 1.372115e-11 2.912330
## 8     H2AFX 1.789663 1.984129e-11 4.592687
## 9  TMEM132A 2.254033 7.503367e-11 5.687283
## 10   NUSAP1 2.164853 1.151958e-10 3.468113
cat("\nTop 10 DOWN-regulated genes:\n")
## 
## Top 10 DOWN-regulated genes:
print(top_down)
##    gene_id     logFC    adj.P.Val  AveExpr
## 1  DEFB130 -3.941893 8.694261e-32 2.762654
## 2  CCDC177 -3.530750 3.216525e-30 2.783185
## 3   UGT2A1 -2.229926 1.536786e-21 3.056062
## 4     LCN6 -3.531396 5.960418e-20 4.148372
## 5     KLK9 -3.420770 5.960418e-20 2.929151
## 6  RNASE11 -3.419164 3.031911e-18 1.860627
## 7    MDGA2 -2.752373 3.031911e-18 3.490965
## 8     MFRP -2.229932 1.027101e-17 1.806975
## 9     SOX7 -2.941502 1.291322e-16 5.654500
## 10 TMEM239 -1.940228 1.448262e-16 2.654334
sig_genes <- res_limma_df[res_limma_df$regulation != "ns", ]

up_genes <- sig_genes %>% filter(regulation == "up")
down_genes <- sig_genes %>% filter(regulation == "down")

median_up <- median(up_genes$logFC)
range_up <- range(up_genes$logFC)
mean_up <- mean(up_genes$logFC)
sd_up <- sd(up_genes$logFC)

median_down <- median(down_genes$logFC)
range_down <- range(down_genes$logFC)
mean_down <- mean(down_genes$logFC)
sd_down <- sd(down_genes$logFC)

# Build summary table instead of many cat() calls
fc_summary <- tibble::tibble(
  Group = c("Up-regulated", "Down-regulated", "All DE (|log2FC|)"),
  N = c(nrow(up_genes), nrow(down_genes), nrow(sig_genes)),
  Mean = c(
    mean_up,
    mean_down,
    mean(abs(sig_genes$logFC))
  ),
  Median = c(
    median_up,
    median_down,
    median(abs(sig_genes$logFC))
  ),
  SD = c(
    sd_up,
    sd_down,
    sd(abs(sig_genes$logFC))
  ),
  Min = c(
    range_up[1],
    range_down[1],
    min(abs(sig_genes$logFC))
  ),
  Max = c(
    range_up[2],
    range_down[2],
    max(abs(sig_genes$logFC))
  )
) %>%
  dplyr::mutate(
    dplyr::across(
      .cols = c(Mean, Median, SD, Min, Max),
      .fns = ~round(.x, 2)
    )
  )

cat("\nFold Change Summary for Differentially Expressed Genes:\n")
## 
## Fold Change Summary for Differentially Expressed Genes:
print(fc_summary)
## # A tibble: 3 × 7
##   Group                 N  Mean Median    SD   Min   Max
##   <chr>             <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Up-regulated        937  1.4    1.26  0.42  1     3.63
## 2 Down-regulated      881 -1.43  -1.3   0.43 -3.94 -1   
## 3 All DE (|log2FC|)  1818  1.41   1.27  0.43  1     3.94
p_fc_dist <- ggplot(sig_genes, aes(x = logFC, fill = regulation)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "gray40", linewidth = 0.5) +
  geom_vline(xintercept = c(median_up, median_down),
             linetype = "dashed", color = c("red", "blue"), linewidth = 0.5) +
  theme_bw() +
  labs(
    title = "Distribution of log2 fold changes in differentially expressed genes",
    x = "log2 fold change (tumor vs normal)",
    y = "Number of genes",
    fill = "Regulation"
  ) +
  scale_fill_manual(
    values = c("down" = "#377EB8", "up" = "#E41A1C"),
    labels = c("down" = "Down-regulated (n=881)", "up" = "Up-regulated (n=937)")
  ) +
  annotate("text", x = median_up + 0.3, y = 100,
           label = paste("Median =", round(median_up, 2)),
           color = "red", hjust = 0, size = 3) +
  annotate("text", x = median_down - 0.3, y = 100,
           label = paste("Median =", round(median_down, 2)),
           color = "blue", hjust = 1, size = 3)

ggsave("figures/fc_distribution.png", p_fc_dist, width = 7, height = 5, dpi = 300)
cat("\nFold change distribution plot saved to figures/fc_distribution.png\n")
## 
## Fold change distribution plot saved to figures/fc_distribution.png
print(p_fc_dist)

4 Visualization – Volcano and MA Plots (Task 3)

In this section, we visualize the limma differential expression results obtained in Section 3. The volcano and MA plots are based on res_limma_df, which contains log2 fold changes (tumor vs normal), adjusted p-values, average log2 expression, and the regulation labels defined using the same thresholds (alpha and logfc_cutoff) as in Task 2. All results are derived from limma models fitted to log2(FPKM + 1) values.

4.1 Volcano Plot

# Visualize limma FPKM-based DE results as a volcano plot using res_limma_df, alpha, and logfc_cutoff from Section 3.3
stopifnot(exists("res_limma_df"))

volcano_df <- res_limma_df %>%
  dplyr::mutate(
    neg_log10_adjP = -log10(adj.P.Val + 1e-300)
  )

p_volcano <- ggplot(volcano_df, aes(x = logFC, y = neg_log10_adjP, color = regulation)) +
  geom_point(alpha = 0.6, size = 1) +
  theme_bw() +
  geom_vline(xintercept = c(-logfc_cutoff, logfc_cutoff), linetype = "dashed", color = "grey50") +
  geom_hline(yintercept = -log10(alpha), linetype = "dashed", color = "grey50") +
  labs(
    title = "Volcano plot: tumor vs normal (limma on log2(FPKM+1))",
    x = "log2 fold change (tumor vs normal)",
    y = "-log10(adj.P.Val)"
  )

# Optionally label a small set of top significant genes using ggrepel
if (requireNamespace("ggrepel", quietly = TRUE)) {
  top_label_df <- volcano_df %>%
    dplyr::filter(regulation != "ns") %>%
    dplyr::arrange(adj.P.Val) %>%
    dplyr::slice_head(n = 20)

  if (nrow(top_label_df) > 0) {
    p_volcano <- p_volcano +
      ggrepel::geom_text_repel(
        data = top_label_df,
        aes(label = gene_id),
        size = 2,
        max.overlaps = 50,
        box.padding = 0.3,
        point.padding = 0.2,
        segment.color = "grey70"
      )
  }
}

# Save volcano plot to figures/
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)

ggplot2::ggsave(
  filename = file.path("figures", "volcano_limma_fpkm_tumor_vs_normal.png"),
  plot = p_volcano,
  width = 6,
  height = 5,
  dpi = 300
)

cat("\nVolcano plot saved to figures/volcano_limma_fpkm_tumor_vs_normal.png\n")
## 
## Volcano plot saved to figures/volcano_limma_fpkm_tumor_vs_normal.png
print(p_volcano)

4.2 MA Plot

stopifnot(exists("res_limma_df"))

p_ma <- ggplot(res_limma_df, aes(x = AveExpr, y = logFC, color = regulation)) +
  geom_point(alpha = 0.6, size = 1) +
  theme_bw() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
  labs(
    title = "MA plot: tumor vs normal (limma on log2(FPKM+1))",
    x = "Average log2 expression (AveExpr)",
    y = "log2 fold change (tumor vs normal)"
  )

# Optionally label a few highly significant genes in the MA plot
if (requireNamespace("ggrepel", quietly = TRUE)) {
  top_ma_df <- res_limma_df %>%
    dplyr::filter(regulation != "ns") %>%
    dplyr::arrange(adj.P.Val) %>%
    dplyr::slice_head(n = 20)

  if (nrow(top_ma_df) > 0) {
    p_ma <- p_ma +
      ggrepel::geom_text_repel(
        data = top_ma_df,
        aes(label = gene_id),
        size = 2,
        max.overlaps = 50,
        box.padding = 0.3,
        point.padding = 0.2,
        segment.color = "grey70"
      )
  }
}

# Save MA plot to figures/
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)

ggplot2::ggsave(
  filename = file.path("figures", "ma_limma_fpkm_tumor_vs_normal.png"),
  plot = p_ma,
  width = 6,
  height = 5,
  dpi = 300
)

cat("\nMA plot saved to figures/ma_limma_fpkm_tumor_vs_normal.png\n")
## 
## MA plot saved to figures/ma_limma_fpkm_tumor_vs_normal.png
print(p_ma)

5 Downstream Functional / Pathway Analysis (Task 4)

In this section we use the limma FPKM-based differential expression results to construct gene sets for downstream enrichment analysis and explore which biological processes are over-represented among tumor-up and tumor-down genes.

5.1 Prepare Gene Lists

stopifnot(exists("res_limma_df"), exists("alpha"), exists("logfc_cutoff"))

cat("\nSummary of regulation categories used for gene lists (up/down/ns):\n")
## 
## Summary of regulation categories used for gene lists (up/down/ns):
print(table(res_limma_df$regulation))
## 
##  down    ns    up 
##   881 18054   937
# Background (universe) = all genes tested by limma

gene_universe <- res_limma_df$gene_id
cat("\nSize of gene universe (all tested genes):", length(gene_universe), "\n")
## 
## Size of gene universe (all tested genes): 19872
# Up- and down-regulated genes

res_limma_up_df <- res_limma_df %>%
  dplyr::filter(regulation == "up") %>%
  dplyr::arrange(adj.P.Val, dplyr::desc(abs(logFC)))

genes_up_tumor <- res_limma_up_df$gene_id

res_limma_down_df <- res_limma_df %>%
  dplyr::filter(regulation == "down") %>%
  dplyr::arrange(adj.P.Val, dplyr::desc(abs(logFC)))

genes_down_tumor <- res_limma_down_df$gene_id

cat("\nNumber of significantly up-regulated genes (tumor > normal):",
    length(genes_up_tumor), "\n")
## 
## Number of significantly up-regulated genes (tumor > normal): 937
cat("Number of significantly down-regulated genes (tumor < normal):",
    length(genes_down_tumor), "\n")
## Number of significantly down-regulated genes (tumor < normal): 881
# All DE genes (up + down)

res_limma_de_df <- res_limma_df %>%
  dplyr::filter(regulation != "ns") %>%
  dplyr::arrange(adj.P.Val, dplyr::desc(abs(logFC)))

genes_de_all <- res_limma_de_df$gene_id

cat("Number of all DE genes (up + down):", length(genes_de_all), "\n")
## Number of all DE genes (up + down): 1818
# Ranked list for GSEA-style methods (signed -log10 p-value)

rank_stat <- with(res_limma_df, sign(logFC) * -log10(P.Value + 1e-300))

rank_df_unique <- res_limma_df %>%
  dplyr::mutate(rank_stat = rank_stat) %>%
  dplyr::arrange(P.Value) %>%
  dplyr::distinct(gene_id, .keep_all = TRUE)

ranked_genes <- rank_df_unique$rank_stat
names(ranked_genes) <- rank_df_unique$gene_id

cat("\nConstructed ranked gene list for GSEA-style analysis with",
    length(ranked_genes), "unique genes.\n")
## 
## Constructed ranked gene list for GSEA-style analysis with 19872 unique genes.
# Persist lists to results/

if (!dir.exists("results")) dir.create("results", recursive = TRUE)

readr::write_tsv(
  tibble::tibble(gene_id = genes_up_tumor),
  file.path("results", "gene_list_up_tumor.tsv")
)

readr::write_tsv(
  tibble::tibble(gene_id = genes_down_tumor),
  file.path("results", "gene_list_down_tumor.tsv")
)

readr::write_tsv(
  tibble::tibble(gene_id = genes_de_all),
  file.path("results", "gene_list_all_de.tsv")
)

readr::write_tsv(
  tibble::tibble(
    gene_id = names(ranked_genes),
    rank_stat = as.numeric(ranked_genes)
  ),
  file.path("results", "ranked_gene_list_gsea_style.tsv")
)

cat("\nSaved gene lists and ranked list to results/:\n")
## 
## Saved gene lists and ranked list to results/:
cat(" - gene_list_up_tumor.tsv\n")
##  - gene_list_up_tumor.tsv
cat(" - gene_list_down_tumor.tsv\n")
##  - gene_list_down_tumor.tsv
cat(" - gene_list_all_de.tsv\n")
##  - gene_list_all_de.tsv
cat(" - ranked_gene_list_gsea_style.tsv\n")
##  - ranked_gene_list_gsea_style.tsv

5.2 Run Enrichment Analysis

stopifnot(exists("gene_universe"), exists("genes_up_tumor"), exists("genes_down_tumor"))

cat("\nGene universe size (background for enrichment):", length(gene_universe), "\n")
## 
## Gene universe size (background for enrichment): 19872
cat("Number of up-regulated genes for enrichment:", length(genes_up_tumor), "\n")
## Number of up-regulated genes for enrichment: 937
cat("Number of down-regulated genes for enrichment:", length(genes_down_tumor), "\n")
## Number of down-regulated genes for enrichment: 881
cat("\nAt this stage, gene identifiers are gene symbols (Symbol column).\n")
## 
## At this stage, gene identifiers are gene symbols (Symbol column).
cat("They will be mapped to Entrez IDs for GO/KEGG in the enrichment step.\n")
## They will be mapped to Entrez IDs for GO/KEGG in the enrichment step.
go_enrich_up <- NULL
kegg_enrich_up <- NULL

go_enrich_down <- NULL
kegg_enrich_down <- NULL

5.3 Summarize and Visualize Enrichment

symbol_to_entrez <- function(symbols) {
  mapped <- suppressMessages(AnnotationDbi::select(
    org.Hs.eg.db,
    keys = unique(symbols),
    keytype = "SYMBOL",
    columns = "ENTREZID"
  ))
  mapped <- mapped[!is.na(mapped$ENTREZID), ]
  mapped_unique <- mapped[!duplicated(mapped$SYMBOL), ]
  entrez_vec <- mapped_unique$ENTREZID
  names(entrez_vec) <- mapped_unique$SYMBOL
  entrez_vec
}

stopifnot(exists("genes_up_tumor"), exists("genes_down_tumor"), exists("gene_universe"))

entrez_up   <- symbol_to_entrez(genes_up_tumor)
entrez_down <- symbol_to_entrez(genes_down_tumor)
entrez_univ <- symbol_to_entrez(gene_universe)

cat("\nMapped", length(genes_up_tumor), "up-regulated symbols to",
    length(entrez_up), "Entrez IDs.\n")
## 
## Mapped 937 up-regulated symbols to 813 Entrez IDs.
cat("Mapped", length(genes_down_tumor), "down-regulated symbols to",
    length(entrez_down), "Entrez IDs.\n")
## Mapped 881 down-regulated symbols to 827 Entrez IDs.
cat("Mapped", length(gene_universe), "universe symbols to",
    length(entrez_univ), "Entrez IDs.\n")
## Mapped 19872 universe symbols to 17584 Entrez IDs.
ego_up <- enrichGO(
  gene          = unname(entrez_up),
  universe      = unname(entrez_univ),
  OrgDb         = org.Hs.eg.db,
  keyType       = "ENTREZID",
  ont           = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.05,
  qvalueCutoff  = 0.05,
  readable      = TRUE
)

ego_down <- enrichGO(
  gene          = unname(entrez_down),
  universe      = unname(entrez_univ),
  OrgDb         = org.Hs.eg.db,
  keyType       = "ENTREZID",
  ont           = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.05,
  qvalueCutoff  = 0.05,
  readable      = TRUE
)

go_enrich_up <<- ego_up
go_enrich_down <<- ego_down

cat("\nTop enriched GO BP terms for up-regulated genes (if any):\n")
## 
## Top enriched GO BP terms for up-regulated genes (if any):
if (!is.null(ego_up) && nrow(as.data.frame(ego_up)) > 0) {
  print(head(as.data.frame(ego_up)[, c("ID", "Description", "GeneRatio", "p.adjust")], 10))
} else {
  cat("No significant GO BP enrichment for up-regulated genes at the chosen cutoff.\n")
}
##                    ID                          Description GeneRatio
## GO:0000819 GO:0000819         sister chromatid segregation    45/764
## GO:0098813 GO:0098813       nuclear chromosome segregation    52/764
## GO:0000070 GO:0000070 mitotic sister chromatid segregation    39/764
## GO:0007059 GO:0007059               chromosome segregation    60/764
## GO:0000280 GO:0000280                     nuclear division    61/764
## GO:0140014 GO:0140014             mitotic nuclear division    47/764
## GO:0048285 GO:0048285                    organelle fission    64/764
## GO:1905818 GO:1905818  regulation of chromosome separation    23/764
## GO:0051983 GO:0051983 regulation of chromosome segregation    30/764
## GO:0051304 GO:0051304                chromosome separation    23/764
##                p.adjust
## GO:0000819 6.403606e-13
## GO:0098813 9.626750e-13
## GO:0000070 2.750025e-12
## GO:0007059 2.750025e-12
## GO:0000280 7.528216e-12
## GO:0140014 1.084738e-11
## GO:0048285 1.934652e-11
## GO:1905818 1.467130e-10
## GO:0051983 1.533466e-10
## GO:0051304 7.620322e-10
cat("\nTop enriched GO BP terms for down-regulated genes (if any):\n")
## 
## Top enriched GO BP terms for down-regulated genes (if any):
if (!is.null(ego_down) && nrow(as.data.frame(ego_down)) > 0) {
  print(head(as.data.frame(ego_down)[, c("ID", "Description", "GeneRatio", "p.adjust")], 10))
} else {
  cat("No significant GO BP enrichment for down-regulated genes at the chosen cutoff.\n")
}
##                    ID                          Description GeneRatio
## GO:0008544 GO:0008544                epidermis development    54/751
## GO:0043588 GO:0043588                     skin development    46/751
## GO:0030216 GO:0030216         keratinocyte differentiation    30/751
## GO:0003012 GO:0003012                muscle system process    52/751
## GO:0044706 GO:0044706 multi-multicellular organism process    33/751
## GO:0044703 GO:0044703  multi-organism reproductive process    32/751
## GO:0007411 GO:0007411                        axon guidance    34/751
## GO:0097485 GO:0097485           neuron projection guidance    34/751
## GO:0009913 GO:0009913       epidermal cell differentiation    34/751
## GO:0050919 GO:0050919                  negative chemotaxis    14/751
##                p.adjust
## GO:0008544 2.498270e-09
## GO:0043588 2.434240e-08
## GO:0030216 5.978338e-07
## GO:0003012 1.659600e-06
## GO:0044706 2.090798e-06
## GO:0044703 2.535934e-06
## GO:0007411 5.447733e-06
## GO:0097485 5.447733e-06
## GO:0009913 5.447733e-06
## GO:0050919 5.447733e-06
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)

if (!is.null(ego_up) && nrow(as.data.frame(ego_up)) > 0) {
  df_up <- as.data.frame(ego_up) %>%
    dplyr::slice_head(n = 15) %>%
    dplyr::mutate(label = forcats::fct_reorder(Description, -log10(p.adjust)))

  p_go_up <- ggplot(df_up, aes(x = label, y = -log10(p.adjust))) +
    geom_col() +
    coord_flip() +
    theme_bw() +
    labs(x = NULL, y = "-log10(adj. p-value)", title = "GO BP enrichment (Up-regulated genes)")

  ggsave(
    filename = file.path("figures", "go_bp_enrichment_up_tumor_barplot.png"),
    plot = p_go_up,
    width = 8,
    height = 6,
    dpi = 300
  )

  cat("\nGO BP enrichment bar plot for up-regulated genes saved to figures/go_bp_enrichment_up_tumor_barplot.png\n")

  print(p_go_up)
}
## 
## GO BP enrichment bar plot for up-regulated genes saved to figures/go_bp_enrichment_up_tumor_barplot.png

if (!is.null(ego_down) && nrow(as.data.frame(ego_down)) > 0) {
  df_down <- as.data.frame(ego_down) %>%
    dplyr::slice_head(n = 15) %>%
    dplyr::mutate(label = forcats::fct_reorder(Description, -log10(p.adjust)))

  p_go_down <- ggplot(df_down, aes(x = label, y = -log10(p.adjust))) +
    geom_col() +
    coord_flip() +
    theme_bw() +
    labs(x = NULL, y = "-log10(adj. p-value)", title = "GO BP enrichment (Down-regulated genes)")

  ggsave(
    filename = file.path("figures", "go_bp_enrichment_down_tumor_barplot.png"),
    plot = p_go_down,
    width = 8,
    height = 6,
    dpi = 300
  )

  cat("\nGO BP enrichment bar plot for down-regulated genes saved to figures/go_bp_enrichment_down_tumor_barplot.png\n")

  print(p_go_down)
}
## 
## GO BP enrichment bar plot for down-regulated genes saved to figures/go_bp_enrichment_down_tumor_barplot.png

cat("\nEnrichment bar plots (if any significant terms) saved to figures/:\n")
## 
## Enrichment bar plots (if any significant terms) saved to figures/:
cat(" - go_bp_enrichment_up_tumor_barplot.png\n")
##  - go_bp_enrichment_up_tumor_barplot.png
cat(" - go_bp_enrichment_down_tumor_barplot.png\n")
##  - go_bp_enrichment_down_tumor_barplot.png

5.4 GSEA-style Enrichment (Ranked Gene List)

# GSEA-style GO BP enrichment on the ranked gene list using clusterProfiler::gseGO.
# This complements the ORA analysis on up/down lists by using the full ranking.

stopifnot(exists("rank_df_unique"))

# Map symbols in the ranked table to Entrez IDs
symbol_to_entrez <- function(symbols) {
  mapped <- suppressMessages(AnnotationDbi::select(
    org.Hs.eg.db,
    keys = unique(symbols),
    keytype = "SYMBOL",
    columns = "ENTREZID"
  ))
  mapped <- mapped[!is.na(mapped$ENTREZID), ]
  mapped_unique <- mapped[!duplicated(mapped$SYMBOL), ]
  entrez_vec <- mapped_unique$ENTREZID
  names(entrez_vec) <- mapped_unique$SYMBOL
  entrez_vec
}

entrez_map <- symbol_to_entrez(rank_df_unique$gene_id)

# Join Entrez IDs back to the ranked table
rank_gsea_df <- rank_df_unique %>%
  dplyr::mutate(ENTREZID = unname(entrez_map[gene_id])) %>%
  dplyr::filter(!is.na(ENTREZID)) %>%
  dplyr::distinct(ENTREZID, .keep_all = TRUE)

# Build the named numeric vector for gseGO: names = ENTREZ IDs, values = rank_stat
ranked_entrez <- rank_gsea_df$rank_stat
names(ranked_entrez) <- rank_gsea_df$ENTREZID

cat("\nGSEA input: ranked list with", length(ranked_entrez), "Entrez IDs after mapping.\n")
## 
## GSEA input: ranked list with 17584 Entrez IDs after mapping.
# Sort decreasing as required by gseGO
ranked_entrez <- sort(ranked_entrez, decreasing = TRUE)

# Run GSEA-style GO BP enrichment
set.seed(123)

gsea_go_bp <- gseGO(
  geneList     = ranked_entrez,
  OrgDb        = org.Hs.eg.db,
  keyType      = "ENTREZID",
  ont          = "BP",
  minGSSize    = 10,
  maxGSSize    = 500,
  pvalueCutoff = 0.05,
  verbose      = FALSE
)

gsea_go_bp_res <<- gsea_go_bp

cat("\nTop GSEA GO BP terms (if any):\n")
## 
## Top GSEA GO BP terms (if any):
if (!is.null(gsea_go_bp) && nrow(as.data.frame(gsea_go_bp)) > 0) {
  print(head(as.data.frame(gsea_go_bp)[, c("ID", "Description", "enrichmentScore", "NES", "p.adjust")], 10))
} else {
  cat("No significant GO BP gene sets detected by GSEA at the chosen cutoff.\n")
}
##                    ID
## GO:0000070 GO:0000070
## GO:0000819 GO:0000819
## GO:0098813 GO:0098813
## GO:0140014 GO:0140014
## GO:0051304 GO:0051304
## GO:0007059 GO:0007059
## GO:1905818 GO:1905818
## GO:0051983 GO:0051983
## GO:0090307 GO:0090307
## GO:0050911 GO:0050911
##                                                                       Description
## GO:0000070                                   mitotic sister chromatid segregation
## GO:0000819                                           sister chromatid segregation
## GO:0098813                                         nuclear chromosome segregation
## GO:0140014                                               mitotic nuclear division
## GO:0051304                                                  chromosome separation
## GO:0007059                                                 chromosome segregation
## GO:1905818                                    regulation of chromosome separation
## GO:0051983                                   regulation of chromosome segregation
## GO:0090307                                               mitotic spindle assembly
## GO:0050911 detection of chemical stimulus involved in sensory perception of smell
##            enrichmentScore       NES     p.adjust
## GO:0000070       0.6994580  2.926457 8.834286e-09
## GO:0000819       0.6672771  2.821553 8.834286e-09
## GO:0098813       0.6379839  2.808452 8.834286e-09
## GO:0140014       0.6244930  2.710041 8.834286e-09
## GO:0051304       0.7392602  2.691644 8.834286e-09
## GO:0007059       0.6005625  2.684687 8.834286e-09
## GO:1905818       0.7426014  2.666426 8.834286e-09
## GO:0051983       0.6677471  2.648960 8.834286e-09
## GO:0090307       0.7359029  2.629404 8.834286e-09
## GO:0050911      -0.6518602 -2.611752 8.834286e-09
# Simple dot plot of top GSEA GO BP terms
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)

if (!is.null(gsea_go_bp) && nrow(as.data.frame(gsea_go_bp)) > 0) {
  gsea_df <- as.data.frame(gsea_go_bp) %>%
    dplyr::slice_head(n = 15) %>%
    dplyr::mutate(label = forcats::fct_reorder(Description, NES))

  p_gsea <- ggplot(gsea_df, aes(x = NES, y = label, size = setSize, color = -log10(p.adjust))) +
    geom_point() +
    theme_bw() +
    labs(
      title = "GSEA GO BP (ranked gene list)",
      x = "Normalized enrichment score (NES)",
      y = NULL,
      color = "-log10(adj. p)",
      size  = "Gene set size"
    )

  ggsave(
    filename = file.path("figures", "gsea_go_bp_ranked_genes_dotplot.png"),
    plot = p_gsea,
    width = 8,
    height = 6,
    dpi = 300
  )

  cat("\nGSEA GO BP dot plot saved to figures/gsea_go_bp_ranked_genes_dotplot.png\n")

  print(p_gsea)
}
## 
## GSEA GO BP dot plot saved to figures/gsea_go_bp_ranked_genes_dotplot.png

5.5 Methodological Comparison: ORA vs GSEA

n_up_genes <- length(genes_up_tumor)
n_down_genes <- length(genes_down_tumor)

n_up_terms <- if (!is.null(go_enrich_up) && nrow(as.data.frame(go_enrich_up)) > 0) {
  nrow(as.data.frame(go_enrich_up)[as.data.frame(go_enrich_up)$p.adjust < 0.05, ])
} else {
  0
}

n_down_terms <- if (!is.null(go_enrich_down) && nrow(as.data.frame(go_enrich_down)) > 0) {
  nrow(as.data.frame(go_enrich_down)[as.data.frame(go_enrich_down)$p.adjust < 0.05, ])
} else {
  0
}

example_up_terms <- if (!is.null(go_enrich_up) && nrow(as.data.frame(go_enrich_up)) > 0) {
  top_up_terms <- head(as.data.frame(go_enrich_up)$Description, 3)
  paste(top_up_terms, collapse = "\n")
} else {
  "No significant terms"
}

example_down_terms <- if (!is.null(go_enrich_down) && nrow(as.data.frame(go_enrich_down)) > 0) {
  top_down_terms <- head(as.data.frame(go_enrich_down)$Description, 3)
  paste(top_down_terms, collapse = "\n")
} else {
  "No significant terms"
}

summary_data <- tibble::tibble(
  Direction        = c("Up-regulated", "Down-regulated"),
  Biological_Theme = c(
    "Cell Cycle & Proliferation",
    "Tissue Development & Differentiation"
  ),
  Example_Terms    = c(example_up_terms, example_down_terms),
  n_genes          = c(n_up_genes, n_down_genes),
  n_terms          = c(n_up_terms, n_down_terms)
)

cat("\nSummary data for enrichment comparison:\n")
## 
## Summary data for enrichment comparison:
print(summary_data)
## # A tibble: 2 × 5
##   Direction      Biological_Theme                  Example_Terms n_genes n_terms
##   <chr>          <chr>                             <chr>           <int>   <int>
## 1 Up-regulated   Cell Cycle & Proliferation        "sister chro…     937     325
## 2 Down-regulated Tissue Development & Differentia… "epidermis d…     881     411
p_summary <- ggplot(summary_data, aes(x = Direction, y = n_genes, fill = Biological_Theme)) +
  geom_bar(stat = "identity", alpha = 0.8) +
  geom_text(aes(label = paste("n =", n_genes)), vjust = -0.5) +
  theme_bw() +
  labs(title = "Functional Enrichment Summary",
       x = "Regulation Direction",
       y = "Number of Genes",
       fill = "Biological Theme") +
  scale_fill_manual(values = c("#E41A1C", "#377EB8"))

ggsave("figures/enrichment_summary.png", p_summary, width = 7, height = 5, dpi = 300)

cat("\nEnrichment summary plot saved to figures/enrichment_summary.png\n")
## 
## Enrichment summary plot saved to figures/enrichment_summary.png
print(p_summary)

6 Results Summary

In this analysis we used FPKM-normalized expression values from GSE183947 to compare breast tumor and normal breast tissue using limma on log2(FPKM+1), and then carried out GO Biological Process enrichment, including both over-representation (ORA) and GSEA-style analyses.