This document implements the analysis plan for the CSE763 Advanced Bioinformatics assignment using RNA-Seq data from the GEO dataset GSE183947. The project uses FPKM-normalized expression values as the primary quantitative input (for data understanding, visualization, and FPKM-based differential expression), together with sample metadata and gene annotation. Although some raw read-count information exists for this dataset, it does not form a single, complete counts matrix covering all tumor and normal samples, and therefore is not used for the main differential expression analysis.
The main objectives are:
fpkm_path <- file.path("data", "GSE183947_fpkm.csv")
# Read FPKM file; first column has no header, so readr names it ...1 by default
fpkm_raw <- readr::read_csv(fpkm_path, show_col_types = FALSE)
# Rename the unnamed first column to match the annotation ID column
if (colnames(fpkm_raw)[1] == "...1") {
colnames(fpkm_raw)[1] <- "Symbol"
}
cat("FPKM data dimensions (rows x columns):\n")
## FPKM data dimensions (rows x columns):
print(dim(fpkm_raw))
## [1] 20246 61
cat("\nFirst few columns:\n")
##
## First few columns:
print(colnames(fpkm_raw)[1:min(10, ncol(fpkm_raw))])
## [1] "Symbol" "CA.102548" "CA.104338" "CA.105094" "CA.109745"
## [6] "CA.1906415" "CA.1912627" "CA.1924346" "CA.1926760" "CA.1927842"
cat("\nPreview of the data:\n")
##
## Preview of the data:
print(dplyr::slice_head(fpkm_raw, n = 5))
## # A tibble: 5 × 61
## Symbol CA.102548 CA.104338 CA.105094 CA.109745 CA.1906415 CA.1912627
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TSPAN6 0.93 1.97 0 5.45 4.52 4.75
## 2 TNMD 0 0 0 0 0 0
## 3 DPM1 0 0.43 0 3.43 8.45 8.53
## 4 SCYL3 5.78 5.17 8.76 4.58 7.2 6.03
## 5 C1orf112 2.83 6.26 3.37 6.24 5.16 13.7
## # ℹ 54 more variables: CA.1924346 <dbl>, CA.1926760 <dbl>, CA.1927842 <dbl>,
## # CA.1933414 <dbl>, CA.1940640 <dbl>, CA.2004407 <dbl>, CA.2005288 <dbl>,
## # CA.2006047 <dbl>, CA.2008260 <dbl>, CA.2009329 <dbl>, CA.2009381 <dbl>,
## # CA.2009850 <dbl>, CA.2017611 <dbl>, CA.2039179 <dbl>, CA.2040686 <dbl>,
## # CA.2045012 <dbl>, CA.2046297 <dbl>, CA.348981 <dbl>, CA.354300 <dbl>,
## # CA.359448 <dbl>, CA.94377 <dbl>, CA.98389 <dbl>, CA.98475 <dbl>,
## # CA.99145 <dbl>, CAP.102548 <dbl>, CAP.104338 <dbl>, CAP.105094 <dbl>, …
# Build expression matrix and colData
first_gene_id_col <- colnames(fpkm_raw)[1]
fpkm_long <- fpkm_raw
gene_ids <- fpkm_long[[1]]
expr_fpkm <- as.matrix(dplyr::select(fpkm_long, -1))
rownames(expr_fpkm) <- gene_ids
sample_ids <- colnames(expr_fpkm)
col_data <- tibble::tibble(
sample_id = sample_ids
)
series_path <- file.path("data", "GSE183947_series_matrix.txt")
series_lines <- readr::read_lines(series_path)
extract_sample_row <- function(lines, prefix) {
line <- lines[startsWith(lines, prefix)]
if (length(line) == 0) return(NULL)
line <- sub(paste0("^", prefix), "", line[1])
parts <- strsplit(trimws(line), "\t")[[1]]
parts <- parts[nzchar(parts)]
parts <- gsub('^"|"$', "", parts)
}
sample_titles <- extract_sample_row(series_lines, "!Sample_title")
sample_access <- extract_sample_row(series_lines, "!Sample_geo_accession")
sample_descr <- extract_sample_row(series_lines, "!Sample_description")
char_tissue <- series_lines[startsWith(series_lines, "!Sample_characteristics_ch1\ttissue:") |
grepl("tissue: breast tumor|tissue: normal breast tissue", series_lines, fixed = TRUE)]
if (length(char_tissue) == 0) {
char_tissue_vals <- NULL
} else {
char_tissue_vals <- extract_sample_row(series_lines, "!Sample_characteristics_ch1")
}
char_donor_line <- series_lines[grepl("!Sample_characteristics_ch1\tdonor:", series_lines) |
grepl("donor:", series_lines, fixed = TRUE)]
if (length(char_donor_line) == 0) {
donor_vals <- NULL
} else {
donor_vals <- extract_sample_row(series_lines, "!Sample_characteristics_ch1")
}
char_lines <- series_lines[startsWith(series_lines, "!Sample_characteristics_ch1")]
char_tissue_line <- char_lines[grepl("tissue:", char_lines)][1]
if (!is.na(char_tissue_line)) {
tissue_vals <- strsplit(sub("^!Sample_characteristics_ch1", "", char_tissue_line), "\t")[[1]]
tissue_vals <- tissue_vals[nzchar(tissue_vals)]
tissue_vals <- gsub('^"|"$', "", tissue_vals)
} else {
tissue_vals <- NULL
}
char_donor_line2 <- char_lines[grepl("donor:", char_lines)][1]
if (!is.na(char_donor_line2)) {
donor_vals2 <- strsplit(sub("^!Sample_characteristics_ch1", "", char_donor_line2), "\t")[[1]]
donor_vals2 <- donor_vals2[nzchar(donor_vals2)]
donor_vals2 <- gsub('^"|"$', "", donor_vals2)
} else {
donor_vals2 <- NULL
}
if (!is.null(sample_descr)) {
meta_df <- tibble::tibble(
sample_description = sample_descr
)
if (!is.null(tissue_vals) && length(tissue_vals) == nrow(meta_df)) {
condition_raw <- tissue_vals
condition <- dplyr::case_when(
grepl("breast tumor", condition_raw) ~ "tumor",
grepl("normal breast tissue", condition_raw) ~ "normal",
TRUE ~ NA_character_
)
meta_df$condition <- factor(condition, levels = c("normal", "tumor"))
}
if (!is.null(donor_vals2) && length(donor_vals2) == nrow(meta_df)) {
donor_clean <- sub("^donor:\\s*", "", donor_vals2)
meta_df$patient_id <- as.character(donor_clean)
}
col_data <- col_data %>%
dplyr::left_join(meta_df, by = c("sample_id" = "sample_description"))
}
cat("\nEnriched sample metadata (colData) preview:\n")
##
## Enriched sample metadata (colData) preview:
print(col_data)
## # A tibble: 60 × 3
## sample_id condition patient_id
## <chr> <fct> <chr>
## 1 CA.102548 tumor 102548
## 2 CA.104338 tumor 104338
## 3 CA.105094 tumor 105094
## 4 CA.109745 tumor 109745
## 5 CA.1906415 tumor 1906415
## 6 CA.1912627 tumor 1912627
## 7 CA.1924346 tumor 1924346
## 8 CA.1926760 tumor 1926760
## 9 CA.1927842 tumor 1927842
## 10 CA.1933414 tumor 1933414
## # ℹ 50 more rows
log_fpkm <- log2(expr_fpkm + 1)
tumor_samples <- col_data$sample_id[col_data$condition == "tumor"]
normal_samples <- col_data$sample_id[col_data$condition == "normal"]
overall_stats <- c(
mean = mean(log_fpkm),
median = median(log_fpkm),
sd = sd(log_fpkm),
min = min(log_fpkm),
max = max(log_fpkm)
)
tumor_stats <- c(
mean = mean(log_fpkm[, tumor_samples]),
median = median(log_fpkm[, tumor_samples]),
sd = sd(log_fpkm[, tumor_samples]),
min = min(log_fpkm[, tumor_samples]),
max = max(log_fpkm[, tumor_samples])
)
normal_stats <- c(
mean = mean(log_fpkm[, normal_samples]),
median = median(log_fpkm[, normal_samples]),
sd = sd(log_fpkm[, normal_samples]),
min = min(log_fpkm[, normal_samples]),
max = max(log_fpkm[, normal_samples])
)
# Genes expressed (>1 FPKM in at least one sample)
genes_expressed_all <- sum(rowSums(expr_fpkm > 1) > 0)
genes_expressed_tumor <- sum(rowSums(expr_fpkm[, tumor_samples] > 1) > 0)
genes_expressed_normal <- sum(rowSums(expr_fpkm[, normal_samples] > 1) > 0)
expr_summary_table <- tibble::tibble(
Statistic = c("Mean", "Median", "SD", "Min", "Max", "Genes expressed (>1 FPKM)"),
Overall = c(
overall_stats["mean"],
overall_stats["median"],
overall_stats["sd"],
overall_stats["min"],
overall_stats["max"],
genes_expressed_all
),
Tumor = c(
tumor_stats["mean"],
tumor_stats["median"],
tumor_stats["sd"],
tumor_stats["min"],
tumor_stats["max"],
genes_expressed_tumor
),
Normal = c(
normal_stats["mean"],
normal_stats["median"],
normal_stats["sd"],
normal_stats["min"],
normal_stats["max"],
genes_expressed_normal
)
)
cat("\nExpression Summary\n")
##
## Expression Summary
print(expr_summary_table)
## # A tibble: 6 × 4
## Statistic Overall Tumor Normal
## <chr> <dbl> <dbl> <dbl>
## 1 Mean 2.85 2.83 2.86
## 2 Median 2.89 2.85 2.91
## 3 SD 2.12 2.16 2.09
## 4 Min 0 0 0
## 5 Max 16.7 16.7 16.2
## 6 Genes expressed (>1 FPKM) 20028 17977 19989
cat("Number of genes:", nrow(expr_fpkm), "\n")
## Number of genes: 20246
cat("Number of samples:", ncol(expr_fpkm), "\n\n")
## Number of samples: 60
cat("Condition breakdown (tumor vs normal):\n")
## Condition breakdown (tumor vs normal):
print(table(col_data$condition, useNA = "ifany"))
##
## normal tumor
## 30 30
cat("\nNumber of samples per patient (first few):\n")
##
## Number of samples per patient (first few):
print(table(col_data$patient_id, useNA = "ifany")[1:10])
##
## 102548 104338 105094 109745 1906415 1912627 1924346 1926760 1927842 1933414
## 2 2 2 2 2 2 2 2 2 2
log_expr_fpkm <- log2(expr_fpkm + 1)
set.seed(123)
if (nrow(log_expr_fpkm) > 5000) {
gene_idx <- sample(seq_len(nrow(log_expr_fpkm)), 5000)
} else {
gene_idx <- seq_len(nrow(log_expr_fpkm))
}
log_expr_subset <- log_expr_fpkm[gene_idx, , drop = FALSE]
expr_long <- as.data.frame(log_expr_subset) %>%
tibble::rownames_to_column(var = "gene_id") %>%
tidyr::pivot_longer(
cols = -gene_id,
names_to = "sample_id",
values_to = "log2_fpkm"
) %>%
dplyr::left_join(col_data, by = "sample_id")
p_box <- ggplot(expr_long, aes(x = sample_id, y = log2_fpkm, fill = condition)) +
geom_boxplot(outlier.size = 0.2, lwd = 0.2) +
theme_bw(base_size = 8) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
legend.position = "bottom"
) +
labs(
title = "Per-sample distribution of log2(FPKM + 1)",
x = "Sample",
y = "log2(FPKM + 1)"
)
# Save EDA boxplot to figures/
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)
ggplot2::ggsave(
filename = file.path("figures", "eda_boxplot_log2fpkm_by_sample.png"),
plot = p_box,
width = 8,
height = 4,
dpi = 300
)
cat("\nEDA boxplot saved to figures/eda_boxplot_log2fpkm_by_sample.png\n")
##
## EDA boxplot saved to figures/eda_boxplot_log2fpkm_by_sample.png
print(p_box)
var_genes <- apply(log_expr_fpkm, 1, var, na.rm = TRUE)
log_expr_pca <- log_expr_fpkm[var_genes > 0, , drop = FALSE]
pca_res <- prcomp(t(log_expr_pca), scale. = TRUE)
pca_df <- as.data.frame(pca_res$x[, 1:2]) %>%
dplyr::mutate(sample_id = rownames(pca_res$x)) %>%
dplyr::left_join(col_data, by = "sample_id")
var_explained <- (pca_res$sdev^2) / sum(pca_res$sdev^2)
p_pca <- ggplot(pca_df, aes(x = PC1, y = PC2, color = condition)) +
geom_point(size = 2) +
theme_bw() +
labs(
title = "PCA of samples based on log2(FPKM + 1)",
x = paste0("PC1 (", round(var_explained[1] * 100, 1), "% var)"),
y = paste0("PC2 (", round(var_explained[2] * 100, 1), "% var)")
)
# Save PCA plot to figures/
ggplot2::ggsave(
filename = file.path("figures", "eda_pca_log2fpkm_samples.png"),
plot = p_pca,
width = 6,
height = 5,
dpi = 300
)
cat("\nPCA plot saved to figures/eda_pca_log2fpkm_samples.png\n")
##
## PCA plot saved to figures/eda_pca_log2fpkm_samples.png
print(p_pca)
annot_path <- file.path("data", "Human.GRCh38.p13.annot.tsv")
annot_head <- readr::read_tsv(annot_path, n_max = 5, show_col_types = FALSE)
cat("Annotation (preview) dimensions (rows x columns):\n")
## Annotation (preview) dimensions (rows x columns):
print(dim(annot_head))
## [1] 5 18
cat("\nFirst few column names in annotation file:\n")
##
## First few column names in annotation file:
print(colnames(annot_head))
## [1] "GeneID" "Symbol" "Description" "Synonyms"
## [5] "GeneType" "EnsemblGeneID" "Status" "ChrAcc"
## [9] "ChrStart" "ChrStop" "Orientation" "Length"
## [13] "GOFunctionID" "GOProcessID" "GOComponentID" "GOFunction"
## [17] "GOProcess" "GOComponent"
cat("\nPreview of annotation (first 5 rows):\n")
##
## Preview of annotation (first 5 rows):
print(annot_head)
## # A tibble: 5 × 18
## GeneID Symbol Description Synonyms GeneType EnsemblGeneID Status ChrAcc
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 100287102 DDX11L1 DEAD/H-box… <NA> pseudo ENSG00000290… active NC_00…
## 2 653635 WASH7P WASP famil… FAM39F|… pseudo <NA> active NC_00…
## 3 102466751 MIR6859-1 microRNA 6… hsa-mir… ncRNA ENSG00000278… active NC_00…
## 4 107985730 MIR1302-2… MIR1302-2 … <NA> ncRNA <NA> active NC_00…
## 5 100302278 MIR1302-2 microRNA 1… MIRN130… ncRNA ENSG00000284… active NC_00…
## # ℹ 10 more variables: ChrStart <dbl>, ChrStop <dbl>, Orientation <chr>,
## # Length <dbl>, GOFunctionID <lgl>, GOProcessID <chr>, GOComponentID <lgl>,
## # GOFunction <lgl>, GOProcess <chr>, GOComponent <lgl>
if (!"Symbol" %in% colnames(annot_head)) {
stop("Expected a 'Symbol' column in Human.GRCh38.p13.annot.tsv but did not find one.")
}
annot_full <- readr::read_tsv(annot_path, show_col_types = FALSE)
symbol_ids <- annot_full[["Symbol"]]
if ("Synonyms" %in% colnames(annot_full)) {
synonyms_raw <- annot_full[["Synonyms"]]
synonyms_tokens <- strsplit(ifelse(is.na(synonyms_raw), "", synonyms_raw), "[;,|]")
synonyms_tokens <- lapply(synonyms_tokens, function(x) trimws(x[nzchar(x)]))
} else {
synonyms_tokens <- vector("list", length = nrow(annot_full))
}
fpkm_gene_ids <- rownames(expr_fpkm)
match_by_symbol <- fpkm_gene_ids %in% symbol_ids
match_by_synonym <- logical(length(fpkm_gene_ids))
if (length(synonyms_tokens) > 0) {
all_synonyms <- unique(unlist(synonyms_tokens))
all_synonyms <- all_synonyms[nzchar(all_synonyms)]
synonym_set <- unique(all_synonyms)
match_by_synonym[!match_by_symbol] <- fpkm_gene_ids[!match_by_symbol] %in% synonym_set
}
annot_matches_any <- match_by_symbol | match_by_synonym
cat("\nNumber of FPKM genes:", length(fpkm_gene_ids), "\n")
##
## Number of FPKM genes: 20246
cat("Number of FPKM genes found by Symbol:", sum(match_by_symbol), "\n")
## Number of FPKM genes found by Symbol: 17855
cat("Number of additional FPKM genes found only via Synonyms:",
sum(match_by_synonym & !match_by_symbol), "\n")
## Number of additional FPKM genes found only via Synonyms: 1309
cat("Total number of FPKM genes with annotation (Symbol or Synonyms):",
sum(annot_matches_any), "\n")
## Total number of FPKM genes with annotation (Symbol or Synonyms): 19164
cat("Number of FPKM genes without annotation (neither Symbol nor Synonyms):",
length(fpkm_gene_ids) - sum(annot_matches_any), "\n")
## Number of FPKM genes without annotation (neither Symbol nor Synonyms): 1082
cat("Proportion of FPKM genes with annotation (Symbol or Synonyms):",
round(sum(annot_matches_any) / length(fpkm_gene_ids), 4), "\n")
## Proportion of FPKM genes with annotation (Symbol or Synonyms): 0.9466
missing_ids <- fpkm_gene_ids[!annot_matches_any]
if (length(missing_ids) > 0) {
cat("\nExample FPKM gene IDs not found in annotation (via Symbol or Synonyms, up to 10 shown):\n")
print(head(missing_ids, 10))
} else {
cat("\nAll FPKM gene IDs (symbols) were found in the annotation table via Symbol or Synonyms.\n")
}
##
## Example FPKM gene IDs not found in annotation (via Symbol or Synonyms, up to 10 shown):
## [1] "AC004381.6" "2-Mar" "3-Sep" "DKFZP761J1410"
## [5] "4-Sep" "AL021546.6" "RP11-268J15.5" "RP11-408E5.4"
## [9] "7-Sep" "KIAA1045"
# Set up design and filter genes for limma
stopifnot(exists("col_data"), exists("expr_fpkm"))
col_data_limma <- col_data
if (!is.null(col_data_limma$condition)) {
col_data_limma$condition <- as.factor(col_data_limma$condition)
if ("normal" %in% levels(col_data_limma$condition)) {
col_data_limma$condition <- relevel(col_data_limma$condition, ref = "normal")
}
}
if (!is.null(col_data_limma$patient_id)) {
col_data_limma$patient_id <- as.factor(col_data_limma$patient_id)
}
cat("Condition breakdown in col_data_limma (tumor vs normal):\n")
## Condition breakdown in col_data_limma (tumor vs normal):
print(table(col_data_limma$condition, useNA = "ifany"))
##
## normal tumor
## 30 30
cat("\nSamples per patient_id (first few):\n")
##
## Samples per patient_id (first few):
print(head(table(col_data_limma$patient_id)))
##
## 102548 104338 105094 109745 1906415 1912627
## 2 2 2 2 2 2
log_expr_fpkm <- log2(expr_fpkm + 1)
cat("\nExpression matrix dimensions for limma (genes x samples):\n")
##
## Expression matrix dimensions for limma (genes x samples):
print(dim(log_expr_fpkm))
## [1] 20246 60
min_log_expr <- 1
keep_genes_limma <- apply(log_expr_fpkm, 1, function(x) {
sum(x >= min_log_expr, na.rm = TRUE) >= 2
})
cat("\nNumber of genes before filtering:", nrow(log_expr_fpkm), "\n")
##
## Number of genes before filtering: 20246
cat("Number of genes after filtering (log2(FPKM+1) >=", min_log_expr, "in >= 2 samples):",
sum(keep_genes_limma), "\n")
## Number of genes after filtering (log2(FPKM+1) >= 1 in >= 2 samples): 19872
log_expr_fpkm_filtered <- log_expr_fpkm[keep_genes_limma, , drop = FALSE]
# Align samples between expression matrix and colData
common_samples <- intersect(colnames(log_expr_fpkm_filtered), col_data_limma$sample_id)
if (length(common_samples) == 0) {
stop("No overlapping samples between expression matrix and col_data_limma.")
}
log_expr_fpkm_filtered <- log_expr_fpkm_filtered[, common_samples, drop = FALSE]
col_data_limma <- col_data_limma[match(common_samples, col_data_limma$sample_id), , drop = FALSE]
cat("\nNumber of samples used in DE:", ncol(log_expr_fpkm_filtered), "\n")
##
## Number of samples used in DE: 60
# Ensure rownames of col_data_limma are sample IDs for model.matrix
col_data_limma <- as.data.frame(col_data_limma)
rownames(col_data_limma) <- col_data_limma$sample_id
# Sanity checks for alignment before building design matrix
cat("\nSanity check – first few sample IDs in expression and col_data_limma:\n")
##
## Sanity check – first few sample IDs in expression and col_data_limma:
print(head(colnames(log_expr_fpkm_filtered)))
## [1] "CA.102548" "CA.104338" "CA.105094" "CA.109745" "CA.1906415"
## [6] "CA.1912627"
print(head(col_data_limma$sample_id))
## [1] "CA.102548" "CA.104338" "CA.105094" "CA.109745" "CA.1906415"
## [6] "CA.1912627"
cat("\nCondition breakdown after alignment (should match earlier):\n")
##
## Condition breakdown after alignment (should match earlier):
print(table(col_data_limma$condition, useNA = "ifany"))
##
## normal tumor
## 30 30
cat("\nAll column names of expression match rownames of col_data_limma? ",
all(colnames(log_expr_fpkm_filtered) == rownames(col_data_limma)), "\n")
##
## All column names of expression match rownames of col_data_limma? TRUE
use_paired <- !is.null(col_data_limma$patient_id) &&
length(unique(col_data_limma$patient_id)) > 1 &&
all(table(col_data_limma$patient_id) >= 2)
if (use_paired) {
design_limma <- model.matrix(~ patient_id + condition, data = col_data_limma)
cat("\nUsing paired design for limma: ~ patient_id + condition\n")
} else {
design_limma <- model.matrix(~ condition, data = col_data_limma)
cat("\nUsing unpaired design for limma: ~ condition\n")
}
##
## Using unpaired design for limma: ~ condition
cat("Design matrix dimensions (samples x coefficients):\n")
## Design matrix dimensions (samples x coefficients):
print(dim(design_limma))
## [1] 60 2
# Final sanity check: rownames of design should match expression columns
stopifnot(identical(colnames(log_expr_fpkm_filtered), rownames(design_limma)))
colnames(design_limma)
## [1] "(Intercept)" "conditiontumor"
# At this point, columns of log_expr_fpkm_filtered and rows of design_limma
# are aligned and share the same sample IDs
fit <- lmFit(log_expr_fpkm_filtered, design_limma)
fit_ebayes <- eBayes(fit)
coef_names <- colnames(design_limma)
coef_of_interest <- grep("conditiontumor", coef_names, value = TRUE)
if (length(coef_of_interest) != 1) {
stop("Could not uniquely identify conditiontumor coefficient in design matrix.")
}
cat("\nUsing coefficient for DE results:", coef_of_interest, "\n")
##
## Using coefficient for DE results: conditiontumor
res_limma <- topTable(
fit_ebayes,
coef = coef_of_interest,
number = Inf,
sort.by = "P"
)
res_limma_df <- res_limma %>%
tibble::rownames_to_column(var = "gene_id")
cat("\nPreview of limma DE results (first 6 rows):\n")
##
## Preview of limma DE results (first 6 rows):
print(head(res_limma_df))
## gene_id logFC AveExpr t P.Value adj.P.Val B
## 1 DEFB130 -3.941893 2.762654 -26.13245 4.375131e-36 8.694261e-32 69.12741
## 2 CCDC177 -3.530750 2.783185 -24.28741 3.237243e-34 3.216525e-30 65.34287
## 3 UGT2A1 -2.229926 3.056062 -16.86054 2.320028e-25 1.536786e-21 46.66534
## 4 LCN6 -3.531396 4.148372 -15.60920 1.289529e-23 5.960418e-20 42.87248
## 5 KLK9 -3.420770 2.929151 -15.56331 1.499703e-23 5.960418e-20 42.72939
## 6 RNASE11 -3.419164 1.860627 -14.30615 1.039826e-21 3.031911e-18 38.69712
alpha <- 0.05
logfc_cutoff <- 1
res_limma_df <- res_limma_df %>%
dplyr::mutate(
regulation = dplyr::case_when(
adj.P.Val < alpha & logFC >= logfc_cutoff ~ "up",
adj.P.Val < alpha & logFC <= -logfc_cutoff ~ "down",
TRUE ~ "ns"
)
)
cat("\nSummary of regulation categories (up/down/ns):\n")
##
## Summary of regulation categories (up/down/ns):
print(table(res_limma_df$regulation))
##
## down ns up
## 881 18054 937
if (!dir.exists("results")) dir.create("results", recursive = TRUE)
readr::write_csv(res_limma_df, file.path("results", "de_results_limma_fpkm.csv"))
cat("\nSaved limma DE results to results/de_results_limma_fpkm.csv\n")
##
## Saved limma DE results to results/de_results_limma_fpkm.csv
top_up <- res_limma_df %>%
filter(regulation == "up") %>%
arrange(adj.P.Val) %>%
head(10) %>%
select(gene_id, logFC, adj.P.Val, AveExpr)
# Get top 10 down-regulated (by adjusted p-value)
top_down <- res_limma_df %>%
filter(regulation == "down") %>%
arrange(adj.P.Val) %>%
head(10) %>%
select(gene_id, logFC, adj.P.Val, AveExpr)
cat("Top 10 UP-regulated genes:\n")
## Top 10 UP-regulated genes:
print(top_up)
## gene_id logFC adj.P.Val AveExpr
## 1 MYBL2 2.850007 1.239410e-14 2.865390
## 2 E2F1 2.128108 1.706047e-13 2.752566
## 3 MMP11 3.558343 2.920096e-13 5.046576
## 4 HIST2H3C 3.146257 7.951058e-13 4.203050
## 5 HIST2H3A 3.146257 7.951058e-13 4.203050
## 6 TK1 2.461874 9.144043e-12 3.883843
## 7 PITX1 2.433285 1.372115e-11 2.912330
## 8 H2AFX 1.789663 1.984129e-11 4.592687
## 9 TMEM132A 2.254033 7.503367e-11 5.687283
## 10 NUSAP1 2.164853 1.151958e-10 3.468113
cat("\nTop 10 DOWN-regulated genes:\n")
##
## Top 10 DOWN-regulated genes:
print(top_down)
## gene_id logFC adj.P.Val AveExpr
## 1 DEFB130 -3.941893 8.694261e-32 2.762654
## 2 CCDC177 -3.530750 3.216525e-30 2.783185
## 3 UGT2A1 -2.229926 1.536786e-21 3.056062
## 4 LCN6 -3.531396 5.960418e-20 4.148372
## 5 KLK9 -3.420770 5.960418e-20 2.929151
## 6 RNASE11 -3.419164 3.031911e-18 1.860627
## 7 MDGA2 -2.752373 3.031911e-18 3.490965
## 8 MFRP -2.229932 1.027101e-17 1.806975
## 9 SOX7 -2.941502 1.291322e-16 5.654500
## 10 TMEM239 -1.940228 1.448262e-16 2.654334
sig_genes <- res_limma_df[res_limma_df$regulation != "ns", ]
up_genes <- sig_genes %>% filter(regulation == "up")
down_genes <- sig_genes %>% filter(regulation == "down")
median_up <- median(up_genes$logFC)
range_up <- range(up_genes$logFC)
mean_up <- mean(up_genes$logFC)
sd_up <- sd(up_genes$logFC)
median_down <- median(down_genes$logFC)
range_down <- range(down_genes$logFC)
mean_down <- mean(down_genes$logFC)
sd_down <- sd(down_genes$logFC)
# Build summary table instead of many cat() calls
fc_summary <- tibble::tibble(
Group = c("Up-regulated", "Down-regulated", "All DE (|log2FC|)"),
N = c(nrow(up_genes), nrow(down_genes), nrow(sig_genes)),
Mean = c(
mean_up,
mean_down,
mean(abs(sig_genes$logFC))
),
Median = c(
median_up,
median_down,
median(abs(sig_genes$logFC))
),
SD = c(
sd_up,
sd_down,
sd(abs(sig_genes$logFC))
),
Min = c(
range_up[1],
range_down[1],
min(abs(sig_genes$logFC))
),
Max = c(
range_up[2],
range_down[2],
max(abs(sig_genes$logFC))
)
) %>%
dplyr::mutate(
dplyr::across(
.cols = c(Mean, Median, SD, Min, Max),
.fns = ~round(.x, 2)
)
)
cat("\nFold Change Summary for Differentially Expressed Genes:\n")
##
## Fold Change Summary for Differentially Expressed Genes:
print(fc_summary)
## # A tibble: 3 × 7
## Group N Mean Median SD Min Max
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Up-regulated 937 1.4 1.26 0.42 1 3.63
## 2 Down-regulated 881 -1.43 -1.3 0.43 -3.94 -1
## 3 All DE (|log2FC|) 1818 1.41 1.27 0.43 1 3.94
p_fc_dist <- ggplot(sig_genes, aes(x = logFC, fill = regulation)) +
geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "gray40", linewidth = 0.5) +
geom_vline(xintercept = c(median_up, median_down),
linetype = "dashed", color = c("red", "blue"), linewidth = 0.5) +
theme_bw() +
labs(
title = "Distribution of log2 fold changes in differentially expressed genes",
x = "log2 fold change (tumor vs normal)",
y = "Number of genes",
fill = "Regulation"
) +
scale_fill_manual(
values = c("down" = "#377EB8", "up" = "#E41A1C"),
labels = c("down" = "Down-regulated (n=881)", "up" = "Up-regulated (n=937)")
) +
annotate("text", x = median_up + 0.3, y = 100,
label = paste("Median =", round(median_up, 2)),
color = "red", hjust = 0, size = 3) +
annotate("text", x = median_down - 0.3, y = 100,
label = paste("Median =", round(median_down, 2)),
color = "blue", hjust = 1, size = 3)
ggsave("figures/fc_distribution.png", p_fc_dist, width = 7, height = 5, dpi = 300)
cat("\nFold change distribution plot saved to figures/fc_distribution.png\n")
##
## Fold change distribution plot saved to figures/fc_distribution.png
print(p_fc_dist)
In this section, we visualize the limma differential expression
results obtained in Section 3. The volcano and MA plots are based on
res_limma_df, which contains log2 fold changes (tumor vs
normal), adjusted p-values, average log2 expression, and the
regulation labels defined using the same thresholds
(alpha and logfc_cutoff) as in Task 2. All
results are derived from limma models fitted to log2(FPKM + 1)
values.
# Visualize limma FPKM-based DE results as a volcano plot using res_limma_df, alpha, and logfc_cutoff from Section 3.3
stopifnot(exists("res_limma_df"))
volcano_df <- res_limma_df %>%
dplyr::mutate(
neg_log10_adjP = -log10(adj.P.Val + 1e-300)
)
p_volcano <- ggplot(volcano_df, aes(x = logFC, y = neg_log10_adjP, color = regulation)) +
geom_point(alpha = 0.6, size = 1) +
theme_bw() +
geom_vline(xintercept = c(-logfc_cutoff, logfc_cutoff), linetype = "dashed", color = "grey50") +
geom_hline(yintercept = -log10(alpha), linetype = "dashed", color = "grey50") +
labs(
title = "Volcano plot: tumor vs normal (limma on log2(FPKM+1))",
x = "log2 fold change (tumor vs normal)",
y = "-log10(adj.P.Val)"
)
# Optionally label a small set of top significant genes using ggrepel
if (requireNamespace("ggrepel", quietly = TRUE)) {
top_label_df <- volcano_df %>%
dplyr::filter(regulation != "ns") %>%
dplyr::arrange(adj.P.Val) %>%
dplyr::slice_head(n = 20)
if (nrow(top_label_df) > 0) {
p_volcano <- p_volcano +
ggrepel::geom_text_repel(
data = top_label_df,
aes(label = gene_id),
size = 2,
max.overlaps = 50,
box.padding = 0.3,
point.padding = 0.2,
segment.color = "grey70"
)
}
}
# Save volcano plot to figures/
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)
ggplot2::ggsave(
filename = file.path("figures", "volcano_limma_fpkm_tumor_vs_normal.png"),
plot = p_volcano,
width = 6,
height = 5,
dpi = 300
)
cat("\nVolcano plot saved to figures/volcano_limma_fpkm_tumor_vs_normal.png\n")
##
## Volcano plot saved to figures/volcano_limma_fpkm_tumor_vs_normal.png
print(p_volcano)
stopifnot(exists("res_limma_df"))
p_ma <- ggplot(res_limma_df, aes(x = AveExpr, y = logFC, color = regulation)) +
geom_point(alpha = 0.6, size = 1) +
theme_bw() +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
labs(
title = "MA plot: tumor vs normal (limma on log2(FPKM+1))",
x = "Average log2 expression (AveExpr)",
y = "log2 fold change (tumor vs normal)"
)
# Optionally label a few highly significant genes in the MA plot
if (requireNamespace("ggrepel", quietly = TRUE)) {
top_ma_df <- res_limma_df %>%
dplyr::filter(regulation != "ns") %>%
dplyr::arrange(adj.P.Val) %>%
dplyr::slice_head(n = 20)
if (nrow(top_ma_df) > 0) {
p_ma <- p_ma +
ggrepel::geom_text_repel(
data = top_ma_df,
aes(label = gene_id),
size = 2,
max.overlaps = 50,
box.padding = 0.3,
point.padding = 0.2,
segment.color = "grey70"
)
}
}
# Save MA plot to figures/
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)
ggplot2::ggsave(
filename = file.path("figures", "ma_limma_fpkm_tumor_vs_normal.png"),
plot = p_ma,
width = 6,
height = 5,
dpi = 300
)
cat("\nMA plot saved to figures/ma_limma_fpkm_tumor_vs_normal.png\n")
##
## MA plot saved to figures/ma_limma_fpkm_tumor_vs_normal.png
print(p_ma)
In this section we use the limma FPKM-based differential expression results to construct gene sets for downstream enrichment analysis and explore which biological processes are over-represented among tumor-up and tumor-down genes.
stopifnot(exists("res_limma_df"), exists("alpha"), exists("logfc_cutoff"))
cat("\nSummary of regulation categories used for gene lists (up/down/ns):\n")
##
## Summary of regulation categories used for gene lists (up/down/ns):
print(table(res_limma_df$regulation))
##
## down ns up
## 881 18054 937
# Background (universe) = all genes tested by limma
gene_universe <- res_limma_df$gene_id
cat("\nSize of gene universe (all tested genes):", length(gene_universe), "\n")
##
## Size of gene universe (all tested genes): 19872
# Up- and down-regulated genes
res_limma_up_df <- res_limma_df %>%
dplyr::filter(regulation == "up") %>%
dplyr::arrange(adj.P.Val, dplyr::desc(abs(logFC)))
genes_up_tumor <- res_limma_up_df$gene_id
res_limma_down_df <- res_limma_df %>%
dplyr::filter(regulation == "down") %>%
dplyr::arrange(adj.P.Val, dplyr::desc(abs(logFC)))
genes_down_tumor <- res_limma_down_df$gene_id
cat("\nNumber of significantly up-regulated genes (tumor > normal):",
length(genes_up_tumor), "\n")
##
## Number of significantly up-regulated genes (tumor > normal): 937
cat("Number of significantly down-regulated genes (tumor < normal):",
length(genes_down_tumor), "\n")
## Number of significantly down-regulated genes (tumor < normal): 881
# All DE genes (up + down)
res_limma_de_df <- res_limma_df %>%
dplyr::filter(regulation != "ns") %>%
dplyr::arrange(adj.P.Val, dplyr::desc(abs(logFC)))
genes_de_all <- res_limma_de_df$gene_id
cat("Number of all DE genes (up + down):", length(genes_de_all), "\n")
## Number of all DE genes (up + down): 1818
# Ranked list for GSEA-style methods (signed -log10 p-value)
rank_stat <- with(res_limma_df, sign(logFC) * -log10(P.Value + 1e-300))
rank_df_unique <- res_limma_df %>%
dplyr::mutate(rank_stat = rank_stat) %>%
dplyr::arrange(P.Value) %>%
dplyr::distinct(gene_id, .keep_all = TRUE)
ranked_genes <- rank_df_unique$rank_stat
names(ranked_genes) <- rank_df_unique$gene_id
cat("\nConstructed ranked gene list for GSEA-style analysis with",
length(ranked_genes), "unique genes.\n")
##
## Constructed ranked gene list for GSEA-style analysis with 19872 unique genes.
# Persist lists to results/
if (!dir.exists("results")) dir.create("results", recursive = TRUE)
readr::write_tsv(
tibble::tibble(gene_id = genes_up_tumor),
file.path("results", "gene_list_up_tumor.tsv")
)
readr::write_tsv(
tibble::tibble(gene_id = genes_down_tumor),
file.path("results", "gene_list_down_tumor.tsv")
)
readr::write_tsv(
tibble::tibble(gene_id = genes_de_all),
file.path("results", "gene_list_all_de.tsv")
)
readr::write_tsv(
tibble::tibble(
gene_id = names(ranked_genes),
rank_stat = as.numeric(ranked_genes)
),
file.path("results", "ranked_gene_list_gsea_style.tsv")
)
cat("\nSaved gene lists and ranked list to results/:\n")
##
## Saved gene lists and ranked list to results/:
cat(" - gene_list_up_tumor.tsv\n")
## - gene_list_up_tumor.tsv
cat(" - gene_list_down_tumor.tsv\n")
## - gene_list_down_tumor.tsv
cat(" - gene_list_all_de.tsv\n")
## - gene_list_all_de.tsv
cat(" - ranked_gene_list_gsea_style.tsv\n")
## - ranked_gene_list_gsea_style.tsv
stopifnot(exists("gene_universe"), exists("genes_up_tumor"), exists("genes_down_tumor"))
cat("\nGene universe size (background for enrichment):", length(gene_universe), "\n")
##
## Gene universe size (background for enrichment): 19872
cat("Number of up-regulated genes for enrichment:", length(genes_up_tumor), "\n")
## Number of up-regulated genes for enrichment: 937
cat("Number of down-regulated genes for enrichment:", length(genes_down_tumor), "\n")
## Number of down-regulated genes for enrichment: 881
cat("\nAt this stage, gene identifiers are gene symbols (Symbol column).\n")
##
## At this stage, gene identifiers are gene symbols (Symbol column).
cat("They will be mapped to Entrez IDs for GO/KEGG in the enrichment step.\n")
## They will be mapped to Entrez IDs for GO/KEGG in the enrichment step.
go_enrich_up <- NULL
kegg_enrich_up <- NULL
go_enrich_down <- NULL
kegg_enrich_down <- NULL
symbol_to_entrez <- function(symbols) {
mapped <- suppressMessages(AnnotationDbi::select(
org.Hs.eg.db,
keys = unique(symbols),
keytype = "SYMBOL",
columns = "ENTREZID"
))
mapped <- mapped[!is.na(mapped$ENTREZID), ]
mapped_unique <- mapped[!duplicated(mapped$SYMBOL), ]
entrez_vec <- mapped_unique$ENTREZID
names(entrez_vec) <- mapped_unique$SYMBOL
entrez_vec
}
stopifnot(exists("genes_up_tumor"), exists("genes_down_tumor"), exists("gene_universe"))
entrez_up <- symbol_to_entrez(genes_up_tumor)
entrez_down <- symbol_to_entrez(genes_down_tumor)
entrez_univ <- symbol_to_entrez(gene_universe)
cat("\nMapped", length(genes_up_tumor), "up-regulated symbols to",
length(entrez_up), "Entrez IDs.\n")
##
## Mapped 937 up-regulated symbols to 813 Entrez IDs.
cat("Mapped", length(genes_down_tumor), "down-regulated symbols to",
length(entrez_down), "Entrez IDs.\n")
## Mapped 881 down-regulated symbols to 827 Entrez IDs.
cat("Mapped", length(gene_universe), "universe symbols to",
length(entrez_univ), "Entrez IDs.\n")
## Mapped 19872 universe symbols to 17584 Entrez IDs.
ego_up <- enrichGO(
gene = unname(entrez_up),
universe = unname(entrez_univ),
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE
)
ego_down <- enrichGO(
gene = unname(entrez_down),
universe = unname(entrez_univ),
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE
)
go_enrich_up <<- ego_up
go_enrich_down <<- ego_down
cat("\nTop enriched GO BP terms for up-regulated genes (if any):\n")
##
## Top enriched GO BP terms for up-regulated genes (if any):
if (!is.null(ego_up) && nrow(as.data.frame(ego_up)) > 0) {
print(head(as.data.frame(ego_up)[, c("ID", "Description", "GeneRatio", "p.adjust")], 10))
} else {
cat("No significant GO BP enrichment for up-regulated genes at the chosen cutoff.\n")
}
## ID Description GeneRatio
## GO:0000819 GO:0000819 sister chromatid segregation 45/764
## GO:0098813 GO:0098813 nuclear chromosome segregation 52/764
## GO:0000070 GO:0000070 mitotic sister chromatid segregation 39/764
## GO:0007059 GO:0007059 chromosome segregation 60/764
## GO:0000280 GO:0000280 nuclear division 61/764
## GO:0140014 GO:0140014 mitotic nuclear division 47/764
## GO:0048285 GO:0048285 organelle fission 64/764
## GO:1905818 GO:1905818 regulation of chromosome separation 23/764
## GO:0051983 GO:0051983 regulation of chromosome segregation 30/764
## GO:0051304 GO:0051304 chromosome separation 23/764
## p.adjust
## GO:0000819 6.403606e-13
## GO:0098813 9.626750e-13
## GO:0000070 2.750025e-12
## GO:0007059 2.750025e-12
## GO:0000280 7.528216e-12
## GO:0140014 1.084738e-11
## GO:0048285 1.934652e-11
## GO:1905818 1.467130e-10
## GO:0051983 1.533466e-10
## GO:0051304 7.620322e-10
cat("\nTop enriched GO BP terms for down-regulated genes (if any):\n")
##
## Top enriched GO BP terms for down-regulated genes (if any):
if (!is.null(ego_down) && nrow(as.data.frame(ego_down)) > 0) {
print(head(as.data.frame(ego_down)[, c("ID", "Description", "GeneRatio", "p.adjust")], 10))
} else {
cat("No significant GO BP enrichment for down-regulated genes at the chosen cutoff.\n")
}
## ID Description GeneRatio
## GO:0008544 GO:0008544 epidermis development 54/751
## GO:0043588 GO:0043588 skin development 46/751
## GO:0030216 GO:0030216 keratinocyte differentiation 30/751
## GO:0003012 GO:0003012 muscle system process 52/751
## GO:0044706 GO:0044706 multi-multicellular organism process 33/751
## GO:0044703 GO:0044703 multi-organism reproductive process 32/751
## GO:0007411 GO:0007411 axon guidance 34/751
## GO:0097485 GO:0097485 neuron projection guidance 34/751
## GO:0009913 GO:0009913 epidermal cell differentiation 34/751
## GO:0050919 GO:0050919 negative chemotaxis 14/751
## p.adjust
## GO:0008544 2.498270e-09
## GO:0043588 2.434240e-08
## GO:0030216 5.978338e-07
## GO:0003012 1.659600e-06
## GO:0044706 2.090798e-06
## GO:0044703 2.535934e-06
## GO:0007411 5.447733e-06
## GO:0097485 5.447733e-06
## GO:0009913 5.447733e-06
## GO:0050919 5.447733e-06
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)
if (!is.null(ego_up) && nrow(as.data.frame(ego_up)) > 0) {
df_up <- as.data.frame(ego_up) %>%
dplyr::slice_head(n = 15) %>%
dplyr::mutate(label = forcats::fct_reorder(Description, -log10(p.adjust)))
p_go_up <- ggplot(df_up, aes(x = label, y = -log10(p.adjust))) +
geom_col() +
coord_flip() +
theme_bw() +
labs(x = NULL, y = "-log10(adj. p-value)", title = "GO BP enrichment (Up-regulated genes)")
ggsave(
filename = file.path("figures", "go_bp_enrichment_up_tumor_barplot.png"),
plot = p_go_up,
width = 8,
height = 6,
dpi = 300
)
cat("\nGO BP enrichment bar plot for up-regulated genes saved to figures/go_bp_enrichment_up_tumor_barplot.png\n")
print(p_go_up)
}
##
## GO BP enrichment bar plot for up-regulated genes saved to figures/go_bp_enrichment_up_tumor_barplot.png
if (!is.null(ego_down) && nrow(as.data.frame(ego_down)) > 0) {
df_down <- as.data.frame(ego_down) %>%
dplyr::slice_head(n = 15) %>%
dplyr::mutate(label = forcats::fct_reorder(Description, -log10(p.adjust)))
p_go_down <- ggplot(df_down, aes(x = label, y = -log10(p.adjust))) +
geom_col() +
coord_flip() +
theme_bw() +
labs(x = NULL, y = "-log10(adj. p-value)", title = "GO BP enrichment (Down-regulated genes)")
ggsave(
filename = file.path("figures", "go_bp_enrichment_down_tumor_barplot.png"),
plot = p_go_down,
width = 8,
height = 6,
dpi = 300
)
cat("\nGO BP enrichment bar plot for down-regulated genes saved to figures/go_bp_enrichment_down_tumor_barplot.png\n")
print(p_go_down)
}
##
## GO BP enrichment bar plot for down-regulated genes saved to figures/go_bp_enrichment_down_tumor_barplot.png
cat("\nEnrichment bar plots (if any significant terms) saved to figures/:\n")
##
## Enrichment bar plots (if any significant terms) saved to figures/:
cat(" - go_bp_enrichment_up_tumor_barplot.png\n")
## - go_bp_enrichment_up_tumor_barplot.png
cat(" - go_bp_enrichment_down_tumor_barplot.png\n")
## - go_bp_enrichment_down_tumor_barplot.png
# GSEA-style GO BP enrichment on the ranked gene list using clusterProfiler::gseGO.
# This complements the ORA analysis on up/down lists by using the full ranking.
stopifnot(exists("rank_df_unique"))
# Map symbols in the ranked table to Entrez IDs
symbol_to_entrez <- function(symbols) {
mapped <- suppressMessages(AnnotationDbi::select(
org.Hs.eg.db,
keys = unique(symbols),
keytype = "SYMBOL",
columns = "ENTREZID"
))
mapped <- mapped[!is.na(mapped$ENTREZID), ]
mapped_unique <- mapped[!duplicated(mapped$SYMBOL), ]
entrez_vec <- mapped_unique$ENTREZID
names(entrez_vec) <- mapped_unique$SYMBOL
entrez_vec
}
entrez_map <- symbol_to_entrez(rank_df_unique$gene_id)
# Join Entrez IDs back to the ranked table
rank_gsea_df <- rank_df_unique %>%
dplyr::mutate(ENTREZID = unname(entrez_map[gene_id])) %>%
dplyr::filter(!is.na(ENTREZID)) %>%
dplyr::distinct(ENTREZID, .keep_all = TRUE)
# Build the named numeric vector for gseGO: names = ENTREZ IDs, values = rank_stat
ranked_entrez <- rank_gsea_df$rank_stat
names(ranked_entrez) <- rank_gsea_df$ENTREZID
cat("\nGSEA input: ranked list with", length(ranked_entrez), "Entrez IDs after mapping.\n")
##
## GSEA input: ranked list with 17584 Entrez IDs after mapping.
# Sort decreasing as required by gseGO
ranked_entrez <- sort(ranked_entrez, decreasing = TRUE)
# Run GSEA-style GO BP enrichment
set.seed(123)
gsea_go_bp <- gseGO(
geneList = ranked_entrez,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP",
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = FALSE
)
gsea_go_bp_res <<- gsea_go_bp
cat("\nTop GSEA GO BP terms (if any):\n")
##
## Top GSEA GO BP terms (if any):
if (!is.null(gsea_go_bp) && nrow(as.data.frame(gsea_go_bp)) > 0) {
print(head(as.data.frame(gsea_go_bp)[, c("ID", "Description", "enrichmentScore", "NES", "p.adjust")], 10))
} else {
cat("No significant GO BP gene sets detected by GSEA at the chosen cutoff.\n")
}
## ID
## GO:0000070 GO:0000070
## GO:0000819 GO:0000819
## GO:0098813 GO:0098813
## GO:0140014 GO:0140014
## GO:0051304 GO:0051304
## GO:0007059 GO:0007059
## GO:1905818 GO:1905818
## GO:0051983 GO:0051983
## GO:0090307 GO:0090307
## GO:0050911 GO:0050911
## Description
## GO:0000070 mitotic sister chromatid segregation
## GO:0000819 sister chromatid segregation
## GO:0098813 nuclear chromosome segregation
## GO:0140014 mitotic nuclear division
## GO:0051304 chromosome separation
## GO:0007059 chromosome segregation
## GO:1905818 regulation of chromosome separation
## GO:0051983 regulation of chromosome segregation
## GO:0090307 mitotic spindle assembly
## GO:0050911 detection of chemical stimulus involved in sensory perception of smell
## enrichmentScore NES p.adjust
## GO:0000070 0.6994580 2.926457 8.834286e-09
## GO:0000819 0.6672771 2.821553 8.834286e-09
## GO:0098813 0.6379839 2.808452 8.834286e-09
## GO:0140014 0.6244930 2.710041 8.834286e-09
## GO:0051304 0.7392602 2.691644 8.834286e-09
## GO:0007059 0.6005625 2.684687 8.834286e-09
## GO:1905818 0.7426014 2.666426 8.834286e-09
## GO:0051983 0.6677471 2.648960 8.834286e-09
## GO:0090307 0.7359029 2.629404 8.834286e-09
## GO:0050911 -0.6518602 -2.611752 8.834286e-09
# Simple dot plot of top GSEA GO BP terms
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)
if (!is.null(gsea_go_bp) && nrow(as.data.frame(gsea_go_bp)) > 0) {
gsea_df <- as.data.frame(gsea_go_bp) %>%
dplyr::slice_head(n = 15) %>%
dplyr::mutate(label = forcats::fct_reorder(Description, NES))
p_gsea <- ggplot(gsea_df, aes(x = NES, y = label, size = setSize, color = -log10(p.adjust))) +
geom_point() +
theme_bw() +
labs(
title = "GSEA GO BP (ranked gene list)",
x = "Normalized enrichment score (NES)",
y = NULL,
color = "-log10(adj. p)",
size = "Gene set size"
)
ggsave(
filename = file.path("figures", "gsea_go_bp_ranked_genes_dotplot.png"),
plot = p_gsea,
width = 8,
height = 6,
dpi = 300
)
cat("\nGSEA GO BP dot plot saved to figures/gsea_go_bp_ranked_genes_dotplot.png\n")
print(p_gsea)
}
##
## GSEA GO BP dot plot saved to figures/gsea_go_bp_ranked_genes_dotplot.png
n_up_genes <- length(genes_up_tumor)
n_down_genes <- length(genes_down_tumor)
n_up_terms <- if (!is.null(go_enrich_up) && nrow(as.data.frame(go_enrich_up)) > 0) {
nrow(as.data.frame(go_enrich_up)[as.data.frame(go_enrich_up)$p.adjust < 0.05, ])
} else {
0
}
n_down_terms <- if (!is.null(go_enrich_down) && nrow(as.data.frame(go_enrich_down)) > 0) {
nrow(as.data.frame(go_enrich_down)[as.data.frame(go_enrich_down)$p.adjust < 0.05, ])
} else {
0
}
example_up_terms <- if (!is.null(go_enrich_up) && nrow(as.data.frame(go_enrich_up)) > 0) {
top_up_terms <- head(as.data.frame(go_enrich_up)$Description, 3)
paste(top_up_terms, collapse = "\n")
} else {
"No significant terms"
}
example_down_terms <- if (!is.null(go_enrich_down) && nrow(as.data.frame(go_enrich_down)) > 0) {
top_down_terms <- head(as.data.frame(go_enrich_down)$Description, 3)
paste(top_down_terms, collapse = "\n")
} else {
"No significant terms"
}
summary_data <- tibble::tibble(
Direction = c("Up-regulated", "Down-regulated"),
Biological_Theme = c(
"Cell Cycle & Proliferation",
"Tissue Development & Differentiation"
),
Example_Terms = c(example_up_terms, example_down_terms),
n_genes = c(n_up_genes, n_down_genes),
n_terms = c(n_up_terms, n_down_terms)
)
cat("\nSummary data for enrichment comparison:\n")
##
## Summary data for enrichment comparison:
print(summary_data)
## # A tibble: 2 × 5
## Direction Biological_Theme Example_Terms n_genes n_terms
## <chr> <chr> <chr> <int> <int>
## 1 Up-regulated Cell Cycle & Proliferation "sister chro… 937 325
## 2 Down-regulated Tissue Development & Differentia… "epidermis d… 881 411
p_summary <- ggplot(summary_data, aes(x = Direction, y = n_genes, fill = Biological_Theme)) +
geom_bar(stat = "identity", alpha = 0.8) +
geom_text(aes(label = paste("n =", n_genes)), vjust = -0.5) +
theme_bw() +
labs(title = "Functional Enrichment Summary",
x = "Regulation Direction",
y = "Number of Genes",
fill = "Biological Theme") +
scale_fill_manual(values = c("#E41A1C", "#377EB8"))
ggsave("figures/enrichment_summary.png", p_summary, width = 7, height = 5, dpi = 300)
cat("\nEnrichment summary plot saved to figures/enrichment_summary.png\n")
##
## Enrichment summary plot saved to figures/enrichment_summary.png
print(p_summary)
In this analysis we used FPKM-normalized expression values from GSE183947 to compare breast tumor and normal breast tissue using limma on log2(FPKM+1), and then carried out GO Biological Process enrichment, including both over-representation (ORA) and GSEA-style analyses.
condition (normal vs tumor) and patient_id for
a paired-aware design.results/de_results_limma_fpkm.csv.genes_up_tumor,
genes_down_tumor, an all-DE list, and a ranked gene list
and exported all lists to results/.figures/, summarizing the dominant functional
themes.