This document implements the analysis plan for the CSE763 Advanced Bioinformatics assignment using RNA-Seq data from the GEO dataset GSE183947. The project uses FPKM-normalized expression values as the primary quantitative input (for data understanding, visualization, and FPKM-based differential expression), together with sample metadata and gene annotation. Although some raw read-count information exists for this dataset, it does not form a single, complete counts matrix covering all tumor and normal samples, and therefore is not used for the main differential expression analysis.
The main objectives are:
fpkm_path <- file.path("data", "GSE183947_fpkm.csv")
# Read FPKM file; first column has no header, so readr names it ...1 by default
fpkm_raw <- readr::read_csv(fpkm_path, show_col_types = FALSE)
# Rename the unnamed first column to match the annotation ID column
if (colnames(fpkm_raw)[1] == "...1") {
colnames(fpkm_raw)[1] <- "Symbol"
}
cat("FPKM data dimensions (rows x columns):\n")
## FPKM data dimensions (rows x columns):
print(dim(fpkm_raw))
## [1] 20246 61
cat("\nFirst few columns:\n")
##
## First few columns:
print(colnames(fpkm_raw)[1:min(10, ncol(fpkm_raw))])
## [1] "Symbol" "CA.102548" "CA.104338" "CA.105094" "CA.109745"
## [6] "CA.1906415" "CA.1912627" "CA.1924346" "CA.1926760" "CA.1927842"
cat("\nPreview of the data:\n")
##
## Preview of the data:
print(dplyr::slice_head(fpkm_raw, n = 5))
## # A tibble: 5 × 61
## Symbol CA.102548 CA.104338 CA.105094 CA.109745 CA.1906415 CA.1912627
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TSPAN6 0.93 1.97 0 5.45 4.52 4.75
## 2 TNMD 0 0 0 0 0 0
## 3 DPM1 0 0.43 0 3.43 8.45 8.53
## 4 SCYL3 5.78 5.17 8.76 4.58 7.2 6.03
## 5 C1orf112 2.83 6.26 3.37 6.24 5.16 13.7
## # ℹ 54 more variables: CA.1924346 <dbl>, CA.1926760 <dbl>, CA.1927842 <dbl>,
## # CA.1933414 <dbl>, CA.1940640 <dbl>, CA.2004407 <dbl>, CA.2005288 <dbl>,
## # CA.2006047 <dbl>, CA.2008260 <dbl>, CA.2009329 <dbl>, CA.2009381 <dbl>,
## # CA.2009850 <dbl>, CA.2017611 <dbl>, CA.2039179 <dbl>, CA.2040686 <dbl>,
## # CA.2045012 <dbl>, CA.2046297 <dbl>, CA.348981 <dbl>, CA.354300 <dbl>,
## # CA.359448 <dbl>, CA.94377 <dbl>, CA.98389 <dbl>, CA.98475 <dbl>,
## # CA.99145 <dbl>, CAP.102548 <dbl>, CAP.104338 <dbl>, CAP.105094 <dbl>, …
# Build expression matrix and colData
first_gene_id_col <- colnames(fpkm_raw)[1]
fpkm_long <- fpkm_raw
gene_ids <- fpkm_long[[1]]
expr_fpkm <- as.matrix(dplyr::select(fpkm_long, -1))
rownames(expr_fpkm) <- gene_ids
sample_ids <- colnames(expr_fpkm)
col_data <- tibble::tibble(
sample_id = sample_ids
)
series_path <- file.path("data", "GSE183947_series_matrix.txt")
series_lines <- readr::read_lines(series_path)
extract_sample_row <- function(lines, prefix) {
line <- lines[startsWith(lines, prefix)]
if (length(line) == 0) return(NULL)
line <- sub(paste0("^", prefix), "", line[1])
parts <- strsplit(trimws(line), "\t")[[1]]
parts <- parts[nzchar(parts)]
parts <- gsub('^"|"$', "", parts)
}
sample_titles <- extract_sample_row(series_lines, "!Sample_title")
sample_access <- extract_sample_row(series_lines, "!Sample_geo_accession")
sample_descr <- extract_sample_row(series_lines, "!Sample_description")
char_tissue <- series_lines[startsWith(series_lines, "!Sample_characteristics_ch1\ttissue:") |
grepl("tissue: breast tumor|tissue: normal breast tissue", series_lines, fixed = TRUE)]
if (length(char_tissue) == 0) {
char_tissue_vals <- NULL
} else {
char_tissue_vals <- extract_sample_row(series_lines, "!Sample_characteristics_ch1")
}
char_donor_line <- series_lines[grepl("!Sample_characteristics_ch1\tdonor:", series_lines) |
grepl("donor:", series_lines, fixed = TRUE)]
if (length(char_donor_line) == 0) {
donor_vals <- NULL
} else {
donor_vals <- extract_sample_row(series_lines, "!Sample_characteristics_ch1")
}
char_lines <- series_lines[startsWith(series_lines, "!Sample_characteristics_ch1")]
char_tissue_line <- char_lines[grepl("tissue:", char_lines)][1]
if (!is.na(char_tissue_line)) {
tissue_vals <- strsplit(sub("^!Sample_characteristics_ch1", "", char_tissue_line), "\t")[[1]]
tissue_vals <- tissue_vals[nzchar(tissue_vals)]
tissue_vals <- gsub('^"|"$', "", tissue_vals)
} else {
tissue_vals <- NULL
}
char_donor_line2 <- char_lines[grepl("donor:", char_lines)][1]
if (!is.na(char_donor_line2)) {
donor_vals2 <- strsplit(sub("^!Sample_characteristics_ch1", "", char_donor_line2), "\t")[[1]]
donor_vals2 <- donor_vals2[nzchar(donor_vals2)]
donor_vals2 <- gsub('^"|"$', "", donor_vals2)
} else {
donor_vals2 <- NULL
}
if (!is.null(sample_descr)) {
meta_df <- tibble::tibble(
sample_description = sample_descr
)
if (!is.null(tissue_vals) && length(tissue_vals) == nrow(meta_df)) {
condition_raw <- tissue_vals
condition <- dplyr::case_when(
grepl("breast tumor", condition_raw) ~ "tumor",
grepl("normal breast tissue", condition_raw) ~ "normal",
TRUE ~ NA_character_
)
meta_df$condition <- factor(condition, levels = c("normal", "tumor"))
}
if (!is.null(donor_vals2) && length(donor_vals2) == nrow(meta_df)) {
donor_clean <- sub("^donor:\\s*", "", donor_vals2)
meta_df$patient_id <- as.character(donor_clean)
}
col_data <- col_data %>%
dplyr::left_join(meta_df, by = c("sample_id" = "sample_description"))
}
cat("\nEnriched sample metadata (colData) preview:\n")
##
## Enriched sample metadata (colData) preview:
print(col_data)
## # A tibble: 60 × 3
## sample_id condition patient_id
## <chr> <fct> <chr>
## 1 CA.102548 tumor 102548
## 2 CA.104338 tumor 104338
## 3 CA.105094 tumor 105094
## 4 CA.109745 tumor 109745
## 5 CA.1906415 tumor 1906415
## 6 CA.1912627 tumor 1912627
## 7 CA.1924346 tumor 1924346
## 8 CA.1926760 tumor 1926760
## 9 CA.1927842 tumor 1927842
## 10 CA.1933414 tumor 1933414
## # ℹ 50 more rows
cat("Number of genes:", nrow(expr_fpkm), "\n")
## Number of genes: 20246
cat("Number of samples:", ncol(expr_fpkm), "\n\n")
## Number of samples: 60
cat("Condition breakdown (tumor vs normal):\n")
## Condition breakdown (tumor vs normal):
print(table(col_data$condition, useNA = "ifany"))
##
## normal tumor
## 30 30
cat("\nNumber of samples per patient (first few):\n")
##
## Number of samples per patient (first few):
print(table(col_data$patient_id, useNA = "ifany")[1:10])
##
## 102548 104338 105094 109745 1906415 1912627 1924346 1926760 1927842 1933414
## 2 2 2 2 2 2 2 2 2 2
log_expr_fpkm <- log2(expr_fpkm + 1)
set.seed(123)
if (nrow(log_expr_fpkm) > 5000) {
gene_idx <- sample(seq_len(nrow(log_expr_fpkm)), 5000)
} else {
gene_idx <- seq_len(nrow(log_expr_fpkm))
}
log_expr_subset <- log_expr_fpkm[gene_idx, , drop = FALSE]
expr_long <- as.data.frame(log_expr_subset) %>%
tibble::rownames_to_column(var = "gene_id") %>%
tidyr::pivot_longer(
cols = -gene_id,
names_to = "sample_id",
values_to = "log2_fpkm"
) %>%
dplyr::left_join(col_data, by = "sample_id")
p_box <- ggplot(expr_long, aes(x = sample_id, y = log2_fpkm, fill = condition)) +
geom_boxplot(outlier.size = 0.2, lwd = 0.2) +
theme_bw(base_size = 8) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
legend.position = "bottom"
) +
labs(
title = "Per-sample distribution of log2(FPKM + 1)",
x = "Sample",
y = "log2(FPKM + 1)"
)
# Save EDA boxplot to figures/
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)
ggplot2::ggsave(
filename = file.path("figures", "eda_boxplot_log2fpkm_by_sample.png"),
plot = p_box,
width = 8,
height = 4,
dpi = 300
)
cat("\nEDA boxplot saved to figures/eda_boxplot_log2fpkm_by_sample.png\n")
##
## EDA boxplot saved to figures/eda_boxplot_log2fpkm_by_sample.png
print(p_box)
var_genes <- apply(log_expr_fpkm, 1, var, na.rm = TRUE)
log_expr_pca <- log_expr_fpkm[var_genes > 0, , drop = FALSE]
pca_res <- prcomp(t(log_expr_pca), scale. = TRUE)
pca_df <- as.data.frame(pca_res$x[, 1:2]) %>%
dplyr::mutate(sample_id = rownames(pca_res$x)) %>%
dplyr::left_join(col_data, by = "sample_id")
var_explained <- (pca_res$sdev^2) / sum(pca_res$sdev^2)
p_pca <- ggplot(pca_df, aes(x = PC1, y = PC2, color = condition)) +
geom_point(size = 2) +
theme_bw() +
labs(
title = "PCA of samples based on log2(FPKM + 1)",
x = paste0("PC1 (", round(var_explained[1] * 100, 1), "% var)"),
y = paste0("PC2 (", round(var_explained[2] * 100, 1), "% var)")
)
# Save PCA plot to figures/
ggplot2::ggsave(
filename = file.path("figures", "eda_pca_log2fpkm_samples.png"),
plot = p_pca,
width = 6,
height = 5,
dpi = 300
)
cat("\nPCA plot saved to figures/eda_pca_log2fpkm_samples.png\n")
##
## PCA plot saved to figures/eda_pca_log2fpkm_samples.png
print(p_pca)
annot_path <- file.path("data", "Human.GRCh38.p13.annot.tsv")
annot_head <- readr::read_tsv(annot_path, n_max = 5, show_col_types = FALSE)
cat("Annotation (preview) dimensions (rows x columns):\n")
## Annotation (preview) dimensions (rows x columns):
print(dim(annot_head))
## [1] 5 18
cat("\nFirst few column names in annotation file:\n")
##
## First few column names in annotation file:
print(colnames(annot_head))
## [1] "GeneID" "Symbol" "Description" "Synonyms"
## [5] "GeneType" "EnsemblGeneID" "Status" "ChrAcc"
## [9] "ChrStart" "ChrStop" "Orientation" "Length"
## [13] "GOFunctionID" "GOProcessID" "GOComponentID" "GOFunction"
## [17] "GOProcess" "GOComponent"
cat("\nPreview of annotation (first 5 rows):\n")
##
## Preview of annotation (first 5 rows):
print(annot_head)
## # A tibble: 5 × 18
## GeneID Symbol Description Synonyms GeneType EnsemblGeneID Status ChrAcc
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 100287102 DDX11L1 DEAD/H-box… <NA> pseudo ENSG00000290… active NC_00…
## 2 653635 WASH7P WASP famil… FAM39F|… pseudo <NA> active NC_00…
## 3 102466751 MIR6859-1 microRNA 6… hsa-mir… ncRNA ENSG00000278… active NC_00…
## 4 107985730 MIR1302-2… MIR1302-2 … <NA> ncRNA <NA> active NC_00…
## 5 100302278 MIR1302-2 microRNA 1… MIRN130… ncRNA ENSG00000284… active NC_00…
## # ℹ 10 more variables: ChrStart <dbl>, ChrStop <dbl>, Orientation <chr>,
## # Length <dbl>, GOFunctionID <lgl>, GOProcessID <chr>, GOComponentID <lgl>,
## # GOFunction <lgl>, GOProcess <chr>, GOComponent <lgl>
if (!"Symbol" %in% colnames(annot_head)) {
stop("Expected a 'Symbol' column in Human.GRCh38.p13.annot.tsv but did not find one.")
}
annot_full <- readr::read_tsv(annot_path, show_col_types = FALSE)
symbol_ids <- annot_full[["Symbol"]]
if ("Synonyms" %in% colnames(annot_full)) {
synonyms_raw <- annot_full[["Synonyms"]]
synonyms_tokens <- strsplit(ifelse(is.na(synonyms_raw), "", synonyms_raw), "[;,|]")
synonyms_tokens <- lapply(synonyms_tokens, function(x) trimws(x[nzchar(x)]))
} else {
synonyms_tokens <- vector("list", length = nrow(annot_full))
}
fpkm_gene_ids <- rownames(expr_fpkm)
match_by_symbol <- fpkm_gene_ids %in% symbol_ids
match_by_synonym <- logical(length(fpkm_gene_ids))
if (length(synonyms_tokens) > 0) {
all_synonyms <- unique(unlist(synonyms_tokens))
all_synonyms <- all_synonyms[nzchar(all_synonyms)]
synonym_set <- unique(all_synonyms)
match_by_synonym[!match_by_symbol] <- fpkm_gene_ids[!match_by_symbol] %in% synonym_set
}
annot_matches_any <- match_by_symbol | match_by_synonym
cat("\nNumber of FPKM genes:", length(fpkm_gene_ids), "\n")
##
## Number of FPKM genes: 20246
cat("Number of FPKM genes found by Symbol:", sum(match_by_symbol), "\n")
## Number of FPKM genes found by Symbol: 17855
cat("Number of additional FPKM genes found only via Synonyms:",
sum(match_by_synonym & !match_by_symbol), "\n")
## Number of additional FPKM genes found only via Synonyms: 1309
cat("Total number of FPKM genes with annotation (Symbol or Synonyms):",
sum(annot_matches_any), "\n")
## Total number of FPKM genes with annotation (Symbol or Synonyms): 19164
cat("Number of FPKM genes without annotation (neither Symbol nor Synonyms):",
length(fpkm_gene_ids) - sum(annot_matches_any), "\n")
## Number of FPKM genes without annotation (neither Symbol nor Synonyms): 1082
cat("Proportion of FPKM genes with annotation (Symbol or Synonyms):",
round(sum(annot_matches_any) / length(fpkm_gene_ids), 4), "\n")
## Proportion of FPKM genes with annotation (Symbol or Synonyms): 0.9466
missing_ids <- fpkm_gene_ids[!annot_matches_any]
if (length(missing_ids) > 0) {
cat("\nExample FPKM gene IDs not found in annotation (via Symbol or Synonyms, up to 10 shown):\n")
print(head(missing_ids, 10))
} else {
cat("\nAll FPKM gene IDs (symbols) were found in the annotation table via Symbol or Synonyms.\n")
}
##
## Example FPKM gene IDs not found in annotation (via Symbol or Synonyms, up to 10 shown):
## [1] "AC004381.6" "2-Mar" "3-Sep" "DKFZP761J1410"
## [5] "4-Sep" "AL021546.6" "RP11-268J15.5" "RP11-408E5.4"
## [9] "7-Sep" "KIAA1045"
# Set up design and filter genes for limma
stopifnot(exists("col_data"), exists("expr_fpkm"))
col_data_limma <- col_data
if (!is.null(col_data_limma$condition)) {
col_data_limma$condition <- as.factor(col_data_limma$condition)
if ("normal" %in% levels(col_data_limma$condition)) {
col_data_limma$condition <- relevel(col_data_limma$condition, ref = "normal")
}
}
if (!is.null(col_data_limma$patient_id)) {
col_data_limma$patient_id <- as.factor(col_data_limma$patient_id)
}
cat("Condition breakdown in col_data_limma (tumor vs normal):\n")
## Condition breakdown in col_data_limma (tumor vs normal):
print(table(col_data_limma$condition, useNA = "ifany"))
##
## normal tumor
## 30 30
cat("\nSamples per patient_id (first few):\n")
##
## Samples per patient_id (first few):
print(head(table(col_data_limma$patient_id)))
##
## 102548 104338 105094 109745 1906415 1912627
## 2 2 2 2 2 2
log_expr_fpkm <- log2(expr_fpkm + 1)
cat("\nExpression matrix dimensions for limma (genes x samples):\n")
##
## Expression matrix dimensions for limma (genes x samples):
print(dim(log_expr_fpkm))
## [1] 20246 60
min_log_expr <- 1
keep_genes_limma <- apply(log_expr_fpkm, 1, function(x) {
sum(x >= min_log_expr, na.rm = TRUE) >= 2
})
cat("\nNumber of genes before filtering:", nrow(log_expr_fpkm), "\n")
##
## Number of genes before filtering: 20246
cat("Number of genes after filtering (log2(FPKM+1) >=", min_log_expr, "in >= 2 samples):",
sum(keep_genes_limma), "\n")
## Number of genes after filtering (log2(FPKM+1) >= 1 in >= 2 samples): 19872
log_expr_fpkm_filtered <- log_expr_fpkm[keep_genes_limma, , drop = FALSE]
# Align samples between expression matrix and colData
common_samples <- intersect(colnames(log_expr_fpkm_filtered), col_data_limma$sample_id)
if (length(common_samples) == 0) {
stop("No overlapping samples between expression matrix and col_data_limma.")
}
log_expr_fpkm_filtered <- log_expr_fpkm_filtered[, common_samples, drop = FALSE]
col_data_limma <- col_data_limma[match(common_samples, col_data_limma$sample_id), , drop = FALSE]
cat("\nNumber of samples used in DE:", ncol(log_expr_fpkm_filtered), "\n")
##
## Number of samples used in DE: 60
# Ensure rownames of col_data_limma are sample IDs for model.matrix
col_data_limma <- as.data.frame(col_data_limma)
rownames(col_data_limma) <- col_data_limma$sample_id
# Sanity checks for alignment before building design matrix
cat("\nSanity check – first few sample IDs in expression and col_data_limma:\n")
##
## Sanity check – first few sample IDs in expression and col_data_limma:
print(head(colnames(log_expr_fpkm_filtered)))
## [1] "CA.102548" "CA.104338" "CA.105094" "CA.109745" "CA.1906415"
## [6] "CA.1912627"
print(head(col_data_limma$sample_id))
## [1] "CA.102548" "CA.104338" "CA.105094" "CA.109745" "CA.1906415"
## [6] "CA.1912627"
cat("\nCondition breakdown after alignment (should match earlier):\n")
##
## Condition breakdown after alignment (should match earlier):
print(table(col_data_limma$condition, useNA = "ifany"))
##
## normal tumor
## 30 30
cat("\nAll column names of expression match rownames of col_data_limma? ",
all(colnames(log_expr_fpkm_filtered) == rownames(col_data_limma)), "\n")
##
## All column names of expression match rownames of col_data_limma? TRUE
use_paired <- !is.null(col_data_limma$patient_id) &&
length(unique(col_data_limma$patient_id)) > 1 &&
all(table(col_data_limma$patient_id) >= 2)
if (use_paired) {
design_limma <- model.matrix(~ patient_id + condition, data = col_data_limma)
cat("\nUsing paired design for limma: ~ patient_id + condition\n")
} else {
design_limma <- model.matrix(~ condition, data = col_data_limma)
cat("\nUsing unpaired design for limma: ~ condition\n")
}
##
## Using unpaired design for limma: ~ condition
cat("Design matrix dimensions (samples x coefficients):\n")
## Design matrix dimensions (samples x coefficients):
print(dim(design_limma))
## [1] 60 2
# Final sanity check: rownames of design should match expression columns
stopifnot(identical(colnames(log_expr_fpkm_filtered), rownames(design_limma)))
colnames(design_limma)
## [1] "(Intercept)" "conditiontumor"
# At this point, columns of log_expr_fpkm_filtered and rows of design_limma
# are aligned and share the same sample IDs
fit <- lmFit(log_expr_fpkm_filtered, design_limma)
fit_ebayes <- eBayes(fit)
coef_names <- colnames(design_limma)
coef_of_interest <- grep("conditiontumor", coef_names, value = TRUE)
if (length(coef_of_interest) != 1) {
stop("Could not uniquely identify conditiontumor coefficient in design matrix.")
}
cat("\nUsing coefficient for DE results:", coef_of_interest, "\n")
##
## Using coefficient for DE results: conditiontumor
res_limma <- topTable(
fit_ebayes,
coef = coef_of_interest,
number = Inf,
sort.by = "P"
)
res_limma_df <- res_limma %>%
tibble::rownames_to_column(var = "gene_id")
cat("\nPreview of limma DE results (first 6 rows):\n")
##
## Preview of limma DE results (first 6 rows):
print(head(res_limma_df))
## gene_id logFC AveExpr t P.Value adj.P.Val B
## 1 DEFB130 -3.941893 2.762654 -26.13245 4.375131e-36 8.694261e-32 69.12741
## 2 CCDC177 -3.530750 2.783185 -24.28741 3.237243e-34 3.216525e-30 65.34287
## 3 UGT2A1 -2.229926 3.056062 -16.86054 2.320028e-25 1.536786e-21 46.66534
## 4 LCN6 -3.531396 4.148372 -15.60920 1.289529e-23 5.960418e-20 42.87248
## 5 KLK9 -3.420770 2.929151 -15.56331 1.499703e-23 5.960418e-20 42.72939
## 6 RNASE11 -3.419164 1.860627 -14.30615 1.039826e-21 3.031911e-18 38.69712
alpha <- 0.05
logfc_cutoff <- 1
res_limma_df <- res_limma_df %>%
dplyr::mutate(
regulation = dplyr::case_when(
adj.P.Val < alpha & logFC >= logfc_cutoff ~ "up",
adj.P.Val < alpha & logFC <= -logfc_cutoff ~ "down",
TRUE ~ "ns"
)
)
cat("\nSummary of regulation categories (up/down/ns):\n")
##
## Summary of regulation categories (up/down/ns):
print(table(res_limma_df$regulation))
##
## down ns up
## 881 18054 937
if (!dir.exists("results")) dir.create("results", recursive = TRUE)
readr::write_csv(res_limma_df, file.path("results", "de_results_limma_fpkm.csv"))
cat("\nSaved limma DE results to results/de_results_limma_fpkm.csv\n")
##
## Saved limma DE results to results/de_results_limma_fpkm.csv
In this section, we visualize the limma differential expression
results obtained in Section 3. The volcano and MA plots are based on
res_limma_df, which contains log2 fold changes (tumor vs
normal), adjusted p-values, average log2 expression, and the
regulation labels defined using the same thresholds
(alpha and logfc_cutoff) as in Task 2. All
results are derived from limma models fitted to log2(FPKM + 1)
values.
# Visualize limma FPKM-based DE results as a volcano plot using res_limma_df, alpha, and logfc_cutoff from Section 3.3
stopifnot(exists("res_limma_df"))
volcano_df <- res_limma_df %>%
dplyr::mutate(
neg_log10_adjP = -log10(adj.P.Val + 1e-300)
)
p_volcano <- ggplot(volcano_df, aes(x = logFC, y = neg_log10_adjP, color = regulation)) +
geom_point(alpha = 0.6, size = 1) +
theme_bw() +
geom_vline(xintercept = c(-logfc_cutoff, logfc_cutoff), linetype = "dashed", color = "grey50") +
geom_hline(yintercept = -log10(alpha), linetype = "dashed", color = "grey50") +
labs(
title = "Volcano plot: tumor vs normal (limma on log2(FPKM+1))",
x = "log2 fold change (tumor vs normal)",
y = "-log10(adj.P.Val)"
)
# Optionally label a small set of top significant genes using ggrepel
if (requireNamespace("ggrepel", quietly = TRUE)) {
top_label_df <- volcano_df %>%
dplyr::filter(regulation != "ns") %>%
dplyr::arrange(adj.P.Val) %>%
dplyr::slice_head(n = 20)
if (nrow(top_label_df) > 0) {
p_volcano <- p_volcano +
ggrepel::geom_text_repel(
data = top_label_df,
aes(label = gene_id),
size = 2,
max.overlaps = 50,
box.padding = 0.3,
point.padding = 0.2,
segment.color = "grey70"
)
}
}
# Save volcano plot to figures/
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)
ggplot2::ggsave(
filename = file.path("figures", "volcano_limma_fpkm_tumor_vs_normal.png"),
plot = p_volcano,
width = 6,
height = 5,
dpi = 300
)
cat("\nVolcano plot saved to figures/volcano_limma_fpkm_tumor_vs_normal.png\n")
##
## Volcano plot saved to figures/volcano_limma_fpkm_tumor_vs_normal.png
print(p_volcano)
stopifnot(exists("res_limma_df"))
p_ma <- ggplot(res_limma_df, aes(x = AveExpr, y = logFC, color = regulation)) +
geom_point(alpha = 0.6, size = 1) +
theme_bw() +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
labs(
title = "MA plot: tumor vs normal (limma on log2(FPKM+1))",
x = "Average log2 expression (AveExpr)",
y = "log2 fold change (tumor vs normal)"
)
# Optionally label a few highly significant genes in the MA plot
if (requireNamespace("ggrepel", quietly = TRUE)) {
top_ma_df <- res_limma_df %>%
dplyr::filter(regulation != "ns") %>%
dplyr::arrange(adj.P.Val) %>%
dplyr::slice_head(n = 20)
if (nrow(top_ma_df) > 0) {
p_ma <- p_ma +
ggrepel::geom_text_repel(
data = top_ma_df,
aes(label = gene_id),
size = 2,
max.overlaps = 50,
box.padding = 0.3,
point.padding = 0.2,
segment.color = "grey70"
)
}
}
# Save MA plot to figures/
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)
ggplot2::ggsave(
filename = file.path("figures", "ma_limma_fpkm_tumor_vs_normal.png"),
plot = p_ma,
width = 6,
height = 5,
dpi = 300
)
cat("\nMA plot saved to figures/ma_limma_fpkm_tumor_vs_normal.png\n")
##
## MA plot saved to figures/ma_limma_fpkm_tumor_vs_normal.png
print(p_ma)
In this section we use the limma FPKM-based differential expression results to construct gene sets for downstream enrichment analysis and explore which biological processes are over-represented among tumor-up and tumor-down genes.
stopifnot(exists("res_limma_df"), exists("alpha"), exists("logfc_cutoff"))
cat("\nSummary of regulation categories used for gene lists (up/down/ns):\n")
##
## Summary of regulation categories used for gene lists (up/down/ns):
print(table(res_limma_df$regulation))
##
## down ns up
## 881 18054 937
# Background (universe) = all genes tested by limma
gene_universe <- res_limma_df$gene_id
cat("\nSize of gene universe (all tested genes):", length(gene_universe), "\n")
##
## Size of gene universe (all tested genes): 19872
# Up- and down-regulated genes
res_limma_up_df <- res_limma_df %>%
dplyr::filter(regulation == "up") %>%
dplyr::arrange(adj.P.Val, dplyr::desc(abs(logFC)))
genes_up_tumor <- res_limma_up_df$gene_id
res_limma_down_df <- res_limma_df %>%
dplyr::filter(regulation == "down") %>%
dplyr::arrange(adj.P.Val, dplyr::desc(abs(logFC)))
genes_down_tumor <- res_limma_down_df$gene_id
cat("\nNumber of significantly up-regulated genes (tumor > normal):",
length(genes_up_tumor), "\n")
##
## Number of significantly up-regulated genes (tumor > normal): 937
cat("Number of significantly down-regulated genes (tumor < normal):",
length(genes_down_tumor), "\n")
## Number of significantly down-regulated genes (tumor < normal): 881
# All DE genes (up + down)
res_limma_de_df <- res_limma_df %>%
dplyr::filter(regulation != "ns") %>%
dplyr::arrange(adj.P.Val, dplyr::desc(abs(logFC)))
genes_de_all <- res_limma_de_df$gene_id
cat("Number of all DE genes (up + down):", length(genes_de_all), "\n")
## Number of all DE genes (up + down): 1818
# Ranked list for GSEA-style methods (signed -log10 p-value)
rank_stat <- with(res_limma_df, sign(logFC) * -log10(P.Value + 1e-300))
rank_df_unique <- res_limma_df %>%
dplyr::mutate(rank_stat = rank_stat) %>%
dplyr::arrange(P.Value) %>%
dplyr::distinct(gene_id, .keep_all = TRUE)
ranked_genes <- rank_df_unique$rank_stat
names(ranked_genes) <- rank_df_unique$gene_id
cat("\nConstructed ranked gene list for GSEA-style analysis with",
length(ranked_genes), "unique genes.\n")
##
## Constructed ranked gene list for GSEA-style analysis with 19872 unique genes.
# Persist lists to results/
if (!dir.exists("results")) dir.create("results", recursive = TRUE)
readr::write_tsv(
tibble::tibble(gene_id = genes_up_tumor),
file.path("results", "gene_list_up_tumor.tsv")
)
readr::write_tsv(
tibble::tibble(gene_id = genes_down_tumor),
file.path("results", "gene_list_down_tumor.tsv")
)
readr::write_tsv(
tibble::tibble(gene_id = genes_de_all),
file.path("results", "gene_list_all_de.tsv")
)
readr::write_tsv(
tibble::tibble(
gene_id = names(ranked_genes),
rank_stat = as.numeric(ranked_genes)
),
file.path("results", "ranked_gene_list_gsea_style.tsv")
)
cat("\nSaved gene lists and ranked list to results/:\n")
##
## Saved gene lists and ranked list to results/:
cat(" - gene_list_up_tumor.tsv\n")
## - gene_list_up_tumor.tsv
cat(" - gene_list_down_tumor.tsv\n")
## - gene_list_down_tumor.tsv
cat(" - gene_list_all_de.tsv\n")
## - gene_list_all_de.tsv
cat(" - ranked_gene_list_gsea_style.tsv\n")
## - ranked_gene_list_gsea_style.tsv
stopifnot(exists("gene_universe"), exists("genes_up_tumor"), exists("genes_down_tumor"))
cat("\nGene universe size (background for enrichment):", length(gene_universe), "\n")
##
## Gene universe size (background for enrichment): 19872
cat("Number of up-regulated genes for enrichment:", length(genes_up_tumor), "\n")
## Number of up-regulated genes for enrichment: 937
cat("Number of down-regulated genes for enrichment:", length(genes_down_tumor), "\n")
## Number of down-regulated genes for enrichment: 881
cat("\nAt this stage, gene identifiers are gene symbols (Symbol column).\n")
##
## At this stage, gene identifiers are gene symbols (Symbol column).
cat("They will be mapped to Entrez IDs for GO/KEGG in the enrichment step.\n")
## They will be mapped to Entrez IDs for GO/KEGG in the enrichment step.
go_enrich_up <- NULL
kegg_enrich_up <- NULL
go_enrich_down <- NULL
kegg_enrich_down <- NULL
symbol_to_entrez <- function(symbols) {
mapped <- suppressMessages(AnnotationDbi::select(
org.Hs.eg.db,
keys = unique(symbols),
keytype = "SYMBOL",
columns = "ENTREZID"
))
mapped <- mapped[!is.na(mapped$ENTREZID), ]
mapped_unique <- mapped[!duplicated(mapped$SYMBOL), ]
entrez_vec <- mapped_unique$ENTREZID
names(entrez_vec) <- mapped_unique$SYMBOL
entrez_vec
}
stopifnot(exists("genes_up_tumor"), exists("genes_down_tumor"), exists("gene_universe"))
entrez_up <- symbol_to_entrez(genes_up_tumor)
entrez_down <- symbol_to_entrez(genes_down_tumor)
entrez_univ <- symbol_to_entrez(gene_universe)
cat("\nMapped", length(genes_up_tumor), "up-regulated symbols to",
length(entrez_up), "Entrez IDs.\n")
##
## Mapped 937 up-regulated symbols to 813 Entrez IDs.
cat("Mapped", length(genes_down_tumor), "down-regulated symbols to",
length(entrez_down), "Entrez IDs.\n")
## Mapped 881 down-regulated symbols to 827 Entrez IDs.
cat("Mapped", length(gene_universe), "universe symbols to",
length(entrez_univ), "Entrez IDs.\n")
## Mapped 19872 universe symbols to 17584 Entrez IDs.
ego_up <- enrichGO(
gene = unname(entrez_up),
universe = unname(entrez_univ),
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE
)
ego_down <- enrichGO(
gene = unname(entrez_down),
universe = unname(entrez_univ),
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = TRUE
)
go_enrich_up <<- ego_up
go_enrich_down <<- ego_down
cat("\nTop enriched GO BP terms for up-regulated genes (if any):\n")
##
## Top enriched GO BP terms for up-regulated genes (if any):
if (!is.null(ego_up) && nrow(as.data.frame(ego_up)) > 0) {
print(head(as.data.frame(ego_up)[, c("ID", "Description", "GeneRatio", "p.adjust")], 10))
} else {
cat("No significant GO BP enrichment for up-regulated genes at the chosen cutoff.\n")
}
## ID Description GeneRatio
## GO:0000819 GO:0000819 sister chromatid segregation 45/764
## GO:0098813 GO:0098813 nuclear chromosome segregation 52/764
## GO:0000070 GO:0000070 mitotic sister chromatid segregation 39/764
## GO:0007059 GO:0007059 chromosome segregation 60/764
## GO:0000280 GO:0000280 nuclear division 61/764
## GO:0140014 GO:0140014 mitotic nuclear division 47/764
## GO:0048285 GO:0048285 organelle fission 64/764
## GO:1905818 GO:1905818 regulation of chromosome separation 23/764
## GO:0051983 GO:0051983 regulation of chromosome segregation 30/764
## GO:0051304 GO:0051304 chromosome separation 23/764
## p.adjust
## GO:0000819 6.403606e-13
## GO:0098813 9.626750e-13
## GO:0000070 2.750025e-12
## GO:0007059 2.750025e-12
## GO:0000280 7.528216e-12
## GO:0140014 1.084738e-11
## GO:0048285 1.934652e-11
## GO:1905818 1.467130e-10
## GO:0051983 1.533466e-10
## GO:0051304 7.620322e-10
cat("\nTop enriched GO BP terms for down-regulated genes (if any):\n")
##
## Top enriched GO BP terms for down-regulated genes (if any):
if (!is.null(ego_down) && nrow(as.data.frame(ego_down)) > 0) {
print(head(as.data.frame(ego_down)[, c("ID", "Description", "GeneRatio", "p.adjust")], 10))
} else {
cat("No significant GO BP enrichment for down-regulated genes at the chosen cutoff.\n")
}
## ID Description GeneRatio
## GO:0008544 GO:0008544 epidermis development 54/751
## GO:0043588 GO:0043588 skin development 46/751
## GO:0030216 GO:0030216 keratinocyte differentiation 30/751
## GO:0003012 GO:0003012 muscle system process 52/751
## GO:0044706 GO:0044706 multi-multicellular organism process 33/751
## GO:0044703 GO:0044703 multi-organism reproductive process 32/751
## GO:0007411 GO:0007411 axon guidance 34/751
## GO:0097485 GO:0097485 neuron projection guidance 34/751
## GO:0009913 GO:0009913 epidermal cell differentiation 34/751
## GO:0050919 GO:0050919 negative chemotaxis 14/751
## p.adjust
## GO:0008544 2.498270e-09
## GO:0043588 2.434240e-08
## GO:0030216 5.978338e-07
## GO:0003012 1.659600e-06
## GO:0044706 2.090798e-06
## GO:0044703 2.535934e-06
## GO:0007411 5.447733e-06
## GO:0097485 5.447733e-06
## GO:0009913 5.447733e-06
## GO:0050919 5.447733e-06
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)
if (!is.null(ego_up) && nrow(as.data.frame(ego_up)) > 0) {
df_up <- as.data.frame(ego_up) %>%
dplyr::slice_head(n = 15) %>%
dplyr::mutate(label = forcats::fct_reorder(Description, -log10(p.adjust)))
p_go_up <- ggplot(df_up, aes(x = label, y = -log10(p.adjust))) +
geom_col() +
coord_flip() +
theme_bw() +
labs(x = NULL, y = "-log10(adj. p-value)", title = "GO BP enrichment (Up-regulated genes)")
ggsave(
filename = file.path("figures", "go_bp_enrichment_up_tumor_barplot.png"),
plot = p_go_up,
width = 8,
height = 6,
dpi = 300
)
cat("\nGO BP enrichment bar plot for up-regulated genes saved to figures/go_bp_enrichment_up_tumor_barplot.png\n")
print(p_go_up)
}
##
## GO BP enrichment bar plot for up-regulated genes saved to figures/go_bp_enrichment_up_tumor_barplot.png
if (!is.null(ego_down) && nrow(as.data.frame(ego_down)) > 0) {
df_down <- as.data.frame(ego_down) %>%
dplyr::slice_head(n = 15) %>%
dplyr::mutate(label = forcats::fct_reorder(Description, -log10(p.adjust)))
p_go_down <- ggplot(df_down, aes(x = label, y = -log10(p.adjust))) +
geom_col() +
coord_flip() +
theme_bw() +
labs(x = NULL, y = "-log10(adj. p-value)", title = "GO BP enrichment (Down-regulated genes)")
ggsave(
filename = file.path("figures", "go_bp_enrichment_down_tumor_barplot.png"),
plot = p_go_down,
width = 8,
height = 6,
dpi = 300
)
cat("\nGO BP enrichment bar plot for down-regulated genes saved to figures/go_bp_enrichment_down_tumor_barplot.png\n")
print(p_go_down)
}
##
## GO BP enrichment bar plot for down-regulated genes saved to figures/go_bp_enrichment_down_tumor_barplot.png
cat("\nEnrichment bar plots (if any significant terms) saved to figures/:\n")
##
## Enrichment bar plots (if any significant terms) saved to figures/:
cat(" - go_bp_enrichment_up_tumor_barplot.png\n")
## - go_bp_enrichment_up_tumor_barplot.png
cat(" - go_bp_enrichment_down_tumor_barplot.png\n")
## - go_bp_enrichment_down_tumor_barplot.png
# GSEA-style GO BP enrichment on the ranked gene list using clusterProfiler::gseGO.
# This complements the ORA analysis on up/down lists by using the full ranking.
stopifnot(exists("rank_df_unique"))
# Map symbols in the ranked table to Entrez IDs
symbol_to_entrez <- function(symbols) {
mapped <- suppressMessages(AnnotationDbi::select(
org.Hs.eg.db,
keys = unique(symbols),
keytype = "SYMBOL",
columns = "ENTREZID"
))
mapped <- mapped[!is.na(mapped$ENTREZID), ]
mapped_unique <- mapped[!duplicated(mapped$SYMBOL), ]
entrez_vec <- mapped_unique$ENTREZID
names(entrez_vec) <- mapped_unique$SYMBOL
entrez_vec
}
entrez_map <- symbol_to_entrez(rank_df_unique$gene_id)
# Join Entrez IDs back to the ranked table
rank_gsea_df <- rank_df_unique %>%
dplyr::mutate(ENTREZID = unname(entrez_map[gene_id])) %>%
dplyr::filter(!is.na(ENTREZID)) %>%
dplyr::distinct(ENTREZID, .keep_all = TRUE)
# Build the named numeric vector for gseGO: names = ENTREZ IDs, values = rank_stat
ranked_entrez <- rank_gsea_df$rank_stat
names(ranked_entrez) <- rank_gsea_df$ENTREZID
cat("\nGSEA input: ranked list with", length(ranked_entrez), "Entrez IDs after mapping.\n")
##
## GSEA input: ranked list with 17584 Entrez IDs after mapping.
# Sort decreasing as required by gseGO
ranked_entrez <- sort(ranked_entrez, decreasing = TRUE)
# Run GSEA-style GO BP enrichment
set.seed(123)
gsea_go_bp <- gseGO(
geneList = ranked_entrez,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP",
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = FALSE
)
gsea_go_bp_res <<- gsea_go_bp
cat("\nTop GSEA GO BP terms (if any):\n")
##
## Top GSEA GO BP terms (if any):
if (!is.null(gsea_go_bp) && nrow(as.data.frame(gsea_go_bp)) > 0) {
print(head(as.data.frame(gsea_go_bp)[, c("ID", "Description", "enrichmentScore", "NES", "p.adjust")], 10))
} else {
cat("No significant GO BP gene sets detected by GSEA at the chosen cutoff.\n")
}
## ID
## GO:0000070 GO:0000070
## GO:0000819 GO:0000819
## GO:0098813 GO:0098813
## GO:0140014 GO:0140014
## GO:0051304 GO:0051304
## GO:0007059 GO:0007059
## GO:1905818 GO:1905818
## GO:0051983 GO:0051983
## GO:0090307 GO:0090307
## GO:0050911 GO:0050911
## Description
## GO:0000070 mitotic sister chromatid segregation
## GO:0000819 sister chromatid segregation
## GO:0098813 nuclear chromosome segregation
## GO:0140014 mitotic nuclear division
## GO:0051304 chromosome separation
## GO:0007059 chromosome segregation
## GO:1905818 regulation of chromosome separation
## GO:0051983 regulation of chromosome segregation
## GO:0090307 mitotic spindle assembly
## GO:0050911 detection of chemical stimulus involved in sensory perception of smell
## enrichmentScore NES p.adjust
## GO:0000070 0.6994580 2.926457 8.834286e-09
## GO:0000819 0.6672771 2.821553 8.834286e-09
## GO:0098813 0.6379839 2.808452 8.834286e-09
## GO:0140014 0.6244930 2.710041 8.834286e-09
## GO:0051304 0.7392602 2.691644 8.834286e-09
## GO:0007059 0.6005625 2.684687 8.834286e-09
## GO:1905818 0.7426014 2.666426 8.834286e-09
## GO:0051983 0.6677471 2.648960 8.834286e-09
## GO:0090307 0.7359029 2.629404 8.834286e-09
## GO:0050911 -0.6518602 -2.611752 8.834286e-09
# Simple dot plot of top GSEA GO BP terms
if (!dir.exists("figures")) dir.create("figures", recursive = TRUE)
if (!is.null(gsea_go_bp) && nrow(as.data.frame(gsea_go_bp)) > 0) {
gsea_df <- as.data.frame(gsea_go_bp) %>%
dplyr::slice_head(n = 15) %>%
dplyr::mutate(label = forcats::fct_reorder(Description, NES))
p_gsea <- ggplot(gsea_df, aes(x = NES, y = label, size = setSize, color = -log10(p.adjust))) +
geom_point() +
theme_bw() +
labs(
title = "GSEA GO BP (ranked gene list)",
x = "Normalized enrichment score (NES)",
y = NULL,
color = "-log10(adj. p)",
size = "Gene set size"
)
ggsave(
filename = file.path("figures", "gsea_go_bp_ranked_genes_dotplot.png"),
plot = p_gsea,
width = 8,
height = 6,
dpi = 300
)
cat("\nGSEA GO BP dot plot saved to figures/gsea_go_bp_ranked_genes_dotplot.png\n")
print(p_gsea)
}
##
## GSEA GO BP dot plot saved to figures/gsea_go_bp_ranked_genes_dotplot.png
In this analysis we used FPKM-normalized expression values from GSE183947 to compare breast tumor and normal breast tissue using limma on log2(FPKM+1), and then carried out GO Biological Process enrichment, including both over-representation (ORA) and GSEA-style analyses.
condition (normal vs tumor) and patient_id for
a paired-aware design.results/de_results_limma_fpkm.csv.genes_up_tumor,
genes_down_tumor, an all-DE list, and a ranked gene list
and exported all lists to results/.figures/, summarizing the dominant functional
themes.The FPKM-based limma analysis reveals marked gene expression differences between breast tumors and normal breast tissue in GSE183947. At the chosen thresholds (FDR < 0.05, |log2(FC)| ≥ 1) we observe substantial numbers of both up- and down-regulated genes, consistent with broad changes in cellular state rather than a small set of isolated markers.
The volcano and MA plots confirm that these differences are well distributed across expression levels, and together with the PCA suggest a robust separation of tumor and normal samples driven by many independent genes. This supports the use of downstream enrichment to interpret the results at the pathway level.
GO BP enrichment of tumor-up genes points to coordinated activation of proliferative and cell cycle–related programs (for example, terms related to mitotic division, DNA replication, and checkpoint control, depending on the exact output), which is in line with increased growth and genomic instability in tumors. Conversely, GO terms enriched among tumor-down genes tend to involve processes more characteristic of normal or differentiated breast tissue (such as tissue organization, adhesion, or specific signaling pathways), reflecting functions that are attenuated or lost in the tumor state. These process-level trends provide a concise biological summary of the DE results.
The additional GSEA-style analysis corroborates these findings by identifying gene sets with coordinated expression changes that correlate with the overall tumor vs normal differentiation signal. GSEA can capture more subtle and widespread changes in pathway activity that may not be as apparent when looking at individual gene lists.
Using FPKM-normalized RNA-Seq data from GSE183947, we implemented a complete workflow from data loading and QC through limma-based tumor vs normal differential expression, visualization with volcano and MA plots, and GO BP enrichment on tumor-up and tumor-down gene sets. The analysis identifies large, directionally consistent sets of DE genes and highlights biologically plausible process-level changes in breast tumors.
Future work could include repeating the analysis on a unified raw counts matrix with count-based methods, extending enrichment to additional resources such as KEGG, Reactome, or MSigDB, and applying GSEA to the ranked gene list to capture more subtle coordinated shifts in pathway activity. A more detailed report could then link the specific enriched GO terms and pathways to the breast cancer literature.