Load required libraries

# Load libraries
library(DESeq2)
library(fgsea)
library(pheatmap)
library(ggplot2)
library(tibble)
library(dplyr)
library(RColorBrewer)
library(EnhancedVolcano)
library(rmarkdown)
library(knitr)

Load Data: Count matrix

# Load count data

countdata <- read.delim("/Users/s.rao/Desktop/TH3 Files/Section 4/Count_Data.txt", row.names = 1, header = TRUE)
# Convert count data to a matrix
countdata <- as.matrix(countdata)

# Define experimental conditions (e.g., WT vs Tumor)
#condition <- factor(c(rep("WT", 3), rep("Tumor", 3)))

# Define experimental conditions (e.g., Primary vs Metastatic)
condition <- factor(c(rep("Primary", 3), rep("Metastatic", 3)))

# Create a metadata table
metadata <- data.frame(row.names = colnames(countdata), condition)

Construct DESEQDataSet Object

We use the DESeqDataSetFromMatrix function to create a DESeq2 dataset from the count matrix and metadata. We then filter out lowly expressed genes to improve the quality of the analysis.

# Create DESeq2 dataset
dds <- DESeqDataSetFromMatrix(countData = countdata, colData = metadata, design = ~ condition)

# Filter out lowly expressed genes (row sums > 20)
dds <- dds[rowSums(counts(dds)) > 20, ]

# Normalization
dds <- estimateSizeFactors(dds)

dds
## class: DESeqDataSet 
## dim: 17951 6 
## metadata(1): version
## assays(1): counts
## rownames(17951): TSPAN6 TNMD ... AL135905.2 AC009090.6
## rowData names(0):
## colnames(6): P_Rep1 P_Rep2 ... M_Rep2 M_Rep3
## colData names(2): condition sizeFactor

#Run DESeq2 Analysis We run the DESeq2 pipeline to perform differential expression analysis. The results are extracted for the comparison between “Tumor” and “WT” conditions.

# Run DESeq2
dds <- DESeq(dds)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Extract results
#res <- results(dds, contrast = c("condition", "Tumor", "WT"))
# Extract results : Assignment
res <- results(dds, contrast = c("condition", "Primary", "Metastatic"))
head(res)
## log2 fold change (MLE): condition Primary vs Metastatic 
## Wald test p-value: condition Primary vs Metastatic 
## DataFrame with 6 rows and 6 columns
##           baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##          <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## TSPAN6     903.454       0.155496  0.371673  0.418366 6.75679e-01 8.15927e-01
## TNMD         4.061       0.988793  1.186196  0.833583 4.04516e-01 6.05792e-01
## DPM1       431.190      -0.906044  0.333584 -2.716085 6.60589e-03 3.20924e-02
## SCYL3      205.001       0.324770  0.349337  0.929675 3.52540e-01 5.55745e-01
## C1orf112    87.016      -1.799805  0.510133 -3.528109 4.18539e-04 3.50311e-03
## FGR        170.310      -2.322546  0.434333 -5.347385 8.92342e-08 2.19742e-06
# Identify upregulated and downregulated genes
# Remove rows with NA values in log2FoldChange or padj columns
res_cleaned <- res[!is.na(res$log2FoldChange) & !is.na(res$padj), ]

# Now apply the filtering conditions
res_filtered <- res_cleaned[abs(res_cleaned$log2FoldChange) >= 1 & res_cleaned$padj < 0.05, ]

upregulated_genes <- res_filtered[res_filtered$log2FoldChange > 0,]
downregulated_genes <- res_filtered[res_filtered$log2FoldChange < 0,]
# Number of upregulated and downregulated genes
cat("Upregulated genes: ", nrow(upregulated_genes), "\n")
## Upregulated genes:  1681
cat("Downregulated genes: ", nrow(downregulated_genes), "\n")
## Downregulated genes:  1794

Heatmap

Generates a heatmap of Top 50 Differentially Expressed Genes

# Extract top 50 differentially expressed genes
top_genes <- rownames(res)[order(res$padj)][1:50]

# Normalize counts for visualization
rlog_counts <- rlog(dds, blind = FALSE)

# Extract normalized counts for top genes
top_gene_counts <- assay(rlog_counts)[top_genes, ]

# Generate heatmap
pheatmap(top_gene_counts,
         scale = "row",
         clustering_distance_rows = "euclidean",
         clustering_distance_cols = "euclidean",
         clustering_method = "complete",
         show_rownames = TRUE, fontsize = 6,
         color = colorRampPalette(rev(brewer.pal(9, "RdBu")))(100),
         main = "Heatmap of Top 50 Differentially Expressed Genes")

Volcano Plot

A volcano plot is used to visualize the relationship between fold change and statistical significance. Genes with significant changes are highlighted; fold change 4, fdr < 0.05

EnhancedVolcano(res,
                lab = rownames(res), # Gene labels
                x = 'log2FoldChange', # X-axis: log2 fold change
                y = 'padj', # Y-axis: adjusted p-value (FDR)
                pCutoff = 0.05, # FDR cutoff of 0.05
                FCcutoff = 2, # Fold-change cutoff of 4 (log2FC = 2)
                pointSize = 2.0, # Size of points
                labSize = 3.0, # Size of labels
                col = c('lightskyblue4', 'steelblue1', 'rosybrown1', 'hotpink1'), # Custom colors
                title = "Volcano Plot", # Plot title
                subtitle = "FDR < 0.05, |log2FC| > 2" # Plot subtitle
)

PCA Plot

Principal Component Analysis (PCA) is performed to visualize the overall structure of the data and identify any batch effects or outliers.

# Perform PCA on rlog-transformed data
pca_data <- prcomp(t(assay(rlog_counts)))

# Prepare data for plotting
pca_df <- as.data.frame(pca_data$x)

# Generate PCA plot
ggplot(pca_df, aes(x = PC1, y = PC2, color = condition)) +
  geom_point(size = 3) +
  theme_bw() +
  labs(title = "PCA Plot", x = "PC1", y = "PC2") +
  scale_color_brewer(palette = "Set1")

Pathway Analysis

We perform Gene Set Enrichment Analysis (GSEA) to identify pathways that are significantly enriched in the differentially expressed genes. We are using Hallmark Pathways

# Prepare data for GSEA
res <- as.data.frame(res)
res <- res %>%
  tibble::rownames_to_column(var = "GeneSymbol")

res2 <- res %>% 
  dplyr::select(GeneSymbol, log2FoldChange)

ranks <- tibble::deframe(res2)

# Load Hallmark gene sets
pathways.hallmark <- gmtPathways("/Users/s.rao/Desktop/TH3 Files/Section 4/h.all.v7.1.symbols.gmt")

# Run GSEA
fgseaRes <- fgsea(pathways = pathways.hallmark, stats = ranks, nperm = 1000)
## Warning in fgsea(pathways = pathways.hallmark, stats = ranks, nperm = 1000):
## You are trying to run fgseaSimple. It is recommended to use fgseaMultilevel. To
## run fgseaMultilevel, you need to remove the nperm argument in the fgsea
## function call.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
# Tidy GSEA results
fgseaResTidy <- fgseaRes %>%
  as_tibble() %>%
  arrange(desc(NES))

# Filter significant pathways
fgseaResTidy1 <- filter(fgseaResTidy, fgseaResTidy$padj < 0.05)
fgseaResTidy1 <- fgseaResTidy1 %>% 
  mutate(variables = case_when(
    NES > 0 ~ "up", 
    NES < 0  ~ "down"))

fgseaResTidy1
## # A tibble: 28 × 9
##    pathway    pval    padj     ES   NES nMoreExtreme  size leadingEdge variables
##    <chr>     <dbl>   <dbl>  <dbl> <dbl>        <dbl> <int> <list>      <chr>    
##  1 HALLMA… 0.00194 0.00572  0.707  1.97            0    30 <chr [14]>  up       
##  2 HALLMA… 0.0231  0.0413  -0.335 -1.33            8   195 <chr [38]>  down     
##  3 HALLMA… 0.00525 0.00972 -0.359 -1.43            1   198 <chr [58]>  down     
##  4 HALLMA… 0.00515 0.00972 -0.363 -1.44            1   192 <chr [62]>  down     
##  5 HALLMA… 0.00501 0.00972 -0.377 -1.46            1   150 <chr [53]>  down     
##  6 HALLMA… 0.00263 0.00572 -0.370 -1.47            0   196 <chr [58]>  down     
##  7 HALLMA… 0.00256 0.00572 -0.379 -1.51            0   191 <chr [61]>  down     
##  8 HALLMA… 0.00244 0.00572 -0.394 -1.54            0   158 <chr [36]>  down     
##  9 HALLMA… 0.00444 0.00926 -0.445 -1.59            1    82 <chr [29]>  down     
## 10 HALLMA… 0.00251 0.00572 -0.418 -1.61            0   149 <chr [85]>  down     
## # ℹ 18 more rows
# Plot GSEA results
ggplot(fgseaResTidy1, aes(reorder(pathway, NES), NES)) +
  geom_col(aes(fill = variables)) +
  coord_flip() + scale_fill_manual(values = c("#4d9221", "#c51b7d")) +
  labs(x = "Pathway", y = "Normalized Enrichment Score") +
  theme_bw(base_size = 12)