Install and load packages

library(BiocManager)
library(tximport)
library(DESeq2)
library(biomaRt)
library(pheatmap)
library(tidyverse)
library(ggrepel)

Load Sample tables

sample_table <- read_csv(
  "https://raw.githubusercontent.com/sjcockell/mmb8052/main/practicals/practical_08/data/sample_table.csv"
)

sample_table
## # A tibble: 12 × 11
##    Run        GEO_ID  Group Age   Cell_type Instrument Isolate Sample_Name sex  
##    <chr>      <chr>   <chr> <chr> <chr>     <chr>      <chr>   <chr>       <chr>
##  1 SRR7457551 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R05_AM_All… male 
##  2 SRR7457552 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R06_AM_All… male 
##  3 SRR7457553 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R03_AM_All… male 
##  4 SRR7457554 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R04_AM_All… male 
##  5 SRR7457555 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R01_AM_All… male 
##  6 SRR7457556 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R02_AM_All… male 
##  7 SRR7457557 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N03_AM_Nai… male 
##  8 SRR7457558 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N04_AM_Nai… male 
##  9 SRR7457559 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N01_AM_Nai… male 
## 10 SRR7457560 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N02_AM_Nai… male 
## 11 SRR7457561 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R07_AM_All… male 
## 12 SRR7457562 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R08_AM_All… male 
## # ℹ 2 more variables: STRAIN <chr>, Tissue <chr>

Build vectors of file paths

files <- pull(sample_table, Run)
files <- paste0(files, "/quant.sf")
names(files) <- pull(sample_table, Run)

head(files)
##            SRR7457551            SRR7457552            SRR7457553 
## "SRR7457551/quant.sf" "SRR7457552/quant.sf" "SRR7457553/quant.sf" 
##            SRR7457554            SRR7457555            SRR7457556 
## "SRR7457554/quant.sf" "SRR7457555/quant.sf" "SRR7457556/quant.sf"

Load Transcipts to gene mapping

gene_map <- read_csv(
  "https://github.com/sjcockell/mmb8052/raw/main/practicals/practical_08/extdata/gene_map.csv"
)

Import quantification data with tximport

txi <- tximport(files,
                type = "salmon",
                tx2gene = gene_map,
                ignoreTxVersion = TRUE
)

str(txi)
## List of 4
##  $ abundance          : num [1:55858, 1:12] 82.36 0 34.68 0 0.12 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:55858] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
##   .. ..$ : chr [1:12] "SRR7457551" "SRR7457552" "SRR7457553" "SRR7457554" ...
##  $ counts             : num [1:55858, 1:12] 5400 0 1332 0 12 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:55858] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
##   .. ..$ : chr [1:12] "SRR7457551" "SRR7457552" "SRR7457553" "SRR7457554" ...
##  $ length             : num [1:55858, 1:12] 3012 550 1764 1409 4592 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:55858] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
##   .. ..$ : chr [1:12] "SRR7457551" "SRR7457552" "SRR7457553" "SRR7457554" ...
##  $ countsFromAbundance: chr "no"

Build DESeq2 data object

dds <- DESeqDataSetFromTximport(
  txi,
  colData = sample_table,
  design = ~ Group
)

dds <- DESeq(dds)

plotDispEsts(dds)

rlog transformation with PCA

rld <- rlog(dds)

assay(rld)[1:5, 1:5]
##                    SRR7457551 SRR7457552 SRR7457553 SRR7457554 SRR7457555
## ENSMUSG00000000001  11.984264  12.041706  11.973296  12.011140  11.995211
## ENSMUSG00000000003   0.000000   0.000000   0.000000   0.000000   0.000000
## ENSMUSG00000000028   9.464706   9.634814   7.432353   7.619907   7.384439
## ENSMUSG00000000031   0.000000   0.000000   0.000000   0.000000   0.000000
## ENSMUSG00000000037   2.519403   2.624355   2.393059   3.554851   2.443859
plotPCA(rld, intgroup = "Group")

Sample Distance Heatmap

# Step 1: Compute sample-to-sample distances
sample_distance <- dist(t(assay(rld)), method = "euclidean")
sample_distance_matrix <- as.matrix(sample_distance)

# Step 2: Create annotation for sample groups
heatmap_annotation <- data.frame(
  Group = colData(dds)[, "Group"]
)
rownames(heatmap_annotation) <- rownames(colData(dds))

# Step 3: Plot heatmap with improved color and clustering
pheatmap(
  sample_distance_matrix,
  clustering_distance_rows = sample_distance,
  clustering_distance_cols = sample_distance,
  annotation_col = heatmap_annotation,
  color = colorRampPalette(c("navy", "white", "firebrick3"))(100),
  main = "Sample-to-Sample Distance Heatmap",
  fontsize = 10,
  border_color = NA
)

Volcano plot: Allo24h vs Naive

# Step 1: Prepare results
results_table <- results(dds, contrast = c("Group", "Allo24h", "Naive"))
results_tibble <- as_tibble(results_table, rownames = "ensembl_gene_id") %>%
  filter(complete.cases(.)) %>%
  mutate(
    logPVal = -log10(padj),
    regulation = case_when(
      padj < 0.05 & log2FoldChange > 1 ~ "Upregulated",
      padj < 0.05 & log2FoldChange < -1 ~ "Downregulated",
      TRUE ~ "Not Significant"
    )
  )

# Step 2: Plot volcano with color
ggplot(results_tibble, aes(x = log2FoldChange, y = logPVal, color = regulation)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue", "Not Significant" = "grey")) +
  labs(
    title = "Volcano Plot: Allo24h vs Naive",
    x = "log2 Fold Change",
    y = "-log10 Adjusted p-value",
    color = "Regulation"
  ) +
  theme_minimal(base_size = 14)

Volcano plot: Allo2h vs Naive

# Step 1: Extract and prepare results
results_table_2h <- results(dds, contrast = c("Group", "Allo2h", "Naive"))

results_tibble_2h <- as_tibble(results_table_2h, rownames = "ensembl_gene_id") %>%
  filter(complete.cases(.)) %>%
  mutate(
    logPVal = -log10(padj),
    regulation = case_when(
      padj < 0.05 & log2FoldChange > 1 ~ "Upregulated",
      padj < 0.05 & log2FoldChange < -1 ~ "Downregulated",
      TRUE ~ "Not Significant"
    )
  )

# Step 2: Plot volcano with color and theme
ggplot(results_tibble_2h, aes(x = log2FoldChange, y = logPVal, color = regulation)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue", "Not Significant" = "grey")) +
  labs(
    title = "Volcano Plot: Allo2h vs Naive",
    x = "log2 Fold Change",
    y = "-log10 Adjusted p-value",
    color = "Regulation"
  ) +
  theme_minimal(base_size = 14)

#Gene Annotation(BiomaRt)

# 1. Get DESeq2 results
res24 <- results(dds, contrast = c("Group", "Allo24h", "Naive"))

# 2. Convert to tibble and filter
res24_tbl <- as_tibble(res24, rownames = "ensembl_gene_id") %>%
  filter(!is.na(padj)) %>%
  mutate(logPVal = -log10(padj))

# 3. THIS is the corrected line — you were missing it
filtered_results <- res24_tbl

# 4. Connect to Ensembl v108
ensembl108 <- useEnsembl(biomart = "ensembl", version = 108)
ensembl108 <- useDataset("mmusculus_gene_ensembl", mart = ensembl108)

# 5. Retrieve annotation
annotation <- getBM(
  attributes = c(
    "ensembl_gene_id", "external_gene_name", "gene_biotype",
    "chromosome_name", "start_position", "end_position",
    "strand", "description"
  ),
  filters = "ensembl_gene_id",
  values = filtered_results$ensembl_gene_id,   # <-- this now works
  mart = ensembl108
)

# 6. Merge annotation
annot_results <- left_join(filtered_results, annotation, by = "ensembl_gene_id")


# 7. Extract DEGs
degs <- annot_results %>%
  filter(abs(log2FoldChange) > 1, padj < 0.05)

# 8. Select top 10 DEGs
top_genes <- degs %>%
  slice_max(abs(log2FoldChange), n = 10)

Volcano plot with upregulated and downregulated

annot_results <- annot_results %>%
  mutate(
    logPVal = -log10(padj),
    regulation = case_when(
      padj < 0.05 & log2FoldChange > 1 ~ "Upregulated",
      padj < 0.05 & log2FoldChange < -1 ~ "Downregulated",
      TRUE ~ "Not Significant"
    )
  )

top_genes <- annot_results %>%
  filter(regulation != "Not Significant") %>%
  slice_max(abs(log2FoldChange), n = 10)

 ggplot(annot_results, aes(x = log2FoldChange, y = logPVal, color = regulation)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_manual(values = c(
    "Upregulated" = "red",
    "Downregulated" = "blue",
    "Not Significant" = "grey"
  )) +
  geom_text_repel(
    data = top_genes,
    aes(label = external_gene_name),
    size = 3,
    max.overlaps = Inf,
    box.padding = 0.4,
    point.padding = 0.3,
    segment.color = "black"
  ) +
  labs(
    title = "Volcano Plot with Top DE Genes Labelled",
    x = "log2 Fold Change",
    y = "-log10 Adjusted p-value",
    color = "Regulation"
  ) +
  theme_minimal(base_size = 14)

Heatmap of DEGs for Allo2h vs Naive

# 1. DEGs for Allo2h vs Naive
res2h <- results(dds, contrast = c("Group", "Allo2h", "Naive"))
res2h_tbl <- as_tibble(res2h, rownames = "ensembl_gene_id") %>%
  filter(!is.na(padj), padj < 0.05, abs(log2FoldChange) > 1)

# 2. Extract expression matrix
rld_mat <- assay(rld)

deg_mat_2h <- rld_mat[rownames(rld_mat) %in% res2h_tbl$ensembl_gene_id, ]

# 3. Annotation
annotation_col <- data.frame(Group = colData(dds)$Group)
rownames(annotation_col) <- colnames(deg_mat_2h)

# 4. Heatmap
pheatmap(
  deg_mat_2h,
  scale = "row",
  clustering_distance_rows = "euclidean",
  clustering_distance_cols = "euclidean",
  annotation_col = annotation_col,
  color = colorRampPalette(c("navy", "white", "firebrick3"))(100),
  show_rownames = FALSE,
  main = "DEG Heatmap: Allo2h vs Naive"
)

Heatmap of DEGs for Allo24 vs Naive

# 1. DEGs for Allo24h vs Naive
res24 <- results(dds, contrast = c("Group", "Allo24h", "Naive"))
res24_tbl <- as_tibble(res24, rownames = "ensembl_gene_id") %>%
  filter(!is.na(padj), padj < 0.05, abs(log2FoldChange) > 1)

# 2. Extract expression matrix
deg_mat_24h <- rld_mat[rownames(rld_mat) %in% res24_tbl$ensembl_gene_id, ]

# 3. Annotation
annotation_col <- data.frame(Group = colData(dds)$Group)
rownames(annotation_col) <- colnames(deg_mat_24h)

# 4. Heatmap
pheatmap(
  deg_mat_24h,
  scale = "row",
  clustering_distance_rows = "euclidean",
  clustering_distance_cols = "euclidean",
  annotation_col = annotation_col,
  color = colorRampPalette(c("navy", "white", "firebrick3"))(100),
  show_rownames = FALSE,
  main = "DEG Heatmap: Allo24h vs Naive"
)

Top-10 DEGs: Allo24h vs Naive

# Top 5 upregulated (positive log2FC)
top_up <- res24_tbl %>%
  filter(padj < 0.05, log2FoldChange > 1) %>%
  arrange(padj) %>%
  slice(1:5)

# Top 5 downregulated (negative log2FC)
top_down <- res24_tbl %>%
  filter(padj < 0.05, log2FoldChange < -1) %>%
  arrange(padj) %>%
  slice(1:5)

# Combine and annotate
top10_combined <- bind_rows(top_up, top_down) %>%
  left_join(annotation, by = "ensembl_gene_id")

#plot
ggplot(top10_combined, aes(
  x = reorder(external_gene_name, log2FoldChange),
  y = log2FoldChange,
  fill = log2FoldChange > 0
)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(
    values = c("steelblue", "firebrick"),
    labels = c("Downregulated", "Upregulated")
  ) +
  labs(
    title = "Top 10 DEGs (Up & Down): Allo24h vs Naive",
    x = "Gene",
    y = "log2 Fold Change",
    fill = "Direction"
  ) +
  theme_minimal(base_size = 14)