Install and load packages

library(BiocManager)
library(tximport)
library(DESeq2)
library(biomaRt)
library(pheatmap)
library(tidyverse)

Load Sample tables

sample_table <- read_csv(
  "https://raw.githubusercontent.com/sjcockell/mmb8052/main/practicals/practical_08/data/sample_table.csv"
)

sample_table
## # A tibble: 12 × 11
##    Run        GEO_ID  Group Age   Cell_type Instrument Isolate Sample_Name sex  
##    <chr>      <chr>   <chr> <chr> <chr>     <chr>      <chr>   <chr>       <chr>
##  1 SRR7457551 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R05_AM_All… male 
##  2 SRR7457552 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R06_AM_All… male 
##  3 SRR7457553 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R03_AM_All… male 
##  4 SRR7457554 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R04_AM_All… male 
##  5 SRR7457555 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R01_AM_All… male 
##  6 SRR7457556 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R02_AM_All… male 
##  7 SRR7457557 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N03_AM_Nai… male 
##  8 SRR7457558 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N04_AM_Nai… male 
##  9 SRR7457559 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N01_AM_Nai… male 
## 10 SRR7457560 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N02_AM_Nai… male 
## 11 SRR7457561 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R07_AM_All… male 
## 12 SRR7457562 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R08_AM_All… male 
## # ℹ 2 more variables: STRAIN <chr>, Tissue <chr>

Build vectors of file paths

files <- pull(sample_table, Run)
files <- paste0(files, "/quant.sf")
names(files) <- pull(sample_table, Run)

head(files)
##            SRR7457551            SRR7457552            SRR7457553 
## "SRR7457551/quant.sf" "SRR7457552/quant.sf" "SRR7457553/quant.sf" 
##            SRR7457554            SRR7457555            SRR7457556 
## "SRR7457554/quant.sf" "SRR7457555/quant.sf" "SRR7457556/quant.sf"

Load Transcipts to gene mapping

gene_map <- read_csv(
  "https://github.com/sjcockell/mmb8052/raw/main/practicals/practical_08/extdata/gene_map.csv"
)

Import quantification data with tximport

txi <- tximport(files,
                type = "salmon",
                tx2gene = gene_map,
                ignoreTxVersion = TRUE
)

str(txi)
## List of 4
##  $ abundance          : num [1:55858, 1:12] 82.36 0 34.68 0 0.12 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:55858] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
##   .. ..$ : chr [1:12] "SRR7457551" "SRR7457552" "SRR7457553" "SRR7457554" ...
##  $ counts             : num [1:55858, 1:12] 5400 0 1332 0 12 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:55858] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
##   .. ..$ : chr [1:12] "SRR7457551" "SRR7457552" "SRR7457553" "SRR7457554" ...
##  $ length             : num [1:55858, 1:12] 3012 550 1764 1409 4592 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:55858] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
##   .. ..$ : chr [1:12] "SRR7457551" "SRR7457552" "SRR7457553" "SRR7457554" ...
##  $ countsFromAbundance: chr "no"

Build DESeq2 data object

dds <- DESeqDataSetFromTximport(
  txi,
  colData = sample_table,
  design = ~ Group
)

dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)

plotDispEsts(dds)

rlog transformation with PCA

rld <- rlog(dds)

assay(rld)[1:5, 1:5]
##                    SRR7457551 SRR7457552 SRR7457553 SRR7457554 SRR7457555
## ENSMUSG00000000001  11.984264  12.041706  11.973296  12.011140  11.995211
## ENSMUSG00000000003   0.000000   0.000000   0.000000   0.000000   0.000000
## ENSMUSG00000000028   9.464706   9.634814   7.432353   7.619907   7.384439
## ENSMUSG00000000031   0.000000   0.000000   0.000000   0.000000   0.000000
## ENSMUSG00000000037   2.519403   2.624355   2.393059   3.554851   2.443859
plotPCA(rld, intgroup = "Group")

Sample Distance Heatmap

sample_distance <- dist(t(assay(rld)), method = "euclidean")
sample_distance_matrix <- as.matrix(sample_distance)

heatmap_annotation <- data.frame(
  group = colData(dds)[, "Group"],
  row.names = rownames(colData(dds))
)

pheatmap(
  sample_distance_matrix,
  clustering_distance_rows = sample_distance,
  clustering_distance_cols = sample_distance,
  annotation_col = heatmap_annotation
)

Volcano plot: Allo24h vs Naive

results_table <- results(dds, contrast = c("Group", "Allo24h", "Naive"))

results_tibble <- as_tibble(results_table, rownames = "ensembl_gene_id")
filtered_results <- filter(results_tibble, complete.cases(results_tibble))
filtered_results <- mutate(filtered_results, logPVal = -log10(padj))

ggplot(filtered_results, aes(x = log2FoldChange, y = logPVal)) +
  geom_point() +
  labs(
    title = "Volcano Plot: Allo24h vs Naive",
    x = "log2 Fold Change",
    y = "-log10 Adjusted p-value"
  )

Volcano plot: Allo2h vs Naive

results_table_2h <- results(dds, contrast = c("Group", "Allo2h", "Naive"))

results_tibble_2h <- as_tibble(results_table_2h, rownames = "ensembl_gene_id")
filtered_results_2h <- filter(results_tibble_2h, complete.cases(results_tibble_2h))
filtered_results_2h <- mutate(filtered_results_2h, logPVal = -log10(padj))

ggplot(filtered_results_2h, aes(x = log2FoldChange, y = logPVal)) +
  geom_point() +
  labs(
    title = "Volcano Plot: Allo2h vs Naive",
    x = "log2 Fold Change",
    y = "-log10 Adjusted p-value"
  )

#Gene Annotation(BiomaRt)

ensembl108 <- useEnsembl(biomart = "ensembl", version = 108)
ensembl108 <- useDataset("mmusculus_gene_ensembl", mart = ensembl108)

annotation <- getBM(
  attributes = c(
    'ensembl_gene_id', 'chromosome_name', 'start_position',
    'end_position', 'strand', 'gene_biotype',
    'external_gene_name', 'description'
  ),
  filters = 'ensembl_gene_id',
  values = filtered_results$ensembl_gene_id,
  mart = ensembl108
)

annot_results <- left_join(filtered_results, annotation)
annot_results <- arrange(annot_results, padj)

degs <- filter(annot_results, abs(log2FoldChange) > 1 & padj < 0.05)

Gene Biotype Summary Plot

biotype_summary <- annot_results %>%
  count(gene_biotype) %>%
  arrange(desc(n))

ggplot(biotype_summary, aes(x = reorder(gene_biotype, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Distribution of Gene Biotypes Among DEGs",
    x = "Gene Biotype",
    y = "Number of DEGs"
  ) +
  theme_minimal()

Top-10 DEGs: Allo24h vs Naive

res24 <- results(dds, contrast = c("Group", "Allo24h", "Naive"))
res24_tbl <- as_tibble(res24, rownames = "ensembl_gene_id")
res24_tbl <- filter(res24_tbl, !is.na(padj))

top10 <- res24_tbl %>%
  arrange(padj) %>%
  slice(1:10)

top10_annot <- left_join(top10, annotation, by = "ensembl_gene_id")

ggplot(top10_annot, aes(
  x = reorder(external_gene_name, log2FoldChange),
  y = log2FoldChange,
  fill = log2FoldChange > 0
)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(
    values = c("steelblue", "firebrick"),
    labels = c("Downregulated", "Upregulated")
  ) +
  labs(
    title = "Top 10 DEGs: Allo24h vs Naive",
    x = "Gene",
    y = "log2 Fold Change",
    fill = "Direction"
  ) +
  theme_minimal()