library(BiocManager)
library(tximport)
library(DESeq2)
library(biomaRt)
library(pheatmap)
library(tidyverse)
sample_table <- read_csv(
"https://raw.githubusercontent.com/sjcockell/mmb8052/main/practicals/practical_08/data/sample_table.csv"
)
sample_table
## # A tibble: 12 × 11
## Run GEO_ID Group Age Cell_type Instrument Isolate Sample_Name sex
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 SRR7457551 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R05_AM_All… male
## 2 SRR7457552 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R06_AM_All… male
## 3 SRR7457553 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R03_AM_All… male
## 4 SRR7457554 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R04_AM_All… male
## 5 SRR7457555 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R01_AM_All… male
## 6 SRR7457556 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R02_AM_All… male
## 7 SRR7457557 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N03_AM_Nai… male
## 8 SRR7457558 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N04_AM_Nai… male
## 9 SRR7457559 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N01_AM_Nai… male
## 10 SRR7457560 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N02_AM_Nai… male
## 11 SRR7457561 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R07_AM_All… male
## 12 SRR7457562 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R08_AM_All… male
## # ℹ 2 more variables: STRAIN <chr>, Tissue <chr>
files <- pull(sample_table, Run)
files <- paste0(files, "/quant.sf")
names(files) <- pull(sample_table, Run)
head(files)
## SRR7457551 SRR7457552 SRR7457553
## "SRR7457551/quant.sf" "SRR7457552/quant.sf" "SRR7457553/quant.sf"
## SRR7457554 SRR7457555 SRR7457556
## "SRR7457554/quant.sf" "SRR7457555/quant.sf" "SRR7457556/quant.sf"
gene_map <- read_csv(
"https://github.com/sjcockell/mmb8052/raw/main/practicals/practical_08/extdata/gene_map.csv"
)
txi <- tximport(files,
type = "salmon",
tx2gene = gene_map,
ignoreTxVersion = TRUE
)
str(txi)
## List of 4
## $ abundance : num [1:55858, 1:12] 82.36 0 34.68 0 0.12 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:55858] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
## .. ..$ : chr [1:12] "SRR7457551" "SRR7457552" "SRR7457553" "SRR7457554" ...
## $ counts : num [1:55858, 1:12] 5400 0 1332 0 12 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:55858] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
## .. ..$ : chr [1:12] "SRR7457551" "SRR7457552" "SRR7457553" "SRR7457554" ...
## $ length : num [1:55858, 1:12] 3012 550 1764 1409 4592 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:55858] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
## .. ..$ : chr [1:12] "SRR7457551" "SRR7457552" "SRR7457553" "SRR7457554" ...
## $ countsFromAbundance: chr "no"
dds <- DESeqDataSetFromTximport(
txi,
colData = sample_table,
design = ~ Group
)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
dds <- nbinomWaldTest(dds)
plotDispEsts(dds)
rld <- rlog(dds)
assay(rld)[1:5, 1:5]
## SRR7457551 SRR7457552 SRR7457553 SRR7457554 SRR7457555
## ENSMUSG00000000001 11.984264 12.041706 11.973296 12.011140 11.995211
## ENSMUSG00000000003 0.000000 0.000000 0.000000 0.000000 0.000000
## ENSMUSG00000000028 9.464706 9.634814 7.432353 7.619907 7.384439
## ENSMUSG00000000031 0.000000 0.000000 0.000000 0.000000 0.000000
## ENSMUSG00000000037 2.519403 2.624355 2.393059 3.554851 2.443859
plotPCA(rld, intgroup = "Group")
sample_distance <- dist(t(assay(rld)), method = "euclidean")
sample_distance_matrix <- as.matrix(sample_distance)
heatmap_annotation <- data.frame(
group = colData(dds)[, "Group"],
row.names = rownames(colData(dds))
)
pheatmap(
sample_distance_matrix,
clustering_distance_rows = sample_distance,
clustering_distance_cols = sample_distance,
annotation_col = heatmap_annotation
)
results_table <- results(dds, contrast = c("Group", "Allo24h", "Naive"))
results_tibble <- as_tibble(results_table, rownames = "ensembl_gene_id")
filtered_results <- filter(results_tibble, complete.cases(results_tibble))
filtered_results <- mutate(filtered_results, logPVal = -log10(padj))
ggplot(filtered_results, aes(x = log2FoldChange, y = logPVal)) +
geom_point() +
labs(
title = "Volcano Plot: Allo24h vs Naive",
x = "log2 Fold Change",
y = "-log10 Adjusted p-value"
)
results_table_2h <- results(dds, contrast = c("Group", "Allo2h", "Naive"))
results_tibble_2h <- as_tibble(results_table_2h, rownames = "ensembl_gene_id")
filtered_results_2h <- filter(results_tibble_2h, complete.cases(results_tibble_2h))
filtered_results_2h <- mutate(filtered_results_2h, logPVal = -log10(padj))
ggplot(filtered_results_2h, aes(x = log2FoldChange, y = logPVal)) +
geom_point() +
labs(
title = "Volcano Plot: Allo2h vs Naive",
x = "log2 Fold Change",
y = "-log10 Adjusted p-value"
)
#Gene Annotation(BiomaRt)
ensembl108 <- useEnsembl(biomart = "ensembl", version = 108)
ensembl108 <- useDataset("mmusculus_gene_ensembl", mart = ensembl108)
annotation <- getBM(
attributes = c(
'ensembl_gene_id', 'chromosome_name', 'start_position',
'end_position', 'strand', 'gene_biotype',
'external_gene_name', 'description'
),
filters = 'ensembl_gene_id',
values = filtered_results$ensembl_gene_id,
mart = ensembl108
)
annot_results <- left_join(filtered_results, annotation)
annot_results <- arrange(annot_results, padj)
degs <- filter(annot_results, abs(log2FoldChange) > 1 & padj < 0.05)
biotype_summary <- annot_results %>%
count(gene_biotype) %>%
arrange(desc(n))
ggplot(biotype_summary, aes(x = reorder(gene_biotype, n), y = n)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(
title = "Distribution of Gene Biotypes Among DEGs",
x = "Gene Biotype",
y = "Number of DEGs"
) +
theme_minimal()
res24 <- results(dds, contrast = c("Group", "Allo24h", "Naive"))
res24_tbl <- as_tibble(res24, rownames = "ensembl_gene_id")
res24_tbl <- filter(res24_tbl, !is.na(padj))
top10 <- res24_tbl %>%
arrange(padj) %>%
slice(1:10)
top10_annot <- left_join(top10, annotation, by = "ensembl_gene_id")
ggplot(top10_annot, aes(
x = reorder(external_gene_name, log2FoldChange),
y = log2FoldChange,
fill = log2FoldChange > 0
)) +
geom_col() +
coord_flip() +
scale_fill_manual(
values = c("steelblue", "firebrick"),
labels = c("Downregulated", "Upregulated")
) +
labs(
title = "Top 10 DEGs: Allo24h vs Naive",
x = "Gene",
y = "log2 Fold Change",
fill = "Direction"
) +
theme_minimal()