library(BiocManager)
library(tximport)
library(DESeq2)
library(biomaRt)
library(pheatmap)
library(tidyverse)
library(ggrepel)
sample_table <- read_csv(
"https://raw.githubusercontent.com/sjcockell/mmb8052/main/practicals/practical_08/data/sample_table.csv"
)
sample_table
## # A tibble: 12 × 11
## Run GEO_ID Group Age Cell_type Instrument Isolate Sample_Name sex
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 SRR7457551 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R05_AM_All… male
## 2 SRR7457552 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R06_AM_All… male
## 3 SRR7457553 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R03_AM_All… male
## 4 SRR7457554 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R04_AM_All… male
## 5 SRR7457555 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R01_AM_All… male
## 6 SRR7457556 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 2h Rep… R02_AM_All… male
## 7 SRR7457557 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N03_AM_Nai… male
## 8 SRR7457558 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N04_AM_Nai… male
## 9 SRR7457559 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N01_AM_Nai… male
## 10 SRR7457560 GSM324… Naive 14 w… Alveolar… NextSeq 5… Naive … N02_AM_Nai… male
## 11 SRR7457561 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R07_AM_All… male
## 12 SRR7457562 GSM324… Allo… 14 w… Alveolar… NextSeq 5… 24h Re… R08_AM_All… male
## # ℹ 2 more variables: STRAIN <chr>, Tissue <chr>
files <- pull(sample_table, Run)
files <- paste0(files, "/quant.sf")
names(files) <- pull(sample_table, Run)
head(files)
## SRR7457551 SRR7457552 SRR7457553
## "SRR7457551/quant.sf" "SRR7457552/quant.sf" "SRR7457553/quant.sf"
## SRR7457554 SRR7457555 SRR7457556
## "SRR7457554/quant.sf" "SRR7457555/quant.sf" "SRR7457556/quant.sf"
gene_map <- read_csv(
"https://github.com/sjcockell/mmb8052/raw/main/practicals/practical_08/extdata/gene_map.csv"
)
txi <- tximport(files,
type = "salmon",
tx2gene = gene_map,
ignoreTxVersion = TRUE
)
str(txi)
## List of 4
## $ abundance : num [1:55858, 1:12] 82.36 0 34.68 0 0.12 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:55858] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
## .. ..$ : chr [1:12] "SRR7457551" "SRR7457552" "SRR7457553" "SRR7457554" ...
## $ counts : num [1:55858, 1:12] 5400 0 1332 0 12 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:55858] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
## .. ..$ : chr [1:12] "SRR7457551" "SRR7457552" "SRR7457553" "SRR7457554" ...
## $ length : num [1:55858, 1:12] 3012 550 1764 1409 4592 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:55858] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
## .. ..$ : chr [1:12] "SRR7457551" "SRR7457552" "SRR7457553" "SRR7457554" ...
## $ countsFromAbundance: chr "no"
dds <- DESeqDataSetFromTximport(
txi,
colData = sample_table,
design = ~ Group
)
dds <- DESeq(dds)
plotDispEsts(dds)
rld <- rlog(dds)
assay(rld)[1:5, 1:5]
## SRR7457551 SRR7457552 SRR7457553 SRR7457554 SRR7457555
## ENSMUSG00000000001 11.984264 12.041706 11.973296 12.011140 11.995211
## ENSMUSG00000000003 0.000000 0.000000 0.000000 0.000000 0.000000
## ENSMUSG00000000028 9.464706 9.634814 7.432353 7.619907 7.384439
## ENSMUSG00000000031 0.000000 0.000000 0.000000 0.000000 0.000000
## ENSMUSG00000000037 2.519403 2.624355 2.393059 3.554851 2.443859
plotPCA(rld, intgroup = "Group")
# Step 1: Compute sample-to-sample distances
sample_distance <- dist(t(assay(rld)), method = "euclidean")
sample_distance_matrix <- as.matrix(sample_distance)
# Step 2: Create annotation for sample groups
heatmap_annotation <- data.frame(
Group = colData(dds)[, "Group"]
)
rownames(heatmap_annotation) <- rownames(colData(dds))
# Step 3: Plot heatmap with improved color and clustering
pheatmap(
sample_distance_matrix,
clustering_distance_rows = sample_distance,
clustering_distance_cols = sample_distance,
annotation_col = heatmap_annotation,
color = colorRampPalette(c("navy", "white", "firebrick3"))(100),
main = "Sample-to-Sample Distance Heatmap",
fontsize = 10,
border_color = NA
)
# Step 1: Prepare results
results_table <- results(dds, contrast = c("Group", "Allo24h", "Naive"))
results_tibble <- as_tibble(results_table, rownames = "ensembl_gene_id") %>%
filter(complete.cases(.)) %>%
mutate(
logPVal = -log10(padj),
regulation = case_when(
padj < 0.05 & log2FoldChange > 1 ~ "Upregulated",
padj < 0.05 & log2FoldChange < -1 ~ "Downregulated",
TRUE ~ "Not Significant"
)
)
# Step 2: Plot volcano with color
ggplot(results_tibble, aes(x = log2FoldChange, y = logPVal, color = regulation)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue", "Not Significant" = "grey")) +
labs(
title = "Volcano Plot: Allo24h vs Naive",
x = "log2 Fold Change",
y = "-log10 Adjusted p-value",
color = "Regulation"
) +
theme_minimal(base_size = 14)
# Step 1: Extract and prepare results
results_table_2h <- results(dds, contrast = c("Group", "Allo2h", "Naive"))
results_tibble_2h <- as_tibble(results_table_2h, rownames = "ensembl_gene_id") %>%
filter(complete.cases(.)) %>%
mutate(
logPVal = -log10(padj),
regulation = case_when(
padj < 0.05 & log2FoldChange > 1 ~ "Upregulated",
padj < 0.05 & log2FoldChange < -1 ~ "Downregulated",
TRUE ~ "Not Significant"
)
)
# Step 2: Plot volcano with color and theme
ggplot(results_tibble_2h, aes(x = log2FoldChange, y = logPVal, color = regulation)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("Upregulated" = "red", "Downregulated" = "blue", "Not Significant" = "grey")) +
labs(
title = "Volcano Plot: Allo2h vs Naive",
x = "log2 Fold Change",
y = "-log10 Adjusted p-value",
color = "Regulation"
) +
theme_minimal(base_size = 14)
#Gene Annotation(BiomaRt)
# 1. Get DESeq2 results
res24 <- results(dds, contrast = c("Group", "Allo24h", "Naive"))
# 2. Convert to tibble and filter
res24_tbl <- as_tibble(res24, rownames = "ensembl_gene_id") %>%
filter(!is.na(padj)) %>%
mutate(logPVal = -log10(padj))
# 3. THIS is the corrected line — you were missing it
filtered_results <- res24_tbl
# 4. Connect to Ensembl v108
ensembl108 <- useEnsembl(biomart = "ensembl", version = 108)
ensembl108 <- useDataset("mmusculus_gene_ensembl", mart = ensembl108)
# 5. Retrieve annotation
annotation <- getBM(
attributes = c(
"ensembl_gene_id", "external_gene_name", "gene_biotype",
"chromosome_name", "start_position", "end_position",
"strand", "description"
),
filters = "ensembl_gene_id",
values = filtered_results$ensembl_gene_id, # <-- this now works
mart = ensembl108
)
# 6. Merge annotation
annot_results <- left_join(filtered_results, annotation, by = "ensembl_gene_id")
# 7. Extract DEGs
degs <- annot_results %>%
filter(abs(log2FoldChange) > 1, padj < 0.05)
# 8. Select top 10 DEGs
top_genes <- degs %>%
slice_max(abs(log2FoldChange), n = 10)
annot_results <- annot_results %>%
mutate(
logPVal = -log10(padj),
regulation = case_when(
padj < 0.05 & log2FoldChange > 1 ~ "Upregulated",
padj < 0.05 & log2FoldChange < -1 ~ "Downregulated",
TRUE ~ "Not Significant"
)
)
top_genes <- annot_results %>%
filter(regulation != "Not Significant") %>%
slice_max(abs(log2FoldChange), n = 10)
ggplot(annot_results, aes(x = log2FoldChange, y = logPVal, color = regulation)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c(
"Upregulated" = "red",
"Downregulated" = "blue",
"Not Significant" = "grey"
)) +
geom_text_repel(
data = top_genes,
aes(label = external_gene_name),
size = 3,
max.overlaps = Inf,
box.padding = 0.4,
point.padding = 0.3,
segment.color = "black"
) +
labs(
title = "Volcano Plot with Top DE Genes Labelled",
x = "log2 Fold Change",
y = "-log10 Adjusted p-value",
color = "Regulation"
) +
theme_minimal(base_size = 14)
# 1. DEGs for Allo2h vs Naive
res2h <- results(dds, contrast = c("Group", "Allo2h", "Naive"))
res2h_tbl <- as_tibble(res2h, rownames = "ensembl_gene_id") %>%
filter(!is.na(padj), padj < 0.05, abs(log2FoldChange) > 1)
# 2. Extract expression matrix
rld_mat <- assay(rld)
deg_mat_2h <- rld_mat[rownames(rld_mat) %in% res2h_tbl$ensembl_gene_id, ]
# 3. Annotation
annotation_col <- data.frame(Group = colData(dds)$Group)
rownames(annotation_col) <- colnames(deg_mat_2h)
# 4. Heatmap
pheatmap(
deg_mat_2h,
scale = "row",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
annotation_col = annotation_col,
color = colorRampPalette(c("navy", "white", "firebrick3"))(100),
show_rownames = FALSE,
main = "DEG Heatmap: Allo2h vs Naive"
)
# 1. DEGs for Allo24h vs Naive
res24 <- results(dds, contrast = c("Group", "Allo24h", "Naive"))
res24_tbl <- as_tibble(res24, rownames = "ensembl_gene_id") %>%
filter(!is.na(padj), padj < 0.05, abs(log2FoldChange) > 1)
# 2. Extract expression matrix
deg_mat_24h <- rld_mat[rownames(rld_mat) %in% res24_tbl$ensembl_gene_id, ]
# 3. Annotation
annotation_col <- data.frame(Group = colData(dds)$Group)
rownames(annotation_col) <- colnames(deg_mat_24h)
# 4. Heatmap
pheatmap(
deg_mat_24h,
scale = "row",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
annotation_col = annotation_col,
color = colorRampPalette(c("navy", "white", "firebrick3"))(100),
show_rownames = FALSE,
main = "DEG Heatmap: Allo24h vs Naive"
)
# Top 5 upregulated (positive log2FC)
top_up <- res24_tbl %>%
filter(padj < 0.05, log2FoldChange > 1) %>%
arrange(padj) %>%
slice(1:5)
# Top 5 downregulated (negative log2FC)
top_down <- res24_tbl %>%
filter(padj < 0.05, log2FoldChange < -1) %>%
arrange(padj) %>%
slice(1:5)
# Combine and annotate
top10_combined <- bind_rows(top_up, top_down) %>%
left_join(annotation, by = "ensembl_gene_id")
#plot
ggplot(top10_combined, aes(
x = reorder(external_gene_name, log2FoldChange),
y = log2FoldChange,
fill = log2FoldChange > 0
)) +
geom_col() +
coord_flip() +
scale_fill_manual(
values = c("steelblue", "firebrick"),
labels = c("Downregulated", "Upregulated")
) +
labs(
title = "Top 10 DEGs (Up & Down): Allo24h vs Naive",
x = "Gene",
y = "log2 Fold Change",
fill = "Direction"
) +
theme_minimal(base_size = 14)