library(Rsamtools) #to import BAM files
library(GenomicFeatures) #to import GTF file
library(GenomicAlignments) #to summarise overlaps, counting reads
library(DESeq2) #for differential expression analysis
library(pheatmap) #plot heatmap sample distances
library(RColorBrewer) #nice colour palettes
library(ggplot2) #customise plots
library(genefilter) #to select genes with most variance
library(AnnotationDbi) #functions to annotate the data
library(org.Mm.eg.db) #mouse database
library(dplyr)
library(reshape2)
library(PoiClaClu) #for Poisson distances
library(glmpca) #for generalized PCAs
library(ggbeeswarm) #for gene expression plots
library(apeglm) #for MA plots
library(IHW) #independent hypothesis weighting
Import metadata to R
sample_table <- read.csv("/Volumes/T7/Project_2311/miR128a_primary/docs/Sample_list_128a.csv", row.names = 1, stringsAsFactors = FALSE)
# make factors
sample_table$cell <- as.factor(sample_table$cell)
sample_table$niche <- as.factor(sample_table$niche)
sample_table$miR <- as.factor(sample_table$miR)
sample_table$transplant <- as.factor(sample_table$transplant)
sample_table$replicate <- as.factor(sample_table$replicate)
Import BAM files
filenames <- paste0(sample_table$index, ".Aligned.out.bam") # list of the filepaths for each BAM file
filepaths <- paste0("/Volumes/T7/Project_2311/miR128a_primary/data/cogent_bam/", filenames) # can check file paths work with command file.exists(filepaths)
bam_files <- BamFileList(filepaths, yieldSize = 2000000) # yield size is 2mill recommended default
Import GTF file
gtf_path <- "/Volumes/T7/Project_2311/miR128a_primary/docs/Mus_musculus.GRCm39.105.gtf"
txdb <- makeTxDbFromGFF(gtf_path, format = "gtf", circ_seqs = character(0)) # circ_seq = character(0) is how to tell it there is no circular sequence elements (and is a required element), making a tuxedo database
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
## stop_codon. This information was ignored.
## OK
ebg <- exonsBy(txdb, by = "gene") # rearranges the tuxedo database, exons by genes
se <- summarizeOverlaps(features = ebg, # se = summarised experiment format
reads = bam_files,
mode = "Union", # alignment method, explanation in help, but keep "Union" as default
fragments = TRUE,
singleEnd = FALSE)
colData(se) <- DataFrame(sample_table) # adding sample table data to columns, can the call up se data based on any sample table attributes by using $ eg sample name, treated/untreated, cell line etc (view by colData(se)$whatever_column_name)
data(se)
## Warning in data(se): data set 'se' not found
head(assay(se), 3)
## TACCGAGGAGTTCAGG CGTTAGAAGACCTGAA AGCCTCATTCTCTACT
## ENSMUSG00000000001 86 148 57
## ENSMUSG00000000003 0 0 0
## ENSMUSG00000000028 6 5 11
## GATTCTGCCTCTCGTC TCGTAGTGCCAAGTCT CTACGACATTGGACTC
## ENSMUSG00000000001 56 124 143
## ENSMUSG00000000003 2 2 1
## ENSMUSG00000000028 11 18 15
## TAAGTGGTGGCTTAAG CGGACAACAATCCGGA ATATGGATTAATACAG
## ENSMUSG00000000001 79 89 103
## ENSMUSG00000000003 0 3 0
## ENSMUSG00000000028 22 16 16
## GCGCAAGCCGGCGTGA AAGATACTATGTAAGT GGAGCGTCGCACGGAC
## ENSMUSG00000000001 77 147 110
## ENSMUSG00000000003 0 2 2
## ENSMUSG00000000028 5 16 5
## ATGGCATGGGTACCTT GCAATGCAAACGTTCC GTTCCAATGCAGAATT
## ENSMUSG00000000001 90 95 136
## ENSMUSG00000000003 5 0 0
## ENSMUSG00000000028 11 15 34
## ACCTTGGCATGAGGCC
## ENSMUSG00000000001 145
## ENSMUSG00000000003 1
## ENSMUSG00000000028 6
se$niche <- relevel(se$niche, "BM") # relevel highlighting the baseline (BM) to compare against
dds <- DESeqDataSet(se, design = ~ niche + cell) # ~ niche + cell as experimental variables
nrow(dds) # number of genes pre-filtering
## [1] 55414
keep <- rowSums(counts(dds) >= 1) >=3 # at least 3 samples with read count 1 or higher
dds <- dds[keep, ]
nrow(dds) # number of genes post-filtering
## [1] 48622
## rlog transformation
rld <- rlog(dds, blind = FALSE) #keep blind = FALSE, rld = r log dds, r log transformed
#check with head (assay(rld), 3) transformed data so no longer include count()
## VST
vsd <- vst(dds, blind = FALSE)
# Comparing different transformations, with distances between samples (deviating from x=y) which contribute to dsitance calculations and PCA, low counts can be excessively variable (even on ordinary log scale), compressed by VST and rlog trasnformation
# Plotting first sample against the second
dds <- estimateSizeFactors(dds)
df <- bind_rows(
as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
mutate(transformation = "log2(x + 1)"),
as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
colnames(df)[1:2] <- c("x", "y")
lvls <- c("log2(x + 1)", "vst", "rlog")
df$transformation <- factor(df$transformation, levels=lvls)
ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
coord_fixed() + facet_grid( . ~ transformation)
sample_dist <- dist(t(assay(vsd)), diag = TRUE) #requires the matrix to be transposed for the function to work
sample_dist_matrix <- as.matrix(sample_dist) #make into matrix for heat maps to eliminate empty values
rownames(sample_dist_matrix) <- paste(vsd$niche, vsd$cell, sep = " - ") # adding meaningful data to row and columns
colnames(sample_dist_matrix) <- NULL
pheatmap(sample_dist_matrix,
color = colorRampPalette( rev(brewer.pal(9, "Blues")))(255),
clustering_distance_rows = sample_dist,
clustering_distance_cols = sample_dist
)
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$niche, dds$cell, sep=" - " )
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
col = colorRampPalette( rev(brewer.pal(9, "Blues")))(255),
clustering_distance_rows = poisd$dd,
clustering_distance_cols = poisd$dd)
Data points projected onto a 2D plane spread out in the two directions that explain most of the differences.
# using rlog data
pca_data <- plotPCA(rld, intgroup = c( "niche", "cell"), returnData = TRUE) #input is rlog transformed data, intrgroup (interesting groups)
percent_var <- round(attr(pca_data, "percentVar") *100) #to extact percentage variance for ggplot axis labels
# instead of returning a plot, returnData = TRUE returns pca data for improved appearance eg with ggplot
ggplot(data = pca_data, mapping = aes(x = PC1, y = PC2, color = niche, shape = cell)) +
geom_point(size =3) +
xlab(paste("PC1: ", percent_var[1], "% variance")) +
ylab(paste("PC2: ", percent_var[2], "% variance")) +
ggtitle("PCA with rlog data")
# using VSD data
pcaData <- plotPCA(vsd, intgroup = c( "niche", "cell"), returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(x = PC1, y = PC2, color = niche, shape = cell)) +
geom_point(size =3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed() +
ggtitle("PCA with VST data")
#GLM-PCA
gpca <- glmpca(counts(dds), L=2)
gpca.dat <- gpca$factors
gpca.dat$niche <- dds$niche
gpca.dat$cell <- dds$cell
ggplot(gpca.dat, aes(x = dim1, y = dim2, color = niche, shape = cell)) +
geom_point(size =3) +
coord_fixed(2) +
ggtitle("glmpca - Generalized PCA")
mdsPois <- as.data.frame(colData(dds)) %>%
cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = `1`, y = `2`, color = niche, shape = cell)) +
geom_point(size = 3) + coord_fixed(ratio = 2) + ggtitle("MDS with PoissonDistances")
DESeq steps: estimaties size factors (controlling for differences in the sequencing depth of the samples), estimates of dispersion values for each gene, and fits a generalized linear model
dds <- DESeq(dds) #to include all outliers and minimally expressed genes include (dds, minReplicatesForReplace = Inf)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds, contrast = c("niche", "CNS", "BM")) # independent hypothesis weighting
summary(res)
##
## out of 48622 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 51, 0.1%
## LFC < 0 (down) : 16, 0.033%
## outliers [1] : 5, 0.01%
## low counts [2] : 24509, 50%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_ihw <- results(dds, contrast = c("niche", "CNS", "BM"), filterFun=ihw) # independent hypothesis weighting
summary(res_ihw)
##
## out of 48622 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 45, 0.093%
## LFC < 0 (down) : 41, 0.084%
## outliers [1] : 5, 0.01%
## [1] see 'cooksCutoff' argument of ?results
## see metadata(res)$ihwResult on hypothesis weighting
cell_res <- results(dds, contrast = c("cell", "LK", "LSK_IL7R"))
summary(cell_res)
##
## out of 48622 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 99, 0.2%
## LFC < 0 (down) : 219, 0.45%
## outliers [1] : 5, 0.01%
## low counts [2] : 26395, 54%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
cell_res_ihw <- results(dds, contrast = c("cell", "LK", "LSK_IL7R"),filterFun=ihw)
summary(cell_res_ihw)
##
## out of 48622 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 97, 0.2%
## LFC < 0 (down) : 279, 0.57%
## outliers [1] : 5, 0.01%
## [1] see 'cooksCutoff' argument of ?results
## see metadata(res)$ihwResult on hypothesis weighting
res_sig <- subset(res, padj < 0.1)
head(res_sig[order(res_sig$log2FoldChange),])
## log2 fold change (MLE): niche CNS vs BM
## Wald test p-value: niche CNS vs BM
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000024127 33.3050 -0.750633 0.205343 -3.65552 2.56664e-04
## ENSMUSG00000092056 33.5568 -0.740958 0.203476 -3.64151 2.71048e-04
## ENSMUSG00000059762 167.0756 -0.581683 0.134663 -4.31954 1.56356e-05
## ENSMUSG00000037465 307.6232 -0.514638 0.114700 -4.48683 7.22916e-06
## ENSMUSG00000019836 114.3921 -0.446693 0.118795 -3.76021 1.69773e-04
## ENSMUSG00000028629 98.4919 -0.440313 0.117141 -3.75883 1.70713e-04
## padj
## <numeric>
## ENSMUSG00000024127 0.0995124
## ENSMUSG00000092056 0.0995124
## ENSMUSG00000059762 0.0628239
## ENSMUSG00000037465 0.0507682
## ENSMUSG00000019836 0.0941105
## ENSMUSG00000028629 0.0941105
cell_res_sig <- subset(cell_res, padj < 0.1)
head(cell_res_sig[order(cell_res_sig$log2FoldChange),])
## log2 fold change (MLE): cell LK vs LSK_IL7R
## Wald test p-value: cell LK vs LSK_IL7R
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000035456 4.57128 -2.72940 0.617841 -4.41764 9.97834e-06
## ENSMUSG00000097906 5.32515 -2.47842 0.618143 -4.00946 6.08577e-05
## ENSMUSG00000067613 5.17657 -2.46157 0.661200 -3.72288 1.96964e-04
## ENSMUSG00000099596 5.38045 -2.21758 0.545556 -4.06481 4.80715e-05
## ENSMUSG00000026227 4.49556 -2.16991 0.610713 -3.55308 3.80746e-04
## ENSMUSG00000035626 4.83965 -2.14065 0.573493 -3.73266 1.89467e-04
## padj
## <numeric>
## ENSMUSG00000035456 0.0276594
## ENSMUSG00000097906 0.0397759
## ENSMUSG00000067613 0.0589254
## ENSMUSG00000099596 0.0381516
## ENSMUSG00000026227 0.0650842
## ENSMUSG00000035626 0.0589254
top_gene <- rownames(res)[which.min(res$pvalue)] #to select the minimum p-value
plotCounts(dds, gene = top_gene, intgroup=c("niche"))
geneCounts <- plotCounts(dds, gene = top_gene, intgroup = c("niche", "cell", "replicate"), returnData = TRUE)
ggplot(geneCounts, aes(x = niche, y = count, color = replicate, group = replicate, shape = cell)) +
scale_y_log10() + geom_point(size = 3) + geom_line() + ggtitle(top_gene)
count_data <- plotCounts(dds, gene = top_gene, intgroup = c("niche","cell"), returnData = TRUE)
ggplot(data = count_data, mapping = aes(x = niche, y = count, color = cell)) +
geom_jitter(width = 0.1) +
scale_y_log10()
resultsNames(dds)
## [1] "Intercept" "niche_CNS_vs_BM" "cell_LSK_IL7R_vs_LK"
#using apeglm method to shrink co-efficients
res_MA <- lfcShrink(dds, coef="niche_CNS_vs_BM", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(res_MA, ylim = c(-2, 4))
res.noshr <- results(dds, name="niche_CNS_vs_BM")
plotMA(res.noshr, ylim = c(-4, 4))
plotMA(res, ylim = c(-5,5))
top_gene <- rownames(res)[which.min(res$padj)]
with(res[top_gene, ], {
points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
text(baseMean, log2FoldChange, top_gene, pos=2, col="dodgerblue")
})
# Histogram of p values for genes with mean normalized count larger than 1
hist(res$pvalue[res$baseMean > 1], breaks = 0:20/20,
col = "grey50", border = "white")
Select genes to plot (top genes with the highest variance) by heatmap
top_var_genes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20) #requires transformed reads
mat <- assay(vsd)[top_var_genes,] #making matrix
mat <- mat - rowMeans(mat)
df <- as.data.frame(colData(vsd)[,c("cell", "niche")])
pheatmap(mat, annotation_col = df) #can also force clustering as before and add annotations (annotations.row) for gene names
Adding gene symbols and short description to results to annotate figures
res$symbol <- mapIds(org.Mm.eg.db,
keys = row.names(res),
column = "SYMBOL",
keytype ="ENSEMBL",
multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
res$genename <- mapIds(org.Mm.eg.db,
keys = row.names(res),
keytype = "ENSEMBL",
column = "GENENAME",
multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
cell_res$symbol <- mapIds(org.Mm.eg.db,
keys = row.names(cell_res),
column = "SYMBOL",
keytype ="ENSEMBL",
multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
cell_res$genename <- mapIds(org.Mm.eg.db,
keys = row.names(cell_res),
keytype = "ENSEMBL",
column = "GENENAME",
multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
res_ordered <- res[order(res$padj), ]
res_ordered_DF <- as.data.frame(res_ordered) #to convert from bioconductor DataFrame to regular data frame to write as csv
write.csv(res_ordered_DF, file = "/Volumes/T7/Project_2311/miR128a_primary/results/de_genes_niche.csv") #de genes is what is usually what journals want to be submitted
res_ordered <- cell_res[order(cell_res$padj), ]
res_ordered_DF <- as.data.frame(res_ordered) #to convert from bioconductor DataFrame to regular data frame to write as csv
write.csv(res_ordered_DF, file = "/Volumes/T7/Project_2311/miR128a_primary/results/de_genes_cells.csv") #de genes is what is usually what journals want to be submitted
gene_of_interest <- "ENSMUSG00000037071" #SCD1 -stearoyl-Coenzyme A desaturase 1
plotCounts(dds, gene = gene_of_interest, intgroup=c("niche"))
geneCounts <- plotCounts(dds, gene = gene_of_interest, intgroup = c("niche", "cell", "replicate"), returnData = TRUE)
ggplot(geneCounts, aes(x = niche, y = count, color = replicate, group = replicate, shape = cell)) +
scale_y_log10() + geom_point(size = 3) + geom_line() + ggtitle(gene_of_interest)