Libraries

library(Rsamtools) #to import BAM files
library(GenomicFeatures) #to import GTF file
library(GenomicAlignments) #to summarise overlaps, counting reads
library(DESeq2) #for differential expression analysis
library(pheatmap) #plot heatmap sample distances
library(RColorBrewer) #nice colour palettes
library(ggplot2) #customise plots 
library(genefilter) #to select genes with most variance
library(AnnotationDbi) #functions to annotate the data
library(org.Mm.eg.db) #mouse database 
library(dplyr)
library(reshape2)
library(PoiClaClu) #for Poisson distances
library(glmpca) #for generalized PCAs
library(ggbeeswarm) #for gene expression plots
library(apeglm) #for MA plots
library(IHW) #independent hypothesis weighting

Import the data

Import metadata to R

sample_table <- read.csv("/Volumes/T7/Project_2311/miR128a_primary/docs/Sample_list_128a.csv", row.names = 1, stringsAsFactors = FALSE)

# make factors
sample_table$cell <- as.factor(sample_table$cell)
sample_table$niche <- as.factor(sample_table$niche)
sample_table$miR <- as.factor(sample_table$miR)
sample_table$transplant <- as.factor(sample_table$transplant)
sample_table$replicate <- as.factor(sample_table$replicate)

Import BAM files

filenames <- paste0(sample_table$index, ".Aligned.out.bam") #  list of the filepaths for each BAM file
filepaths <- paste0("/Volumes/T7/Project_2311/miR128a_primary/data/cogent_bam/", filenames) # can check file paths work with command file.exists(filepaths)
bam_files <- BamFileList(filepaths, yieldSize = 2000000) # yield size is 2mill recommended default

Import GTF file

gtf_path <- "/Volumes/T7/Project_2311/miR128a_primary/docs/Mus_musculus.GRCm39.105.gtf"
txdb <- makeTxDbFromGFF(gtf_path, format = "gtf", circ_seqs = character(0)) # circ_seq = character(0) is how to tell it there is no circular sequence elements (and is a required element), making a tuxedo database
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
##   stop_codon. This information was ignored.
## OK
ebg <- exonsBy(txdb, by = "gene") # rearranges the tuxedo database, exons by genes

Read counting - create a SummarizedExperiment object

se <- summarizeOverlaps(features = ebg, # se = summarised experiment format
                        reads = bam_files,
                        mode = "Union", # alignment method, explanation in help, but keep "Union" as default
                        fragments = TRUE,
                        singleEnd = FALSE) 

colData(se) <- DataFrame(sample_table) # adding sample table data to columns, can the call up se data based on any sample table attributes by using $ eg sample name, treated/untreated, cell line etc (view by colData(se)$whatever_column_name)

data(se)
## Warning in data(se): data set 'se' not found
head(assay(se), 3)
##                    TACCGAGGAGTTCAGG CGTTAGAAGACCTGAA AGCCTCATTCTCTACT
## ENSMUSG00000000001               86              148               57
## ENSMUSG00000000003                0                0                0
## ENSMUSG00000000028                6                5               11
##                    GATTCTGCCTCTCGTC TCGTAGTGCCAAGTCT CTACGACATTGGACTC
## ENSMUSG00000000001               56              124              143
## ENSMUSG00000000003                2                2                1
## ENSMUSG00000000028               11               18               15
##                    TAAGTGGTGGCTTAAG CGGACAACAATCCGGA ATATGGATTAATACAG
## ENSMUSG00000000001               79               89              103
## ENSMUSG00000000003                0                3                0
## ENSMUSG00000000028               22               16               16
##                    GCGCAAGCCGGCGTGA AAGATACTATGTAAGT GGAGCGTCGCACGGAC
## ENSMUSG00000000001               77              147              110
## ENSMUSG00000000003                0                2                2
## ENSMUSG00000000028                5               16                5
##                    ATGGCATGGGTACCTT GCAATGCAAACGTTCC GTTCCAATGCAGAATT
## ENSMUSG00000000001               90               95              136
## ENSMUSG00000000003                5                0                0
## ENSMUSG00000000028               11               15               34
##                    ACCTTGGCATGAGGCC
## ENSMUSG00000000001              145
## ENSMUSG00000000003                1
## ENSMUSG00000000028                6

DESeq2 - estimate variance-mean dependence in count data and test for differential epxression

Design formula

se$niche <- relevel(se$niche, "BM") # relevel highlighting the baseline (BM) to compare against
dds <- DESeqDataSet(se, design = ~ niche + cell) # ~ niche + cell as experimental variables

Exploratory analysis

Pre-filter the dataset

nrow(dds) # number of genes pre-filtering 
## [1] 55414
keep <- rowSums(counts(dds) >= 1) >=3 # at least 3 samples with read count 1 or higher
dds <- dds[keep, ] 

nrow(dds) # number of genes post-filtering
## [1] 48622

Variance stabilizing transformation

## rlog transformation

rld <- rlog(dds, blind = FALSE) #keep blind = FALSE, rld = r log dds, r log transformed
#check with head (assay(rld), 3) transformed data so no longer include count()

## VST

vsd <- vst(dds, blind = FALSE)

# Comparing different transformations, with distances between samples (deviating from x=y) which contribute to dsitance calculations and PCA, low counts can be excessively variable (even on ordinary log scale), compressed by VST and rlog trasnformation

# Plotting first sample against the second

dds <- estimateSizeFactors(dds)

df <- bind_rows(
  as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
         mutate(transformation = "log2(x + 1)"),
  as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
  as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
colnames(df)[1:2] <- c("x", "y")  

lvls <- c("log2(x + 1)", "vst", "rlog")
df$transformation <- factor(df$transformation, levels=lvls)

ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
  coord_fixed() + facet_grid( . ~ transformation)  

Sample distances; assessing overall similarity between samples

sample_dist <- dist(t(assay(vsd)), diag = TRUE) #requires the matrix to be transposed for the function to work
sample_dist_matrix <- as.matrix(sample_dist) #make into matrix for heat maps to eliminate empty values
rownames(sample_dist_matrix) <- paste(vsd$niche, vsd$cell, sep = " - ") # adding meaningful data to row and columns
colnames(sample_dist_matrix) <- NULL

pheatmap(sample_dist_matrix, 
         color = colorRampPalette( rev(brewer.pal(9, "Blues")))(255),
         clustering_distance_rows = sample_dist,
         clustering_distance_cols = sample_dist
         )

Samples distribution by Poisson Distance

poisd <- PoissonDistance(t(counts(dds)))

samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$niche, dds$cell, sep=" - " )
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
         col = colorRampPalette( rev(brewer.pal(9, "Blues")))(255),
         clustering_distance_rows = poisd$dd,
         clustering_distance_cols = poisd$dd)

Principal component ananlysis / PCA plot

Data points projected onto a 2D plane spread out in the two directions that explain most of the differences.

# using rlog data
pca_data <- plotPCA(rld, intgroup = c( "niche", "cell"), returnData = TRUE) #input is rlog transformed data, intrgroup (interesting groups) 
percent_var <- round(attr(pca_data, "percentVar") *100) #to extact percentage variance for ggplot axis labels
# instead of returning a plot, returnData = TRUE returns pca data for improved appearance eg with ggplot

ggplot(data = pca_data, mapping = aes(x = PC1, y = PC2, color = niche, shape = cell)) +
  geom_point(size =3) +
  xlab(paste("PC1: ", percent_var[1], "% variance")) +
  ylab(paste("PC2: ", percent_var[2], "% variance")) +
  ggtitle("PCA with rlog data")

# using VSD data
pcaData <- plotPCA(vsd, intgroup = c( "niche", "cell"), returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(x = PC1, y = PC2, color = niche, shape = cell)) +
  geom_point(size =3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  coord_fixed() +
  ggtitle("PCA with VST data")

#GLM-PCA
gpca <- glmpca(counts(dds), L=2)
gpca.dat <- gpca$factors
gpca.dat$niche <- dds$niche
gpca.dat$cell <- dds$cell
ggplot(gpca.dat, aes(x = dim1, y = dim2, color = niche, shape = cell)) +
  geom_point(size =3) + 
  coord_fixed(2) + 
  ggtitle("glmpca - Generalized PCA")

Multidimentional scalling / MDS plot

mdsPois <- as.data.frame(colData(dds)) %>%
   cbind(cmdscale(samplePoisDistMatrix))

ggplot(mdsPois, aes(x = `1`, y = `2`, color = niche, shape = cell)) +
  geom_point(size = 3) + coord_fixed(ratio = 2) + ggtitle("MDS with PoissonDistances")

Differential expression analysis

DESeq steps: estimaties size factors (controlling for differences in the sequencing depth of the samples), estimates of dispersion values for each gene, and fits a generalized linear model

dds <- DESeq(dds) #to include all outliers and minimally expressed genes include (dds, minReplicatesForReplace = Inf)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds, contrast = c("niche", "CNS", "BM")) # independent hypothesis weighting
summary(res)
## 
## out of 48622 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 51, 0.1%
## LFC < 0 (down)     : 16, 0.033%
## outliers [1]       : 5, 0.01%
## low counts [2]     : 24509, 50%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_ihw <- results(dds, contrast = c("niche", "CNS", "BM"), filterFun=ihw) # independent hypothesis weighting
summary(res_ihw)
## 
## out of 48622 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 45, 0.093%
## LFC < 0 (down)     : 41, 0.084%
## outliers [1]       : 5, 0.01%
## [1] see 'cooksCutoff' argument of ?results
## see metadata(res)$ihwResult on hypothesis weighting

Extract other comparisons

cell_res <- results(dds, contrast = c("cell", "LK", "LSK_IL7R"))
summary(cell_res)
## 
## out of 48622 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 99, 0.2%
## LFC < 0 (down)     : 219, 0.45%
## outliers [1]       : 5, 0.01%
## low counts [2]     : 26395, 54%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
cell_res_ihw <- results(dds, contrast = c("cell", "LK", "LSK_IL7R"),filterFun=ihw)
summary(cell_res_ihw)
## 
## out of 48622 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 97, 0.2%
## LFC < 0 (down)     : 279, 0.57%
## outliers [1]       : 5, 0.01%
## [1] see 'cooksCutoff' argument of ?results
## see metadata(res)$ihwResult on hypothesis weighting

Extract significant results

res_sig <- subset(res, padj < 0.1)
head(res_sig[order(res_sig$log2FoldChange),]) 
## log2 fold change (MLE): niche CNS vs BM 
## Wald test p-value: niche CNS vs BM 
## DataFrame with 6 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat      pvalue
##                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSMUSG00000024127   33.3050      -0.750633  0.205343  -3.65552 2.56664e-04
## ENSMUSG00000092056   33.5568      -0.740958  0.203476  -3.64151 2.71048e-04
## ENSMUSG00000059762  167.0756      -0.581683  0.134663  -4.31954 1.56356e-05
## ENSMUSG00000037465  307.6232      -0.514638  0.114700  -4.48683 7.22916e-06
## ENSMUSG00000019836  114.3921      -0.446693  0.118795  -3.76021 1.69773e-04
## ENSMUSG00000028629   98.4919      -0.440313  0.117141  -3.75883 1.70713e-04
##                         padj
##                    <numeric>
## ENSMUSG00000024127 0.0995124
## ENSMUSG00000092056 0.0995124
## ENSMUSG00000059762 0.0628239
## ENSMUSG00000037465 0.0507682
## ENSMUSG00000019836 0.0941105
## ENSMUSG00000028629 0.0941105
cell_res_sig <- subset(cell_res, padj < 0.1)
head(cell_res_sig[order(cell_res_sig$log2FoldChange),]) 
## log2 fold change (MLE): cell LK vs LSK_IL7R 
## Wald test p-value: cell LK vs LSK_IL7R 
## DataFrame with 6 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat      pvalue
##                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSMUSG00000035456   4.57128       -2.72940  0.617841  -4.41764 9.97834e-06
## ENSMUSG00000097906   5.32515       -2.47842  0.618143  -4.00946 6.08577e-05
## ENSMUSG00000067613   5.17657       -2.46157  0.661200  -3.72288 1.96964e-04
## ENSMUSG00000099596   5.38045       -2.21758  0.545556  -4.06481 4.80715e-05
## ENSMUSG00000026227   4.49556       -2.16991  0.610713  -3.55308 3.80746e-04
## ENSMUSG00000035626   4.83965       -2.14065  0.573493  -3.73266 1.89467e-04
##                         padj
##                    <numeric>
## ENSMUSG00000035456 0.0276594
## ENSMUSG00000097906 0.0397759
## ENSMUSG00000067613 0.0589254
## ENSMUSG00000099596 0.0381516
## ENSMUSG00000026227 0.0650842
## ENSMUSG00000035626 0.0589254

Plotting results

counts

top_gene <- rownames(res)[which.min(res$pvalue)] #to select the minimum p-value

plotCounts(dds, gene = top_gene, intgroup=c("niche"))

geneCounts <- plotCounts(dds, gene = top_gene, intgroup = c("niche", "cell", "replicate"), returnData = TRUE)

ggplot(geneCounts, aes(x = niche, y = count, color = replicate, group = replicate, shape = cell)) +
  scale_y_log10() + geom_point(size = 3) + geom_line() + ggtitle(top_gene)

count_data <- plotCounts(dds, gene = top_gene, intgroup = c("niche","cell"), returnData = TRUE)
ggplot(data = count_data, mapping = aes(x = niche, y = count, color = cell)) +
  geom_jitter(width = 0.1) +
  scale_y_log10()

MA plot (aka mean difference plot)

resultsNames(dds)
## [1] "Intercept"           "niche_CNS_vs_BM"     "cell_LSK_IL7R_vs_LK"
#using apeglm method to shrink co-efficients
res_MA <- lfcShrink(dds, coef="niche_CNS_vs_BM", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(res_MA, ylim = c(-2, 4))

res.noshr <- results(dds, name="niche_CNS_vs_BM")
plotMA(res.noshr, ylim = c(-4, 4))

plotMA(res, ylim = c(-5,5))
top_gene <- rownames(res)[which.min(res$padj)]
with(res[top_gene, ], {
  points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
  text(baseMean, log2FoldChange, top_gene, pos=2, col="dodgerblue")
})

# Histogram of p values for genes with mean normalized count larger than 1
hist(res$pvalue[res$baseMean > 1], breaks = 0:20/20,
     col = "grey50", border = "white")

Gene clustering

Select genes to plot (top genes with the highest variance) by heatmap

top_var_genes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20) #requires transformed reads
mat <- assay(vsd)[top_var_genes,] #making matrix
mat <- mat - rowMeans(mat)
df <- as.data.frame(colData(vsd)[,c("cell", "niche")])
pheatmap(mat, annotation_col = df) #can also force clustering as before and add annotations (annotations.row) for gene names

Annotation

Adding gene symbols and short description to results to annotate figures

res$symbol <- mapIds(org.Mm.eg.db, 
                     keys = row.names(res),
                     column = "SYMBOL",
                     keytype ="ENSEMBL",
                     multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
res$genename <- mapIds(org.Mm.eg.db,
                       keys = row.names(res),
                       keytype = "ENSEMBL",
                       column = "GENENAME",
                       multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
cell_res$symbol <- mapIds(org.Mm.eg.db, 
                     keys = row.names(cell_res),
                     column = "SYMBOL",
                     keytype ="ENSEMBL",
                     multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
cell_res$genename <- mapIds(org.Mm.eg.db,
                       keys = row.names(cell_res),
                       keytype = "ENSEMBL",
                       column = "GENENAME",
                       multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns

Export the de results

res_ordered <- res[order(res$padj), ]
res_ordered_DF <- as.data.frame(res_ordered) #to convert from bioconductor DataFrame to regular data frame to write as csv
write.csv(res_ordered_DF, file = "/Volumes/T7/Project_2311/miR128a_primary/results/de_genes_niche.csv") #de genes is what is usually what journals want to be submitted

res_ordered <- cell_res[order(cell_res$padj), ]
res_ordered_DF <- as.data.frame(res_ordered) #to convert from bioconductor DataFrame to regular data frame to write as csv
write.csv(res_ordered_DF, file = "/Volumes/T7/Project_2311/miR128a_primary/results/de_genes_cells.csv") #de genes is what is usually what journals want to be submitted

Plotting candidate gene expression

gene_of_interest <- "ENSMUSG00000037071" #SCD1 -stearoyl-Coenzyme A desaturase 1

plotCounts(dds, gene = gene_of_interest, intgroup=c("niche"))

geneCounts <- plotCounts(dds, gene = gene_of_interest, intgroup = c("niche", "cell", "replicate"), returnData = TRUE)

ggplot(geneCounts, aes(x = niche, y = count, color = replicate, group = replicate, shape = cell)) +
  scale_y_log10() + geom_point(size = 3) + geom_line() + ggtitle(gene_of_interest)