#source("http://bioconductor.org/workflows.R")
#workflowInstall("RNAseq123")
# Additionally, the tutorial uses some R packages available on CRAN,
# which you can install by running the following:
#install.packages(c("gplots", "RColorBrewer", "R.utils"))
url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE63310&format=file"
utils::download.file(url, destfile = "GSE63310_RAW.tar", mode = "wb")
utils::untar("GSE63310_RAW.tar", exdir = ".")
files_gz <- Sys.glob("GSM*txt.gz")
for(f in files_gz)
R.utils::gunzip(f, overwrite = TRUE)
How would you describe the data? Where does the data come from? The example data used in this tutorial is RNA-seq of 3 cell populations in the mouse mammary gland collected by Sheridan et al., 2015. The code below downloads the count data from the Gene Expression Omnibus (GEO), an NCBI repository of genomics data.
What are R packages? Where do we usually get them from? What are R packages that you have used before?
# Load the packages we will use
library("limma") # LM for differential expression
library("Glimma") # Interactive plots (good for data exploration)
library("edgeR") # Process count data from RNA seq exp
library("Mus.musculus") # Gene annotations for the Mus musculus genome
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
##
## plotMA
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colMeans,
## colnames, colSums, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply,
## lengths, Map, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
## rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: OrganismDbi
## Loading required package: GenomicFeatures
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GO.db
##
## Loading required package: org.Mm.eg.db
##
## Loading required package: TxDb.Mmusculus.UCSC.mm10.knownGene
# Import the data
files <- c("GSM1545535_10_6_5_11.txt", "GSM1545536_9_6_5_11.txt",
"GSM1545538_purep53.txt", "GSM1545539_JMS8-2.txt",
"GSM1545540_JMS8-3.txt", "GSM1545541_JMS8-4.txt",
"GSM1545542_JMS8-5.txt", "GSM1545544_JMS9-P7c.txt",
"GSM1545545_JMS9-P8c.txt")
read.delim(files[1], nrow = 5)
## EntrezID GeneLength Count
## 1 497097 3634 1
## 2 100503874 3259 0
## 3 100038431 1634 0
## 4 19888 9747 0
## 5 20671 3130 1
What are properties of the dataset that we want to know before we begin processing it?
x <- readDGE(files, columns = c(1, 3))
class(x)
## [1] "DGEList"
## attr(,"package")
## [1] "edgeR"
dim(x)
## [1] 27179 9
names(x)
## [1] "samples" "counts"
str(x)
## Formal class 'DGEList' [package "edgeR"] with 1 slot
## ..@ .Data:List of 2
## .. ..$ :'data.frame': 9 obs. of 4 variables:
## .. .. ..$ files : chr [1:9] "GSM1545535_10_6_5_11.txt" "GSM1545536_9_6_5_11.txt" "GSM1545538_purep53.txt" "GSM1545539_JMS8-2.txt" ...
## .. .. ..$ group : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1
## .. .. ..$ lib.size : num [1:9] 32863052 35335491 57160817 51368625 75795034 ...
## .. .. ..$ norm.factors: num [1:9] 1 1 1 1 1 1 1 1 1
## .. ..$ : num [1:27179, 1:9] 1 0 0 0 1 431 768 4 810 452 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ Tags : chr [1:27179] "497097" "100503874" "100038431" "19888" ...
## .. .. .. ..$ Samples: chr [1:9] "GSM1545535_10_6_5_11" "GSM1545536_9_6_5_11" "GSM1545538_purep53" "GSM1545539_JMS8-2" ...
The samples consist of different cell types and were sequenced on different lanes.
# Annotate the samples
x$samples
## files group lib.size norm.factors
## GSM1545535_10_6_5_11 GSM1545535_10_6_5_11.txt 1 32863052 1
## GSM1545536_9_6_5_11 GSM1545536_9_6_5_11.txt 1 35335491 1
## GSM1545538_purep53 GSM1545538_purep53.txt 1 57160817 1
## GSM1545539_JMS8-2 GSM1545539_JMS8-2.txt 1 51368625 1
## GSM1545540_JMS8-3 GSM1545540_JMS8-3.txt 1 75795034 1
## GSM1545541_JMS8-4 GSM1545541_JMS8-4.txt 1 60517657 1
## GSM1545542_JMS8-5 GSM1545542_JMS8-5.txt 1 55086324 1
## GSM1545544_JMS9-P7c GSM1545544_JMS9-P7c.txt 1 21311068 1
## GSM1545545_JMS9-P8c GSM1545545_JMS9-P8c.txt 1 19958838 1
samplenames <- substring(colnames(x), 12, nchar(colnames(x)))
samplenames
## [1] "10_6_5_11" "9_6_5_11" "purep53" "JMS8-2" "JMS8-3" "JMS8-4"
## [7] "JMS8-5" "JMS9-P7c" "JMS9-P8c"
colnames(x) <- samplenames
group <- as.factor(c("LP", "ML", "Basal", "Basal", "ML", "LP",
"Basal", "ML", "LP"))
x$samples$group <- group
lane <- as.factor(rep(c("L004", "L006", "L008"), c(3, 4, 2)))
x$samples$lane <- lane
x$samples
## files group lib.size norm.factors lane
## 10_6_5_11 GSM1545535_10_6_5_11.txt LP 32863052 1 L004
## 9_6_5_11 GSM1545536_9_6_5_11.txt ML 35335491 1 L004
## purep53 GSM1545538_purep53.txt Basal 57160817 1 L004
## JMS8-2 GSM1545539_JMS8-2.txt Basal 51368625 1 L006
## JMS8-3 GSM1545540_JMS8-3.txt ML 75795034 1 L006
## JMS8-4 GSM1545541_JMS8-4.txt LP 60517657 1 L006
## JMS8-5 GSM1545542_JMS8-5.txt Basal 55086324 1 L006
## JMS9-P7c GSM1545544_JMS9-P7c.txt ML 21311068 1 L008
## JMS9-P8c GSM1545545_JMS9-P8c.txt LP 19958838 1 L008
# Annotate the genes
head(x$counts)
## Samples
## Tags 10_6_5_11 9_6_5_11 purep53 JMS8-2 JMS8-3 JMS8-4 JMS8-5
## 497097 1 2 342 526 3 3 535
## 100503874 0 0 5 6 0 0 5
## 100038431 0 0 0 0 0 0 1
## 19888 0 1 0 0 17 2 0
## 20671 1 1 76 40 33 14 98
## 27395 431 771 1368 1268 1564 769 818
## Samples
## Tags JMS9-P7c JMS9-P8c
## 497097 2 0
## 100503874 0 0
## 100038431 0 0
## 19888 1 0
## 20671 18 8
## 27395 468 342
dim(x$counts)
## [1] 27179 9
geneid <- rownames(x)
genes <- select(Mus.musculus, keys = geneid,
columns = c("SYMBOL", "TXCHROM"),
keytype = "ENTREZID")
## 'select()' returned 1:many mapping between keys and columns
head(genes)
## ENTREZID SYMBOL TXCHROM
## 1 497097 Xkr4 chr1
## 2 100503874 Gm19938 <NA>
## 3 100038431 Gm10568 <NA>
## 4 19888 Rp1 chr1
## 5 20671 Sox17 chr1
## 6 27395 Mrpl15 chr1
genes <- genes[!duplicated(genes$ENTREZID), ]
x$genes <- genes
x
## An object of class "DGEList"
## $samples
## files group lib.size norm.factors lane
## 10_6_5_11 GSM1545535_10_6_5_11.txt LP 32863052 1 L004
## 9_6_5_11 GSM1545536_9_6_5_11.txt ML 35335491 1 L004
## purep53 GSM1545538_purep53.txt Basal 57160817 1 L004
## JMS8-2 GSM1545539_JMS8-2.txt Basal 51368625 1 L006
## JMS8-3 GSM1545540_JMS8-3.txt ML 75795034 1 L006
## JMS8-4 GSM1545541_JMS8-4.txt LP 60517657 1 L006
## JMS8-5 GSM1545542_JMS8-5.txt Basal 55086324 1 L006
## JMS9-P7c GSM1545544_JMS9-P7c.txt ML 21311068 1 L008
## JMS9-P8c GSM1545545_JMS9-P8c.txt LP 19958838 1 L008
##
## $counts
## Samples
## Tags 10_6_5_11 9_6_5_11 purep53 JMS8-2 JMS8-3 JMS8-4 JMS8-5
## 497097 1 2 342 526 3 3 535
## 100503874 0 0 5 6 0 0 5
## 100038431 0 0 0 0 0 0 1
## 19888 0 1 0 0 17 2 0
## 20671 1 1 76 40 33 14 98
## Samples
## Tags JMS9-P7c JMS9-P8c
## 497097 2 0
## 100503874 0 0
## 100038431 0 0
## 19888 1 0
## 20671 18 8
## 27174 more rows ...
##
## $genes
## ENTREZID SYMBOL TXCHROM
## 1 497097 Xkr4 chr1
## 2 100503874 Gm19938 <NA>
## 3 100038431 Gm10568 <NA>
## 4 19888 Rp1 chr1
## 5 20671 Sox17 chr1
## 27174 more rows ...
Why can’t we compare the raw counts directly? (e.g. Gene 1 has an average count 2 in cases but an average count of 15 in controls)
# Calculate (log) counts-per-million (cpm)
cpm <- cpm(x)
lcpm <- cpm(x, log = TRUE)
# Detect genes with 0 counts aacross 9 samples
table(rowSums(x$counts == 0) == 9)
##
## FALSE TRUE
## 22026 5153
# Visualize the distribution of gene expression levels
plotDensities(lcpm, legend = FALSE, main = "Before filtering")
abline(v = 0, lty = 3)
At this point, we want to filter genes with a raw count of 0 across the samples as well as lowly expressed genes. The rule that we are going to use in this analysis pipleline is to only keep genes which have cpm > 1 in at least 3 samples.
Why do we want to filter genes with a raw count of 0 across the samples? Why do we also want to filter very lowly expressed genes? Hint: what is the role of technical variation/noise versus biological signal when a gene is consistently lowly expressed?
# Only keep genes which have cpm greater than 1 in at least 3 samples
keep.exprs <- rowSums(cpm > 1) >= 3
x <- x[keep.exprs, , keep.lib.sizes = FALSE]
dim(x)
## [1] 14165 9
lcpm <- cpm(x, log=TRUE)
# Visualize the distribution of gene expression levels after filtering
plotDensities(lcpm, legend = FALSE, main = "After filtering")
abline(v = 0, lty = 3)
What do we notice about the distribution of the gene expression log cpm levels after filtering?
The default normalization provided by edgeR is TMM (trimmed mean of M-values).
# Let's calculate the normalization factors for our data
x <- calcNormFactors(x, method = "TMM")
x$samples$norm.factors
## [1] 0.8957309 1.0349196 1.0439552 1.0405040 1.0323599 0.9223424 0.9836603
## [8] 1.0827381 0.9792607
# Let's see how it works in an extreme (toy) example
x2 <- x
x2$samples$norm.factors <- 1
x2$counts[,1] <- ceiling(x2$counts[, 1] * 0.05)
x2$counts[,2] <- x2$counts[, 2] * 5
lcpm2 <- cpm(x2, log = TRUE)
boxplot(lcpm2, las = 2, main = "Before normalization")
x2 <- calcNormFactors(x2)
x2$samples$norm.factors
## [1] 0.05472223 6.13059440 1.22927355 1.17051887 1.21487709 1.05622968
## [7] 1.14587663 1.26129350 1.11702264
lcpm2 <- cpm(x2, log = TRUE)
boxplot(lcpm2, las=2, main = "After normalization")
# Visualize sample relationships with multidimensional scaling (MDS)
library("RColorBrewer")
group
## [1] LP ML Basal Basal ML LP Basal ML LP
## Levels: Basal LP ML
col.group <- group
levels(col.group) <- brewer.pal(nlevels(col.group), "Set1")
col.group <- as.character(col.group)
lane
## [1] L004 L004 L004 L006 L006 L006 L006 L008 L008
## Levels: L004 L006 L008
col.lane <- lane
levels(col.lane) <- brewer.pal(nlevels(col.lane), "Set2")
col.lane <- as.character(col.lane)
plotMDS(lcpm, labels = group, col = col.group,
main = "group")
plotMDS(lcpm, labels = lane, col = col.lane, dim = c(3, 4),
main = "lane")
# Make an interaction MDS plot
glMDSPlot(lcpm, labels = paste(group, lane, sep = "_"),
groups = x$samples[, c(2, 5)], launch = TRUE)
In this data, what is our biological variable of interest? What technical variable can we also include in our model? If we weren’t just given two variables, how would we decide which biological and technical variables to include?
The model that we are going to write out will not have an intercept. It is called the group-means parameterization in the limma User’s Guide. In this model, each coefficient (beta) will have the mean expression for that group. This paramaterization is more convenient to test specific hypotheses. For a given experiment, there are usually several equivalent ways to set up an appropriate design matrix. For example, ~0+group+lane removes the intercept from the first factor, group, but an intercept remains in the second factor lane.
Warning: There are limitations to when you can use this.
# First, we will make a design matrix. This holds information about our samples
design <- model.matrix(~0 + group + lane)
colnames(design) <- gsub("group", "", colnames(design))
design
## Basal LP ML laneL006 laneL008
## 1 0 1 0 0 0
## 2 0 0 1 0 0
## 3 1 0 0 0 0
## 4 1 0 0 1 0
## 5 0 0 1 1 0
## 6 0 1 0 1 0
## 7 1 0 0 1 0
## 8 0 0 1 0 1
## 9 0 1 0 0 1
## attr(,"assign")
## [1] 1 1 1 2 2
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
##
## attr(,"contrasts")$lane
## [1] "contr.treatment"
Now, we are interested in asking the following. For each hypothesis, how would we compare these using our model?
To do this in limma, we are going to create a contrast matrix.
contr.matrix <- makeContrasts(BasalvsLP = Basal - LP,
BasalvsML = Basal - ML,
LPvsML = LP - ML,
levels = colnames(design))
contr.matrix
## Contrasts
## Levels BasalvsLP BasalvsML LPvsML
## Basal 1 1 0
## LP -1 0 1
## ML 0 -1 -1
## laneL006 0 0 0
## laneL008 0 0 0
We are not done transforming the gene expression levels. Even after TMM normalization, there is still a relationship between mean and variance of the log cpm gene expression levels. Therefore, we will use a function called “voom” to calculate weights that will offset this mean-variance relationship in the linear model. Again, these weights will go into the linear model. Note that voom can also do additional normalizations.
# The function `voom` calculates weights to offset this mean-variance relationship.
v <- voom(x, design, plot = TRUE)
v
## An object of class "EList"
## $genes
## ENTREZID SYMBOL TXCHROM
## 1 497097 Xkr4 chr1
## 6 27395 Mrpl15 chr1
## 7 18777 Lypla1 chr1
## 9 21399 Tcea1 chr1
## 10 58175 Rgs20 chr1
## 14160 more rows ...
##
## $targets
## files group lib.size norm.factors lane
## 10_6_5_11 GSM1545535_10_6_5_11.txt LP 29409426 0.8957309 L004
## 9_6_5_11 GSM1545536_9_6_5_11.txt ML 36528591 1.0349196 L004
## purep53 GSM1545538_purep53.txt Basal 59598629 1.0439552 L004
## JMS8-2 GSM1545539_JMS8-2.txt Basal 53382070 1.0405040 L006
## JMS8-3 GSM1545540_JMS8-3.txt ML 78175314 1.0323599 L006
## JMS8-4 GSM1545541_JMS8-4.txt LP 55762781 0.9223424 L006
## JMS8-5 GSM1545542_JMS8-5.txt Basal 54115150 0.9836603 L006
## JMS9-P7c GSM1545544_JMS9-P7c.txt ML 23043111 1.0827381 L008
## JMS9-P8c GSM1545545_JMS9-P8c.txt LP 19525423 0.9792607 L008
##
## $E
## Samples
## Tags 10_6_5_11 9_6_5_11 purep53 JMS8-2 JMS8-3 JMS8-4
## 497097 -4.293244 -3.869026 2.522753 3.302006 -4.481286 -3.993876
## 27395 3.875010 4.400568 4.521172 4.570624 4.322845 3.786547
## 18777 4.707695 5.559334 5.400569 5.171235 5.627798 5.081794
## 21399 4.784462 4.741999 5.374548 5.130925 4.848030 4.944024
## 58175 3.943567 3.294875 -1.767924 -1.880302 2.993289 3.357379
## Samples
## Tags JMS8-5 JMS9-P7c JMS9-P8c
## 497097 3.306782 -3.204336 -5.287282
## 27395 3.918878 4.345642 4.132678
## 18777 5.080061 5.757404 5.150470
## 21399 5.158292 5.036933 4.987679
## 58175 -2.114104 3.142621 3.523290
## 14160 more rows ...
##
## $weights
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1.183974 1.183974 20.526779 20.97747 1.773562 1.217142 21.125740
## [2,] 20.879554 26.561871 31.596323 29.66102 32.558344 26.745293 29.792090
## [3,] 28.003202 33.695540 34.845507 34.45673 35.148529 33.550527 34.517259
## [4,] 27.670233 29.595778 34.901302 34.43298 34.841349 33.159425 34.493456
## [5,] 19.737381 18.658333 3.184207 2.62986 24.191635 24.014937 2.648747
## [,8] [,9]
## [1,] 1.183974 1.183974
## [2,] 21.900102 17.150677
## [3,] 31.440457 25.228325
## [4,] 26.136796 24.502247
## [5,] 13.149278 14.351930
## 14160 more rows ...
##
## $design
## Basal LP ML laneL006 laneL008
## 1 0 1 0 0 0
## 2 0 0 1 0 0
## 3 1 0 0 0 0
## 4 1 0 0 1 0
## 5 0 0 1 1 0
## 6 0 1 0 1 0
## 7 1 0 0 1 0
## 8 0 0 1 0 1
## 9 0 1 0 0 1
## attr(,"assign")
## [1] 1 1 1 2 2
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
##
## attr(,"contrasts")$lane
## [1] "contr.treatment"
Use the gene expression levels + voom weights (in the variable “v”) and the design matrix to specify the coefficients in the linear model (in the object “design”).
# Fit a linear model for each gene
vfit <- lmFit(v, design)
View(vfit$coefficients)
View(vfit$stdev.unscaled)
# Caculate the statistics for our specific contrasts of interest
vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
View(vfit$coefficients)
View(vfit$stdev.unscaled)
How were the views of vfit different between the two stages?
Are all of the genes independent? We know that genes are correlated so we can share information across the genes– specifically shrinking the gene- specific variances towards the average variance across all genes.
Normally, how do we calculate the t score for gene expression? X_group1 - X_group2 / SE of both groups Limma uses a moderated t statistic which takes into account the relationship between genes.
# Calculate the t statistics for each gene in each comparison (hypothesis)
efit <- eBayes(vfit)
View(efit$t)
# Explore the relationship between residual variation versus the mean gene expression level.
plotSA(efit, main = "Final model: Mean−variance trend")
Based on this plot, does voom remove the relationship between the mean and variance?
What thresholds can we use to define “differentially expressed” genes? (Hint: magnitude and p-value)
# Tabulate the results
summary(decideTests(efit))
## BasalvsLP BasalvsML LPvsML
## Down 4127 4338 2895
## NotSig 5740 5655 8825
## Up 4298 4172 2445
# If the magnitude of the effect size is important for your downstream analysis (e.g. perhaps you are trying to prioritize genes for further experiments), you can specify a minimal log-fold-change with the function "treat" instead of using "eBayes"
tfit <- treat(vfit, lfc = 1)
dt <- decideTests(tfit)
summary(dt)
## BasalvsLP BasalvsML LPvsML
## Down 1417 1512 203
## NotSig 11030 10895 13780
## Up 1718 1758 182
# Create a venn diagram of the results
head(dt)
## Contrasts
## BasalvsLP BasalvsML LPvsML
## 497097 1 1 0
## 27395 0 0 0
## 18777 0 0 0
## 21399 0 0 0
## 58175 -1 -1 0
## 108664 0 0 0
de.common <- which(dt[, 1] != 0 & dt[, 2] != 0)
length(de.common)
## [1] 2409
head(tfit$genes$SYMBOL[de.common], n = 20)
## [1] "Xkr4" "Rgs20" "Cpa6" "Sulf1"
## [5] "Eya1" "Msc" "Sbspon" "Pi15"
## [9] "Crispld1" "Kcnq5" "Ptpn18" "Arhgef4"
## [13] "2010300C02Rik" "Aff3" "Npas2" "Tbc1d8"
## [17] "Creg2" "Il1r1" "Il18r1" "Il18rap"
vennDiagram(dt[, 1:2], circle.col = c("turquoise", "salmon"))
#write.fit(tfit, dt, file = "results.txt")
# Identify the top DE genes
basal.vs.lp <- topTreat(tfit, coef = 1, n = Inf)
basal.vs.ml <- topTreat(tfit, coef = 2, n = Inf)
head(basal.vs.lp)
## ENTREZID SYMBOL TXCHROM logFC AveExpr t P.Value
## 12759 12759 Clu chr14 -5.442877 8.857907 -33.44429 3.990899e-10
## 53624 53624 Cldn7 chr11 -5.514605 6.296762 -32.94533 4.503694e-10
## 242505 242505 Rasef chr4 -5.921741 5.119585 -31.77625 6.063249e-10
## 67451 67451 Pkp2 chr16 -5.724823 4.420495 -30.65370 8.010456e-10
## 228543 228543 Rhov chr2 -6.253427 5.486640 -29.46244 1.112729e-09
## 70350 70350 Basp1 chr15 -6.073297 5.248349 -28.64890 1.380545e-09
## adj.P.Val
## 12759 2.703871e-06
## 53624 2.703871e-06
## 242505 2.703871e-06
## 67451 2.703871e-06
## 228543 2.703871e-06
## 70350 2.703871e-06
head(basal.vs.ml)
## ENTREZID SYMBOL TXCHROM logFC AveExpr t P.Value
## 242505 242505 Rasef chr4 -6.510470 5.119585 -35.49093 2.573575e-10
## 53624 53624 Cldn7 chr11 -5.469160 6.296762 -32.52520 4.978446e-10
## 12521 12521 Cd82 chr2 -4.667737 7.070963 -31.82187 5.796191e-10
## 71740 71740 Nectin4 chr1 -5.556046 5.166292 -31.29987 6.760578e-10
## 20661 20661 Sort1 chr3 -4.908119 6.705784 -31.23083 6.761331e-10
## 15375 15375 Foxa1 chr12 -5.753884 5.625064 -28.34612 1.487280e-09
## adj.P.Val
## 242505 1.915485e-06
## 53624 1.915485e-06
## 12521 1.915485e-06
## 71740 1.915485e-06
## 20661 1.915485e-06
## 15375 2.281914e-06
# Visualize the DE genes
plotMD(tfit, column = 1, status = dt[, 1], main = colnames(tfit)[1],
xlim = c(-8, 13))
#glMDPlot(tfit, coef = 1, status = dt, main = colnames(tfit)[1],
# id.column = "ENTREZID", counts = x$counts, groups = group,
# launch = TRUE)
glMDPlot(tfit, coef = 1, status = dt, main = colnames(tfit)[1],
side.main = "ENTREZID", counts = x$counts, groups = group,
launch = TRUE)
# View heatmap of top 100 DE genes between Basal and LP cells
library("gplots")
##
## Attaching package: 'gplots'
## The following object is masked from 'package:IRanges':
##
## space
## The following object is masked from 'package:S4Vectors':
##
## space
## The following object is masked from 'package:stats':
##
## lowess
basal.vs.lp.topgenes <- basal.vs.lp$ENTREZID[1:100]
i <- which(v$genes$ENTREZID %in% basal.vs.lp.topgenes)
mycol <- colorpanel(1000, "blue", "white", "red")
heatmap.2(v$E[i, ], scale = "row",
labRow = v$genes$SYMBOL[i], labCol = group,
col = mycol, trace = "none", density.info = "none")
Why do we scale by row (per gene) in the heatmap?
Why might we want to do gene set enrichment of the DE genes? Why might we be interested in the gene sets that are specifically enriched in 1 contrast versus the others (e.g. gene sets in Basal versus LP but not Basal versus ML and not LP versus LM)? What are some of the limitations of the use of enrichment of specific gene sets (e.g. use of Gene Ontology or KEGG pathways)?
# A common first-step post-DE testing is to test for enrichment of specific gene sets e.g. Gene Ontology (GO) categories or KEGG pathways. Here, we will use the gene set collection MSigDB c2 from the Broad Institue.
load(system.file("extdata", "mouse_c2_v5p1.rda", package = "RNAseq123"))
class(Mm.c2)
## [1] "list"
Mm.c2$KEGG_GLYCOLYSIS_GLUCONEOGENESIS
## [1] "100042025" "100042427" "103988" "106557" "109785"
## [6] "110695" "11522" "11529" "11532" "11668"
## [11] "11669" "11670" "11671" "11674" "11676"
## [16] "12039" "12040" "12183" "13382" "13806"
## [21] "13807" "13808" "14120" "14121" "14377"
## [26] "14378" "14433" "14447" "14751" "14782"
## [31] "15275" "15277" "15452" "16497" "16498"
## [36] "16499" "16828" "16832" "16833" "18534"
## [41] "18597" "18598" "18641" "18642" "18648"
## [46] "18655" "18663" "18746" "18770" "19378"
## [51] "20322" "212032" "215974" "216019" "21881"
## [56] "21991" "226041" "230163" "232223" "232491"
## [61] "235339" "26358" "26462" "26876" "26926"
## [66] "277333" "319625" "353204" "380660" "433182"
## [71] "50493" "56012" "56421" "56752" "56847"
## [76] "58810" "60525" "621603" "622339" "638833"
## [81] "640374" "666258" "666488" "66681" "667451"
## [86] "668435" "669429" "67689" "68263" "68738"
## [91] "69117" "70974" "72157" "72168" "72535"
## [96] "73458" "74419" "74551" "78894" "79459"
## [101] "83553"
idx <- ids2indices(Mm.c2, id = rownames(v))
# The limma function "camera" tests for enrichment for a specific contrast of interest by comparing all the gene sets against one another.
cam.BasalvsLP <- camera(v, idx, design, contrast = contr.matrix[, 1])
head(cam.BasalvsLP, 5)
## NGenes Direction PValue
## LIM_MAMMARY_STEM_CELL_UP 739 Up 1.134757e-18
## LIM_MAMMARY_STEM_CELL_DN 630 Down 1.569957e-15
## ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER 163 Up 1.437987e-13
## SOTIRIOU_BREAST_CANCER_GRADE_1_VS_3_UP 183 Up 2.181862e-13
## LIM_MAMMARY_LUMINAL_PROGENITOR_UP 87 Down 6.734613e-13
## FDR
## LIM_MAMMARY_STEM_CELL_UP 5.360590e-15
## LIM_MAMMARY_STEM_CELL_DN 3.708238e-12
## ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER 2.264351e-10
## SOTIRIOU_BREAST_CANCER_GRADE_1_VS_3_UP 2.576779e-10
## LIM_MAMMARY_LUMINAL_PROGENITOR_UP 6.362863e-10
cam.BasalvsML <- camera(v, idx, design, contrast = contr.matrix[, 2])
head(cam.BasalvsML, 5)
## NGenes Direction PValue
## LIM_MAMMARY_STEM_CELL_UP 739 Up 5.090937e-23
## LIM_MAMMARY_STEM_CELL_DN 630 Down 5.132446e-19
## LIM_MAMMARY_LUMINAL_MATURE_DN 166 Up 8.875174e-16
## LIM_MAMMARY_LUMINAL_MATURE_UP 180 Down 6.287301e-13
## ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER 163 Up 1.684323e-12
## FDR
## LIM_MAMMARY_STEM_CELL_UP 2.404959e-19
## LIM_MAMMARY_STEM_CELL_DN 1.212284e-15
## LIM_MAMMARY_LUMINAL_MATURE_DN 1.397544e-12
## LIM_MAMMARY_LUMINAL_MATURE_UP 7.425303e-10
## ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER 1.591348e-09
cam.LPvsML <- camera(v, idx, design, contrast = contr.matrix[, 3])
head(cam.LPvsML, 5)
## NGenes Direction PValue
## LIM_MAMMARY_LUMINAL_MATURE_UP 180 Down 8.497295e-14
## LIM_MAMMARY_LUMINAL_MATURE_DN 166 Up 1.439890e-13
## LIM_MAMMARY_LUMINAL_PROGENITOR_UP 87 Up 3.840915e-11
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT 91 Down 2.655349e-08
## NABA_CORE_MATRISOME 222 Up 4.430361e-08
## FDR
## LIM_MAMMARY_LUMINAL_MATURE_UP 3.401020e-10
## LIM_MAMMARY_LUMINAL_MATURE_DN 3.401020e-10
## LIM_MAMMARY_LUMINAL_PROGENITOR_UP 6.048160e-08
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT 3.135967e-05
## NABA_CORE_MATRISOME 4.185805e-05
# Visualize the gebe set enrichment with a barcode plot.
barcodeplot(efit$t[, 3], index = idx$LIM_MAMMARY_LUMINAL_MATURE_UP,
index2 = idx$LIM_MAMMARY_LUMINAL_MATURE_DN,
main = "LPvsML")
Once you have DE genes, what might be further steps for (computational) analysis?
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] gplots_3.0.1
## [2] RColorBrewer_1.1-2
## [3] Mus.musculus_1.3.1
## [4] TxDb.Mmusculus.UCSC.mm10.knownGene_3.4.0
## [5] org.Mm.eg.db_3.5.0
## [6] GO.db_3.5.0
## [7] OrganismDbi_1.20.0
## [8] GenomicFeatures_1.30.3
## [9] GenomicRanges_1.30.3
## [10] GenomeInfoDb_1.14.0
## [11] AnnotationDbi_1.40.0
## [12] IRanges_2.12.0
## [13] S4Vectors_0.16.0
## [14] Biobase_2.38.0
## [15] BiocGenerics_0.24.0
## [16] edgeR_3.20.7
## [17] Glimma_1.6.0
## [18] limma_3.34.9
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.15 locfit_1.5-9.1
## [3] lattice_0.20-35 gtools_3.5.0
## [5] prettyunits_1.0.2 Rsamtools_1.30.0
## [7] Biostrings_2.46.0 assertthat_0.2.0
## [9] rprojroot_1.3-2 digest_0.6.15
## [11] R6_2.2.2 backports_1.1.2
## [13] RSQLite_2.0 evaluate_0.10.1
## [15] httr_1.3.1 BiocInstaller_1.28.0
## [17] pillar_1.1.0 zlibbioc_1.24.0
## [19] rlang_0.1.6 progress_1.1.2
## [21] gdata_2.18.0 blob_1.1.0
## [23] R.utils_2.6.0 R.oo_1.21.0
## [25] Matrix_1.2-13 rmarkdown_1.9
## [27] RMySQL_0.10.14 BiocParallel_1.12.0
## [29] stringr_1.3.0 RCurl_1.95-4.10
## [31] bit_1.1-12 biomaRt_2.34.2
## [33] DelayedArray_0.4.1 compiler_3.4.3
## [35] rtracklayer_1.38.3 pkgconfig_2.0.1
## [37] htmltools_0.3.6 SummarizedExperiment_1.8.1
## [39] tibble_1.4.2 GenomeInfoDbData_1.0.0
## [41] matrixStats_0.53.1 XML_3.98-1.10
## [43] GenomicAlignments_1.14.2 bitops_1.0-6
## [45] R.methodsS3_1.7.1 grid_3.4.3
## [47] RBGL_1.54.0 jsonlite_1.5
## [49] DBI_0.7 magrittr_1.5
## [51] KernSmooth_2.23-15 graph_1.56.0
## [53] stringi_1.1.7 XVector_0.18.0
## [55] tools_3.4.3 bit64_0.9-7
## [57] yaml_2.1.18 caTools_1.17.1
## [59] memoise_1.1.0 knitr_1.20
Why is it important to report the versions of the software you used to generate the results?
Further reading: The Limma User’s Guide is great! Good explanations with lots of examples!
Next time: Use of lm in R