Installation

#source("http://bioconductor.org/workflows.R")
#workflowInstall("RNAseq123")
# Additionally, the tutorial uses some R packages available on CRAN,
# which you can install by running the following:
#install.packages(c("gplots", "RColorBrewer", "R.utils"))

Download data

url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE63310&format=file"
utils::download.file(url, destfile = "GSE63310_RAW.tar", mode = "wb")
utils::untar("GSE63310_RAW.tar", exdir = ".")
files_gz <- Sys.glob("GSM*txt.gz")
for(f in files_gz)
R.utils::gunzip(f, overwrite = TRUE)

Setup

How would you describe the data? Where does the data come from? The example data used in this tutorial is RNA-seq of 3 cell populations in the mouse mammary gland collected by Sheridan et al., 2015. The code below downloads the count data from the Gene Expression Omnibus (GEO), an NCBI repository of genomics data.

What are R packages? Where do we usually get them from? What are R packages that you have used before?

# Load the packages we will use
library("limma") # LM for differential expression
library("Glimma") # Interactive plots (good for data exploration)
library("edgeR") # Process count data from RNA seq exp
library("Mus.musculus") # Gene annotations for the Mus musculus genome

## Loading required package: AnnotationDbi

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following object is masked from 'package:limma':
## 
##     plotMA

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: IRanges

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: OrganismDbi

## Loading required package: GenomicFeatures

## Loading required package: GenomeInfoDb

## Loading required package: GenomicRanges

## Loading required package: GO.db

##

## Loading required package: org.Mm.eg.db

##

## Loading required package: TxDb.Mmusculus.UCSC.mm10.knownGene

# Import the data

files <- c("GSM1545535_10_6_5_11.txt", "GSM1545536_9_6_5_11.txt",
           "GSM1545538_purep53.txt", "GSM1545539_JMS8-2.txt",
           "GSM1545540_JMS8-3.txt", "GSM1545541_JMS8-4.txt",
           "GSM1545542_JMS8-5.txt", "GSM1545544_JMS9-P7c.txt",
           "GSM1545545_JMS9-P8c.txt")

read.delim(files[1], nrow = 5)

##    EntrezID GeneLength Count
## 1    497097       3634     1
## 2 100503874       3259     0
## 3 100038431       1634     0
## 4     19888       9747     0
## 5     20671       3130     1

What are properties of the dataset that we want to know before we begin processing it?

x <- readDGE(files, columns = c(1, 3))
class(x)

## [1] "DGEList"
## attr(,"package")
## [1] "edgeR"

dim(x)

## [1] 27179     9

names(x)

## [1] "samples" "counts"

str(x)

## Formal class 'DGEList' [package "edgeR"] with 1 slot
##   ..@ .Data:List of 2
##   .. ..$ :'data.frame':  9 obs. of  4 variables:
##   .. .. ..$ files       : chr [1:9] "GSM1545535_10_6_5_11.txt" "GSM1545536_9_6_5_11.txt" "GSM1545538_purep53.txt" "GSM1545539_JMS8-2.txt" ...
##   .. .. ..$ group       : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1
##   .. .. ..$ lib.size    : num [1:9] 32863052 35335491 57160817 51368625 75795034 ...
##   .. .. ..$ norm.factors: num [1:9] 1 1 1 1 1 1 1 1 1
##   .. ..$ : num [1:27179, 1:9] 1 0 0 0 1 431 768 4 810 452 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ Tags   : chr [1:27179] "497097" "100503874" "100038431" "19888" ...
##   .. .. .. ..$ Samples: chr [1:9] "GSM1545535_10_6_5_11" "GSM1545536_9_6_5_11" "GSM1545538_purep53" "GSM1545539_JMS8-2" ...

The samples consist of different cell types and were sequenced on different lanes.

# Annotate the samples

x$samples

##                                         files group lib.size norm.factors
## GSM1545535_10_6_5_11 GSM1545535_10_6_5_11.txt     1 32863052            1
## GSM1545536_9_6_5_11   GSM1545536_9_6_5_11.txt     1 35335491            1
## GSM1545538_purep53     GSM1545538_purep53.txt     1 57160817            1
## GSM1545539_JMS8-2       GSM1545539_JMS8-2.txt     1 51368625            1
## GSM1545540_JMS8-3       GSM1545540_JMS8-3.txt     1 75795034            1
## GSM1545541_JMS8-4       GSM1545541_JMS8-4.txt     1 60517657            1
## GSM1545542_JMS8-5       GSM1545542_JMS8-5.txt     1 55086324            1
## GSM1545544_JMS9-P7c   GSM1545544_JMS9-P7c.txt     1 21311068            1
## GSM1545545_JMS9-P8c   GSM1545545_JMS9-P8c.txt     1 19958838            1

samplenames <- substring(colnames(x), 12, nchar(colnames(x)))
samplenames

## [1] "10_6_5_11" "9_6_5_11"  "purep53"   "JMS8-2"    "JMS8-3"    "JMS8-4"   
## [7] "JMS8-5"    "JMS9-P7c"  "JMS9-P8c"

colnames(x) <- samplenames
group <- as.factor(c("LP", "ML", "Basal", "Basal", "ML", "LP",
                     "Basal", "ML", "LP"))
x$samples$group <- group
lane <- as.factor(rep(c("L004", "L006", "L008"), c(3, 4, 2)))
x$samples$lane <- lane
x$samples

##                              files group lib.size norm.factors lane
## 10_6_5_11 GSM1545535_10_6_5_11.txt    LP 32863052            1 L004
## 9_6_5_11   GSM1545536_9_6_5_11.txt    ML 35335491            1 L004
## purep53     GSM1545538_purep53.txt Basal 57160817            1 L004
## JMS8-2       GSM1545539_JMS8-2.txt Basal 51368625            1 L006
## JMS8-3       GSM1545540_JMS8-3.txt    ML 75795034            1 L006
## JMS8-4       GSM1545541_JMS8-4.txt    LP 60517657            1 L006
## JMS8-5       GSM1545542_JMS8-5.txt Basal 55086324            1 L006
## JMS9-P7c   GSM1545544_JMS9-P7c.txt    ML 21311068            1 L008
## JMS9-P8c   GSM1545545_JMS9-P8c.txt    LP 19958838            1 L008

# Annotate the genes

head(x$counts)

##            Samples
## Tags        10_6_5_11 9_6_5_11 purep53 JMS8-2 JMS8-3 JMS8-4 JMS8-5
##   497097            1        2     342    526      3      3    535
##   100503874         0        0       5      6      0      0      5
##   100038431         0        0       0      0      0      0      1
##   19888             0        1       0      0     17      2      0
##   20671             1        1      76     40     33     14     98
##   27395           431      771    1368   1268   1564    769    818
##            Samples
## Tags        JMS9-P7c JMS9-P8c
##   497097           2        0
##   100503874        0        0
##   100038431        0        0
##   19888            1        0
##   20671           18        8
##   27395          468      342

dim(x$counts)

## [1] 27179     9

geneid <- rownames(x)
genes <- select(Mus.musculus, keys = geneid,
                columns = c("SYMBOL", "TXCHROM"),
                keytype = "ENTREZID")

## 'select()' returned 1:many mapping between keys and columns

head(genes)

##    ENTREZID  SYMBOL TXCHROM
## 1    497097    Xkr4    chr1
## 2 100503874 Gm19938    <NA>
## 3 100038431 Gm10568    <NA>
## 4     19888     Rp1    chr1
## 5     20671   Sox17    chr1
## 6     27395  Mrpl15    chr1

genes <- genes[!duplicated(genes$ENTREZID), ]

x$genes <- genes
x

## An object of class "DGEList"
## $samples
##                              files group lib.size norm.factors lane
## 10_6_5_11 GSM1545535_10_6_5_11.txt    LP 32863052            1 L004
## 9_6_5_11   GSM1545536_9_6_5_11.txt    ML 35335491            1 L004
## purep53     GSM1545538_purep53.txt Basal 57160817            1 L004
## JMS8-2       GSM1545539_JMS8-2.txt Basal 51368625            1 L006
## JMS8-3       GSM1545540_JMS8-3.txt    ML 75795034            1 L006
## JMS8-4       GSM1545541_JMS8-4.txt    LP 60517657            1 L006
## JMS8-5       GSM1545542_JMS8-5.txt Basal 55086324            1 L006
## JMS9-P7c   GSM1545544_JMS9-P7c.txt    ML 21311068            1 L008
## JMS9-P8c   GSM1545545_JMS9-P8c.txt    LP 19958838            1 L008
## 
## $counts
##            Samples
## Tags        10_6_5_11 9_6_5_11 purep53 JMS8-2 JMS8-3 JMS8-4 JMS8-5
##   497097            1        2     342    526      3      3    535
##   100503874         0        0       5      6      0      0      5
##   100038431         0        0       0      0      0      0      1
##   19888             0        1       0      0     17      2      0
##   20671             1        1      76     40     33     14     98
##            Samples
## Tags        JMS9-P7c JMS9-P8c
##   497097           2        0
##   100503874        0        0
##   100038431        0        0
##   19888            1        0
##   20671           18        8
## 27174 more rows ...
## 
## $genes
##    ENTREZID  SYMBOL TXCHROM
## 1    497097    Xkr4    chr1
## 2 100503874 Gm19938    <NA>
## 3 100038431 Gm10568    <NA>
## 4     19888     Rp1    chr1
## 5     20671   Sox17    chr1
## 27174 more rows ...

Filter genes

Why can’t we compare the raw counts directly? (e.g. Gene 1 has an average count 2 in cases but an average count of 15 in controls)

# Calculate (log) counts-per-million (cpm)

cpm <- cpm(x)
lcpm <- cpm(x, log = TRUE)

# Detect genes with 0 counts aacross 9 samples

table(rowSums(x$counts == 0) == 9)

## 
## FALSE  TRUE 
## 22026  5153

# Visualize the distribution of gene expression levels

plotDensities(lcpm, legend = FALSE, main = "Before filtering")
abline(v = 0, lty = 3)

At this point, we want to filter genes with a raw count of 0 across the samples as well as lowly expressed genes. The rule that we are going to use in this analysis pipleline is to only keep genes which have cpm > 1 in at least 3 samples.

Why do we want to filter genes with a raw count of 0 across the samples? Why do we also want to filter very lowly expressed genes? Hint: what is the role of technical variation/noise versus biological signal when a gene is consistently lowly expressed?

# Only keep genes which have cpm greater than 1 in at least 3 samples

keep.exprs <- rowSums(cpm > 1) >= 3
x <- x[keep.exprs, , keep.lib.sizes = FALSE]
dim(x)

## [1] 14165     9

lcpm <- cpm(x, log=TRUE)

# Visualize the distribution of gene expression levels after filtering

plotDensities(lcpm, legend = FALSE, main = "After filtering")
abline(v = 0, lty = 3)

What do we notice about the distribution of the gene expression log cpm levels after filtering?

Normalization

The default normalization provided by edgeR is TMM (trimmed mean of M-values).

# Let's calculate the normalization factors for our data

x <- calcNormFactors(x, method = "TMM")
x$samples$norm.factors

## [1] 0.8957309 1.0349196 1.0439552 1.0405040 1.0323599 0.9223424 0.9836603
## [8] 1.0827381 0.9792607

# Let's see how it works in an extreme (toy) example

x2 <- x
x2$samples$norm.factors <- 1
x2$counts[,1] <- ceiling(x2$counts[, 1] * 0.05)
x2$counts[,2] <- x2$counts[, 2] * 5

lcpm2 <- cpm(x2, log = TRUE)
boxplot(lcpm2, las = 2, main = "Before normalization")

x2 <- calcNormFactors(x2)
x2$samples$norm.factors

## [1] 0.05472223 6.13059440 1.22927355 1.17051887 1.21487709 1.05622968
## [7] 1.14587663 1.26129350 1.11702264

lcpm2 <- cpm(x2, log = TRUE)
boxplot(lcpm2, las=2, main = "After normalization")

Data Exploration

# Visualize sample relationships with multidimensional scaling (MDS)

library("RColorBrewer")
group

## [1] LP    ML    Basal Basal ML    LP    Basal ML    LP   
## Levels: Basal LP ML

col.group <- group
levels(col.group) <-  brewer.pal(nlevels(col.group), "Set1")
col.group <- as.character(col.group)
lane

## [1] L004 L004 L004 L006 L006 L006 L006 L008 L008
## Levels: L004 L006 L008

col.lane <- lane
levels(col.lane) <-  brewer.pal(nlevels(col.lane), "Set2")
col.lane <- as.character(col.lane)
plotMDS(lcpm, labels = group, col = col.group,
        main = "group")

plotMDS(lcpm, labels = lane, col = col.lane, dim = c(3, 4),
        main = "lane")

# Make an interaction MDS plot

glMDSPlot(lcpm, labels = paste(group, lane, sep = "_"),
          groups = x$samples[, c(2, 5)], launch = TRUE)

Construct the linear model

In this data, what is our biological variable of interest? What technical variable can we also include in our model? If we weren’t just given two variables, how would we decide which biological and technical variables to include?

The model that we are going to write out will not have an intercept. It is called the group-means parameterization in the limma User’s Guide. In this model, each coefficient (beta) will have the mean expression for that group. This paramaterization is more convenient to test specific hypotheses. For a given experiment, there are usually several equivalent ways to set up an appropriate design matrix. For example, ~0+group+lane removes the intercept from the first factor, group, but an intercept remains in the second factor lane.

Warning: There are limitations to when you can use this.

# First, we will make a design matrix. This holds information about our samples

design <- model.matrix(~0 + group + lane)
colnames(design) <- gsub("group", "", colnames(design))
design

##   Basal LP ML laneL006 laneL008
## 1     0  1  0        0        0
## 2     0  0  1        0        0
## 3     1  0  0        0        0
## 4     1  0  0        1        0
## 5     0  0  1        1        0
## 6     0  1  0        1        0
## 7     1  0  0        1        0
## 8     0  0  1        0        1
## 9     0  1  0        0        1
## attr(,"assign")
## [1] 1 1 1 2 2
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
## 
## attr(,"contrasts")$lane
## [1] "contr.treatment"

Now, we are interested in asking the following. For each hypothesis, how would we compare these using our model?

Which genes are DE between basal and LP cells?
Which genes are DE between basal and ML cells?
Which genes are DE between LP and ML cells?

To do this in limma, we are going to create a contrast matrix.

contr.matrix <- makeContrasts(BasalvsLP = Basal - LP,
                              BasalvsML = Basal - ML,
                              LPvsML = LP - ML,
                              levels = colnames(design))
contr.matrix

##           Contrasts
## Levels     BasalvsLP BasalvsML LPvsML
##   Basal            1         1      0
##   LP              -1         0      1
##   ML               0        -1     -1
##   laneL006         0         0      0
##   laneL008         0         0      0

Convert counts to be used in linear model

We are not done transforming the gene expression levels. Even after TMM normalization, there is still a relationship between mean and variance of the log cpm gene expression levels. Therefore, we will use a function called “voom” to calculate weights that will offset this mean-variance relationship in the linear model. Again, these weights will go into the linear model. Note that voom can also do additional normalizations.

# The function `voom` calculates weights to offset this mean-variance relationship.

v <- voom(x, design, plot = TRUE)

## An object of class "EList"
## $genes
##    ENTREZID SYMBOL TXCHROM
## 1    497097   Xkr4    chr1
## 6     27395 Mrpl15    chr1
## 7     18777 Lypla1    chr1
## 9     21399  Tcea1    chr1
## 10    58175  Rgs20    chr1
## 14160 more rows ...
## 
## $targets
##                              files group lib.size norm.factors lane
## 10_6_5_11 GSM1545535_10_6_5_11.txt    LP 29409426    0.8957309 L004
## 9_6_5_11   GSM1545536_9_6_5_11.txt    ML 36528591    1.0349196 L004
## purep53     GSM1545538_purep53.txt Basal 59598629    1.0439552 L004
## JMS8-2       GSM1545539_JMS8-2.txt Basal 53382070    1.0405040 L006
## JMS8-3       GSM1545540_JMS8-3.txt    ML 78175314    1.0323599 L006
## JMS8-4       GSM1545541_JMS8-4.txt    LP 55762781    0.9223424 L006
## JMS8-5       GSM1545542_JMS8-5.txt Basal 54115150    0.9836603 L006
## JMS9-P7c   GSM1545544_JMS9-P7c.txt    ML 23043111    1.0827381 L008
## JMS9-P8c   GSM1545545_JMS9-P8c.txt    LP 19525423    0.9792607 L008
## 
## $E
##         Samples
## Tags     10_6_5_11  9_6_5_11   purep53    JMS8-2    JMS8-3    JMS8-4
##   497097 -4.293244 -3.869026  2.522753  3.302006 -4.481286 -3.993876
##   27395   3.875010  4.400568  4.521172  4.570624  4.322845  3.786547
##   18777   4.707695  5.559334  5.400569  5.171235  5.627798  5.081794
##   21399   4.784462  4.741999  5.374548  5.130925  4.848030  4.944024
##   58175   3.943567  3.294875 -1.767924 -1.880302  2.993289  3.357379
##         Samples
## Tags        JMS8-5  JMS9-P7c  JMS9-P8c
##   497097  3.306782 -3.204336 -5.287282
##   27395   3.918878  4.345642  4.132678
##   18777   5.080061  5.757404  5.150470
##   21399   5.158292  5.036933  4.987679
##   58175  -2.114104  3.142621  3.523290
## 14160 more rows ...
## 
## $weights
##           [,1]      [,2]      [,3]     [,4]      [,5]      [,6]      [,7]
## [1,]  1.183974  1.183974 20.526779 20.97747  1.773562  1.217142 21.125740
## [2,] 20.879554 26.561871 31.596323 29.66102 32.558344 26.745293 29.792090
## [3,] 28.003202 33.695540 34.845507 34.45673 35.148529 33.550527 34.517259
## [4,] 27.670233 29.595778 34.901302 34.43298 34.841349 33.159425 34.493456
## [5,] 19.737381 18.658333  3.184207  2.62986 24.191635 24.014937  2.648747
##           [,8]      [,9]
## [1,]  1.183974  1.183974
## [2,] 21.900102 17.150677
## [3,] 31.440457 25.228325
## [4,] 26.136796 24.502247
## [5,] 13.149278 14.351930
## 14160 more rows ...
## 
## $design
##   Basal LP ML laneL006 laneL008
## 1     0  1  0        0        0
## 2     0  0  1        0        0
## 3     1  0  0        0        0
## 4     1  0  0        1        0
## 5     0  0  1        1        0
## 6     0  1  0        1        0
## 7     1  0  0        1        0
## 8     0  0  1        0        1
## 9     0  1  0        0        1
## attr(,"assign")
## [1] 1 1 1 2 2
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
## 
## attr(,"contrasts")$lane
## [1] "contr.treatment"

Test for differentially expressed (DE) genes

Use the gene expression levels + voom weights (in the variable “v”) and the design matrix to specify the coefficients in the linear model (in the object “design”).

# Fit a linear model for each gene

vfit <- lmFit(v, design)

View(vfit$coefficients)
View(vfit$stdev.unscaled)

# Caculate the statistics for our specific contrasts of interest

vfit <- contrasts.fit(vfit, contrasts = contr.matrix)

View(vfit$coefficients)
View(vfit$stdev.unscaled)

How were the views of vfit different between the two stages?

Are all of the genes independent? We know that genes are correlated so we can share information across the genes– specifically shrinking the gene- specific variances towards the average variance across all genes.

Normally, how do we calculate the t score for gene expression? X_group1 - X_group2 / SE of both groups Limma uses a moderated t statistic which takes into account the relationship between genes.

# Calculate the t statistics for each gene in each comparison (hypothesis)

efit <- eBayes(vfit)
View(efit$t)

# Explore the relationship between residual variation versus the mean gene expression level. 

plotSA(efit, main = "Final model: Mean−variance trend")

Based on this plot, does voom remove the relationship between the mean and variance?

Explore the results/DE genes

What thresholds can we use to define “differentially expressed” genes? (Hint: magnitude and p-value)

# Tabulate the results

summary(decideTests(efit))

##        BasalvsLP BasalvsML LPvsML
## Down        4127      4338   2895
## NotSig      5740      5655   8825
## Up          4298      4172   2445

# If the magnitude of the effect size is important for your downstream analysis (e.g. perhaps you are trying to prioritize genes for further experiments), you can specify a minimal log-fold-change with the function "treat" instead of using "eBayes"

tfit <- treat(vfit, lfc = 1)
dt <- decideTests(tfit)
summary(dt)

##        BasalvsLP BasalvsML LPvsML
## Down        1417      1512    203
## NotSig     11030     10895  13780
## Up          1718      1758    182

# Create a venn diagram of the results

head(dt)

##         Contrasts
##          BasalvsLP BasalvsML LPvsML
##   497097         1         1      0
##   27395          0         0      0
##   18777          0         0      0
##   21399          0         0      0
##   58175         -1        -1      0
##   108664         0         0      0

de.common <- which(dt[, 1] != 0 & dt[, 2] != 0)
length(de.common)

## [1] 2409

head(tfit$genes$SYMBOL[de.common], n = 20)

##  [1] "Xkr4"          "Rgs20"         "Cpa6"          "Sulf1"        
##  [5] "Eya1"          "Msc"           "Sbspon"        "Pi15"         
##  [9] "Crispld1"      "Kcnq5"         "Ptpn18"        "Arhgef4"      
## [13] "2010300C02Rik" "Aff3"          "Npas2"         "Tbc1d8"       
## [17] "Creg2"         "Il1r1"         "Il18r1"        "Il18rap"

vennDiagram(dt[, 1:2], circle.col = c("turquoise", "salmon"))

#write.fit(tfit, dt, file = "results.txt")

# Identify the top DE genes

basal.vs.lp <- topTreat(tfit, coef = 1, n = Inf)
basal.vs.ml <- topTreat(tfit, coef = 2, n = Inf)
head(basal.vs.lp)

##        ENTREZID SYMBOL TXCHROM     logFC  AveExpr         t      P.Value
## 12759     12759    Clu   chr14 -5.442877 8.857907 -33.44429 3.990899e-10
## 53624     53624  Cldn7   chr11 -5.514605 6.296762 -32.94533 4.503694e-10
## 242505   242505  Rasef    chr4 -5.921741 5.119585 -31.77625 6.063249e-10
## 67451     67451   Pkp2   chr16 -5.724823 4.420495 -30.65370 8.010456e-10
## 228543   228543   Rhov    chr2 -6.253427 5.486640 -29.46244 1.112729e-09
## 70350     70350  Basp1   chr15 -6.073297 5.248349 -28.64890 1.380545e-09
##           adj.P.Val
## 12759  2.703871e-06
## 53624  2.703871e-06
## 242505 2.703871e-06
## 67451  2.703871e-06
## 228543 2.703871e-06
## 70350  2.703871e-06

head(basal.vs.ml)

##        ENTREZID  SYMBOL TXCHROM     logFC  AveExpr         t      P.Value
## 242505   242505   Rasef    chr4 -6.510470 5.119585 -35.49093 2.573575e-10
## 53624     53624   Cldn7   chr11 -5.469160 6.296762 -32.52520 4.978446e-10
## 12521     12521    Cd82    chr2 -4.667737 7.070963 -31.82187 5.796191e-10
## 71740     71740 Nectin4    chr1 -5.556046 5.166292 -31.29987 6.760578e-10
## 20661     20661   Sort1    chr3 -4.908119 6.705784 -31.23083 6.761331e-10
## 15375     15375   Foxa1   chr12 -5.753884 5.625064 -28.34612 1.487280e-09
##           adj.P.Val
## 242505 1.915485e-06
## 53624  1.915485e-06
## 12521  1.915485e-06
## 71740  1.915485e-06
## 20661  1.915485e-06
## 15375  2.281914e-06

# Visualize the DE genes

plotMD(tfit, column = 1, status = dt[, 1], main = colnames(tfit)[1],
       xlim = c(-8, 13))

#glMDPlot(tfit, coef = 1, status = dt, main = colnames(tfit)[1],
#         id.column = "ENTREZID", counts = x$counts, groups = group,
#         launch = TRUE)

glMDPlot(tfit, coef = 1, status = dt, main = colnames(tfit)[1],
         side.main = "ENTREZID", counts = x$counts, groups = group,
         launch = TRUE)

# View heatmap of top 100 DE genes between Basal and LP cells

library("gplots")

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:IRanges':
## 
##     space

## The following object is masked from 'package:S4Vectors':
## 
##     space

## The following object is masked from 'package:stats':
## 
##     lowess

basal.vs.lp.topgenes <- basal.vs.lp$ENTREZID[1:100]
i <- which(v$genes$ENTREZID %in% basal.vs.lp.topgenes)
mycol <- colorpanel(1000, "blue", "white", "red")
heatmap.2(v$E[i, ], scale = "row", 
          labRow = v$genes$SYMBOL[i], labCol = group,
          col = mycol, trace = "none", density.info = "none")

Why do we scale by row (per gene) in the heatmap?

Test for enrichment of gene sets

Why might we want to do gene set enrichment of the DE genes? Why might we be interested in the gene sets that are specifically enriched in 1 contrast versus the others (e.g. gene sets in Basal versus LP but not Basal versus ML and not LP versus LM)? What are some of the limitations of the use of enrichment of specific gene sets (e.g. use of Gene Ontology or KEGG pathways)?

# A common first-step post-DE testing is to test for enrichment of specific gene sets e.g. Gene Ontology (GO) categories or KEGG pathways. Here, we will use the gene set collection MSigDB c2 from the Broad Institue.

load(system.file("extdata", "mouse_c2_v5p1.rda", package = "RNAseq123"))
class(Mm.c2)

## [1] "list"

Mm.c2$KEGG_GLYCOLYSIS_GLUCONEOGENESIS

##   [1] "100042025" "100042427" "103988"    "106557"    "109785"   
##   [6] "110695"    "11522"     "11529"     "11532"     "11668"    
##  [11] "11669"     "11670"     "11671"     "11674"     "11676"    
##  [16] "12039"     "12040"     "12183"     "13382"     "13806"    
##  [21] "13807"     "13808"     "14120"     "14121"     "14377"    
##  [26] "14378"     "14433"     "14447"     "14751"     "14782"    
##  [31] "15275"     "15277"     "15452"     "16497"     "16498"    
##  [36] "16499"     "16828"     "16832"     "16833"     "18534"    
##  [41] "18597"     "18598"     "18641"     "18642"     "18648"    
##  [46] "18655"     "18663"     "18746"     "18770"     "19378"    
##  [51] "20322"     "212032"    "215974"    "216019"    "21881"    
##  [56] "21991"     "226041"    "230163"    "232223"    "232491"   
##  [61] "235339"    "26358"     "26462"     "26876"     "26926"    
##  [66] "277333"    "319625"    "353204"    "380660"    "433182"   
##  [71] "50493"     "56012"     "56421"     "56752"     "56847"    
##  [76] "58810"     "60525"     "621603"    "622339"    "638833"   
##  [81] "640374"    "666258"    "666488"    "66681"     "667451"   
##  [86] "668435"    "669429"    "67689"     "68263"     "68738"    
##  [91] "69117"     "70974"     "72157"     "72168"     "72535"    
##  [96] "73458"     "74419"     "74551"     "78894"     "79459"    
## [101] "83553"

idx <- ids2indices(Mm.c2, id = rownames(v))

# The limma function "camera" tests for enrichment for a specific contrast of interest by comparing all the gene sets against one another.

cam.BasalvsLP <- camera(v, idx, design, contrast = contr.matrix[, 1])
head(cam.BasalvsLP, 5)

##                                             NGenes Direction       PValue
## LIM_MAMMARY_STEM_CELL_UP                       739        Up 1.134757e-18
## LIM_MAMMARY_STEM_CELL_DN                       630      Down 1.569957e-15
## ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER    163        Up 1.437987e-13
## SOTIRIOU_BREAST_CANCER_GRADE_1_VS_3_UP         183        Up 2.181862e-13
## LIM_MAMMARY_LUMINAL_PROGENITOR_UP               87      Down 6.734613e-13
##                                                      FDR
## LIM_MAMMARY_STEM_CELL_UP                    5.360590e-15
## LIM_MAMMARY_STEM_CELL_DN                    3.708238e-12
## ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER 2.264351e-10
## SOTIRIOU_BREAST_CANCER_GRADE_1_VS_3_UP      2.576779e-10
## LIM_MAMMARY_LUMINAL_PROGENITOR_UP           6.362863e-10

cam.BasalvsML <- camera(v, idx, design, contrast = contr.matrix[, 2])
head(cam.BasalvsML, 5)

##                                             NGenes Direction       PValue
## LIM_MAMMARY_STEM_CELL_UP                       739        Up 5.090937e-23
## LIM_MAMMARY_STEM_CELL_DN                       630      Down 5.132446e-19
## LIM_MAMMARY_LUMINAL_MATURE_DN                  166        Up 8.875174e-16
## LIM_MAMMARY_LUMINAL_MATURE_UP                  180      Down 6.287301e-13
## ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER    163        Up 1.684323e-12
##                                                      FDR
## LIM_MAMMARY_STEM_CELL_UP                    2.404959e-19
## LIM_MAMMARY_STEM_CELL_DN                    1.212284e-15
## LIM_MAMMARY_LUMINAL_MATURE_DN               1.397544e-12
## LIM_MAMMARY_LUMINAL_MATURE_UP               7.425303e-10
## ROSTY_CERVICAL_CANCER_PROLIFERATION_CLUSTER 1.591348e-09

cam.LPvsML <- camera(v, idx, design, contrast = contr.matrix[, 3])
head(cam.LPvsML, 5)

##                                         NGenes Direction       PValue
## LIM_MAMMARY_LUMINAL_MATURE_UP              180      Down 8.497295e-14
## LIM_MAMMARY_LUMINAL_MATURE_DN              166        Up 1.439890e-13
## LIM_MAMMARY_LUMINAL_PROGENITOR_UP           87        Up 3.840915e-11
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT     91      Down 2.655349e-08
## NABA_CORE_MATRISOME                        222        Up 4.430361e-08
##                                                  FDR
## LIM_MAMMARY_LUMINAL_MATURE_UP           3.401020e-10
## LIM_MAMMARY_LUMINAL_MATURE_DN           3.401020e-10
## LIM_MAMMARY_LUMINAL_PROGENITOR_UP       6.048160e-08
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT 3.135967e-05
## NABA_CORE_MATRISOME                     4.185805e-05

# Visualize the gebe set enrichment with a barcode plot. 

barcodeplot(efit$t[, 3], index = idx$LIM_MAMMARY_LUMINAL_MATURE_UP,
            index2 = idx$LIM_MAMMARY_LUMINAL_MATURE_DN,
            main = "LPvsML")

Once you have DE genes, what might be further steps for (computational) analysis?

Report session info

sessionInfo()

## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gplots_3.0.1                            
##  [2] RColorBrewer_1.1-2                      
##  [3] Mus.musculus_1.3.1                      
##  [4] TxDb.Mmusculus.UCSC.mm10.knownGene_3.4.0
##  [5] org.Mm.eg.db_3.5.0                      
##  [6] GO.db_3.5.0                             
##  [7] OrganismDbi_1.20.0                      
##  [8] GenomicFeatures_1.30.3                  
##  [9] GenomicRanges_1.30.3                    
## [10] GenomeInfoDb_1.14.0                     
## [11] AnnotationDbi_1.40.0                    
## [12] IRanges_2.12.0                          
## [13] S4Vectors_0.16.0                        
## [14] Biobase_2.38.0                          
## [15] BiocGenerics_0.24.0                     
## [16] edgeR_3.20.7                            
## [17] Glimma_1.6.0                            
## [18] limma_3.34.9                            
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.15               locfit_1.5-9.1            
##  [3] lattice_0.20-35            gtools_3.5.0              
##  [5] prettyunits_1.0.2          Rsamtools_1.30.0          
##  [7] Biostrings_2.46.0          assertthat_0.2.0          
##  [9] rprojroot_1.3-2            digest_0.6.15             
## [11] R6_2.2.2                   backports_1.1.2           
## [13] RSQLite_2.0                evaluate_0.10.1           
## [15] httr_1.3.1                 BiocInstaller_1.28.0      
## [17] pillar_1.1.0               zlibbioc_1.24.0           
## [19] rlang_0.1.6                progress_1.1.2            
## [21] gdata_2.18.0               blob_1.1.0                
## [23] R.utils_2.6.0              R.oo_1.21.0               
## [25] Matrix_1.2-13              rmarkdown_1.9             
## [27] RMySQL_0.10.14             BiocParallel_1.12.0       
## [29] stringr_1.3.0              RCurl_1.95-4.10           
## [31] bit_1.1-12                 biomaRt_2.34.2            
## [33] DelayedArray_0.4.1         compiler_3.4.3            
## [35] rtracklayer_1.38.3         pkgconfig_2.0.1           
## [37] htmltools_0.3.6            SummarizedExperiment_1.8.1
## [39] tibble_1.4.2               GenomeInfoDbData_1.0.0    
## [41] matrixStats_0.53.1         XML_3.98-1.10             
## [43] GenomicAlignments_1.14.2   bitops_1.0-6              
## [45] R.methodsS3_1.7.1          grid_3.4.3                
## [47] RBGL_1.54.0                jsonlite_1.5              
## [49] DBI_0.7                    magrittr_1.5              
## [51] KernSmooth_2.23-15         graph_1.56.0              
## [53] stringi_1.1.7              XVector_0.18.0            
## [55] tools_3.4.3                bit64_0.9-7               
## [57] yaml_2.1.18                caTools_1.17.1            
## [59] memoise_1.1.0              knitr_1.20

Why is it important to report the versions of the software you used to generate the results?

For more information

Further reading: The Limma User’s Guide is great! Good explanations with lots of examples!

Next time: Use of lm in R

RNA_seq_tutorial

Lauren Blake

April 26, 2018