#When performing gene set analysis it is critical to use the same annotation as was used in pre-processing steps. Read the paper behind the Bottomly data set on the ReCount database: http://www.ncbi.nlm.nih.gov/pubmed?term=21455293
# Using the paper and the function: 𝚜𝚞𝚙𝚙𝚘𝚛𝚝𝚎𝚍𝙶𝚎𝚗𝚘𝚖𝚎𝚜() in the 𝚐𝚘𝚜𝚎𝚚 package can you figure out which of the Mouse genome builds they aligned the reads to.
# Use the supportedgenomes
# Load the Biobase library for data
library(Biobase)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
# Load the limma pacakge for the stats
library(limma)
## Warning: package 'limma' was built under R version 3.3.3
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
# Now load data
con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData")
load(file=con)
close(con)
# Take out the relevant data
bot = bottomly.eset
# Save the phenotype data
pdata_bot=pData(bot)
# Save the feature data
fdata_bot = featureData(bot)
# Save the expression data
edata = exprs(bot)
# Remove lowly expressed genes
fdata_bot = fdata_bot[rowMeans(edata) > 5]
edata = edata[rowMeans(edata) > 5, ]
# Log2 transform data
edata = log2(edata+1)
# The question was to model gene expression as a function of
# strain.
# Build the model matrix first
mod_adj = model.matrix(~ pdata_bot$strain)
# Then fit a linear model on expression data using the model matrix
fit_limma = lmFit(edata,mod_adj)
# The statistics by limma is not moderated
# Now moderate (correct) the statistics with eBayes
ebayes_limma = eBayes(fit_limma)
#Use the topTable function to get only 6 relevant variables of the eBayes output
limma_output = topTable(ebayes_limma, number = dim(edata)[1], adjust.method="BH", sort="none")
## Removing intercept from test coefficients
# Look at the variables we selected
names(limma_output)
## [1] "logFC" "AveExpr" "t" "P.Value" "adj.P.Val" "B"
# Look at the dimensions to confirm we have 6 variables
# Also confirm that we have 9431 genes tested
dim(limma_output)
## [1] 9431 6
# Save all the adjusted p_values in a variable called limma_pvals_adj
limma_pvals_adj = limma_output$adj.P.Val
# Now susbet just data whose p_val is greater than 0.05
limma2 = subset(limma_output,limma_output$adj.P.Val < 0.05)
# Find the number of genes that made this cut off
dim(limma2)
## [1] 223 6
# Only 223 genes were significant at a 5% FDR
# Number of genes at adj p-value less than 0.05 is 223.
# Whats the first gene?
limma2[1:3,1:3]
## logFC AveExpr t
## ENSMUSG00000000402 -1.222062 4.292471 -4.509076
## ENSMUSG00000000560 1.149669 9.735186 4.228283
## ENSMUSG00000001473 -1.097277 5.012434 -4.038952
# Now I have the differential gene expression, the next step is to take
# all those genes and determine what processes are over represented in the
# set of those genes using Gene Ontology analysis
# Load the GOSEQ pacakge for the gene ontology analysis
library(goseq)
## Loading required package: BiasedUrn
## Loading required package: geneLenDataBase
##
# Lets see which genomes are supported
head(supportedGenomes())
## db species date name
## 2 hg19 Human Feb. 2009 Genome Reference Consortium GRCh37
## 3 hg18 Human Mar. 2006 NCBI Build 36.1
## 4 hg17 Human May 2004 NCBI Build 35
## 5 hg16 Human Jul. 2003 NCBI Build 34
## 19 felCat3 Cat Mar. 2006 Broad Institute Release 3
## 23 panTro2 Chimp Mar. 2006 CGSC Build 2.1
## AvailableGeneIDs
## 2 ccdsGene,ensGene,exoniphy,geneSymbol,knownGene,nscanGene,refGene,xenoRefGene
## 3 acembly,acescan,ccdsGene,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,knownGeneOld3,refGene,sgpGene,sibGene,xenoRefGene
## 4 acembly,acescan,ccdsGene,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,refGene,sgpGene,vegaGene,vegaPseudoGene,xenoRefGene
## 5 acembly,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,refGene,sgpGene
## 19 ensGene,geneSymbol,geneid,genscan,nscanGene,refGene,sgpGene,xenoRefGene
## 23 ensGene,geneSymbol,genscan,nscanGene,refGene,xenoRefGene
# Select all genes whose p_adj is less than 0.05
# This will convert the statistically different genes into 1
# and the non-differentially expressed genes as 0
genes = as.integer(limma_output$adj.P.Val < 0.05)
not_na = !is.na(genes)
# make the row names as the gene names of the expression data
names(genes) = rownames(edata)
genes = genes[not_na]
### Pick the right genome for GO analysis
# Calculate the probability weighting function. We use the mm9
# genome build and the naming convention as ensemble gene id
pwf=nullp(genes,"mm9","ensGene")
## Loading mm9 length data...
head(pwf)
## DEgenes bias.data pwf
## ENSMUSG00000000001 0 3213 0.02367927
## ENSMUSG00000000056 0 4405 0.01859459
## ENSMUSG00000000058 0 976 0.02871594
## ENSMUSG00000000078 0 4221 0.01929324
## ENSMUSG00000000088 0 628 0.02871594
## ENSMUSG00000000093 0 3569 0.02199937
# Now use the goseq function to look for enriched groups
#source("https://bioconductor.org/biocLite.R")
#biocLite("org.Mm.eg.db")
library(org.Mm.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.3.3
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 3.3.3
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
##
GO.wall=goseq(pwf,"mm9","ensGene")
## Fetching GO annotations...
## For 508 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
# The above computation gets you the enriched processes in your differential
# genes. You can look at the first 6 here
head(GO.wall)
## category over_represented_pvalue under_represented_pvalue
## 1932 GO:0004888 1.679786e-06 0.9999995
## 15845 GO:0099600 5.054510e-06 0.9999985
## 9139 GO:0038023 1.319361e-05 0.9999958
## 1922 GO:0004872 2.913349e-05 0.9999898
## 12953 GO:0060089 2.913349e-05 0.9999898
## 923 GO:0002430 4.292152e-05 0.9999998
## numDEInCat numInCat term
## 1932 24 406 transmembrane signaling receptor activity
## 15845 24 431 transmembrane receptor activity
## 9139 24 455 signaling receptor activity
## 1922 26 539 receptor activity
## 12953 26 539 molecular transducer activity
## 923 3 4 complement receptor mediated signaling pathway
## ontology
## 1932 MF
## 15845 MF
## 9139 MF
## 1922 MF
## 12953 MF
## 923 BP
# What we modelled above was gene expression as a function of mouse strains
# Sometimes we might have reason to believe that all the differences in gene
# expression are not driven by the factor we care about, in this case,
# mouse strains.
# The next task is where we find the differences in gene expression due to
# our favourite variable, mouse strains, but now we believe that the lane
# in which the samples were sequenced affected gene expression too.
# So we correct for that confounding variable
# Again, as we did before, build a model matrix but this time,
# add the lane variable
mod_adj1 = model.matrix(~ pdata_bot$strain+pdata_bot$lane.number)
# Then fit the model with the new model matrix
fit_limma1 = lmFit(edata,mod_adj1)
# Moderate the statistics using eBayes
ebayes_limma1 = eBayes(fit_limma1)
#Use the topTable function to get only 6 relevant variables of the eBayes output
limma_output1 = topTable(ebayes_limma1, number = dim(edata)[1], adjust.method="BH", sort="none")
## Removing intercept from test coefficients
# Confirm those variables
names(limma_output1)
## [1] "pdata_bot.strainDBA.2J" "pdata_bot.lane.number"
## [3] "AveExpr" "F"
## [5] "P.Value" "adj.P.Val"
# Look at the dimensions to confirm we have 6 variables
# Also confirm that we have 9431 genes tested
dim(limma_output1)
## [1] 9431 6
#Save all the adjusted p_values in a variable called limma_pvals_adj
limma_pvals_adj1 = limma_output1$adj.P.Val
# Now susbet just data whose p_val is greater than 0.05
limma2_1 = subset(limma_output1,limma_output1$adj.P.Val < 0.05)
# Select all genes whose p_adj is less than 0.05
# This will convert the statistically different genes into 1
# and the non-differentially expressed genes as 0
genes1 = as.integer(limma_output1$adj.P.Val < 0.05)
not_na1 = !is.na(genes1)
# make the row names as the gene names of the expression data
names(genes1) = rownames(edata)
genes1 = genes1[not_na1]
### Pick the right genome for GO analysis
# Calculate the probability weighting function. We use the mm10
# genome build and the naming convention as ensemble gene id
pwf1=nullp(genes1,"mm9","ensGene")
## Loading mm9 length data...
head(pwf1)
## DEgenes bias.data pwf
## ENSMUSG00000000001 0 3213 0.01621764
## ENSMUSG00000000056 0 4405 0.01221733
## ENSMUSG00000000058 0 976 0.02240936
## ENSMUSG00000000078 0 4221 0.01267104
## ENSMUSG00000000088 0 628 0.02379767
## ENSMUSG00000000093 0 3569 0.01467915
GO.wall1=goseq(pwf1,"mm9","ensGene")
## Fetching GO annotations...
## For 508 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall1)
## category over_represented_pvalue under_represented_pvalue
## 923 GO:0002430 1.691749e-05 0.9999999
## 1924 GO:0004875 1.691749e-05 0.9999999
## 3078 GO:0007129 4.824823e-05 0.9999986
## 10581 GO:0045143 1.641737e-04 0.9999927
## 13831 GO:0070192 2.137497e-04 0.9999897
## 1608 GO:0004252 2.431368e-04 0.9999736
## numDEInCat numInCat
## 923 3 4
## 1924 3 4
## 3078 4 13
## 10581 4 19
## 13831 4 20
## 1608 6 53
## term ontology
## 923 complement receptor mediated signaling pathway BP
## 1924 complement receptor activity MF
## 3078 synapsis BP
## 10581 homologous chromosome segregation BP
## 13831 chromosome organization involved in meiotic cell cycle BP
## 1608 serine-type endopeptidase activity MF
# Now we have the enriched processes in our gene differentials after correction
# for the lane variable
# The next task was to compare the top 10 enriched processes for the
# corrected and uncorrected analysis to determine how many are common
# Get top ten of GO.wall
GO.wall_top_10=GO.wall[1:10,1]
# Get top ten of GO.wall
GO.wall1_top_10=GO.wall1[1:10,1]
# Now match them up
new = match(GO.wall_top_10,GO.wall1_top_10)
# We can also use the intersect function to do this
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.