#When performing gene set analysis it is critical to use the same annotation as was used in pre-processing steps. Read the paper behind the Bottomly data set on the ReCount database: http://www.ncbi.nlm.nih.gov/pubmed?term=21455293

# Using the paper and the function: 𝚜𝚞𝚙𝚙𝚘𝚛𝚝𝚎𝚍𝙶𝚎𝚗𝚘𝚖𝚎𝚜() in the 𝚐𝚘𝚜𝚎𝚚 package can you figure out which of the Mouse genome builds they aligned the reads to.

# Use the supportedgenomes

Load the Bottomly data with the following code and perform a differential expression analysis using 𝚕𝚒𝚖𝚖𝚊 with only the strain variable as an outcome. How many genes are differentially expressed at the 5% FDR level using Benjamini-Hochberg correction? What is the gene identifier of the first gene differentially expressed at this level (just in order, not the smallest FDR)

# Load the Biobase library for data
library(Biobase)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
# Load the limma pacakge for the stats
library(limma)
## Warning: package 'limma' was built under R version 3.3.3
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
# Now load data
con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData")
load(file=con)
close(con)
# Take out the relevant data
bot = bottomly.eset
# Save the phenotype data
pdata_bot=pData(bot)
# Save the feature data
fdata_bot = featureData(bot)
# Save the expression data
edata = exprs(bot)
# Remove lowly expressed genes
fdata_bot = fdata_bot[rowMeans(edata) > 5]
edata = edata[rowMeans(edata) > 5, ]
# Log2 transform data
edata = log2(edata+1)

# The question was to model gene expression as a function of 
# strain.
# Build the model matrix first
mod_adj = model.matrix(~ pdata_bot$strain)
# Then fit a linear model on expression data using the model matrix
fit_limma = lmFit(edata,mod_adj)
# The statistics by limma is not moderated
# Now moderate (correct) the statistics with eBayes
ebayes_limma = eBayes(fit_limma)
#Use the topTable function to get only 6 relevant variables of the eBayes output
limma_output = topTable(ebayes_limma, number = dim(edata)[1], adjust.method="BH", sort="none")
## Removing intercept from test coefficients
# Look at the variables we selected
names(limma_output)
## [1] "logFC"     "AveExpr"   "t"         "P.Value"   "adj.P.Val" "B"
# Look at the dimensions to confirm we have 6 variables
# Also confirm that we have 9431 genes tested 
dim(limma_output)
## [1] 9431    6
# Save all the adjusted p_values in a variable called limma_pvals_adj
limma_pvals_adj = limma_output$adj.P.Val
# Now susbet just data whose p_val is greater than 0.05
limma2 = subset(limma_output,limma_output$adj.P.Val < 0.05)
# Find the number of genes that made this cut off
dim(limma2)
## [1] 223   6
# Only 223 genes were significant at a 5% FDR
# Number of genes at adj p-value less than 0.05 is 223.
# Whats the first gene?
limma2[1:3,1:3]
##                        logFC  AveExpr         t
## ENSMUSG00000000402 -1.222062 4.292471 -4.509076
## ENSMUSG00000000560  1.149669 9.735186  4.228283
## ENSMUSG00000001473 -1.097277 5.012434 -4.038952
# Now I have the differential gene expression, the next step is to take
# all those genes and determine what processes are over represented in the 
# set of those genes using Gene Ontology analysis

# Load the GOSEQ pacakge for the gene ontology analysis
library(goseq)
## Loading required package: BiasedUrn
## Loading required package: geneLenDataBase
## 
# Lets see which genomes are supported
head(supportedGenomes())
##         db species      date                               name
## 2     hg19   Human Feb. 2009 Genome Reference Consortium GRCh37
## 3     hg18   Human Mar. 2006                    NCBI Build 36.1
## 4     hg17   Human  May 2004                      NCBI Build 35
## 5     hg16   Human Jul. 2003                      NCBI Build 34
## 19 felCat3     Cat Mar. 2006          Broad Institute Release 3
## 23 panTro2   Chimp Mar. 2006                     CGSC Build 2.1
##                                                                                                                     AvailableGeneIDs
## 2                                                       ccdsGene,ensGene,exoniphy,geneSymbol,knownGene,nscanGene,refGene,xenoRefGene
## 3    acembly,acescan,ccdsGene,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,knownGeneOld3,refGene,sgpGene,sibGene,xenoRefGene
## 4  acembly,acescan,ccdsGene,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,refGene,sgpGene,vegaGene,vegaPseudoGene,xenoRefGene
## 5                                                       acembly,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,refGene,sgpGene
## 19                                                           ensGene,geneSymbol,geneid,genscan,nscanGene,refGene,sgpGene,xenoRefGene
## 23                                                                          ensGene,geneSymbol,genscan,nscanGene,refGene,xenoRefGene
# Select all genes whose p_adj is less than 0.05
# This will convert the statistically different genes into 1
# and the non-differentially expressed genes as 0
genes = as.integer(limma_output$adj.P.Val < 0.05)
not_na = !is.na(genes)
# make the row names as the gene names of the expression data
names(genes) = rownames(edata)
genes = genes[not_na]

### Pick the right genome for GO analysis
# Calculate the probability weighting function. We use the mm9
# genome build and the naming convention as ensemble gene id
pwf=nullp(genes,"mm9","ensGene")
## Loading mm9 length data...

head(pwf)
##                    DEgenes bias.data        pwf
## ENSMUSG00000000001       0      3213 0.02367927
## ENSMUSG00000000056       0      4405 0.01859459
## ENSMUSG00000000058       0       976 0.02871594
## ENSMUSG00000000078       0      4221 0.01929324
## ENSMUSG00000000088       0       628 0.02871594
## ENSMUSG00000000093       0      3569 0.02199937
# Now use the goseq function to look for enriched groups
#source("https://bioconductor.org/biocLite.R")
#biocLite("org.Mm.eg.db")
library(org.Mm.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.3.3
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 3.3.3
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums
## 
GO.wall=goseq(pwf,"mm9","ensGene")
## Fetching GO annotations...
## For 508 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
# The above computation gets you the enriched processes in your differential
# genes. You can look at the first 6 here
head(GO.wall)
##         category over_represented_pvalue under_represented_pvalue
## 1932  GO:0004888            1.679786e-06                0.9999995
## 15845 GO:0099600            5.054510e-06                0.9999985
## 9139  GO:0038023            1.319361e-05                0.9999958
## 1922  GO:0004872            2.913349e-05                0.9999898
## 12953 GO:0060089            2.913349e-05                0.9999898
## 923   GO:0002430            4.292152e-05                0.9999998
##       numDEInCat numInCat                                           term
## 1932          24      406      transmembrane signaling receptor activity
## 15845         24      431                transmembrane receptor activity
## 9139          24      455                    signaling receptor activity
## 1922          26      539                              receptor activity
## 12953         26      539                  molecular transducer activity
## 923            3        4 complement receptor mediated signaling pathway
##       ontology
## 1932        MF
## 15845       MF
## 9139        MF
## 1922        MF
## 12953       MF
## 923         BP
# What we modelled above was gene expression as a function of mouse strains
# Sometimes we might have reason to believe that all the differences in gene
# expression are not driven by the factor we care about, in this case,
# mouse strains. 
# The next task is where we find the differences in gene expression due to
# our favourite variable, mouse strains, but now we believe that the lane
# in which the samples were sequenced affected gene expression too.
# So we correct for that confounding variable

# Again, as we did before, build a model matrix but this time,
# add the lane variable
mod_adj1 = model.matrix(~ pdata_bot$strain+pdata_bot$lane.number)
# Then fit the model with the new model matrix
fit_limma1 = lmFit(edata,mod_adj1)
# Moderate the statistics using eBayes
ebayes_limma1 = eBayes(fit_limma1)
#Use the topTable function to get only 6 relevant variables of the eBayes output
limma_output1 = topTable(ebayes_limma1, number = dim(edata)[1], adjust.method="BH", sort="none")
## Removing intercept from test coefficients
# Confirm those variables
names(limma_output1)
## [1] "pdata_bot.strainDBA.2J" "pdata_bot.lane.number" 
## [3] "AveExpr"                "F"                     
## [5] "P.Value"                "adj.P.Val"
# Look at the dimensions to confirm we have 6 variables
# Also confirm that we have 9431 genes tested 
dim(limma_output1)
## [1] 9431    6
#Save all the adjusted p_values in a variable called limma_pvals_adj
limma_pvals_adj1 = limma_output1$adj.P.Val
# Now susbet just data whose p_val is greater than 0.05
limma2_1 = subset(limma_output1,limma_output1$adj.P.Val < 0.05)
# Select all genes whose p_adj is less than 0.05
# This will convert the statistically different genes into 1
# and the non-differentially expressed genes as 0
genes1 = as.integer(limma_output1$adj.P.Val < 0.05)
not_na1 = !is.na(genes1)
# make the row names as the gene names of the expression data
names(genes1) = rownames(edata)
genes1 = genes1[not_na1]
### Pick the right genome for GO analysis
# Calculate the probability weighting function. We use the mm10
# genome build and the naming convention as ensemble gene id
pwf1=nullp(genes1,"mm9","ensGene")
## Loading mm9 length data...

head(pwf1)
##                    DEgenes bias.data        pwf
## ENSMUSG00000000001       0      3213 0.01621764
## ENSMUSG00000000056       0      4405 0.01221733
## ENSMUSG00000000058       0       976 0.02240936
## ENSMUSG00000000078       0      4221 0.01267104
## ENSMUSG00000000088       0       628 0.02379767
## ENSMUSG00000000093       0      3569 0.01467915
GO.wall1=goseq(pwf1,"mm9","ensGene")
## Fetching GO annotations...
## For 508 genes, we could not find any categories. These genes will be excluded.
## To force their use, please run with use_genes_without_cat=TRUE (see documentation).
## This was the default behavior for version 1.15.1 and earlier.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
head(GO.wall1)
##         category over_represented_pvalue under_represented_pvalue
## 923   GO:0002430            1.691749e-05                0.9999999
## 1924  GO:0004875            1.691749e-05                0.9999999
## 3078  GO:0007129            4.824823e-05                0.9999986
## 10581 GO:0045143            1.641737e-04                0.9999927
## 13831 GO:0070192            2.137497e-04                0.9999897
## 1608  GO:0004252            2.431368e-04                0.9999736
##       numDEInCat numInCat
## 923            3        4
## 1924           3        4
## 3078           4       13
## 10581          4       19
## 13831          4       20
## 1608           6       53
##                                                         term ontology
## 923           complement receptor mediated signaling pathway       BP
## 1924                            complement receptor activity       MF
## 3078                                                synapsis       BP
## 10581                      homologous chromosome segregation       BP
## 13831 chromosome organization involved in meiotic cell cycle       BP
## 1608                      serine-type endopeptidase activity       MF
# Now we have the enriched processes in our gene differentials after correction
# for the lane variable

# The next task was to compare the top 10 enriched processes for the
# corrected and uncorrected analysis to determine how many are common
# Get top ten of GO.wall
GO.wall_top_10=GO.wall[1:10,1]
# Get top ten of GO.wall
GO.wall1_top_10=GO.wall1[1:10,1]
# Now match them up 
new = match(GO.wall_top_10,GO.wall1_top_10)
# We can also use the intersect function to do this

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.