This document describes the analysis of RNA sequence data from Arabidopsis thaliana. The analysis is divided into four main sections; Mapping short reads to genome sequence, converting mapped reads to measures of gene expression, quality control and detection of differential gene expression. Thera are a number of different parameter settings that can, and should, be adjusted depending on the data and experimental design. These settings as well choice of programs to use in analysis depends on many factors and if you prefer to use other methods and seetings than the once I have used here, you can, if applicable, modify the code in this document or let me know and I can hopefully help out in setting up a new analysis with the requirements you might have.
Sample | Treatment | Replicate | Number of sequenced fragments |
---|---|---|---|
TQ01 | DMSO_mock | R01 | 19525079 |
TQ02 | IAA_50µM | R01 | 19746564 |
TQ03 | 45H02_50µM | R01 | 18675266 |
TQ04 | 60D09_50µM | R01 | 20686795 |
TQ05 | Hit45_50µM | R01 | 17418684 |
TQ06 | DMSO_mock | R02 | 20114660 |
TQ07 | IAA_50µM | R02 | 19923169 |
TQ08 | 45H02_50µM | R02 | 18943350 |
TQ09 | 60D09_50µM | R02 | 18224871 |
TQ10 | Hit45_50µM | R02 | 20158572 |
TQ11 | DMSO_mock | R03 | 21896845 |
TQ12 | IAA_50µM | R03 | 21001968 |
TQ13 | 45H02_50µM | R03 | 19603460 |
TQ14 | 60D09_50µM | R03 | 19419595 |
TQ15 | Hit45_50µM | R03 | 19402201 |
The pair-end reads were mapped using the short read aligner Subreadalign. The analysis were done from within R with the code below. To convert read data to counts per gene eg. gene expreession the function featureCounts from the the same package was used. All these results were then saved as ‘Analysis.data’ that I shared with you via Uppsala University Dropbox service.
library(limma) # Loaded already here for the read targets function
library(Rsubread)
targets <- readTargets('../References/Targets.txt') # Reads in Target file that contains all information for input/output for the short read alignment step. The function is part of the Limma package
buildindex(basename = "TAIR10", reference = '../References/TAIR10_chr_all.fasta') # builds index of genome fasta reference
align(index = 'TAIR10', readfile1 = targets$Read1, readfile2 = targets$Read2, input_format = 'gzFASTQ', output_format = 'BAM', output_file = targets$OutputFile, tieBreakHamming=TRUE, unique=TRUE, indels=5, nthreads = 16) # map reads allow for indels up to 5bp
fc <- featureCounts(files = targets$OutputFile, annot.ext = "/proj/b2014273/private/BILS/References/TAIR4.gtf",isGTFAnnotationFile = TRUE, isPairedEnd = TRUE, strandSpecific = 2, requireBothEndsMapped = FALSE, GTF.featureType = "exon", GTF.attrType = 'gene_id') # count the reads mapping to exons and summarize over genes
library(limma)
library(edgeR)
load('Analysis.rdata') # Read in the data
rm(list = c('y','design','TQ.dge.list','TQ.dge.list.rpkm','treat','isexpr','targets','fit')) # make sure that the name space are not containing old objects with same naming
TQ.dge.list <- DGEList(counts=fc$counts, genes=fc$annotation[,c("GeneID","Length")]) # Convert counts to dge object
TQ.dge.list <- calcNormFactors(TQ.dge.list) # Take library sizes into accaount
TQ.dge.list.rpkm <- rpkm(TQ.dge.list,TQ.dge.list$genes$Length) # create object with rpkm values estimated from the counts
isexpr <- rowSums(cpm(TQ.dge.list) > 5) >= 3 # Create filter genes that have less than 5 counts per willion in at least three libraries
TQ.dge.list <- TQ.dge.list[isexpr,] # Do the actual filtering
targets <- readTargets('../Reference/Targets.txt', sep = ' ') # Read in info on soamples and experimental design
treat <- factor(targets$Treatment) # Treatment as factor in stat model
rep <- factor(targets$Replicate) # Replicate .g experiment as factor in model
design.2 <- model.matrix(~0 + rep + treat) # create the model matrix. Expression of a gene is the effect of rep + treat
y.2 <- voom(TQ.dge.list,design.2) # Transform counts to log2-counts and estimate mean-variance relationships. Makes data suitable for linear modelling
plotMDS(y.2)
fit.2 <- lmFit(y.2,design.2) # fit the model
fit.2 <- eBayes(fit.2)
IAA.DMSO.cont.2 <- makeContrasts(treatIAA_50 - treatDMSO_mock, levels= design.2) # extract contrast of interest
IAA.DMSO.2 <- contrasts.fit(fit.2, IAA.DMSO.cont.2) # extract contrast from full fitted model
IAA.DMSO.2 <- eBayes(IAA.DMSO.2) # Compute stats using empirical Bayes moderation
topTableF(IAA.DMSO.2, adjust="BH") # A look at the top liot of differentially exprssed genes
## GeneID Length treatIAA_50...treatDMSO_mock AveExpr
## AT5G47370.1 AT5G47370.1 1284 2.0793921 8.278883
## AT3G23030.1 AT3G23030.1 941 1.2271537 7.177746
## AT5G08790.1 AT5G08790.1 1234 1.6877856 7.085074
## AT4G37390.1 AT4G37390.1 2131 2.5509457 7.015521
## AT5G63790.1 AT5G63790.1 1317 1.6759545 6.920436
## AT4G17460.1 AT4G17460.1 1211 2.5044397 4.902314
## AT1G78100.1 AT1G78100.1 1318 1.4749252 6.746593
## AT3G62100.1 AT3G62100.1 731 1.4917814 4.799656
## AT2G29420.1 AT2G29420.1 887 0.8658984 6.539921
## AT1G12580.1 AT1G12580.1 1972 0.8487150 5.600427
## F P.Value adj.P.Val
## AT5G47370.1 1003.2279 2.340138e-16 2.713623e-12
## AT3G23030.1 648.8865 8.309455e-15 4.817822e-11
## AT5G08790.1 568.6986 2.432394e-14 9.402012e-11
## AT4G37390.1 506.2851 6.248288e-14 1.811379e-10
## AT5G63790.1 377.6137 6.644866e-13 1.375280e-09
## AT4G17460.1 374.4015 7.115972e-13 1.375280e-09
## AT1G78100.1 259.3400 1.318254e-11 2.183781e-08
## AT3G62100.1 250.1701 1.749845e-11 2.536400e-08
## AT2G29420.1 229.5698 3.431993e-11 4.421933e-08
## AT1G12580.1 212.5317 6.261948e-11 7.216451e-08
resultsIAA <- decideTests(IAA.DMSO.2, adjust.method = 'BH') # How many are oign. up- and downregulated
write.fit(IAA.DMSO.2, results = resultsIAA, file = 'IAA.csv' , digits = 20, sep = ',', adjust = 'BH') # save results
Hit45.DMSO.cont.2 <- makeContrasts(treatHit45_50 - treatDMSO_mock, levels= design.2)
Hit45.DMSO.2 <- contrasts.fit(fit.2, Hit45.DMSO.cont.2)
Hit45.DMSO.2 <- eBayes(Hit45.DMSO.2)
topTableF(Hit45.DMSO.2, adjust="BH")
## GeneID Length treatHit45_50...treatDMSO_mock AveExpr
## AT5G08790.1 AT5G08790.1 1234 2.1718231 7.085074
## AT2G18700.1 AT2G18700.1 2800 1.1799491 7.156457
## AT5G54490.1 AT5G54490.1 528 2.4145587 5.046543
## AT4G22710.1 AT4G22710.1 1787 1.6967262 6.947000
## AT4G37390.1 AT4G37390.1 2131 2.5868164 7.015521
## AT5G63790.1 AT5G63790.1 1317 1.9086805 6.920436
## AT1G31130.1 AT1G31130.1 1495 0.8476447 7.817863
## AT2G41410.1 AT2G41410.1 1157 1.4830376 7.561215
## AT5G47370.1 AT5G47370.1 1284 1.3446698 8.278883
## AT4G34150.1 AT4G34150.1 1171 1.3859837 6.853141
## F P.Value adj.P.Val
## AT5G08790.1 955.9919 3.478961e-16 4.034203e-12
## AT2G18700.1 661.4033 7.110207e-15 4.122498e-11
## AT5G54490.1 620.2332 1.200752e-14 4.641305e-11
## AT4G22710.1 545.8257 3.395257e-14 9.842849e-11
## AT4G37390.1 519.9162 5.037835e-14 1.168375e-10
## AT5G63790.1 492.5951 7.801400e-14 1.507751e-10
## AT1G31130.1 450.2505 1.613296e-13 2.672540e-10
## AT2G41410.1 434.4379 2.152577e-13 3.120161e-10
## AT5G47370.1 408.0428 3.565855e-13 4.594406e-10
## AT4G34150.1 402.1902 4.005123e-13 4.644340e-10
resultsHit45 <- decideTests(Hit45.DMSO.2, adjust.method = 'BH')
write.fit(Hit45.DMSO.2, results = resultsHit45, file = 'Hit45.csv' , digits = 20, sep = ',', adjust = 'BH')
A60D09.DMSO.cont.2 <- makeContrasts(treatA60D09_50 - treatDMSO_mock, levels= design.2)
A60D09.DMSO.2 <- contrasts.fit(fit.2, A60D09.DMSO.cont.2)
A60D09.DMSO.2 <- eBayes(A60D09.DMSO.2)
topTableF(A60D09.DMSO.2, adjust="BH")
## GeneID Length treatA60D09_50...treatDMSO_mock AveExpr
## AT2G18700.1 AT2G18700.1 2800 1.419974 7.156457
## AT5G08790.1 AT5G08790.1 1234 2.141623 7.085074
## AT4G22710.1 AT4G22710.1 1787 2.016704 6.947000
## AT5G54490.1 AT5G54490.1 528 2.512516 5.046543
## AT3G13310.1 AT3G13310.1 834 1.619399 7.092454
## AT5G37260.1 AT5G37260.1 980 2.295896 4.422890
## AT2G38470.1 AT2G38470.1 1902 4.194540 5.768547
## AT4G37390.1 AT4G37390.1 2131 2.586716 7.015521
## AT3G23030.1 AT3G23030.1 941 1.013491 7.177746
## AT4G38620.1 AT4G38620.1 1287 4.188742 2.672706
## F P.Value adj.P.Val
## AT2G18700.1 968.3029 3.131655e-16 2.554049e-12
## AT5G08790.1 928.9223 4.405051e-16 2.554049e-12
## AT4G22710.1 779.5571 1.855153e-15 7.170783e-12
## AT5G54490.1 673.8216 6.108724e-15 1.770919e-11
## AT3G13310.1 639.0534 9.411291e-15 2.182667e-11
## AT5G37260.1 606.1105 1.448524e-14 2.799514e-11
## AT2G38470.1 554.3708 2.992843e-14 4.957857e-11
## AT4G37390.1 520.1640 5.018408e-14 7.274182e-11
## AT3G23030.1 436.5394 2.070440e-13 2.667647e-10
## AT4G38620.1 425.4245 2.548787e-13 2.955573e-10
resultsA60D09 <- decideTests(A60D09.DMSO.2, adjust.method = 'BH')
write.fit(A60D09.DMSO.2, results = resultsA60D09, file = 'A60D09.csv' , digits = 20, sep = ',', adjust = 'BH')
#volcanoplot(A60D09.DMSO.2)
sessionInfo()
## R version 3.1.3 (2015-03-09)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.3 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] edgeR_3.8.6 limma_3.22.7
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.8 evaluate_0.6 formatR_1.1 htmltools_0.2.6
## [5] knitr_1.9 rmarkdown_0.5.1 stringr_0.6.2 tools_3.1.3
## [9] yaml_2.1.13