# setting the working directory
setwd("/Users/pratap/Desktop/Oi_Fah_Eye_Project/QuantificationFiles/")
## To install edgeR and limma R packages
source("https://bioconductor.org/biocLite.R")
## Bioconductor version 3.6 (BiocInstaller 1.28.0), ?biocLite for help
## A new version of Bioconductor is available after installing the most
## recent version of R; see http://bioconductor.org/install
biocLite(c("edgeR","limma","tximport"))
## BioC_mirror: https://bioconductor.org
## Using Bioconductor 3.6 (BiocInstaller 1.28.0), R 3.4.3 (2017-11-30).
## Installing package(s) 'edgeR', 'limma', 'tximport'
##
## The downloaded binary packages are in
## /var/folders/m5/7xqcr_2j3gb0zn6w32n7t_lr0000gp/T//Rtmpdnxpvc/downloaded_packages
## Old packages: 'cli', 'data.table', 'doParallel', 'htmlwidgets', 'nloptr',
## 'Rcpp', 'robustbase', 'rvcheck', 'units'
library(edgeR)
## Loading required package: limma
library(limma)
library(GenomicFeatures)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
##
## plotMA
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colMeans,
## colnames, colSums, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply,
## lengths, Map, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
## rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library(readr)
library(rjson)
## Warning: package 'rjson' was built under R version 3.4.4
### TXIMPORT ANALYSIS
# create a transcript to gene matching table (tx2gene) that will be used to aggregate Salmon transcript quantifications to gene level summary
txdb = makeTxDbFromGFF("gencode.v28.annotation.gtf")
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(type, phase): The "phase" metadata column contains non-NA values for features of
## type stop_codon. This information was ignored.
## OK
k = keys(txdb,keytype = "GENEID")
df <- select(txdb, keys = k, columns = "TXNAME", keytype = "GENEID")
## 'select()' returned 1:many mapping between keys and columns
tx2gene <- df[, 2:1]
# Tximport
# listing the files which has pattern "quant_slim.sf". When full.names is TRUE, they return the path of the files also
files <- list.files( pattern = "quant_slim.sf",full.names = TRUE)
# renaming the filenames as sample 1 to sample 2
names(files) <- paste0("sample", 1:30)
write.table(files,"samplematchingInfo_OiFahProject.txt",sep = "\t",col.names = TRUE, quote = F )
# to check the existence of a sf quantification files in the current working directory. If it gives TRUE, they are present.
all(file.exists(files))
## [1] TRUE
# To create length scaled TPM values to pass this as input to limma
txi_lengthscaledTPMvalues = tximport::tximport(files,type = "salmon",txIn = TRUE, countsFromAbundance = "lengthScaledTPM",tx2gene = tx2gene,dropInfReps = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## summarizing abundance
## summarizing counts
## summarizing length
# Use txi_lengthscaledTPMvalues$counts as input to edgeR as suggested in tximport manual
# The method we used here is called "bias corrected count without an offset"
### LIMMA ANALYSIS
# Reading Clinical Info
clinicalInfo = read.table("ClinicalInfo_OiFahEyeProject.txt",header = TRUE, sep = "\t")
#clinicalInfo = as.matrix(clinicalInfo)
##### Very very important step
# sample names are shown as s1 to s30 by tximport. Now we have to rename the sample ids by looking at file object or samplematchingInfo_OiFahProject.txt
# Do this step very carefully; otherwise sample ids would be wrongly named.
colnames(txi_lengthscaledTPMvalues$counts) = c("S1","S10","S11","S12","S13","S14","S15","S16","S17","S18", "S19","S2","S20","S21","S22","S23","S24","S25","S26","S27","S28","S29","S3","S30","S4","S5","S6","S7","S8","S9")
colnames(txi_lengthscaledTPMvalues$abundance) = c("S1","S10","S11","S12","S13","S14","S15","S16","S17","S18", "S19","S2","S20","S21","S22","S23","S24","S25","S26","S27","S28","S29","S3","S30","S4","S5","S6","S7","S8","S9")
colnames(txi_lengthscaledTPMvalues$length) = c("S1","S10","S11","S12","S13","S14","S15","S16","S17","S18", "S19","S2","S20","S21","S22","S23","S24","S25","S26","S27","S28","S29","S3","S30","S4","S5","S6","S7","S8","S9")
## Reading the data and sample Information and saving as DGE list object
# we used the option lengthScaledTPM values in the tximport so that the counts generated were adjusted accoring to lengthscaled TPM values
# The authors recommended to use counts which are adjusted for lengthscaledTPMvalues as input to voom function
y = DGEList(counts = txi_lengthscaledTPMvalues$counts,lib.size = colSums(txi_lengthscaledTPMvalues$counts),norm.factors = calcNormFactors(txi_lengthscaledTPMvalues$counts),samples = clinicalInfo,group =clinicalInfo$Group,genes = NULL,remove.zeros = FALSE)
## Filtering lowly expressed genes
# This function is used in tximport manual to remove lowly expressed genes. so we have implemented from there directly
keep = filterByExpr(y)
y = y[keep,]
# Calculating normalization factors
y = calcNormFactors(y)
# adding 1 to the whole expression values to avoid zero values
# creating experimental design matrix
Design = model.matrix(~0+y$samples$group+y$samples$Age)
# To rename column names
colnames(Design) = c("Diseaseeighthours","Diseasefourhours","Diseasesixhours","Diseasetwohours","Diseasezerohours","Normaleighthours","Normalfourhours","Normalsixhours","Normaltwohours","Normalzerohours","Age")
# making contrasts
contrasts = makeContrasts(Disease_vs_Normal_0hours = Diseasezerohours-Normalzerohours, Disease_vs_Normal_2hours = Diseasetwohours-Normaltwohours, Disease_vs_Normal_4hours = Diseasefourhours-Normalfourhours, Disease_vs_Normal_6hours = Diseasesixhours-Normalsixhours, Disease_vs_Normal_8hours = Diseaseeighthours-Normaleighthours, levels = colnames(Design))
# voom function
# we have given rawcounts (adjusted by lengthscaledTPM) to voom with out adding any prior count values.
# The voom function automatically tranforms raw counts to log CPM values
# After looking voom function code, i understood that it has added 0.5 values
v = voom(y,design = Design)
write.table(v$E, "v$E.txt",sep = "\t", col.names = NA,quote = F)
#usual limma pipeline
# Fitting linear model to our data
vfit = lmFit(v, Design)
vfit_afterfit = contrasts.fit(vfit,contrasts = contrasts)
efit = eBayes(vfit_afterfit)
plotSA(efit)

summary(decideTests(efit))
## Disease_vs_Normal_0hours Disease_vs_Normal_2hours
## Down 0 2
## NotSig 18421 18419
## Up 0 0
## Disease_vs_Normal_4hours Disease_vs_Normal_6hours
## Down 0 0
## NotSig 18421 18421
## Up 0 0
## Disease_vs_Normal_8hours
## Down 1
## NotSig 18420
## Up 0
#Access Results
voom.Results_Disease_vs_Normal_0hours_28thsep2018 = topTable(efit,coef = 1,adjust.method = "BH",number = nrow(y$counts))
voom.Results_Disease_vs_Normal_2hours_28thsep2018 = topTable(efit,coef = 2,adjust.method = "BH",number = nrow(y$counts))
voom.Results_Disease_vs_Normal_4hours_28thsep2018 = topTable(efit,coef = 3,adjust.method = "BH",number = nrow(y$counts))
voom.Results_Disease_vs_Normal_6hours_28thsep2018 = topTable(efit,coef = 4,adjust.method = "BH",number = nrow(y$counts))
voom.Results_Disease_vs_Normal_8hours_28thsep2018 = topTable(efit,coef = 5,adjust.method = "BH",number = nrow(y$counts))
write.table(voom.Results_Disease_vs_Normal_0hours_28thsep2018, "voom.Results_Disease_vs_Normal_0hours_28thsep2018.txt",sep = "\t",col.names = NA,quote = F)
write.table(voom.Results_Disease_vs_Normal_2hours_28thsep2018, "voom.Results_Disease_vs_Normal_2hours_28thsep2018.txt",sep = "\t",col.names = NA,quote = F)
write.table(voom.Results_Disease_vs_Normal_4hours_28thsep2018, "voom.Results_Disease_vs_Normal_4hours_28thsep2018.txt",sep = "\t",col.names = NA,quote = F)
write.table(voom.Results_Disease_vs_Normal_6hours_28thsep2018, "voom.Results_Disease_vs_Normal_6hours_28thsep2018.txt",sep = "\t",col.names = NA,quote = F)
write.table(voom.Results_Disease_vs_Normal_8hours_28thsep2018, "voom.Results_Disease_vs_Normal_8hours_28thsep2018.txt",sep = "\t",col.names = NA,quote = F)
### Covariates(age) adjusted log cpm values
v.batcheffect.eyes = removeBatchEffect(v,batch = NULL,covariates = y$samples$Age,design = model.matrix(~0+y$samples$group))
# Gene expression data was adjusted for covariate age and saved as text file
write.table(v.batcheffect.eyes, "Covariate_Adjusted_LogCPMvalues_Expressiondata.txt", sep = "\t",col.names = NA, quote = FALSE)
# To save packages used and what version we have used
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] rjson_0.2.20 readr_1.1.1 GenomicFeatures_1.30.3
## [4] AnnotationDbi_1.40.0 Biobase_2.38.0 GenomicRanges_1.30.3
## [7] GenomeInfoDb_1.14.0 IRanges_2.12.0 S4Vectors_0.16.0
## [10] BiocGenerics_0.24.0 edgeR_3.20.9 limma_3.34.9
## [13] BiocInstaller_1.28.0
##
## loaded via a namespace (and not attached):
## [1] tximport_1.6.0 SummarizedExperiment_1.8.1
## [3] progress_1.2.0 locfit_1.5-9.1
## [5] lattice_0.20-35 htmltools_0.3.6
## [7] rtracklayer_1.38.3 blob_1.1.1
## [9] XML_3.98-1.16 rlang_0.2.2
## [11] pillar_1.3.0 DBI_1.0.0
## [13] BiocParallel_1.12.0 bit64_0.9-7
## [15] matrixStats_0.54.0 GenomeInfoDbData_1.0.0
## [17] stringr_1.3.1 zlibbioc_1.24.0
## [19] Biostrings_2.46.0 memoise_1.1.0
## [21] evaluate_0.11 knitr_1.20
## [23] biomaRt_2.34.2 Rcpp_0.12.18
## [25] backports_1.1.2 DelayedArray_0.4.1
## [27] XVector_0.18.0 bit_1.1-14
## [29] Rsamtools_1.30.0 RMySQL_0.10.15
## [31] hms_0.4.2 digest_0.6.17
## [33] stringi_1.2.4 grid_3.4.3
## [35] rprojroot_1.3-2 tools_3.4.3
## [37] bitops_1.0-6 magrittr_1.5
## [39] RCurl_1.95-4.11 RSQLite_2.1.1
## [41] tibble_1.4.2 crayon_1.3.4
## [43] pkgconfig_2.0.2 Matrix_1.2-14
## [45] prettyunits_1.0.2 assertthat_0.2.0
## [47] rmarkdown_1.10 httr_1.3.1
## [49] R6_2.2.2 GenomicAlignments_1.14.2
## [51] compiler_3.4.3