# setting the working directory
setwd("/Users/pratap/Desktop/Oi_Fah_Eye_Project/QuantificationFiles/")
## To install edgeR and limma R packages
source("https://bioconductor.org/biocLite.R")
## Bioconductor version 3.6 (BiocInstaller 1.28.0), ?biocLite for help
## A new version of Bioconductor is available after installing the most
##   recent version of R; see http://bioconductor.org/install
biocLite(c("edgeR","limma","tximport"))
## BioC_mirror: https://bioconductor.org
## Using Bioconductor 3.6 (BiocInstaller 1.28.0), R 3.4.3 (2017-11-30).
## Installing package(s) 'edgeR', 'limma', 'tximport'
## 
## The downloaded binary packages are in
##  /var/folders/m5/7xqcr_2j3gb0zn6w32n7t_lr0000gp/T//Rtmpdnxpvc/downloaded_packages
## Old packages: 'cli', 'data.table', 'doParallel', 'htmlwidgets', 'nloptr',
##   'Rcpp', 'robustbase', 'rvcheck', 'units'
library(edgeR)
## Loading required package: limma
library(limma)
library(GenomicFeatures)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
## 
##     plotMA
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
library(readr)
library(rjson)
## Warning: package 'rjson' was built under R version 3.4.4
### TXIMPORT ANALYSIS
# create a transcript to gene matching table (tx2gene) that will be used to aggregate Salmon transcript quantifications to gene level summary
txdb = makeTxDbFromGFF("gencode.v28.annotation.gtf")
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(type, phase): The "phase" metadata column contains non-NA values for features of
##   type stop_codon. This information was ignored.
## OK
k = keys(txdb,keytype = "GENEID")
df <- select(txdb, keys = k,  columns = "TXNAME", keytype = "GENEID")
## 'select()' returned 1:many mapping between keys and columns
tx2gene <- df[, 2:1]
# Tximport 
# listing the files which has pattern "quant_slim.sf". When full.names is TRUE, they return the path of the files also 
files <- list.files( pattern = "quant_slim.sf",full.names = TRUE)
# renaming the filenames as sample 1 to sample 2
names(files) <- paste0("sample", 1:30)
write.table(files,"samplematchingInfo_OiFahProject.txt",sep = "\t",col.names = TRUE, quote = F )
# to check the existence of a sf quantification files in the current working directory. If it gives TRUE, they are present.
all(file.exists(files))
## [1] TRUE
# To create length scaled TPM values to pass this as input to limma
txi_lengthscaledTPMvalues = tximport::tximport(files,type = "salmon",txIn = TRUE, countsFromAbundance = "lengthScaledTPM",tx2gene = tx2gene,dropInfReps = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 
## summarizing abundance
## summarizing counts
## summarizing length
# Use txi_lengthscaledTPMvalues$counts as input to edgeR as suggested in tximport manual
# The method we used here is called "bias corrected count without an offset"
### LIMMA ANALYSIS 
# Reading Clinical Info
clinicalInfo = read.table("ClinicalInfo_OiFahEyeProject.txt",header = TRUE, sep = "\t")
#clinicalInfo = as.matrix(clinicalInfo)


##### Very very important step 
# sample names are shown as s1 to s30 by tximport. Now we have to rename the sample ids by looking at file object or samplematchingInfo_OiFahProject.txt
# Do this step very carefully; otherwise sample ids would be wrongly named.
colnames(txi_lengthscaledTPMvalues$counts) = c("S1","S10","S11","S12","S13","S14","S15","S16","S17","S18", "S19","S2","S20","S21","S22","S23","S24","S25","S26","S27","S28","S29","S3","S30","S4","S5","S6","S7","S8","S9")
colnames(txi_lengthscaledTPMvalues$abundance) = c("S1","S10","S11","S12","S13","S14","S15","S16","S17","S18", "S19","S2","S20","S21","S22","S23","S24","S25","S26","S27","S28","S29","S3","S30","S4","S5","S6","S7","S8","S9")
colnames(txi_lengthscaledTPMvalues$length) = c("S1","S10","S11","S12","S13","S14","S15","S16","S17","S18", "S19","S2","S20","S21","S22","S23","S24","S25","S26","S27","S28","S29","S3","S30","S4","S5","S6","S7","S8","S9")
## Reading the data and sample Information and saving as DGE list object
# we used the option lengthScaledTPM values in the tximport so that the counts generated were adjusted accoring to lengthscaled TPM values
# The authors recommended to use counts which are adjusted for lengthscaledTPMvalues as input to voom function
y = DGEList(counts = txi_lengthscaledTPMvalues$counts,lib.size = colSums(txi_lengthscaledTPMvalues$counts),norm.factors = calcNormFactors(txi_lengthscaledTPMvalues$counts),samples = clinicalInfo,group =clinicalInfo$Group,genes = NULL,remove.zeros = FALSE)
## Filtering lowly expressed genes
# This function is used in tximport manual to remove lowly expressed genes. so we have implemented from there directly
keep = filterByExpr(y)
y = y[keep,]
# Calculating normalization factors
y = calcNormFactors(y)
# adding 1 to the whole expression values to avoid zero values
# creating experimental design matrix
Design = model.matrix(~0+y$samples$group+y$samples$Age)
# To rename column names
colnames(Design) = c("Diseaseeighthours","Diseasefourhours","Diseasesixhours","Diseasetwohours","Diseasezerohours","Normaleighthours","Normalfourhours","Normalsixhours","Normaltwohours","Normalzerohours","Age")
# making contrasts 
contrasts = makeContrasts(Disease_vs_Normal_0hours = Diseasezerohours-Normalzerohours, Disease_vs_Normal_2hours = Diseasetwohours-Normaltwohours, Disease_vs_Normal_4hours = Diseasefourhours-Normalfourhours, Disease_vs_Normal_6hours = Diseasesixhours-Normalsixhours, Disease_vs_Normal_8hours = Diseaseeighthours-Normaleighthours, levels = colnames(Design))
# voom function
# we have given rawcounts (adjusted by lengthscaledTPM) to voom with out adding any prior count values.
# The voom function automatically tranforms raw counts to log CPM values
# After looking voom function code, i understood that it has added 0.5 values
v = voom(y,design = Design)
write.table(v$E, "v$E.txt",sep = "\t", col.names = NA,quote = F)
#usual limma pipeline
# Fitting linear model to our data
vfit = lmFit(v, Design)
vfit_afterfit = contrasts.fit(vfit,contrasts = contrasts)
efit = eBayes(vfit_afterfit)
plotSA(efit)

summary(decideTests(efit))
##        Disease_vs_Normal_0hours Disease_vs_Normal_2hours
## Down                          0                        2
## NotSig                    18421                    18419
## Up                            0                        0
##        Disease_vs_Normal_4hours Disease_vs_Normal_6hours
## Down                          0                        0
## NotSig                    18421                    18421
## Up                            0                        0
##        Disease_vs_Normal_8hours
## Down                          1
## NotSig                    18420
## Up                            0
#Access Results
voom.Results_Disease_vs_Normal_0hours_28thsep2018 = topTable(efit,coef = 1,adjust.method = "BH",number = nrow(y$counts))
voom.Results_Disease_vs_Normal_2hours_28thsep2018 = topTable(efit,coef = 2,adjust.method = "BH",number = nrow(y$counts))
voom.Results_Disease_vs_Normal_4hours_28thsep2018 = topTable(efit,coef = 3,adjust.method = "BH",number = nrow(y$counts))
voom.Results_Disease_vs_Normal_6hours_28thsep2018 = topTable(efit,coef = 4,adjust.method = "BH",number = nrow(y$counts))
voom.Results_Disease_vs_Normal_8hours_28thsep2018 = topTable(efit,coef = 5,adjust.method = "BH",number = nrow(y$counts))

write.table(voom.Results_Disease_vs_Normal_0hours_28thsep2018, "voom.Results_Disease_vs_Normal_0hours_28thsep2018.txt",sep = "\t",col.names = NA,quote = F)
write.table(voom.Results_Disease_vs_Normal_2hours_28thsep2018, "voom.Results_Disease_vs_Normal_2hours_28thsep2018.txt",sep = "\t",col.names = NA,quote = F)
write.table(voom.Results_Disease_vs_Normal_4hours_28thsep2018, "voom.Results_Disease_vs_Normal_4hours_28thsep2018.txt",sep = "\t",col.names = NA,quote = F)
write.table(voom.Results_Disease_vs_Normal_6hours_28thsep2018, "voom.Results_Disease_vs_Normal_6hours_28thsep2018.txt",sep = "\t",col.names = NA,quote = F)
write.table(voom.Results_Disease_vs_Normal_8hours_28thsep2018, "voom.Results_Disease_vs_Normal_8hours_28thsep2018.txt",sep = "\t",col.names = NA,quote = F)


### Covariates(age) adjusted log cpm values 
v.batcheffect.eyes = removeBatchEffect(v,batch = NULL,covariates = y$samples$Age,design = model.matrix(~0+y$samples$group))
# Gene expression data was adjusted for covariate age and saved as text file
write.table(v.batcheffect.eyes, "Covariate_Adjusted_LogCPMvalues_Expressiondata.txt", sep = "\t",col.names = NA, quote = FALSE)
# To save packages used and what version we have used
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] rjson_0.2.20           readr_1.1.1            GenomicFeatures_1.30.3
##  [4] AnnotationDbi_1.40.0   Biobase_2.38.0         GenomicRanges_1.30.3  
##  [7] GenomeInfoDb_1.14.0    IRanges_2.12.0         S4Vectors_0.16.0      
## [10] BiocGenerics_0.24.0    edgeR_3.20.9           limma_3.34.9          
## [13] BiocInstaller_1.28.0  
## 
## loaded via a namespace (and not attached):
##  [1] tximport_1.6.0             SummarizedExperiment_1.8.1
##  [3] progress_1.2.0             locfit_1.5-9.1            
##  [5] lattice_0.20-35            htmltools_0.3.6           
##  [7] rtracklayer_1.38.3         blob_1.1.1                
##  [9] XML_3.98-1.16              rlang_0.2.2               
## [11] pillar_1.3.0               DBI_1.0.0                 
## [13] BiocParallel_1.12.0        bit64_0.9-7               
## [15] matrixStats_0.54.0         GenomeInfoDbData_1.0.0    
## [17] stringr_1.3.1              zlibbioc_1.24.0           
## [19] Biostrings_2.46.0          memoise_1.1.0             
## [21] evaluate_0.11              knitr_1.20                
## [23] biomaRt_2.34.2             Rcpp_0.12.18              
## [25] backports_1.1.2            DelayedArray_0.4.1        
## [27] XVector_0.18.0             bit_1.1-14                
## [29] Rsamtools_1.30.0           RMySQL_0.10.15            
## [31] hms_0.4.2                  digest_0.6.17             
## [33] stringi_1.2.4              grid_3.4.3                
## [35] rprojroot_1.3-2            tools_3.4.3               
## [37] bitops_1.0-6               magrittr_1.5              
## [39] RCurl_1.95-4.11            RSQLite_2.1.1             
## [41] tibble_1.4.2               crayon_1.3.4              
## [43] pkgconfig_2.0.2            Matrix_1.2-14             
## [45] prettyunits_1.0.2          assertthat_0.2.0          
## [47] rmarkdown_1.10             httr_1.3.1                
## [49] R6_2.2.2                   GenomicAlignments_1.14.2  
## [51] compiler_3.4.3