# To read feature counts data
library(edgeR)
## Loading required package: limma
library(limma)
library(RColorBrewer)
## Importing gene expression (features count) data
Dataset = read.table("Tidy_FeatureCounts.txt", header = TRUE, sep = "\t",row.names = 1,
                          as.is = TRUE)
## Importing sample Info (drug treatment and cell lines info)
SampleInfo = read.table("SampleInfo.txt", header = TRUE, sep = "\t")
SampleInfo = as.matrix(SampleInfo)

# Using DGElist function from edgeR package so that it would be easy to do further analysis
DGE = edgeR::DGEList(counts = Dataset, lib.size = colSums(Dataset),
                     norm.factors = calcNormFactors(Dataset), samples = SampleInfo,group = NULL,
                     genes = NULL, remove.zeros = FALSE)
# more normalization factor value for less library size samples and viceversa was generated by
# calcNormFactors in the DGEList function

cpm = edgeR::cpm(DGE)
# while log transformation, 0.25 is added to 0 values to overcome log transformation of zero values
lcpm = edgeR::cpm(DGE, log = TRUE)

# to show count of genes that are not at all expressed in all the samples
# Total number of samples in this analysis are 16
table(rowSums(DGE$counts == 0)== 16) 
## 
## FALSE  TRUE 
## 42808 15573
# Result is 42808 genes are expressed, 15573 genes not at all expressed

# To remove low expressed genes. The cut-off is cpm has to be >1 across 20 % of samples or the size
# of the group with less number of samples. The 20 % of my total samples(n = 16)is 3 samples
# and 16937 genes were passed this filter
keep.exprs = rowSums(cpm > 1)>= 3
DGE_FeatureCounts_CPM_GT_1_GT_3Samples = DGE[keep.exprs, ,keep.lib.sizes = FALSE]
## To show distribution for raw data and filtered data
# For Raw data
nsamples = ncol(DGE_FeatureCounts_CPM_GT_1_GT_3Samples)
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
col = brewer.pal(nsamples, "Dark2")
## Warning in brewer.pal(nsamples, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
par(mfrow = c(1,2))
plot(density(lcpm[,1]), col = col[1],lwd = 2, ylim = c(0,0.21),las = 2, main = "",
     xlab = "")
title(main = "A.Raw data", xlab = "Log-cpm")
abline(v = 0, lty = 3)
for(i in 2:nsamples){
  den = density(lcpm[,i])
  lines(den$x, den$y, col = col[i],lwd = 2)
}
# For Filtered data
lcpm_ExpressedGenes_CPM_GT_1_GT_3samples = edgeR::cpm(DGE_FeatureCounts_CPM_GT_1_GT_3Samples,
                                                      log = TRUE)
plot(density(lcpm_ExpressedGenes_CPM_GT_1_GT_3samples[,1]), col = col[1],lwd = 2, ylim = c(0,0.21),
     las = 2, main = "", xlab = "")
title(main="B. Filtered data", xlab="Log-cpm")
abline(v = 0, lty = 3)
for(i in 2:nsamples){
  den = density(lcpm_ExpressedGenes_CPM_GT_1_GT_3samples[,i])
  lines(den$x, den$y, col = col[i],lwd = 2)
}

# TMM normalisation (for overcoming library size differences among samples)
## The TMM normalisation was done on feature counts data and then that data is transformed to 
                                                                               #log cpm values
# Before TMM normalisation
par(mfrow = c (1,2))
boxplot(lcpm_ExpressedGenes_CPM_GT_1_GT_3samples, las = 2,col = col, main = "")
title(main="A. Before TMM Normalisation", ylab="Log-cpm")

# After TMM normalisation
DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation = 
  edgeR::calcNormFactors(DGE_FeatureCounts_CPM_GT_1_GT_3Samples,method = "TMM")

lcpm_ExpressedGenes_CPM_GT_1_GT_3samples_TMM_Normalised = 
  edgeR::cpm(DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation, log = TRUE)

boxplot(lcpm_ExpressedGenes_CPM_GT_1_GT_3samples_TMM_Normalised,las = 2, col = col,main = "")
title(main="A. After TMM Normalisation", ylab="Log-cpm")

# MDS plots (Unsupervised Clustering of samples)
#par(mfrow = c (1,2))
# to create a 5 x 5 inch image
png("MDS_plot.png",width=11*300,height=7*300,res = 300, pointsize = 8)
plotMDS(lcpm_ExpressedGenes_CPM_GT_1_GT_3samples_TMM_Normalised, 
        labels = DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation$samples$Group,
        col = col)
dev.off() #close the PNG device. otherwise, we could not open the saved PNG image on our computer
## quartz_off_screen 
##                 2
# voom ( to take out mean variance relationship)
Design = model.matrix(~0+DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation$samples$Group)
colnames(Design)= gsub("DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation$samples$Group"
                       ,"",colnames(Design), fixed = TRUE)



v = voom(DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation,
         normalize.method="none",plot = TRUE)

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.3
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] RColorBrewer_1.1-2 edgeR_3.24.3       limma_3.38.3      
## 
## loaded via a namespace (and not attached):
##  [1] locfit_1.5-9.1  Rcpp_1.0.0      lattice_0.20-38 digest_0.6.18  
##  [5] grid_3.5.2      magrittr_1.5    evaluate_0.12   highr_0.7      
##  [9] stringi_1.2.4   rmarkdown_1.11  tools_3.5.2     stringr_1.3.1  
## [13] xfun_0.4        compiler_3.5.2  htmltools_0.3.6 knitr_1.21
################ R objects description in this project ############################
######## part 1: all genes (expressed genes + non expressed genes)
# Dataset : Feature counts
# SampleInfo : Clinical Info
# DEG : is a object which has feature counts + clinical Info + normalisation factor
# cpm = cpm values  for all feature counts
# lcpm = logcpm values for all feature counts
####### part 2 : only expressed genes i.e genes with CPM > 1 >= 20 % samples or group with smallsize
#### This R objects holds only expressed genes
# keep.exprs = logical to keep only expressed genes
# DGE_FeatureCounts_CPM_GT_1_GT_3Samples : feature counts of genes whose CPM >1 >= 3 samples(20% of
                                           # samples)
# lcpm_ExpressedGenes_CPM_GT_1_GT_3samples = log cpm values of 16 thousand genes. This 16 thousand
                                             # genes will have CPM >1 >= 3 samples

# DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation = same as 
                                                             #DGE_FeatureCounts_CPM_GT_1_GT_3Samples
                                                             # but TMM normalised

# lcpm_ExpressedGenes_CPM_GT_1_GT_3samples_TMM_Normalised = same as 
                                                           #lcpm_ExpressedGenes_CPM_GT_1_GT_3samples
                                                           # but TMM normalised