# To read feature counts data
library(edgeR)
## Loading required package: limma
library(limma)
library(RColorBrewer)
## Importing gene expression (features count) data
Dataset = read.table("Tidy_FeatureCounts.txt", header = TRUE, sep = "\t",row.names = 1,
as.is = TRUE)
## Importing sample Info (drug treatment and cell lines info)
SampleInfo = read.table("SampleInfo.txt", header = TRUE, sep = "\t")
SampleInfo = as.matrix(SampleInfo)
# Using DGElist function from edgeR package so that it would be easy to do further analysis
DGE = edgeR::DGEList(counts = Dataset, lib.size = colSums(Dataset),
norm.factors = calcNormFactors(Dataset), samples = SampleInfo,group = NULL,
genes = NULL, remove.zeros = FALSE)
# more normalization factor value for less library size samples and viceversa was generated by
# calcNormFactors in the DGEList function
cpm = edgeR::cpm(DGE)
# while log transformation, 0.25 is added to 0 values to overcome log transformation of zero values
lcpm = edgeR::cpm(DGE, log = TRUE)
# to show count of genes that are not at all expressed in all the samples
# Total number of samples in this analysis are 16
table(rowSums(DGE$counts == 0)== 16)
##
## FALSE TRUE
## 42808 15573
# Result is 42808 genes are expressed, 15573 genes not at all expressed
# To remove low expressed genes. The cut-off is cpm has to be >1 across 20 % of samples or the size
# of the group with less number of samples. The 20 % of my total samples(n = 16)is 3 samples
# and 16937 genes were passed this filter
keep.exprs = rowSums(cpm > 1)>= 3
DGE_FeatureCounts_CPM_GT_1_GT_3Samples = DGE[keep.exprs, ,keep.lib.sizes = FALSE]
## To show distribution for raw data and filtered data
# For Raw data
nsamples = ncol(DGE_FeatureCounts_CPM_GT_1_GT_3Samples)
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
col = brewer.pal(nsamples, "Dark2")
## Warning in brewer.pal(nsamples, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
par(mfrow = c(1,2))
plot(density(lcpm[,1]), col = col[1],lwd = 2, ylim = c(0,0.21),las = 2, main = "",
xlab = "")
title(main = "A.Raw data", xlab = "Log-cpm")
abline(v = 0, lty = 3)
for(i in 2:nsamples){
den = density(lcpm[,i])
lines(den$x, den$y, col = col[i],lwd = 2)
}
# For Filtered data
lcpm_ExpressedGenes_CPM_GT_1_GT_3samples = edgeR::cpm(DGE_FeatureCounts_CPM_GT_1_GT_3Samples,
log = TRUE)
plot(density(lcpm_ExpressedGenes_CPM_GT_1_GT_3samples[,1]), col = col[1],lwd = 2, ylim = c(0,0.21),
las = 2, main = "", xlab = "")
title(main="B. Filtered data", xlab="Log-cpm")
abline(v = 0, lty = 3)
for(i in 2:nsamples){
den = density(lcpm_ExpressedGenes_CPM_GT_1_GT_3samples[,i])
lines(den$x, den$y, col = col[i],lwd = 2)
}

# TMM normalisation (for overcoming library size differences among samples)
## The TMM normalisation was done on feature counts data and then that data is transformed to
#log cpm values
# Before TMM normalisation
par(mfrow = c (1,2))
boxplot(lcpm_ExpressedGenes_CPM_GT_1_GT_3samples, las = 2,col = col, main = "")
title(main="A. Before TMM Normalisation", ylab="Log-cpm")
# After TMM normalisation
DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation =
edgeR::calcNormFactors(DGE_FeatureCounts_CPM_GT_1_GT_3Samples,method = "TMM")
lcpm_ExpressedGenes_CPM_GT_1_GT_3samples_TMM_Normalised =
edgeR::cpm(DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation, log = TRUE)
boxplot(lcpm_ExpressedGenes_CPM_GT_1_GT_3samples_TMM_Normalised,las = 2, col = col,main = "")
title(main="A. After TMM Normalisation", ylab="Log-cpm")

# MDS plots (Unsupervised Clustering of samples)
#par(mfrow = c (1,2))
# to create a 5 x 5 inch image
png("MDS_plot.png",width=11*300,height=7*300,res = 300, pointsize = 8)
plotMDS(lcpm_ExpressedGenes_CPM_GT_1_GT_3samples_TMM_Normalised,
labels = DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation$samples$Group,
col = col)
dev.off() #close the PNG device. otherwise, we could not open the saved PNG image on our computer
## quartz_off_screen
## 2
# voom ( to take out mean variance relationship)
Design = model.matrix(~0+DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation$samples$Group)
colnames(Design)= gsub("DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation$samples$Group"
,"",colnames(Design), fixed = TRUE)
v = voom(DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation,
normalize.method="none",plot = TRUE)

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] RColorBrewer_1.1-2 edgeR_3.24.3 limma_3.38.3
##
## loaded via a namespace (and not attached):
## [1] locfit_1.5-9.1 Rcpp_1.0.0 lattice_0.20-38 digest_0.6.18
## [5] grid_3.5.2 magrittr_1.5 evaluate_0.12 highr_0.7
## [9] stringi_1.2.4 rmarkdown_1.11 tools_3.5.2 stringr_1.3.1
## [13] xfun_0.4 compiler_3.5.2 htmltools_0.3.6 knitr_1.21
################ R objects description in this project ############################
######## part 1: all genes (expressed genes + non expressed genes)
# Dataset : Feature counts
# SampleInfo : Clinical Info
# DEG : is a object which has feature counts + clinical Info + normalisation factor
# cpm = cpm values for all feature counts
# lcpm = logcpm values for all feature counts
####### part 2 : only expressed genes i.e genes with CPM > 1 >= 20 % samples or group with smallsize
#### This R objects holds only expressed genes
# keep.exprs = logical to keep only expressed genes
# DGE_FeatureCounts_CPM_GT_1_GT_3Samples : feature counts of genes whose CPM >1 >= 3 samples(20% of
# samples)
# lcpm_ExpressedGenes_CPM_GT_1_GT_3samples = log cpm values of 16 thousand genes. This 16 thousand
# genes will have CPM >1 >= 3 samples
# DGE_FeatureCounts_CPM_GT_1_GT_3Samples_TMM_Normalisation = same as
#DGE_FeatureCounts_CPM_GT_1_GT_3Samples
# but TMM normalised
# lcpm_ExpressedGenes_CPM_GT_1_GT_3samples_TMM_Normalised = same as
#lcpm_ExpressedGenes_CPM_GT_1_GT_3samples
# but TMM normalised