#Reanalysis of gene expression analysis, or, best practices for skeptics
##################################################################
# This exercise is based on Gilad Y and Mizrahi-Man O. A reanalysis of mouse ENCODE comparative gene expression data [version 1; referees: 3 approved, 1 approved with reservations]. F1000Research 2015, 4:121 (doi: 10.12688/f1000research.6536.1)
# The source code that was used to construct the exercise corresponds to Supplementary information found in this location: https://zenodo.org/record/17606/files/Supplementary_text_2.txt
# Some adjustments were made for installing and loading packages
##################################################################
# R commands to reanalyze the data from Lin et al (2014) starting from the raw fragment counts
# IF A PACKAGE DOES NOT EXIST IN YOUR R, TRY THIS
# install.packages("nameofprogram",repos = 'http://cran.us.r-project.org')
# if that doesn't work, try this
# source("https://bioconductor.org/biocLite.R")
# biocLite("nameofpackage")
# if that still doesn't work, email me......
# download the following directory and extract the files on your desktop
# http://teaching.cgrb.oregonstate.edu/MCB/Rhodes/RNA_Seq_Winter_2016/Lin_ex.zip
# these next two lines (not visible in the notebook, visible in the code) are so I can create a nice notebook for your class, this would not be required for your code to work

the code for class begins below if you have moved the directory to a different location, make sure you setwd to that location

#setwd("~/Desktop/Lin_ex") #MAC
setwd("C:/Users/rhodesad/Desktop/Lin_ex") #PC
  1. load data
datasets = as.data.frame(scan("Stanford_datasets.txt",list(setname="",seqBatch="",species="",tissue=""),sep="\t"))
rawCounts <- as.matrix(read.table('Stanford_datasets_rawCountsMat.txt',header=FALSE,sep='\t'))
colnames(rawCounts) <- datasets$setname
geneDetails <- as.data.frame(scan("ortholog_GC_table.txt",skip=1,list(mouse_name="",mouse_GC = 0.0,human_name = "",human_GC=0.0)))
rownames(rawCounts) <- geneDetails$human_name 
rownames(datasets) <- datasets$setname
rownames(geneDetails) <- geneDetails$human_name 
  1. filter out lower 30% + mitochondrial genes
rowSums = apply(rawCounts,1,function(x) sum(x))
quantile(rowSums,probs = 0.3) # result is 2947.9 
##    30% 
## 2947.9
filter <- apply(rawCounts,1,function(x) sum(x)>2947.9 )
mt <- grep("mt-",geneDetails$mouse_name)
filteredNames <- setdiff(rownames(rawCounts[filter,]),rownames(rawCounts[mt,])) 
filteredRawCounts <- rawCounts[filteredNames,]
  1. Use EDASeq to normalize data within lanes, accounting for GC content
install.packages("Hmisc",repos = 'http://cran.us.r-project.org')
## package 'Hmisc' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\rhodesad\AppData\Local\Temp\Rtmp4qmCWm\downloaded_packages

answer NO, we want the binary, install from source is broken

source("https://bioconductor.org/biocLite.R")
biocLite("EDASeq")
## package 'EDASeq' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\rhodesad\AppData\Local\Temp\Rtmp4qmCWm\downloaded_packages

answer all answer NO, we want the binary, install from source is broken

library(EDASeq)
GCnormCounts <- filteredRawCounts
GCnormCounts[,1:13] <- withinLaneNormalization(filteredRawCounts[,1:13],geneDetails[filteredNames,"human_GC"],which="loess",round=TRUE)
GCnormCounts[,14:26] <- withinLaneNormalization(filteredRawCounts[,14:26],geneDetails[filteredNames,"mouse_GC"],which="loess",round=TRUE)
  1. depth normalize,using TMM scaling factors - divide by sum, then multiply by mean of sums uncomment the following if edgeR not installed biocLite(“edgeR”)
library(edgeR)
origColSums <- apply(rawCounts,2,function(x) sum(x))
normFactors <- calcNormFactors(GCnormCounts,method='TMM')
colSums = apply(GCnormCounts,2,function(x) sum(x))
normalizedColSums <- origColSums
i <- 1
while (i<length(colSums)){
  normalizedColSums[i] <- origColSums[i]* normFactors[i]
  i <- i+1
}
meanDepth <- mean(normalizedColSums)
filteredDepthNormCounts <- GCnormCounts
i <- 1
while (i<ncol(filteredDepthNormCounts)){
  filteredDepthNormCounts[,i] <- (GCnormCounts[,i]/normalizedColSums[i])*meanDepth
  i <- i+1
}
  1. log transformation
logTransformedDepthNormCounts <- log2(filteredDepthNormCounts+1)
  1. use combat on log2 values to remove batch effects
biocLite("sva")
## package 'sva' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\rhodesad\AppData\Local\Temp\Rtmp4qmCWm\downloaded_packages

answer all answer yes Ignore error message about Hmisc and non-zero exit status

library(sva)
meta <- data.frame(seqBatch = datasets$seqBatch,tissue=datasets$tissue,species=datasets$species)
design <- model.matrix(~1,data=meta)
#combat <- ComBat(dat= logTransformedDepthNormCounts,batch=meta$seqBatch,mod=design,numCovs=NULL,par.prior=TRUE)
combat <- ComBat(dat= logTransformedDepthNormCounts,batch=meta$seqBatch,mod=design,par.prior=TRUE)
## Found 5 batches
## Adjusting for 0 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

7.plot heatmaps

install.packages("pheatmap",repos = 'http://cran.us.r-project.org')
## package 'pheatmap' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\rhodesad\AppData\Local\Temp\Rtmp4qmCWm\downloaded_packages

answer all answer n0 —— the Hmisc binary is better than the source

library(pheatmap)
pheatmap(cor(combat)) #pearson correlation; complete linkage; euclidean distance

pheatmap(cor(combat),clustering_method="average") #pearson correlation; average linkage

pheatmap(cor(combat),clustering_method="single") #pearson correlation; single linkage

pheatmap(cor(combat),clustering_distance_rows="manhattan",clustering_distance_cols="manhattan") #pearson correlation; manhattan distance, complete linkage

pheatmap(cor(combat),clustering_distance_rows="canberra",clustering_distance_cols="canberra") #pearson correlation; canberra distance, complete linkage

pheatmap(cor(combat,method="spearman")) #spearman correlation; complete linkage

pheatmap(combat, clustering_distance_cols="correlation", show_rownames = FALSE) # clustering of 'clean' log-transformed data, Pearson correlation distance, complete linkage

  1. PCA
transposed_combat <- t(combat)

perform prcomp on the transposed matrix from which columns (genes) of zero variance have been removed

pca_proc <- prcomp(transposed_combat[,apply(transposed_combat, 2, var, na.rm=TRUE) != 0],scale=TRUE,center=TRUE,retX=TRUE)
summary(pca_proc)
## Importance of components:
##                            PC1     PC2      PC3      PC4      PC5      PC6
## Standard deviation     46.1729 34.5340 29.87140 28.60640 26.55177 25.54213
## Proportion of Variance  0.2068  0.1157  0.08656  0.07938  0.06839  0.06328
## Cumulative Proportion   0.2068  0.3225  0.40904  0.48842  0.55681  0.62009
##                            PC7      PC8      PC9     PC10     PC11
## Standard deviation     21.4675 20.27704 18.79279 18.45900 16.80437
## Proportion of Variance  0.0447  0.03988  0.03426  0.03305  0.02739
## Cumulative Proportion   0.6648  0.70468  0.73894  0.77199  0.79938
##                            PC12     PC13     PC14     PC15     PC16
## Standard deviation     16.39394 15.52684 15.21368 14.84465 13.91730
## Proportion of Variance  0.02607  0.02339  0.02245  0.02138  0.01879
## Cumulative Proportion   0.82546  0.84884  0.87129  0.89267  0.91146
##                            PC17     PC18     PC19     PC20     PC21
## Standard deviation     12.85659 12.61067 11.64782 11.06210 10.50878
## Proportion of Variance  0.01603  0.01543  0.01316  0.01187  0.01071
## Cumulative Proportion   0.92749  0.94292  0.95608  0.96795  0.97866
##                            PC22    PC23    PC24   PC25      PC26
## Standard deviation     10.26773 7.74366 6.08532 4.1913 8.607e-14
## Proportion of Variance  0.01023 0.00582 0.00359 0.0017 0.000e+00
## Cumulative Proportion   0.98889 0.99470 0.99830 1.0000 1.000e+00
plotData = datasets[,c("setname","species","tissue")]
plotData$PC1 <- pca_proc$x[,1]
plotData$PC2 <- pca_proc$x[,2]
plotData$PC3 <- pca_proc$x[,3]
plotData$PC4 <- pca_proc$x[,4]
plotData$PC5 <- pca_proc$x[,5]
plotData$PC6 <- pca_proc$x[,6]

library(ggplot2)

2D plots of pairs of principal components

qplot(PC1,PC2,data=plotData,color=species,shape=tissue,xlab="PC1 (21% variability)",ylab="PC2 (12% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

qplot(PC1,PC3,data=plotData,color=species,shape=tissue,xlab="PC1 (21% variability)",ylab="PC3 (9% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

qplot(PC2,PC3,data=plotData,color=species,shape=tissue,xlab="PC2 (12% variability)",ylab="PC3 (9% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

qplot(PC4,PC5,data=plotData,color=species,shape=tissue,xlab="PC4 (8% variability)",ylab="PC5 (7% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

qplot(PC4,PC6,data=plotData,color=species,shape=tissue,xlab="PC4 (8% variability)",ylab="PC6 (6% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

qplot(PC5,PC6,data=plotData,color=species,shape=tissue,xlab="PC5 (7% variability)",ylab="PC6 (6% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

2D plots of individual principle components against tissue

qplot(PC1,tissue,data=plotData,color=species,shape=tissue,xlab="PC1 (21% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

qplot(PC2,tissue,data=plotData,color=species,shape=tissue,xlab="PC2 (12% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

qplot(PC3,tissue,data=plotData,color=species,shape=tissue,xlab="PC3 (9% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

qplot(PC4,tissue,data=plotData,color=species,shape=tissue,xlab="PC4 (8% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

qplot(PC5,tissue,data=plotData,color=species,shape=tissue,xlab="PC5 (7% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

qplot(PC6,tissue,data=plotData,color=species,shape=tissue,xlab="PC6 (6% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))

testing for significance of correlations between the matched tissues PC values of human and mouse

cor.test(plotData$PC1[1:13],plotData$PC1[14:26],method="spearman") # rho=0.3076923 ; p-value = 0.3063
## 
##  Spearman's rank correlation rho
## 
## data:  plotData$PC1[1:13] and plotData$PC1[14:26]
## S = 252, p-value = 0.3063
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.3076923
cor.test(plotData$PC2[1:13],plotData$PC2[14:26],method="spearman") # rho=0.0.7912088 ; p-value = 0.002082
## 
##  Spearman's rank correlation rho
## 
## data:  plotData$PC2[1:13] and plotData$PC2[14:26]
## S = 76, p-value = 0.002082
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7912088
cor.test(plotData$PC3[1:13],plotData$PC3[14:26],method="spearman") # rho=0.6923077 ; p-value= 0.01115
## 
##  Spearman's rank correlation rho
## 
## data:  plotData$PC3[1:13] and plotData$PC3[14:26]
## S = 112, p-value = 0.01115
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.6923077
cor.test(plotData$PC4[1:13],plotData$PC4[14:26],method="spearman") # rho=0.8736264 ; p-value = 6.817e-05
## 
##  Spearman's rank correlation rho
## 
## data:  plotData$PC4[1:13] and plotData$PC4[14:26]
## S = 46, p-value = 6.817e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8736264
cor.test(plotData$PC5[1:13],plotData$PC5[14:26],method="spearman") # rho=0.7967033 ; p-value=0.001844
## 
##  Spearman's rank correlation rho
## 
## data:  plotData$PC5[1:13] and plotData$PC5[14:26]
## S = 74, p-value = 0.001844
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7967033
cor.test(plotData$PC6[1:13],plotData$PC6[14:26],method="spearman") # rho=0.02197802; p-value=0.9494 
## 
##  Spearman's rank correlation rho
## 
## data:  plotData$PC6[1:13] and plotData$PC6[14:26]
## S = 356, p-value = 0.9494
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.02197802
cor.test(plotData$PC1[1:13],plotData$PC1[14:26],method="pearson") # cor=0.2972731  ; p-value = 0.3239
## 
##  Pearson's product-moment correlation
## 
## data:  plotData$PC1[1:13] and plotData$PC1[14:26]
## t = 1.0326, df = 11, p-value = 0.3239
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3034084  0.7288739
## sample estimates:
##       cor 
## 0.2972731
cor.test(plotData$PC2[1:13],plotData$PC2[14:26],method="pearson") # cor=0.8482408 ; p-value = 0.0002478
## 
##  Pearson's product-moment correlation
## 
## data:  plotData$PC2[1:13] and plotData$PC2[14:26]
## t = 5.312, df = 11, p-value = 0.0002478
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5580881 0.9535617
## sample estimates:
##       cor 
## 0.8482408
cor.test(plotData$PC3[1:13],plotData$PC3[14:26],method="pearson") # cor=0.6788079 ; p-value = 0.01074
## 
##  Pearson's product-moment correlation
## 
## data:  plotData$PC3[1:13] and plotData$PC3[14:26]
## t = 3.0659, df = 11, p-value = 0.01074
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2041937 0.8950374
## sample estimates:
##       cor 
## 0.6788079
cor.test(plotData$PC4[1:13],plotData$PC4[14:26],method="pearson") # cor = 0.7004334  ; p-value=0.007668
## 
##  Pearson's product-moment correlation
## 
## data:  plotData$PC4[1:13] and plotData$PC4[14:26]
## t = 3.2549, df = 11, p-value = 0.007668
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2433725 0.9029461
## sample estimates:
##       cor 
## 0.7004334
cor.test(plotData$PC5[1:13],plotData$PC5[14:26],method="pearson") # cor = 0.8382749 ; p-value=0.0003446
## 
##  Pearson's product-moment correlation
## 
## data:  plotData$PC5[1:13] and plotData$PC5[14:26]
## t = 5.099, df = 11, p-value = 0.0003446
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5338736 0.9503262
## sample estimates:
##       cor 
## 0.8382749
cor.test(plotData$PC6[1:13],plotData$PC6[14:26],method="pearson") #cor=-0.02760841 ; p-value = 0.9287
## 
##  Pearson's product-moment correlation
## 
## data:  plotData$PC6[1:13] and plotData$PC6[14:26]
## t = -0.091602, df = 11, p-value = 0.9287
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5699241  0.5314614
## sample estimates:
##         cor 
## -0.02760841
  1. plots of data before batch-correction with ComBat heatmap of Pearson correlations
pheatmap(cor(logTransformedDepthNormCounts)) # complete linkage; euclidean distance

pca

transposed_logTransformedDepthNormCounts <- t(logTransformedDepthNormCounts)

perform prcomp on the transposed matrix from which columns (genes) of zero variance have been removed

pca_proc_beforeCombat <- prcomp(transposed_logTransformedDepthNormCounts[,apply(transposed_logTransformedDepthNormCounts, 2, var, na.rm=TRUE) != 0],scale=TRUE,center=TRUE,retx=TRUE)

summary(pca_proc_beforeCombat)
## Importance of components:
##                            PC1     PC2     PC3      PC4      PC5      PC6
## Standard deviation     48.8872 37.0410 32.9175 28.76029 25.87420 23.82773
## Proportion of Variance  0.2318  0.1331  0.1051  0.08024  0.06494  0.05507
## Cumulative Proportion   0.2318  0.3649  0.4700  0.55027  0.61521  0.67028
##                            PC7      PC8      PC9     PC10     PC11
## Standard deviation     21.5134 19.10016 17.14287 15.88775 15.04607
## Proportion of Variance  0.0449  0.03539  0.02851  0.02449  0.02196
## Cumulative Proportion   0.7152  0.75057  0.77907  0.80356  0.82552
##                            PC12     PC13     PC14     PC15     PC16
## Standard deviation     14.44407 14.11145 13.42179 12.82748 12.10684
## Proportion of Variance  0.02024  0.01932  0.01747  0.01596  0.01422
## Cumulative Proportion   0.84576  0.86507  0.88255  0.89851  0.91273
##                            PC17     PC18     PC19     PC20    PC21    PC22
## Standard deviation     11.67716 11.24912 10.82649 10.29446 9.99497 9.49378
## Proportion of Variance  0.01323  0.01227  0.01137  0.01028 0.00969 0.00874
## Cumulative Proportion   0.92595  0.93823  0.94960  0.95988 0.96957 0.97831
##                          PC23    PC24    PC25      PC26
## Standard deviation     9.2525 8.53607 8.06905 7.051e-14
## Proportion of Variance 0.0083 0.00707 0.00632 0.000e+00
## Cumulative Proportion  0.9866 0.99368 1.00000 1.000e+00
plotData_beforeCombat = datasets[,c("setname","species","tissue")]
plotData_beforeCombat$PC1 <- pca_proc_beforeCombat$x[,1]
plotData_beforeCombat$PC2 <- pca_proc_beforeCombat$x[,2]
plotData_beforeCombat$PC3 <- pca_proc_beforeCombat$x[,3]
plotData_beforeCombat$PC4 <- pca_proc_beforeCombat$x[,4]
plotData_beforeCombat$PC5 <- pca_proc_beforeCombat$x[,5]
plotData_beforeCombat$PC6 <- pca_proc_beforeCombat$x[,6]

2D plots of pairs of principal components

qplot(PC1,PC2,data=plotData_beforeCombat,color=species,shape=tissue,xlab="PC1 (23% variability)",ylab="PC2 (13% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))