#Reanalysis of gene expression analysis, or, best practices for skeptics
##################################################################
# This exercise is based on Gilad Y and Mizrahi-Man O. A reanalysis of mouse ENCODE comparative gene expression data [version 1; referees: 3 approved, 1 approved with reservations]. F1000Research 2015, 4:121 (doi: 10.12688/f1000research.6536.1)
# The source code that was used to construct the exercise corresponds to Supplementary information found in this location: https://zenodo.org/record/17606/files/Supplementary_text_2.txt
# Some adjustments were made for installing and loading packages
##################################################################
# R commands to reanalyze the data from Lin et al (2014) starting from the raw fragment counts
# IF A PACKAGE DOES NOT EXIST IN YOUR R, TRY THIS
# install.packages("nameofprogram",repos = 'http://cran.us.r-project.org')
# if that doesn't work, try this
# source("https://bioconductor.org/biocLite.R")
# biocLite("nameofpackage")
# if that still doesn't work, email me......
# download the following directory and extract the files on your desktop
# http://teaching.cgrb.oregonstate.edu/MCB/Rhodes/RNA_Seq_Winter_2016/Lin_ex.zip
# these next two lines (not visible in the notebook, visible in the code) are so I can create a nice notebook for your class, this would not be required for your code to work
the code for class begins below if you have moved the directory to a different location, make sure you setwd to that location
#setwd("~/Desktop/Lin_ex") #MAC
setwd("C:/Users/rhodesad/Desktop/Lin_ex") #PC
datasets = as.data.frame(scan("Stanford_datasets.txt",list(setname="",seqBatch="",species="",tissue=""),sep="\t"))
rawCounts <- as.matrix(read.table('Stanford_datasets_rawCountsMat.txt',header=FALSE,sep='\t'))
colnames(rawCounts) <- datasets$setname
geneDetails <- as.data.frame(scan("ortholog_GC_table.txt",skip=1,list(mouse_name="",mouse_GC = 0.0,human_name = "",human_GC=0.0)))
rownames(rawCounts) <- geneDetails$human_name
rownames(datasets) <- datasets$setname
rownames(geneDetails) <- geneDetails$human_name
rowSums = apply(rawCounts,1,function(x) sum(x))
quantile(rowSums,probs = 0.3) # result is 2947.9
## 30%
## 2947.9
filter <- apply(rawCounts,1,function(x) sum(x)>2947.9 )
mt <- grep("mt-",geneDetails$mouse_name)
filteredNames <- setdiff(rownames(rawCounts[filter,]),rownames(rawCounts[mt,]))
filteredRawCounts <- rawCounts[filteredNames,]
install.packages("Hmisc",repos = 'http://cran.us.r-project.org')
## package 'Hmisc' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\rhodesad\AppData\Local\Temp\Rtmp4qmCWm\downloaded_packages
answer NO, we want the binary, install from source is broken
source("https://bioconductor.org/biocLite.R")
biocLite("EDASeq")
## package 'EDASeq' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\rhodesad\AppData\Local\Temp\Rtmp4qmCWm\downloaded_packages
answer all answer NO, we want the binary, install from source is broken
library(EDASeq)
GCnormCounts <- filteredRawCounts
GCnormCounts[,1:13] <- withinLaneNormalization(filteredRawCounts[,1:13],geneDetails[filteredNames,"human_GC"],which="loess",round=TRUE)
GCnormCounts[,14:26] <- withinLaneNormalization(filteredRawCounts[,14:26],geneDetails[filteredNames,"mouse_GC"],which="loess",round=TRUE)
library(edgeR)
origColSums <- apply(rawCounts,2,function(x) sum(x))
normFactors <- calcNormFactors(GCnormCounts,method='TMM')
colSums = apply(GCnormCounts,2,function(x) sum(x))
normalizedColSums <- origColSums
i <- 1
while (i<length(colSums)){
normalizedColSums[i] <- origColSums[i]* normFactors[i]
i <- i+1
}
meanDepth <- mean(normalizedColSums)
filteredDepthNormCounts <- GCnormCounts
i <- 1
while (i<ncol(filteredDepthNormCounts)){
filteredDepthNormCounts[,i] <- (GCnormCounts[,i]/normalizedColSums[i])*meanDepth
i <- i+1
}
logTransformedDepthNormCounts <- log2(filteredDepthNormCounts+1)
biocLite("sva")
## package 'sva' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\rhodesad\AppData\Local\Temp\Rtmp4qmCWm\downloaded_packages
answer all answer yes Ignore error message about Hmisc and non-zero exit status
library(sva)
meta <- data.frame(seqBatch = datasets$seqBatch,tissue=datasets$tissue,species=datasets$species)
design <- model.matrix(~1,data=meta)
#combat <- ComBat(dat= logTransformedDepthNormCounts,batch=meta$seqBatch,mod=design,numCovs=NULL,par.prior=TRUE)
combat <- ComBat(dat= logTransformedDepthNormCounts,batch=meta$seqBatch,mod=design,par.prior=TRUE)
## Found 5 batches
## Adjusting for 0 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
7.plot heatmaps
install.packages("pheatmap",repos = 'http://cran.us.r-project.org')
## package 'pheatmap' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\rhodesad\AppData\Local\Temp\Rtmp4qmCWm\downloaded_packages
answer all answer n0 —— the Hmisc binary is better than the source
library(pheatmap)
pheatmap(cor(combat)) #pearson correlation; complete linkage; euclidean distance
pheatmap(cor(combat),clustering_method="average") #pearson correlation; average linkage
pheatmap(cor(combat),clustering_method="single") #pearson correlation; single linkage
pheatmap(cor(combat),clustering_distance_rows="manhattan",clustering_distance_cols="manhattan") #pearson correlation; manhattan distance, complete linkage
pheatmap(cor(combat),clustering_distance_rows="canberra",clustering_distance_cols="canberra") #pearson correlation; canberra distance, complete linkage
pheatmap(cor(combat,method="spearman")) #spearman correlation; complete linkage
pheatmap(combat, clustering_distance_cols="correlation", show_rownames = FALSE) # clustering of 'clean' log-transformed data, Pearson correlation distance, complete linkage
transposed_combat <- t(combat)
perform prcomp on the transposed matrix from which columns (genes) of zero variance have been removed
pca_proc <- prcomp(transposed_combat[,apply(transposed_combat, 2, var, na.rm=TRUE) != 0],scale=TRUE,center=TRUE,retX=TRUE)
summary(pca_proc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 46.1729 34.5340 29.87140 28.60640 26.55177 25.54213
## Proportion of Variance 0.2068 0.1157 0.08656 0.07938 0.06839 0.06328
## Cumulative Proportion 0.2068 0.3225 0.40904 0.48842 0.55681 0.62009
## PC7 PC8 PC9 PC10 PC11
## Standard deviation 21.4675 20.27704 18.79279 18.45900 16.80437
## Proportion of Variance 0.0447 0.03988 0.03426 0.03305 0.02739
## Cumulative Proportion 0.6648 0.70468 0.73894 0.77199 0.79938
## PC12 PC13 PC14 PC15 PC16
## Standard deviation 16.39394 15.52684 15.21368 14.84465 13.91730
## Proportion of Variance 0.02607 0.02339 0.02245 0.02138 0.01879
## Cumulative Proportion 0.82546 0.84884 0.87129 0.89267 0.91146
## PC17 PC18 PC19 PC20 PC21
## Standard deviation 12.85659 12.61067 11.64782 11.06210 10.50878
## Proportion of Variance 0.01603 0.01543 0.01316 0.01187 0.01071
## Cumulative Proportion 0.92749 0.94292 0.95608 0.96795 0.97866
## PC22 PC23 PC24 PC25 PC26
## Standard deviation 10.26773 7.74366 6.08532 4.1913 8.607e-14
## Proportion of Variance 0.01023 0.00582 0.00359 0.0017 0.000e+00
## Cumulative Proportion 0.98889 0.99470 0.99830 1.0000 1.000e+00
plotData = datasets[,c("setname","species","tissue")]
plotData$PC1 <- pca_proc$x[,1]
plotData$PC2 <- pca_proc$x[,2]
plotData$PC3 <- pca_proc$x[,3]
plotData$PC4 <- pca_proc$x[,4]
plotData$PC5 <- pca_proc$x[,5]
plotData$PC6 <- pca_proc$x[,6]
library(ggplot2)
2D plots of pairs of principal components
qplot(PC1,PC2,data=plotData,color=species,shape=tissue,xlab="PC1 (21% variability)",ylab="PC2 (12% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC1,PC3,data=plotData,color=species,shape=tissue,xlab="PC1 (21% variability)",ylab="PC3 (9% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC2,PC3,data=plotData,color=species,shape=tissue,xlab="PC2 (12% variability)",ylab="PC3 (9% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC4,PC5,data=plotData,color=species,shape=tissue,xlab="PC4 (8% variability)",ylab="PC5 (7% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC4,PC6,data=plotData,color=species,shape=tissue,xlab="PC4 (8% variability)",ylab="PC6 (6% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC5,PC6,data=plotData,color=species,shape=tissue,xlab="PC5 (7% variability)",ylab="PC6 (6% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
2D plots of individual principle components against tissue
qplot(PC1,tissue,data=plotData,color=species,shape=tissue,xlab="PC1 (21% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC2,tissue,data=plotData,color=species,shape=tissue,xlab="PC2 (12% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC3,tissue,data=plotData,color=species,shape=tissue,xlab="PC3 (9% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC4,tissue,data=plotData,color=species,shape=tissue,xlab="PC4 (8% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC5,tissue,data=plotData,color=species,shape=tissue,xlab="PC5 (7% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
qplot(PC6,tissue,data=plotData,color=species,shape=tissue,xlab="PC6 (6% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))
testing for significance of correlations between the matched tissues PC values of human and mouse
cor.test(plotData$PC1[1:13],plotData$PC1[14:26],method="spearman") # rho=0.3076923 ; p-value = 0.3063
##
## Spearman's rank correlation rho
##
## data: plotData$PC1[1:13] and plotData$PC1[14:26]
## S = 252, p-value = 0.3063
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.3076923
cor.test(plotData$PC2[1:13],plotData$PC2[14:26],method="spearman") # rho=0.0.7912088 ; p-value = 0.002082
##
## Spearman's rank correlation rho
##
## data: plotData$PC2[1:13] and plotData$PC2[14:26]
## S = 76, p-value = 0.002082
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7912088
cor.test(plotData$PC3[1:13],plotData$PC3[14:26],method="spearman") # rho=0.6923077 ; p-value= 0.01115
##
## Spearman's rank correlation rho
##
## data: plotData$PC3[1:13] and plotData$PC3[14:26]
## S = 112, p-value = 0.01115
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.6923077
cor.test(plotData$PC4[1:13],plotData$PC4[14:26],method="spearman") # rho=0.8736264 ; p-value = 6.817e-05
##
## Spearman's rank correlation rho
##
## data: plotData$PC4[1:13] and plotData$PC4[14:26]
## S = 46, p-value = 6.817e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8736264
cor.test(plotData$PC5[1:13],plotData$PC5[14:26],method="spearman") # rho=0.7967033 ; p-value=0.001844
##
## Spearman's rank correlation rho
##
## data: plotData$PC5[1:13] and plotData$PC5[14:26]
## S = 74, p-value = 0.001844
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7967033
cor.test(plotData$PC6[1:13],plotData$PC6[14:26],method="spearman") # rho=0.02197802; p-value=0.9494
##
## Spearman's rank correlation rho
##
## data: plotData$PC6[1:13] and plotData$PC6[14:26]
## S = 356, p-value = 0.9494
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.02197802
cor.test(plotData$PC1[1:13],plotData$PC1[14:26],method="pearson") # cor=0.2972731 ; p-value = 0.3239
##
## Pearson's product-moment correlation
##
## data: plotData$PC1[1:13] and plotData$PC1[14:26]
## t = 1.0326, df = 11, p-value = 0.3239
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3034084 0.7288739
## sample estimates:
## cor
## 0.2972731
cor.test(plotData$PC2[1:13],plotData$PC2[14:26],method="pearson") # cor=0.8482408 ; p-value = 0.0002478
##
## Pearson's product-moment correlation
##
## data: plotData$PC2[1:13] and plotData$PC2[14:26]
## t = 5.312, df = 11, p-value = 0.0002478
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5580881 0.9535617
## sample estimates:
## cor
## 0.8482408
cor.test(plotData$PC3[1:13],plotData$PC3[14:26],method="pearson") # cor=0.6788079 ; p-value = 0.01074
##
## Pearson's product-moment correlation
##
## data: plotData$PC3[1:13] and plotData$PC3[14:26]
## t = 3.0659, df = 11, p-value = 0.01074
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2041937 0.8950374
## sample estimates:
## cor
## 0.6788079
cor.test(plotData$PC4[1:13],plotData$PC4[14:26],method="pearson") # cor = 0.7004334 ; p-value=0.007668
##
## Pearson's product-moment correlation
##
## data: plotData$PC4[1:13] and plotData$PC4[14:26]
## t = 3.2549, df = 11, p-value = 0.007668
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2433725 0.9029461
## sample estimates:
## cor
## 0.7004334
cor.test(plotData$PC5[1:13],plotData$PC5[14:26],method="pearson") # cor = 0.8382749 ; p-value=0.0003446
##
## Pearson's product-moment correlation
##
## data: plotData$PC5[1:13] and plotData$PC5[14:26]
## t = 5.099, df = 11, p-value = 0.0003446
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5338736 0.9503262
## sample estimates:
## cor
## 0.8382749
cor.test(plotData$PC6[1:13],plotData$PC6[14:26],method="pearson") #cor=-0.02760841 ; p-value = 0.9287
##
## Pearson's product-moment correlation
##
## data: plotData$PC6[1:13] and plotData$PC6[14:26]
## t = -0.091602, df = 11, p-value = 0.9287
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5699241 0.5314614
## sample estimates:
## cor
## -0.02760841
pheatmap(cor(logTransformedDepthNormCounts)) # complete linkage; euclidean distance
pca
transposed_logTransformedDepthNormCounts <- t(logTransformedDepthNormCounts)
perform prcomp on the transposed matrix from which columns (genes) of zero variance have been removed
pca_proc_beforeCombat <- prcomp(transposed_logTransformedDepthNormCounts[,apply(transposed_logTransformedDepthNormCounts, 2, var, na.rm=TRUE) != 0],scale=TRUE,center=TRUE,retx=TRUE)
summary(pca_proc_beforeCombat)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 48.8872 37.0410 32.9175 28.76029 25.87420 23.82773
## Proportion of Variance 0.2318 0.1331 0.1051 0.08024 0.06494 0.05507
## Cumulative Proportion 0.2318 0.3649 0.4700 0.55027 0.61521 0.67028
## PC7 PC8 PC9 PC10 PC11
## Standard deviation 21.5134 19.10016 17.14287 15.88775 15.04607
## Proportion of Variance 0.0449 0.03539 0.02851 0.02449 0.02196
## Cumulative Proportion 0.7152 0.75057 0.77907 0.80356 0.82552
## PC12 PC13 PC14 PC15 PC16
## Standard deviation 14.44407 14.11145 13.42179 12.82748 12.10684
## Proportion of Variance 0.02024 0.01932 0.01747 0.01596 0.01422
## Cumulative Proportion 0.84576 0.86507 0.88255 0.89851 0.91273
## PC17 PC18 PC19 PC20 PC21 PC22
## Standard deviation 11.67716 11.24912 10.82649 10.29446 9.99497 9.49378
## Proportion of Variance 0.01323 0.01227 0.01137 0.01028 0.00969 0.00874
## Cumulative Proportion 0.92595 0.93823 0.94960 0.95988 0.96957 0.97831
## PC23 PC24 PC25 PC26
## Standard deviation 9.2525 8.53607 8.06905 7.051e-14
## Proportion of Variance 0.0083 0.00707 0.00632 0.000e+00
## Cumulative Proportion 0.9866 0.99368 1.00000 1.000e+00
plotData_beforeCombat = datasets[,c("setname","species","tissue")]
plotData_beforeCombat$PC1 <- pca_proc_beforeCombat$x[,1]
plotData_beforeCombat$PC2 <- pca_proc_beforeCombat$x[,2]
plotData_beforeCombat$PC3 <- pca_proc_beforeCombat$x[,3]
plotData_beforeCombat$PC4 <- pca_proc_beforeCombat$x[,4]
plotData_beforeCombat$PC5 <- pca_proc_beforeCombat$x[,5]
plotData_beforeCombat$PC6 <- pca_proc_beforeCombat$x[,6]
2D plots of pairs of principal components
qplot(PC1,PC2,data=plotData_beforeCombat,color=species,shape=tissue,xlab="PC1 (23% variability)",ylab="PC2 (13% variability)")+scale_shape_manual(values=c("adipose"=0,"adrenal"=1,"brain"=2,"sigmoid"=3,"heart"=4,"kidney"=5,"liver"=6,"lung"=7,"ovary"=8,"pancreas"=9,"small_bowel"=10,"spleen"=11,"testis"=12))