Question 1 After first creating the two ExpressionSets, I added the gene annotation data from the annotation data package hgu95av2.db to each of them.
#load("C:/Users/HP ProBook 430 G1/Dropbox/Academics/Fall 2015/PH240D/HW1/examiningDoxorubicinInDetail.RData")
load("C:/Users/ASRock Z87 Pro4/Dropbox/Academics/Fall 2015/PH240D/HW1/examiningDoxorubicinInDetail.RData")
#Create ExpressionSet for the doxorubicin07Numbers
library(Biobase)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unlist, unsplit
##
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
data_07<-new("ExpressionSet",exprs=as.matrix(doxorubicin07Numbers),phenoData=new("AnnotatedDataFrame",data=doxorubicin07Info))
#Create ExpressionSet for the doxorubicinNCI60Scaled
data_NCI<-new("ExpressionSet",exprs=doxorubicinNCI60Scaled,phenoData=new("AnnotatedDataFrame",data=data.frame(status=as.factor(ifelse(colnames(doxorubicinNCI60Scaled) %in% cellLinesUsed$doxorubicin$listPotti06CorrAug08$Sensitive,"Sensitive","Resistant")),row.names=colnames(doxorubicinNCI60Scaled))))
#Need to add gene annotation data to both ExpressionSets
library(hgu95av2.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
## Loading required package: org.Hs.eg.db
## Loading required package: DBI
gene_annotation_NCI<-links(hgu95av2SYMBOL[rownames(data_NCI)])
gene_annotation_NCI<-AnnotatedDataFrame(gene_annotation_NCI)
#featureData(data_NCI)<-gene_annotation_NCI
gene_annotation_07<-links(hgu95av2SYMBOL[rownames(data_07)])
gene_annotation_07<-AnnotatedDataFrame(gene_annotation_07)
#featureData(data_07)<-gene_annotation_07
Question 2 To look at how sensitivity status was applied to the training data, I used hierarchical clustering to see how the training data and NCI60 data clustered together. In all plots, samples labeled resistant are colored green and sensitive are colored blue. From the hierarchical clustering it is clear that the labels on the training data were switched: almost all of the sensitive training data clusters with the resistant NCI60 data and vice versa. I also used PAM clustering with k=2, where we also see that those labeled resistant vs. sensitive (green and blue) do not agree with the PAM clustering (group 1 is triangles, group 2 is circles).
In addition, when using the table function to see how our clustering techniques compare to the “real” labels, it is obvious in both cases, but is slightly more clear in PAM clustering where there half of the data is clustered completely opposite to the labels.
library(dendextend)
##
## Welcome to dendextend version 1.1.0
##
## Type ?dendextend to access the overall documentation and
## browseVignettes(package = 'dendextend') for the package vignette.
## You can execute a demo of the package via: demo(dendextend)
##
## More information is available on the dendextend project web-site:
## https://github.com/talgalili/dendextend/
##
## Contact: <tal.galili@gmail.com>
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
##
## To suppress the this message use:
## suppressPackageStartupMessages(library(dendextend))
##
##
## Attaching package: 'dendextend'
##
## The following object is masked from 'package:stats':
##
## cutree
#First I created a distance matrix based on 1-correlation.
common_data<-combine(data_NCI[rownames(data_NCI) %in% rownames(data_07),],data_07[,1:22])
r_NCI<-cor(exprs(common_data))
d_NCI<-1-r_NCI
dend_common<-as.dendrogram(hclust(as.dist(d_NCI),method="average"))
labels_colors(dend_common)<-ifelse(common_data$status=="Resistant","green","blue")[order.dendrogram(dend_common)]
dend_common<-set(dend_common,"branches_lwd",2)
plot(color_branches(dend_common,k=2),main="Hclust of training/NCI data")
table(common_data$status,cutree(dend_common,k=2))
##
## 1 2
## Resistant 13 9
## Sensitive 13 9
library(cluster)
#PAM clustering
pam_common<-pam(as.dist(d_NCI),k=2,diss=TRUE)
clusplot(pam_common,col.p=ifelse(common_data$status=="Resistant","green","blue"),col.clus="black",main="PAM clustering of training/NCI data")
table(common_data$status,pam_common$clustering)
##
## 1 2
## Resistant 11 11
## Sensitive 11 11
Question 3 Using hierarchical cluster on the distance matrix (1-correlation) on the test set there is no strong clustering of the assigned sensitivity statuses (x-axis labels; red is resistant and blue is sensitive), compared to the observed clustering when we cut the tree at k=2 (the colors of the branches). This is very likely due to duplication of samples and mislabeling of samples (including duplicated ones).
PAM clustering when preformed on the testing set shows similar results to the hierarchical clustering. The algorihm clustering is shown by triangles vs. circles while assigned sensitivity status is by color.
Looking at the PCA plot for the first 3 principal components, there does not seem to be any clear seperation by status. The first PC accounts for 67% of the variance, while the second only 1.8% of variance, and the third accounts for 1.3%. The first PC is mostly driven by a single outlier, which is Test95. We remove this outlier to see if there will be clearer seperation by drug status afterwards; however it is still difficult to see any clear seperation in the PCA plot. In addition, after the removal of Test95, the first PCA only accounds for 5.5% of total variance, the second 4.1%, and the third 3.8%.
Next we want to check the pairwise correlations between the test samples. After creating the correlation matrix and discarding the diagonal, it appears that only 84 of the 122 test samples are distinct, which is the same result found by Baggerly and Coombs. In addition, there were some samples that were duplicated up to four times.
#We want to preform QA/QC on the test set (stored in data_07).
#Hierarchical Clustering
r_07<-cor(exprs(data_07[,23:144]))
d_07<-1-r_07
dend_07<-as.dendrogram(hclust(as.dist(d_07)))
labels_colors(dend_07)<-ifelse(data_07$status[23:144]=="Resistant","red","blue")[order.dendrogram(dend_07)]
dend_07<-set(dend_07,"branches_lwd",2)
plot(color_branches(dend_07,k=2),main="Hclust of testing data")
#we want to check to see how our clustering matches to the labels:
table(data_07$status[23:144],cutree(dend_07,k=2))
##
## 1 2
## Resistant 55 44
## Sensitive 17 6
#PAM clustering
pam_07<-pam(as.dist(d_07),k=2,diss=TRUE)
clusplot(pam_07,col.p=ifelse(data_07[,23:144]$status=="Resistant","red","blue"),col.clus="black",main="PAM clustering of test data")
#we want to check to see how our clustering matches to the labels:
table(data_07$status[23:144],pam_07$clustering)
##
## 1 2
## Resistant 39 60
## Sensitive 15 8
#Dimensionality Reduction
#Testing set
pc_07<-prcomp(t(as.matrix(doxorubicin07Numbers[,23:144])))
#Resistant is red, sensitive is blue
pairs(pc_07$x[,1:3],col=ifelse(data_07[,23:144]$status=="Resistant","red","blue"),pch=ifelse(data_07[,23:144]$status=="Resistant",16,17),main="Samples by first 3 PCs")
#We should now check to see what % of variance is explained by these PCs
((pc_07$sdev)^2/sum(pc_07$sdev^2))[1:3]
## [1] 0.67488725 0.01797783 0.01330332
#Outlier in the first PCA
which.max(pc_07$x[,1])
## Test95
## 95
#PCA without Test95
pc_outlier<-prcomp(t(as.matrix(doxorubicin07Numbers[,(23:144)[-95]])))
pairs(pc_outlier$x[,1:3],col=ifelse(data_07[,(23:144)[-95]]$status=="Resistant","red","blue"),pch=ifelse(data_07[,(23:144)[-95]]$status=="Resistant",16,17),main="PCA without Test95")
#% of variance
((pc_outlier$sdev)^2/sum(pc_outlier$sdev^2))[1:3]
## [1] 0.05523785 0.04089976 0.03788810
#Finding off-diagonal elements of correlation matrix that equal 1 (evidence of duplicated samples)
corr_matrix<-r_07
diag(corr_matrix)<-.5 #set diagonal elements to some non-1 value
log_matrix<-apply(corr_matrix,c(1,2),function(x) ifelse(x==1,TRUE,FALSE))
which(log_matrix,arr.ind=TRUE)
## row col
## Test33 33 1
## Test36 36 2
## Test58 58 2
## Test87 87 2
## Test39 39 7
## Test44 44 10
## Test67 67 10
## Test97 97 10
## Test98 98 12
## Test99 99 13
## Test74 74 15
## Test103 103 15
## Test50 50 17
## Test76 76 17
## Test106 106 17
## Test51 51 26
## Test82 82 26
## Test1 1 33
## Test57 57 35
## Test2 2 36
## Test58 58 36
## Test87 87 36
## Test90 90 37
## Test92 92 38
## Test7 7 39
## Test65 65 40
## Test94 94 40
## Test96 96 43
## Test10 10 44
## Test67 67 44
## Test97 97 44
## Test70 70 46
## Test101 101 46
## Test17 17 50
## Test76 76 50
## Test106 106 50
## Test26 26 51
## Test82 82 51
## Test35 35 57
## Test2 2 58
## Test36 36 58
## Test87 87 58
## Test40 40 65
## Test94 94 65
## Test10 10 67
## Test44 44 67
## Test97 97 67
## Test46 46 70
## Test101 101 70
## Test15 15 74
## Test103 103 74
## Test17 17 76
## Test50 50 76
## Test106 106 76
## Test26 26 82
## Test51 51 82
## Test2 2 87
## Test36 36 87
## Test58 58 87
## Test37 37 90
## Test38 38 92
## Test40 40 94
## Test65 65 94
## Test43 43 96
## Test10 10 97
## Test44 44 97
## Test67 67 97
## Test12 12 98
## Test13 13 99
## Test46 46 101
## Test70 70 101
## Test15 15 103
## Test74 74 103
## Test17 17 106
## Test50 50 106
## Test76 76 106
122-(dim(which(log_matrix,arr.ind=TRUE))[1]/2)
## [1] 84
#We see that only 84 of the 122 test samples are distinct, which is also what was found by Baggerly and Coombs, and that some samples were duplicated up to four times.
high_corr<-(corr_matrix > 0.99999)
table(apply(high_corr,1,sum))
##
## 0 1 2 3
## 60 28 18 16
Question 4 Part A Cyclic loess and quantile normalization was used to normalize the doxorubicinNCI60Scaled data. To view the need for normalization, we look at the log2-transformed unnormalized data in a boxplot, colored by drug status. Fast cyclic loess as implemented in limma normalizes all arrays against a synthetic refrence array made by averaging across all samples. A loess regression is then fit for each array to the difference between array intensities and synthetic refrence intensities which is used to normalize each array. While the unnormalized data is generally centered around zero, we see that both loess and quantile normalization improve the distribution of intensities.
we then do 2 MD-plots prior to and post normalization for both normalization methods comparing the 2 most negatively correlated arrays. The x-axis is the rowmeans; that is, the probeset intensity averaged over the arrays being compared. The y-axis is the difference in probeset intensity between the two arrays being compared. We hope to see that difference in expression is independent of the mean expression of that probeset. Unfortunately it seems that cyclic loess did not preform as well as hoped; the loess fit after normalization is not centered on 0. However, it seems that quantile normalization worked very well; looking at the MD-plot after quantile normalization it is clear that the difference in expression is independent of mean expression. Thus we will use the quantile normalized data to proceed.
library(limma)
##
## Attaching package: 'limma'
##
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
#log-2 transform the exprs matricies for both data sets; because microarray data is measured on a 16-bit scale we should re-log-transform the data.
boxplot(log2(exprs(data_NCI)),col=ifelse(data_NCI$status=="Resistant","red","blue"),main="Unnormalized data (red=resistant, blue=sensitive)")
data_NCInorm<-data_NCI
exprs(data_NCInorm)<-normalizeBetweenArrays(log2(exprs(data_NCI)),method="cyclicloess")
quant_norm<-normalizeQuantiles(log2(exprs(data_NCI)))
boxplot(exprs(data_NCInorm),col=ifelse(data_NCInorm$status=="Resistant","red","blue"),main="Loess normalized data (red=resistant, blue=sensitive)")
boxplot(quant_norm,col=ifelse(data_NCInorm$status=="Resistant","red","blue"),main="Quantile normalized data (red=resistant, blue=sensitive)")
#We want to use an MD plot to see how normalization affected the distribution of the two most negatively correlated samples.
mdplot_array<-function(x,y, ...){
mean<-rowMeans(x[,y]+0.1)
difference<-(x[,y[2]]+0.1)-(x[,y[1]]+0.1)
smoothScatter(mean,difference,...)
lines(lowess(mean,difference),col="red")
abline(h=0,lty=2)
}
#To visualize how normalization will affect the data, we find the two arrays that have the largest negative correlation for comparison.
neg_samp<-cor(exprs(data_NCI))
arrayInd(which.min(neg_samp),dim(neg_samp))
## [,1] [,2]
## [1,] 17 7
mdplot_array(log2(exprs(data_NCI)),y=c(7,18),main="Unnormalized intensities")
mdplot_array((exprs(data_NCInorm)),y=c(7,18),main="Loess normalized intensities")
mdplot_array(quant_norm,y=c(7,18),main="Quantile normalized intensities")
Part B Plotting the arrays against first 3 principal components and coloring by drug status reveals that the data shows evidence of clustering by the 2nd principal component, suggesting it is associated with drug response. For comparison we plot both quantile and loess normalized data, and it is encouraging that the results are very similar.
#Dimensionality Reduction
pc_NCI<-prcomp(t(exprs(data_NCInorm)))
pairs(pc_NCI$x[,1:3],col=ifelse(data_NCInorm$status=="Resistant","green","blue"),pch=ifelse(data_NCInorm$status=="Resistant",16,17),main="loess normalized")
#% of variance explained by the first 3 PCs
((pc_NCI$sdev)^2/sum(pc_NCI$sdev^2))[1:3]
## [1] 0.16638498 0.09386862 0.07021265
pc_quant<-prcomp(t(quant_norm))
pairs(pc_quant$x[,1:3],col=ifelse(data_NCInorm$status=="Resistant","green","blue"),pch=ifelse(data_NCInorm$status=="Resistant",16,17),main="quantile normalized")
#% of variance explained by the first 3 PCs
((pc_quant$sdev)^2/sum(pc_quant$sdev^2))[1:3]
## [1] 0.15828194 0.09763860 0.07064125
#Hierarchical Clustering
r_NCInorm<-cor(exprs(data_NCInorm))
d_NCInorm<-1-r_NCInorm
dend_NCI<-as.dendrogram(hclust(as.dist(d_NCInorm),method="average"))
labels_colors(dend_NCI)<-ifelse(data_NCInorm$status=="Resistant","red","blue")[order.dendrogram(dend_NCI)]
plot(color_branches(dend_NCI,k=2),main="Hclust of NCI data")
#PAM Clustering
p_NCI<-pam(as.dist(d_NCInorm),k=2,diss=TRUE)
clusplot(p_NCI,col.p=ifelse(data_NCInorm$status=="Resistant","red","blue"),main="PAM clustering of NCI data")
Part C The first test used to look for differences in mean expression levels between drug status was a two sample t-test. The function rowttests runs a t-test for each gene. It derives a test statistics by dividing the difference in group means for that probeset by the standard deviation of that gene across all samples, weighted by the number of samples. The p-values from the test were adjusted to control FDR by the Benjamini & Hochberg procedure. This method found 267 genes with an adjusted p-value < 0.05.
The second test I used to test for differences in mean expression levels between drug status was the Wilcoxon signed rank test. This test is non-parametric meaning that it does not assume that the expression measurements in each group are normally distributed. The Wilcoxon test ranks gene expression across all samples, and then preforms a two-tailed test of equality of means between the two groups of interest (sensitive or resistant) to gain a p-value. I then used the Benjamini & Hochberg method to control the FDR, a less conservative method of accounting for multiple testing than the Bonferroni correction to control the FWER. This method found 218 genes with an adjusted p-value < 0.05.
When I compared the 80 DE genes that Potti et al. identified as DE between resistant and sensitive cell lines with the top 80 DE genes from my list of top DE genes identified by t-test, I found 70 genes from Potti et al. included in the top 80 of my DE genes. Comparing the Potti et al. list to the top 80 DE genes from my list of top DE genes identified by Wilxocon signed rank test p-value statistic, I found 57 genes from Potti et al. included in the top 80 of my DE genes.
To better visualize genes that are both biologically and statistically significant, a volcano plot was made and “interesting” genes were plotted. A gene is called interesting if it has a significant adjusted p-value and a log fold-change greater than 2. The x-axis plot the log2 fold change of each probeset between resistant and sensitive, and the y-axis is -log10 of the adjusted p-value. A volcano plot thus applies “double filtering” to find interesting genes, the logic being that large outliers in one of the groups can cause large fold change, and that significant p-value could be caused by low variance or expression level. Double filtering will only return genes that are significant on both scales.
#DE by row t-tests
library(genefilter)
##
## Attaching package: 'genefilter'
##
## The following object is masked from 'package:base':
##
## anyNA
library(multtest)
ttest<-rowttests(data_NCInorm,fac=data_NCInorm$status)
mtest<-mt.rawp2adjp(ttest$p.value,proc="BH")
de_genes<-featureNames(data_NCInorm)[mtest$index[1:80]]
links(hgu95av2SYMBOL[de_genes])
## probe_id symbol
## 1 1051_g_at MLANA
## 2 110_at CSPG4
## 3 1319_at DDR2
## 4 1519_at ETS2
## 5 1537_at EGFR
## 6 2011_s_at BIK
## 7 2047_s_at JUP
## 8 266_s_at CD24
## 9 32139_at ZNF185
## 10 32168_s_at RCAN1
## 11 32612_at GSN
## 12 32715_at VAMP8
## 13 32821_at LCN2
## 14 32967_at FCMR
## 15 33004_g_at NCK2
## 16 33240_at PDZRN3
## 17 33409_at FKBP2
## 18 33483_at NMU
## 19 33824_at KRT8
## 20 33842_at FAM214B
## 21 33853_s_at NRP2
## 22 33892_at PKP2
## 23 33904_at CLDN3
## 24 33906_at SSSCA1
## 25 33908_at CAPN1
## 26 33956_at LY96
## 27 34213_at WWC1
## 28 34348_at SPINT2
## 29 34859_at MAGED2
## 30 34885_at SYNGR2
## 31 34993_at SGCD
## 32 35280_at LAMC2
## 33 35444_at MISP
## 34 35628_at TM7SF2
## 35 35630_at LLGL2
## 36 35681_r_at ZEB2
## 37 35766_at KRT18
## 38 35807_at CYBA
## 39 36133_at DSP
## 40 36618_g_at ID1
## 41 36619_r_at ID1
## 42 36795_at PSAP
## 43 36828_at ZNF629
## 44 37222_at GSTT1
## 45 37327_at EGFR
## 46 37552_at KCNK1
## 47 37695_at RNF144A
## 48 37743_at FEZ1
## 49 37749_at MEST
## 50 37926_at KLF5
## 51 38004_at CSPG4
## 52 38078_at FLNB
## 53 38119_at GYPC
## 54 38297_at PITPNM1
## 55 38379_at GPNMB
## 56 38653_at PMP22
## 57 39298_at ST3GAL6
## 58 39316_at RAB40C
## 59 39386_at MAD2L1BP
## 60 39801_at PLOD3
## 61 39918_at TUBGCP2
## 62 40103_at EZR
## 63 40202_at KLF9
## 64 40434_at PODXL
## 65 40568_at ATP6V1B2
## 66 41158_at PLP1
## 67 41294_at KRT7
## 68 41359_at PKP3
## 69 41378_at SGCD
## 70 41453_at DLG3
## 71 41503_at ZHX2
## 72 41610_at LAMA5
## 73 41644_at SASH1
## 74 41839_at GAS1
## 75 575_s_at EPCAM
## 76 661_at GAS1
## 77 999_at CYP27A1
#DE by Wilcoxon-signed rank test
status<-data_NCInorm$status
wilcox<-apply(exprs(data_NCInorm),1,function(x) wilcox.test(x~status))
wilcox_result<-sapply(wilcox,function(x) x$p.value)
wilcox_result<-as.data.frame(wilcox_result)
wilcox_result$FDR<-p.adjust(wilcox_result[,1],method="fdr")
wilcox_result<-wilcox_result[order(wilcox_result$FDR),]
wilcox_result$gene<-sapply(mget(rownames(wilcox_result),hgu95av2SYMBOL),function(x) x)
#Comparing results to results of Potti et al.
#Comparing to t-test DE genes
sum(doxorubicinGenes %in% de_genes)
## [1] 70
#Comparing to Wilcoxon signed rank test DE genes
sum(doxorubicinGenes %in% rownames(wilcox_result)[1:80])
## [1] 57
#Getting Log FC and making volcano plot
sens_mean<-rowMeans(2^(exprs(data_NCInorm[,status=="Sensitive"])))
res_mean<-rowMeans(2^(exprs(data_NCInorm[,status=="Resistant"])))
log_fc<-log2(sens_mean/res_mean)
log_p<--log10(ttest$p.value)
names(log_p)<-names(log_fc)
plot(log_fc,log_p,main="Volcano Plot (DE genes by status)",xlab="log2(FC)",ylab="-log10(p-value)",col=rgb(0,0,1,.3))
#plotting interesting genes
sum(log_p > -log10(0.05/length(log_p)))
## [1] 16
sum(abs(log_fc) > 2)
## [1] 464
sum(log_p > -log10(0.05/length(log_p)) & abs(log_fc) > 2)
## [1] 16
points(log_fc[which(log_p > -log10(0.5/length(log_p)) & abs(log_fc) > 2)],log_p[which(log_p > -log10(0.5/length(log_p)) & abs(log_fc) > 2)],col="red",pch=16)
text(log_fc[which(log_p > -log10(0.5/length(log_p)) & abs(log_fc) > 2)],log_p[which(log_p > -log10(0.5/length(log_p)) & abs(log_fc) > 2)],labels=links(hgu95av2SYMBOL[names(log_fc)[which(log_p > -log10(0.5/length(log_p)) & abs(log_fc) > 2)]])[,"symbol"],pos=1,col="red")