Question 1 After first creating the two ExpressionSets, I added the gene annotation data from the annotation data package hgu95av2.db to each of them.

#load("C:/Users/HP ProBook 430 G1/Dropbox/Academics/Fall 2015/PH240D/HW1/examiningDoxorubicinInDetail.RData")
load("C:/Users/ASRock Z87 Pro4/Dropbox/Academics/Fall 2015/PH240D/HW1/examiningDoxorubicinInDetail.RData")
#Create ExpressionSet for the doxorubicin07Numbers
library(Biobase)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unlist, unsplit
## 
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
data_07<-new("ExpressionSet",exprs=as.matrix(doxorubicin07Numbers),phenoData=new("AnnotatedDataFrame",data=doxorubicin07Info))

#Create ExpressionSet for the doxorubicinNCI60Scaled
data_NCI<-new("ExpressionSet",exprs=doxorubicinNCI60Scaled,phenoData=new("AnnotatedDataFrame",data=data.frame(status=as.factor(ifelse(colnames(doxorubicinNCI60Scaled) %in% cellLinesUsed$doxorubicin$listPotti06CorrAug08$Sensitive,"Sensitive","Resistant")),row.names=colnames(doxorubicinNCI60Scaled))))

#Need to add gene annotation data to both ExpressionSets
library(hgu95av2.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
## Loading required package: org.Hs.eg.db
## Loading required package: DBI
gene_annotation_NCI<-links(hgu95av2SYMBOL[rownames(data_NCI)])
gene_annotation_NCI<-AnnotatedDataFrame(gene_annotation_NCI)
#featureData(data_NCI)<-gene_annotation_NCI
gene_annotation_07<-links(hgu95av2SYMBOL[rownames(data_07)])
gene_annotation_07<-AnnotatedDataFrame(gene_annotation_07)
#featureData(data_07)<-gene_annotation_07

Question 2 To look at how sensitivity status was applied to the training data, I used hierarchical clustering to see how the training data and NCI60 data clustered together. In all plots, samples labeled resistant are colored green and sensitive are colored blue. From the hierarchical clustering it is clear that the labels on the training data were switched: almost all of the sensitive training data clusters with the resistant NCI60 data and vice versa. I also used PAM clustering with k=2, where we also see that those labeled resistant vs. sensitive (green and blue) do not agree with the PAM clustering (group 1 is triangles, group 2 is circles).

In addition, when using the table function to see how our clustering techniques compare to the “real” labels, it is obvious in both cases, but is slightly more clear in PAM clustering where there half of the data is clustered completely opposite to the labels.

library(dendextend)
## 
## Welcome to dendextend version 1.1.0
## 
## Type ?dendextend to access the overall documentation and
## browseVignettes(package = 'dendextend') for the package vignette.
## You can execute a demo of the package via: demo(dendextend)
## 
## More information is available on the dendextend project web-site:
## https://github.com/talgalili/dendextend/
## 
## Contact: <tal.galili@gmail.com>
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## 
##          To suppress the this message use:
##          suppressPackageStartupMessages(library(dendextend))
## 
## 
## Attaching package: 'dendextend'
## 
## The following object is masked from 'package:stats':
## 
##     cutree
#First I created a distance matrix based on 1-correlation.
common_data<-combine(data_NCI[rownames(data_NCI) %in% rownames(data_07),],data_07[,1:22])
r_NCI<-cor(exprs(common_data))
d_NCI<-1-r_NCI
dend_common<-as.dendrogram(hclust(as.dist(d_NCI),method="average"))
labels_colors(dend_common)<-ifelse(common_data$status=="Resistant","green","blue")[order.dendrogram(dend_common)]
dend_common<-set(dend_common,"branches_lwd",2)
plot(color_branches(dend_common,k=2),main="Hclust of training/NCI data")

table(common_data$status,cutree(dend_common,k=2))
##            
##              1  2
##   Resistant 13  9
##   Sensitive 13  9
library(cluster)
#PAM clustering
pam_common<-pam(as.dist(d_NCI),k=2,diss=TRUE)
clusplot(pam_common,col.p=ifelse(common_data$status=="Resistant","green","blue"),col.clus="black",main="PAM clustering of training/NCI data")

table(common_data$status,pam_common$clustering)
##            
##              1  2
##   Resistant 11 11
##   Sensitive 11 11

Question 3 Using hierarchical cluster on the distance matrix (1-correlation) on the test set there is no strong clustering of the assigned sensitivity statuses (x-axis labels; red is resistant and blue is sensitive), compared to the observed clustering when we cut the tree at k=2 (the colors of the branches). This is very likely due to duplication of samples and mislabeling of samples (including duplicated ones).

PAM clustering when preformed on the testing set shows similar results to the hierarchical clustering. The algorihm clustering is shown by triangles vs. circles while assigned sensitivity status is by color.

Looking at the PCA plot for the first 3 principal components, there does not seem to be any clear seperation by status. The first PC accounts for 67% of the variance, while the second only 1.8% of variance, and the third accounts for 1.3%. The first PC is mostly driven by a single outlier, which is Test95. We remove this outlier to see if there will be clearer seperation by drug status afterwards; however it is still difficult to see any clear seperation in the PCA plot. In addition, after the removal of Test95, the first PCA only accounds for 5.5% of total variance, the second 4.1%, and the third 3.8%.

Next we want to check the pairwise correlations between the test samples. After creating the correlation matrix and discarding the diagonal, it appears that only 84 of the 122 test samples are distinct, which is the same result found by Baggerly and Coombs. In addition, there were some samples that were duplicated up to four times.

#We want to preform QA/QC on the test set (stored in data_07).
#Hierarchical Clustering
r_07<-cor(exprs(data_07[,23:144]))
d_07<-1-r_07
dend_07<-as.dendrogram(hclust(as.dist(d_07)))
labels_colors(dend_07)<-ifelse(data_07$status[23:144]=="Resistant","red","blue")[order.dendrogram(dend_07)]
dend_07<-set(dend_07,"branches_lwd",2)
plot(color_branches(dend_07,k=2),main="Hclust of testing data")

#we want to check to see how our clustering matches to the labels:
table(data_07$status[23:144],cutree(dend_07,k=2))
##            
##              1  2
##   Resistant 55 44
##   Sensitive 17  6
#PAM clustering
pam_07<-pam(as.dist(d_07),k=2,diss=TRUE)
clusplot(pam_07,col.p=ifelse(data_07[,23:144]$status=="Resistant","red","blue"),col.clus="black",main="PAM clustering of test data")

#we want to check to see how our clustering matches to the labels:
table(data_07$status[23:144],pam_07$clustering)
##            
##              1  2
##   Resistant 39 60
##   Sensitive 15  8
#Dimensionality Reduction
#Testing set
pc_07<-prcomp(t(as.matrix(doxorubicin07Numbers[,23:144])))
#Resistant is red, sensitive is blue
pairs(pc_07$x[,1:3],col=ifelse(data_07[,23:144]$status=="Resistant","red","blue"),pch=ifelse(data_07[,23:144]$status=="Resistant",16,17),main="Samples by first 3 PCs")

#We should now check to see what % of variance is explained by these PCs
((pc_07$sdev)^2/sum(pc_07$sdev^2))[1:3]
## [1] 0.67488725 0.01797783 0.01330332
#Outlier in the first PCA
which.max(pc_07$x[,1])
## Test95 
##     95
#PCA without Test95
pc_outlier<-prcomp(t(as.matrix(doxorubicin07Numbers[,(23:144)[-95]])))
pairs(pc_outlier$x[,1:3],col=ifelse(data_07[,(23:144)[-95]]$status=="Resistant","red","blue"),pch=ifelse(data_07[,(23:144)[-95]]$status=="Resistant",16,17),main="PCA without Test95")

#% of variance
((pc_outlier$sdev)^2/sum(pc_outlier$sdev^2))[1:3]
## [1] 0.05523785 0.04089976 0.03788810
#Finding off-diagonal elements of correlation matrix that equal 1 (evidence of duplicated samples)
corr_matrix<-r_07
diag(corr_matrix)<-.5 #set diagonal elements to some non-1 value
log_matrix<-apply(corr_matrix,c(1,2),function(x) ifelse(x==1,TRUE,FALSE))
which(log_matrix,arr.ind=TRUE)
##         row col
## Test33   33   1
## Test36   36   2
## Test58   58   2
## Test87   87   2
## Test39   39   7
## Test44   44  10
## Test67   67  10
## Test97   97  10
## Test98   98  12
## Test99   99  13
## Test74   74  15
## Test103 103  15
## Test50   50  17
## Test76   76  17
## Test106 106  17
## Test51   51  26
## Test82   82  26
## Test1     1  33
## Test57   57  35
## Test2     2  36
## Test58   58  36
## Test87   87  36
## Test90   90  37
## Test92   92  38
## Test7     7  39
## Test65   65  40
## Test94   94  40
## Test96   96  43
## Test10   10  44
## Test67   67  44
## Test97   97  44
## Test70   70  46
## Test101 101  46
## Test17   17  50
## Test76   76  50
## Test106 106  50
## Test26   26  51
## Test82   82  51
## Test35   35  57
## Test2     2  58
## Test36   36  58
## Test87   87  58
## Test40   40  65
## Test94   94  65
## Test10   10  67
## Test44   44  67
## Test97   97  67
## Test46   46  70
## Test101 101  70
## Test15   15  74
## Test103 103  74
## Test17   17  76
## Test50   50  76
## Test106 106  76
## Test26   26  82
## Test51   51  82
## Test2     2  87
## Test36   36  87
## Test58   58  87
## Test37   37  90
## Test38   38  92
## Test40   40  94
## Test65   65  94
## Test43   43  96
## Test10   10  97
## Test44   44  97
## Test67   67  97
## Test12   12  98
## Test13   13  99
## Test46   46 101
## Test70   70 101
## Test15   15 103
## Test74   74 103
## Test17   17 106
## Test50   50 106
## Test76   76 106
122-(dim(which(log_matrix,arr.ind=TRUE))[1]/2)
## [1] 84
#We see that only 84 of the 122 test samples are distinct, which is also what was found by Baggerly and Coombs, and that some samples were duplicated up to four times.
high_corr<-(corr_matrix > 0.99999)
table(apply(high_corr,1,sum))
## 
##  0  1  2  3 
## 60 28 18 16

Question 4 Part A Cyclic loess and quantile normalization was used to normalize the doxorubicinNCI60Scaled data. To view the need for normalization, we look at the log2-transformed unnormalized data in a boxplot, colored by drug status. Fast cyclic loess as implemented in limma normalizes all arrays against a synthetic refrence array made by averaging across all samples. A loess regression is then fit for each array to the difference between array intensities and synthetic refrence intensities which is used to normalize each array. While the unnormalized data is generally centered around zero, we see that both loess and quantile normalization improve the distribution of intensities.

we then do 2 MD-plots prior to and post normalization for both normalization methods comparing the 2 most negatively correlated arrays. The x-axis is the rowmeans; that is, the probeset intensity averaged over the arrays being compared. The y-axis is the difference in probeset intensity between the two arrays being compared. We hope to see that difference in expression is independent of the mean expression of that probeset. Unfortunately it seems that cyclic loess did not preform as well as hoped; the loess fit after normalization is not centered on 0. However, it seems that quantile normalization worked very well; looking at the MD-plot after quantile normalization it is clear that the difference in expression is independent of mean expression. Thus we will use the quantile normalized data to proceed.

library(limma)
## 
## Attaching package: 'limma'
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
#log-2 transform the exprs matricies for both data sets; because microarray data is measured on a 16-bit scale we should re-log-transform the data.
boxplot(log2(exprs(data_NCI)),col=ifelse(data_NCI$status=="Resistant","red","blue"),main="Unnormalized data (red=resistant, blue=sensitive)")

data_NCInorm<-data_NCI
exprs(data_NCInorm)<-normalizeBetweenArrays(log2(exprs(data_NCI)),method="cyclicloess")
quant_norm<-normalizeQuantiles(log2(exprs(data_NCI)))
boxplot(exprs(data_NCInorm),col=ifelse(data_NCInorm$status=="Resistant","red","blue"),main="Loess normalized data (red=resistant, blue=sensitive)")

boxplot(quant_norm,col=ifelse(data_NCInorm$status=="Resistant","red","blue"),main="Quantile normalized data (red=resistant, blue=sensitive)")

#We want to use an MD plot to see how normalization affected the distribution of the two most negatively correlated samples.
mdplot_array<-function(x,y, ...){
  mean<-rowMeans(x[,y]+0.1)
  difference<-(x[,y[2]]+0.1)-(x[,y[1]]+0.1)
  smoothScatter(mean,difference,...)
  lines(lowess(mean,difference),col="red")
  abline(h=0,lty=2)
}

#To visualize how normalization will affect the data, we find the two arrays that have the largest negative correlation for comparison.
neg_samp<-cor(exprs(data_NCI))
arrayInd(which.min(neg_samp),dim(neg_samp))
##      [,1] [,2]
## [1,]   17    7
mdplot_array(log2(exprs(data_NCI)),y=c(7,18),main="Unnormalized intensities")

mdplot_array((exprs(data_NCInorm)),y=c(7,18),main="Loess normalized intensities")

mdplot_array(quant_norm,y=c(7,18),main="Quantile normalized intensities")

Part B Plotting the arrays against first 3 principal components and coloring by drug status reveals that the data shows evidence of clustering by the 2nd principal component, suggesting it is associated with drug response. For comparison we plot both quantile and loess normalized data, and it is encouraging that the results are very similar.

#Dimensionality Reduction
pc_NCI<-prcomp(t(exprs(data_NCInorm)))
pairs(pc_NCI$x[,1:3],col=ifelse(data_NCInorm$status=="Resistant","green","blue"),pch=ifelse(data_NCInorm$status=="Resistant",16,17),main="loess normalized")

#% of variance explained by the first 3 PCs
((pc_NCI$sdev)^2/sum(pc_NCI$sdev^2))[1:3]
## [1] 0.16638498 0.09386862 0.07021265
pc_quant<-prcomp(t(quant_norm))
pairs(pc_quant$x[,1:3],col=ifelse(data_NCInorm$status=="Resistant","green","blue"),pch=ifelse(data_NCInorm$status=="Resistant",16,17),main="quantile normalized")

#% of variance explained by the first 3 PCs
((pc_quant$sdev)^2/sum(pc_quant$sdev^2))[1:3]
## [1] 0.15828194 0.09763860 0.07064125
#Hierarchical Clustering
r_NCInorm<-cor(exprs(data_NCInorm))
d_NCInorm<-1-r_NCInorm
dend_NCI<-as.dendrogram(hclust(as.dist(d_NCInorm),method="average"))
labels_colors(dend_NCI)<-ifelse(data_NCInorm$status=="Resistant","red","blue")[order.dendrogram(dend_NCI)]
plot(color_branches(dend_NCI,k=2),main="Hclust of NCI data")

#PAM Clustering
p_NCI<-pam(as.dist(d_NCInorm),k=2,diss=TRUE)
clusplot(p_NCI,col.p=ifelse(data_NCInorm$status=="Resistant","red","blue"),main="PAM clustering of NCI data")

Part C The first test used to look for differences in mean expression levels between drug status was a two sample t-test. The function rowttests runs a t-test for each gene. It derives a test statistics by dividing the difference in group means for that probeset by the standard deviation of that gene across all samples, weighted by the number of samples. The p-values from the test were adjusted to control FDR by the Benjamini & Hochberg procedure. This method found 267 genes with an adjusted p-value < 0.05.

The second test I used to test for differences in mean expression levels between drug status was the Wilcoxon signed rank test. This test is non-parametric meaning that it does not assume that the expression measurements in each group are normally distributed. The Wilcoxon test ranks gene expression across all samples, and then preforms a two-tailed test of equality of means between the two groups of interest (sensitive or resistant) to gain a p-value. I then used the Benjamini & Hochberg method to control the FDR, a less conservative method of accounting for multiple testing than the Bonferroni correction to control the FWER. This method found 218 genes with an adjusted p-value < 0.05.

When I compared the 80 DE genes that Potti et al. identified as DE between resistant and sensitive cell lines with the top 80 DE genes from my list of top DE genes identified by t-test, I found 70 genes from Potti et al. included in the top 80 of my DE genes. Comparing the Potti et al. list to the top 80 DE genes from my list of top DE genes identified by Wilxocon signed rank test p-value statistic, I found 57 genes from Potti et al. included in the top 80 of my DE genes.

To better visualize genes that are both biologically and statistically significant, a volcano plot was made and “interesting” genes were plotted. A gene is called interesting if it has a significant adjusted p-value and a log fold-change greater than 2. The x-axis plot the log2 fold change of each probeset between resistant and sensitive, and the y-axis is -log10 of the adjusted p-value. A volcano plot thus applies “double filtering” to find interesting genes, the logic being that large outliers in one of the groups can cause large fold change, and that significant p-value could be caused by low variance or expression level. Double filtering will only return genes that are significant on both scales.

#DE by row t-tests
library(genefilter)
## 
## Attaching package: 'genefilter'
## 
## The following object is masked from 'package:base':
## 
##     anyNA
library(multtest)
ttest<-rowttests(data_NCInorm,fac=data_NCInorm$status)
mtest<-mt.rawp2adjp(ttest$p.value,proc="BH")
de_genes<-featureNames(data_NCInorm)[mtest$index[1:80]]
links(hgu95av2SYMBOL[de_genes])
##      probe_id   symbol
## 1   1051_g_at    MLANA
## 2      110_at    CSPG4
## 3     1319_at     DDR2
## 4     1519_at     ETS2
## 5     1537_at     EGFR
## 6   2011_s_at      BIK
## 7   2047_s_at      JUP
## 8    266_s_at     CD24
## 9    32139_at   ZNF185
## 10 32168_s_at    RCAN1
## 11   32612_at      GSN
## 12   32715_at    VAMP8
## 13   32821_at     LCN2
## 14   32967_at     FCMR
## 15 33004_g_at     NCK2
## 16   33240_at   PDZRN3
## 17   33409_at    FKBP2
## 18   33483_at      NMU
## 19   33824_at     KRT8
## 20   33842_at  FAM214B
## 21 33853_s_at     NRP2
## 22   33892_at     PKP2
## 23   33904_at    CLDN3
## 24   33906_at   SSSCA1
## 25   33908_at    CAPN1
## 26   33956_at     LY96
## 27   34213_at     WWC1
## 28   34348_at   SPINT2
## 29   34859_at   MAGED2
## 30   34885_at   SYNGR2
## 31   34993_at     SGCD
## 32   35280_at    LAMC2
## 33   35444_at     MISP
## 34   35628_at   TM7SF2
## 35   35630_at    LLGL2
## 36 35681_r_at     ZEB2
## 37   35766_at    KRT18
## 38   35807_at     CYBA
## 39   36133_at      DSP
## 40 36618_g_at      ID1
## 41 36619_r_at      ID1
## 42   36795_at     PSAP
## 43   36828_at   ZNF629
## 44   37222_at    GSTT1
## 45   37327_at     EGFR
## 46   37552_at    KCNK1
## 47   37695_at  RNF144A
## 48   37743_at     FEZ1
## 49   37749_at     MEST
## 50   37926_at     KLF5
## 51   38004_at    CSPG4
## 52   38078_at     FLNB
## 53   38119_at     GYPC
## 54   38297_at  PITPNM1
## 55   38379_at    GPNMB
## 56   38653_at    PMP22
## 57   39298_at  ST3GAL6
## 58   39316_at   RAB40C
## 59   39386_at MAD2L1BP
## 60   39801_at    PLOD3
## 61   39918_at  TUBGCP2
## 62   40103_at      EZR
## 63   40202_at     KLF9
## 64   40434_at    PODXL
## 65   40568_at ATP6V1B2
## 66   41158_at     PLP1
## 67   41294_at     KRT7
## 68   41359_at     PKP3
## 69   41378_at     SGCD
## 70   41453_at     DLG3
## 71   41503_at     ZHX2
## 72   41610_at    LAMA5
## 73   41644_at    SASH1
## 74   41839_at     GAS1
## 75   575_s_at    EPCAM
## 76     661_at     GAS1
## 77     999_at  CYP27A1
#DE by Wilcoxon-signed rank test
status<-data_NCInorm$status
wilcox<-apply(exprs(data_NCInorm),1,function(x) wilcox.test(x~status))
wilcox_result<-sapply(wilcox,function(x) x$p.value)
wilcox_result<-as.data.frame(wilcox_result)
wilcox_result$FDR<-p.adjust(wilcox_result[,1],method="fdr")
wilcox_result<-wilcox_result[order(wilcox_result$FDR),]
wilcox_result$gene<-sapply(mget(rownames(wilcox_result),hgu95av2SYMBOL),function(x) x)

#Comparing results to results of Potti et al.
#Comparing to t-test DE genes
sum(doxorubicinGenes %in% de_genes)
## [1] 70
#Comparing to Wilcoxon signed rank test DE genes
sum(doxorubicinGenes %in% rownames(wilcox_result)[1:80])
## [1] 57
#Getting Log FC and making volcano plot
sens_mean<-rowMeans(2^(exprs(data_NCInorm[,status=="Sensitive"])))
res_mean<-rowMeans(2^(exprs(data_NCInorm[,status=="Resistant"])))
log_fc<-log2(sens_mean/res_mean)
log_p<--log10(ttest$p.value)
names(log_p)<-names(log_fc)
plot(log_fc,log_p,main="Volcano Plot (DE genes by status)",xlab="log2(FC)",ylab="-log10(p-value)",col=rgb(0,0,1,.3))

#plotting interesting genes
sum(log_p > -log10(0.05/length(log_p)))
## [1] 16
sum(abs(log_fc) > 2)
## [1] 464
sum(log_p > -log10(0.05/length(log_p)) & abs(log_fc) > 2)
## [1] 16
points(log_fc[which(log_p > -log10(0.5/length(log_p)) & abs(log_fc) > 2)],log_p[which(log_p > -log10(0.5/length(log_p)) & abs(log_fc) > 2)],col="red",pch=16)
text(log_fc[which(log_p > -log10(0.5/length(log_p)) & abs(log_fc) > 2)],log_p[which(log_p > -log10(0.5/length(log_p)) & abs(log_fc) > 2)],labels=links(hgu95av2SYMBOL[names(log_fc)[which(log_p > -log10(0.5/length(log_p)) & abs(log_fc) > 2)]])[,"symbol"],pos=1,col="red")