HPV positive vs HPV negative analysis for Head and Neck cancer dataset - GSE3292

Load Required Packages

# if a new package from bioconductor has to be installed. write following two commands
# source("https://bioconductor.org/biocLite.R")
# biocLite("package name")
library(GEOquery)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Setting options('download.file.method.GEOquery'='auto')
library(limma)
## 
## Attaching package: 'limma'
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
#install.packages("rafalib")
library("rafalib")

Download input data

gset3292 <- getGEO("GSE3292", GSEMatrix =TRUE)
## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE3nnn/GSE3292/matrix/
## Found 1 file(s)
## GSE3292_series_matrix.txt.gz
## File stored at: 
## /var/folders/nt/jmcs3y6d4d3ghsv7j3yzpfrh0000gn/T//Rtmp9tJVTO/GPL570.soft
#setwd("/Users/shruti/Dropbox/SHRUTIM/Microarray/Microarray_R_Scripts/HNSCC/HPV/GSE3292/")
#save(gset3292,file="./final/gset3292.rda")
#load("/Users/shruti/Dropbox/SHRUTIM/Microarray/Microarray_R_Scripts/HNSCC/HPV/GSE3292/final/gset3292.rda")

Data Munging

GPLid <- levels((gset3292[[1]])$platform_id)
if (length(gset3292) > 1) 
{
  idx <- grep(GPLid, attr(gset3292, "names"))
} else 
{ 
  idx <- 1
}
gset3292 <- gset3292[[idx]]

# log2 transform
ex3292 <- exprs(gset3292)
qx <- as.numeric(quantile(ex3292, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
  (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex3292[which(ex3292 <= 0)] <- NaN
            exprs(gset3292) <- log2(ex3292) }

## look at the pattern in which the sample types are arranged. 
#colnames(pData(gset3292))
#pData(gset3292)[1:2,]
#pData(gset3292)[1:2,c(1,2,6,10,16)]

# since HPV information is not available through gset, downlaod its meta data
dfl3292 <- getGSEDataTables("GSE3292")
#save(dfl3292,file="./final/dfl3292.rda")
#load("/Users/shruti/Dropbox/SHRUTIM/Microarray/Microarray_R_Scripts/HNSCC/HPV/GSE3292/final/dfl3292.rda")

#dfl3292[[2]]
gse3292_metadata <- dfl3292[[2]][complete.cases(dfl3292[[2]]),]
# find a common column between gse3292_metadata and padata to merge them
#pData(gset3292)[1:2,]
#gse3292_metadata[1:2,]
head(pData(gset3292)[,1])
##              V2              V3              V4              V5 
## VU HNSCCV300142 VU HNSCCV300171 VU HNSCCV300251 VU HNSCCV300373 
##              V6              V7 
## VU HNSCCV300383 VU HNSCCV300400 
## 36 Levels: UNC HNSCC00-0318 UNC HNSCC01-0291 ... VU HNSCCV300710
head(gse3292_metadata[,1:2])
##   Institute     ID
## 1        VU 300142
## 2        VU 300148
## 3        VU 300171
## 4        VU 300251
## 5        VU 300373
## 6        VU 300383
# as you can see, Institute and ID from gse3292_metadata and title from pData(gset3292) are similar but not identical.
titles <- pData(gset3292)$title
titles <- gsub("\\D+","",titles)
pData2 <- cbind(titles,pData(gset3292)[,1:2])

# In ID from gse3292_metadata "0" have been removed infront some. In order to make sure it matches titles from pData, one has too add them 
# add zero infront of entries 16-33. One has to be very careful
Ids <- gse3292_metadata$ID
#Ids
#Ids[16]
#titles[grep("318",titles)]
Ids[16] <- paste("000",Ids[16],sep="")

titles[grep("30079",titles)]
## [1] "030079"
#Ids[33]
Ids[17:33] <- sapply(Ids[17:33],function(i) {paste("0",i,sep="")},USE.NAMES =F)
#table(sapply(Ids,nchar))
gse3292_metadata2 <- cbind(Ids,gse3292_metadata)

# now merge gse3292_metadata and pData(gset3292) # the order of Ids and titles are not same
#Ids
#pData2
# very imp to keep sort as False inorder to maintian the order of gsm ids as in gset
sample_information <- merge(pData2,gse3292_metadata2,by.x="titles",by.y="Ids",sort=F) 
#head(sample_information)
# ensure the order of gsm files in sample_information is same as in gsetfile
setdiff(pData2$geo_accession,sample_information$geo_accession)
## character(0)
#apply(sample_information[,c(4,7:12,15,16,17)],2,table)

HPV_status <- factor(sample_information$HPV,levels=c("Negative","Positive"))

Exploratory Data Analysis

dist_matrix_3292 <- dist(t(ex3292)) # very important to take transpose "t"
mds_3292 <- cmdscale(dist_matrix_3292)

mypar(1,1)
plot(mds_3292[,1],mds_3292[,2],bg=as.numeric(HPV_status),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: HPV status")
legend("bottomleft",levels(HPV_status),col=seq(along=levels(HPV_status)),pch=15,cex=1)

plot(mds_3292[,1],mds_3292[,2],bg=as.numeric(factor(sample_information$Site)),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Anatomical Location")
legend("bottomleft",levels(factor(sample_information$Site)),col=seq(along=levels(factor(sample_information$Site))),pch=15,cex=1)

plot(mds_3292[,1],mds_3292[,2],bg=as.numeric(factor(sample_information$Institute)),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Institute")
legend("bottomleft",levels(factor(sample_information$Institute)),col=seq(along=levels(factor(sample_information$Institute))),pch=15,cex=1)

plot(mds_3292[,1],mds_3292[,2],bg=as.numeric(factor(sample_information$Sex)),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Sex Status")
legend("bottomleft",levels(factor(sample_information$Sex)),col=seq(along=levels(factor(sample_information$Sex))),pch=15,cex=1)

plot(mds_3292[,1],mds_3292[,2],bg=as.numeric(factor(sample_information$Race)),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Race")
legend("bottomleft",levels(factor(sample_information$Race)),col=seq(along=levels(factor(sample_information$Race))),pch=15,cex=1)

plot(mds_3292[,1],mds_3292[,2],bg=as.numeric(factor(sample_information$Tobacco)),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Tobacco Status")
legend("bottomleft",levels(factor(sample_information$Tobacco)),col=seq(along=levels(factor(sample_information$Tobacco))),pch=15,cex=1)

plot(mds_3292[,1],mds_3292[,2],bg=as.numeric(factor(sample_information$Grade)),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Grade of Cancer")
legend("bottomleft",levels(factor(sample_information$Grade)),col=seq(along=levels(factor(sample_information$Grade))),pch=15,cex=1)

RunLimma <- function(gset,groups){
  design_matrix <- model.matrix(~0+groups)
  colnames(design_matrix) <- c("CONTROL","TREATMENT")
  # to check order is correct
  # head(design_matrix)
  
  ## fit linear model for each gene given a series of arrays
  fit <- lmFit(gset, design_matrix)
  
  ## tell which levels should be compared
  colnames(design_matrix)
  cont.matrix <- makeContrasts(TREATMENT-CONTROL, levels=design_matrix)
  
  ## given a linear model fit to microarray data, compute estimated coefficients & standard errors for given set of contratsts
  fit2 <- contrasts.fit(fit, cont.matrix)
  fit2 <- eBayes(fit2, 0.01)
  
  ## to get the no.of genes in each array
  nrow_gset <- length(featureNames(gset))
  tT <- topTable(fit2, adjust="fdr", sort.by="logFC",number=nrow_gset)
  colnames(tT)
  tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","logFC","t","B","AveExpr","ENTREZ_GENE_ID","Gene.Symbol","Gene.Title"))
  return(tT)
}

gse3292_HPV_positiveVsnegative <- RunLimma(gset3292,HPV_status)
RemoveRows <- function(MatrixName,ColumnName){ #ColumnName is name of column containing gene id
  # remove rows that contain "NA" in any column
  complete <- complete.cases(MatrixName)
  Matrix_complete <- MatrixName[complete,]
  # remove rows that contain "" in column for gene id
  nonempty_gene_ids <- Matrix_complete[,ColumnName]!=""
  Matrix2 <- Matrix_complete[nonempty_gene_ids,]
  
  return(Matrix2)
}

gse3292_HPV_positiveVsnegative2 <- RemoveRows(gse3292_HPV_positiveVsnegative,"ENTREZ_GENE_ID")
# volcano plot
plot(gse3292_HPV_positiveVsnegative2$logFC,-log10(gse3292_HPV_positiveVsnegative2$adj.P.Val), xlab="Effect size",ylab="- log (base 10) adj.p-values",main="Volcano Plot for HPVpositive vs HPVnegative")

DEG_gse3292_HPV_positiveVsnegative <- subset(gse3292_HPV_positiveVsnegative2,adj.P.Val<0.05 & abs(logFC)>0.585,)
#write.table(DEG_gse3292_HPV_positiveVsnegative, file="./rough/DEG_gse3292_HPV_positiveVsnegative.txt", row.names=F, sep="\t")
#save(DEG_gse3292_HPV_positiveVsnegative, file="./rough/DEG_gse3292_HPV_positiveVsnegative.rda")