HPV positive vs HPV negative analysis for Head and Neck cancer (OroPharyngeal Squamous Cell Carcinoma) dataset - GSE55542

Load Required Packages

# if a new package from bioconductor has to be installed. write following two commands
# source("https://bioconductor.org/biocLite.R")
# biocLite("package name")
library(GEOquery)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Setting options('download.file.method.GEOquery'='auto')
library(limma)
## 
## Attaching package: 'limma'
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
#install.packages("rafalib")
library("rafalib")

Download input data

gset55542 <- getGEO("GSE55542", GSEMatrix =TRUE)
## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE55nnn/GSE55542/matrix/
## Found 1 file(s)
## GSE55542_series_matrix.txt.gz
## File stored at: 
## /var/folders/nt/jmcs3y6d4d3ghsv7j3yzpfrh0000gn/T//Rtmp4BhpRU/GPL17077.soft
#setwd("/Users/shruti/Dropbox/SHRUTIM/Microarray/Microarray_R_Scripts/HNSCC/HPV/GSE55542/")
#save(gset55542,file="./final/gset55542.rda")
#load("/Users/shruti/Dropbox/SHRUTIM/Microarray/Microarray_R_Scripts/HNSCC/HPV/GSE55542/final/gset55542.rda")

Data Munging

GPLid <- levels((gset55542[[1]])$platform_id)
if (length(gset55542) > 1) 
{
  idx <- grep(GPLid, attr(gset55542, "names"))
} else 
{ 
  idx <- 1
}
gset55542 <- gset55542[[idx]]

# log2 transform
ex55542 <- exprs(gset55542)
qx <- as.numeric(quantile(ex55542, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
  (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex55542[which(ex55542 <= 0)] <- NaN
            exprs(gset55542) <- log2(ex55542) }

## look at the pattern in which the sample types are arranged. 
#colnames(pData(gset55542))
#pData(gset55542)[1:2,]
#pData(gset55542)[1:2,c(1,2,7,8,10,11,13,19,20)]
#apply(pData(gset55542)[,c(1,8,10,11)],2,table)

sample_information <- pData(gset55542)[,c(1,2,8,10,11)]
sample_information2 <- sample_information
sample_information2$source_name_ch1 <- sapply(sample_information2$source_name_ch1,function(i) {sub("Oropharyngeal squamous cell carcinoma - ","",i)})
sample_information2$characteristics_ch1 <- sapply(sample_information2$characteristics_ch1,function(i) {sub("gender: ","",i)})
sample_information2$characteristics_ch1.1 <- sapply(sample_information2$characteristics_ch1.1,function(i) {sub("race: ","",i)})

HPV_status <- factor(sample_information2$source_name_ch1)

Exploratory Data Analysis for all samples

# MultiDimension Scaling plots
dist_matrix_55542 <- dist(t(ex55542)) # very important to take transpose "t"
mds_55542 <- cmdscale(dist_matrix_55542)

mypar(1,1)
plot(mds_55542[,1],mds_55542[,2],bg=as.numeric(HPV_status),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: HPV status")
legend("topright",levels(HPV_status),col=seq(along=levels(HPV_status)),pch=15,cex=1)
identify(mds_55542)

## integer(0)
sample_information2[14,]
##                                                        title geo_accession
## GSM1338943 Analysis 1: HPV Inactive - Biological replicate 2    GSM1338943
##            source_name_ch1 characteristics_ch1 characteristics_ch1.1
## GSM1338943    HPV Inactive              Female     European American
plot(mds_55542[,1],mds_55542[,2],bg=as.numeric(factor(sample_information$characteristics_ch1)),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Gender")
legend("bottomleft",levels(factor(sample_information$characteristics_ch1)),col=seq(along=levels(factor(sample_information$characteristics_ch1))),pch=15,cex=0.8)

plot(mds_55542[,1],mds_55542[,2],bg=as.numeric(factor(sample_information$characteristics_ch1.1)),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Race")
legend("topright",levels(factor(sample_information$characteristics_ch1.1)),col=seq(along=levels(factor(sample_information$characteristics_ch1.1))),pch=15,cex=0.8)

HPV Active Vs HPV Negative

HPVactive_gsmids <- rownames(subset(sample_information2,source_name_ch1=="HPV Active",))
HPVinactive_gsmids <- rownames(subset(sample_information2,source_name_ch1=="HPV Inactive",))
HPVnegative_gsmids <- rownames(subset(sample_information2,source_name_ch1=="HPV Negative",))
sample_information_HPV_active_negative <- sample_information2[c(HPVactive_gsmids,HPVnegative_gsmids),]

gset55542_HPV_active_negative <- gset55542[,c(HPVactive_gsmids,HPVnegative_gsmids)]
ex55542_HPV_active_negative <- ex55542[,c(HPVactive_gsmids,HPVnegative_gsmids)]
HPV_status_active_negative <- factor(sample_information_HPV_active_negative$source_name_ch1,levels=c("HPV Negative","HPV Active"))
  
dist_matrix_55542_HPV_active_negative <- dist(t(ex55542_HPV_active_negative)) # very important to take transpose "t"
mds_55542_HPV_active_negative <- cmdscale(dist_matrix_55542_HPV_active_negative)

plot(mds_55542_HPV_active_negative[,1],mds_55542_HPV_active_negative[,2],bg=as.numeric(HPV_status_active_negative),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: HPV status")
legend("topright",levels(HPV_status_active_negative),col=seq(along=levels(HPV_status_active_negative)),pch=15,cex=1)
identify(mds_55542_HPV_active_negative)
## integer(0)
c(HPVactive_gsmids,HPVnegative_gsmids)[20]
## [1] "GSM1338957"
sample_information_HPV_active_negative[20,]
##                                                        title geo_accession
## GSM1338957 Analysis 1: HPV Negative - Biological replicate 8    GSM1338957
##            source_name_ch1 characteristics_ch1 characteristics_ch1.1
## GSM1338957    HPV Negative                Male      African American
text(mds_55542_HPV_active_negative[,1],mds_55542_HPV_active_negative[,2],labels=rownames(mds_55542_HPV_active_negative),cex=0.8)

hierarchial_cluster_HPV_active_negative <- hclust(dist_matrix_55542_HPV_active_negative)
plot(hierarchial_cluster_HPV_active_negative,cex=0.8,main="Hierarchical clustering of samples")

plot(hierarchial_cluster_HPV_active_negative,cex=0.8,main="Hierarchical clustering of samples",label=HPV_status_active_negative)

sample_information_HPV_active_negative[c("GSM1338957","GSM1338940"),]
##                                                        title geo_accession
## GSM1338957 Analysis 1: HPV Negative - Biological replicate 8    GSM1338957
## GSM1338940  Analysis 1: HPV Active - Biological replicate 11    GSM1338940
##            source_name_ch1 characteristics_ch1 characteristics_ch1.1
## GSM1338957    HPV Negative                Male      African American
## GSM1338940      HPV Active                Male     European American
sample_information_HPV_active_negative[c("GSM1338960","GSM1338936"),]
##                                                         title
## GSM1338960 Analysis 1: HPV Negative - Biological replicate 11
## GSM1338936    Analysis 1: HPV Active - Biological replicate 7
##            geo_accession source_name_ch1 characteristics_ch1
## GSM1338960    GSM1338960    HPV Negative                Male
## GSM1338936    GSM1338936      HPV Active                Male
##            characteristics_ch1.1
## GSM1338960     European American
## GSM1338936     European American
plot(mds_55542_HPV_active_negative[,1],mds_55542_HPV_active_negative[,2],bg=as.numeric(factor(sample_information_HPV_active_negative$characteristics_ch1.1)),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Race status")
legend("topleft",levels(factor(sample_information_HPV_active_negative$characteristics_ch1.1)),col=seq(along=levels(factor(sample_information_HPV_active_negative$characteristics_ch1.1))),pch=15,cex=1)

# since "GSM1338957" sample is clustering with HPV active samples, remove it from analysis
#gset55542_HPV_active_negative2 <- gset55542_HPV_active_negative[,-grep("GSM1338957",sampleNames(gset55542_HPV_active_negative))]
gset55542_HPV_active_negative2 <- gset55542_HPV_active_negative[,-20]
ex55542_HPV_active_negative2 <- ex55542_HPV_active_negative[,-20] 
sample_information_HPV_active_negative2 <- sample_information_HPV_active_negative[,-20]
HPV_status_active_negative2 <- HPV_status_active_negative[-20]

dist_matrix_55542_HPV_active_negative2 <- dist(t(ex55542_HPV_active_negative2)) # very important to take transpose "t"
mds_55542_HPV_active_negative2 <- cmdscale(dist_matrix_55542_HPV_active_negative2)

plot(mds_55542_HPV_active_negative2[,1],mds_55542_HPV_active_negative2[,2],bg=as.numeric(HPV_status_active_negative2),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: HPV status")
legend("topright",levels(HPV_status_active_negative2),col=seq(along=levels(HPV_status_active_negative2)),pch=15,cex=1)

RunLimma <- function(gset,groups){
  design_matrix <- model.matrix(~0+groups)
  colnames(design_matrix) <- c("CONTROL","TREATMENT")
  # to check order is correct
  # head(design_matrix)
  
  ## fit linear model for each gene given a series of arrays
  fit <- lmFit(gset, design_matrix)
  
  ## tell which levels should be compared
  colnames(design_matrix)
  cont.matrix <- makeContrasts(TREATMENT-CONTROL, levels=design_matrix)
  
  ## given a linear model fit to microarray data, compute estimated coefficients & standard errors for given set of contratsts
  fit2 <- contrasts.fit(fit, cont.matrix)
  fit2 <- eBayes(fit2, 0.01)
  
  ## to get the no.of genes in each array
  nrow_gset <- length(featureNames(gset))
  tT <- topTable(fit2, adjust="fdr", sort.by="logFC",number=nrow_gset)
  colnames(tT)
  tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","logFC","t","B","AveExpr","GENE_SYMBOL","GENE_NAME","ENSEMBL_ID","REFSEQ","GB_ACC","UNIGENE_ID"))
  return(tT)
}

# HPVpositive vs HPVnegative
gse55542_HPV_positiveVsnegative <- RunLimma(gset55542_HPV_active_negative2,HPV_status_active_negative2)

# volcano plot
plot(gse55542_HPV_positiveVsnegative$logFC,-log10(gse55542_HPV_positiveVsnegative$adj.P.Val), xlab="Effect size",ylab="- log (base 10) adj.p-values",main="Volcano Plot for HPVpositive vs HPVnegative")

library("biomaRt")
## Warning: package 'biomaRt' was built under R version 3.2.2
## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "externalRefMethod" of class "expressionORfunction";
## definition not updated
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "externalRefMethod" of class "functionORNULL";
## definition not updated
listMarts(host="www.ensembl.org")
##                biomart               version
## 1 ENSEMBL_MART_ENSEMBL      Ensembl Genes 82
## 2     ENSEMBL_MART_SNP  Ensembl Variation 82
## 3 ENSEMBL_MART_FUNCGEN Ensembl Regulation 82
## 4    ENSEMBL_MART_VEGA               Vega 62
## 5                pride        PRIDE (EBI UK)
database <- useMart("ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host="www.ensembl.org")
listDatasets(database)[grep("sapiens",listDatasets(database)$description,),]
##                  dataset                    description   version
## 32 hsapiens_gene_ensembl Homo sapiens genes (GRCh38.p3) GRCh38.p3
## Filters (one or more) that should be used in the query. So first check what do the probes represent.
#pData(gset55542)[1,c(12:20)]
#pData(gset55542)[1,"hyb_protocol"] # "Agilent-039494 SurePrint G3 Human GE v2 8x60K Microarray (G4858A-039494)"
#head(gse55542_HPV_positiveVsnegative$ID)
filters <- listFilters(database)
#head(filters)
filters[grep("agilent",filters$description,ignore.case=T),]
##                                          name
## 144                      with_agilent_cgh_44b
## 145     with_efg_agilent_wholegenome_4x44k_v1
## 146     with_efg_agilent_wholegenome_4x44k_v2
## 147    with_efg_agilent_sureprint_g3_ge_8x60k
## 148 with_efg_agilent_sureprint_g3_ge_8x60k_v2
## 177                           agilent_cgh_44b
## 178         efg_agilent_sureprint_g3_ge_8x60k
## 179      efg_agilent_sureprint_g3_ge_8x60k_v2
## 180          efg_agilent_wholegenome_4x44k_v1
## 181          efg_agilent_wholegenome_4x44k_v2
##                                                          description
## 144                                 with Agilent CGH 44b probe ID(s)
## 145                      with Efg agilent wholegenome 4x44k v1 ID(s)
## 146                      with Efg agilent wholegenome 4x44k v2 ID(s)
## 147                     with Efg agilent sureprint g3 ge 8x60k ID(s)
## 148                  with Efg agilent sureprint g3 ge 8x60k v2 ID(s)
## 177                  Agilent CGH 44b probe ID(s) [e.g. A_14_P131077]
## 178   Agilent Sureprint G3 GE 8x60k probe ID(s) [e.g. A_33_P3356022]
## 179 Agilent Sureprint G3 GE 8x60k v2 probe ID(s) [e.g. A_24_P368544]
## 180     Agilent WholeGenome 4x44k v1 probe ID(s) [e.g. A_32_P196615]
## 181    Agilent WholeGenome 4x44k v2 probe ID(s) [e.g. A_33_P3356022]
filters[179,]
##                                     name
## 179 efg_agilent_sureprint_g3_ge_8x60k_v2
##                                                          description
## 179 Agilent Sureprint G3 GE 8x60k v2 probe ID(s) [e.g. A_24_P368544]
## attribites are values that you are interested in to retrieve
attributes <- listAttributes(database)
#head(attributes)
grep("entrez",attributes$description,ignore.case=T)
## [1] 58 59
grep("UNIGENE",attributes$description,ignore.case=T)
## [1] 91
grep("symbol",attributes$description,ignore.case=T)
## [1] 65
grep("agilent",attributes$description,ignore.case=T)
## [1] 100 101 102 103 123
attributes[c(1,58,59,65,91,100:103,123),]
##                                     name
## 1                        ensembl_gene_id
## 58                            entrezgene
## 59            entrezgene_transcript_name
## 65                           hgnc_symbol
## 91                               unigene
## 100    efg_agilent_sureprint_g3_ge_8x60k
## 101 efg_agilent_sureprint_g3_ge_8x60k_v2
## 102     efg_agilent_wholegenome_4x44k_v1
## 103     efg_agilent_wholegenome_4x44k_v2
## 123                      agilent_cgh_44b
##                                description
## 1                          Ensembl Gene ID
## 58                           EntrezGene ID
## 59           EntrezGene transcript name ID
## 65                             HGNC symbol
## 91                              Unigene ID
## 100    Agilent SurePrint G3 GE 8x60k probe
## 101 Agilent SurePrint G3 GE 8x60k v2 probe
## 102     Agilent WholeGenome 4x44k v1 probe
## 103     Agilent WholeGenome 4x44k v2 probe
## 123                  Agilent CGH 44b probe
gene_ids <- getBM(attributes=c("entrezgene","unigene","efg_agilent_sureprint_g3_ge_8x60k_v2"), filters="efg_agilent_sureprint_g3_ge_8x60k_v2", values=gse55542_HPV_positiveVsnegative$ID, mart=database, uniqueRows=T)
gse55542_HPV_positiveVsnegative2 <- merge(gse55542_HPV_positiveVsnegative,gene_ids,by.x="ID",by.y="efg_agilent_sureprint_g3_ge_8x60k_v2",all.x=T)
RemoveRows <- function(MatrixName,ColumnName){ #ColumnName is name of column containing gene id
  # remove rows that contain "NA" in any column
  complete <- complete.cases(MatrixName)
  Matrix_complete <- MatrixName[complete,]
  # remove rows that contain "" in column for gene id
  nonempty_gene_ids <- Matrix_complete[,ColumnName]!=""
  Matrix2 <- Matrix_complete[nonempty_gene_ids,]
  
  return(Matrix2)
}

gse55542_HPV_positiveVsnegative3 <- RemoveRows(gse55542_HPV_positiveVsnegative2,"entrezgene")
DEG_gse55542_HPV_positiveVsnegative <- subset(gse55542_HPV_positiveVsnegative3,adj.P.Val<0.05 & abs(logFC)>0.585,)
#write.table(DEG_gse55542_HPV_positiveVsnegative, file="./rough/DEG_gse55542_HPV_positiveVsnegative.txt", row.names=F, sep="\t")
#save(DEG_gse55542_HPV_positiveVsnegative, file="./rough/DEG_gse55542_HPV_positiveVsnegative.rda")