HPV positive vs HPV negative analysis for Head and Neck cancer (OroPharyngeal Squamous Cell Carcinoma) dataset - GSE55542
Load Required Packages
# if a new package from bioconductor has to be installed. write following two commands
# source("https://bioconductor.org/biocLite.R")
# biocLite("package name")
library(GEOquery)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist, unsplit
##
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Setting options('download.file.method.GEOquery'='auto')
library(limma)
##
## Attaching package: 'limma'
##
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
#install.packages("rafalib")
library("rafalib")
Download input data
gset55542 <- getGEO("GSE55542", GSEMatrix =TRUE)
## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE55nnn/GSE55542/matrix/
## Found 1 file(s)
## GSE55542_series_matrix.txt.gz
## File stored at:
## /var/folders/nt/jmcs3y6d4d3ghsv7j3yzpfrh0000gn/T//Rtmp4BhpRU/GPL17077.soft
#setwd("/Users/shruti/Dropbox/SHRUTIM/Microarray/Microarray_R_Scripts/HNSCC/HPV/GSE55542/")
#save(gset55542,file="./final/gset55542.rda")
#load("/Users/shruti/Dropbox/SHRUTIM/Microarray/Microarray_R_Scripts/HNSCC/HPV/GSE55542/final/gset55542.rda")
Data Munging
GPLid <- levels((gset55542[[1]])$platform_id)
if (length(gset55542) > 1)
{
idx <- grep(GPLid, attr(gset55542, "names"))
} else
{
idx <- 1
}
gset55542 <- gset55542[[idx]]
# log2 transform
ex55542 <- exprs(gset55542)
qx <- as.numeric(quantile(ex55542, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0) ||
(qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex55542[which(ex55542 <= 0)] <- NaN
exprs(gset55542) <- log2(ex55542) }
## look at the pattern in which the sample types are arranged.
#colnames(pData(gset55542))
#pData(gset55542)[1:2,]
#pData(gset55542)[1:2,c(1,2,7,8,10,11,13,19,20)]
#apply(pData(gset55542)[,c(1,8,10,11)],2,table)
sample_information <- pData(gset55542)[,c(1,2,8,10,11)]
sample_information2 <- sample_information
sample_information2$source_name_ch1 <- sapply(sample_information2$source_name_ch1,function(i) {sub("Oropharyngeal squamous cell carcinoma - ","",i)})
sample_information2$characteristics_ch1 <- sapply(sample_information2$characteristics_ch1,function(i) {sub("gender: ","",i)})
sample_information2$characteristics_ch1.1 <- sapply(sample_information2$characteristics_ch1.1,function(i) {sub("race: ","",i)})
HPV_status <- factor(sample_information2$source_name_ch1)
Exploratory Data Analysis for all samples
# MultiDimension Scaling plots
dist_matrix_55542 <- dist(t(ex55542)) # very important to take transpose "t"
mds_55542 <- cmdscale(dist_matrix_55542)
mypar(1,1)
plot(mds_55542[,1],mds_55542[,2],bg=as.numeric(HPV_status),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: HPV status")
legend("topright",levels(HPV_status),col=seq(along=levels(HPV_status)),pch=15,cex=1)
identify(mds_55542)
## integer(0)
sample_information2[14,]
## title geo_accession
## GSM1338943 Analysis 1: HPV Inactive - Biological replicate 2 GSM1338943
## source_name_ch1 characteristics_ch1 characteristics_ch1.1
## GSM1338943 HPV Inactive Female European American
plot(mds_55542[,1],mds_55542[,2],bg=as.numeric(factor(sample_information$characteristics_ch1)),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Gender")
legend("bottomleft",levels(factor(sample_information$characteristics_ch1)),col=seq(along=levels(factor(sample_information$characteristics_ch1))),pch=15,cex=0.8)
plot(mds_55542[,1],mds_55542[,2],bg=as.numeric(factor(sample_information$characteristics_ch1.1)),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Race")
legend("topright",levels(factor(sample_information$characteristics_ch1.1)),col=seq(along=levels(factor(sample_information$characteristics_ch1.1))),pch=15,cex=0.8)
HPV Active Vs HPV Negative
HPVactive_gsmids <- rownames(subset(sample_information2,source_name_ch1=="HPV Active",))
HPVinactive_gsmids <- rownames(subset(sample_information2,source_name_ch1=="HPV Inactive",))
HPVnegative_gsmids <- rownames(subset(sample_information2,source_name_ch1=="HPV Negative",))
sample_information_HPV_active_negative <- sample_information2[c(HPVactive_gsmids,HPVnegative_gsmids),]
gset55542_HPV_active_negative <- gset55542[,c(HPVactive_gsmids,HPVnegative_gsmids)]
ex55542_HPV_active_negative <- ex55542[,c(HPVactive_gsmids,HPVnegative_gsmids)]
HPV_status_active_negative <- factor(sample_information_HPV_active_negative$source_name_ch1,levels=c("HPV Negative","HPV Active"))
dist_matrix_55542_HPV_active_negative <- dist(t(ex55542_HPV_active_negative)) # very important to take transpose "t"
mds_55542_HPV_active_negative <- cmdscale(dist_matrix_55542_HPV_active_negative)
plot(mds_55542_HPV_active_negative[,1],mds_55542_HPV_active_negative[,2],bg=as.numeric(HPV_status_active_negative),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: HPV status")
legend("topright",levels(HPV_status_active_negative),col=seq(along=levels(HPV_status_active_negative)),pch=15,cex=1)
identify(mds_55542_HPV_active_negative)
## integer(0)
c(HPVactive_gsmids,HPVnegative_gsmids)[20]
## [1] "GSM1338957"
sample_information_HPV_active_negative[20,]
## title geo_accession
## GSM1338957 Analysis 1: HPV Negative - Biological replicate 8 GSM1338957
## source_name_ch1 characteristics_ch1 characteristics_ch1.1
## GSM1338957 HPV Negative Male African American
text(mds_55542_HPV_active_negative[,1],mds_55542_HPV_active_negative[,2],labels=rownames(mds_55542_HPV_active_negative),cex=0.8)
hierarchial_cluster_HPV_active_negative <- hclust(dist_matrix_55542_HPV_active_negative)
plot(hierarchial_cluster_HPV_active_negative,cex=0.8,main="Hierarchical clustering of samples")
plot(hierarchial_cluster_HPV_active_negative,cex=0.8,main="Hierarchical clustering of samples",label=HPV_status_active_negative)
sample_information_HPV_active_negative[c("GSM1338957","GSM1338940"),]
## title geo_accession
## GSM1338957 Analysis 1: HPV Negative - Biological replicate 8 GSM1338957
## GSM1338940 Analysis 1: HPV Active - Biological replicate 11 GSM1338940
## source_name_ch1 characteristics_ch1 characteristics_ch1.1
## GSM1338957 HPV Negative Male African American
## GSM1338940 HPV Active Male European American
sample_information_HPV_active_negative[c("GSM1338960","GSM1338936"),]
## title
## GSM1338960 Analysis 1: HPV Negative - Biological replicate 11
## GSM1338936 Analysis 1: HPV Active - Biological replicate 7
## geo_accession source_name_ch1 characteristics_ch1
## GSM1338960 GSM1338960 HPV Negative Male
## GSM1338936 GSM1338936 HPV Active Male
## characteristics_ch1.1
## GSM1338960 European American
## GSM1338936 European American
plot(mds_55542_HPV_active_negative[,1],mds_55542_HPV_active_negative[,2],bg=as.numeric(factor(sample_information_HPV_active_negative$characteristics_ch1.1)),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Race status")
legend("topleft",levels(factor(sample_information_HPV_active_negative$characteristics_ch1.1)),col=seq(along=levels(factor(sample_information_HPV_active_negative$characteristics_ch1.1))),pch=15,cex=1)
# since "GSM1338957" sample is clustering with HPV active samples, remove it from analysis
#gset55542_HPV_active_negative2 <- gset55542_HPV_active_negative[,-grep("GSM1338957",sampleNames(gset55542_HPV_active_negative))]
gset55542_HPV_active_negative2 <- gset55542_HPV_active_negative[,-20]
ex55542_HPV_active_negative2 <- ex55542_HPV_active_negative[,-20]
sample_information_HPV_active_negative2 <- sample_information_HPV_active_negative[,-20]
HPV_status_active_negative2 <- HPV_status_active_negative[-20]
dist_matrix_55542_HPV_active_negative2 <- dist(t(ex55542_HPV_active_negative2)) # very important to take transpose "t"
mds_55542_HPV_active_negative2 <- cmdscale(dist_matrix_55542_HPV_active_negative2)
plot(mds_55542_HPV_active_negative2[,1],mds_55542_HPV_active_negative2[,2],bg=as.numeric(HPV_status_active_negative2),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: HPV status")
legend("topright",levels(HPV_status_active_negative2),col=seq(along=levels(HPV_status_active_negative2)),pch=15,cex=1)
RunLimma <- function(gset,groups){
design_matrix <- model.matrix(~0+groups)
colnames(design_matrix) <- c("CONTROL","TREATMENT")
# to check order is correct
# head(design_matrix)
## fit linear model for each gene given a series of arrays
fit <- lmFit(gset, design_matrix)
## tell which levels should be compared
colnames(design_matrix)
cont.matrix <- makeContrasts(TREATMENT-CONTROL, levels=design_matrix)
## given a linear model fit to microarray data, compute estimated coefficients & standard errors for given set of contratsts
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2, 0.01)
## to get the no.of genes in each array
nrow_gset <- length(featureNames(gset))
tT <- topTable(fit2, adjust="fdr", sort.by="logFC",number=nrow_gset)
colnames(tT)
tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","logFC","t","B","AveExpr","GENE_SYMBOL","GENE_NAME","ENSEMBL_ID","REFSEQ","GB_ACC","UNIGENE_ID"))
return(tT)
}
# HPVpositive vs HPVnegative
gse55542_HPV_positiveVsnegative <- RunLimma(gset55542_HPV_active_negative2,HPV_status_active_negative2)
# volcano plot
plot(gse55542_HPV_positiveVsnegative$logFC,-log10(gse55542_HPV_positiveVsnegative$adj.P.Val), xlab="Effect size",ylab="- log (base 10) adj.p-values",main="Volcano Plot for HPVpositive vs HPVnegative")
library("biomaRt")
## Warning: package 'biomaRt' was built under R version 3.2.2
## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "externalRefMethod" of class "expressionORfunction";
## definition not updated
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "externalRefMethod" of class "functionORNULL";
## definition not updated
listMarts(host="www.ensembl.org")
## biomart version
## 1 ENSEMBL_MART_ENSEMBL Ensembl Genes 82
## 2 ENSEMBL_MART_SNP Ensembl Variation 82
## 3 ENSEMBL_MART_FUNCGEN Ensembl Regulation 82
## 4 ENSEMBL_MART_VEGA Vega 62
## 5 pride PRIDE (EBI UK)
database <- useMart("ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host="www.ensembl.org")
listDatasets(database)[grep("sapiens",listDatasets(database)$description,),]
## dataset description version
## 32 hsapiens_gene_ensembl Homo sapiens genes (GRCh38.p3) GRCh38.p3
## Filters (one or more) that should be used in the query. So first check what do the probes represent.
#pData(gset55542)[1,c(12:20)]
#pData(gset55542)[1,"hyb_protocol"] # "Agilent-039494 SurePrint G3 Human GE v2 8x60K Microarray (G4858A-039494)"
#head(gse55542_HPV_positiveVsnegative$ID)
filters <- listFilters(database)
#head(filters)
filters[grep("agilent",filters$description,ignore.case=T),]
## name
## 144 with_agilent_cgh_44b
## 145 with_efg_agilent_wholegenome_4x44k_v1
## 146 with_efg_agilent_wholegenome_4x44k_v2
## 147 with_efg_agilent_sureprint_g3_ge_8x60k
## 148 with_efg_agilent_sureprint_g3_ge_8x60k_v2
## 177 agilent_cgh_44b
## 178 efg_agilent_sureprint_g3_ge_8x60k
## 179 efg_agilent_sureprint_g3_ge_8x60k_v2
## 180 efg_agilent_wholegenome_4x44k_v1
## 181 efg_agilent_wholegenome_4x44k_v2
## description
## 144 with Agilent CGH 44b probe ID(s)
## 145 with Efg agilent wholegenome 4x44k v1 ID(s)
## 146 with Efg agilent wholegenome 4x44k v2 ID(s)
## 147 with Efg agilent sureprint g3 ge 8x60k ID(s)
## 148 with Efg agilent sureprint g3 ge 8x60k v2 ID(s)
## 177 Agilent CGH 44b probe ID(s) [e.g. A_14_P131077]
## 178 Agilent Sureprint G3 GE 8x60k probe ID(s) [e.g. A_33_P3356022]
## 179 Agilent Sureprint G3 GE 8x60k v2 probe ID(s) [e.g. A_24_P368544]
## 180 Agilent WholeGenome 4x44k v1 probe ID(s) [e.g. A_32_P196615]
## 181 Agilent WholeGenome 4x44k v2 probe ID(s) [e.g. A_33_P3356022]
filters[179,]
## name
## 179 efg_agilent_sureprint_g3_ge_8x60k_v2
## description
## 179 Agilent Sureprint G3 GE 8x60k v2 probe ID(s) [e.g. A_24_P368544]
## attribites are values that you are interested in to retrieve
attributes <- listAttributes(database)
#head(attributes)
grep("entrez",attributes$description,ignore.case=T)
## [1] 58 59
grep("UNIGENE",attributes$description,ignore.case=T)
## [1] 91
grep("symbol",attributes$description,ignore.case=T)
## [1] 65
grep("agilent",attributes$description,ignore.case=T)
## [1] 100 101 102 103 123
attributes[c(1,58,59,65,91,100:103,123),]
## name
## 1 ensembl_gene_id
## 58 entrezgene
## 59 entrezgene_transcript_name
## 65 hgnc_symbol
## 91 unigene
## 100 efg_agilent_sureprint_g3_ge_8x60k
## 101 efg_agilent_sureprint_g3_ge_8x60k_v2
## 102 efg_agilent_wholegenome_4x44k_v1
## 103 efg_agilent_wholegenome_4x44k_v2
## 123 agilent_cgh_44b
## description
## 1 Ensembl Gene ID
## 58 EntrezGene ID
## 59 EntrezGene transcript name ID
## 65 HGNC symbol
## 91 Unigene ID
## 100 Agilent SurePrint G3 GE 8x60k probe
## 101 Agilent SurePrint G3 GE 8x60k v2 probe
## 102 Agilent WholeGenome 4x44k v1 probe
## 103 Agilent WholeGenome 4x44k v2 probe
## 123 Agilent CGH 44b probe
gene_ids <- getBM(attributes=c("entrezgene","unigene","efg_agilent_sureprint_g3_ge_8x60k_v2"), filters="efg_agilent_sureprint_g3_ge_8x60k_v2", values=gse55542_HPV_positiveVsnegative$ID, mart=database, uniqueRows=T)
gse55542_HPV_positiveVsnegative2 <- merge(gse55542_HPV_positiveVsnegative,gene_ids,by.x="ID",by.y="efg_agilent_sureprint_g3_ge_8x60k_v2",all.x=T)
RemoveRows <- function(MatrixName,ColumnName){ #ColumnName is name of column containing gene id
# remove rows that contain "NA" in any column
complete <- complete.cases(MatrixName)
Matrix_complete <- MatrixName[complete,]
# remove rows that contain "" in column for gene id
nonempty_gene_ids <- Matrix_complete[,ColumnName]!=""
Matrix2 <- Matrix_complete[nonempty_gene_ids,]
return(Matrix2)
}
gse55542_HPV_positiveVsnegative3 <- RemoveRows(gse55542_HPV_positiveVsnegative2,"entrezgene")
DEG_gse55542_HPV_positiveVsnegative <- subset(gse55542_HPV_positiveVsnegative3,adj.P.Val<0.05 & abs(logFC)>0.585,)
#write.table(DEG_gse55542_HPV_positiveVsnegative, file="./rough/DEG_gse55542_HPV_positiveVsnegative.txt", row.names=F, sep="\t")
#save(DEG_gse55542_HPV_positiveVsnegative, file="./rough/DEG_gse55542_HPV_positiveVsnegative.rda")