HPV positive vs HPV negative analysis for Head and Neck cancer dataset - GSE65858
Load Required Packages
# if a new package from bioconductor has to be installed. write following two commands
# source("https://bioconductor.org/biocLite.R")
# biocLite("package name")
library(GEOquery)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist, unsplit
##
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Setting options('download.file.method.GEOquery'='auto')
library(limma)
##
## Attaching package: 'limma'
##
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
#install.packages("rafalib")
library("rafalib")
Download Input Data
gset65858 <- getGEO("GSE65858", GSEMatrix =TRUE)
## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE65nnn/GSE65858/matrix/
## Found 1 file(s)
## GSE65858_series_matrix.txt.gz
## File stored at:
## /var/folders/nt/jmcs3y6d4d3ghsv7j3yzpfrh0000gn/T//Rtmp5Lgkum/GPL10558.soft
#setwd("/Users/shruti/Dropbox/SHRUTIM/Microarray/Microarray_R_Scripts/HNSCC/HPV/GSE65858/")
#save(gset65858,file="/Users/shruti/Dropbox/SHRUTIM/Microarray/Microarray_R_Scripts/HNSCC/HPV/GSE65858/final/gset65858.rda")
#load("/Users/shruti/Dropbox/SHRUTIM/Microarray/Microarray_R_Scripts/HNSCC/HPV/GSE65858/final/gset65858.rda")
Data Munging
GPLid <- levels((gset65858[[1]])$platform_id)
if (length(gset65858) > 1)
{
idx <- grep(GPLid, attr(gset65858, "names"))
} else
{
idx <- 1
}
gset65858 <- gset65858[[idx]]
#colnames(pData(gset65858))
#pData(gset65858)[1:2,]
#pData(gset65858)[1:2,c(11:18,21:24,62)]
#apply(pData(gset65858)[,c(11,13:18,21:24,62)],2,table)
# log2 transform
ex65858 <- exprs(gset65858)
qx <- as.numeric(quantile(ex65858, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0) ||
(qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex65858[which(ex65858 <= 0)] <- NaN
exprs(gset65858) <- log2(ex65858) }
hpv_status <- pData(gset65858)$characteristics_ch1.14
hpv_status <- sub(pattern="hpv16_dna_rna: ", replacement="", x=hpv_status,ignore.case=T)
hpv_status_factors <- factor(hpv_status)
Exploratory Data Analysis for all samples
distance_matrix_65858 <- dist(t(ex65858)) # very important to take transpose "t"
mds_65858 <- cmdscale(distance_matrix_65858)
mypar(1,1)
plot(mds_65858[,1],mds_65858[,2],bg=as.numeric(hpv_status_factors),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: HPV status")
legend("bottomleft",levels(hpv_status_factors),col=seq(along=levels(hpv_status_factors)),pch=15,cex=1)
HPV Active VS HPV Negative
table( pData(gset65858)$characteristics_ch1.14)
##
## hpv16_dna_rna: DNA- hpv16_dna_rna: DNA+RNA- hpv16_dna_rna: DNA+RNA+
## 196 19 35
## hpv16_dna_rna: NA
## 20
hpv_positive <- which(pData(gset65858)$characteristics_ch1.14=="hpv16_dna_rna: DNA+RNA+")
hpv_negative <- which(pData(gset65858)$characteristics_ch1.14=="hpv16_dna_rna: DNA-")
gset65858_hpv <- gset65858[,c(hpv_positive,hpv_negative)]
ex65858_hpv <- exprs(gset65858_hpv)
Exploratory Data Analysis for HPV Active VS HPV Negative
hpv_status <- pData(gset65858_hpv)$characteristics_ch1.14
table(hpv_status)
## hpv_status
## hpv16_dna_rna: DNA- hpv16_dna_rna: DNA+RNA- hpv16_dna_rna: DNA+RNA+
## 196 0 35
## hpv16_dna_rna: NA
## 0
hpv_status <- sub(pattern="hpv16_dna_rna: ", replacement="", x=hpv_status,ignore.case=T)
hpv_status_factors <- factor(hpv_status)
tissues <- pData(gset65858_hpv)$characteristics_ch1.7
tissues <- sub(pattern="tumor_site: ", replacement="", x=tissues,ignore.case=T)
tissues <- factor(tissues)
sex <- pData(gset65858_hpv)$characteristics_ch1.1
sex <- sub(pattern="gender: ", replacement="", x=sex,ignore.case=T)
sex <- factor(sex)
distance_matrix_65858_hpv <- dist(t(ex65858_hpv)) # very important to take transpose "t"
mds_65858_hpv <- cmdscale(distance_matrix_65858_hpv)
plot(mds_65858_hpv[,1],mds_65858_hpv[,2],bg=as.numeric(hpv_status_factors),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: HPV status")
legend("bottomleft",levels(hpv_status_factors),col=seq(along=levels(hpv_status_factors)),pch=15,cex=0.8)
table(hpv_status_factors)
## hpv_status_factors
## DNA- DNA+RNA+
## 196 35
plot(mds_65858_hpv[,1],mds_65858_hpv[,2],bg=as.numeric(tissues),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Anatomical Location")
legend("bottomleft",levels(tissues),col=seq(along=levels(tissues)),pch=15,cex=0.5)
table(tissues)
## tissues
## Cavum Oris Hypopharynx Larynx NA Oropharynx
## 75 31 46 4 75
plot(mds_65858_hpv[,1],mds_65858_hpv[,2],bg=as.numeric(sex),pch=21,xlab="First dimension",ylab="Second dimension",cex=2,main="MultiDimension Scaling PLot: Sex Status")
legend("bottomleft",levels(sex),col=seq(along=levels(sex)),pch=15,cex=1)
table(sex)
## sex
## F M
## 38 193
hierarchial_cluster_65858 <- hclust(distance_matrix_65858_hpv)
plot(hierarchial_cluster_65858,cex=0.5,lablel=hpv_status_factors)
## Warning in graphics:::plotHclust(n1, merge, height, order(x$order), hang, :
## "lablel" is not a graphical parameter
## Warning in graphics:::plotHclust(n1, merge, height, order(x$order), hang, :
## "lablel" is not a graphical parameter
## Warning in axis(2, at = pretty(range(height)), ...): "lablel" is not a
## graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "lablel" is not a graphical parameter
plot(hierarchial_cluster_65858,cex=0.5,lablel=tissues)
## Warning in graphics:::plotHclust(n1, merge, height, order(x$order), hang, :
## "lablel" is not a graphical parameter
## Warning in graphics:::plotHclust(n1, merge, height, order(x$order), hang, :
## "lablel" is not a graphical parameter
## Warning in axis(2, at = pretty(range(height)), ...): "lablel" is not a
## graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "lablel" is not a graphical parameter
plot(hierarchial_cluster_65858,cex=0.5,lablel=sex)
## Warning in graphics:::plotHclust(n1, merge, height, order(x$order), hang, :
## "lablel" is not a graphical parameter
## Warning in graphics:::plotHclust(n1, merge, height, order(x$order), hang, :
## "lablel" is not a graphical parameter
## Warning in axis(2, at = pretty(range(height)), ...): "lablel" is not a
## graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "lablel" is not a graphical parameter
Group <- factor(hpv_status,levels=c("DNA-","DNA+RNA+")) ## order of levels is important
design65858 <- model.matrix(~0+Group)
colnames(design65858) <- c("HPV_negative","HPV_positive")
# to check order is correct
#head(design65858)
#head(pData(gset65858_hpv)$characteristics_ch1.14)
## fit linear model for each gene given a series of arrays
fit <- lmFit(gset65858_hpv, design65858)
## tell which levels should be compared
colnames(design65858)
## [1] "HPV_negative" "HPV_positive"
cont.matrix <- makeContrasts(HPV_positive-HPV_negative, levels=design65858)
## given a linear model fit to microarray data, compute estimated coefficients & standard errors for given set of contratsts
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2, 0.01)
## to get the no.of genes in each array
nrow65858 <- length(featureNames(gset65858_hpv))
tT <- topTable(fit2, adjust="fdr", sort.by="logFC",number=nrow65858)
colnames(tT)
## [1] "ID" "Species"
## [3] "Source" "Search_Key"
## [5] "Transcript" "ILMN_Gene"
## [7] "Source_Reference_ID" "RefSeq_ID"
## [9] "Unigene_ID" "Entrez_Gene_ID"
## [11] "GI" "Accession"
## [13] "Symbol" "Protein_Product"
## [15] "Probe_Id" "Array_Address_Id"
## [17] "Probe_Type" "Probe_Start"
## [19] "SEQUENCE" "Chromosome"
## [21] "Probe_Chr_Orientation" "Probe_Coordinates"
## [23] "Cytoband" "Definition"
## [25] "Ontology_Component" "Ontology_Process"
## [27] "Ontology_Function" "Synonyms"
## [29] "Obsolete_Probe_Id" "GB_ACC"
## [31] "logFC" "AveExpr"
## [33] "t" "P.Value"
## [35] "adj.P.Val" "B"
tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","logFC","t","B","AveExpr","Entrez_Gene_ID","Symbol","Definition"))
RemoveRows <- function(MatrixName,ColumnName){ #ColumnName is name of column containing gene id
# remove rows that contain "NA" in any column
complete <- complete.cases(MatrixName)
Matrix_complete <- MatrixName[complete,]
# remove rows that contain "" in column for gene id
nonempty_gene_ids <- Matrix_complete[,ColumnName]!=""
Matrix2 <- Matrix_complete[nonempty_gene_ids,]
return(Matrix2)
}
tT2 <- RemoveRows(tT,"Entrez_Gene_ID")
# select genes with FRD<0.05 and more than 1.5 fold change
Gse65858DEGFinal <- subset(tT2,adj.P.Val<0.05 & abs(logFC)>0.585,)
# volcano plot
plot(tT2$logFC,-log10(tT2$adj.P.Val), xlab="Effect size",ylab="- log (base 10) adj.p-values",main="Volcano Plot for HPV Active vs HPV negative")