Read the required files from the directory where you have the information on gene vz. sample
lgg <- read.csv("~/Google Drive/0-official-MD-Anderson/Projects/lgg/expression_data_LGG.csv")
new_expression_group0 <- lgg[, which(colnames(lgg) %in% c("NAME",
"TCGA.CS.4941.01.RNAseq",
"TCGA.DU.5852.01.RNAseq",
"TCGA.DU.7292.01.RNAseq",
"TCGA.HT.7469.01.RNAseq",
"TCGA.DU.6402.01.RNAseq",
"TCGA.DU.6396.01.RNAseq",
"TCGA.DU.8161.01.RNAseq",
"TCGA.HT.8564.01.RNAseq",
"TCGA.IK.7675.01.RNAseq",
"TCGA.HT.8012.01.RNAseq",
"TCGA.E1.5303.01.RNAseq",
"TCGA.S9.A6UA.01.RNAseq",
"TCGA.DU.7013.01.RNAseq",
"TCGA.CS.6186.01.RNAseq",
"TCGA.DU.8166.01.RNAseq",
"TCGA.DU.7010.01.RNAseq",
"TCGA.DU.7298.01.RNAseq",
"TCGA.DU.5854.01.RNAseq",
"TCGA.DU.A5TP.01.RNAseq",
"TCGA.DU.A76K.01.RNAseq",
"TCGA.FG.6689.01.RNAseq",
"TCGA.S9.A6WL.01.RNAseq",
"TCGA.FG.7643.01.RNAseq",
"TCGA.DH.A669.01.RNAseq",
"TCGA.DH.A669.02.RNAseq",
"TCGA.DU.5872.01.RNAseq",
"TCGA.DU.5872.02.RNAseq",
"TCGA.HT.8011.01.RNAseq",
"TCGA.HT.7620.01.RNAseq",
"TCGA.FG.A6J3.01.RNAseq",
"TCGA.CS.5395.01.RNAseq",
"TCGA.DU.7304.01.RNAseq",
"TCGA.DU.7304.02.RNAseq",
"TCGA.DU.7300.01.RNAseq",
"TCGA.DU.7008.01.RNAseq",
"TCGA.DU.A76R.01.RNAseq"))]
new_expression_group1 <- lgg[, which(colnames(lgg) %in% c(
"TCGA.E1.5319.01.RNAseq",
"TCGA.TM.A7CF.01.RNAseq",
"TCGA.TM.A7CF.02.RNAseq",
"TCGA.DU.6392.01.RNAseq",
"TCGA.S9.A7R3.01.RNAseq",
"TCGA.FG.5965.01.RNAseq",
"TCGA.FG.5965.02.RNAseq",
"TCGA.E1.5322.01.RNAseq",
"TCGA.HT.7854.01.RNAseq",
"TCGA.DU.6404.01.RNAseq",
"TCGA.DU.6404.02.RNAseq",
"TCGA.DB.5277.01.RNAseq",
"TCGA.CS.4942.01.RNAseq",
"TCGA.DU.6395.01.RNAseq",
"TCGA.HT.7610.01.RNAseq",
"TCGA.E1.5302.01.RNAseq",
"TCGA.DU.7306.01.RNAseq",
"TCGA.HT.8013.01.RNAseq",
"TCGA.TM.A7C3.01.RNAseq",
"TCGA.S9.A6TS.01.RNAseq",
"TCGA.DU.7302.01.RNAseq",
"TCGA.E1.5307.01.RNAseq",
"TCGA.S9.A6TU.01.RNAseq",
"TCGA.E1.5305.01.RNAseq",
"TCGA.DU.6399.01.RNAseq",
"TCGA.DU.6401.01.RNAseq",
"TCGA.HT.7480.01.RNAseq",
"TCGA.DU.6408.01.RNAseq",
"TCGA.DU.6407.01.RNAseq",
"TCGA.DU.6407.02.RNAseq",
"TCGA.E1.5311.01.RNAseq",
"TCGA.DU.7009.01.RNAseq",
"TCGA.DU.A7T8.01.RNAseq",
"TCGA.DU.7014.01.RNAseq",
"TCGA.DU.5870.01.RNAseq",
"TCGA.DU.5870.02.RNAseq"))]
patient_list <- cbind(new_expression_group0,new_expression_group1)
names(patient_list)[1] <- "Gene"
rownames(patient_list) = make.names(patient_list$Gene, unique=TRUE)
Now we need to remove the additional column of Gene
patient_list$Gene<-NULL
write.csv(patient_list,file = "patient_list.csv")
Now we read the sample info file where the the samples are divided into two groups - control and experimental group
sampleinfo <- read.csv("~/Google Drive/0-official-MD-Anderson/Projects/lgg/sampleinfo.LGG.csv")
34 columns - group0 32 columns - group1
library(genefilter)
## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'
##
## Attaching package: 'genefilter'
## The following object is masked from 'package:base':
##
## anyNA
LGGmat<-as.matrix(sapply(patient_list, as.numeric))
slotNames(LGGmat)
## NULL
group<-factor(sampleinfo$group)
Now plot the normalized data:
plot ( density ( LGGmat[,1], na.rm=TRUE ), xlim = c(0,16), ylim = c(0,0.35),
main = "Normalized", xlab = "Intensity" )
for(i in 2: ncol ( LGGmat ) ) {
points ( density ( LGGmat[,i], na.rm = TRUE ), type = "l", col = rainbow(20)[i] )
}
Lets try in random one of the gene and do a simple t-test.Let us take for eg. 2500th gene and do a simple t-test
library(rafalib)
mypar(1,2)
gene<-patient_list[2500,]
To use t.test, we need to check the normality assumptions.
qqnorm(gene[group==1],main="Normal Q-Q - group 0")
qqline(gene[group==1])
qqnorm(gene[group==0],main="Normal Q-Q - group 1")
qqline(gene[group==0])
t.test(gene[group==1],gene[group==0])
##
## Welch Two Sample t-test
##
## data: gene[group == 1] and gene[group == 0]
## t = -0.014559, df = 63.993, p-value = 0.9884
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1909576 0.1881943
## sample estimates:
## mean of x mean of y
## 6.656769 6.658150
Welch Two Sample t-test gives data: gene[group == 1] and gene[group == 0] t = -1.8657, df = 61.523, p-value = 0.06685 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.76438663 0.02641006 sample estimates: mean of x mean of y 2.401514 2.770503 Here the p value is 0.06685 - this is larger than alpha = 0.05. Hence null hypothesis is true - No difference between two groups for 2th gene in the data. Now do t-test for each row
results1<-rowttests(LGGmat,group)
#Look at the top 10 from each of the two t-tests:
sort( results1$p.value )[1:10]
## [1] 1.273034e-06 2.342231e-06 2.498085e-06 2.681133e-06 4.491045e-06
## [6] 6.809912e-06 7.907597e-06 9.589072e-06 9.915291e-06 1.191924e-05
#to get the names of genes do the following
row.names(LGGmat) <- rownames(patient_list)
AffyID <- rownames( LGGmat )
AffyID[order(results1$p.value)][1:10]
## [1] "TCF7L2" "ULK4" "ACVR2B" "ABCC3" "SUFU" "SULF2" "SMARCC2"
## [8] "GBF1" "CTBP2" "IGF2BP3"
sort( results1$p.value )[1:10]
## [1] 1.273034e-06 2.342231e-06 2.498085e-06 2.681133e-06 4.491045e-06
## [6] 6.809912e-06 7.907597e-06 9.589072e-06 9.915291e-06 1.191924e-05
In addition, it is nice to know if the genes are over or under expressed relative to the control condition.
plot(results1$dm,results1$p.value)
#Vulcano plot (results1$dm is same as log2 fold change)
###################################################
### volcano
###################################################
plot(results1$dm, -log10(results1$p.value), pch=".",
xlab = expression(mean~log[2]~fold~change),
ylab = expression(-log[10](p)))
###################################################
install.packages("ggplot2")
##
## The downloaded binary packages are in
## /var/folders/71/vnhmr1ts6w354s54vd5n1xw80000gn/T//RtmpIgp1o7/downloaded_packages
# Then to construct the volcano plot, we use the following commands:
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
##Highlight genes that have an absolute fold change > 2 and a p-value < Bonferroni cut-off
no_of_genes = dim(LGGmat)[1]
results1$threshold = as.factor(abs(results1$dm) > 2 & results1$p.value < 0.05/no_of_genes)
dim(results1$threshold)
## NULL
results1$threshold = as.factor(abs(results1$dm) > 2 & results1$p.value < 0.05)
dim(results1$threshold)
## NULL
##Construct the plot object
g = ggplot(data=results1, aes(x=dm, y=-log10(p.value), colour=threshold)) +
geom_point(alpha=0.4, size=1.75) +
theme(legend.position = "none") +
xlim(c(-5, 5)) + ylim(c(0, 7.5)) +
xlab("log2 fold change") + ylab("-log10 p-value")
g
## Warning: Removed 13 rows containing missing values (geom_point).
g + geom_text(aes(x=results1$dm, y=-log10(results1$p.value),
label=AffyID, size=1.2), colour="black")
## Warning: Removed 13 rows containing missing values (geom_point).
## Warning: Removed 13 rows containing missing values (geom_text).
Multiple test - Bonferroni, Holm, Hochberg, SidakSS…
require(multtest)
## Loading required package: multtest
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:stats':
##
## xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist, unsplit
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
res = mt.rawp2adjp(results1$p.value, c("Bonferroni", "Holm", "Hochberg", "SidakSS", "SidakSD","BH", "BY","ABH","TSBH"), alpha = 0.05, na.rm = FALSE)
adjp = data.frame(res$adjp[order(res$index), ])
sum(adjp$rawp<0.05,na.rm=TRUE) #Ans 2630
## [1] 2630
sum(adjp$BH<0.05,na.rm=TRUE) #Ans 222
## [1] 222
sum(adjp$BY<0.05,na.rm=TRUE) #Ans 0
## [1] 0
sum(adjp$Bonferroni<0.05,na.rm=TRUE) #Ans 4
## [1] 4
sum(adjp$ABH<0.05,na.rm=TRUE) #Ans 242
## [1] 242
sum(adjp$Holm<0.05,na.rm=TRUE) #ANS 4
## [1] 4
sum(adjp$Hochberg<0.05,na.rm=TRUE) #Ans 4
## [1] 4
sum(adjp$SidakSS<0.05,na.rm=TRUE) #Ans 4
## [1] 4
sum(adjp$SidakSD<0.05,na.rm=TRUE) #Ans 4
## [1] 4
sum(adjp$TSBH_0.05<0.05,na.rm=TRUE) #Ans 223
## [1] 223
For testing purpose lets use just 100 rows
smallLGGmat<-LGGmat[1:100,]
# Permutation unadjusted p-values and adjusted p-values for maxT procedure
res1<-mt.maxT(smallLGGmat,group)
## b=100 b=200 b=300 b=400 b=500 b=600 b=700 b=800 b=900 b=1000
## b=1100 b=1200 b=1300 b=1400 b=1500 b=1600 b=1700 b=1800 b=1900 b=2000
## b=2100 b=2200 b=2300 b=2400 b=2500 b=2600 b=2700 b=2800 b=2900 b=3000
## b=3100 b=3200 b=3300 b=3400 b=3500 b=3600 b=3700 b=3800 b=3900 b=4000
## b=4100 b=4200 b=4300 b=4400 b=4500 b=4600 b=4700 b=4800 b=4900 b=5000
## b=5100 b=5200 b=5300 b=5400 b=5500 b=5600 b=5700 b=5800 b=5900 b=6000
## b=6100 b=6200 b=6300 b=6400 b=6500 b=6600 b=6700 b=6800 b=6900 b=7000
## b=7100 b=7200 b=7300 b=7400 b=7500 b=7600 b=7700 b=7800 b=7900 b=8000
## b=8100 b=8200 b=8300 b=8400 b=8500 b=8600 b=8700 b=8800 b=8900 b=9000
## b=9100 b=9200 b=9300 b=9400 b=9500 b=9600 b=9700 b=9800 b=9900 b=10000
rawp<-res1$rawp[order(res1$index)]
teststat<-res1$teststat[order(res1$index)]
# Permutation adjusted p-values for simple multiple testing procedures
procs<-c("Bonferroni","Holm","Hochberg","SidakSS","SidakSD","BH","BY")
res2<-mt.rawp2adjp(rawp,procs)
# Plot results from all multiple testing procedures
allp<-cbind(res2$adjp[order(res2$index),],res1$adjp[order(res1$index)])
dimnames(allp)[[2]][9]<-"maxT"
procs<-dimnames(allp)[[2]]
procs[7:9]<-c("maxT","BH","BY")
allp<-allp[,procs]
cols<-c(1:4,"orange","brown","purple",5:6)
ltypes<-c(3,rep(1,6),rep(2,2))
# Ordered adjusted p-values
mt.plot(allp,teststat,plottype="pvsr",proc=procs,leg=c(80,0.4),lty=ltypes,col=cols,lwd=2)
# Adjusted p-values in original data order
mt.plot(allp,teststat,plottype="pvsi",proc=procs,leg=c(80,0.4),lty=ltypes,col=cols,lwd=2)
# Number of rejected hypotheses vs. level of the test
mt.plot(allp,teststat,plottype="rvsa",proc=procs,leg=c(0.05,100),lty=ltypes,col=cols,lwd=2)
# Adjusted p-values vs. test statistics
mt.plot(allp,teststat,plottype="pvst",logscale=TRUE,proc=procs,leg=c(0,4),pch=ltypes,col=cols)
Histogram, boxplot etc..
hist(results1$p.value, breaks=100);
boxplot(results1$p.value)
sum(abs(results1$dm) > 2) & sum(results1$p.Value < 0.05,na.rm=TRUE)
## [1] FALSE
sum(results1$p.value<0.05,na.rm=TRUE)
## [1] 2630
sum(abs(results1$dm)>2)
## [1] 64
Here we do the calculations of corrected p-values for each list - calculations of FPR and FNR for each list
bon1 <- p.adjust(results1$p.value, method = "bonferroni")
bon1final<-sum(bon1<0.05,na.rm=TRUE) #4
sum(results1$p.value<0.05,na.rm=TRUE) #2630
## [1] 2630
length(results1$p.value)
## [1] 12717
#(1)Bonferroni correction to achieve an FWER of 0.05
k = 0.05/length(results1$p.value)
Bonf<-sum(results1$p.value<k,na.rm=TRUE) #4
#(2)p-adjust estimates of FDR to achieve an FDR of 0.05, and
fdr = p.adjust(results1$p.value,method="fdr")
fdrfinal<-sum(fdr<0.05,na.rm=TRUE) #Ans 222
#(3)qvalue estimates of FDR to to achieve an FDR of 0.05.
library(qvalue)
## Warning: replacing previous import by 'grid::arrow' when loading 'qvalue'
## Warning: replacing previous import by 'grid::unit' when loading 'qvalue'
table( is.na(results1$p.value))
##
## FALSE TRUE
## 12704 13
newplist <- results1$p.value[!is.na(results1$p.value)]
length(newplist)
## [1] 12704
res = qvalue(newplist)
qvals = res$qvalues
sum(qvals<0.05) #Ans 648
## [1] 648