gene-expression-data-LGG

Read the required files from the directory where you have the information on gene vz. sample

lgg <- read.csv("~/Google Drive/0-official-MD-Anderson/Projects/lgg/expression_data_LGG.csv")

new_expression_group0 <- lgg[, which(colnames(lgg) %in% c("NAME",
                                                        "TCGA.CS.4941.01.RNAseq",
                                                        "TCGA.DU.5852.01.RNAseq",
                                                        "TCGA.DU.7292.01.RNAseq",
                                                        "TCGA.HT.7469.01.RNAseq",
                                                        "TCGA.DU.6402.01.RNAseq",
                                                        "TCGA.DU.6396.01.RNAseq",
                                                        "TCGA.DU.8161.01.RNAseq",
                                                        "TCGA.HT.8564.01.RNAseq",
                                                        "TCGA.IK.7675.01.RNAseq",
                                                        "TCGA.HT.8012.01.RNAseq",
                                                        "TCGA.E1.5303.01.RNAseq",
                                                        "TCGA.S9.A6UA.01.RNAseq",
                                                        "TCGA.DU.7013.01.RNAseq",
                                                        "TCGA.CS.6186.01.RNAseq",
                                                        "TCGA.DU.8166.01.RNAseq",
                                                        "TCGA.DU.7010.01.RNAseq",
                                                        "TCGA.DU.7298.01.RNAseq",
                                                        "TCGA.DU.5854.01.RNAseq",
                                                        "TCGA.DU.A5TP.01.RNAseq",
                                                        "TCGA.DU.A76K.01.RNAseq",
                                                        "TCGA.FG.6689.01.RNAseq",
                                                        "TCGA.S9.A6WL.01.RNAseq",
                                                        "TCGA.FG.7643.01.RNAseq",
                                                        "TCGA.DH.A669.01.RNAseq",
                                                        "TCGA.DH.A669.02.RNAseq",
                                                        "TCGA.DU.5872.01.RNAseq",
                                                        "TCGA.DU.5872.02.RNAseq",
                                                        "TCGA.HT.8011.01.RNAseq",
                                                        "TCGA.HT.7620.01.RNAseq",
                                                        "TCGA.FG.A6J3.01.RNAseq",
                                                        "TCGA.CS.5395.01.RNAseq",
                                                        "TCGA.DU.7304.01.RNAseq",
                                                        "TCGA.DU.7304.02.RNAseq",
                                                        "TCGA.DU.7300.01.RNAseq",
                                                        "TCGA.DU.7008.01.RNAseq",
                                                        "TCGA.DU.A76R.01.RNAseq"))]
new_expression_group1 <- lgg[, which(colnames(lgg) %in% c(                                                       
                                                        "TCGA.E1.5319.01.RNAseq",
                                                        "TCGA.TM.A7CF.01.RNAseq",
                                                        "TCGA.TM.A7CF.02.RNAseq",
                                                        "TCGA.DU.6392.01.RNAseq",
                                                        "TCGA.S9.A7R3.01.RNAseq",
                                                        "TCGA.FG.5965.01.RNAseq",
                                                        "TCGA.FG.5965.02.RNAseq",
                                                        "TCGA.E1.5322.01.RNAseq",
                                                        "TCGA.HT.7854.01.RNAseq",
                                                        "TCGA.DU.6404.01.RNAseq",
                                                        "TCGA.DU.6404.02.RNAseq",
                                                        "TCGA.DB.5277.01.RNAseq",
                                                        "TCGA.CS.4942.01.RNAseq",
                                                        "TCGA.DU.6395.01.RNAseq",
                                                        "TCGA.HT.7610.01.RNAseq",
                                                        "TCGA.E1.5302.01.RNAseq",
                                                        "TCGA.DU.7306.01.RNAseq",
                                                        "TCGA.HT.8013.01.RNAseq",
                                                        "TCGA.TM.A7C3.01.RNAseq",
                                                        "TCGA.S9.A6TS.01.RNAseq",
                                                        "TCGA.DU.7302.01.RNAseq",
                                                        "TCGA.E1.5307.01.RNAseq",
                                                        "TCGA.S9.A6TU.01.RNAseq",
                                                        "TCGA.E1.5305.01.RNAseq",
                                                        "TCGA.DU.6399.01.RNAseq",
                                                        "TCGA.DU.6401.01.RNAseq",
                                                        "TCGA.HT.7480.01.RNAseq",
                                                        "TCGA.DU.6408.01.RNAseq",
                                                        "TCGA.DU.6407.01.RNAseq",
                                                        "TCGA.DU.6407.02.RNAseq",
                                                        "TCGA.E1.5311.01.RNAseq",
                                                        "TCGA.DU.7009.01.RNAseq",
                                                        "TCGA.DU.A7T8.01.RNAseq",
                                                        "TCGA.DU.7014.01.RNAseq",
                                                        "TCGA.DU.5870.01.RNAseq",
                                                        "TCGA.DU.5870.02.RNAseq"))]
patient_list <- cbind(new_expression_group0,new_expression_group1)
names(patient_list)[1] <- "Gene"
rownames(patient_list) = make.names(patient_list$Gene, unique=TRUE)

Now we need to remove the additional column of Gene

patient_list$Gene<-NULL
write.csv(patient_list,file = "patient_list.csv")

Now we read the sample info file where the the samples are divided into two groups - control and experimental group

sampleinfo <- read.csv("~/Google Drive/0-official-MD-Anderson/Projects/lgg/sampleinfo.LGG.csv")

34 columns - group0 32 columns - group1

library(genefilter)

## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'

## 
## Attaching package: 'genefilter'

## The following object is masked from 'package:base':
## 
##     anyNA

LGGmat<-as.matrix(sapply(patient_list, as.numeric))
slotNames(LGGmat)

## NULL

group<-factor(sampleinfo$group)

Now plot the normalized data:

plot ( density ( LGGmat[,1], na.rm=TRUE ), xlim = c(0,16), ylim = c(0,0.35), 
       main = "Normalized", xlab = "Intensity" )
for(i in 2: ncol ( LGGmat ) ) {  
    points ( density ( LGGmat[,i], na.rm = TRUE ), type = "l", col = rainbow(20)[i] ) 
}

Lets try in random one of the gene and do a simple t-test.Let us take for eg. 2500th gene and do a simple t-test

library(rafalib)
mypar(1,2)

gene<-patient_list[2500,]

To use t.test, we need to check the normality assumptions.

qqnorm(gene[group==1],main="Normal Q-Q - group 0")
qqline(gene[group==1])

qqnorm(gene[group==0],main="Normal Q-Q - group 1")
qqline(gene[group==0])

t.test(gene[group==1],gene[group==0])

## 
##  Welch Two Sample t-test
## 
## data:  gene[group == 1] and gene[group == 0]
## t = -0.014559, df = 63.993, p-value = 0.9884
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1909576  0.1881943
## sample estimates:
## mean of x mean of y 
##  6.656769  6.658150

Welch Two Sample t-test gives data: gene[group == 1] and gene[group == 0] t = -1.8657, df = 61.523, p-value = 0.06685 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.76438663 0.02641006 sample estimates: mean of x mean of y 2.401514 2.770503 Here the p value is 0.06685 - this is larger than alpha = 0.05. Hence null hypothesis is true - No difference between two groups for 2th gene in the data. Now do t-test for each row

results1<-rowttests(LGGmat,group)

#Look at the top 10 from each of the two t-tests: 
sort( results1$p.value )[1:10]

##  [1] 1.273034e-06 2.342231e-06 2.498085e-06 2.681133e-06 4.491045e-06
##  [6] 6.809912e-06 7.907597e-06 9.589072e-06 9.915291e-06 1.191924e-05

#to get the names of genes do the following

row.names(LGGmat) <- rownames(patient_list)

AffyID <- rownames( LGGmat )

AffyID[order(results1$p.value)][1:10]

##  [1] "TCF7L2"  "ULK4"    "ACVR2B"  "ABCC3"   "SUFU"    "SULF2"   "SMARCC2"
##  [8] "GBF1"    "CTBP2"   "IGF2BP3"

sort( results1$p.value )[1:10]

##  [1] 1.273034e-06 2.342231e-06 2.498085e-06 2.681133e-06 4.491045e-06
##  [6] 6.809912e-06 7.907597e-06 9.589072e-06 9.915291e-06 1.191924e-05

In addition, it is nice to know if the genes are over or under expressed relative to the control condition.

plot(results1$dm,results1$p.value)

#Vulcano plot (results1$dm is same as log2 fold change)
###################################################
###  volcano
###################################################
plot(results1$dm, -log10(results1$p.value), pch=".", 
     xlab = expression(mean~log[2]~fold~change),
     ylab = expression(-log[10](p)))
###################################################

install.packages("ggplot2")

## 
## The downloaded binary packages are in
##  /var/folders/71/vnhmr1ts6w354s54vd5n1xw80000gn/T//RtmpIgp1o7/downloaded_packages

# Then to construct the volcano plot, we use the following commands:
require(ggplot2)

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.2.3

##Highlight genes that have an absolute fold change > 2 and a p-value < Bonferroni cut-off
no_of_genes = dim(LGGmat)[1]
results1$threshold = as.factor(abs(results1$dm) > 2 & results1$p.value < 0.05/no_of_genes)
dim(results1$threshold)

## NULL

results1$threshold = as.factor(abs(results1$dm) > 2 & results1$p.value < 0.05)
dim(results1$threshold)

## NULL

##Construct the plot object
g = ggplot(data=results1, aes(x=dm, y=-log10(p.value), colour=threshold)) +
    geom_point(alpha=0.4, size=1.75) +
    theme(legend.position = "none") +
    xlim(c(-5, 5)) + ylim(c(0, 7.5)) +
    xlab("log2 fold change") + ylab("-log10 p-value")
g

## Warning: Removed 13 rows containing missing values (geom_point).

g + geom_text(aes(x=results1$dm, y=-log10(results1$p.value),
                  label=AffyID, size=1.2), colour="black")

## Warning: Removed 13 rows containing missing values (geom_point).

## Warning: Removed 13 rows containing missing values (geom_text).

Multiple test - Bonferroni, Holm, Hochberg, SidakSS…

require(multtest)

## Loading required package: multtest

## Loading required package: Biobase

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following object is masked from 'package:stats':
## 
##     xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

res = mt.rawp2adjp(results1$p.value, c("Bonferroni", "Holm", "Hochberg", "SidakSS", "SidakSD","BH", "BY","ABH","TSBH"), alpha = 0.05, na.rm = FALSE)
adjp = data.frame(res$adjp[order(res$index), ])


sum(adjp$rawp<0.05,na.rm=TRUE)  #Ans 2630

## [1] 2630

sum(adjp$BH<0.05,na.rm=TRUE)  #Ans 222

## [1] 222

sum(adjp$BY<0.05,na.rm=TRUE)   #Ans 0

## [1] 0

sum(adjp$Bonferroni<0.05,na.rm=TRUE)  #Ans 4

## [1] 4

sum(adjp$ABH<0.05,na.rm=TRUE)  #Ans 242

## [1] 242

sum(adjp$Holm<0.05,na.rm=TRUE)  #ANS 4

## [1] 4

sum(adjp$Hochberg<0.05,na.rm=TRUE) #Ans 4

## [1] 4

sum(adjp$SidakSS<0.05,na.rm=TRUE)  #Ans 4

## [1] 4

sum(adjp$SidakSD<0.05,na.rm=TRUE)   #Ans 4

## [1] 4

sum(adjp$TSBH_0.05<0.05,na.rm=TRUE)   #Ans 223

## [1] 223

For testing purpose lets use just 100 rows

smallLGGmat<-LGGmat[1:100,] 
# Permutation unadjusted p-values and adjusted p-values for maxT procedure
res1<-mt.maxT(smallLGGmat,group)

## b=100    b=200   b=300   b=400   b=500   b=600   b=700   b=800   b=900   b=1000  
## b=1100   b=1200  b=1300  b=1400  b=1500  b=1600  b=1700  b=1800  b=1900  b=2000  
## b=2100   b=2200  b=2300  b=2400  b=2500  b=2600  b=2700  b=2800  b=2900  b=3000  
## b=3100   b=3200  b=3300  b=3400  b=3500  b=3600  b=3700  b=3800  b=3900  b=4000  
## b=4100   b=4200  b=4300  b=4400  b=4500  b=4600  b=4700  b=4800  b=4900  b=5000  
## b=5100   b=5200  b=5300  b=5400  b=5500  b=5600  b=5700  b=5800  b=5900  b=6000  
## b=6100   b=6200  b=6300  b=6400  b=6500  b=6600  b=6700  b=6800  b=6900  b=7000  
## b=7100   b=7200  b=7300  b=7400  b=7500  b=7600  b=7700  b=7800  b=7900  b=8000  
## b=8100   b=8200  b=8300  b=8400  b=8500  b=8600  b=8700  b=8800  b=8900  b=9000  
## b=9100   b=9200  b=9300  b=9400  b=9500  b=9600  b=9700  b=9800  b=9900  b=10000

rawp<-res1$rawp[order(res1$index)]
teststat<-res1$teststat[order(res1$index)]

# Permutation adjusted p-values for simple multiple testing procedures
procs<-c("Bonferroni","Holm","Hochberg","SidakSS","SidakSD","BH","BY")
res2<-mt.rawp2adjp(rawp,procs)

# Plot results from all multiple testing procedures
allp<-cbind(res2$adjp[order(res2$index),],res1$adjp[order(res1$index)])
dimnames(allp)[[2]][9]<-"maxT"
procs<-dimnames(allp)[[2]]
procs[7:9]<-c("maxT","BH","BY")
allp<-allp[,procs]

cols<-c(1:4,"orange","brown","purple",5:6)
ltypes<-c(3,rep(1,6),rep(2,2))

# Ordered adjusted p-values
mt.plot(allp,teststat,plottype="pvsr",proc=procs,leg=c(80,0.4),lty=ltypes,col=cols,lwd=2)

# Adjusted p-values in original data order
mt.plot(allp,teststat,plottype="pvsi",proc=procs,leg=c(80,0.4),lty=ltypes,col=cols,lwd=2)

# Number of rejected hypotheses vs. level of the test
mt.plot(allp,teststat,plottype="rvsa",proc=procs,leg=c(0.05,100),lty=ltypes,col=cols,lwd=2)

# Adjusted p-values vs. test statistics
mt.plot(allp,teststat,plottype="pvst",logscale=TRUE,proc=procs,leg=c(0,4),pch=ltypes,col=cols)

Histogram, boxplot etc..

hist(results1$p.value, breaks=100);

boxplot(results1$p.value)

sum(abs(results1$dm) > 2) & sum(results1$p.Value < 0.05,na.rm=TRUE)

## [1] FALSE

sum(results1$p.value<0.05,na.rm=TRUE)

## [1] 2630

sum(abs(results1$dm)>2)

## [1] 64

Here we do the calculations of corrected p-values for each list - calculations of FPR and FNR for each list

bon1 <- p.adjust(results1$p.value, method = "bonferroni")
bon1final<-sum(bon1<0.05,na.rm=TRUE)  #4

sum(results1$p.value<0.05,na.rm=TRUE)  #2630

## [1] 2630

length(results1$p.value)

## [1] 12717

#(1)Bonferroni correction to achieve an FWER of 0.05
k = 0.05/length(results1$p.value)
Bonf<-sum(results1$p.value<k,na.rm=TRUE)   #4

#(2)p-adjust estimates of FDR to achieve an FDR of 0.05, and

fdr = p.adjust(results1$p.value,method="fdr")
fdrfinal<-sum(fdr<0.05,na.rm=TRUE)  #Ans 222

#(3)qvalue estimates of FDR to to achieve an FDR of 0.05.
library(qvalue)

## Warning: replacing previous import by 'grid::arrow' when loading 'qvalue'

## Warning: replacing previous import by 'grid::unit' when loading 'qvalue'

table( is.na(results1$p.value))

## 
## FALSE  TRUE 
## 12704    13

newplist <- results1$p.value[!is.na(results1$p.value)]
length(newplist)

## [1] 12704

res = qvalue(newplist)
qvals = res$qvalues
sum(qvals<0.05)  #Ans 648

## [1] 648