# source('http://bioconductor.org/biocLite.R') biocLite('limma')
# biocLite('edgeR') biocLite('tweeDEseqCountData')
# biocLite('preprocessCore') biocLite('DESeq')
library(limma)
library(edgeR)
library(tweeDEseqCountData)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, as.data.frame, cbind, colnames, duplicated,
## eval, Filter, Find, get, intersect, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rep.int, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unlist
##
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library(DESeq)
## Loading required package: locfit
## locfit 1.5-9.1 2013-03-22
## Loading required package: lattice
##
## Attaching package: 'DESeq'
##
## The following object is masked from 'package:limma':
##
## plotMA
# load data ####
data(pickrell1)
counts = exprs(pickrell1.eset)
dim(counts)
## [1] 38415 69
gender = pickrell1.eset$gender
table(gender)
## gender
## female male
## 40 29
barplot(colSums(counts) * 1e-06, names = 1:69, ylab = "Library size (millions)")
# filter by coverage: at least 20 samples have at least 20 reads
counts = counts[apply(counts >= 20, 1, sum) >= 20, ]
dim(counts)
## [1] 16618 69
# normalization ##### quantile by hand
mn = apply(counts, 2, sort)
mn = apply(mn, 1, mean)
counts.qnorm = apply(counts, 2, function(x) {
mn[rank(x, ties.method = "random")]
})
# actually there is a function: library(preprocessCore) counts.qnorm =
# normalize.quantiles(counts)
par(mfrow = c(2, 3), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1))
col = rainbow(ncol(counts))
plot(density(log(counts[, 1]), bw = 0.2), ylim = c(0, 0.3), col = col[1], "Row data")
for (i in 2:ncol(counts)) lines(density(log(counts[, i]), bw = 0.2), col = col[i])
plot(density(log(counts.qnorm[, 1]), bw = 0.2), ylim = c(0, 0.3), col = col[1],
"Quantile normalization")
for (i in 2:ncol(counts.qnorm)) lines(density(log(counts.qnorm[, i]), bw = 0.2),
col = col[i])
# norm to lib size
counts.ls = sweep(counts, 2, apply(counts, 2, sum), "/")
plot(density(log(counts.ls[, 1])), ylim = c(0, 0.3), col = col[1], "Lib size normalization")
for (i in 2:ncol(counts.ls)) lines(density(log(counts.ls[, i]), bw = 0.2), col = col[i])
# use edgeR fot normalization upperquartile
lib.size = apply(counts, 2, sum)
edger.uq = lib.size * calcNormFactors(counts, "upperquartile")
counts.uq = sweep(counts, 2, edger.uq, "/")
plot(density(log(counts.uq[, 1])), ylim = c(0, 0.3), col = col[1], "Upperquartile normalization")
for (i in 2:ncol(counts.uq)) lines(density(log(counts.uq[, i]), bw = 0.2), col = col[i])
# legend('topright',col=col,lwd=1,legend=1:length(lib.size),ncol=3,cex=0.8)
# RLE
edger.rle = lib.size * calcNormFactors(counts, "RLE")
counts.rle = sweep(counts, 2, edger.rle, "/")
plot(density(log(counts.rle[, 1])), ylim = c(0, 0.3), col = col[1], "RLE normalization")
for (i in 2:ncol(counts.rle)) lines(density(log(counts.rle[, i]), bw = 0.2),
col = col[i])
# TMM
edger.tmm = lib.size * calcNormFactors(counts, "TMM")
counts.tmm = sweep(counts, 2, edger.tmm, "/")
plot(density(log(counts.tmm[, 1])), ylim = c(0, 0.3), col = col[1], "TMM normalization")
for (i in 2:ncol(counts.tmm)) lines(density(log(counts.tmm[, i]), bw = 0.2),
col = col[i])
# edgeR normalizations by hand upperquartile
uq = apply(counts, 2, function(x) {
x = sort(x)
sum(x[(length(x) * 0.75)])
})
plot(uq, edger.uq)
cor(uq, edger.uq)
## [1] 1
# RLE
mc = apply(counts, 1, prod)^(1/ncol(counts))
mc = apply(counts, 1, function(x) {
x = x[x != 0]
prod(x)^(1/length(x))
})
nf = sweep(counts, 1, mc, "/")
rle = apply(nf, 2, median, na.rm = T)
plot(rle, edger.rle)
cor(rle, edger.rle)
## [1] 1
# TMM is a bit too hard...
# make figure for presentation #####
cnts = read.table("/home/mazin/teaching/NGS2014/r.examples/mg.cnts")
cnts[1:3, ]
## gene.chr_id gene.start gene.stop gene.strand
## GCW_00005 gi|564749496|gb|CP006916.1| 1892 2680 1
## GCW_00010 gi|564749496|gb|CP006916.1| 3161 4546 1
## GCW_00015 gi|564749496|gb|CP006916.1| 4779 4877 -1
## cnts.lcE4 cnts.scE4 rpkm.lcE4 rpkm.scE4 exp.lcE4 exp.scE4
## GCW_00005 51 0 38.43 0.0 5.265 -5.087
## GCW_00010 67 3 29.48 6.8 4.883 2.772
## GCW_00015 0 0 0.00 0.0 -5.087 -5.087
c = cnts[, 5] + 1
t = cnts[, 6] + 1
par(mfrow = c(2, 3), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1), lwd = 3, cex = 1.1)
bw = 0.4
xlim = log(range(c, t))
ylim = c(0, 0.27)
xlab = ""
libsize = c(sum(c), sum(t))
plot(density(log(c), from = xlim[1], to = xlim[2], bw = bw), ylim = ylim, col = "darkgray",
main = "Row data", xlab = xlab)
lines(density(log(t), from = xlim[1], to = xlim[2], bw = bw), col = "red")
xlim = c(-16, 0)
plot(density(log(c/libsize[1]), bw = bw), xlim = xlim, ylim = ylim, col = "darkgray",
main = "Lib size normalization", xlab = xlab)
lines(density(log(t/libsize[2]), bw = bw), col = "red")
uq = calcNormFactors(cbind(c, t), method = "upperquartile") * libsize
plot(density(log(c/uq[1]), bw = bw), xlim = xlim, ylim = ylim, col = "darkgray",
main = "Upperquartile normalization", xlab = xlab)
lines(density(log(t/uq[2]), bw = bw), col = "red")
rle = calcNormFactors(cbind(c, t), method = "RLE") * libsize
plot(density(log(c/rle[1]), bw = bw), xlim = xlim, ylim = ylim, col = "darkgray",
main = "RLE normalization", xlab = xlab)
lines(density(log(t/rle[2]), bw = bw), col = "red")
tmm = calcNormFactors(cbind(c, t), method = "TMM") * libsize
plot(density(log(c/tmm[1]), bw = bw), xlim = xlim, ylim = ylim, col = "darkgray",
main = "TMM normalization", xlab = xlab)
lines(density(log(t/tmm[2]), bw = bw), col = "red")
library(preprocessCore)
qn = normalize.quantiles(cbind(c, t))
plot(density(log(qn[, 1]), bw = bw), ylim = ylim, col = "darkgray", main = "Quantile normalization",
xlab = xlab)
lines(density(log(qn[, 1]), bw = bw), col = "red")
par(mfrow = c(1, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1), lwd = 2, cex = 1)
dc = density(log(c/uq[1]), bw = bw, from = -16, to = 0)
dt = density(log(t/uq[2]), bw = bw, from = -16, to = 0)
plot(dc, ylim = ylim, col = "blue", main = "Upperquartile normalization", xlab = xlab)
lines(dt, col = "red")
q = sort(log(t/uq[2]))[length(t) * 0.75]
polygon(c(q, dc$x[dc$x >= q], q), c(0, dc$y[dc$x >= q], 0), col = "#0000FF60",
border = NA)
polygon(c(q, dt$x[dt$x >= q], q), c(0, dt$y[dt$x >= q], 0), col = "#FF000060",
border = NA)
# look at differential expression #### use edgeR
par(mfrow = c(1, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1), lwd = 1, cex = 1)
edger = DGEList(counts)
edger = calcNormFactors(edger, method = "RLE")
design = model.matrix(~gender)
edger = estimateGLMCommonDisp(edger, design)
edger = estimateGLMTrendedDisp(edger, design)
## Loading required package: splines
edger = estimateGLMTagwiseDisp(edger, design)
plotBCV(edger)
strict.disp = pmax(edger$tagwise.dispersion, edger$trended.dispersion, edger$common.dispersion)
glm = glmFit(edger, design)
glm.st = glmFit(edger, design, dispersion = strict.disp)
pv = glmLRT(glm)
pv.st = glmLRT(glm.st)
sgn.genes = data.frame(edger = p.adjust(pv.st$table$PValue, method = "BH") <
0.05)
table(p.adjust(pv$table$PValue, method = "BH") < 0.05)
##
## FALSE TRUE
## 16552 66
table(sgn.genes$edger)
##
## FALSE TRUE
## 16566 52
plotSmear(pv.st, de.tags = rownames(pv.st$table)[sgn.genes$edger], cex = 1)
# use linear models through limma let's use RLE normalization
# prepare data
v = voom(edger, design, plot = TRUE)
# fit model
fit = lmFit(v, design)
fit = eBayes(fit)
options(digits = 3)
topTable(fit, coef = 2, n = 16, sort = "p")
## ID logFC AveExpr t P.Value adj.P.Val B
## 10235 ENSG00000229807 -9.814 3.7595 -34.3 3.89e-46 6.46e-42 69.8
## 5164 ENSG00000099749 4.252 0.2657 27.2 2.00e-39 1.66e-35 64.8
## 13006 ENSG00000157828 3.285 3.2592 26.6 8.27e-39 4.58e-35 73.0
## 16431 ENSG00000233864 4.897 -0.6027 25.9 5.32e-38 2.21e-34 62.3
## 7812 ENSG00000131002 5.440 -0.2199 21.9 1.96e-33 6.50e-30 56.1
## 9547 ENSG00000198692 2.397 2.6317 20.7 6.19e-32 1.71e-28 59.2
## 15647 ENSG00000213318 4.306 2.2165 19.4 3.26e-30 7.73e-27 54.3
## 4721 ENSG00000165246 5.331 -0.5406 19.0 1.21e-29 2.51e-26 49.7
## 12123 ENSG00000129824 2.784 4.6629 17.6 1.23e-27 2.26e-24 51.4
## 13958 ENSG00000183878 1.877 2.6941 16.7 2.09e-26 3.48e-23 48.1
## 8854 ENSG00000012817 1.472 4.6557 14.8 1.80e-23 2.71e-20 42.5
## 7432 ENSG00000146938 4.470 -0.8290 14.5 6.61e-23 9.16e-20 37.4
## 15025 ENSG00000243209 2.525 -0.0668 14.0 3.87e-22 4.95e-19 35.8
## 2478 ENSG00000067048 1.672 5.2588 13.5 2.27e-21 2.69e-18 37.9
## 6612 ENSG00000006757 -0.987 2.4851 -10.3 7.47e-16 8.27e-13 25.4
## 9959 ENSG00000232928 1.435 3.2017 10.3 8.74e-16 9.08e-13 25.4
limma = topTable(fit, adjust.method = "BH", number = Inf, coef = 2, sort.by = "none")
sgn.genes$limma = limma$adj.P.Val < 0.05
table(sgn.genes$limma)
##
## FALSE TRUE
## 16558 60
plot(limma$P.Value, pv.st$table$PValue, log = "xy", xlab = "limma p-value",
ylab = "edgeR p-value", pch = 19, col = sgn.genes$limma + sgn.genes$edger +
1)
abline(v = max(limma$P.Value[sgn.genes$limma]), col = "red")
abline(h = max(pv.st$table$PValue[sgn.genes$edger]), col = "red")
table(sgn.genes)
## limma
## edger FALSE TRUE
## FALSE 16552 14
## TRUE 6 46
# use DESeq
deseq = newCountDataSet(counts, gender)
deseq = estimateSizeFactors(deseq)
# DESeq uses RLE
plot(sizeFactors(deseq), edger.rle, xlab = "DESeq size factors", ylab = "edgeR RLE sizes")
deseq = estimateDispersions(deseq)
plotDispEsts(deseq)
plot(fitInfo(deseq)$perGeneDispEsts, edger$tagwise.dispersion, log = "xy", pch = 16,
col = "#00000030", xlab = "DESseq disp", ylab = "edgeR disp")
deseq.glm0 = fitNbinomGLMs(deseq, count ~ 1)
## ......
## Warning: glm.fit: algorithm did not converge
## ..........
deseq.glm = fitNbinomGLMs(deseq, count ~ gender)
## .....
## Warning: glm.fit: algorithm did not converge
## .
## Warning: glm.fit: algorithm did not converge
## ..........
## Warning: glm.fit: algorithm did not converge
deseq.pv = nbinomGLMTest(deseq.glm, deseq.glm0)
deseq.qv = p.adjust(deseq.pv, method = "BH")
sgn.genes$deseq = deseq.qv < 0.05
table(sgn.genes)
## , , deseq = FALSE
##
## limma
## edger FALSE TRUE
## FALSE 16552 14
## TRUE 6 1
##
## , , deseq = TRUE
##
## limma
## edger FALSE TRUE
## FALSE 0 0
## TRUE 0 45
vennDiagram(vennCounts(sgn.genes))
# different Venn diagrams
x = matrix(runif(200) > 0.3, ncol = 4)
vennDiagram(vennCounts(x)) #limma
library(venneuler)
## Loading required package: rJava
plot(venneuler(x))
library(Vennerable)
## Loading required package: graph
## Loading required package: RBGL
## Loading required package: grid
## Loading required package: RColorBrewer
## Loading required package: reshape
## Loading required package: plyr
##
## Attaching package: 'plyr'
##
## The following object is masked from 'package:graph':
##
## join
##
##
## Attaching package: 'reshape'
##
## The following object is masked from 'package:plyr':
##
## rename, round_any
##
## Loading required package: gtools
## Loading required package: xtable
plot(Venn(apply(x[, 1:3], 2, which)), doWeights = F)
plot(Venn(apply(x[, 1:3], 2, which)), doWeights = T)
plot(Venn(apply(x, 2, which)), doWeights = F)
plot(Venn(apply(x, 2, which)), doWeights = T)
# GOstat source('http://bioconductor.org/biocLite.R') biocLite('GO.db')
# biocLite('hgu95av2.db') biocLite('goseq')
library(GO.db)
## Loading required package: AnnotationDbi
## Loading required package: DBI
ls("package:GO.db")
## [1] "GO" "GOBPANCESTOR" "GOBPCHILDREN" "GOBPOFFSPRING"
## [5] "GOBPPARENTS" "GOCCANCESTOR" "GOCCCHILDREN" "GOCCOFFSPRING"
## [9] "GOCCPARENTS" "GO.db" "GO_dbconn" "GO_dbfile"
## [13] "GO_dbInfo" "GO_dbschema" "GOMAPCOUNTS" "GOMFANCESTOR"
## [17] "GOMFCHILDREN" "GOMFOFFSPRING" "GOMFPARENTS" "GOOBSOLETE"
## [21] "GOSYNONYM" "GOTERM"
names(as.list(GOTERM[1:10]))
## [1] "GO:0000001" "GO:0000002" "GO:0000003" "GO:0000006" "GO:0000007"
## [6] "GO:0000009" "GO:0000010" "GO:0000011" "GO:0000012" "GO:0000014"
GOTERM[["GO:0000006"]]
## GOID: GO:0000006
## Term: high affinity zinc uptake transmembrane transporter activity
## Ontology: MF
## Definition: Catalysis of the transfer of a solute or solutes from
## one side of a membrane to the other according to the reaction:
## Zn2+(out) = Zn2+(in), probably powered by proton motive force.
## In high affinity transport the transporter is able to bind the
## solute even if it is only present at very low concentrations.
library(goseq)
## Loading required package: BiasedUrn
## Loading required package: geneLenDataBase
sg = supportedGenomes()
sg[sg$species == "Human", ]
## db species date name
## 1 hg38 Human Dec. 2013 Genome Reference Consortium GRCh38
## 2 hg19 Human Feb. 2009 Genome Reference Consortium GRCh37
## 3 hg18 Human Mar. 2006 NCBI Build 36.1
## 4 hg17 Human May 2004 NCBI Build 35
## 5 hg16 Human Jul. 2003 NCBI Build 34
## AvailableGeneIDs
## 1
## 2 ccdsGene,ensGene,exoniphy,geneSymbol,knownGene,nscanGene,refGene,xenoRefGene
## 3 acembly,acescan,ccdsGene,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,knownGeneOld3,refGene,sgpGene,sibGene,xenoRefGene
## 4 acembly,acescan,ccdsGene,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,refGene,sgpGene,vegaGene,vegaPseudoGene,xenoRefGene
## 5 acembly,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,refGene,sgpGene
sgi = supportedGeneIDs()
sgi[sgi$db == "ensGene", ]
## db track subtrack GeneID
## 11 ensGene Ensembl Genes <NA> Ensembl gene ID
## AvailableGenomes
## 11 anoCar1,anoGam1,apiMel2,bosTau3,bosTau4,canFam1,canFam2,cavPor3,ce6,ci2,danRer3,danRer4,danRer5,danRer6,equCab2,felCat3,fr1,fr2,galGal2,galGal3,gasAcu1,hg16,hg17,hg18,hg19,mm7,mm8,mm9,monDom4,monDom5,ornAna1,oryLat2,panTro1,panTro2,ponAbe2,rheMac2,rn3,rn4,sacCer1,sacCer2,taeGut1,tetNig1,tetNig2,xenTro2
de.genes = sgn.genes[, 1]
names(de.genes) = rownames(counts)
t = nullp(de.genes, "hg38", "ensGene", bias.data = log(apply(counts, 1, sum)),
plot.fit = TRUE)
gores = goseq(t, "hg38", "ensGene", method = "Wallenius")
## Fetching GO annotations...
##
## Calculating the p-values...
dim(gores)
## [1] 14907 5
sort(table(gores$numInCat))
##
## 109 121 139 156 167 179 189 190 191 192 196 197
## 1 1 1 1 1 1 1 1 1 1 1 1
## 202 204 206 210 211 214 215 220 227 230 234 235
## 1 1 1 1 1 1 1 1 1 1 1 1
## 237 240 243 256 257 258 264 265 266 267 268 269
## 1 1 1 1 1 1 1 1 1 1 1 1
## 279 286 292 295 297 301 305 307 312 320 322 326
## 1 1 1 1 1 1 1 1 1 1 1 1
## 329 332 334 338 340 343 345 347 348 358 359 360
## 1 1 1 1 1 1 1 1 1 1 1 1
## 375 376 378 381 385 389 390 399 400 402 405 407
## 1 1 1 1 1 1 1 1 1 1 1 1
## 408 415 416 417 419 420 425 431 437 438 443 446
## 1 1 1 1 1 1 1 1 1 1 1 1
## 448 450 451 452 456 458 459 460 462 464 465 470
## 1 1 1 1 1 1 1 1 1 1 1 1
## 472 473 480 481 483 487 488 489 498 500 502 508
## 1 1 1 1 1 1 1 1 1 1 1 1
## 510 511 514 517 522 524 531 539 544 547 555 556
## 1 1 1 1 1 1 1 1 1 1 1 1
## 557 558 559 565 567 572 574 577 578 580 582 593
## 1 1 1 1 1 1 1 1 1 1 1 1
## 594 595 597 598 603 610 615 617 618 621 624 627
## 1 1 1 1 1 1 1 1 1 1 1 1
## 628 635 638 640 644 646 648 649 651 653 654 656
## 1 1 1 1 1 1 1 1 1 1 1 1
## 661 673 674 675 676 688 689 696 697 706 707 716
## 1 1 1 1 1 1 1 1 1 1 1 1
## 720 727 731 734 737 749 753 761 765 769 772 777
## 1 1 1 1 1 1 1 1 1 1 1 1
## 782 794 797 802 809 818 822 831 833 834 838 845
## 1 1 1 1 1 1 1 1 1 1 1 1
## 848 852 864 865 876 901 911 915 917 950 963 967
## 1 1 1 1 1 1 1 1 1 1 1 1
## 968 975 982 989 992 1011 1017 1018 1076 1084 1085 1095
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1098 1101 1116 1127 1139 1166 1187 1208 1209 1210 1211 1213
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1225 1235 1246 1251 1254 1259 1266 1267 1268 1273 1276 1291
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1303 1323 1340 1356 1359 1363 1366 1368 1372 1374 1386 1391
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1393 1410 1440 1448 1490 1496 1509 1544 1568 1625 1654 1660
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1690 1698 1701 1732 1754 1756 1764 1775 1841 1865 1907 1908
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1909 1915 1920 1949 2013 2044 2048 2056 2065 2066 2074 2103
## 1 1 1 1 1 1 1 1 1 1 1 1
## 2157 2166 2182 2212 2223 2245 2263 2294 2345 2352 2366 2390
## 1 1 1 1 1 1 1 1 1 1 1 1
## 2430 2438 2441 2443 2473 2490 2492 2538 2550 2574 2612 2637
## 1 1 1 1 1 1 1 1 1 1 1 1
## 2643 2724 2732 2762 2832 2850 2900 2912 2945 2964 2965 3037
## 1 1 1 1 1 1 1 1 1 1 1 1
## 3043 3092 3106 3121 3244 3355 3357 3622 3678 3692 3716 3802
## 1 1 1 1 1 1 1 1 1 1 1 1
## 3807 3830 3898 3913 3973 3980 4080 4109 4124 4329 4498 4690
## 1 1 1 1 1 1 1 1 1 1 1 1
## 4733 4806 4905 4995 5007 5201 5212 5479 5938 6131 6160 6267
## 1 1 1 1 1 1 1 1 1 1 1 1
## 6408 6467 6546 7020 7037 7541 7555 7704 8391 8547 8688 9142
## 1 1 1 1 1 1 1 1 1 1 1 1
## 9228 10015 110 144 159 160 163 166 174 177 178 180
## 1 1 2 2 2 2 2 2 2 2 2 2
## 183 199 205 208 217 219 221 224 225 226 228 229
## 2 2 2 2 2 2 2 2 2 2 2 2
## 239 245 254 260 262 263 274 275 284 288 293 294
## 2 2 2 2 2 2 2 2 2 2 2 2
## 296 298 302 303 309 310 311 314 315 316 317 321
## 2 2 2 2 2 2 2 2 2 2 2 2
## 333 335 336 337 339 342 350 351 353 354 355 361
## 2 2 2 2 2 2 2 2 2 2 2 2
## 364 365 366 371 374 384 392 397 398 409 410 421
## 2 2 2 2 2 2 2 2 2 2 2 2
## 427 432 434 449 453 455 475 492 504 529 530 537
## 2 2 2 2 2 2 2 2 2 2 2 2
## 550 560 564 570 575 576 579 605 614 623 625 641
## 2 2 2 2 2 2 2 2 2 2 2 2
## 659 705 778 789 824 883 1112 1361 1774 1824 2119 2287
## 2 2 2 2 2 2 2 2 2 2 2 2
## 2385 2589 2692 9203 93 137 149 162 165 168 169 173
## 2 2 2 2 3 3 3 3 3 3 3 3
## 185 186 193 194 200 203 209 212 222 223 232 233
## 3 3 3 3 3 3 3 3 3 3 3 3
## 236 238 246 247 248 250 251 253 255 271 323 324
## 3 3 3 3 3 3 3 3 3 3 3 3
## 349 352 356 367 383 396 495 563 642 694 978 116
## 3 3 3 3 3 3 3 3 3 3 3 4
## 117 125 136 142 143 153 161 170 172 184 187 198
## 4 4 4 4 4 4 4 4 4 4 4 4
## 213 216 242 261 282 325 357 363 394 404 413 592
## 4 4 4 4 4 4 4 4 4 4 4 4
## 88 95 100 111 119 130 138 140 145 147 148 152
## 5 5 5 5 5 5 5 5 5 5 5 5
## 154 164 175 176 181 182 280 313 89 96 97 123
## 5 5 5 5 5 5 5 5 6 6 6 6
## 124 127 129 133 135 150 151 155 158 249 368 85
## 6 6 6 6 6 6 6 6 6 6 6 7
## 101 112 113 120 146 157 171 87 103 106 107 114
## 7 7 7 7 7 7 7 8 8 8 8 8
## 115 122 131 132 134 141 82 102 108 319 70 91
## 8 8 8 8 8 8 9 9 9 9 10 10
## 98 126 66 69 79 92 105 118 128 83 90 56
## 10 10 11 11 11 11 11 11 11 12 12 13
## 76 81 99 67 74 77 84 94 61 78 104 57
## 13 13 13 14 14 14 14 14 15 15 15 16
## 58 64 71 73 86 53 80 54 65 62 60 68
## 16 16 16 17 17 18 18 19 19 20 21 21
## 75 49 50 72 55 51 52 59 47 41 45 63
## 22 23 23 24 25 26 26 26 27 28 28 28
## 46 37 43 42 38 33 39 48 44 35 40 30
## 29 31 31 32 33 34 34 34 37 38 39 40
## 36 31 28 32 34 29 26 25 23 22 27 24
## 43 46 47 49 49 53 55 60 63 66 66 67
## 21 20 17 18 19 15 16 14 13 11 12 10
## 75 82 94 96 111 133 134 141 189 192 201 231
## 9 8 7 6 5 4 3 2 1
## 295 328 393 494 621 888 1188 1982 3922
gores = gores[gores$numInCat > 2, ]
gores = cbind(gores, p.over.adj = p.adjust(gores$over_represented_pvalue, m = "BH"))
gores = gores[gores$p.over.adj < 0.2, ]
for (i in 1:nrow(gores)) print(GOTERM[[gores[i, 1]]]@Term)
## [1] "histone demethylase activity"
## [1] "demethylase activity"
## [1] "neurexin family protein binding"
## [1] "histone demethylase activity (H3-K4 specific)"
## [1] "histone H3-K4 demethylation"
## [1] "oxidoreductase activity, acting on paired donors, with incorporation or reduction of molecular oxygen, 2-oxoglutarate as one donor, and incorporation of one atom each of oxygen into both donors"
#
library("hgu95av2.db")
##
keytypes(hgu95av2.db)
## [1] "ENTREZID" "PFAM" "IPI" "PROSITE"
## [5] "ACCNUM" "ALIAS" "CHR" "CHRLOC"
## [9] "CHRLOCEND" "ENZYME" "MAP" "PATH"
## [13] "PMID" "REFSEQ" "SYMBOL" "UNIGENE"
## [17] "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "GENENAME"
## [21] "UNIPROT" "GO" "EVIDENCE" "ONTOLOGY"
## [25] "GOALL" "EVIDENCEALL" "ONTOLOGYALL" "PROBEID"
## [29] "OMIM" "UCSCKG"
keys(hgu95av2.db, "ENSEMBL")[1:10]
## [1] "ENSG00000121410" "ENSG00000175899" "ENSG00000256069"
## [4] "ENSG00000171428" "ENSG00000156006" "ENSG00000196136"
## [7] "ENSG00000114771" "ENSG00000127837" "ENSG00000129673"
## [10] "ENSG00000090861"
cols(hgu95av2.db)
## [1] "PROBEID" "ENTREZID" "PFAM" "IPI"
## [5] "PROSITE" "ACCNUM" "ALIAS" "CHR"
## [9] "CHRLOC" "CHRLOCEND" "ENZYME" "MAP"
## [13] "PATH" "PMID" "REFSEQ" "SYMBOL"
## [17] "UNIGENE" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [21] "GENENAME" "UNIPROT" "GO" "EVIDENCE"
## [25] "ONTOLOGY" "GOALL" "EVIDENCEALL" "ONTOLOGYALL"
## [29] "OMIM" "UCSCKG"
go.ann = select(hgu95av2.db, cols = c("ENSEMBL", "GO"), keys = rownames(counts),
keytype = "ENSEMBL")
## Warning: 'select' resulted in 1:many mapping between keys and return rows
go.ann[1:10, ]
## ENSEMBL GO EVIDENCE ONTOLOGY
## 1 ENSG00000127720 GO:0008168 IEA MF
## 2 ENSG00000242018 <NA> <NA> <NA>
## 3 ENSG00000051596 GO:0000346 IDA CC
## 4 ENSG00000051596 GO:0003723 IEA MF
## 5 ENSG00000051596 GO:0006397 IEA BP
## 6 ENSG00000051596 GO:0006406 IDA BP
## 7 ENSG00000051596 GO:0008380 IEA BP
## 8 ENSG00000051596 GO:0046784 IDA BP
## 9 ENSG00000236211 <NA> <NA> <NA>
## 10 ENSG00000213697 <NA> <NA> <NA>
go.ann = split(go.ann$GO, go.ann$ENSEMBL)
go.ann[1:3]
## $ENSG00000000419
## [1] "GO:0004169" "GO:0004582" "GO:0005515" "GO:0005537" "GO:0005783"
## [6] "GO:0005789" "GO:0005789" "GO:0006488" "GO:0006501" "GO:0006506"
## [11] "GO:0016020" "GO:0018279" "GO:0019348" "GO:0019673" "GO:0033185"
## [16] "GO:0035268" "GO:0035269" "GO:0043178" "GO:0043687" "GO:0044267"
##
## $ENSG00000000457
## [1] "GO:0005515" "GO:0005524" "GO:0005737" "GO:0005794" "GO:0006468"
## [6] "GO:0016477" "GO:0030027"
##
## $ENSG00000000460
## [1] NA
t = nullp(de.genes, "test", "test", bias.data = log(apply(counts, 1, sum)),
plot.fit = TRUE)
gores = goseq(t, "test", "test", method = "Wallenius", gene2cat = go.ann)
## Using manually entered categories.
## Calculating the p-values...
gores = gores[gores$numInCat > 2, ]
gores = cbind(gores, p.over.adj = p.adjust(gores$over_represented_pvalue, m = "BH"))
gores = gores[gores$p.over.adj < 0.2, ]
for (i in 1:nrow(gores)) print(GOTERM[[gores[i, 1]]]@Term)
## [1] "histone demethylase activity (H3-K4 specific)"
## [1] "neurexin family protein binding"
## [1] "histone H3-K4 demethylation"