# source('http://bioconductor.org/biocLite.R') biocLite('limma')
# biocLite('edgeR') biocLite('tweeDEseqCountData')
# biocLite('preprocessCore') biocLite('DESeq')

library(limma)
library(edgeR)
library(tweeDEseqCountData)

## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, as.data.frame, cbind, colnames, duplicated,
##     eval, Filter, Find, get, intersect, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rep.int, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unlist
## 
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

library(DESeq)

## Loading required package: locfit
## locfit 1.5-9.1    2013-03-22
## Loading required package: lattice
## 
## Attaching package: 'DESeq'
## 
## The following object is masked from 'package:limma':
## 
##     plotMA


# load data ####
data(pickrell1)
counts = exprs(pickrell1.eset)
dim(counts)

## [1] 38415    69

gender = pickrell1.eset$gender
table(gender)

## gender
## female   male 
##     40     29


barplot(colSums(counts) * 1e-06, names = 1:69, ylab = "Library size (millions)")

plot of chunk unnamed-chunk-1


# filter by coverage: at least 20 samples have at least 20 reads
counts = counts[apply(counts >= 20, 1, sum) >= 20, ]
dim(counts)

## [1] 16618    69


# normalization ##### quantile by hand
mn = apply(counts, 2, sort)
mn = apply(mn, 1, mean)
counts.qnorm = apply(counts, 2, function(x) {
    mn[rank(x, ties.method = "random")]
})

# actually there is a function: library(preprocessCore) counts.qnorm =
# normalize.quantiles(counts)
par(mfrow = c(2, 3), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1))
col = rainbow(ncol(counts))
plot(density(log(counts[, 1]), bw = 0.2), ylim = c(0, 0.3), col = col[1], "Row data")
for (i in 2:ncol(counts)) lines(density(log(counts[, i]), bw = 0.2), col = col[i])

plot(density(log(counts.qnorm[, 1]), bw = 0.2), ylim = c(0, 0.3), col = col[1], 
    "Quantile normalization")
for (i in 2:ncol(counts.qnorm)) lines(density(log(counts.qnorm[, i]), bw = 0.2), 
    col = col[i])

# norm to lib size
counts.ls = sweep(counts, 2, apply(counts, 2, sum), "/")
plot(density(log(counts.ls[, 1])), ylim = c(0, 0.3), col = col[1], "Lib size normalization")
for (i in 2:ncol(counts.ls)) lines(density(log(counts.ls[, i]), bw = 0.2), col = col[i])

# use edgeR fot normalization upperquartile
lib.size = apply(counts, 2, sum)
edger.uq = lib.size * calcNormFactors(counts, "upperquartile")
counts.uq = sweep(counts, 2, edger.uq, "/")

plot(density(log(counts.uq[, 1])), ylim = c(0, 0.3), col = col[1], "Upperquartile normalization")
for (i in 2:ncol(counts.uq)) lines(density(log(counts.uq[, i]), bw = 0.2), col = col[i])
# legend('topright',col=col,lwd=1,legend=1:length(lib.size),ncol=3,cex=0.8)
# RLE
edger.rle = lib.size * calcNormFactors(counts, "RLE")
counts.rle = sweep(counts, 2, edger.rle, "/")

plot(density(log(counts.rle[, 1])), ylim = c(0, 0.3), col = col[1], "RLE normalization")
for (i in 2:ncol(counts.rle)) lines(density(log(counts.rle[, i]), bw = 0.2), 
    col = col[i])

# TMM
edger.tmm = lib.size * calcNormFactors(counts, "TMM")
counts.tmm = sweep(counts, 2, edger.tmm, "/")

plot(density(log(counts.tmm[, 1])), ylim = c(0, 0.3), col = col[1], "TMM normalization")
for (i in 2:ncol(counts.tmm)) lines(density(log(counts.tmm[, i]), bw = 0.2), 
    col = col[i])

plot of chunk unnamed-chunk-1



# edgeR normalizations by hand upperquartile
uq = apply(counts, 2, function(x) {
    x = sort(x)
    sum(x[(length(x) * 0.75)])
})
plot(uq, edger.uq)
cor(uq, edger.uq)

## [1] 1


# RLE
mc = apply(counts, 1, prod)^(1/ncol(counts))
mc = apply(counts, 1, function(x) {
    x = x[x != 0]
    prod(x)^(1/length(x))
})
nf = sweep(counts, 1, mc, "/")
rle = apply(nf, 2, median, na.rm = T)
plot(rle, edger.rle)
cor(rle, edger.rle)

## [1] 1


# TMM is a bit too hard...

# make figure for presentation #####
cnts = read.table("/home/mazin/teaching/NGS2014/r.examples/mg.cnts")
cnts[1:3, ]

##                           gene.chr_id gene.start gene.stop gene.strand
## GCW_00005 gi|564749496|gb|CP006916.1|       1892      2680           1
## GCW_00010 gi|564749496|gb|CP006916.1|       3161      4546           1
## GCW_00015 gi|564749496|gb|CP006916.1|       4779      4877          -1
##           cnts.lcE4 cnts.scE4 rpkm.lcE4 rpkm.scE4 exp.lcE4 exp.scE4
## GCW_00005        51         0     38.43       0.0    5.265   -5.087
## GCW_00010        67         3     29.48       6.8    4.883    2.772
## GCW_00015         0         0      0.00       0.0   -5.087   -5.087

c = cnts[, 5] + 1
t = cnts[, 6] + 1

par(mfrow = c(2, 3), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1), lwd = 3, cex = 1.1)

plot of chunk unnamed-chunk-1

bw = 0.4
xlim = log(range(c, t))
ylim = c(0, 0.27)
xlab = ""

libsize = c(sum(c), sum(t))
plot(density(log(c), from = xlim[1], to = xlim[2], bw = bw), ylim = ylim, col = "darkgray", 
    main = "Row data", xlab = xlab)
lines(density(log(t), from = xlim[1], to = xlim[2], bw = bw), col = "red")

xlim = c(-16, 0)
plot(density(log(c/libsize[1]), bw = bw), xlim = xlim, ylim = ylim, col = "darkgray", 
    main = "Lib size normalization", xlab = xlab)
lines(density(log(t/libsize[2]), bw = bw), col = "red")

uq = calcNormFactors(cbind(c, t), method = "upperquartile") * libsize
plot(density(log(c/uq[1]), bw = bw), xlim = xlim, ylim = ylim, col = "darkgray", 
    main = "Upperquartile normalization", xlab = xlab)
lines(density(log(t/uq[2]), bw = bw), col = "red")

rle = calcNormFactors(cbind(c, t), method = "RLE") * libsize
plot(density(log(c/rle[1]), bw = bw), xlim = xlim, ylim = ylim, col = "darkgray", 
    main = "RLE normalization", xlab = xlab)
lines(density(log(t/rle[2]), bw = bw), col = "red")

tmm = calcNormFactors(cbind(c, t), method = "TMM") * libsize
plot(density(log(c/tmm[1]), bw = bw), xlim = xlim, ylim = ylim, col = "darkgray", 
    main = "TMM normalization", xlab = xlab)
lines(density(log(t/tmm[2]), bw = bw), col = "red")

library(preprocessCore)
qn = normalize.quantiles(cbind(c, t))
plot(density(log(qn[, 1]), bw = bw), ylim = ylim, col = "darkgray", main = "Quantile normalization", 
    xlab = xlab)
lines(density(log(qn[, 1]), bw = bw), col = "red")

plot of chunk unnamed-chunk-1


par(mfrow = c(1, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1), lwd = 2, cex = 1)

dc = density(log(c/uq[1]), bw = bw, from = -16, to = 0)
dt = density(log(t/uq[2]), bw = bw, from = -16, to = 0)
plot(dc, ylim = ylim, col = "blue", main = "Upperquartile normalization", xlab = xlab)
lines(dt, col = "red")
q = sort(log(t/uq[2]))[length(t) * 0.75]
polygon(c(q, dc$x[dc$x >= q], q), c(0, dc$y[dc$x >= q], 0), col = "#0000FF60", 
    border = NA)
polygon(c(q, dt$x[dt$x >= q], q), c(0, dt$y[dt$x >= q], 0), col = "#FF000060", 
    border = NA)

plot of chunk unnamed-chunk-1


# look at differential expression #### use edgeR
par(mfrow = c(1, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1), lwd = 1, cex = 1)
edger = DGEList(counts)
edger = calcNormFactors(edger, method = "RLE")
design = model.matrix(~gender)
edger = estimateGLMCommonDisp(edger, design)
edger = estimateGLMTrendedDisp(edger, design)

## Loading required package: splines

edger = estimateGLMTagwiseDisp(edger, design)
plotBCV(edger)

plot of chunk unnamed-chunk-1

strict.disp = pmax(edger$tagwise.dispersion, edger$trended.dispersion, edger$common.dispersion)

glm = glmFit(edger, design)
glm.st = glmFit(edger, design, dispersion = strict.disp)

pv = glmLRT(glm)
pv.st = glmLRT(glm.st)
sgn.genes = data.frame(edger = p.adjust(pv.st$table$PValue, method = "BH") < 
    0.05)

table(p.adjust(pv$table$PValue, method = "BH") < 0.05)

## 
## FALSE  TRUE 
## 16552    66

table(sgn.genes$edger)

## 
## FALSE  TRUE 
## 16566    52

plotSmear(pv.st, de.tags = rownames(pv.st$table)[sgn.genes$edger], cex = 1)

plot of chunk unnamed-chunk-1


# use linear models through limma let's use RLE normalization

# prepare data
v = voom(edger, design, plot = TRUE)

plot of chunk unnamed-chunk-1

# fit model
fit = lmFit(v, design)
fit = eBayes(fit)
options(digits = 3)
topTable(fit, coef = 2, n = 16, sort = "p")

##                    ID  logFC AveExpr     t  P.Value adj.P.Val    B
## 10235 ENSG00000229807 -9.814  3.7595 -34.3 3.89e-46  6.46e-42 69.8
## 5164  ENSG00000099749  4.252  0.2657  27.2 2.00e-39  1.66e-35 64.8
## 13006 ENSG00000157828  3.285  3.2592  26.6 8.27e-39  4.58e-35 73.0
## 16431 ENSG00000233864  4.897 -0.6027  25.9 5.32e-38  2.21e-34 62.3
## 7812  ENSG00000131002  5.440 -0.2199  21.9 1.96e-33  6.50e-30 56.1
## 9547  ENSG00000198692  2.397  2.6317  20.7 6.19e-32  1.71e-28 59.2
## 15647 ENSG00000213318  4.306  2.2165  19.4 3.26e-30  7.73e-27 54.3
## 4721  ENSG00000165246  5.331 -0.5406  19.0 1.21e-29  2.51e-26 49.7
## 12123 ENSG00000129824  2.784  4.6629  17.6 1.23e-27  2.26e-24 51.4
## 13958 ENSG00000183878  1.877  2.6941  16.7 2.09e-26  3.48e-23 48.1
## 8854  ENSG00000012817  1.472  4.6557  14.8 1.80e-23  2.71e-20 42.5
## 7432  ENSG00000146938  4.470 -0.8290  14.5 6.61e-23  9.16e-20 37.4
## 15025 ENSG00000243209  2.525 -0.0668  14.0 3.87e-22  4.95e-19 35.8
## 2478  ENSG00000067048  1.672  5.2588  13.5 2.27e-21  2.69e-18 37.9
## 6612  ENSG00000006757 -0.987  2.4851 -10.3 7.47e-16  8.27e-13 25.4
## 9959  ENSG00000232928  1.435  3.2017  10.3 8.74e-16  9.08e-13 25.4

limma = topTable(fit, adjust.method = "BH", number = Inf, coef = 2, sort.by = "none")
sgn.genes$limma = limma$adj.P.Val < 0.05
table(sgn.genes$limma)

## 
## FALSE  TRUE 
## 16558    60


plot(limma$P.Value, pv.st$table$PValue, log = "xy", xlab = "limma p-value", 
    ylab = "edgeR p-value", pch = 19, col = sgn.genes$limma + sgn.genes$edger + 
        1)

abline(v = max(limma$P.Value[sgn.genes$limma]), col = "red")
abline(h = max(pv.st$table$PValue[sgn.genes$edger]), col = "red")

plot of chunk unnamed-chunk-1

table(sgn.genes)

##        limma
## edger   FALSE  TRUE
##   FALSE 16552    14
##   TRUE      6    46



# use DESeq
deseq = newCountDataSet(counts, gender)
deseq = estimateSizeFactors(deseq)

# DESeq uses RLE
plot(sizeFactors(deseq), edger.rle, xlab = "DESeq size factors", ylab = "edgeR RLE sizes")

plot of chunk unnamed-chunk-1

deseq = estimateDispersions(deseq)
plotDispEsts(deseq)

plot of chunk unnamed-chunk-1


plot(fitInfo(deseq)$perGeneDispEsts, edger$tagwise.dispersion, log = "xy", pch = 16, 
    col = "#00000030", xlab = "DESseq disp", ylab = "edgeR disp")

plot of chunk unnamed-chunk-1


deseq.glm0 = fitNbinomGLMs(deseq, count ~ 1)

## ......

## Warning: glm.fit: algorithm did not converge

## ..........

deseq.glm = fitNbinomGLMs(deseq, count ~ gender)

## .....

## Warning: glm.fit: algorithm did not converge

## .

## Warning: glm.fit: algorithm did not converge

## ..........

## Warning: glm.fit: algorithm did not converge

deseq.pv = nbinomGLMTest(deseq.glm, deseq.glm0)
deseq.qv = p.adjust(deseq.pv, method = "BH")

sgn.genes$deseq = deseq.qv < 0.05
table(sgn.genes)

## , , deseq = FALSE
## 
##        limma
## edger   FALSE  TRUE
##   FALSE 16552    14
##   TRUE      6     1
## 
## , , deseq = TRUE
## 
##        limma
## edger   FALSE  TRUE
##   FALSE     0     0
##   TRUE      0    45


vennDiagram(vennCounts(sgn.genes))

plot of chunk unnamed-chunk-1


# different Venn diagrams
x = matrix(runif(200) > 0.3, ncol = 4)
vennDiagram(vennCounts(x))  #limma

plot of chunk unnamed-chunk-1

library(venneuler)

## Loading required package: rJava

plot(venneuler(x))

plot of chunk unnamed-chunk-1

library(Vennerable)

## Loading required package: graph
## Loading required package: RBGL
## Loading required package: grid
## Loading required package: RColorBrewer
## Loading required package: reshape
## Loading required package: plyr
## 
## Attaching package: 'plyr'
## 
## The following object is masked from 'package:graph':
## 
##     join
## 
## 
## Attaching package: 'reshape'
## 
## The following object is masked from 'package:plyr':
## 
##     rename, round_any
## 
## Loading required package: gtools
## Loading required package: xtable

plot(Venn(apply(x[, 1:3], 2, which)), doWeights = F)

plot of chunk unnamed-chunk-1

plot(Venn(apply(x[, 1:3], 2, which)), doWeights = T)

plot of chunk unnamed-chunk-1

plot(Venn(apply(x, 2, which)), doWeights = F)

plot of chunk unnamed-chunk-1

plot(Venn(apply(x, 2, which)), doWeights = T)

plot of chunk unnamed-chunk-1


# GOstat source('http://bioconductor.org/biocLite.R') biocLite('GO.db')
# biocLite('hgu95av2.db') biocLite('goseq')

library(GO.db)

## Loading required package: AnnotationDbi
## Loading required package: DBI

ls("package:GO.db")

##  [1] "GO"            "GOBPANCESTOR"  "GOBPCHILDREN"  "GOBPOFFSPRING"
##  [5] "GOBPPARENTS"   "GOCCANCESTOR"  "GOCCCHILDREN"  "GOCCOFFSPRING"
##  [9] "GOCCPARENTS"   "GO.db"         "GO_dbconn"     "GO_dbfile"    
## [13] "GO_dbInfo"     "GO_dbschema"   "GOMAPCOUNTS"   "GOMFANCESTOR" 
## [17] "GOMFCHILDREN"  "GOMFOFFSPRING" "GOMFPARENTS"   "GOOBSOLETE"   
## [21] "GOSYNONYM"     "GOTERM"

names(as.list(GOTERM[1:10]))

##  [1] "GO:0000001" "GO:0000002" "GO:0000003" "GO:0000006" "GO:0000007"
##  [6] "GO:0000009" "GO:0000010" "GO:0000011" "GO:0000012" "GO:0000014"

GOTERM[["GO:0000006"]]

## GOID: GO:0000006
## Term: high affinity zinc uptake transmembrane transporter activity
## Ontology: MF
## Definition: Catalysis of the transfer of a solute or solutes from
##     one side of a membrane to the other according to the reaction:
##     Zn2+(out) = Zn2+(in), probably powered by proton motive force.
##     In high affinity transport the transporter is able to bind the
##     solute even if it is only present at very low concentrations.



library(goseq)

## Loading required package: BiasedUrn
## Loading required package: geneLenDataBase

sg = supportedGenomes()
sg[sg$species == "Human", ]

##     db species      date                               name
## 1 hg38   Human Dec. 2013 Genome Reference Consortium GRCh38
## 2 hg19   Human Feb. 2009 Genome Reference Consortium GRCh37
## 3 hg18   Human Mar. 2006                    NCBI Build 36.1
## 4 hg17   Human  May 2004                      NCBI Build 35
## 5 hg16   Human Jul. 2003                      NCBI Build 34
##                                                                                                                    AvailableGeneIDs
## 1                                                                                                                                  
## 2                                                      ccdsGene,ensGene,exoniphy,geneSymbol,knownGene,nscanGene,refGene,xenoRefGene
## 3   acembly,acescan,ccdsGene,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,knownGeneOld3,refGene,sgpGene,sibGene,xenoRefGene
## 4 acembly,acescan,ccdsGene,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,refGene,sgpGene,vegaGene,vegaPseudoGene,xenoRefGene
## 5                                                      acembly,ensGene,exoniphy,geneSymbol,geneid,genscan,knownGene,refGene,sgpGene


sgi = supportedGeneIDs()
sgi[sgi$db == "ensGene", ]

##         db         track subtrack          GeneID
## 11 ensGene Ensembl Genes     <NA> Ensembl gene ID
##                                                                                                                                                                                                                                                                                                   AvailableGenomes
## 11 anoCar1,anoGam1,apiMel2,bosTau3,bosTau4,canFam1,canFam2,cavPor3,ce6,ci2,danRer3,danRer4,danRer5,danRer6,equCab2,felCat3,fr1,fr2,galGal2,galGal3,gasAcu1,hg16,hg17,hg18,hg19,mm7,mm8,mm9,monDom4,monDom5,ornAna1,oryLat2,panTro1,panTro2,ponAbe2,rheMac2,rn3,rn4,sacCer1,sacCer2,taeGut1,tetNig1,tetNig2,xenTro2


de.genes = sgn.genes[, 1]
names(de.genes) = rownames(counts)
t = nullp(de.genes, "hg38", "ensGene", bias.data = log(apply(counts, 1, sum)), 
    plot.fit = TRUE)
gores = goseq(t, "hg38", "ensGene", method = "Wallenius")

## Fetching GO annotations...
## 
## Calculating the p-values...

dim(gores)

## [1] 14907     5

sort(table(gores$numInCat))

## 
##   109   121   139   156   167   179   189   190   191   192   196   197 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   202   204   206   210   211   214   215   220   227   230   234   235 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   237   240   243   256   257   258   264   265   266   267   268   269 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   279   286   292   295   297   301   305   307   312   320   322   326 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   329   332   334   338   340   343   345   347   348   358   359   360 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   375   376   378   381   385   389   390   399   400   402   405   407 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   408   415   416   417   419   420   425   431   437   438   443   446 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   448   450   451   452   456   458   459   460   462   464   465   470 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   472   473   480   481   483   487   488   489   498   500   502   508 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   510   511   514   517   522   524   531   539   544   547   555   556 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   557   558   559   565   567   572   574   577   578   580   582   593 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   594   595   597   598   603   610   615   617   618   621   624   627 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   628   635   638   640   644   646   648   649   651   653   654   656 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   661   673   674   675   676   688   689   696   697   706   707   716 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   720   727   731   734   737   749   753   761   765   769   772   777 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   782   794   797   802   809   818   822   831   833   834   838   845 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   848   852   864   865   876   901   911   915   917   950   963   967 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##   968   975   982   989   992  1011  1017  1018  1076  1084  1085  1095 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  1098  1101  1116  1127  1139  1166  1187  1208  1209  1210  1211  1213 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  1225  1235  1246  1251  1254  1259  1266  1267  1268  1273  1276  1291 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  1303  1323  1340  1356  1359  1363  1366  1368  1372  1374  1386  1391 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  1393  1410  1440  1448  1490  1496  1509  1544  1568  1625  1654  1660 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  1690  1698  1701  1732  1754  1756  1764  1775  1841  1865  1907  1908 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  1909  1915  1920  1949  2013  2044  2048  2056  2065  2066  2074  2103 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  2157  2166  2182  2212  2223  2245  2263  2294  2345  2352  2366  2390 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  2430  2438  2441  2443  2473  2490  2492  2538  2550  2574  2612  2637 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  2643  2724  2732  2762  2832  2850  2900  2912  2945  2964  2965  3037 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  3043  3092  3106  3121  3244  3355  3357  3622  3678  3692  3716  3802 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  3807  3830  3898  3913  3973  3980  4080  4109  4124  4329  4498  4690 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  4733  4806  4905  4995  5007  5201  5212  5479  5938  6131  6160  6267 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  6408  6467  6546  7020  7037  7541  7555  7704  8391  8547  8688  9142 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  9228 10015   110   144   159   160   163   166   174   177   178   180 
##     1     1     2     2     2     2     2     2     2     2     2     2 
##   183   199   205   208   217   219   221   224   225   226   228   229 
##     2     2     2     2     2     2     2     2     2     2     2     2 
##   239   245   254   260   262   263   274   275   284   288   293   294 
##     2     2     2     2     2     2     2     2     2     2     2     2 
##   296   298   302   303   309   310   311   314   315   316   317   321 
##     2     2     2     2     2     2     2     2     2     2     2     2 
##   333   335   336   337   339   342   350   351   353   354   355   361 
##     2     2     2     2     2     2     2     2     2     2     2     2 
##   364   365   366   371   374   384   392   397   398   409   410   421 
##     2     2     2     2     2     2     2     2     2     2     2     2 
##   427   432   434   449   453   455   475   492   504   529   530   537 
##     2     2     2     2     2     2     2     2     2     2     2     2 
##   550   560   564   570   575   576   579   605   614   623   625   641 
##     2     2     2     2     2     2     2     2     2     2     2     2 
##   659   705   778   789   824   883  1112  1361  1774  1824  2119  2287 
##     2     2     2     2     2     2     2     2     2     2     2     2 
##  2385  2589  2692  9203    93   137   149   162   165   168   169   173 
##     2     2     2     2     3     3     3     3     3     3     3     3 
##   185   186   193   194   200   203   209   212   222   223   232   233 
##     3     3     3     3     3     3     3     3     3     3     3     3 
##   236   238   246   247   248   250   251   253   255   271   323   324 
##     3     3     3     3     3     3     3     3     3     3     3     3 
##   349   352   356   367   383   396   495   563   642   694   978   116 
##     3     3     3     3     3     3     3     3     3     3     3     4 
##   117   125   136   142   143   153   161   170   172   184   187   198 
##     4     4     4     4     4     4     4     4     4     4     4     4 
##   213   216   242   261   282   325   357   363   394   404   413   592 
##     4     4     4     4     4     4     4     4     4     4     4     4 
##    88    95   100   111   119   130   138   140   145   147   148   152 
##     5     5     5     5     5     5     5     5     5     5     5     5 
##   154   164   175   176   181   182   280   313    89    96    97   123 
##     5     5     5     5     5     5     5     5     6     6     6     6 
##   124   127   129   133   135   150   151   155   158   249   368    85 
##     6     6     6     6     6     6     6     6     6     6     6     7 
##   101   112   113   120   146   157   171    87   103   106   107   114 
##     7     7     7     7     7     7     7     8     8     8     8     8 
##   115   122   131   132   134   141    82   102   108   319    70    91 
##     8     8     8     8     8     8     9     9     9     9    10    10 
##    98   126    66    69    79    92   105   118   128    83    90    56 
##    10    10    11    11    11    11    11    11    11    12    12    13 
##    76    81    99    67    74    77    84    94    61    78   104    57 
##    13    13    13    14    14    14    14    14    15    15    15    16 
##    58    64    71    73    86    53    80    54    65    62    60    68 
##    16    16    16    17    17    18    18    19    19    20    21    21 
##    75    49    50    72    55    51    52    59    47    41    45    63 
##    22    23    23    24    25    26    26    26    27    28    28    28 
##    46    37    43    42    38    33    39    48    44    35    40    30 
##    29    31    31    32    33    34    34    34    37    38    39    40 
##    36    31    28    32    34    29    26    25    23    22    27    24 
##    43    46    47    49    49    53    55    60    63    66    66    67 
##    21    20    17    18    19    15    16    14    13    11    12    10 
##    75    82    94    96   111   133   134   141   189   192   201   231 
##     9     8     7     6     5     4     3     2     1 
##   295   328   393   494   621   888  1188  1982  3922

gores = gores[gores$numInCat > 2, ]
gores = cbind(gores, p.over.adj = p.adjust(gores$over_represented_pvalue, m = "BH"))

gores = gores[gores$p.over.adj < 0.2, ]
for (i in 1:nrow(gores)) print(GOTERM[[gores[i, 1]]]@Term)

## [1] "histone demethylase activity"
## [1] "demethylase activity"
## [1] "neurexin family protein binding"
## [1] "histone demethylase activity (H3-K4 specific)"
## [1] "histone H3-K4 demethylation"
## [1] "oxidoreductase activity, acting on paired donors, with incorporation or reduction of molecular oxygen, 2-oxoglutarate as one donor, and incorporation of one atom each of oxygen into both donors"


# 
library("hgu95av2.db")

##

keytypes(hgu95av2.db)

##  [1] "ENTREZID"     "PFAM"         "IPI"          "PROSITE"     
##  [5] "ACCNUM"       "ALIAS"        "CHR"          "CHRLOC"      
##  [9] "CHRLOCEND"    "ENZYME"       "MAP"          "PATH"        
## [13] "PMID"         "REFSEQ"       "SYMBOL"       "UNIGENE"     
## [17] "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "GENENAME"    
## [21] "UNIPROT"      "GO"           "EVIDENCE"     "ONTOLOGY"    
## [25] "GOALL"        "EVIDENCEALL"  "ONTOLOGYALL"  "PROBEID"     
## [29] "OMIM"         "UCSCKG"

keys(hgu95av2.db, "ENSEMBL")[1:10]

##  [1] "ENSG00000121410" "ENSG00000175899" "ENSG00000256069"
##  [4] "ENSG00000171428" "ENSG00000156006" "ENSG00000196136"
##  [7] "ENSG00000114771" "ENSG00000127837" "ENSG00000129673"
## [10] "ENSG00000090861"

cols(hgu95av2.db)

##  [1] "PROBEID"      "ENTREZID"     "PFAM"         "IPI"         
##  [5] "PROSITE"      "ACCNUM"       "ALIAS"        "CHR"         
##  [9] "CHRLOC"       "CHRLOCEND"    "ENZYME"       "MAP"         
## [13] "PATH"         "PMID"         "REFSEQ"       "SYMBOL"      
## [17] "UNIGENE"      "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
## [21] "GENENAME"     "UNIPROT"      "GO"           "EVIDENCE"    
## [25] "ONTOLOGY"     "GOALL"        "EVIDENCEALL"  "ONTOLOGYALL" 
## [29] "OMIM"         "UCSCKG"

go.ann = select(hgu95av2.db, cols = c("ENSEMBL", "GO"), keys = rownames(counts), 
    keytype = "ENSEMBL")

## Warning: 'select' resulted in 1:many mapping between keys and return rows

go.ann[1:10, ]

##            ENSEMBL         GO EVIDENCE ONTOLOGY
## 1  ENSG00000127720 GO:0008168      IEA       MF
## 2  ENSG00000242018       <NA>     <NA>     <NA>
## 3  ENSG00000051596 GO:0000346      IDA       CC
## 4  ENSG00000051596 GO:0003723      IEA       MF
## 5  ENSG00000051596 GO:0006397      IEA       BP
## 6  ENSG00000051596 GO:0006406      IDA       BP
## 7  ENSG00000051596 GO:0008380      IEA       BP
## 8  ENSG00000051596 GO:0046784      IDA       BP
## 9  ENSG00000236211       <NA>     <NA>     <NA>
## 10 ENSG00000213697       <NA>     <NA>     <NA>

go.ann = split(go.ann$GO, go.ann$ENSEMBL)
go.ann[1:3]

## $ENSG00000000419
##  [1] "GO:0004169" "GO:0004582" "GO:0005515" "GO:0005537" "GO:0005783"
##  [6] "GO:0005789" "GO:0005789" "GO:0006488" "GO:0006501" "GO:0006506"
## [11] "GO:0016020" "GO:0018279" "GO:0019348" "GO:0019673" "GO:0033185"
## [16] "GO:0035268" "GO:0035269" "GO:0043178" "GO:0043687" "GO:0044267"
## 
## $ENSG00000000457
## [1] "GO:0005515" "GO:0005524" "GO:0005737" "GO:0005794" "GO:0006468"
## [6] "GO:0016477" "GO:0030027"
## 
## $ENSG00000000460
## [1] NA


t = nullp(de.genes, "test", "test", bias.data = log(apply(counts, 1, sum)), 
    plot.fit = TRUE)

plot of chunk unnamed-chunk-1

gores = goseq(t, "test", "test", method = "Wallenius", gene2cat = go.ann)

## Using manually entered categories.
## Calculating the p-values...

gores = gores[gores$numInCat > 2, ]
gores = cbind(gores, p.over.adj = p.adjust(gores$over_represented_pvalue, m = "BH"))

gores = gores[gores$p.over.adj < 0.2, ]
for (i in 1:nrow(gores)) print(GOTERM[[gores[i, 1]]]@Term)

## [1] "histone demethylase activity (H3-K4 specific)"
## [1] "neurexin family protein binding"
## [1] "histone H3-K4 demethylation"