statistics for bioinformatics

library(TeachingDemos)
library(ALL)

## Loading required package: Biobase

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

data(ALL)
library(multtest)
data(golub)
golub.gnames[1042,]

## [1] "2354"            "CCND3 Cyclin D3" "M92287_at"

gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
mall <- apply(golub[,gol.fac=="ALL"], 1, mean)
maml <- apply(golub[,gol.fac=="AML"], 1, mean)
o <- order(abs(mall-maml), decreasing=TRUE)
print(golub.gnames[o[1:5],2])

## [1] "CST3 Cystatin C (amyloid angiopathy and cerebral hemorrhage)"
## [2] "INTERLEUKIN-8 PRECURSOR"                                     
## [3] "Interleukin 8 (IL8) gene"                                    
## [4] "DF D component of complement (adipsin)"                      
## [5] "MPO Myeloperoxidase"

library(ape)
table(read.GenBank(c("X94991.1"),as.character=TRUE))

## 
##   a   c   g   t 
## 410 789 573 394

data(golub, package = "multtest")
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
stripchart(golub[1042,] ~ gol.fac, method="jitter")

hist(golub[1042, gol.fac=="ALL"])

x <- sort(golub[1042, gol.fac=="ALL"], decreasing = FALSE)
x[1:5]

## [1] 0.45827 1.10546 1.27645 1.32551 1.36844

boxplot(golub[1042,] ~ gol.fac)

pvec <- seq(0,1,0.25)
quantile(golub[1042, gol.fac=="ALL"],pvec)

##       0%      25%      50%      75%     100% 
## 0.458270 1.796065 1.927760 2.178705 2.766100

qqnorm(golub[1042, gol.fac=="ALL"])
qqline(golub[1042, gol.fac=="ALL"])

library(multtest); data(golub)
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
x <- golub[1042,gol.fac=="ALL"]
z <- (x-1.90)/0.50
sum(z^2)

## [1] 25.03312

pchisq(sum(z^2),27, lower.tail=FALSE)

## [1] 0.5726059

n <- 11
x <- golub[2058, gol.fac=="AML"]
t.value <- sqrt(n)*(mean(x)-0)/sd(x)
t.value

## [1] 1.236324

var(golub[1042,gol.fac=="ALL"])/var(golub[1042,gol.fac=="AML"])

## [1] 0.7116441

f<-function(x){dnorm(x,1.9,0.5)}
plot(f,0,4,xlab="x-axis",ylab="density f(x)")

plot(f,0,4,xlab="x-axis",ylab="density f(x)")
x<-seq(0,1.4,0.01)
polygon(c(0,x,1.4), c(0,f(x),0), col="lightblue")

data(golub, package = "multtest")
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
sigma <- 0.25; n <- 27; mu0 <- 0
x <- golub[2058,gol.fac=="ALL"]
z.value <- sqrt(n)*(mean(x) - mu0)/sigma
2*pnorm(-abs(z.value),0,1)

## [1] 0.9991094

mean(x)+qnorm(c(0.025),0,1)*sigma/sqrt(n)

## [1] -0.09424511

mean(x)+qnorm(c(0.975),0,1)*sigma/sqrt(n)

## [1] 0.09435251

library(TeachingDemos)
z.test(x,mu=0,sd=0.25)

## 
##  One Sample z-test
## 
## data:  x
## z = 0.0011162, n = 27.000000, Std. Dev. = 0.250000, Std. Dev. of
## the sample mean = 0.048113, p-value = 0.9991
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.09424511  0.09435251
## sample estimates:
##   mean of x 
## 5.37037e-05

ci.examp(mean.sim =0, sd = 1, n = 25, reps = 100, method = "z", lower.conf=0.025, upper.conf=0.975)

x <- golub[2058,gol.fac=="ALL"]; mu0 <- 0; n <- 27
t.value<-sqrt(n)*(mean(x) - mu0)/sd(x)
t.value

## [1] 0.001076867

mean(x)+qt(0.025,26)*sd(x)/sqrt(n)

## [1] -0.1024562

t.test(x,mu=0)

## 
##  One Sample t-test
## 
## data:  x
## t = 0.0010769, df = 26, p-value = 0.9991
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.1024562  0.1025636
## sample estimates:
##   mean of x 
## 5.37037e-05

t.test(golub[1042,gol.fac=="ALL"],mu=0, alternative = c("greater"))

## 
##  One Sample t-test
## 
## data:  golub[1042, gol.fac == "ALL"]
## t = 20.06, df = 26, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 0
## 95 percent confidence interval:
##  1.732853      Inf
## sample estimates:
## mean of x 
##  1.893883

t.test(golub[1042,] ~ gol.fac, var.equal=FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  golub[1042, ] by gol.fac
## t = 6.3186, df = 16.118, p-value = 9.871e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.8363826 1.6802008
## sample estimates:
## mean in group ALL mean in group AML 
##         1.8938826         0.6355909

t.test(golub[1042,] ~ gol.fac, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  golub[1042, ] by gol.fac
## t = 6.7983, df = 36, p-value = 6.046e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.8829143 1.6336690
## sample estimates:
## mean in group ALL mean in group AML 
##         1.8938826         0.6355909

var.test(golub[1042,] ~ gol.fac)

## 
##  F test to compare two variances
## 
## data:  golub[1042, ] by gol.fac
## F = 0.71164, num df = 26, denom df = 10, p-value = 0.4652
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.2127735 1.8428387
## sample estimates:
## ratio of variances 
##          0.7116441

binom.test(18, 22, p = 0.7, alternative = c("greater"), conf.level = 0.95)

## 
##  Exact binomial test
## 
## data:  18 and 22
## number of successes = 18, number of trials = 22, p-value = 0.1645
## alternative hypothesis: true probability of success is greater than 0.7
## 95 percent confidence interval:
##  0.6309089 1.0000000
## sample estimates:
## probability of success 
##              0.8181818

library(ape)
zyxinfreq <- table(read.GenBank(c("X94991.1"),as.character=TRUE))
chisq.test(zyxinfreq)

## 
##  Chi-squared test for given probabilities
## 
## data:  zyxinfreq
## X-squared = 187.07, df = 3, p-value < 2.2e-16

shapiro.test(golub[1042, gol.fac=="ALL"])

## 
##  Shapiro-Wilk normality test
## 
## data:  golub[1042, gol.fac == "ALL"]
## W = 0.94663, p-value = 0.1774

library(nortest)
ad.test(golub[1042,gol.fac=="ALL"])

## 
##  Anderson-Darling normality test
## 
## data:  golub[1042, gol.fac == "ALL"]
## A = 0.52154, p-value = 0.1683

library(outliers)
grubbs.test(golub[1042, gol.fac=="ALL"])

## 
##  Grubbs test for one outlier
## 
## data:  golub[1042, gol.fac == "ALL"]
## G = 2.92640, U = 0.65796, p-value = 0.0183
## alternative hypothesis: lowest value 0.45827 is an outlier

data(golub,package="multtest")
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
sh <- apply(golub[,gol.fac=="ALL"], 1, function(x) shapiro.test(x)$p.value)
sum(sh > 0.05)/nrow(golub) * 100

## [1] 58.27598

data(golub, package = "multtest");
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
pt <- apply(golub, 1, function(x) t.test(x ~ gol.fac)$p.value)
pw <- apply(golub, 1, function(x) wilcox.test(x ~ gol.fac)$p.value)

## Warning in wilcox.test.default(x = c(1.20689, 0.51105, 1.20264, 1.72492, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(0.24444, 0.64282, 0.46461, 0.68311, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(0.2103, 0.57747, 1.02711, -0.0496, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(-0.1859, -0.38107, -0.47897,
## -0.42782, : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(-0.14056, -0.49849, -1.04357,
## -0.64631, : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(0.40334, 0.31732, 0.32201, 0.30622, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(0.4406, 0.34383, 0.1861, 0.79001,
## 0.06251, : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(0.9125, 0.90637, 1.15064, 0.92193, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(0.37595, 0.32497, 0.34482, 0.33303, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(2.53193, 2.48503, 2.30672, 2.76934, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(0.00447, 0.77371, -0.32637, 0.53205, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(1.60338, 1.04229, 0.6379, 1.25547, :
## cannot compute exact p-value with ties

resul <- data.frame(cbind(pw,pt))
resul[pw<0.05 & abs(pt-pw)>0.2,]

##              pw        pt
## 456  0.04480288 0.2636088
## 1509 0.03215830 0.4427477

library(ALL);data(ALL)
ALLB123 <- ALL[,ALL$BT %in% c("B1","B2","B3")]
y <- exprs(ALLB123)["1866_g_at",]
summary(lm(y ~ ALLB123$BT))

## 
## Call:
## lm(formula = y ~ ALLB123$BT)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59070 -0.25251 -0.07008  0.16395  1.29635 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.58222    0.08506  53.873  < 2e-16 ***
## ALLB123$BTB2 -0.43689    0.10513  -4.156 8.52e-05 ***
## ALLB123$BTB3 -0.72193    0.11494  -6.281 2.00e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3707 on 75 degrees of freedom
## Multiple R-squared:  0.3461, Adjusted R-squared:  0.3287 
## F-statistic: 19.85 on 2 and 75 DF,  p-value: 1.207e-07

library(ALL); data(ALL)
ALLB123 <- ALL[,ALL$BT %in% c("B1","B2","B3")]
y <- exprs(ALLB123)["1242_at",]
summary(lm(y ~ ALLB123$BT))

## 
## Call:
## lm(formula = y ~ ALLB123$BT)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48408 -0.13461 -0.01297  0.12232  0.67138 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.55083    0.05673 115.483   <2e-16 ***
## ALLB123$BTB2  0.03331    0.07011   0.475    0.636    
## ALLB123$BTB3 -0.04675    0.07665  -0.610    0.544    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2473 on 75 degrees of freedom
## Multiple R-squared:  0.01925,    Adjusted R-squared:  -0.006898 
## F-statistic: 0.7362 on 2 and 75 DF,  p-value: 0.4823

library("ALL"); data(ALL)
ALLBm <- ALL[,which(ALL$BT %in% c("B","B1","B2","B3","B4") & ALL$mol.biol %in% c("BCR/ABL","NEG"))]
facmolb <- factor(ALLBm$mol.biol)
facB <- factor(ceiling(as.integer(ALLBm$BT)/3),levels=1:2,labels=c("B012","B34"))
anova(lm(exprs(ALLBm)["32069_at",] ~ facB * facmolb))

## Analysis of Variance Table
## 
## Response: exprs(ALLBm)["32069_at", ]
##              Df  Sum Sq Mean Sq F value    Pr(>F)    
## facB          1  1.1659  1.1659  4.5999 0.0352127 *  
## facmolb       1  3.2162  3.2162 12.6891 0.0006433 ***
## facB:facmolb  1  1.1809  1.1809  4.6592 0.0340869 *  
## Residuals    75 19.0094  0.2535                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(lm(exprs(ALLBm)["32069_at",] ~ facB * facmolb))

## 
## Call:
## lm(formula = exprs(ALLBm)["32069_at", ] ~ facB * facmolb)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.21905 -0.31589 -0.05137  0.24404  1.52015 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6.7649     0.1073  63.026  < 2e-16 ***
## facBB34             -0.5231     0.1686  -3.103   0.0027 ** 
## facmolbNEG          -0.6020     0.1458  -4.128 9.39e-05 ***
## facBB34:facmolbNEG   0.5016     0.2324   2.159   0.0341 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5034 on 75 degrees of freedom
## Multiple R-squared:  0.2264, Adjusted R-squared:  0.1954 
## F-statistic: 7.316 on 3 and 75 DF,  p-value: 0.0002285

data(ALL,package="ALL");library(ALL)
ALLB123 <- ALL[,ALL$BT %in% c("B1","B2","B3")]
y <- exprs(ALLB123)["1866_g_at",]
shapiro.test(residuals(lm(y ~ ALLB123$BT)))

## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(lm(y ~ ALLB123$BT))
## W = 0.93462, p-value = 0.0005989

library(ALL); data(ALL); library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

ALLB123 <- ALL[,ALL$BT %in% c("B1","B2","B3")]
y <- exprs(ALLB123)["1866_g_at",]
bptest(lm(y ~ ALLB123$BT),studentize = FALSE)

## 
##  Breusch-Pagan test
## 
## data:  lm(y ~ ALLB123$BT)
## BP = 8.7311, df = 2, p-value = 0.01271

data(ALL,package="ALL");library(ALL)
ALLB123 <- ALL[,ALL$BT %in% c("B1","B2","B3")]
y <- exprs(ALLB123)["1866_g_at",]
oneway.test(y ~ ALLB123$BT)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  y and ALLB123$BT
## F = 14.157, num df = 2.000, denom df = 36.998, p-value = 2.717e-05

data(ALL,package="ALL");library(ALL)
ALLB123 <- ALL[,ALL$BT %in% c("B1","B2","B3")]
y <- exprs(ALLB123)["1866_g_at",]
kruskal.test(y ~ ALLB123$BT)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  y by ALLB123$BT
## Kruskal-Wallis chi-squared = 30.667, df = 2, p-value = 2.192e-07

data(ALL, package = "ALL")
slotNames(ALL)

## [1] "experimentData"    "assayData"         "phenoData"        
## [4] "featureData"       "annotation"        "protocolData"     
## [7] ".__classVersion__"

row.names(exprs(ALL))[1:10]

##  [1] "1000_at"   "1001_at"   "1002_f_at" "1003_s_at" "1004_at"  
##  [6] "1005_at"   "1006_at"   "1007_s_at" "1008_f_at" "1009_at"

ALL1pp <- ALL1 <- ALL[,ALL$mol == "ALL1/AF4"]
mads <- apply(exprs(ALL1), 2, mad)
meds <- apply(exprs(ALL1), 2, median)
dat <- sweep(exprs(ALL1), 2, meds)
exprs(ALL1pp) <- sweep(dat, 2, mads, FUN="/")
cvval <- apply(exprs(ALL1pp),1,function(x){sd(x)/abs(mean(x))})


library("genefilter")
f1 <- function(x)(IQR(x)>0.5)
f2 <- pOverA(.25, log2(100))
f3 <- function(x) (median(2^x) > 300)
f4 <- function(x) (shapiro.test(x)$p.value > 0.05)
f5 <- function(x) (sd(x)/abs(mean(x))<0.1)
f6 <- function(x) (sqrt(10)* abs(mean(x))/sd(x) > qt(0.975,9))
ff <- filterfun(f1,f2,f3,f4,f5,f6)
library("ALL"); data(ALL)
selected <- genefilter(exprs(ALL[,ALL$BT=="B"]), ff)
sum(selected)

## [1] 317

library("genefilter");library("ALL"); data(ALL)
patientB <- factor(ALL$BT %in% c("B","B1","B2","B3","B4"))
f1 <- function(x) (shapiro.test(x)$p.value > 0.05)
f2 <- function(x) (t.test(x ~ patientB)$p.value < 0.05)
sel1 <- genefilter(exprs(ALL[,patientB==TRUE]), filterfun(f1))
sel2 <- genefilter(exprs(ALL[,patientB==FALSE]), filterfun(f1))
sel3 <- genefilter(exprs(ALL), filterfun(f2))
selected <- sel1 & sel2 & sel3
ALLs <- ALL[selected,]

library(limma)

## 
## Attaching package: 'limma'

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

x <- matrix(as.integer(c(sel1,sel2,sel3)),ncol = 3,byrow=FALSE)
colnames(x) <- c("sel1","sel2","sel3")
vc <- vennCounts(x, include="both")
vennDiagram(vc)

library("ALL"); library("limma");
data(ALL, package = "ALL")
allB <- ALL[,which(ALL$BT %in% c("B","B1","B2"))]
design.ma <- model.matrix(~ 0 + factor(allB$BT))
colnames(design.ma) <- c("B","B1","B2")
fit <- lmFit(allB, design.ma)
fit <- eBayes(fit)
toptab <- topTable(fit, coef=2,5,adjust.method="fdr")
print(toptab[,1:5],digits=4)

##                 logFC AveExpr     t   P.Value adj.P.Val
## AFFX-hum_alu_at 13.42   13.50 326.0 3.165e-99 3.996e-95
## 32466_at        12.68   12.70 306.3 1.333e-97 8.413e-94
## 32748_at        12.08   12.11 296.3 9.771e-97 3.616e-93
## 35278_at        12.44   12.45 295.5 1.146e-96 3.616e-93
## 34593_g_at      12.64   12.58 278.0 4.431e-95 1.119e-91

cont.ma <- makeContrasts(B-B1,B1-B2, levels=factor(allB$BT))
cont.ma

##       Contrasts
## Levels B - B1 B1 - B2
##     B       1       0
##     B1     -1       1
##     B2      0      -1

fit1 <- contrasts.fit(fit, cont.ma)
fit1 <- eBayes(fit1)
toptabcon <- topTable(fit, coef=2,5,adjust.method="fdr")
print(toptabcon[,1:5],digits=4)

##                 logFC AveExpr     t   P.Value adj.P.Val
## AFFX-hum_alu_at 13.42   13.50 326.0 3.165e-99 3.996e-95
## 32466_at        12.68   12.70 306.3 1.333e-97 8.413e-94
## 32748_at        12.08   12.11 296.3 9.771e-97 3.616e-93
## 35278_at        12.44   12.45 295.5 1.146e-96 3.616e-93
## 34593_g_at      12.64   12.58 278.0 4.431e-95 1.119e-91

toptabcon <- topTable(fit1, coef=2,5,adjust.method="fdr")
print(toptabcon[,1:5],digits=4)

##            logFC AveExpr      t   P.Value adj.P.Val
## 33358_at  1.4890   5.260  7.374 5.737e-10 7.242e-06
## 1389_at  -1.7852   9.262 -7.081 1.816e-09 9.744e-06
## 1914_at   2.0976   4.939  7.019 2.315e-09 9.744e-06
## 36873_at  1.8646   4.303  6.426 2.361e-08 7.452e-05
## 37471_at  0.8701   6.551  6.106 8.161e-08 2.061e-04

library(multtest); data(golub)
index <- grep("Cyclin",golub.gnames[,2])
golub.gnames[index,2]

##  [1] "CCND2 Cyclin D2"                                        
##  [2] "CDK2 Cyclin-dependent kinase 2"                         
##  [3] "CCND3 Cyclin D3"                                        
##  [4] "CDKN1A Cyclin-dependent kinase inhibitor 1A (p21, Cip1)"
##  [5] "CCNH Cyclin H"                                          
##  [6] "Cyclin-dependent kinase 4 (CDK4) gene"                  
##  [7] "Cyclin G2 mRNA"                                         
##  [8] "Cyclin A1 mRNA"                                         
##  [9] "Cyclin-selective ubiquitin carrier protein mRNA"        
## [10] "CDK6 Cyclin-dependent kinase 6"                         
## [11] "Cyclin G1 mRNA"                                         
## [12] "CCNF Cyclin F"

dist.cyclin <- dist(golub[index,],method="euclidian")
diam <- as.matrix(dist.cyclin)
rownames(diam) <- colnames(diam) <- golub.gnames[index,3]
diam[1:5,1:5]

##           D13639_at M68520_at M92287_at U09579_at U11791_at
## D13639_at  0.000000  8.821806  11.55349 10.056814  8.669112
## M68520_at  8.821806  0.000000  11.70156  5.931260  2.934802
## M92287_at 11.553494 11.701562   0.00000 11.991333 11.900558
## U09579_at 10.056814  5.931260  11.99133  0.000000  5.698232
## U11791_at  8.669112  2.934802  11.90056  5.698232  0.000000

library("genefilter"); library("ALL"); data(ALL)
closeto1389_at <- genefinder(ALL, "1389_at", 10, method = "euc")
closeto1389_at[[1]]$indices

##  [1]  2653  1096  6634  9255  6639 11402  9849  2274  8518 10736

round(closeto1389_at[[1]]$dists,1)

##  [1] 12.6 12.8 12.8 12.8 13.0 13.0 13.1 13.2 13.3 13.4

featureNames(ALL)[closeto1389_at[[1]]$indices]

##  [1] "32629_f_at" "1988_at"    "36571_at"   "39168_at"   "36576_at"  
##  [6] "41295_at"   "39756_g_at" "32254_at"   "38438_at"   "40635_at"

names <- list(c("g1","g2","g3","g4","g5"),c("p1","p2"))
sl.clus.dat <- matrix(c(1,1,1,1.1,3,2,3,2.3,5,5),ncol = 2,
byrow = TRUE,dimnames = names)
plot(sl.clus.dat,type="n", xlim=c(0,6), ylim=c(0,6))
text(sl.clus.dat,labels=row.names(sl.clus.dat))

print(dist(sl.clus.dat,method="euclidian"),digits=3)

##      g1   g2   g3   g4
## g2 0.10               
## g3 2.24 2.19          
## g4 2.39 2.33 0.30     
## g5 5.66 5.59 3.61 3.36

sl.out<-hclust(dist(sl.clus.dat,method="euclidian"),method="single")
plot(sl.out)

sl.out<-hclust(dist(rnorm(20,0,1),method="euclidian"),method="single")
plot(sl.out)

x <- c(rnorm(10,0,0.1),rnorm(10,3,0.5),rnorm(10,10,1.0))
plot(hclust(dist(x,method="euclidian"),method="single"))

plot(sl.out)

data(golub, package="multtest")
clusdata <- data.frame(golub[1042,],golub[2124,])
colnames(clusdata)<-c("CCND3 Cyclin D3","Zyxin")
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
plot(clusdata, pch=as.numeric(gol.fac))
legend("topright",legend=c("ALL","AML"),pch=1:2)

plot(hclust(dist(clusdata,method="euclidian"),method="single"))

data <- rbind(matrix(rnorm(100,0,0.5), ncol = 2), 
              matrix(rnorm(100,2,0.5), ncol = 2))
cl <- kmeans(data, 2)
plot(data, col = cl$cluster)
points(cl$centers, col = 1:2, pch = 8, cex=2)

initial <- matrix(c(0,0,2,2), nrow = 2, ncol=2, byrow=TRUE)
cl<- kmeans(data, initial, nstart = 10)

n <- 100; nboot<-1000
boot.cl <- matrix(0,nrow=nboot,ncol = 4)
for (i in 1:nboot){
dat.star <- data[sample(1:n,replace=TRUE),]
cl <- kmeans(dat.star, initial, nstart = 10)
boot.cl[i,] <- c(cl$centers[1,],cl$centers[2,])
}
quantile(boot.cl[,1],c(0.025,0.975))

##       2.5%      97.5% 
## -0.1727041  0.1178634

quantile(boot.cl[,2],c(0.025,0.975))

##        2.5%       97.5% 
## -0.08834266  0.13606280

quantile(boot.cl[,3],c(0.025,0.975))

##     2.5%    97.5% 
## 1.847427 2.138265

quantile(boot.cl[,4],c(0.025,0.975))

##     2.5%    97.5% 
## 1.785780 2.030579

data <- data.frame(golub[1042,],golub[2124,])
colnames(data)<-c("CCND3 Cyclin D3","Zyxin")
cl <- kmeans(data, 2,nstart = 10)
cl

## K-means clustering with 2 clusters of sizes 11, 27
## 
## Cluster means:
##   CCND3 Cyclin D3      Zyxin
## 1       0.6355909  1.5866682
## 2       1.8938826 -0.2947926
## 
## Clustering vector:
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1
## [36] 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1]  4.733248 19.842225
##  (between_SS / total_SS =  62.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

quantile(boot.cl[,1],c(0.025,0.975))

##       2.5%      97.5% 
## -0.1727041  0.1178634

quantile(boot.cl[,2],c(0.025,0.975))

##        2.5%       97.5% 
## -0.08834266  0.13606280

quantile(boot.cl[,3],c(0.025,0.975))

##     2.5%    97.5% 
## 1.847427 2.138265

quantile(boot.cl[,4],c(0.025,0.975))

##     2.5%    97.5% 
## 1.785780 2.030579

library(multtest); data(golub)
x <- golub[2289,]; y <- golub[2430,]
cor(x,y)

## [1] 0.6376217

cor.test(x,y)

## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 4.9662, df = 36, p-value = 1.666e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3993383 0.7952115
## sample estimates:
##       cor 
## 0.6376217

nboot <- 1000; boot.cor <- matrix(0,nrow=nboot,ncol = 1)
data <- matrix(c(x,y),ncol=2,byrow=FALSE)
for (i in 1:nboot){
dat.star <- data[sample(1:nrow(data),replace=TRUE),]
boot.cor[i,] <- cor(dat.star)[2,1]}
mean(boot.cor)

## [1] 0.6432279

quantile(boot.cor[,1],c(0.025,0.975))

##      2.5%     97.5% 
## 0.2389299 0.9248476

library(multtest); data(golub)
corgol<- apply(golub, 1, function(x) cor(x,golub.cl))
o <- order(corgol)

eigen(cor(golub))$values[1:5]

## [1] 25.4382629  2.0757158  1.2484411  1.0713373  0.7365232

data <- golub; p <- ncol(data); n <- nrow(data) ; nboot<-1000
eigenvalues <- array(dim=c(nboot,p))
for (i in 1:nboot){dat.star <- data[sample(1:n,replace=TRUE),]
eigenvalues[i,] <- eigen(cor(dat.star))$values}
for (j in 1:p) print(quantile(eigenvalues[,j],c(0.025,0.975)))

##     2.5%    97.5% 
## 24.86513 26.00970 
##     2.5%    97.5% 
## 1.933884 2.245744 
##     2.5%    97.5% 
## 1.143758 1.385539 
##      2.5%     97.5% 
## 0.9959979 1.1453831 
##      2.5%     97.5% 
## 0.6867921 0.8053297 
##      2.5%     97.5% 
## 0.5261660 0.6061389 
##      2.5%     97.5% 
## 0.4865392 0.5585574 
##      2.5%     97.5% 
## 0.4419577 0.5171381 
##      2.5%     97.5% 
## 0.3910773 0.4500239 
##      2.5%     97.5% 
## 0.3684624 0.4211697 
##      2.5%     97.5% 
## 0.3084073 0.3540221 
##      2.5%     97.5% 
## 0.2882908 0.3295121 
##      2.5%     97.5% 
## 0.2717718 0.3081364 
##      2.5%     97.5% 
## 0.2528265 0.2881842 
##      2.5%     97.5% 
## 0.2354121 0.2660459 
##      2.5%     97.5% 
## 0.2243074 0.2520104 
##      2.5%     97.5% 
## 0.2143896 0.2417358 
##      2.5%     97.5% 
## 0.2055815 0.2312879 
##      2.5%     97.5% 
## 0.1937281 0.2183918 
##      2.5%     97.5% 
## 0.1833810 0.2075669 
##      2.5%     97.5% 
## 0.1755144 0.1994077 
##      2.5%     97.5% 
## 0.1685520 0.1901211 
##      2.5%     97.5% 
## 0.1626335 0.1827831 
##      2.5%     97.5% 
## 0.1564219 0.1749540 
##      2.5%     97.5% 
## 0.1502405 0.1692852 
##      2.5%     97.5% 
## 0.1455630 0.1638323 
##      2.5%     97.5% 
## 0.1407281 0.1576954 
##      2.5%     97.5% 
## 0.1364985 0.1526834 
##      2.5%     97.5% 
## 0.1326813 0.1481741 
##      2.5%     97.5% 
## 0.1280307 0.1429988 
##      2.5%     97.5% 
## 0.1233945 0.1385494 
##      2.5%     97.5% 
## 0.1192567 0.1342291 
##      2.5%     97.5% 
## 0.1156207 0.1299444 
##      2.5%     97.5% 
## 0.1106859 0.1253995 
##      2.5%     97.5% 
## 0.1046511 0.1197956 
##       2.5%      97.5% 
## 0.09850563 0.11377120 
##      2.5%     97.5% 
## 0.0932918 0.1073594 
##       2.5%      97.5% 
## 0.08287218 0.09662925

for (j in 1:5) cat(j,as.numeric(quantile(eigenvalues[,j], 
                                         c(0.025,0.975))),"\n" )

## 1 24.86513 26.0097 
## 2 1.933884 2.245744 
## 3 1.143758 1.385539 
## 4 0.9959979 1.145383 
## 5 0.6867921 0.8053297

pca <- princomp(golub, center = TRUE, cor=TRUE, scores=TRUE)

## Warning: In princomp.default(golub, center = TRUE, cor = TRUE, scores = TRUE) :
##  extra argument 'center' will be disregarded

o <- order(pca$scores[,2])
golub.gnames[o[1:10],2]

##  [1] "TCL1 gene (T cell leukemia) extracted from H.sapiens mRNA for Tcell leukemia/lymphoma 1"                        
##  [2] "CD24 signal transducer mRNA and 3' region"                                                                      
##  [3] "GB DEF = (lambda) DNA for immunoglobin light chain"                                                             
##  [4] "MB-1 gene"                                                                                                      
##  [5] "Terminal transferase mRNA"                                                                                      
##  [6] "IGB Immunoglobulin-associated beta (B29)"                                                                       
##  [7] "C-myb gene extracted from Human (c-myb) gene, complete primary cds, and five complete alternatively spliced cds"
##  [8] "Adenosine triphosphatase, calcium"                                                                              
##  [9] "RETINOBLASTOMA BINDING PROTEIN P48"                                                                             
## [10] "Cytoplasmic dynein light chain 1 (hdlc1) mRNA"

golub.gnames[o[3041:3051],2]

##  [1] "GB DEF = Cystic fibrosis antigen mRNA"                                                     
##  [2] "CYSTATIN A"                                                                                
##  [3] "LYZ Lysozyme"                                                                              
##  [4] "MACROPHAGE INFLAMMATORY PROTEIN 1-ALPHA PRECURSOR"                                         
##  [5] "GRO2 GRO2 oncogene"                                                                        
##  [6] "Azurocidin gene"                                                                           
##  [7] "LGALS3 Lectin, galactoside-binding, soluble, 3 (galectin 3) (NOTE: redefinition of symbol)"
##  [8] "DF D component of complement (adipsin)"                                                    
##  [9] "CST3 Cystatin C (amyloid angiopathy and cerebral hemorrhage)"                              
## [10] "Interleukin 8 (IL8) gene"                                                                  
## [11] "INTERLEUKIN-8 PRECURSOR"

biplot(princomp(data,cor=TRUE),pc.biplot=TRUE,cex=0.5,expand=0.8)

data(golub, package = "multtest")
factor <- factor(golub.cl)
o1 <- grep("CD",golub.gnames[,2])
o2 <- grep("Op",golub.gnames[,2])
o3 <- grep("MCM",golub.gnames[,2])
o <- c(o1,o2,o3)

pt <- apply(golub, 1, function(x) t.test(x ~ gol.fac)$p.value)
oo <- o[pt[o]<0.01]

Z <- as.matrix(scale(golub, center = TRUE, scale = TRUE))
K <- eigen(cor(Z))
P <- Z %*% -K$vec[,1:2]
leu <- data.frame(P[oo,], row.names= oo)
attach(leu)

cl <- hclust(dist(leu,method="euclidian"),method="single")
plot(cl)

a <- as.integer(rownames(leu)[cl$order])
for (i in 1:length(a)) cat(a[i],golub.gnames[a[i],2],"\n")

## 1910 FCGR2B Fc fragment of IgG, low affinity IIb, receptor for (CD32) 
## 2874 GB DEF = Fas (Apo-1, CD95) 
## 2350 SLC6A8 gene (creatine transporter) extracted from Human Xq28 cosmid, creatine transporter (SLC6A8) gene, and CDM gene, partial cds 
## 1737 TFRC Transferrin receptor (p90, CD71) 
## 1911 ME491  gene extracted from H.sapiens gene for Me491/CD63 antigen 
## 2749 ITGAX Integrin, alpha X (antigen CD11C (p150), alpha polypeptide) 
## 2761 CD36 CD36 antigen (collagen type I receptor, thrombospondin receptor) 
## 792 ANPEP Alanyl (membrane) aminopeptidase (aminopeptidase N, aminopeptidase M, microsomal aminopeptidase, CD13) 
## 2321 ICAM1 Intercellular adhesion molecule 1 (CD54), human rhinovirus receptor 
## 68 PRKCD Protein kinase C, delta 
## 808 CD33 CD33 antigen (differentiation antigen) 
## 2611 Cell division control related protein (hCDCrel-1) mRNA 
## 450 Opioid-Binding Cell Adhesion Molecule 
## 504 CD3Z CD3Z antigen, zeta polypeptide (TiT3 complex) 
## 313 CD38 CD38 antigen (p45) 
## 1756 CD3G CD3G antigen, gamma polypeptide (TiT3 complex) 
## 893 CD72 CD72 antigen 
## 182 NT5 5' nucleotidase (CD73) 
## 2653 CD2 CD2 antigen (p50), sheep red blood cell receptor 
## 2289 MCM3 Minichromosome maintenance deficient (S. cerevisiae) 3 
## 2430 MCM3 Minichromosome maintenance deficient (S. cerevisiae) 3 
## 316 P105MCM mRNA 
## 1616 KAI1 Kangai 1 (suppression of tumorigenicity 6, prostate; CD82 antigen (R2 leukocyte antigen, antigen detected by monoclonal and antibody IA4)) 
## 1101 CDC10 Cell division cycle 10 (homologous to CDC10 of S. cerevisiae 
## 2673 CD19 gene 
## 2459 CD24 signal transducer mRNA and 3' region 
## 892 CD9 CD9 antigen 
## 1882 CD22 CD22 antigen 
## 885 CD53 CD53 antigen 
## 849 Oncoprotein 18 (Op18) gene 
## 2397 T-CELL ANTIGEN CD7 PRECURSOR 
## 1754 Catalase (EC 1.11.1.6) 5'flank and exon 1 mapping to chromosome 11, band p13 (and joined CDS) 
## 1798 CD37 CD37 antigen 
## 2233 GB DEF = CD36 gene exon 15

data(golub, package = "multtest")
gol.true <- factor(golub.cl,levels=0:1,labels= c(" ALL","not ALL"))
gol.pred <- factor(golub[1042,]>1.27,levels=c("TRUE","FALSE"),
labels=c("ALL","notALL"))
table(gol.pred,gol.true)

##         gol.true
## gol.pred  ALL not ALL
##   ALL      25       1
##   notALL    2      10

library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:multtest':
## 
##     wapply

## The following object is masked from 'package:stats':
## 
##     lowess

gol.true <- factor(golub.cl,levels=0:1,labels= c("TRUE","FALSE"))
pred <- prediction(golub[1042,], gol.true)
perf <- performance(pred, "tpr", "fpr" )
plot(perf)

set.seed(123); n<-10 ; sigma <- 0.5
fac <- factor(c(rep(1,n),rep(2,n),rep(3,n)))
levels(fac) <- c("ALL1","ALL2","AML")
geneA <- c(rnorm(10,0,sigma),rnorm(10,2,sigma),rnorm(10,4,sigma))
dat <- data.frame(fac,geneA)
library(rpart)
rp <- rpart(fac ~ geneA, method="class",data=dat)
plot(rp, branch=0,margin=0.1); text(rp, digits=3, use.n=TRUE)

set.seed(123)
n<-10 ; sigma <- 0.5
fac <- factor(c(rep(1,n),rep(2,n),rep(3,n)))
levels(fac) <- c("ALL1","ALL2","AML")
geneA <- c(rnorm(20,0,sigma),rnorm(10,2,sigma))
geneB <- c(rnorm(10,0,sigma),rnorm(20,2,sigma))
geneC <- c(rnorm(30,1,sigma))
dat <- data.frame(fac,geneA,geneB,geneC)
library(rpart)
rp <- rpart(fac ~ geneA + geneB + geneC, method="class",data=dat)
summary(rp)

## Call:
## rpart(formula = fac ~ geneA + geneB + geneC, data = dat, method = "class")
##   n= 30 
## 
##     CP nsplit rel error xerror       xstd
## 1 0.50      0         1   1.35 0.08215838
## 2 0.01      2         0   0.85 0.13570802
## 
## Variable importance
## geneB geneA geneC 
##    45    39    15 
## 
## Node number 1: 30 observations,    complexity param=0.5
##   predicted class=ALL1  expected loss=0.6666667  P(node) =1
##     class counts:    10    10    10
##    probabilities: 0.333 0.333 0.333 
##   left son=2 (10 obs) right son=3 (20 obs)
##   Primary splits:
##       geneB < 0.8365932  to the left,  improve=10.000000, (0 missing)
##       geneA < 1.025055   to the left,  improve=10.000000, (0 missing)
##       geneC < 0.7845772  to the left,  improve= 2.159091, (0 missing)
##   Surrogate splits:
##       geneA < 0.04529778 to the left,  agree=0.767, adj=0.3, (0 split)
## 
## Node number 2: 10 observations
##   predicted class=ALL1  expected loss=0  P(node) =0.3333333
##     class counts:    10     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 3: 20 observations,    complexity param=0.5
##   predicted class=ALL2  expected loss=0.5  P(node) =0.6666667
##     class counts:     0    10    10
##    probabilities: 0.000 0.500 0.500 
##   left son=6 (10 obs) right son=7 (10 obs)
##   Primary splits:
##       geneA < 1.025055   to the left,  improve=10.000000, (0 missing)
##       geneB < 1.96844    to the left,  improve= 2.525253, (0 missing)
##       geneC < 0.8736851  to the left,  improve= 1.666667, (0 missing)
##   Surrogate splits:
##       geneB < 1.96844    to the left,  agree=0.75, adj=0.5, (0 split)
##       geneC < 0.7845772  to the left,  agree=0.75, adj=0.5, (0 split)
## 
## Node number 6: 10 observations
##   predicted class=ALL2  expected loss=0  P(node) =0.3333333
##     class counts:     0    10     0
##    probabilities: 0.000 1.000 0.000 
## 
## Node number 7: 10 observations
##   predicted class=AML   expected loss=0  P(node) =0.3333333
##     class counts:     0     0    10
##    probabilities: 0.000 0.000 1.000

library(rpart);data(golub); library(multtest)
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
gol.rp <- rpart(gol.fac ~ golub[1042,] , method="class")
predictedclass <- predict(gol.rp, type="class")
table(predictedclass, gol.fac)

##               gol.fac
## predictedclass ALL AML
##            ALL  25   1
##            AML   2  10

summary(gol.rp)

## Call:
## rpart(formula = gol.fac ~ golub[1042, ], method = "class")
##   n= 38 
## 
##          CP nsplit rel error    xerror      xstd
## 1 0.7272727      0 1.0000000 1.0000000 0.2541521
## 2 0.0100000      1 0.2727273 0.3636364 0.1719828
## 
## Variable importance
## golub[1042, ] 
##           100 
## 
## Node number 1: 38 observations,    complexity param=0.7272727
##   predicted class=ALL  expected loss=0.2894737  P(node) =1
##     class counts:    27    11
##    probabilities: 0.711 0.289 
##   left son=2 (26 obs) right son=3 (12 obs)
##   Primary splits:
##       golub[1042, ] < 1.198515 to the right, improve=10.37517, (0 missing)
## 
## Node number 2: 26 observations
##   predicted class=ALL  expected loss=0.03846154  P(node) =0.6842105
##     class counts:    25     1
##    probabilities: 0.962 0.038 
## 
## Node number 3: 12 observations
##   predicted class=AML  expected loss=0.1666667  P(node) =0.3157895
##     class counts:     2    10
##    probabilities: 0.167 0.833

predict(gol.rp,type="class")

##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL ALL AML ALL 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
## ALL ALL AML ALL ALL ALL ALL ALL ALL AML ALL AML AML AML AML AML AML AML 
##  37  38 
## AML AML 
## Levels: ALL AML

predict(gol.rp, type="prob")

##          ALL        AML
## 1  0.9615385 0.03846154
## 2  0.9615385 0.03846154
## 3  0.9615385 0.03846154
## 4  0.9615385 0.03846154
## 5  0.9615385 0.03846154
## 6  0.9615385 0.03846154
## 7  0.9615385 0.03846154
## 8  0.9615385 0.03846154
## 9  0.9615385 0.03846154
## 10 0.9615385 0.03846154
## 11 0.9615385 0.03846154
## 12 0.9615385 0.03846154
## 13 0.9615385 0.03846154
## 14 0.9615385 0.03846154
## 15 0.9615385 0.03846154
## 16 0.9615385 0.03846154
## 17 0.1666667 0.83333333
## 18 0.9615385 0.03846154
## 19 0.9615385 0.03846154
## 20 0.9615385 0.03846154
## 21 0.1666667 0.83333333
## 22 0.9615385 0.03846154
## 23 0.9615385 0.03846154
## 24 0.9615385 0.03846154
## 25 0.9615385 0.03846154
## 26 0.9615385 0.03846154
## 27 0.9615385 0.03846154
## 28 0.1666667 0.83333333
## 29 0.9615385 0.03846154
## 30 0.1666667 0.83333333
## 31 0.1666667 0.83333333
## 32 0.1666667 0.83333333
## 33 0.1666667 0.83333333
## 34 0.1666667 0.83333333
## 35 0.1666667 0.83333333
## 36 0.1666667 0.83333333
## 37 0.1666667 0.83333333
## 38 0.1666667 0.83333333

library(rpart);data(golub); library(multtest)
row.names(golub)<- paste("gene", 1:3051, sep = "")
goldata <- data.frame(t(golub[1:3051,]))
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
gol.rp <- rpart(gol.fac~., data=goldata, method="class", cp=0.001)
plot(gol.rp, branch=0,margin=0.1); text(gol.rp, digits=3, use.n=TRUE)

golub.gnames[896,]

## [1] "2020"                    "FAH Fumarylacetoacetate"
## [3] "M55150_at"

library(faraway)

## 
## Attaching package: 'faraway'

## The following object is masked from 'package:rpart':
## 
##     solder

logitmod <- glm((-golub.cl + 1) ~ golub[1042,],
family=binomial(link = "logit"))
pchisq(deviance(logitmod),df.residual(logitmod),lower=FALSE)

## [1] 0.99385

plot((-golub.cl + 1) ~ golub[1042,], xlim=c(-2,5), ylim = c(0,1),
xlab="CCND3 expression values ", ylab="Probability of ALL")
x <- seq(-2,5,.1)
lines(x,ilogit(-4.844124 + 4.439953*x))

pchisq(deviance(logitmod),df.residual(logitmod),lower=FALSE)

## [1] 0.99385

summary(logitmod)

## 
## Call:
## glm(formula = (-golub.cl + 1) ~ golub[1042, ], family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8864  -0.1305   0.1371   0.2707   2.3950  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     -4.844      1.849  -2.620  0.00880 **
## golub[1042, ]    4.440      1.488   2.984  0.00284 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 45.728  on 37  degrees of freedom
## Residual deviance: 18.270  on 36  degrees of freedom
## AIC: 22.27
## 
## Number of Fisher Scoring iterations: 6

pred <- predict(logitmod,type="response") > 0.5
pred.fac <- factor(pred,levels=c(TRUE,FALSE),labels=c("ALL","not ALL"))
table(pred.fac,gol.fac)

##          gol.fac
## pred.fac  ALL AML
##   ALL      26   2
##   not ALL   1   9

library(seqinr)

## 
## Attaching package: 'seqinr'

## The following object is masked from 'package:limma':
## 
##     zscore

## The following objects are masked from 'package:ape':
## 
##     as.alignment, consensus

choosebank()

##  [1] "genbank"         "embl"            "emblwgs"        
##  [4] "swissprot"       "ensembl"         "hogenom7"       
##  [7] "hogenom"         "hogenomdna"      "hovergendna"    
## [10] "hovergen"        "hogenom5"        "hogenom5dna"    
## [13] "hogenom4"        "hogenom4dna"     "homolens"       
## [16] "homolensdna"     "hobacnucl"       "hobacprot"      
## [19] "phever2"         "phever2dna"      "refseq"         
## [22] "greviews"        "bacterial"       "archaeal"       
## [25] "protozoan"       "ensprotists"     "ensfungi"       
## [28] "ensmetazoa"      "ensplants"       "ensemblbacteria"
## [31] "mito"            "polymorphix"     "emglib"         
## [34] "refseqViruses"   "ribodb"          "taxodb"

choosebank("genbank")
query("ccnd","k=ccnd",virtual=TRUE)$nelem

## [1] 7

query("ccnd3hs","sp=homo sapiens AND k=ccnd3",virtual=TRUE)$nelem

## [1] 6

library(seqinr)
x <- s2c("GAATTC"); y <- s2c("GATTA"); d <- 2
s <- matrix(data=NA,nrow=length(y),ncol=length(x))
for (i in 1:(nrow(s))) for (j in 1:(ncol(s)))
{if (y[i]==x[j]) s[i,j]<- 2 else s[i,j]<- -1 }
rownames(s) <- c(y); colnames(s) <- c(x)
s

##    G  A  A  T  T  C
## G  2 -1 -1 -1 -1 -1
## A -1  2  2 -1 -1 -1
## T -1 -1 -1  2  2 -1
## T -1 -1 -1  2  2 -1
## A -1  2  2 -1 -1 -1

F <- matrix(data=NA,nrow=(length(y)+1),ncol=(length(x)+1))
rownames(F) <- c("",y); colnames(F) <- c("",x)
F[,1] <- -seq(0,length(y)*d,d); F[1,] <- -seq(0,length(x)*d,d)
for (i in 2:(nrow(F)))
for (j in 2:(ncol(F)))
{F[i,j] <- max(c(F[i-1,j-1]+s[i-1,j-1],F[i-1,j]-d,F[i,j-1]-d))}
F

##        G  A  A  T   T   C
##     0 -2 -4 -6 -8 -10 -12
## G  -2  2  0 -2 -4  -6  -8
## A  -4  0  4  2  0  -2  -4
## T  -6 -2  2  3  4   2   0
## T  -8 -4  0  1  5   6   4
## A -10 -6 -2  2  3   4   5

library(seqinr);library(Biostrings);data(BLOSUM50)

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:gplots':
## 
##     space

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## Loading required package: XVector

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:seqinr':
## 
##     translate

## The following object is masked from 'package:ape':
## 
##     complement

## The following object is masked from 'package:base':
## 
##     strsplit

x <- s2c("HEAGAWGHEE"); y <- s2c("PAWHEAE"); s <- BLOSUM50[y,x]; d <- 8
F <- matrix(data=NA,nrow=(length(y)+1),ncol=(length(x)+1))
F[1,] <- -seq(0,80,8); F[,1] <- -seq(0,56,8)
rownames(F) <- c("",y); colnames(F) <- c("",x)
for (i in 2:(nrow(F)))
for (j in 2:(ncol(F)))
{F[i,j] <- max(c(F[i-1,j-1]+s[i-1,j-1],F[i-1,j]-d,F[i,j-1]-d))}
F

##         H   E   A   G   A   W   G   H   E   E
##     0  -8 -16 -24 -32 -40 -48 -56 -64 -72 -80
## P  -8  -2  -9 -17 -25 -33 -41 -49 -57 -65 -73
## A -16 -10  -3  -4 -12 -20 -28 -36 -44 -52 -60
## W -24 -18 -11  -6  -7 -15  -5 -13 -21 -29 -37
## H -32 -14 -18 -13  -8  -9 -13  -7  -3 -11 -19
## E -40 -22  -8 -16 -16  -9 -12 -15  -7   3  -5
## A -48 -30 -16  -3 -11 -11 -12 -12 -15  -5   2
## E -56 -38 -24 -11  -6 -12 -14 -15 -12  -9   1

library(Biostrings);data(BLOSUM50)
pairwiseAlignment(AAString("PAWHEAE"), AAString("HEAGAWGHEE"),
substitutionMatrix = "BLOSUM50",gapOpening = 0, gapExtension = -8,
scoreOnly = FALSE)

## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: [1] PA--W-HEAE 
## subject: [2] EAGAWGHE-E 
## score: 1

library(seqinr);library(Biostrings);data(BLOSUM50)
randallscore <- double()
for (i in 1:1000) {
x <- c2s(sample(rownames(BLOSUM50),7, replace=TRUE))
y <- c2s(sample(rownames(BLOSUM50),10, replace=TRUE))
randallscore[i] <- pairwiseAlignment(AAString(x), AAString(y),
substitutionMatrix = "BLOSUM50",gapOpening = 0, gapExtension = -8,
scoreOnly = TRUE)
}
sum(randallscore>1)/1000

## [1] 0.003

markov1 <- function(x,P,n){ seq <- x
for(k in 1:(n-1)){
seq[k+1] <- sample(x, 1, replace=TRUE, P[seq[k],])}
return(seq)
}
P <- matrix(c(1/6,5/6,0.5,0.5), 2, 2, byrow=TRUE)
rownames(P) <- colnames(P) <- StateSpace <- x <- c(1,2)
markov1(x,P,30)

##  [1] 1 2 2 1 2 1 2 1 2 1 2 1 2 1 2 2 1 2 1 1 2 2 1 1 2 1 2 2 2 1

markov2 <- function(StateSpace,P,pi0,n){
seq <- character(n)
seq[1] <- sample(StateSpace, 1, replace=TRUE, pi0)
for(k in 1:(n-1)){
seq[k+1] <- sample(StateSpace, 1, replace=TRUE, P[seq[k],])}
return(seq)
}
P <- matrix(c(
1/6,5/6,0,0,
1/8,2/4,1/4,1/8,
0,2/6,3/6,1/6,
0,1/6,3/6,2/6),4,4,byrow=TRUE)
rownames(P) <- colnames(P) <- StateSpace <- c("a","c","g","t")
pi0 <- c(1/4,1/4,1/4,1/4)
x <- markov2(StateSpace,P,pi0,1000)
table(x)

## x
##   a   c   g   t 
##  62 418 371 149

pi0 <- c(1/4,1/4,1/4,1/4)
P <- matrix(c(.01,.01,.01,.97,
.01,.01,.01,.97,
.01,.01,.01,.97,
.01,.28,.01,0.70),4,4,byrow=T)
rownames(P) <- colnames(P) <- StateSpace <- c("a","c","g","t")
x <- markov2(StateSpace,P,pi0,30000)
table(getTrans(x))

## 
##    *    A    C    D    F    H    I    L    M    N    P    R    S    T    V 
##    1    1   68    2 5298   16  103 2140    1    1   21   23 2158    1   99 
##    W    Y 
##    2   65

nr <- count(x,2)
names(nr)

##  [1] "aa" "ac" "ag" "at" "ca" "cc" "cg" "ct" "ga" "gc" "gg" "gt" "ta" "tc"
## [15] "tg" "tt"

A <- matrix(NA,4,4)
A[1,1]<-nr["aa"]; A[1,2]<-nr["ag"]; A[1,3]<-nr["ac"]; A[1,4]<-nr["at"]
A[2,1]<-nr["ga"]; A[2,2]<-nr["gg"]; A[2,3]<-nr["gc"]; A[2,4]<-nr["gt"]
A[3,1]<-nr["ca"]; A[3,2]<-nr["cg"]; A[3,3]<-nr["cc"]; A[3,4]<-nr["ct"]
A[4,1]<-nr["ta"]; A[4,2]<-nr["tg"]; A[4,3]<-nr["tc"]; A[4,4]<-nr["tt"]
rowsumA <- apply(A, 1, sum)
Phat <- sweep(A, 1, rowsumA, FUN="/")
rownames(Phat) <- colnames(Phat) <- c("a","g","c","t")
round(Phat,3)

##       a     g     c     t
## a 0.013 0.006 0.016 0.965
## g 0.014 0.014 0.014 0.958
## c 0.009 0.013 0.010 0.969
## t 0.011 0.009 0.276 0.705

P <- matrix(c(5/6,1/6,0.5,0.5),2,2,byrow=T)
pi0 <- c(2/3,1/3)
pi0 %*% P

##           [,1]      [,2]
## [1,] 0.7222222 0.2777778

P %*% P

##           [,1]      [,2]
## [1,] 0.7777778 0.2222222
## [2,] 0.6666667 0.3333333

P <- matrix(c(1/6,5/6,0.5,0.5),2,2,byrow=T)
V <- eigen(P,symmetric = FALSE)
V$values

## [1]  1.0000000 -0.3333333

V$vectors

##            [,1]       [,2]
## [1,] -0.7071068 -0.8574929
## [2,] -0.7071068  0.5144958

V$vec %*% diag(V$va)^(16) %*% solve(V$vec)

##       [,1]  [,2]
## [1,] 0.375 0.625
## [2,] 0.375 0.625

P <- matrix(c(1,0,0, 1/4,1/2,1/4,0,0,1),3,3,byrow=T)
V <- eigen(P,symmetric = FALSE)
V$vec %*% diag(V$va)^(5) %*% solve(V$vec)

##          [,1]    [,2]     [,3]
## [1,] 1.000000 0.00000 0.000000
## [2,] 0.484375 0.03125 0.484375
## [3,] 0.000000 0.00000 1.000000

library(Matrix)

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:S4Vectors':
## 
##     expand

Q <- 0.2 * Matrix(c(-3,1,1,1,1,-3,1,1,1,1,-3,1,1,1,1,-3),4)
rownames(Q) <- colnames(Q) <- c("A","G","C","T")
P <- as.matrix(expm(Q))
round(P,2)

##      A    G    C    T
## A 0.59 0.14 0.14 0.14
## G 0.14 0.59 0.14 0.14
## C 0.14 0.14 0.59 0.14
## T 0.14 0.14 0.14 0.59

alpha <- 3/6; beta <- 2/6; gamma<- 1/6; Q <- matrix(data=NA,4,4)
Q[1,2] <- Q[2,1] <- Q[3,4] <- Q[4,3] <- alpha
Q[1,3] <- Q[3,1] <- Q[2,4] <- Q[4,2] <- beta
Q[1,4] <- Q[4,1] <- Q[2,3] <- Q[3,2] <- gamma
diag(Q) <- -(alpha + beta + gamma)
Q

##            [,1]       [,2]       [,3]       [,4]
## [1,] -1.0000000  0.5000000  0.3333333  0.1666667
## [2,]  0.5000000 -1.0000000  0.1666667  0.3333333
## [3,]  0.3333333  0.1666667 -1.0000000  0.5000000
## [4,]  0.1666667  0.3333333  0.5000000 -1.0000000

Q <- Matrix(Q)
P <- as.matrix(expm(Q))
P

##           [,1]      [,2]      [,3]      [,4]
## [1,] 0.4550880 0.2288517 0.1767105 0.1393498
## [2,] 0.2288517 0.4550880 0.1393498 0.1767105
## [3,] 0.1767105 0.1393498 0.4550880 0.2288517
## [4,] 0.1393498 0.1767105 0.2288517 0.4550880

library(Matrix)
alpha <- 1/5; Q <- matrix(rep(alpha,16),4,4)
diag(Q) <- -3 * alpha
Q <- Matrix(Q)
P <- as.matrix(expm(Q))
V <- eigen(P,symmetric = FALSE)
V$vec %*% diag(V$va)^(50) %*% solve(V$vec)

##      [,1] [,2] [,3] [,4]
## [1,] 0.25 0.25 0.25 0.25
## [2,] 0.25 0.25 0.25 0.25
## [3,] 0.25 0.25 0.25 0.25
## [4,] 0.25 0.25 0.25 0.25

library(ape);library(seqinr)
accnr <- paste("AJ5345",26:27,sep="")
seqbin <- read.GenBank(accnr, species.names = TRUE, as.character = FALSE)
dist.dna(seqbin, model = "JC69")

##           AJ534526
## AJ534527 0.1326839

library(ape);library(seqinr)
accnr <- paste("AJ5345",26:35,sep="")
seq <- read.GenBank(accnr)
names(seq) <- attr(seq, "species")
dist <- dist.dna(seq, model = "K80")
plot(nj(dist))

data(golub, package="multtest")
rowindex <- agrep("^oncogene",golub.gnames[,2])
oncogol <- golub[rowindex,]
oncogolub.gnames <- golub.gnames[rowindex,]
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
meangol <- apply(oncogol[,gol.fac=="ALL"],1,mean)
o <- order(meangol,decreasing=TRUE)
oncogolub.gnames[o[1:3],2]

## [1] "Cellular oncogene c-fos (complete sequence)"
## [2] "GB DEF = VAV oncogene"                      
## [3] "FYN FYN oncogene related to SRC, FGR, YES"

meangol <- apply(oncogol[,gol.fac=="AML"],1,mean)
o <- order(meangol,decreasing=TRUE)
oncogolub.gnames[o[1:3],2]

## [1] "PIM1 Pim-1 oncogene"       "JUNB Jun B proto-oncogene"
## [3] "Proto-oncogene BCL3 gene"

data(golub, package="multtest")
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
efs <- apply(golub[,gol.fac=="ALL"],1,function(x) mean(x)/sd(x))
o <- order(efs,decreasing=TRUE)
efs[o[1:5]]

## [1] 11.138128 10.638308  9.155108  8.954115  8.695353

golub.gnames[o[1:5],2]

## [1] "YWHAZ Tyrosine 3-monooxygenase/tryptophan 5-monooxygenase activation protein, zeta polypeptide"
## [2] "ZNF91 Zinc finger protein 91 (HPF7, HTF10)"                                                    
## [3] "HnRNP-E2 mRNA"                                                                                 
## [4] "54 kDa protein mRNA"                                                                           
## [5] "Immunophilin homolog ARA9 mRNA"

refs <- apply(golub[,gol.fac=="ALL"],1,function(x) median(x)/mad(x))
o <- order(refs,decreasing=TRUE)
refs[o[1:5]]

## [1] 14.51217 13.57425 13.27698 13.14419 12.91608

golub.gnames[o[1:5],2]

## [1] "COX6B gene (COXG) extracted from Human DNA from overlapping chromosome 19 cosmids R31396, F25451, and R31076 containing COX6B and UPKA, genomic sequence"
## [2] "AFFX-HSAC07/X00351_M_at (endogenous control)"                                                                                                            
## [3] "ATP5A1 ATP synthase, H+ transporting, mitochondrial F1 complex, alpha subunit, isoform 1, cardiac muscle"                                                
## [4] "ATP SYNTHASE GAMMA CHAIN, MITOCHONDRIAL PRECURSOR"                                                                                                       
## [5] "YWHAZ Tyrosine 3-monooxygenase/tryptophan 5-monooxygenase activation protein, zeta polypeptide"

data(golub, package = "multtest")
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
stripchart(golub[1042,] ~ gol.fac,method="jitter")

stripchart(golub[1042,] ~ gol.fac,method="jitter",vertical = TRUE)

stripchart(golub[1042,] ~ gol.fac,method="jitter",col=c("red", "blue"),
vertical = TRUE)

stripchart(golub[1042,] ~ gol.fac,method="jitter",col=c("red", "blue"),
pch="*",vertical = TRUE)
title("CCND3 Cyclin D3 expression value for ALL and AMl patients")

an <- sqrt(2*log(n)) - 0.5*(log(log(n))+log(4*pi))*(2*log(n))^(-1/2)
bn <- (2*log(n))^(-1/2)
e <- double(); n <- 10000 # Serfling p.90
for (i in 1:1000) e[i] <- (max(rnorm(n))-an)/bn
plot(density(e),ylim=c(0,0.5))
f<-function(x){exp(-x)*exp(-exp(-x))}
curve(f,range(density(e)$x),add=TRUE,col = "blue")
curve(dnorm,add=TRUE,col = "red")

library(multtest);data(golub)
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
shapiro.test(golub[i,gol.fac=="ALL"])

## 
##  Shapiro-Wilk normality test
## 
## data:  golub[i, gol.fac == "ALL"]
## W = 0.9667, p-value = 0.5173

library(multtest); data(golub)
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
pt <- apply(golub, 1, function(x) t.test(x ~ gol.fac)$p.value)
index <-agrep("^antigen",golub.gnames[,2])
golub.index<-golub[index,]
pt.index<-pt[index]
golub.gnames.index<-golub.gnames[index,]
golub.gnames.index[order(pt.index)[1:length(index)],2]

##  [1] "ME491  gene extracted from H.sapiens gene for Me491/CD63 antigen"                                                                               
##  [2] "CD22 CD22 antigen"                                                                                                                              
##  [3] "ITGAX Integrin, alpha X (antigen CD11C (p150), alpha polypeptide)"                                                                              
##  [4] "CD33 CD33 antigen (differentiation antigen)"                                                                                                    
##  [5] "CD38 CD38 antigen (p45)"                                                                                                                        
##  [6] "CD37 CD37 antigen"                                                                                                                              
##  [7] "CD2 CD2 antigen (p50), sheep red blood cell receptor"                                                                                           
##  [8] "APS Prostate specific antigen"                                                                                                                  
##  [9] "GB DEF = T-cell antigen receptor gene T3-delta"                                                                                                 
## [10] "PCNA Proliferating cell nuclear antigen"                                                                                                        
## [11] "KAI1 Kangai 1 (suppression of tumorigenicity 6, prostate; CD82 antigen (R2 leukocyte antigen, antigen detected by monoclonal and antibody IA4))"
## [12] "CD36 CD36 antigen (collagen type I receptor, thrombospondin receptor)"                                                                          
## [13] "CD72 CD72 antigen"                                                                                                                              
## [14] "CD3Z CD3Z antigen, zeta polypeptide (TiT3 complex)"                                                                                             
## [15] "CD53 CD53 antigen"                                                                                                                              
## [16] "CD3G CD3G antigen, gamma polypeptide (TiT3 complex)"                                                                                            
## [17] "Autoantigen DFS70 mRNA, partial cds"                                                                                                            
## [18] "CD9 CD9 antigen"                                                                                                                                
## [19] "CD58 CD58 antigen, (lymphocyte function-associated antigen 3)"                                                                                  
## [20] "CALR Autoantigen calreticulin"                                                                                                                  
## [21] "GB DEF = CD1 R2 gene for MHC-related antigen"                                                                                                   
## [22] "ITGA4 Integrin, alpha 4 (antigen CD49D, alpha 4 subunit of VLA-4 receptor)"                                                                     
## [23] "CD1B CD1b antigen (thymocyte antigen)"                                                                                                          
## [24] "CD48 CD48 antigen (B-cell membrane protein)"                                                                                                    
## [25] "Autoantigen p542 mRNA, 3' end of cds"                                                                                                           
## [26] "CD69 CD69 antigen (early T cell activation antigen)"                                                                                            
## [27] "Hepatitis delta antigen interacting protein A (dipA) mRNA"                                                                                      
## [28] "GB DEF = Cystic fibrosis antigen mRNA"                                                                                                          
## [29] "SSB Sjogren syndrome antigen B (autoantigen La)"                                                                                                
## [30] "NASP Nuclear autoantigenic sperm protein (histone-binding)"                                                                                     
## [31] "GB DEF = B-lymphocyte cell-surface antigen B1 (CD20)"                                                                                           
## [32] "PRTN3 Proteinase 3 (serine proteinase, neutrophil, Wegener granulomatosis autoantigen)"                                                         
## [33] "Renal cell carcinoma antigen RAGE-4 mRNA, complete putative cds"                                                                                
## [34] "CDW52 CDW52 antigen (CAMPATH-1 antigen)"                                                                                                        
## [35] "62 kDa paraneoplastic antigen mRNA, 3' end"                                                                                                     
## [36] "CD59 CD59 antigen p18-20 (antigen identified by monoclonal antibodies 16.3A5, EJ16, EJ30, EL32 and G344)"                                       
## [37] "Thyroid autoantigen (truncated actin-binding protein) mRNA"                                                                                     
## [38] "ITGAM Integrin, alpha M (complement component receptor 3, alpha; also known as CD11b (p170), macrophage antigen alpha polypeptide)"             
## [39] "MAGE-11 antigen (MAGE11) gene"                                                                                                                  
## [40] "CD40 CD40 antigen"                                                                                                                              
## [41] "TRA1 Homologue of mouse tumor rejection antigen gp96"                                                                                           
## [42] "CD27 CD27 antigen (T cell activation antigen)"                                                                                                  
## [43] "CD47 CD47 antigen (Rh-related antigen, integrin-associated signal transducer)"                                                                  
## [44] "G22P1 Thyroid autoantigen 70kD (Ku antigen)"                                                                                                    
## [45] "GB DEF = Proliferating cell nuclear antigen (PCNA) gene, promoter region"                                                                       
## [46] "Melanoma antigen p15 mRNA"                                                                                                                      
## [47] "Breast tumor autoantigen mRNA, complete sequence"                                                                                               
## [48] "PECAM1 Platelet/endothelial cell adhesion molecule (CD31 antigen)"                                                                              
## [49] "Golgi complex autoantigen golgin-97 mRNA"                                                                                                       
## [50] "CGM7 Carcinoembryonic antigen gene family member 7"                                                                                             
## [51] "ITGA2B Integrin, alpha 2b (platelet glycoprotein IIb of IIb/IIIa complex, antigen CD41B)"                                                       
## [52] "Autoantigen mRNA"                                                                                                                               
## [53] "CD14 CD14 antigen"                                                                                                                              
## [54] "ITGAL Integrin, alpha L (antigen CD11A (p180), lymphocyte function-associated antigen 1; alpha polypeptide)"                                    
## [55] "HLA-E MHC class I antigen HLA-E"                                                                                                                
## [56] "Surface antigen mRNA"                                                                                                                           
## [57] "ITGB1 Integrin, beta 1 (fibronectin receptor, beta polypeptide, antigen CD29 includes MDF2, MSK12)"                                             
## [58] "CD1C CD1c antigen (thymocyte antigen)"                                                                                                          
## [59] "CD70 CD70 antigen (CD27 ligand)"                                                                                                                
## [60] "Golgi antigen gcp372"                                                                                                                           
## [61] "CD5 CD5 antigen (p56-62)"                                                                                                                       
## [62] "CD97 CD97 antigen (leucocyte antigen)"                                                                                                          
## [63] "KAI1 Kangai 1 (suppression of tumorigenicity 6, prostate; CD82 antigen (R2 leukocyte antigen, antigen detected by monoclonal and antibody IA4))"
## [64] "CD40LG CD40 antigen ligand (hyper IgM syndrome)"                                                                                                
## [65] "Autoantigen pericentriol material 1 (PCM-1) mRNA"                                                                                               
## [66] "ANX11 Annexin XI (56kD autoantigen)"                                                                                                            
## [67] "Class I histocompatibility antigen-like protein mRNA"                                                                                           
## [68] "HLA-SB alpha gene (class II antigen) extracted from Human HLA-SB(DP) alpha gene"                                                                
## [69] "MAGE-5a antigen (MAGE5a) gene"                                                                                                                  
## [70] "A33 antigen precursor mRNA"                                                                                                                     
## [71] "CD34 CD34 antigen (hemopoietic progenitor cell antigen)"                                                                                        
## [72] "CD1A CD1a antigen (thymocyte antigen)"                                                                                                          
## [73] "Nucleolar autoantigen No55 mRNA"                                                                                                                
## [74] "Breast epithelial antigen BA46 mRNA"                                                                                                            
## [75] "CD8A CD8 antigen, alpha polypeptide (p32)"

all66 <- golub[66,gol.fac=="ALL"]
all790 <- golub[790,gol.fac=="ALL"]
boxplot(all66,all790)

mean(all66);mean(all790)

## [1] 1.182503

## [1] -1.174024

median(all66);median(all790)

## [1] 1.23023

## [1] -1.28137

sd(all66);sd(all790)

## [1] 0.4408175

## [1] 0.4809953

IQR(all66)/1.349 ;IQR(all790)/1.349

## [1] 0.4279985

## [1] 0.1532024

mean(all66);mean(all790)

## [1] 1.182503

## [1] -1.174024

mad(all66);mad(all790)

## [1] 0.4523413

## [1] 0.1864814

shapiro.test(all66);shapiro.test(all790)

## 
##  Shapiro-Wilk normality test
## 
## data:  all66
## W = 0.98698, p-value = 0.9758

## 
##  Shapiro-Wilk normality test
## 
## data:  all790
## W = 0.66224, p-value = 1.206e-06

library(multtest);data(golub)
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
allsh <- apply(golub[,gol.fac=="ALL"], 1, function(x) shapiro.test(x)$p.value)
amlsh <- apply(golub[,gol.fac=="AML"], 1, function(x) shapiro.test(x)$p.value)
100 * sum(allsh>0.05)/length(allsh)

## [1] 58.27598

100 * sum(amlsh>0.05)/length(amlsh)

## [1] 78.56441

100 * sum(allsh>0.05 & allsh>0.05)/length(allsh)

## [1] 58.27598

library("genefilter")
data(ALL, package = "ALL")
ALLB <- ALL[,ALL$BT %in% c("B1","B2","B3","B4")]
f1 <- function(x) (shapiro.test(x)$p.value > 0.05)
sel1 <- genefilter(exprs(ALL[,ALLB$BT=="B1"]), filterfun(f1))
sel2 <- genefilter(exprs(ALL[,ALLB$BT=="B2"]), filterfun(f1))
sel3 <- genefilter(exprs(ALL[,ALLB$BT=="B3"]), filterfun(f1))
sel4 <- genefilter(exprs(ALL[,ALLB$BT=="B4"]), filterfun(f1))
selected <- sel1 & sel2 & sel3 & sel4

library(limma)
x <- matrix(as.integer(c(sel2,sel3,sel4)),ncol = 3,byrow=FALSE)
colnames(x) <- c("sel2","sel3","sel4")
vc <- vennCounts(x, include="both")
vennDiagram(vc)

library("ALL")
library("limma");
allB <- ALL[,which(ALL$BT %in% c("B1","B2","B3","B4"))]
facB123 <- factor(allB$BT)
cont.ma <- makeContrasts(B2-B1,B3-B2,B4-B3, levels=facB123)
design.ma <- model.matrix(~ 0 + facB123)
colnames(design.ma) <- c("B1","B2","B3","B4")
fit <- lmFit(allB, design.ma)
fit1 <- contrasts.fit(fit, cont.ma)
fit1 <- eBayes(fit1)
topTable(fit1, coef=2,5,adjust.method="BH")

##               logFC  AveExpr         t      P.Value    adj.P.Val         B
## 35991_at  0.5964481 4.144598  6.624128 2.578836e-09 0.0000325578 10.842989
## 33873_at  0.5707770 7.217570  6.083524 2.891823e-08 0.0001825464  8.625253
## 35614_at  1.7248509 5.663477  5.961231 4.946078e-08 0.0002081474  8.132884
## 36711_at -2.3664712 7.576108 -5.759565 1.187487e-07 0.0003054110  7.329631
## 37902_at  0.8470235 4.258491  5.742783 1.276579e-07 0.0003054110  7.263298

library(multtest);data(golub);
gol.fac <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
o1 <- grep("oncogene",golub.gnames[,2])
plot(hclust(dist(golub[o1,],method="euclidian"),method="single"))

o3 <- grep("receptor",golub.gnames[,2])
plot(hclust(dist(golub[o3,],method="euclidian"),method="single"))

library(ROCR); data(golub, package = "multtest")
gol.true <- factor(golub.cl,levels=0:1,labels= c("TRUE","FALSE"))
auc.values <- apply(golub,1,
function(x) performance(prediction(x, gol.true),"auc")@y.values[[1]])
o <- order(auc.values,decreasing=TRUE)
golub.gnames[o[1:25],2]

##  [1] "TCF3 Transcription factor 3 (E2A immunoglobulin enhancer binding factors E12/E47)"                              
##  [2] "Macmarcks"                                                                                                      
##  [3] "VIL2 Villin 2 (ezrin)"                                                                                          
##  [4] "TOP2B Topoisomerase (DNA) II beta (180kD)"                                                                      
##  [5] "C-myb gene extracted from Human (c-myb) gene, complete primary cds, and five complete alternatively spliced cds"
##  [6] "RETINOBLASTOMA BINDING PROTEIN P48"                                                                             
##  [7] "RB1 Retinoblastoma 1 (including osteosarcoma)"                                                                  
##  [8] "CCND3 Cyclin D3"                                                                                                
##  [9] "ALDR1 Aldehyde reductase 1 (low Km aldose reductase)"                                                           
## [10] "T-COMPLEX PROTEIN 1, GAMMA SUBUNIT"                                                                             
## [11] "SPTAN1 Spectrin, alpha, non-erythrocytic 1 (alpha-fodrin)"                                                      
## [12] "Inducible protein mRNA"                                                                                         
## [13] "Translational initiation factor 2 beta subunit (elF-2-beta) mRNA"                                               
## [14] "NUCLEOLYSIN TIA-1"                                                                                              
## [15] "Putative enterocyte differentiation promoting factor mRNA, partial cds"                                         
## [16] "IEF SSP 9502 mRNA"                                                                                              
## [17] "ACADM Acyl-Coenzyme A dehydrogenase, C-4 to C-12 straight chain"                                                
## [18] "PROTEASOME IOTA CHAIN"                                                                                          
## [19] "X-LINKED HELICASE II"                                                                                           
## [20] "Stimulator of TAR RNA binding (SRB) mRNA"                                                                       
## [21] "MYL1 Myosin light chain (alkali)"                                                                               
## [22] "Transcriptional activator hSNF2b"                                                                               
## [23] "HKR-T1"                                                                                                         
## [24] "ADA Adenosine deaminase"                                                                                        
## [25] "Transcriptional activator hSNF2b"

library(multtest);library(ROCR);data(golub)
golub.clchanged <- -golub.cl +1
pred <- prediction(golub[1042,], golub.clchanged)
perf <- performance(pred, "sens", "spec")
plot(perf)

library(rpart)
predictors <- matrix(rnorm(100*4,0,1),100,4)
colnames(predictors) <- letters[1:4]
groups <- gl(2,50)
simdata <- data.frame(groups,predictors)
rp<-rpart(groups ~ a + b + c + d,method="class",data=simdata)
predicted <- predict(rp,type="class")
table(predicted,groups)

##          groups
## predicted  1  2
##         1 44 19
##         2  6 31

plot(rp, branch=0,margin=0.1); text(rp, digits=3, use.n=TRUE)

table(predicted,groups)

##          groups
## predicted  1  2
##         1 44 19
##         2  6 31

statistics for bioinformatics

Kushan De Silva

January 3, 2018