# PCA #####
x = rnorm(100)
y = rnorm(100, x)

pca = prcomp(cbind(x, y))
names(pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
pca$rotation
##      PC1     PC2
## x 0.5569  0.8306
## y 0.8306 -0.5569
pca$center
##        x        y 
##  0.02950 -0.04793

par(mfrow = c(1, 2), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1))
plot(x, y, pch = 19, col = "gray", xlim = range(x, y), ylim = range(x, y), main = "Origina coordinates")
points(pca$center[1], pca$center[2], cex = 2)
b1 = pca$rotation[2, 1]/pca$rotation[1, 1]
abline(a = pca$center[2] - b1 * pca$center[1], b = b1, col = "red")
b2 = pca$rotation[2, 2]/pca$rotation[1, 2]
abline(a = pca$center[2] - b2 * pca$center[1], b = b2, col = "blue")
dev = round(pca$sdev/sum(pca$sdev) * 100, 1)
plot(-pca$x[, 1], -pca$x[, 2], pch = 19, col = "gray", main = "PCA", xlab = paste("PC1 (", 
    dev[1], "%)", sep = ""), ylab = paste("PC2 (", dev[2], "%)", sep = ""), 
    xlim = range(x, pca$x, y), ylim = range(x, y, pca$x))
abline(a = 0, b = 0, col = "red")
abline(v = 0, col = "blue")

plot of chunk unnamed-chunk-1


# load data #####
setwd("/home/mazin/teaching/R/2014")
d = as.matrix(read.table("my.code/5.hs.cnts"))
meta = read.table("my.code/5.hs.meta", stringsAsFactors = F)
rin = meta$rin
sex = meta$sex
age = meta$age
# make functions #####
getPal = function(col = c("blue", "white", "red"), n = 200) {
    r = character(n)
    if (n == 1) {
        i = floor(length(col)/2)
        return(.getPal(col[i], col[i + 1], 0.2))
    }
    for (i in 0:(n - 1)) {
        cinx = i/(n - 1) * (length(col) - 1) + 1
        if (cinx == floor(cinx)) 
            r[i + 1] = col[cinx] else {
            rate = cinx - floor(cinx)
            cinx = floor(cinx)
            r[i + 1] = .getPal(col[cinx], col[cinx + 1], 1 - rate)
        }
    }
    r
}

.getPal = function(c1, c2, r) {
    c1 = t(col2rgb(c1, alpha = TRUE)/255)
    c2 = t(col2rgb(c2, alpha = TRUE)/255)
    return(rgb((c1 * r + c2 * (1 - r)), alpha = r * c1[4] + (1 - r) * c2[4]))
}

# read count density #####
par(mfrow = c(2, 2), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1))
plot(density(d), main = "", xlab = "read counts")
plot(density(log(d), bw = 0.02, from = -2.5), main = "", xlab = "log(read counts)")
table(d == 0)
## 
##   FALSE    TRUE 
##  782200 1453360
plot(density(log(d + 0.1), bw = 0.02, from = -2.5), main = "", xlab = "log(read counts + 1)")

dim(d)
## [1] 55889    40
sum(apply(d, 1, sd) == 0)
## [1] 23162
d = d[apply(d < 20, 1, sum) < 20, ]
sum(apply(d, 1, sd) == 0)
## [1] 0
dim(d)
## [1] 10432    40
plot(density(log(d), bw = 0.02, from = -2.5), xlab = "log(read counts)", main = "after filtering")

dn = sweep(d, 1, apply(d, 1, mean), "-")
dn = sweep(dn, 1, apply(dn, 1, sd), "/")

# PCA on real data #####
sam.cols = getPal(c("#0000FF", "#DDDDDD", "#FF0000"), 40)
pch.sex = ifelse(sex == "m", 17, 19)
par(mfcol = c(2, 3), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1))
pca = prcomp(t(d))
dev = round(pca$sdev/sum(pca$sdev) * 100, 1)

plot(pca$x[, 1], pca$x[, 2], pch = pch.sex, col = sam.cols, main = "PCA", xlab = paste("PC1 (", 
    dev[1], "%)", sep = ""), ylab = paste("PC2 (", dev[2], "%)", sep = ""))
plot(pca$x[, 3], pca$x[, 4], pch = pch.sex, col = sam.cols, main = "PCA", xlab = paste("PC3 (", 
    dev[3], "%)", sep = ""), ylab = paste("PC4 (", dev[4], "%)", sep = ""))

plot(log(age), pca$x[, 1], col = "red", pch = pch.sex, ylim = range(pca$x[, 
    1:2]), main = "Age")
points(log(age), pca$x[, 2], col = "orange", pch = pch.sex)

plot(log(age), pca$x[, 3], col = "green", pch = pch.sex, ylim = range(pca$x[, 
    3:4]), main = "Age")
points(log(age), pca$x[, 4], col = "blue", pch = pch.sex)

plot(rin, pca$x[, 1], col = "red", pch = pch.sex, ylim = range(pca$x[, 1:2]), 
    main = "RIN")
points(rin, pca$x[, 2], col = "orange", pch = pch.sex)

plot(rin, pca$x[, 3], col = "green", pch = pch.sex, ylim = range(pca$x[, 3:4]), 
    main = "RIN")
points(rin, pca$x[, 4], col = "blue", pch = pch.sex)

# correlation heatmap ####
par(mfcol = c(1, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1))

plot of chunk unnamed-chunk-1

image(cor(dn), col = getPal())

plot of chunk unnamed-chunk-1


heatmap(cor(dn, m = "sp"), col = getPal(), distfun = function(x) {
    as.dist(1 - x)
}, symm = T, ColSideColors = sam.cols)

plot of chunk unnamed-chunk-1

heatmap(cor(dn, m = "sp"), col = getPal(), distfun = function(x) {
    as.dist(1 - x)
}, symm = T, ColSideColors = sam.cols, Rowv = 1:40)

plot of chunk unnamed-chunk-1

library(gplots)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
heatmap.2(cor(dn), col = getPal(), distfun = function(x) {
    as.dist(1 - x)
}, symm = T, ColSideColors = sam.cols, Colv = T, trace = "none", dendrogram = "col", 
    keysize = 2)

plot of chunk unnamed-chunk-1


# MDS #####
par(mfrow = c(2, 2), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1))
mds = cmdscale(1 - cor(dn, m = "p"), k = 2)
plot(mds[, 1], mds[, 2], pch = pch.sex, col = sam.cols, main = "norm, Pearson", 
    xlab = "", ylab = "")

mds = cmdscale(1 - cor(dn, m = "s"), k = 2)
plot(mds[, 1], mds[, 2], pch = pch.sex, col = sam.cols, main = "norm, Spearman", 
    xlab = "", ylab = "")

mds = cmdscale(1 - cor(d, m = "p"), k = 2)
plot(mds[, 1], mds[, 2], pch = pch.sex, col = sam.cols, main = "not norm, Pearson", 
    xlab = "", ylab = "")

mds = cmdscale(1 - cor(d, m = "s"), k = 2)
plot(mds[, 1], mds[, 2], pch = pch.sex, col = sam.cols, main = "not norm, Spearman", 
    xlab = "", ylab = "")

plot of chunk unnamed-chunk-1


# anova ####
y = cbind(rnorm(10, 0), rnorm(10, 2), rnorm(10, 5))
x = cbind(runif(10, 0, 1), runif(10, 1.2, 2.2), runif(10, 2.4, 3.4))
my = apply(y, 2, mean)

par(mfrow = c(1, 3), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1))
plot(x[, 1:2], y[, 1:2], pch = 19, col = rep(c("red", "blue"), each = 10), xaxt = "n", 
    ylab = "", xlab = "", bty = "l", main = "t-test")
segments(0, my[1], 1.5, my[1], col = "red", lwd = 2)
segments(0.8, my[2], 2.4, my[2], col = "blue", lwd = 2)
arrows(x[, 1:2], y[, 1:2], x[, 1:2], rep(my[1:2], each = 10), col = rep(c("red", 
    "blue"), each = 10), length = 0.15)
arrows(1.2, my[1], 1.2, my[2], length = 0.15, lwd = 2, code = 3)


plot(x, y, pch = 19, col = rep(c("red", "blue", "green"), each = 10), xaxt = "n", 
    ylab = "", xlab = "", bty = "l", main = "anova. within group var")
segments(c(0, 1.2, 2.4), my, c(1, 2.2, 3.4), my, col = c("red", "blue", "green"))
arrows(x, y, x, rep(my, each = 10), col = rep(c("red", "blue", "green"), each = 10), 
    length = 0.15)

plot(x, y, pch = 19, col = rep(c("red", "blue", "green"), each = 10), xaxt = "n", 
    ylab = "", xlab = "", bty = "l", main = "anova. total var")
segments(0, mean(y), 3.4, mean(y))
arrows(x, y, x, rep(mean(y), 30), col = rep(c("red", "blue", "green"), each = 10), 
    length = 0.15)

# one-way anova by hand
yy = as.numeric(y)
sum((yy - mean(y))^2)
## [1] 169.7
total = var(yy) * 29
resid = sum((yy - rep(my, each = 10))^2)
explained = total - resid
f.stat = (explained/2)/(resid/27)
pf(f.stat, 2, 27, lower.tail = F)
## [1] 1.747e-11

# least square model by hand
xx = as.numeric(x)
b = cov(xx, yy)/var(xx)
a = mean(y) - b * mean(x)
plot(xx, yy, pch = 19)
abline(a = a, b = b, col = "red")
arrows(xx, yy, xx, a + b * xx, length = 0.15)

resid = sum((yy - (a + b * xx))^2)
explained = total - resid
f.stat = (explained/1)/(resid/28)
pf(f.stat, 1, 28, lower.tail = F)
## [1] 3.541e-10

# anova through lm
f = factor(rep(1:3, each = 10))
m = lm(yy ~ f)
anova(m)
## Analysis of Variance Table
## 
## Response: yy
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## f          2  142.6    71.3    71.1 1.7e-11 ***
## Residuals 27   27.1     1.0                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m = lm(yy ~ xx)
anova(m)
## Analysis of Variance Table
## 
## Response: yy
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## xx         1  129.0   129.0    88.7 3.5e-10 ***
## Residuals 28   40.7     1.5                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


# multiple predictors
x = runif(30, 0, 10)
f = factor(rep(1:2, each = 15))

y = c(rnorm(15, x[1:15], 1), rnorm(15, x[16:30] + 3, 1))
plot(x, y, col = f, pch = 19)
m = lm(y ~ x + f)
abline(a = m$coef[1], b = m$coef[2], col = "black")
abline(a = m$coef[1] + m$coef[3], b = m$coef[2], col = "red")
anova(m)
## Analysis of Variance Table
## 
## Response: y
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## x          1    414     414   493.7 < 2e-16 ***
## f          1     76      76    90.6 4.1e-10 ***
## Residuals 27     23       1                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# interaction
y = c(rnorm(15, x[1:15], 1), rnorm(15, x[16:30] * 3 + 3, 1))
plot(x, y, col = f, pch = 19)
m = lm(y ~ x + f + x:f)
abline(a = m$coef[1], b = m$coef[2], col = "black")
abline(a = m$coef[1] + m$coef[3], b = m$coef[2] + m$coef[4], col = "red")
anova(m)
## Analysis of Variance Table
## 
## Response: y
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## x          1   1695    1695    1725 < 2e-16 ***
## f          1   1151    1151    1171 < 2e-16 ***
## x:f        1    245     245     249 7.8e-15 ***
## Residuals 26     26       1                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# not ortogonal factors
age = runif(30, 50, 100)
height = rnorm(30, 150 * (100/age)^0.5, 15)
sex = factor(ifelse(runif(30, 50, 100) > age, "M", "F"))

par(mfrow = c(3, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1))
boxplot(age ~ sex, col = 1:2, ylab = "age")
boxplot(height ~ sex, col = 1:2, ylab = "height")
plot(height, age, pch = 19, col = sex)

anova(lm(height ~ age + sex))
## Analysis of Variance Table
## 
## Response: height
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## age        1   8392    8392   79.26 1.6e-09 ***
## sex        1    507     507    4.79   0.037 *  
## Residuals 27   2859     106                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm(height ~ sex + age))
## Analysis of Variance Table
## 
## Response: height
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## sex        1   4511    4511    42.6 5.3e-07 ***
## age        1   4388    4388    41.5 6.7e-07 ***
## Residuals 27   2859     106                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# glm
gm = glm(yy ~ xx, family = "gaussian")
lm = lm(yy ~ xx)
anova(gm, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: yy
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                    29      169.7             
## xx    1      129        28       40.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm, test = "Chisq")
## Analysis of Variance Table
## 
## Response: yy
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## xx         1  129.0   129.0    88.7 3.5e-10 ***
## Residuals 28   40.7     1.5                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# binom GLM
age = sort(runif(20, 0, 90))
tot.mes = round(runif(20, 10, 20))
mes = rbinom(20, tot.mes, 1 - age/500)
mes = cbind(mes, tot.mes - mes)
colnames(mes) = c("tee", "coffee")
plot(age, mes[, 1]/tot.mes, ylab = "tee freq", xlab = "age")

m = glm(mes ~ age, family = "binomial")
anova(m, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: mes
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                    19      24.35             
## age   1     14.7        18       9.63  0.00012 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m$coefficients
## (Intercept)         age 
##     4.07100    -0.03188

y.pred = predict(m, type = "resp")
# the same: l = predict(m,type='link') 1/(1+exp(-l)) or: l =
# predict(m,type='terms') + attr(predict(m,type='terms') ,'constant')
lines(age, y.pred, col = "red")

curve(1/(1 + exp(-x)), -5, 5)
abline(v = 0, lty = 2)
abline(h = 0.5, lty = 2)

# biological variability
x = 1:100
par(mfrow = c(2, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1))
plot(x, dnorm(x, 30, 10), col = "red", t = "l", xlab = "expression (read counts)", 
    lty = 2, ylab = "frequencty in population", main = "underlying distribution")
lines(x, dnorm(x, 50, 15), col = "blue", t = "l", lty = 2)

h = hist(rpois(400, pmax(0, rnorm(200, 30, 10))), breaks = 0:35 * 4, xlab = "read count", 
    ylab = "# of samples", main = "observed data", col = "#FF000040", border = NA, 
    xlim = c(0, 100))
hist(rpois(400, pmax(0, rnorm(200, 50, 15))), breaks = 0:35 * 4, xlab = "read count", 
    ylab = "# of samples", main = "observed data", col = "#0000FF40", border = NA, 
    add = T, xlim = c(0, 100))
y = dpois(x, 30)
lines(x, y/max(y) * max(h$counts), col = "red")
y = dpois(x, 50)
lines(x, y/max(y) * max(h$counts), col = "blue")

# negative binom
par(mfrow = c(1, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0), 
    oma = c(0, 0, 0, 1))

plot of chunk unnamed-chunk-1

plot(1, t = "n", xlim = c(0, 100), ylim = c(0, 0.075))
cols = getPal(c("red", "yellow", "green"), 21)
for (s in 0:20) lines(x, dnbinom(x, size = 0.1 + (s/2)^2, mu = 30), t = "l", 
    col = cols[s], lty = 2)
lines(x, dpois(x, 30), t = "l", col = "green", lwd = 4)
legend("topright", col = c(cols, "green"), lwd = c(rep(1, 21), 4), lty = c(rep(2, 
    21), 1), legend = c(0.1 + (0:20/2)^2, "pois"), ncol = 2, title = "disp param")

plot of chunk unnamed-chunk-1


# quasi
x = d[1, ]
a = log(meta$age)
sex = meta$sex
m = x ~ rin + a + sex + a:sex
mq = glm(m, family = "quasipoisson")
summary(mq)
## 
## Call:
## glm(formula = m, family = "quasipoisson")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -16.939   -3.214    0.434    3.132   11.068  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.0924     0.5357    9.51  3.1e-11 ***
## rin           0.0620     0.0466    1.33     0.19    
## a             0.0155     0.0467    0.33     0.74    
## sexm          0.1507     0.3928    0.38     0.70    
## a:sexm       -0.0141     0.0521   -0.27     0.79    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 31.18)
## 
##     Null deviance: 1279.4  on 39  degrees of freedom
## Residual deviance: 1198.9  on 35  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4
mp = glm(m, family = "poisson")
summary(mp)
## 
## Call:
## glm(formula = m, family = "poisson")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -16.939   -3.214    0.434    3.132   11.068  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.09242    0.09594   53.08  < 2e-16 ***
## rin          0.06202    0.00835    7.43  1.1e-13 ***
## a            0.01548    0.00836    1.85    0.064 .  
## sexm         0.15068    0.07035    2.14    0.032 *  
## a:sexm      -0.01406    0.00932   -1.51    0.132    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1279.4  on 39  degrees of freedom
## Residual deviance: 1198.9  on 35  degrees of freedom
## AIC: 1508
## 
## Number of Fisher Scoring iterations: 4

p = predict(mp, type = "resp")
pearson.resid = (x - p)/sqrt(p)
# the same pearson.resid = residuals(m, type = 'pearson'
sum(pearson.resid^2)/m$df.residual
## numeric(0)

anova(mq, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: quasipoisson, link: log
## 
## Response: x
## 
## Terms added sequentially (first to last)
## 
## 
##       Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL                     39       1279         
## rin    1     70.7        38       1209     0.13
## a      1      2.0        37       1207     0.80
## sex    1      5.5        36       1201     0.67
## a:sex  1      2.3        35       1199     0.79
anova(mp, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: x
## 
## Terms added sequentially (first to last)
## 
## 
##       Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                     39       1279             
## rin    1     70.7        38       1209   <2e-16 ***
## a      1      2.0        37       1207    0.154    
## sex    1      5.5        36       1201    0.019 *  
## a:sex  1      2.3        35       1199    0.130    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# examples:
library(MASS)
m = x ~ rin + a + sex + a:sex  # + I(a^2) + ...
anova(lm(m))
## Analysis of Variance Table
## 
## Response: x
##           Df Sum Sq Mean Sq F value Pr(>F)
## rin        1  19938   19938    2.17   0.15
## a          1    631     631    0.07   0.80
## sex        1   1604    1604    0.17   0.68
## a:sex      1    722     722    0.08   0.78
## Residuals 35 321997    9200
anova(glm(m, family = "poisson"), test = "Chisq")[, 5]
## [1]        NA 4.226e-17 1.539e-01 1.851e-02 1.304e-01
anova(glm(m, family = "quasipoisson"), test = "Chisq")[, 5]
## [1]     NA 0.1322 0.7985 0.6732 0.7865
anova(glm.nb(m), test = "Chisq")[, 5]
## Warning: tests made without re-estimating 'theta'
## [1]     NA 0.2008 0.8316 0.7009 0.7848