# PCA #####
x = rnorm(100)
y = rnorm(100, x)
pca = prcomp(cbind(x, y))
names(pca)
## [1] "sdev" "rotation" "center" "scale" "x"
pca$rotation
## PC1 PC2
## x 0.5569 0.8306
## y 0.8306 -0.5569
pca$center
## x y
## 0.02950 -0.04793
par(mfrow = c(1, 2), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1))
plot(x, y, pch = 19, col = "gray", xlim = range(x, y), ylim = range(x, y), main = "Origina coordinates")
points(pca$center[1], pca$center[2], cex = 2)
b1 = pca$rotation[2, 1]/pca$rotation[1, 1]
abline(a = pca$center[2] - b1 * pca$center[1], b = b1, col = "red")
b2 = pca$rotation[2, 2]/pca$rotation[1, 2]
abline(a = pca$center[2] - b2 * pca$center[1], b = b2, col = "blue")
dev = round(pca$sdev/sum(pca$sdev) * 100, 1)
plot(-pca$x[, 1], -pca$x[, 2], pch = 19, col = "gray", main = "PCA", xlab = paste("PC1 (",
dev[1], "%)", sep = ""), ylab = paste("PC2 (", dev[2], "%)", sep = ""),
xlim = range(x, pca$x, y), ylim = range(x, y, pca$x))
abline(a = 0, b = 0, col = "red")
abline(v = 0, col = "blue")
# load data #####
setwd("/home/mazin/teaching/R/2014")
d = as.matrix(read.table("my.code/5.hs.cnts"))
meta = read.table("my.code/5.hs.meta", stringsAsFactors = F)
rin = meta$rin
sex = meta$sex
age = meta$age
# make functions #####
getPal = function(col = c("blue", "white", "red"), n = 200) {
r = character(n)
if (n == 1) {
i = floor(length(col)/2)
return(.getPal(col[i], col[i + 1], 0.2))
}
for (i in 0:(n - 1)) {
cinx = i/(n - 1) * (length(col) - 1) + 1
if (cinx == floor(cinx))
r[i + 1] = col[cinx] else {
rate = cinx - floor(cinx)
cinx = floor(cinx)
r[i + 1] = .getPal(col[cinx], col[cinx + 1], 1 - rate)
}
}
r
}
.getPal = function(c1, c2, r) {
c1 = t(col2rgb(c1, alpha = TRUE)/255)
c2 = t(col2rgb(c2, alpha = TRUE)/255)
return(rgb((c1 * r + c2 * (1 - r)), alpha = r * c1[4] + (1 - r) * c2[4]))
}
# read count density #####
par(mfrow = c(2, 2), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1))
plot(density(d), main = "", xlab = "read counts")
plot(density(log(d), bw = 0.02, from = -2.5), main = "", xlab = "log(read counts)")
table(d == 0)
##
## FALSE TRUE
## 782200 1453360
plot(density(log(d + 0.1), bw = 0.02, from = -2.5), main = "", xlab = "log(read counts + 1)")
dim(d)
## [1] 55889 40
sum(apply(d, 1, sd) == 0)
## [1] 23162
d = d[apply(d < 20, 1, sum) < 20, ]
sum(apply(d, 1, sd) == 0)
## [1] 0
dim(d)
## [1] 10432 40
plot(density(log(d), bw = 0.02, from = -2.5), xlab = "log(read counts)", main = "after filtering")
dn = sweep(d, 1, apply(d, 1, mean), "-")
dn = sweep(dn, 1, apply(dn, 1, sd), "/")
# PCA on real data #####
sam.cols = getPal(c("#0000FF", "#DDDDDD", "#FF0000"), 40)
pch.sex = ifelse(sex == "m", 17, 19)
par(mfcol = c(2, 3), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1))
pca = prcomp(t(d))
dev = round(pca$sdev/sum(pca$sdev) * 100, 1)
plot(pca$x[, 1], pca$x[, 2], pch = pch.sex, col = sam.cols, main = "PCA", xlab = paste("PC1 (",
dev[1], "%)", sep = ""), ylab = paste("PC2 (", dev[2], "%)", sep = ""))
plot(pca$x[, 3], pca$x[, 4], pch = pch.sex, col = sam.cols, main = "PCA", xlab = paste("PC3 (",
dev[3], "%)", sep = ""), ylab = paste("PC4 (", dev[4], "%)", sep = ""))
plot(log(age), pca$x[, 1], col = "red", pch = pch.sex, ylim = range(pca$x[,
1:2]), main = "Age")
points(log(age), pca$x[, 2], col = "orange", pch = pch.sex)
plot(log(age), pca$x[, 3], col = "green", pch = pch.sex, ylim = range(pca$x[,
3:4]), main = "Age")
points(log(age), pca$x[, 4], col = "blue", pch = pch.sex)
plot(rin, pca$x[, 1], col = "red", pch = pch.sex, ylim = range(pca$x[, 1:2]),
main = "RIN")
points(rin, pca$x[, 2], col = "orange", pch = pch.sex)
plot(rin, pca$x[, 3], col = "green", pch = pch.sex, ylim = range(pca$x[, 3:4]),
main = "RIN")
points(rin, pca$x[, 4], col = "blue", pch = pch.sex)
# correlation heatmap ####
par(mfcol = c(1, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1))
image(cor(dn), col = getPal())
heatmap(cor(dn, m = "sp"), col = getPal(), distfun = function(x) {
as.dist(1 - x)
}, symm = T, ColSideColors = sam.cols)
heatmap(cor(dn, m = "sp"), col = getPal(), distfun = function(x) {
as.dist(1 - x)
}, symm = T, ColSideColors = sam.cols, Rowv = 1:40)
library(gplots)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
heatmap.2(cor(dn), col = getPal(), distfun = function(x) {
as.dist(1 - x)
}, symm = T, ColSideColors = sam.cols, Colv = T, trace = "none", dendrogram = "col",
keysize = 2)
# MDS #####
par(mfrow = c(2, 2), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1))
mds = cmdscale(1 - cor(dn, m = "p"), k = 2)
plot(mds[, 1], mds[, 2], pch = pch.sex, col = sam.cols, main = "norm, Pearson",
xlab = "", ylab = "")
mds = cmdscale(1 - cor(dn, m = "s"), k = 2)
plot(mds[, 1], mds[, 2], pch = pch.sex, col = sam.cols, main = "norm, Spearman",
xlab = "", ylab = "")
mds = cmdscale(1 - cor(d, m = "p"), k = 2)
plot(mds[, 1], mds[, 2], pch = pch.sex, col = sam.cols, main = "not norm, Pearson",
xlab = "", ylab = "")
mds = cmdscale(1 - cor(d, m = "s"), k = 2)
plot(mds[, 1], mds[, 2], pch = pch.sex, col = sam.cols, main = "not norm, Spearman",
xlab = "", ylab = "")
# anova ####
y = cbind(rnorm(10, 0), rnorm(10, 2), rnorm(10, 5))
x = cbind(runif(10, 0, 1), runif(10, 1.2, 2.2), runif(10, 2.4, 3.4))
my = apply(y, 2, mean)
par(mfrow = c(1, 3), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1))
plot(x[, 1:2], y[, 1:2], pch = 19, col = rep(c("red", "blue"), each = 10), xaxt = "n",
ylab = "", xlab = "", bty = "l", main = "t-test")
segments(0, my[1], 1.5, my[1], col = "red", lwd = 2)
segments(0.8, my[2], 2.4, my[2], col = "blue", lwd = 2)
arrows(x[, 1:2], y[, 1:2], x[, 1:2], rep(my[1:2], each = 10), col = rep(c("red",
"blue"), each = 10), length = 0.15)
arrows(1.2, my[1], 1.2, my[2], length = 0.15, lwd = 2, code = 3)
plot(x, y, pch = 19, col = rep(c("red", "blue", "green"), each = 10), xaxt = "n",
ylab = "", xlab = "", bty = "l", main = "anova. within group var")
segments(c(0, 1.2, 2.4), my, c(1, 2.2, 3.4), my, col = c("red", "blue", "green"))
arrows(x, y, x, rep(my, each = 10), col = rep(c("red", "blue", "green"), each = 10),
length = 0.15)
plot(x, y, pch = 19, col = rep(c("red", "blue", "green"), each = 10), xaxt = "n",
ylab = "", xlab = "", bty = "l", main = "anova. total var")
segments(0, mean(y), 3.4, mean(y))
arrows(x, y, x, rep(mean(y), 30), col = rep(c("red", "blue", "green"), each = 10),
length = 0.15)
# one-way anova by hand
yy = as.numeric(y)
sum((yy - mean(y))^2)
## [1] 169.7
total = var(yy) * 29
resid = sum((yy - rep(my, each = 10))^2)
explained = total - resid
f.stat = (explained/2)/(resid/27)
pf(f.stat, 2, 27, lower.tail = F)
## [1] 1.747e-11
# least square model by hand
xx = as.numeric(x)
b = cov(xx, yy)/var(xx)
a = mean(y) - b * mean(x)
plot(xx, yy, pch = 19)
abline(a = a, b = b, col = "red")
arrows(xx, yy, xx, a + b * xx, length = 0.15)
resid = sum((yy - (a + b * xx))^2)
explained = total - resid
f.stat = (explained/1)/(resid/28)
pf(f.stat, 1, 28, lower.tail = F)
## [1] 3.541e-10
# anova through lm
f = factor(rep(1:3, each = 10))
m = lm(yy ~ f)
anova(m)
## Analysis of Variance Table
##
## Response: yy
## Df Sum Sq Mean Sq F value Pr(>F)
## f 2 142.6 71.3 71.1 1.7e-11 ***
## Residuals 27 27.1 1.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m = lm(yy ~ xx)
anova(m)
## Analysis of Variance Table
##
## Response: yy
## Df Sum Sq Mean Sq F value Pr(>F)
## xx 1 129.0 129.0 88.7 3.5e-10 ***
## Residuals 28 40.7 1.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# multiple predictors
x = runif(30, 0, 10)
f = factor(rep(1:2, each = 15))
y = c(rnorm(15, x[1:15], 1), rnorm(15, x[16:30] + 3, 1))
plot(x, y, col = f, pch = 19)
m = lm(y ~ x + f)
abline(a = m$coef[1], b = m$coef[2], col = "black")
abline(a = m$coef[1] + m$coef[3], b = m$coef[2], col = "red")
anova(m)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 414 414 493.7 < 2e-16 ***
## f 1 76 76 90.6 4.1e-10 ***
## Residuals 27 23 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# interaction
y = c(rnorm(15, x[1:15], 1), rnorm(15, x[16:30] * 3 + 3, 1))
plot(x, y, col = f, pch = 19)
m = lm(y ~ x + f + x:f)
abline(a = m$coef[1], b = m$coef[2], col = "black")
abline(a = m$coef[1] + m$coef[3], b = m$coef[2] + m$coef[4], col = "red")
anova(m)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 1695 1695 1725 < 2e-16 ***
## f 1 1151 1151 1171 < 2e-16 ***
## x:f 1 245 245 249 7.8e-15 ***
## Residuals 26 26 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# not ortogonal factors
age = runif(30, 50, 100)
height = rnorm(30, 150 * (100/age)^0.5, 15)
sex = factor(ifelse(runif(30, 50, 100) > age, "M", "F"))
par(mfrow = c(3, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1))
boxplot(age ~ sex, col = 1:2, ylab = "age")
boxplot(height ~ sex, col = 1:2, ylab = "height")
plot(height, age, pch = 19, col = sex)
anova(lm(height ~ age + sex))
## Analysis of Variance Table
##
## Response: height
## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 8392 8392 79.26 1.6e-09 ***
## sex 1 507 507 4.79 0.037 *
## Residuals 27 2859 106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm(height ~ sex + age))
## Analysis of Variance Table
##
## Response: height
## Df Sum Sq Mean Sq F value Pr(>F)
## sex 1 4511 4511 42.6 5.3e-07 ***
## age 1 4388 4388 41.5 6.7e-07 ***
## Residuals 27 2859 106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# glm
gm = glm(yy ~ xx, family = "gaussian")
lm = lm(yy ~ xx)
anova(gm, test = "Chisq")
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: yy
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 29 169.7
## xx 1 129 28 40.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm, test = "Chisq")
## Analysis of Variance Table
##
## Response: yy
## Df Sum Sq Mean Sq F value Pr(>F)
## xx 1 129.0 129.0 88.7 3.5e-10 ***
## Residuals 28 40.7 1.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# binom GLM
age = sort(runif(20, 0, 90))
tot.mes = round(runif(20, 10, 20))
mes = rbinom(20, tot.mes, 1 - age/500)
mes = cbind(mes, tot.mes - mes)
colnames(mes) = c("tee", "coffee")
plot(age, mes[, 1]/tot.mes, ylab = "tee freq", xlab = "age")
m = glm(mes ~ age, family = "binomial")
anova(m, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: mes
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 19 24.35
## age 1 14.7 18 9.63 0.00012 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m$coefficients
## (Intercept) age
## 4.07100 -0.03188
y.pred = predict(m, type = "resp")
# the same: l = predict(m,type='link') 1/(1+exp(-l)) or: l =
# predict(m,type='terms') + attr(predict(m,type='terms') ,'constant')
lines(age, y.pred, col = "red")
curve(1/(1 + exp(-x)), -5, 5)
abline(v = 0, lty = 2)
abline(h = 0.5, lty = 2)
# biological variability
x = 1:100
par(mfrow = c(2, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1))
plot(x, dnorm(x, 30, 10), col = "red", t = "l", xlab = "expression (read counts)",
lty = 2, ylab = "frequencty in population", main = "underlying distribution")
lines(x, dnorm(x, 50, 15), col = "blue", t = "l", lty = 2)
h = hist(rpois(400, pmax(0, rnorm(200, 30, 10))), breaks = 0:35 * 4, xlab = "read count",
ylab = "# of samples", main = "observed data", col = "#FF000040", border = NA,
xlim = c(0, 100))
hist(rpois(400, pmax(0, rnorm(200, 50, 15))), breaks = 0:35 * 4, xlab = "read count",
ylab = "# of samples", main = "observed data", col = "#0000FF40", border = NA,
add = T, xlim = c(0, 100))
y = dpois(x, 30)
lines(x, y/max(y) * max(h$counts), col = "red")
y = dpois(x, 50)
lines(x, y/max(y) * max(h$counts), col = "blue")
# negative binom
par(mfrow = c(1, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(3, 3, 1.5, 0),
oma = c(0, 0, 0, 1))
plot(1, t = "n", xlim = c(0, 100), ylim = c(0, 0.075))
cols = getPal(c("red", "yellow", "green"), 21)
for (s in 0:20) lines(x, dnbinom(x, size = 0.1 + (s/2)^2, mu = 30), t = "l",
col = cols[s], lty = 2)
lines(x, dpois(x, 30), t = "l", col = "green", lwd = 4)
legend("topright", col = c(cols, "green"), lwd = c(rep(1, 21), 4), lty = c(rep(2,
21), 1), legend = c(0.1 + (0:20/2)^2, "pois"), ncol = 2, title = "disp param")
# quasi
x = d[1, ]
a = log(meta$age)
sex = meta$sex
m = x ~ rin + a + sex + a:sex
mq = glm(m, family = "quasipoisson")
summary(mq)
##
## Call:
## glm(formula = m, family = "quasipoisson")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -16.939 -3.214 0.434 3.132 11.068
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.0924 0.5357 9.51 3.1e-11 ***
## rin 0.0620 0.0466 1.33 0.19
## a 0.0155 0.0467 0.33 0.74
## sexm 0.1507 0.3928 0.38 0.70
## a:sexm -0.0141 0.0521 -0.27 0.79
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 31.18)
##
## Null deviance: 1279.4 on 39 degrees of freedom
## Residual deviance: 1198.9 on 35 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
mp = glm(m, family = "poisson")
summary(mp)
##
## Call:
## glm(formula = m, family = "poisson")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -16.939 -3.214 0.434 3.132 11.068
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.09242 0.09594 53.08 < 2e-16 ***
## rin 0.06202 0.00835 7.43 1.1e-13 ***
## a 0.01548 0.00836 1.85 0.064 .
## sexm 0.15068 0.07035 2.14 0.032 *
## a:sexm -0.01406 0.00932 -1.51 0.132
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1279.4 on 39 degrees of freedom
## Residual deviance: 1198.9 on 35 degrees of freedom
## AIC: 1508
##
## Number of Fisher Scoring iterations: 4
p = predict(mp, type = "resp")
pearson.resid = (x - p)/sqrt(p)
# the same pearson.resid = residuals(m, type = 'pearson'
sum(pearson.resid^2)/m$df.residual
## numeric(0)
anova(mq, test = "Chisq")
## Analysis of Deviance Table
##
## Model: quasipoisson, link: log
##
## Response: x
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39 1279
## rin 1 70.7 38 1209 0.13
## a 1 2.0 37 1207 0.80
## sex 1 5.5 36 1201 0.67
## a:sex 1 2.3 35 1199 0.79
anova(mp, test = "Chisq")
## Analysis of Deviance Table
##
## Model: poisson, link: log
##
## Response: x
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39 1279
## rin 1 70.7 38 1209 <2e-16 ***
## a 1 2.0 37 1207 0.154
## sex 1 5.5 36 1201 0.019 *
## a:sex 1 2.3 35 1199 0.130
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# examples:
library(MASS)
m = x ~ rin + a + sex + a:sex # + I(a^2) + ...
anova(lm(m))
## Analysis of Variance Table
##
## Response: x
## Df Sum Sq Mean Sq F value Pr(>F)
## rin 1 19938 19938 2.17 0.15
## a 1 631 631 0.07 0.80
## sex 1 1604 1604 0.17 0.68
## a:sex 1 722 722 0.08 0.78
## Residuals 35 321997 9200
anova(glm(m, family = "poisson"), test = "Chisq")[, 5]
## [1] NA 4.226e-17 1.539e-01 1.851e-02 1.304e-01
anova(glm(m, family = "quasipoisson"), test = "Chisq")[, 5]
## [1] NA 0.1322 0.7985 0.6732 0.7865
anova(glm.nb(m), test = "Chisq")[, 5]
## Warning: tests made without re-estimating 'theta'
## [1] NA 0.2008 0.8316 0.7009 0.7848