Book Chapter 2:
Book chapter 7:
“The greatest value of a picture is when it forces us to notice what we never expected to see.” - John W. Tukey
suppressPackageStartupMessages(library(UsingR))
suppressPackageStartupMessages(library(rafalib))
# height qq plot
x <- father.son$fheight
ps <- (seq(0, 99) + 0.5) / 100
qs <- quantile(x, ps)
normalqs <- qnorm(ps, mean(x), popsd(x))
par(mfrow = c(1, 3))
plot(normalqs,
qs,
xlab = "Normal Percentiles",
ylab = "Height Percentiles",
main = "Height Q-Q Plot")
abline(0, 1)
# t-distributed for 12 df
x <- rt(1000, 12)
qqnorm(x,
xlab = "t quantiles",
main = "T Quantiles (df=12) Q-Q Plot",
ylim = c(-6, 6))
qqline(x)
# t-distributed for 3 df
x <- rt(1000, 3)
qqnorm(x,
xlab = "t quantiles",
main = "T Quantiles (df=3) Q-Q Plot",
ylim = c(-6, 6))
qqline(x)
par(mfrow = c(1, 3))
hist(exec.pay, main = "CEO Compensation")
qqnorm(exec.pay, main = "CEO Compensation")
boxplot(exec.pay,
ylab = "10,000s of dollars",
ylim = c(0, 400),
main = "CEO Compensation")
plot()
and cor()
par(mfrow = c(1, 3))
plot(
father.son$fheight,
father.son$sheight,
xlab = "Father's height in inches",
ylab = "Son's height in inches",
main = paste("correlation =", signif(
cor(father.son$fheight, father.son$sheight), 2
))
)
plot(
cars$speed,
cars$dist,
xlab = "Speed",
ylab = "Stopping Distance",
main = paste("correlation =", signif(cor(
cars$speed, cars$dist
), 2))
)
plot(
faithful$eruptions,
faithful$waiting,
xlab = "Eruption Duration",
ylab = "Waiting Time",
main = paste("correlation =", signif(
cor(faithful$eruptions, faithful$waiting), 2
))
)
#install from Github
BiocManager::install("genomicsclass/GSE5859Subset")
BiocManager::install("genomicsclass/tissuesGeneExpression")
library(GSE5859Subset)
data(GSE5859Subset) ##this loads three tables
c(class(geneExpression), class(sampleInfo))
## [1] "matrix" "data.frame"
rbind(dim(geneExpression), dim(sampleInfo))
## [,1] [,2]
## [1,] 8793 24
## [2,] 24 4
head(sampleInfo)
## ethnicity date filename group
## 107 ASN 2005-06-23 GSM136508.CEL.gz 1
## 122 ASN 2005-06-27 GSM136530.CEL.gz 1
## 113 ASN 2005-06-27 GSM136517.CEL.gz 1
## 163 ASN 2005-10-28 GSM136576.CEL.gz 1
## 153 ASN 2005-10-07 GSM136566.CEL.gz 1
## 161 ASN 2005-10-07 GSM136574.CEL.gz 1
T-test for every row (gene) of gene expression matrix:
library(genefilter)
g <- factor(sampleInfo$group)
system.time(results <- rowttests(geneExpression,g))
## user system elapsed
## 0.007 0.001 0.009
pvals <- results$p.value
Note that these 8,793 tests are done in about 0.01s
par(mar = c(4, 4, 0, 0))
plot(results$dm,
-log10(results$p.value),
xlab = "Effect size (difference in group means)",
ylab = "- log (base 10) p-values")
abline(h = -log10(0.05 / nrow(geneExpression)), col = "red")
legend("bottomright",
lty = 1,
col = "red",
legend = "Bonferroni = 0.05")
m <- nrow(geneExpression)
n <- ncol(geneExpression)
set.seed(1)
randomData <- matrix(rnorm(n * m), m, n)
nullpvals <- rowttests(randomData, g)$p.value
par(mfrow = c(1, 2))
hist(pvals, ylim = c(0, 1400))
hist(nullpvals, ylim = c(0, 1400))
Note that permuting these data doesn’t produce an ideal null p-value histogram due to batch effects:
permg <- sample(g)
permresults <- rowttests(geneExpression, permg)
hist(permresults$p.value)
ComBat
: corrects for known batch effects by linear model), andsva
: creates surrogate variables for unknown batch effects, corrects the structure of permutation p-valuesAll are available from the sva Bioconductor package
rafalib::mypar(1, 2)
pseudo <- apply(geneExpression, 1, median)
plot(geneExpression[, 1], pseudo)
plot((geneExpression[, 1] + pseudo) / 2, (geneExpression[, 1] - pseudo))
affyPLM::MAplots
better MA plots
Detailed representation of high-dimensional dataset
ge = geneExpression[sample(1:nrow(geneExpression), 200),]
pheatmap::pheatmap(
ge,
scale = "row",
show_colnames = FALSE,
show_rownames = FALSE
)
Combination of RColorBrewer
package and colorRampPalette()
can create anything you want
rafalib::mypar(1, 1)
RColorBrewer::display.brewer.all(n = 7)
“Pie charts are a very bad way of displaying information.” - R Help
library(GSE5859Subset)
data(GSE5859Subset) ##this loads three tables
set.seed(1)
ge = geneExpression[sample(1:nrow(geneExpression), 200),]
pheatmap::pheatmap(
ge,
scale = "row",
show_colnames = FALSE,
show_rownames = FALSE
)
Source: http://master.bioconductor.org/help/course-materials/2002/Summer02Course/Distance/distance.pdf
A metric satisfies the following five properties:
rafalib::mypar()
plot(
c(0, 1, 1),
c(0, 0, 1),
pch = 16,
cex = 2,
xaxt = "n",
yaxt = "n",
xlab = "",
ylab = "",
bty = "n",
xlim = c(-0.25, 1.25),
ylim = c(-0.25, 1.25)
)
lines(c(0, 1, 1, 0), c(0, 0, 1, 0))
text(0, .2, expression(paste('(A'[x] * ',A'[y] * ')')), cex = 1.5)
text(1, 1.2, expression(paste('(B'[x] * ',B'[y] * ')')), cex = 1.5)
text(-0.1, 0, "A", cex = 2)
text(1.1, 1, "B", cex = 2)
##BiocManager::install("genomicsclass/tissuesGeneExpression")
library(tissuesGeneExpression)
data(tissuesGeneExpression)
dim(e) ##gene expression data
## [1] 22215 189
table(tissue) ##tissue[i] corresponds to e[,i]
## tissue
## cerebellum colon endometrium hippocampus kidney liver
## 38 34 15 31 39 26
## placenta
## 6
Interested in identifying similar samples and similar genes
Euclidean distance as for two dimensions. E.g., the distance between two samples \(i\) and \(j\) is:
\[ \mbox{dist}(i,j) = \sqrt{ \sum_{g=1}^{22215} (Y_{g,i}-Y_{g,j })^2 } \]
and the distance between two features \(h\) and \(g\) is:
\[ \mbox{dist}(h,g) = \sqrt{ \sum_{i=1}^{189} (Y_{h,i}-Y_{g,i})^2 } \]
The distance between samples \(i\) and \(j\) can be written as:
\[ \mbox{dist}(i,j) = \sqrt{ (\mathbf{Y}_i - \mathbf{Y}_j)^\top(\mathbf{Y}_i - \mathbf{Y}_j) }\]
with \(\mathbf{Y}_i\) and \(\mathbf{Y}_j\) columns \(i\) and \(j\).
t(matrix(1:3, ncol = 1))
## [,1] [,2] [,3]
## [1,] 1 2 3
matrix(1:3, ncol = 1)
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
t(matrix(1:3, ncol = 1)) %*% matrix(1:3, ncol = 1)
## [,1]
## [1,] 14
Note: R is very efficient at matrix algebra
kidney1 <- e[, 1]
kidney2 <- e[, 2]
colon1 <- e[, 87]
sqrt(sum((kidney1 - kidney2) ^ 2))
## [1] 85.8546
sqrt(sum((kidney1 - colon1) ^ 2))
## [1] 122.8919
dim(e)
## [1] 22215 189
(d <- dist(t(e[, c(1, 2, 87)])))
## GSM11805.CEL.gz GSM11814.CEL.gz
## GSM11814.CEL.gz 85.8546
## GSM92240.CEL.gz 122.8919 115.4773
class(d)
## [1] "dist"
Excerpt from ?dist:
dist(
x,
method = "euclidean",
diag = FALSE,
upper = FALSE,
p = 2
)
dist
class output from dist()
is used for many clustering algorithms and heatmap functionsCaution: dist(e)
creates a 22215 x 22215 matrix that will probably crash your R session.
\[x_{gi} \leftarrow \frac{(x_{gi} - \bar{x}_g)}{s_g}\]
Simulate the heights of twin pairs:
set.seed(1)
n <- 100
y <- t(MASS::mvrnorm(n, c(0, 0), matrix(c(1, 0.95, 0.95, 1), 2, 2)))
dim(y)
## [1] 2 100
cor(t(y))
## [,1] [,2]
## [1,] 1.0000000 0.9433295
## [2,] 0.9433295 1.0000000
z1 = (y[1,] + y[2,]) / 2 #the sum
z2 = (y[1,] - y[2,]) #the difference
z = rbind(z1, z2) #matrix now same dimensions as y
thelim <- c(-3, 3)
rafalib::mypar(1, 2)
plot(
y[1,],
y[2,],
xlab = "Twin 1 (standardized height)",
ylab = "Twin 2 (standardized height)",
xlim = thelim,
ylim = thelim,
main = "Original twin heights"
)
points(y[1, 1:2], y[2, 1:2], col = 2, pch = 16)
plot(
z[1,],
z[2,],
xlim = thelim,
ylim = thelim,
xlab = "Average height",
ylab = "Difference in height",
main = "Manual PCA-like projection"
)
points(z[1, 1:2] , z[2, 1:2], col = 2, pch = 16)
rafalib::mypar()
d = dist(t(y))
d3 = dist(z[1, ]) * sqrt(2) ##distance computed using just first dimension mypar(1,1)
plot(as.numeric(d),
as.numeric(d3),
xlab = "Pairwise distances in 2 dimensions",
ylab = "Pairwise distances in 1 dimension")
abline(0, 1, col = "red")
SVD generalizes the example rotation we looked at:
\[\mathbf{Y} = \mathbf{UDV}^\top\]
note: the above formulation is for \(m > n\)
Scaling:
system.time(e.standardize.slow <-
t(apply(e, 1, function(x)
x - mean(x))))
## user system elapsed
## 1.248 0.121 1.441
system.time(e.standardize.fast <- t(scale(t(e), scale = FALSE)))
## user system elapsed
## 0.149 0.001 0.155
all.equal(e.standardize.slow[, 1], e.standardize.fast[, 1])
## [1] TRUE
SVD:
s <- svd(e.standardize.fast)
names(s)
## [1] "d" "u" "v"
dim(s$u) # loadings
## [1] 22215 189
length(s$d) # eigenvalues
## [1] 189
dim(s$v) # d %*% vT = scores
## [1] 189 189
rafalib::mypar()
p <- prcomp(t(e.standardize.fast))
plot(s$u[, 1] ~ p$rotation[, 1])
Lesson: u and v can each be multiplied by -1 arbitrarily
plot(p$rotation[, 1],
xlab = "Index of genes",
ylab = "Loadings of PC1",
main = "Scores of PC1") #or, predict(p)
abline(h = c(-0.03, 0.04), col = "red")
Genes with high PC1 loadings:
e.pc1genes <-
e.standardize.fast[p$rotation[, 1] < -0.03 |
p$rotation[, 1] > 0.03,]
pheatmap::pheatmap(
e.pc1genes,
scale = "none",
show_rownames = TRUE,
show_colnames = FALSE
)
rafalib::mypar()
plot(
p$sdev ^ 2 / sum(p$sdev ^ 2) * 100,
xlim = c(0, 150),
type = "b",
ylab = "% variance explained",
main = "Screeplot"
)
Alternatively as cumulative % variance explained (using cumsum()
function):
rafalib::mypar()
plot(
cumsum(p$sdev ^ 2) / sum(p$sdev ^ 2) * 100,
ylab = "cumulative % variance explained",
ylim = c(0, 100),
type = "b",
main = "Cumulative screeplot"
)
prcomp()
), scores \(\mathbf{D V^T}\)rafalib::mypar()
plot(
p$x[, 1:2],
xlab = "PC1",
ylab = "PC2",
main = "plot of p$x[, 1:2]",
col = factor(tissue),
pch = as.integer(factor(tissue))
)
legend(
"topright",
legend = levels(factor(tissue)),
col = 1:length(unique(tissue)),
pch = 1:length(unique(tissue)),
bty = 'n'
)
d <- as.dist(1 - cor(e.standardize.fast))
mds <- cmdscale(d)
rafalib::mypar()
plot(mds, col = factor(tissue), pch = as.integer(factor(tissue)))
legend(
"topright",
legend = levels(factor(tissue)),
col = 1:length(unique(tissue)),
bty = 'n',
pch = 1:length(unique(tissue))
)
library(Biobase)
library(genefilter)
library(GSE5859) ## BiocInstaller::biocLite("genomicsclass/GSE5859")
data(GSE5859)
geneExpression = exprs(e)
sampleInfo = pData(e)
head(table(sampleInfo$date))
##
## 2002-10-31 2002-11-15 2002-11-19 2002-11-21 2002-11-22 2002-11-27
## 2 2 2 5 4 2
year <- as.integer(format(sampleInfo$date, "%y"))
year <- year - min(year)
month = as.integer(format(sampleInfo$date, "%m")) + 12 * year
table(year, sampleInfo$ethnicity)
##
## year ASN CEU HAN
## 0 0 32 0
## 1 0 54 0
## 2 0 13 0
## 3 80 3 0
## 4 2 0 24
pc <- prcomp(t(geneExpression), scale. = TRUE)
boxplot(
pc$x[, 1] ~ month,
varwidth = TRUE,
notch = TRUE,
main = "PC1 scores vs. month",
xlab = "Month",
ylab = "PC1"
)
A starting point for a color palette:
RColorBrewer::display.brewer.all(n = 3, colorblindFriendly = TRUE)
Interpolate one color per month on a quantitative palette:
col3 <- c(RColorBrewer::brewer.pal(n = 3, "Greys")[2:3], "black")
MYcols <- colorRampPalette(col3, space = "Lab")(length(unique(month)))
d <- as.dist(1 - cor(geneExpression))
mds <- cmdscale(d)
plot(mds, col = MYcols[as.integer(factor(month))],
main = "MDS shaded by batch")
hcclass <- cutree(hclust(d), k = 5)
table(hcclass, year)
## year
## hcclass 0 1 2 3 4
## 1 0 2 8 1 0
## 2 25 43 0 2 0
## 3 6 0 0 0 0
## 4 1 7 0 0 0
## 5 0 2 5 80 26