Book Chapter 2:
Book chapter 7:
“The greatest value of a picture is when it forces us to notice what we never expected to see.” - John W. Tukey
suppressPackageStartupMessages(library(UsingR))
suppressPackageStartupMessages(library(rafalib))
# height qq plot
x <- father.son$fheight
ps <- (seq(0, 99) + 0.5) / 100
qs <- quantile(x, ps)
normalqs <- qnorm(ps, mean(x), popsd(x))
par(mfrow = c(1, 3))
plot(normalqs,
qs,
xlab = "Normal Percentiles",
ylab = "Height Percentiles",
main = "Height Q-Q Plot")
abline(0, 1)
# t-distributed for 12 df
x <- rt(1000, 12)
qqnorm(x,
xlab = "t quantiles",
main = "T Quantiles (df=12) Q-Q Plot",
ylim = c(-6, 6))
qqline(x)
# t-distributed for 3 df
x <- rt(1000, 3)
qqnorm(x,
xlab = "t quantiles",
main = "T Quantiles (df=3) Q-Q Plot",
ylim = c(-6, 6))
qqline(x)
par(mfrow = c(1, 3))
hist(exec.pay, main = "CEO Compensation")
qqnorm(exec.pay, main = "CEO Compensation")
boxplot(exec.pay,
ylab = "10,000s of dollars",
ylim = c(0, 400),
main = "CEO Compensation")
plot()
and cor()
par(mfrow = c(1, 3))
plot(
father.son$fheight,
father.son$sheight,
xlab = "Father's height in inches",
ylab = "Son's height in inches",
main = paste("correlation =", signif(
cor(father.son$fheight, father.son$sheight), 2
))
)
plot(
cars$speed,
cars$dist,
xlab = "Speed",
ylab = "Stopping Distance",
main = paste("correlation =", signif(cor(
cars$speed, cars$dist
), 2))
)
plot(
faithful$eruptions,
faithful$waiting,
xlab = "Eruption Duration",
ylab = "Waiting Time",
main = paste("correlation =", signif(
cor(faithful$eruptions, faithful$waiting), 2
))
)
#install from Github
BiocManager::install("genomicsclass/GSE5859Subset")
BiocManager::install("genomicsclass/tissuesGeneExpression")
library(GSE5859Subset)
data(GSE5859Subset) ##this loads three tables
c(class(geneExpression), class(sampleInfo))
## [1] "matrix" "data.frame"
rbind(dim(geneExpression), dim(sampleInfo))
## [,1] [,2]
## [1,] 8793 24
## [2,] 24 4
head(sampleInfo)
## ethnicity date filename group
## 107 ASN 2005-06-23 GSM136508.CEL.gz 1
## 122 ASN 2005-06-27 GSM136530.CEL.gz 1
## 113 ASN 2005-06-27 GSM136517.CEL.gz 1
## 163 ASN 2005-10-28 GSM136576.CEL.gz 1
## 153 ASN 2005-10-07 GSM136566.CEL.gz 1
## 161 ASN 2005-10-07 GSM136574.CEL.gz 1
T-test for every row (gene) of gene expression matrix:
library(genefilter)
g <- factor(sampleInfo$group)
system.time(results <- rowttests(geneExpression,g))
## user system elapsed
## 0.006 0.000 0.007
pvals <- results$p.value
Note that these 8,793 tests are done in about 0.01s
par(mar = c(4, 4, 0, 0))
plot(results$dm,
-log10(results$p.value),
xlab = "Effect size (difference in group means)",
ylab = "- log (base 10) p-values")
abline(h = -log10(0.05 / nrow(geneExpression)), col = "red")
legend("bottomright",
lty = 1,
col = "red",
legend = "Bonferroni = 0.05")
m <- nrow(geneExpression)
n <- ncol(geneExpression)
set.seed(1)
randomData <- matrix(rnorm(n * m), m, n)
nullpvals <- rowttests(randomData, g)$p.value
par(mfrow = c(1, 2))
hist(pvals, ylim = c(0, 1400))
hist(nullpvals, ylim = c(0, 1400))
Note that permuting these data doesn’t produce an ideal null p-value histogram due to batch effects:
permg <- sample(g)
permresults <- rowttests(geneExpression, permg)
hist(permresults$p.value)
ComBat
: corrects for known batch effects by linear model), andsva
: creates surrogate variables for unknown batch effects, corrects the structure of permutation p-valuesAll are available from the sva Bioconductor package
rafalib::mypar(1, 2)
pseudo <- apply(geneExpression, 1, median)
plot(geneExpression[, 1], pseudo)
plot((geneExpression[, 1] + pseudo) / 2, (geneExpression[, 1] - pseudo))
affyPLM::MAplots
better MA plots
Detailed representation of high-dimensional dataset
ge = geneExpression[sample(1:nrow(geneExpression), 200),]
pheatmap::pheatmap(
ge,
scale = "row",
show_colnames = FALSE,
show_rownames = FALSE
)
Combination of RColorBrewer
package and colorRampPalette()
can create anything you want
rafalib::mypar(1, 1)
RColorBrewer::display.brewer.all(n = 7)
“Pie charts are a very bad way of displaying information.” - R Help
library(GSE5859Subset)
data(GSE5859Subset) ##this loads three tables
set.seed(1)
ge = geneExpression[sample(1:nrow(geneExpression), 200),]
pheatmap::pheatmap(
ge,
scale = "row",
show_colnames = FALSE,
show_rownames = FALSE
)
Source: http://master.bioconductor.org/help/course-materials/2002/Summer02Course/Distance/distance.pdf
A metric satisfies the following five properties:
rafalib::mypar()
plot(
c(0, 1, 1),
c(0, 0, 1),
pch = 16,
cex = 2,
xaxt = "n",
yaxt = "n",
xlab = "",
ylab = "",
bty = "n",
xlim = c(-0.25, 1.25),
ylim = c(-0.25, 1.25)
)
lines(c(0, 1, 1, 0), c(0, 0, 1, 0))
text(0, .2, expression(paste('(A'[x] * ',A'[y] * ')')), cex = 1.5)
text(1, 1.2, expression(paste('(B'[x] * ',B'[y] * ')')), cex = 1.5)
text(-0.1, 0, "A", cex = 2)
text(1.1, 1, "B", cex = 2)
##BiocManager::install("genomicsclass/tissuesGeneExpression")
library(tissuesGeneExpression)
data(tissuesGeneExpression)
dim(e) ##gene expression data
## [1] 22215 189
table(tissue) ##tissue[i] corresponds to e[,i]
## tissue
## cerebellum colon endometrium hippocampus kidney liver
## 38 34 15 31 39 26
## placenta
## 6
Interested in identifying similar samples and similar genes
Euclidean distance as for two dimensions. E.g., the distance between two samples \(i\) and \(j\) is:
\[ \mbox{dist}(i,j) = \sqrt{ \sum_{g=1}^{22215} (Y_{g,i}-Y_{g,j })^2 } \]
and the distance between two features \(h\) and \(g\) is:
\[ \mbox{dist}(h,g) = \sqrt{ \sum_{i=1}^{189} (Y_{h,i}-Y_{g,i})^2 } \]
The distance between samples \(i\) and \(j\) can be written as:
\[ \mbox{dist}(i,j) = \sqrt{ (\mathbf{Y}_i - \mathbf{Y}_j)^\top(\mathbf{Y}_i - \mathbf{Y}_j) }\]
with \(\mathbf{Y}_i\) and \(\mathbf{Y}_j\) columns \(i\) and \(j\).
t(matrix(1:3, ncol = 1))
## [,1] [,2] [,3]
## [1,] 1 2 3
matrix(1:3, ncol = 1)
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
t(matrix(1:3, ncol = 1)) %*% matrix(1:3, ncol = 1)
## [,1]
## [1,] 14
Note: R is very efficient at matrix algebra
kidney1 <- e[, 1]
kidney2 <- e[, 2]
colon1 <- e[, 87]
sqrt(sum((kidney1 - kidney2) ^ 2))
## [1] 85.8546
sqrt(sum((kidney1 - colon1) ^ 2))
## [1] 122.8919
dim(e)
## [1] 22215 189
(d <- dist(t(e[, c(1, 2, 87)])))
## GSM11805.CEL.gz GSM11814.CEL.gz
## GSM11814.CEL.gz 85.8546
## GSM92240.CEL.gz 122.8919 115.4773
class(d)
## [1] "dist"
Excerpt from ?dist:
dist(
x,
method = "euclidean",
diag = FALSE,
upper = FALSE,
p = 2
)
dist
class output from dist()
is used for many clustering algorithms and heatmap functionsCaution: dist(e)
creates a 22215 x 22215 matrix that will probably crash your R session.
\[x_{gi} \leftarrow \frac{(x_{gi} - \bar{x}_g)}{s_g}\]
Simulate the heights of twin pairs:
set.seed(1)
n <- 100
y <- t(MASS::mvrnorm(n, c(0, 0), matrix(c(1, 0.95, 0.95, 1), 2, 2)))
dim(y)
## [1] 2 100
cor(t(y))
## [,1] [,2]
## [1,] 1.0000000 0.9433295
## [2,] 0.9433295 1.0000000
z1 = (y[1,] + y[2,]) / 2 #the sum
z2 = (y[1,] - y[2,]) #the difference
z = rbind(z1, z2) #matrix now same dimensions as y
thelim <- c(-3, 3)
rafalib::mypar(1, 2)
plot(
y[1,],
y[2,],
xlab = "Twin 1 (standardized height)",
ylab = "Twin 2 (standardized height)",
xlim = thelim,
ylim = thelim,
main = "Original twin heights"
)
points(y[1, 1:2], y[2, 1:2], col = 2, pch = 16)
plot(
z[1,],
z[2,],
xlim = thelim,
ylim = thelim,
xlab = "Average height",
ylab = "Difference in height",
main = "Manual PCA-like projection"
)
points(z[1, 1:2] , z[2, 1:2], col = 2, pch = 16)
rafalib::mypar()
d = dist(t(y))
d3 = dist(z[1, ]) * sqrt(2) ##distance computed using just first dimension mypar(1,1)
plot(as.numeric(d),
as.numeric(d3),
xlab = "Pairwise distances in 2 dimensions",
ylab = "Pairwise distances in 1 dimension")
abline(0, 1, col = "red")
SVD generalizes the example rotation we looked at:
\[\mathbf{Y} = \mathbf{UDV}^\top\]
note: the above formulation is for \(m > n\)
Scaling:
system.time(e.standardize.slow <-
t(apply(e, 1, function(x)
x - mean(x))))
## user system elapsed
## 1.560 0.080 1.678
system.time(e.standardize.fast <- t(scale(t(e), scale = FALSE)))
## user system elapsed
## 0.137 0.001 0.139
all.equal(e.standardize.slow[, 1], e.standardize.fast[, 1])
## [1] TRUE
SVD:
s <- svd(e.standardize.fast)
names(s)
## [1] "d" "u" "v"
dim(s$u) # loadings
## [1] 22215 189
length(s$d) # eigenvalues
## [1] 189
dim(s$v) # d %*% vT = scores
## [1] 189 189
rafalib::mypar()
p <- prcomp(t(e.standardize.fast))
plot(s$u[, 1] ~ p$rotation[, 1])
Lesson: u and v can each be multiplied by -1 arbitrarily
plot(p$rotation[, 1],
xlab = "Index of genes",
ylab = "Loadings of PC1",
main = "Scores of PC1") #or, predict(p)
abline(h = c(-0.03, 0.04), col = "red")
Genes with high PC1 loadings:
e.pc1genes <-
e.standardize.fast[p$rotation[, 1] < -0.03 |
p$rotation[, 1] > 0.03,]
pheatmap::pheatmap(
e.pc1genes,
scale = "none",
show_rownames = TRUE,
show_colnames = FALSE
)
rafalib::mypar()
plot(
p$sdev ^ 2 / sum(p$sdev ^ 2) * 100,
xlim = c(0, 150),
type = "b",
ylab = "% variance explained",
main = "Screeplot"
)
Alternatively as cumulative % variance explained (using cumsum()
function):
rafalib::mypar()
plot(
cumsum(p$sdev ^ 2) / sum(p$sdev ^ 2) * 100,
ylab = "cumulative % variance explained",
ylim = c(0, 100),
type = "b",
main = "Cumulative screeplot"
)
prcomp()
), scores \(\mathbf{D V^T}\)rafalib::mypar()
plot(
p$x[, 1:2],
xlab = "PC1",
ylab = "PC2",
main = "plot of p$x[, 1:2]",
col = factor(tissue),
pch = as.integer(factor(tissue))
)
legend(
"topright",
legend = levels(factor(tissue)),
col = 1:length(unique(tissue)),
pch = 1:length(unique(tissue)),
bty = 'n'
)
d <- as.dist(1 - cor(e.standardize.fast))
mds <- cmdscale(d)
rafalib::mypar()
plot(mds, col = factor(tissue), pch = as.integer(factor(tissue)))
legend(
"topright",
legend = levels(factor(tissue)),
col = 1:length(unique(tissue)),
bty = 'n',
pch = 1:length(unique(tissue))
)
library(Biobase)
library(genefilter)
library(GSE5859) ## BiocInstaller::biocLite("genomicsclass/GSE5859")
data(GSE5859)
geneExpression = exprs(e)
sampleInfo = pData(e)
head(table(sampleInfo$date))
##
## 2002-10-31 2002-11-15 2002-11-19 2002-11-21 2002-11-22 2002-11-27
## 2 2 2 5 4 2
year <- as.integer(format(sampleInfo$date, "%y"))
year <- year - min(year)
month = as.integer(format(sampleInfo$date, "%m")) + 12 * year
table(year, sampleInfo$ethnicity)
##
## year ASN CEU HAN
## 0 0 32 0
## 1 0 54 0
## 2 0 13 0
## 3 80 3 0
## 4 2 0 24
pc <- prcomp(t(geneExpression), scale. = TRUE)
boxplot(
pc$x[, 1] ~ month,
varwidth = TRUE,
notch = TRUE,
main = "PC1 scores vs. month",
xlab = "Month",
ylab = "PC1"
)
A starting point for a color palette:
RColorBrewer::display.brewer.all(n = 3, colorblindFriendly = TRUE)
Interpolate one color per month on a quantitative palette:
col3 <- c(RColorBrewer::brewer.pal(n = 3, "Greys")[2:3], "black")
MYcols <- colorRampPalette(col3, space = "Lab")(length(unique(month)))
d <- as.dist(1 - cor(geneExpression))
mds <- cmdscale(d)
plot(mds, col = MYcols[as.integer(factor(month))],
main = "MDS shaded by batch")
hcclass <- cutree(hclust(d), k = 5)
table(hcclass, year)
## year
## hcclass 0 1 2 3 4
## 1 0 2 8 1 0
## 2 25 43 0 2 0
## 3 6 0 0 0 0
## 4 1 7 0 0 0
## 5 0 2 5 80 26
Exploratory Data Analysis: http://genomicsclass.github.io/book/pages/exploratory_data_analysis_exercises.html
Given the above histogram, how many people are between the ages of 35 and 45?
The InsectSprays
data set is included in R. The dataset reports the counts of insects in agricultural experimental units treated with different insecticides. Make a boxplot and determine which insecticide appears to be most effective.
Download and load this dataset into R. Use exploratory data analysis tools to determine which two columns are different from the rest. Which is the column that is positively skewed?
Which is the column that is negatively skewed?
Let’s consider a random sample of finishers from the New York City Marathon in 2002. This dataset can be found in the UsingR package. Load the library and then load the nym.2002
dataset.
suppressPackageStartupMessages(library(dplyr))
data(nym.2002, package="UsingR")
Use boxplots and histograms to compare the finishing times of males and females. Which of the following best describes the difference?
A) Males and females have the same distribution.
B) Most males are faster than most women.
C) Male and females have similar right skewed distributions with the former, 20 minutes shifted to the left.
D) Both distribution are normally distributed with a difference in mean of about 30 minutes.
Use dplyr
to create two new data frames: males and females, with the data for each gender. For males, what is the Pearson correlation between age and time to finish?
For females, what is the Pearson correlation between age and time to finish?
If we interpret these correlations without visualizing the data, we would conclude that the older we get, the slower we run marathons, regardless of gender. Look at scatterplots and boxplots of times stratified by age groups (20-25, 25-30, etc..). After examining the data, what is a more reasonable conclusion?