Levi Waldron
“The greatest value of a picture is when it forces us to notice what we never expected to see.” - John W. Tukey
plot()
and cor()
library(GSE5859Subset)
data(GSE5859Subset) ##this loads three tables
c(class(geneExpression), class(sampleInfo))
#> [1] "matrix" "array" "data.frame"
rbind(dim(geneExpression), dim(sampleInfo))
#> [,1] [,2]
#> [1,] 8793 24
#> [2,] 24 4
head(sampleInfo)
#> ethnicity date filename group
#> 107 ASN 2005-06-23 GSM136508.CEL.gz 1
#> 122 ASN 2005-06-27 GSM136530.CEL.gz 1
#> 113 ASN 2005-06-27 GSM136517.CEL.gz 1
#> 163 ASN 2005-10-28 GSM136576.CEL.gz 1
#> 153 ASN 2005-10-07 GSM136566.CEL.gz 1
#> 161 ASN 2005-10-07 GSM136574.CEL.gz 1
T-test for every row (gene) of gene expression matrix:
library(genefilter)
g <- factor(sampleInfo$group)
system.time(results <- rowttests(geneExpression, g))
#> user system elapsed
#> 0.004 0.000 0.004
pvals <- results$p.value
Note that these 8,793 tests are done in about 0.01s
par(mar = c(4, 4, 0, 0))
plot(results$dm,
-log10(results$p.value),
xlab = "Effect size (difference in group means)",
ylab = "- log (base 10) p-values")
abline(h = -log10(0.05 / nrow(geneExpression)), col = "red")
legend("bottomright",
lty = 1,
col = "red",
legend = "Bonferroni = 0.05")
Note that permuting these data doesn’t produce an ideal null p-value histogram due to batch effects:
ComBat
: corrects for known batch effects by linear
model), andsva
: creates surrogate variables for unknown batch
effects, corrects the structure of permutation p-valuesbatchelor
for single-cell analysisComBat
and sva
are available from the sva Bioconductor
package
rafalib::mypar(1, 2)
pseudo <- apply(geneExpression, 1, median)
plot(geneExpression[, 1], pseudo)
plot((geneExpression[, 1] + pseudo) / 2, (geneExpression[, 1] - pseudo))
affyPLM::MAplots
better MA plots
ComplexHeatmap
package is the best as of 2023: large
datasets, interactive heatmaps, simple defaults but many customizations
possibleCombination of RColorBrewer
package and
colorRampPalette()
can create anything you want
“Pie charts are a very bad way of displaying information.” - R Help
library(Biobase)
library(genefilter)
library(GSE5859) ## BiocInstaller::biocLite("genomicsclass/GSE5859")
data(GSE5859)
geneExpression = exprs(e)
sampleInfo = pData(e)
ExpressionSet
object is obsolete, we use
SummarizedExperiment
nowpc <- prcomp(t(geneExpression), scale. = TRUE)
boxplot(
pc$x[, 1] ~ month,
varwidth = TRUE,
notch = TRUE,
main = "PC1 scores vs. month",
xlab = "Month",
ylab = "PC1"
)
A starting point for a color palette:
Interpolate one color per month on a quantitative palette:
d <- as.dist(1 - cor(geneExpression))
mds <- cmdscale(d)
plot(mds, col = MYcols[as.integer(factor(month))],
main = "MDS shaded by month")
batchelor::rescaleBatches()
limma::removeBatchEffect()
, sva::comBat()
,
batchelor::regressBatches()
batchelor::fastMNN()