Shaun Jackman — Mar 8, 2013, 10:08 AM
# STAT 540 Homework 1
# Shaun Jackman
# 2013-02-18
library(lattice)
library(limma)
library(preprocessCore)
library(reshape2)
design <- read.table('gse7191_design.txt')
design$sample <- factor(row.names(design))
design$DateRun <- as.Date(design$DateRun, format='%m/%d/%y')
design$Genotype <- relevel(design$Genotype, 'Wild_type')
data <- t(read.table('GSE7191_data.txt', header=TRUE, row.names=1))
tall <- melt(cbind(design, data), id.vars=colnames(design),
variable.name = 'probe', value.name = 'expression')
# > 1a Indicate how many probes and how many samples are available.
cat('Number of probes:', ncol(data))
Number of probes: 12422
cat('Number of samples:', nrow(data))
Number of samples: 50
# > 1b What is the breakdown of samples for Genotype, BrainRegion and Sex within each DateRun?
by(design, design$DateRun,
function(x) summary(subset(x, select=c(Genotype, BrainRegion, Sex))))
design$DateRun: 2003-08-14
Genotype BrainRegion Sex
Wild_type:4 hippocampus:4 female:8
S1P2_KO :4 neocortex :4 male :0
S1P3_KO :0
--------------------------------------------------------
design$DateRun: 2003-08-21
Genotype BrainRegion Sex
Wild_type:8 hippocampus:4 female:0
S1P2_KO :0 neocortex :4 male :8
S1P3_KO :0
--------------------------------------------------------
design$DateRun: 2003-09-11
Genotype BrainRegion Sex
Wild_type:7 hippocampus:3 female:6
S1P2_KO :0 neocortex :4 male :1
S1P3_KO :0
--------------------------------------------------------
design$DateRun: 2003-10-23
Genotype BrainRegion Sex
Wild_type:0 hippocampus:4 female:2
S1P2_KO :7 neocortex :3 male :5
S1P3_KO :0
--------------------------------------------------------
design$DateRun: 2003-12-18
Genotype BrainRegion Sex
Wild_type:1 hippocampus:2 female:3
S1P2_KO :1 neocortex :3 male :2
S1P3_KO :3
--------------------------------------------------------
design$DateRun: 2004-01-16
Genotype BrainRegion Sex
Wild_type:0 hippocampus:4 female:3
S1P2_KO :0 neocortex :3 male :4
S1P3_KO :7
--------------------------------------------------------
design$DateRun: 2004-03-11
Genotype BrainRegion Sex
Wild_type:0 hippocampus:2 female:0
S1P2_KO :4 neocortex :2 male :4
S1P3_KO :0
--------------------------------------------------------
design$DateRun: 2004-07-23
Genotype BrainRegion Sex
Wild_type:0 hippocampus:2 female:4
S1P2_KO :4 neocortex :2 male :0
S1P3_KO :0
# > 1c Create a plot showing the raw data for one probe, labeled by sample type (choose at least one factor to label, such as genotype).
stripplot(expression ~ Genotype, tall, group = BrainRegion,
subset = probe == '100001_at',
type = c('p', 'a'), jitter.data=TRUE, auto.key=TRUE, grid=TRUE)
# > 1d Report the average expression of the selected probe across Genotype, BrainRegion and Sex, i.e. one of the 12 average expression values you report will be for wild type neocortex in the male mice.
oneprobe = subset(tall, probe == '100001_at')
by(oneprobe[,c('expression')], oneprobe[,c('Genotype', 'BrainRegion', 'Sex')],
mean)
Genotype: Wild_type
BrainRegion: hippocampus
Sex: female
[1] 4.334
--------------------------------------------------------
Genotype: S1P2_KO
BrainRegion: hippocampus
Sex: female
[1] 4.255
--------------------------------------------------------
Genotype: S1P3_KO
BrainRegion: hippocampus
Sex: female
[1] 4.395
--------------------------------------------------------
Genotype: Wild_type
BrainRegion: neocortex
Sex: female
[1] 4.302
--------------------------------------------------------
Genotype: S1P2_KO
BrainRegion: neocortex
Sex: female
[1] 4.278
--------------------------------------------------------
Genotype: S1P3_KO
BrainRegion: neocortex
Sex: female
[1] 4.421
--------------------------------------------------------
Genotype: Wild_type
BrainRegion: hippocampus
Sex: male
[1] 4.357
--------------------------------------------------------
Genotype: S1P2_KO
BrainRegion: hippocampus
Sex: male
[1] 4.303
--------------------------------------------------------
Genotype: S1P3_KO
BrainRegion: hippocampus
Sex: male
[1] 4.476
--------------------------------------------------------
Genotype: Wild_type
BrainRegion: neocortex
Sex: male
[1] 4.308
--------------------------------------------------------
Genotype: S1P2_KO
BrainRegion: neocortex
Sex: male
[1] 4.349
--------------------------------------------------------
Genotype: S1P3_KO
BrainRegion: neocortex
Sex: male
[1] 4.311
# > 2a Order the data by run dates and examine batch effects using a heatmap.
heatmap(data[order(design$DateRun),], Rowv=NA)
heatmap(data)
outlier <- 'GSM172976'
# > 2b Document, identify and remove the worst outlier observed in the heatmap.
cat('The outlier is', outlier)
The outlier is GSM172976
design.clean <- subset(design, sample != outlier)
data.clean <- data[design$sample != outlier,]
tall.clean <- subset(tall, sample != outlier)
# > 2c Scatter plot this outlier, to-be-removed sample against the other samples from the same group (i.e., same Genotype, BrainRegion and Sex)
x <- design[outlier,c('Genotype', 'BrainRegion', 'Sex')]
same.as.outlier <- subset(design,
subset = Genotype == x$Genotype & BrainRegion == x$BrainRegion & Sex == x$Sex
)$sample
splom(t(data[same.as.outlier,]))
# > 2d Re-make the heatmap of the sample correlation matrix after removing the outlier.
heatmap(data.clean)
# > Comment on the results.
# The initial clustering of the data showed that a single sample was isolated
# from all other samples in the dendrogram. After removing the outlier, no one
# sample is isolated from the rest.
# > 3a Re-normalize the datasets we have provided (before removing the outlier) using quantile normalization.
data.norm <- t(normalize.quantiles(t(data)))
dimnames(data.norm) <- dimnames(data)
tall.norm <- melt(cbind(design, data.norm), id.vars=colnames(design),
variable.name = 'probe', value.name = 'expression')
# > Compare before and after re-normalization datasets using box plots. Identify the outlier with a different color.
sample.col <- rep('blue', nrow(design))
sample.col[design$sample == outlier] <- 'red'
boxplot(t(data), col=sample.col)
boxplot(t(data.norm), col=sample.col)
# > 3b Repeat a) for the clean data (after removing the outlier).
data.clean.norm <- t(normalize.quantiles(t(data.clean)))
dimnames(data.clean.norm) <- dimnames(data.clean)
tall.clean.norm <- melt(cbind(design.clean, data.clean.norm),
id.vars=colnames(design.clean),
variable.name = 'probe', value.name = 'expression')
boxplot(t(data.clean))
boxplot(t(data.clean.norm))
# > 3c Comment and describe the results from a) and b).
# > Does the outlier affect the normalization?
# No, the outlier does not greatly affect the normalization.
# > Does normalization have the desired effect in the clean data?
# Yes, the normalization does have the desired effect in the clean data. Namely,
# the data is now quantile normalized.
# > Does normalization solve the problem of the outlier?
# No, normalization does not solve the problem of the outlier.
# > 3d Compare the normalized data of the outlying sample against other samples in the same group (i.e., repeat 2c for the normalized data).
splom(t(data[same.as.outlier,]), xlab='Without quantile normalization')
splom(t(data.norm[same.as.outlier,]), xlab='With quantile normalization')
# > Does normalization solve the problem of the outlier?
# No. Quantile normalization does not solve the problem of the outlier.
# > 4a Write out, in an equation or English or, ideally, both, the model you are fitting. In the context of that model, what statistical test(s) are you conducting to find the requested probe hits?
# I am fitting a linear model with three parameters:
# 1. the mean expression for the probe, also known as the intercept
# 2. the effect on expression due to the Genotype S1P2_KO for that probe
# 3. the effect on expression due to the Genotype S1P3_KO for that probe
# I am conducting an F-test to determine whether the effect on expression due to
# both genotypes S1P2_KO and S1P3_KO is zero.
# > 4b How many probes have p-value < 10e-4?
mm <- model.matrix(~Genotype, design)
eb <- eBayes(lmFit(t(data), mm))
hits <- topTable(eb, coef = grep('Genotype', colnames(mm)), number=Inf)
countHits <- function(hits, p.value=10e-4) {
cat('Number of probes with a p-value <', p.value , ':',
sum(hits$P.Value < p.value), '\n')
cat('Number of probes with an adjusted p-value <', p.value , ':',
sum(hits$adj.P.Val < p.value))
}
countHits(hits)
Number of probes with a p-value < 0.001 : 165
Number of probes with an adjusted p-value < 0.001 : 7
stripplot(expression ~ Genotype | probe, tall.norm,
subset = probe %in% subset(hits, adj.P.Val < 10e-4)$ID,
type=c('p', 'a'), jitter.data=TRUE, grid=TRUE,
scales=list(relation='sliced'))
# > Order results in ascending order of p-values and display the top 50 probes in a heat map.
top50 <- topTable(eb, coef = grep('Genotype', colnames(mm)),
number=50)$ID
heatmap(data.norm[,top50])
# > 4c State explicitly what, if any, adjustment has been made to these p-values with respect to multiple testing.
# The p-values have been adjusted using Benjamini & Hochberg multiple testing
# correction to control the false discovery rate.
# > What is the associated probability statement for the probes reported in 4b?
# Using an adjusted p-value threshold of 10e-4, which equals 1e-3,
# the probablity that a reported discovery is false is 10e-4 = 1e-3 = 0.001.
# We expect one false discovery for every 1,000 reported discoveries.
# > 4d Provide a stripplot for the top-ranking probe and report the associated p-value.
stripplot(expression ~ Genotype, tall.norm, subset = probe == top50[1],
type=c('p', 'a'), jitter.data=TRUE, grid=TRUE)
cat('The p-value is:', hits[1,]$P.Value)
The p-value is: 7.234e-11
cat('The adjusted p-value is:', hits[1,]$adj.P.Val)
The adjusted p-value is: 8.987e-07
# > Do the same for a probe near the bottom of the overall ranking (that is, a "boring" gene).
boring <- tail(hits, n=1)
stripplot(expression ~ Genotype, tall.norm, subset = probe == boring$ID,
type=c('p', 'a'), jitter.data=TRUE, grid=TRUE)
cat('The p-value is:', boring$P.Value)
The p-value is: 0.9998
cat('The adjusted p-value is:', boring$adj.P.Val)
The adjusted p-value is: 0.9998
# > Comment on the differences you observe.
# The top-ranking probe has clearly different expression levels for all three
# genotypes and a low p-value, whereas the boring probe has nearly identical
# expression levels for all three genotypes and a high p-value.
# > 4e Test the null hypothesis that the mean probe expression of the S1P3 knockout is the same as that of the wild type.
mm <- model.matrix(~ 0 + Genotype, design)
fit <- lmFit(t(data.norm), mm)
eb <- eBayes(contrasts.fit(fit,
makeContrasts(P3vsWT = GenotypeS1P3_KO - GenotypeWild_type, levels=mm)))
# > How many probes have a p-value < 10e-4?
P3vsWT.hits <- topTable(eb, number=Inf)
countHits(P3vsWT.hits)
Number of probes with a p-value < 0.001 : 160
Number of probes with an adjusted p-value < 0.001 : 4
stripplot(expression ~ Genotype | probe, tall.norm,
subset = probe %in% subset(P3vsWT.hits, adj.P.Val < 10e-4)$ID
& Genotype %in% c('Wild_type', 'S1P3_KO'),
type=c('p', 'a'), jitter.data=TRUE, grid=TRUE,
scales=list(relation='sliced'))
# > 4f Repeat the test for differential expression across the 3 genotypes using the clean normalized dataset (i.e., the normalized data after removing the outlier).
mm <- model.matrix(~Genotype, design.clean)
eb <- eBayes(lmFit(t(data.clean.norm), mm))
hits.clean <- topTable(eb, coef = grep('Genotype', colnames(mm)), number=Inf)
countHits(hits.clean)
Number of probes with a p-value < 0.001 : 148
Number of probes with an adjusted p-value < 0.001 : 8
stripplot(expression ~ Genotype | probe, tall.norm,
subset = probe %in% subset(hits.clean, adj.P.Val < 10e-4)$ID,
type=c('p', 'a'), jitter.data=TRUE, grid=TRUE,
scales=list(relation='sliced'))
# > Compare the list of probes with p < 10e-4 between both analyses.
compareHits <- function(hits.id, hits.clean.id) {
df <- data.frame(
Raw = colnames(data) %in% hits.id,
Clean = colnames(data) %in% hits.clean.id)
print(addmargins(with(df, table(Raw, Clean))))
vennDiagram(df)
}
compareHits(
subset(hits, P.Value < 10e-4)$ID,
subset(hits.clean, P.Value < 10e-4)$ID)
Clean
Raw FALSE TRUE Sum
FALSE 12197 60 12257
TRUE 77 88 165
Sum 12274 148 12422
compareHits(
subset(hits, adj.P.Val < 10e-4)$ID,
subset(hits.clean, adj.P.Val < 10e-4)$ID)
Clean
Raw FALSE TRUE Sum
FALSE 12412 3 12415
TRUE 2 5 7
Sum 12414 8 12422
# > 5a Using the parametrization of your choice, fit a 3x2 full factorial model.
# > Test for a difference of expression across the 6 groups.
# > How many probesets have an overall p-value < 10e-4?
mm <- model.matrix(~ Genotype * BrainRegion, design.clean)
eb <- eBayes(lmFit(t(data.clean.norm), mm))
hits <- topTable(eb, coef = grep('(Intercept)', colnames(mm), invert=TRUE),
number=Inf)
countHits(hits)
Number of probes with a p-value < 0.001 : 2000
Number of probes with an adjusted p-value < 0.001 : 1524
# > 5b Use an F-test to test the null hypothesis that all the interaction effects between Genotype and BrainRegion are zero.
# > How many probes have a p-value < 0.05?
hits <- topTable(eb, coef = grep(':', colnames(mm)), number=Inf)
countHits(hits, 0.05)
Number of probes with a p-value < 0.05 : 211
Number of probes with an adjusted p-value < 0.05 : 0
# > 5c Identify the probe with the lowest p-value and pick additional "boring" gene (with respect to Genotype:BrainRegion interaction effects).
# > For this contrasting pair of probes, report the relevant statistical results and plot their data illustrating the interaction effect.
interesting = hits[1,]
boring = hits[9999,]
rbind(interesting, boring)
ID GenotypeS1P2_KO.BrainRegionneocortex
162182_f_at 162182_f_at -0.21966
162476_r_at 162476_r_at 0.04564
GenotypeS1P3_KO.BrainRegionneocortex AveExpr F P.Value
162182_f_at -0.16995 8.530 7.9180 0.001039
162476_r_at -0.03779 6.593 0.1426 0.867460
adj.P.Val
162182_f_at 1
162476_r_at 1
stripplot(expression ~ Genotype | probe, tall.clean.norm,
group = BrainRegion,
subset = probe %in% c(interesting$ID, boring$ID),
type=c('p', 'a'), jitter.data=TRUE, grid=TRUE, auto.key=TRUE,
scales=list(relation='sliced'))
# > 5d In problem 4, you identified probes with significant differential expression among the three genotypes. In 5a, you fit a more complex model considering the joint effect of Genotype and BrainRegion on each probe expression.
# > For each probe, use an F-test to test if the more complex model with Genotype and BrainRegion effects is better than the simpler model with only Genotype effect.
hits <- topTable(eb, coef = grep('BrainRegion', colnames(mm)), number=Inf)
countHits(hits)
Number of probes with a p-value < 0.001 : 1995
Number of probes with an adjusted p-value < 0.001 : 1599
# > Draw a density plot of the corresponding p-values and comment on the results.
densityplot(~P.Value, hits)
# The density plot appears to be a mixture of a uniform distribution, expected
# of probes for which the null hypothesis is true, and a peaked distribution
# of probes for which we can reject the null hypothesis.