STAT 540 Homework 1

Shaun Jackman — Mar 8, 2013, 10:08 AM

# STAT 540 Homework 1
# Shaun Jackman
# 2013-02-18

library(lattice)
library(limma)
library(preprocessCore)
library(reshape2)

design <- read.table('gse7191_design.txt')
design$sample <- factor(row.names(design))
design$DateRun <- as.Date(design$DateRun, format='%m/%d/%y')
design$Genotype <- relevel(design$Genotype, 'Wild_type')
data <- t(read.table('GSE7191_data.txt', header=TRUE, row.names=1))
tall <- melt(cbind(design, data), id.vars=colnames(design),
    variable.name = 'probe', value.name = 'expression')

# > 1a Indicate how many probes and how many samples are available.
cat('Number of probes:', ncol(data))

Number of probes: 12422

cat('Number of samples:', nrow(data))

Number of samples: 50


# > 1b What is the breakdown of samples for Genotype, BrainRegion and Sex within each DateRun?
by(design, design$DateRun,
    function(x) summary(subset(x, select=c(Genotype, BrainRegion, Sex))))

design$DateRun: 2003-08-14
      Genotype      BrainRegion     Sex   
 Wild_type:4   hippocampus:4    female:8  
 S1P2_KO  :4   neocortex  :4    male  :0  
 S1P3_KO  :0                              
-------------------------------------------------------- 
design$DateRun: 2003-08-21
      Genotype      BrainRegion     Sex   
 Wild_type:8   hippocampus:4    female:0  
 S1P2_KO  :0   neocortex  :4    male  :8  
 S1P3_KO  :0                              
-------------------------------------------------------- 
design$DateRun: 2003-09-11
      Genotype      BrainRegion     Sex   
 Wild_type:7   hippocampus:3    female:6  
 S1P2_KO  :0   neocortex  :4    male  :1  
 S1P3_KO  :0                              
-------------------------------------------------------- 
design$DateRun: 2003-10-23
      Genotype      BrainRegion     Sex   
 Wild_type:0   hippocampus:4    female:2  
 S1P2_KO  :7   neocortex  :3    male  :5  
 S1P3_KO  :0                              
-------------------------------------------------------- 
design$DateRun: 2003-12-18
      Genotype      BrainRegion     Sex   
 Wild_type:1   hippocampus:2    female:3  
 S1P2_KO  :1   neocortex  :3    male  :2  
 S1P3_KO  :3                              
-------------------------------------------------------- 
design$DateRun: 2004-01-16
      Genotype      BrainRegion     Sex   
 Wild_type:0   hippocampus:4    female:3  
 S1P2_KO  :0   neocortex  :3    male  :4  
 S1P3_KO  :7                              
-------------------------------------------------------- 
design$DateRun: 2004-03-11
      Genotype      BrainRegion     Sex   
 Wild_type:0   hippocampus:2    female:0  
 S1P2_KO  :4   neocortex  :2    male  :4  
 S1P3_KO  :0                              
-------------------------------------------------------- 
design$DateRun: 2004-07-23
      Genotype      BrainRegion     Sex   
 Wild_type:0   hippocampus:2    female:4  
 S1P2_KO  :4   neocortex  :2    male  :0  
 S1P3_KO  :0


# > 1c Create a plot showing the raw data for one probe, labeled by sample type (choose at least one factor to label, such as genotype).
stripplot(expression ~ Genotype, tall, group = BrainRegion,
    subset = probe == '100001_at',
    type = c('p', 'a'), jitter.data=TRUE, auto.key=TRUE, grid=TRUE)

plot of chunk unnamed-chunk-1


# > 1d Report the average expression of the selected probe across Genotype, BrainRegion and Sex, i.e. one of the 12 average expression values you report will be for wild type neocortex in the male mice.
oneprobe = subset(tall, probe == '100001_at')
by(oneprobe[,c('expression')], oneprobe[,c('Genotype', 'BrainRegion', 'Sex')],
    mean)

Genotype: Wild_type
BrainRegion: hippocampus
Sex: female
[1] 4.334
-------------------------------------------------------- 
Genotype: S1P2_KO
BrainRegion: hippocampus
Sex: female
[1] 4.255
-------------------------------------------------------- 
Genotype: S1P3_KO
BrainRegion: hippocampus
Sex: female
[1] 4.395
-------------------------------------------------------- 
Genotype: Wild_type
BrainRegion: neocortex
Sex: female
[1] 4.302
-------------------------------------------------------- 
Genotype: S1P2_KO
BrainRegion: neocortex
Sex: female
[1] 4.278
-------------------------------------------------------- 
Genotype: S1P3_KO
BrainRegion: neocortex
Sex: female
[1] 4.421
-------------------------------------------------------- 
Genotype: Wild_type
BrainRegion: hippocampus
Sex: male
[1] 4.357
-------------------------------------------------------- 
Genotype: S1P2_KO
BrainRegion: hippocampus
Sex: male
[1] 4.303
-------------------------------------------------------- 
Genotype: S1P3_KO
BrainRegion: hippocampus
Sex: male
[1] 4.476
-------------------------------------------------------- 
Genotype: Wild_type
BrainRegion: neocortex
Sex: male
[1] 4.308
-------------------------------------------------------- 
Genotype: S1P2_KO
BrainRegion: neocortex
Sex: male
[1] 4.349
-------------------------------------------------------- 
Genotype: S1P3_KO
BrainRegion: neocortex
Sex: male
[1] 4.311


# > 2a Order the data by run dates and examine batch effects using a heatmap.
heatmap(data[order(design$DateRun),], Rowv=NA)

plot of chunk unnamed-chunk-1

heatmap(data)

plot of chunk unnamed-chunk-1

outlier <- 'GSM172976'

# > 2b Document, identify and remove the worst outlier observed in the heatmap.
cat('The outlier is', outlier)

The outlier is GSM172976

design.clean <- subset(design, sample != outlier)
data.clean <- data[design$sample != outlier,]
tall.clean <- subset(tall, sample != outlier)

# > 2c Scatter plot this outlier, to-be-removed sample against the other samples from the same group (i.e., same Genotype, BrainRegion and Sex)
x <- design[outlier,c('Genotype', 'BrainRegion', 'Sex')]
same.as.outlier <- subset(design,
    subset = Genotype == x$Genotype & BrainRegion == x$BrainRegion & Sex == x$Sex
    )$sample
splom(t(data[same.as.outlier,]))

plot of chunk unnamed-chunk-1


# > 2d Re-make the heatmap of the sample correlation matrix after removing the outlier.
heatmap(data.clean)

plot of chunk unnamed-chunk-1

# > Comment on the results.
# The initial clustering of the data showed that a single sample was isolated
# from all other samples in the dendrogram. After removing the outlier, no one
# sample is isolated from the rest.

# > 3a Re-normalize the datasets we have provided (before removing the outlier) using quantile normalization.
data.norm <- t(normalize.quantiles(t(data)))
dimnames(data.norm) <- dimnames(data)
tall.norm <- melt(cbind(design, data.norm), id.vars=colnames(design),
    variable.name = 'probe', value.name = 'expression')
# > Compare before and after re-normalization datasets using box plots. Identify the outlier with a different color.
sample.col <- rep('blue', nrow(design))
sample.col[design$sample == outlier] <- 'red'
boxplot(t(data), col=sample.col)

plot of chunk unnamed-chunk-1

boxplot(t(data.norm), col=sample.col)

plot of chunk unnamed-chunk-1


# > 3b Repeat a) for the clean data (after removing the outlier).
data.clean.norm <- t(normalize.quantiles(t(data.clean)))
dimnames(data.clean.norm) <- dimnames(data.clean)
tall.clean.norm <- melt(cbind(design.clean, data.clean.norm),
    id.vars=colnames(design.clean),
    variable.name = 'probe', value.name = 'expression')
boxplot(t(data.clean))

plot of chunk unnamed-chunk-1

boxplot(t(data.clean.norm))

plot of chunk unnamed-chunk-1


# > 3c Comment and describe the results from a) and b).
# > Does the outlier affect the normalization?
# No, the outlier does not greatly affect the normalization.
# > Does normalization have the desired effect in the clean data?
# Yes, the normalization does have the desired effect in the clean data. Namely,
# the data is now quantile normalized.
# > Does normalization solve the problem of the outlier?
# No, normalization does not solve the problem of the outlier.

# > 3d Compare the normalized data of the outlying sample against other samples in the same group (i.e., repeat 2c for the normalized data).
splom(t(data[same.as.outlier,]), xlab='Without quantile normalization')

plot of chunk unnamed-chunk-1

splom(t(data.norm[same.as.outlier,]), xlab='With quantile normalization')

plot of chunk unnamed-chunk-1

# > Does normalization solve the problem of the outlier?
# No. Quantile normalization does not solve the problem of the outlier.

# > 4a Write out, in an equation or English or, ideally, both, the model you are fitting. In the context of that model, what statistical test(s) are you conducting to find the requested probe hits?
# I am fitting a linear model with three parameters:
# 1. the mean expression for the probe, also known as the intercept
# 2. the effect on expression due to the Genotype S1P2_KO for that probe
# 3. the effect on expression due to the Genotype S1P3_KO for that probe
# I am conducting an F-test to determine whether the effect on expression due to
# both genotypes S1P2_KO and S1P3_KO is zero.

# > 4b How many probes have p-value < 10e-4?
mm <- model.matrix(~Genotype, design)
eb <- eBayes(lmFit(t(data), mm))
hits <- topTable(eb, coef = grep('Genotype', colnames(mm)), number=Inf)
countHits <- function(hits, p.value=10e-4) {
    cat('Number of probes with a p-value <', p.value , ':',
        sum(hits$P.Value < p.value), '\n')
    cat('Number of probes with an adjusted p-value <', p.value , ':',
        sum(hits$adj.P.Val < p.value))
}
countHits(hits)

Number of probes with a p-value < 0.001 : 165 
Number of probes with an adjusted p-value < 0.001 : 7

stripplot(expression ~ Genotype | probe, tall.norm,
    subset = probe %in% subset(hits, adj.P.Val < 10e-4)$ID,
    type=c('p', 'a'), jitter.data=TRUE, grid=TRUE,
    scales=list(relation='sliced'))

plot of chunk unnamed-chunk-1

# > Order results in ascending order of p-values and display the top 50 probes in a heat map.
top50 <- topTable(eb, coef = grep('Genotype', colnames(mm)),
    number=50)$ID
heatmap(data.norm[,top50])

plot of chunk unnamed-chunk-1


# > 4c State explicitly what, if any, adjustment has been made to these p-values with respect to multiple testing.
# The p-values have been adjusted using Benjamini & Hochberg multiple testing
# correction to control the false discovery rate.
# > What is the associated probability statement for the probes reported in 4b?
# Using an adjusted p-value threshold of 10e-4, which equals 1e-3,
# the probablity that a reported discovery is false is 10e-4 = 1e-3 = 0.001.
# We expect one false discovery for every 1,000 reported discoveries.

# > 4d Provide a stripplot for the top-ranking probe and report the associated p-value.
stripplot(expression ~ Genotype, tall.norm, subset = probe == top50[1],
    type=c('p', 'a'), jitter.data=TRUE, grid=TRUE)

plot of chunk unnamed-chunk-1

cat('The p-value is:', hits[1,]$P.Value)

The p-value is: 7.234e-11

cat('The adjusted p-value is:', hits[1,]$adj.P.Val)

The adjusted p-value is: 8.987e-07

# > Do the same for a probe near the bottom of the overall ranking (that is, a "boring" gene).
boring <- tail(hits, n=1)
stripplot(expression ~ Genotype, tall.norm, subset = probe == boring$ID,
    type=c('p', 'a'), jitter.data=TRUE, grid=TRUE)

plot of chunk unnamed-chunk-1

cat('The p-value is:', boring$P.Value)

The p-value is: 0.9998

cat('The adjusted p-value is:', boring$adj.P.Val)

The adjusted p-value is: 0.9998

# > Comment on the differences you observe.
# The top-ranking probe has clearly different expression levels for all three
# genotypes and a low p-value, whereas the boring probe has nearly identical
# expression levels for all three genotypes and a high p-value.

# > 4e Test the null hypothesis that the mean probe expression of the S1P3 knockout is the same as that of the wild type.
mm <- model.matrix(~ 0 + Genotype, design)
fit <- lmFit(t(data.norm), mm)
eb <- eBayes(contrasts.fit(fit,
    makeContrasts(P3vsWT = GenotypeS1P3_KO - GenotypeWild_type, levels=mm)))
# > How many probes have a p-value < 10e-4?
P3vsWT.hits <- topTable(eb, number=Inf)
countHits(P3vsWT.hits)

Number of probes with a p-value < 0.001 : 160 
Number of probes with an adjusted p-value < 0.001 : 4

stripplot(expression ~ Genotype | probe, tall.norm,
    subset = probe %in% subset(P3vsWT.hits, adj.P.Val < 10e-4)$ID
        & Genotype %in% c('Wild_type', 'S1P3_KO'),
    type=c('p', 'a'), jitter.data=TRUE, grid=TRUE,
    scales=list(relation='sliced'))

plot of chunk unnamed-chunk-1


# > 4f Repeat the test for differential expression across the 3 genotypes using the clean normalized dataset (i.e., the normalized data after removing the outlier).
mm <- model.matrix(~Genotype, design.clean)
eb <- eBayes(lmFit(t(data.clean.norm), mm))
hits.clean <- topTable(eb, coef = grep('Genotype', colnames(mm)), number=Inf)
countHits(hits.clean)

Number of probes with a p-value < 0.001 : 148 
Number of probes with an adjusted p-value < 0.001 : 8

stripplot(expression ~ Genotype | probe, tall.norm,
    subset = probe %in% subset(hits.clean, adj.P.Val < 10e-4)$ID,
    type=c('p', 'a'), jitter.data=TRUE, grid=TRUE,
    scales=list(relation='sliced'))

plot of chunk unnamed-chunk-1

# > Compare the list of probes with p < 10e-4 between both analyses.
compareHits <- function(hits.id, hits.clean.id) {
    df <- data.frame(
        Raw = colnames(data) %in% hits.id,
        Clean = colnames(data) %in% hits.clean.id)
    print(addmargins(with(df, table(Raw, Clean))))
    vennDiagram(df)
}
compareHits(
    subset(hits, P.Value < 10e-4)$ID,
    subset(hits.clean, P.Value < 10e-4)$ID)

       Clean
Raw     FALSE  TRUE   Sum
  FALSE 12197    60 12257
  TRUE     77    88   165
  Sum   12274   148 12422

plot of chunk unnamed-chunk-1

compareHits(
    subset(hits, adj.P.Val < 10e-4)$ID,
    subset(hits.clean, adj.P.Val < 10e-4)$ID)

       Clean
Raw     FALSE  TRUE   Sum
  FALSE 12412     3 12415
  TRUE      2     5     7
  Sum   12414     8 12422

plot of chunk unnamed-chunk-1


# > 5a Using the parametrization of your choice, fit a 3x2 full factorial model.
# > Test for a difference of expression across the 6 groups.
# > How many probesets have an overall p-value < 10e-4?
mm <- model.matrix(~ Genotype * BrainRegion, design.clean)
eb <- eBayes(lmFit(t(data.clean.norm), mm))
hits <- topTable(eb, coef = grep('(Intercept)', colnames(mm), invert=TRUE),
    number=Inf)
countHits(hits)

Number of probes with a p-value < 0.001 : 2000 
Number of probes with an adjusted p-value < 0.001 : 1524


# > 5b Use an F-test to test the null hypothesis that all the interaction effects between Genotype and BrainRegion are zero.
# > How many probes have a p-value < 0.05?
hits <- topTable(eb, coef = grep(':', colnames(mm)), number=Inf)
countHits(hits, 0.05)

Number of probes with a p-value < 0.05 : 211 
Number of probes with an adjusted p-value < 0.05 : 0


# > 5c Identify the probe with the lowest p-value and pick additional "boring" gene (with respect to Genotype:BrainRegion interaction effects).
# > For this contrasting pair of probes, report the relevant statistical results and plot their data illustrating the interaction effect.
interesting = hits[1,]
boring = hits[9999,]
rbind(interesting, boring)

                     ID GenotypeS1P2_KO.BrainRegionneocortex
162182_f_at 162182_f_at                             -0.21966
162476_r_at 162476_r_at                              0.04564
            GenotypeS1P3_KO.BrainRegionneocortex AveExpr      F  P.Value
162182_f_at                             -0.16995   8.530 7.9180 0.001039
162476_r_at                             -0.03779   6.593 0.1426 0.867460
            adj.P.Val
162182_f_at         1
162476_r_at         1

stripplot(expression ~ Genotype | probe, tall.clean.norm,
    group = BrainRegion,
    subset = probe %in% c(interesting$ID, boring$ID),
    type=c('p', 'a'), jitter.data=TRUE, grid=TRUE, auto.key=TRUE,
    scales=list(relation='sliced'))

plot of chunk unnamed-chunk-1


# > 5d In problem 4, you identified probes with significant differential expression among the three genotypes. In 5a, you fit a more complex model considering the joint effect of Genotype and BrainRegion on each probe expression.
# > For each probe, use an F-test to test if the more complex model with Genotype and BrainRegion effects is better than the simpler model with only Genotype effect.
hits <- topTable(eb, coef = grep('BrainRegion', colnames(mm)), number=Inf)
countHits(hits)

Number of probes with a p-value < 0.001 : 1995 
Number of probes with an adjusted p-value < 0.001 : 1599

# > Draw a density plot of the corresponding p-values and comment on the results.
densityplot(~P.Value, hits)

plot of chunk unnamed-chunk-1

# The density plot appears to be a mixture of a uniform distribution, expected
# of probes for which the null hypothesis is true, and a peaked distribution
# of probes for which we can reject the null hypothesis.