Setup
Load relevant packages
library(minfi)
library(limma)
library(matrixStats)
library(MASS)
library(sva)
library(Hmisc)
#library( missMethyl )
library(ggplot2)
library(qqman)
# Set up color palettes for plotting
myColors <- c("dodgerblue", "firebrick1", "seagreen3")
graphColors <- c(
"#023FA5", "#7D87B9", "#BEC1D4", "#D6BCC0", "#BB7784", "#D33F6A", "#11C638", "#8DD593", "#C6DEC7", "#EAD3C6",
"#F0B98D", "#EF9708", "#0FCFC0", "#9CDED6", "#D5EAE7", "#F3E1EB", "#F6C4E1", "#F79CD4", "#4A6FE3", "#8595E1",
"#B5BBE3", "#E6AFB9", "#E07B91"
)
Setting file paths for data
# Setting file paths for data
# The rest of this script assumes that your data are in a folder called "project" on the Cloud.
# As you work on your own computer, you will need to specify the folder locations.
# Folder location of the data files
data_dir <- "/cloud/project/Data/"
data_dir
## [1] "/cloud/project/Data/"
Read in the data
# Need noob, combat.beta, pd.complete files
load(file.path(data_dir, "noob.rda"))
load(file.path(data_dir, "combat-beta.rda"))
load(file.path(data_dir, "pd-complete.rda"))
pd <- pData(noob)
# Make sure these output to TRUE
identical(colnames(combat.beta), colnames(noob))
## [1] TRUE
identical(rownames(pd.complete), colnames(noob))
## [1] TRUE
identical(rownames(noob), rownames(combat.beta))
## [1] TRUE
Map DNA methylation data to the genome
mapped <- mapToGenome(noob)
## Loading required package: IlluminaHumanMethylation450kanno.ilmn12.hg19
noob <- noob[match(rownames(mapped), rownames(noob)), ]
combat.beta <- combat.beta[match(rownames(mapped), rownames(combat.beta)), ]
save(mapped, file = file.path(data_dir, "mapped-noob.rda"))
pre.beta <- getBeta(noob)
chrnames <- as.character(seqnames(mapped))
pos <- as.numeric(start(mapped))
rm(noob, mapped)
# Compute PCs for cell type
cellscores <- prcomp((pd.complete[, c("CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran")]), center = T, scale. = F)
round((cellscores$sdev^2 / sum(cellscores$sdev^2)) * 100, digits = 2) # Variance explained by each PC
## [1] 86.72 7.28 3.08 2.38 0.51 0.02
pd.complete <- data.frame(pd.complete, cellscores$x)
save(pd.complete, file = file.path(data_dir, "pd-complete.rda"))
Box Plots and t-statistics for each cell type
# Box Plots and t-statistics for each cell type
celltypes <- c("CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran")
for (i in celltypes) {
myttest <- t.test(pd.complete[which(pd.complete$casestatus == "RA"), i], pd.complete[which(pd.complete$casestatus == "Control"), i])
thisplot <- data.frame(pd.complete[, i], pd.complete$casestatus)
colnames(thisplot) <- c("CellComp", "Status")
p1 <- ggplot(thisplot, aes(factor(Status), CellComp)) +
geom_boxplot() +
xlab("Case Status") +
labs(title = paste("Cell Type ", i, ", p-value of ", format(myttest$p.value, digits = 3, scientific = TRUE), sep = "")) +
scale_y_continuous(limits = c(0, 1))
print(p1)
}






Single site association testing
# Construct the model matrix
mod <- model.matrix(~ factor(pd.complete$casestatus) + pd.complete$age + factor(pd.complete$gender) + factor(pd.complete$smoking) + cellscores$x[, 1])
# Run the single site association model
out <- lmFit(combat.beta, mod)
out <- eBayes(out)
ss.hits <- topTable(out, coef = 2, number = nrow(combat.beta))
ss.hits[1:10, ]
## logFC AveExpr t P.Value adj.P.Val B
## cg11442732 0.3699116 0.4046268 14.50118 3.568283e-07 0.1138831 6.895382
## cg02291365 0.4012610 0.1423684 13.33762 6.950165e-07 0.1138831 6.370619
## cg03390207 0.0909542 0.6345475 11.92089 1.688981e-06 0.1138831 5.635448
## cg03456771 0.3348556 0.7288353 11.87544 1.740492e-06 0.1138831 5.609896
## cg26374305 0.2796207 0.4984591 11.77432 1.861498e-06 0.1138831 5.552577
## cg09532691 0.1391447 0.7897108 11.00648 3.156028e-06 0.1138831 5.095187
## cg04313735 0.1199025 0.5233330 10.93230 3.326886e-06 0.1138831 5.048834
## cg14409166 0.1816781 0.8039799 10.88448 3.442475e-06 0.1138831 5.018742
## cg03039243 0.1014243 0.5349671 10.76574 3.749334e-06 0.1138831 4.943294
## cg21960303 0.1077232 0.7877348 10.75663 3.774096e-06 0.1138831 4.937465
rm(mod, out)
QQ plot
# Make a qq plot of our data
qq(ss.hits$P.Value)

Calculate a lambda statistic
# Calculate a lambda statistic
observed <- -log10(sort(ss.hits$P.Value, decreasing = F))
expected <- -log10(ppoints(length(ss.hits$P.Value)))
lambda <- median(observed) / median(expected)
lambda
## [1] 1.824154
Make a manhattan plot of our data
# Grab the correct position and chromosome vectors
pos.ss <- pos[match(rownames(ss.hits), rownames(combat.beta))]
chr.ss <- chrnames[match(rownames(ss.hits), rownames(combat.beta))]
chr.ss.num <- as.numeric(unlist(lapply(chr.ss, function(x) {
strsplit(x, "chr", fixed = TRUE)[[1]][2]
})))
## Warning: NAs introduced by coercion
# Construct data frame for function
formanhattan <- data.frame(pos.ss, chr.ss.num, ss.hits$P.Value)
colnames(formanhattan) <- c("BP", "CHR", "P")
formanhattan <- formanhattan[-which(is.na(formanhattan$CHR)), ]
# Call function
manhattan(formanhattan)
## Warning in manhattan(formanhattan): No SNP column found. OK unless you're trying
## to highlight.

Plot the top 6 hits and examine case vs control beta differences
# Create a mini result list of the first 6 hits
mytrunc <- ss.hits[1:6, ]
# Create a results object to populate with the case-control differences
mymeans <- matrix(0, nrow(mytrunc), 2)
# Plot the results: here we are formatting the data in a certain way in order to fit
# the ggplot2 plotting function. We have also shifted the position of the controls by 10bp
# so that their methylation distribution is easier to see. We are also populating our object
# "mymeans" with the case (first column) and control (second column) means at each site.
for (k in 1:nrow(mytrunc)) {
probe <- which(rownames(pre.beta) == rownames(mytrunc)[k])
grabBeta <- pre.beta[probe, ]
formbeta <- c()
for (leonard in 1:length(grabBeta)) {
tempbeta <- grabBeta[leonard]
formbeta <- c(formbeta, tempbeta)
}
reppos <- rep(pos.ss[k], each = ncol(pre.beta))
# reppos.real<-pos[reppos]
xmin <- min(reppos) - 10
xmax <- max(reppos) + 20
status <- pd.complete$casestatus
reppos.real.shift <- ifelse(status == "Control", reppos + 10, reppos)
toplot <- data.frame((as.numeric(formbeta)), jitter(reppos.real.shift), status)
rownames(toplot) <- c()
# toplot<-data.frame(toplot)
colnames(toplot) <- c("Beta", "Position", "Status")
casemean <- mean(as.numeric(grabBeta[which(status == "RA")]))
contmean <- mean(as.numeric(grabBeta[which(status == "Control")]))
casedat <- data.frame(x = (pos.ss[k] - 2):(pos.ss[k] + 2), y = mean(as.numeric(grabBeta[which(status == "RA")])), color = "red")
contdat <- data.frame(x = (pos.ss[k] + 8):(pos.ss[k] + 12), y = mean(as.numeric(grabBeta[which(status == "Control")])), color = "chartreuse")
mymeans[k, 1] <- mean(as.numeric(grabBeta[which(status == "RA")]))
mymeans[k, 2] <- mean(as.numeric(grabBeta[which(status == "Control")]))
meanDiff <- round((mean(as.numeric(grabBeta[which(status == "RA")])) - mean(as.numeric(grabBeta[which(status == "Control")]))) * 100, digits = 1)
pVal <- format(mytrunc$P.Value[k], digits = 3)
p2 <- ggplot(data = toplot, aes(x = (Position), y = (Beta), color = factor(Status))) +
geom_point(size = 2) +
scale_colour_manual(values = c("dodgerblue", "black")) +
theme(legend.direction = "vertical", legend.position = c(0.1, 0.95), legend.title = element_blank(), panel.background = element_blank()) +
ylab("Percent Methylation") +
xlab(paste0(rownames(mytrunc)[k], ":", " Delta M = ", meanDiff, " at p-val ", pVal)) +
scale_x_continuous(limits = c(xmin, xmax)) +
theme(axis.text.x = element_blank()) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.25), labels = c("0", "25", "50", "75", "100")) +
geom_line(data = casedat, aes(x = x, y = y), color = "red") +
geom_line(data = contdat, aes(x = x, y = y), color = "chartreuse")
#+stat_summary(fun.y=mean,geom="line",size=1)
print(p2)
}






rm(pos.ss, chr.ss, chr.ss.num, formanhattan, pre.beta)
Gene ontology analysis
# Need to pick a significance cutoff for inclusion. Here is 1x10-5, but may need to be flexible with data.
# gene.ontology <- gometh( as.character( rownames( ss.hits )[ ss.hits$P.Value < 1e-5 ] ), all.cpg = as.character( rownames( ss.hits ) ), plot.bias = FALSE, prior.prob = TRUE )
# memory requirements of missMethyl package may be too much for RStudio cloud, load the premade gene.ontology object:
load(file.path(data_dir, "Premade_Intermediate_Files/gene-ontology.rda"))
dim(gene.ontology)
## [1] 22710 6
summary(gene.ontology$P.DE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0004689 1.0000000 1.0000000 0.9545232 1.0000000 1.0000000
summary(gene.ontology$FDR)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
gene.ontology <- gene.ontology[order(gene.ontology$P.DE), ]
head(gene.ontology)
## ONTOLOGY TERM N DE P.DE
## GO:0060997 BP dendritic spine morphogenesis 56 3 0.0004689500
## GO:0097061 BP dendritic spine organization 78 3 0.0009737338
## GO:0106027 BP neuron projection organization 86 3 0.0010624276
## GO:0060422 MF peptidyl-dipeptidase inhibitor activity 1 1 0.0011992584
## GO:0060996 BP dendritic spine development 93 3 0.0015877015
## GO:0045598 BP regulation of fat cell differentiation 131 3 0.0016976997
## FDR
## GO:0060997 1
## GO:0097061 1
## GO:0106027 1
## GO:0060422 1
## GO:0060996 1
## GO:0045598 1
# Some functions that will be useful in making the plot
wrap.it <- function(x, len) {
sapply(x, function(y) {
paste(strwrap(y, len),
collapse = "\n"
)
},
USE.NAMES = FALSE
)
}
# Call this function with a list or vector
wrap.labels <- function(x, len) {
if (is.list(x)) {
lapply(x, wrap.it, len)
} else {
wrap.it(x, len)
}
}
par(mai = c(1, 4, 1, 1))
barplot(abs(log(as.numeric(gene.ontology$P.DE[1:10]), base = 10)),
main = "Gene Ontology", horiz = TRUE, names.arg = wrap.labels(gene.ontology$TERM[1:10], 50),
xlab = "-log10(P value)", col = "dodgerblue", las = 1, cex.axis = 1.2, cex.main = 1.4, cex.lab = 1, cex.names = 1, space = 0.4, xlim = c(0, 5)
)
