Setup

Load relevant packages

library(minfi)
library(limma)
library(matrixStats)
library(MASS)
library(sva)
library(Hmisc)
#library( missMethyl )
library(ggplot2)
library(qqman)
# Set up color palettes for plotting
myColors <- c("dodgerblue", "firebrick1", "seagreen3")
graphColors <- c(
  "#023FA5", "#7D87B9", "#BEC1D4", "#D6BCC0", "#BB7784", "#D33F6A", "#11C638", "#8DD593", "#C6DEC7", "#EAD3C6",
  "#F0B98D", "#EF9708", "#0FCFC0", "#9CDED6", "#D5EAE7", "#F3E1EB", "#F6C4E1", "#F79CD4", "#4A6FE3", "#8595E1",
  "#B5BBE3", "#E6AFB9", "#E07B91"
)

Setting file paths for data

# Setting file paths for data

# The rest of this script assumes that your data are in a folder called "project" on the Cloud.
# As you work on your own computer, you will need to specify the folder locations.

# Folder location of the data files
data_dir <- "/cloud/project/Data/"
data_dir
## [1] "/cloud/project/Data/"

Read in the data

# Need noob, combat.beta, pd.complete files
load(file.path(data_dir, "noob.rda"))
load(file.path(data_dir, "combat-beta.rda"))
load(file.path(data_dir, "pd-complete.rda"))
pd <- pData(noob)

# Make sure these output to TRUE
identical(colnames(combat.beta), colnames(noob))
## [1] TRUE
identical(rownames(pd.complete), colnames(noob))
## [1] TRUE
identical(rownames(noob), rownames(combat.beta))
## [1] TRUE

Map DNA methylation data to the genome

mapped <- mapToGenome(noob)
## Loading required package: IlluminaHumanMethylation450kanno.ilmn12.hg19
noob <- noob[match(rownames(mapped), rownames(noob)), ]
combat.beta <- combat.beta[match(rownames(mapped), rownames(combat.beta)), ]
save(mapped, file = file.path(data_dir, "mapped-noob.rda"))

pre.beta <- getBeta(noob)
chrnames <- as.character(seqnames(mapped))
pos <- as.numeric(start(mapped))

rm(noob, mapped)
# Compute PCs for cell type
cellscores <- prcomp((pd.complete[, c("CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran")]), center = T, scale. = F)
round((cellscores$sdev^2 / sum(cellscores$sdev^2)) * 100, digits = 2) # Variance explained by each PC
## [1] 86.72  7.28  3.08  2.38  0.51  0.02
pd.complete <- data.frame(pd.complete, cellscores$x)

save(pd.complete, file = file.path(data_dir, "pd-complete.rda"))

Box Plots and t-statistics for each cell type

# Box Plots and t-statistics for each cell type
celltypes <- c("CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran")

for (i in celltypes) {
  myttest <- t.test(pd.complete[which(pd.complete$casestatus == "RA"), i], pd.complete[which(pd.complete$casestatus == "Control"), i])
  thisplot <- data.frame(pd.complete[, i], pd.complete$casestatus)
  colnames(thisplot) <- c("CellComp", "Status")
  p1 <- ggplot(thisplot, aes(factor(Status), CellComp)) +
    geom_boxplot() +
    xlab("Case Status") +
    labs(title = paste("Cell Type ", i, ", p-value of ", format(myttest$p.value, digits = 3, scientific = TRUE), sep = "")) +
    scale_y_continuous(limits = c(0, 1))
  print(p1)
}

Single site association testing

# Construct the model matrix
mod <- model.matrix(~ factor(pd.complete$casestatus) + pd.complete$age + factor(pd.complete$gender) + factor(pd.complete$smoking) + cellscores$x[, 1])

# Run the single site association model
out <- lmFit(combat.beta, mod)
out <- eBayes(out)
ss.hits <- topTable(out, coef = 2, number = nrow(combat.beta))

ss.hits[1:10, ]
##                logFC   AveExpr        t      P.Value adj.P.Val        B
## cg11442732 0.3699116 0.4046268 14.50118 3.568283e-07 0.1138831 6.895382
## cg02291365 0.4012610 0.1423684 13.33762 6.950165e-07 0.1138831 6.370619
## cg03390207 0.0909542 0.6345475 11.92089 1.688981e-06 0.1138831 5.635448
## cg03456771 0.3348556 0.7288353 11.87544 1.740492e-06 0.1138831 5.609896
## cg26374305 0.2796207 0.4984591 11.77432 1.861498e-06 0.1138831 5.552577
## cg09532691 0.1391447 0.7897108 11.00648 3.156028e-06 0.1138831 5.095187
## cg04313735 0.1199025 0.5233330 10.93230 3.326886e-06 0.1138831 5.048834
## cg14409166 0.1816781 0.8039799 10.88448 3.442475e-06 0.1138831 5.018742
## cg03039243 0.1014243 0.5349671 10.76574 3.749334e-06 0.1138831 4.943294
## cg21960303 0.1077232 0.7877348 10.75663 3.774096e-06 0.1138831 4.937465
rm(mod, out)

QQ plot

# Make a qq plot of our data
qq(ss.hits$P.Value)

Calculate a lambda statistic

# Calculate a lambda statistic
observed <- -log10(sort(ss.hits$P.Value, decreasing = F))
expected <- -log10(ppoints(length(ss.hits$P.Value)))
lambda <- median(observed) / median(expected)
lambda
## [1] 1.824154

Make a manhattan plot of our data

# Grab the correct position and chromosome vectors
pos.ss <- pos[match(rownames(ss.hits), rownames(combat.beta))]
chr.ss <- chrnames[match(rownames(ss.hits), rownames(combat.beta))]
chr.ss.num <- as.numeric(unlist(lapply(chr.ss, function(x) {
  strsplit(x, "chr", fixed = TRUE)[[1]][2]
})))
## Warning: NAs introduced by coercion
# Construct data frame for function
formanhattan <- data.frame(pos.ss, chr.ss.num, ss.hits$P.Value)
colnames(formanhattan) <- c("BP", "CHR", "P")

formanhattan <- formanhattan[-which(is.na(formanhattan$CHR)), ]
# Call function
manhattan(formanhattan)
## Warning in manhattan(formanhattan): No SNP column found. OK unless you're trying
## to highlight.

Plot the top 6 hits and examine case vs control beta differences

# Create a mini result list of the first 6 hits
mytrunc <- ss.hits[1:6, ]

# Create a results object to populate with the case-control differences
mymeans <- matrix(0, nrow(mytrunc), 2)


# Plot the results: here we are formatting the data in a certain way in order to fit
# the ggplot2 plotting function. We have also shifted the position of the controls by 10bp
# so that their methylation distribution is easier to see. We are also populating our object
# "mymeans" with the case (first column) and control (second column) means at each site.
for (k in 1:nrow(mytrunc)) {
  probe <- which(rownames(pre.beta) == rownames(mytrunc)[k])
  grabBeta <- pre.beta[probe, ]

  formbeta <- c()
  for (leonard in 1:length(grabBeta)) {
    tempbeta <- grabBeta[leonard]
    formbeta <- c(formbeta, tempbeta)
  }
  reppos <- rep(pos.ss[k], each = ncol(pre.beta))
  # reppos.real<-pos[reppos]
  xmin <- min(reppos) - 10
  xmax <- max(reppos) + 20
  status <- pd.complete$casestatus
  reppos.real.shift <- ifelse(status == "Control", reppos + 10, reppos)
  toplot <- data.frame((as.numeric(formbeta)), jitter(reppos.real.shift), status)
  rownames(toplot) <- c()
  # toplot<-data.frame(toplot)
  colnames(toplot) <- c("Beta", "Position", "Status")

  casemean <- mean(as.numeric(grabBeta[which(status == "RA")]))
  contmean <- mean(as.numeric(grabBeta[which(status == "Control")]))
  casedat <- data.frame(x = (pos.ss[k] - 2):(pos.ss[k] + 2), y = mean(as.numeric(grabBeta[which(status == "RA")])), color = "red")
  contdat <- data.frame(x = (pos.ss[k] + 8):(pos.ss[k] + 12), y = mean(as.numeric(grabBeta[which(status == "Control")])), color = "chartreuse")

  mymeans[k, 1] <- mean(as.numeric(grabBeta[which(status == "RA")]))
  mymeans[k, 2] <- mean(as.numeric(grabBeta[which(status == "Control")]))

  meanDiff <- round((mean(as.numeric(grabBeta[which(status == "RA")])) - mean(as.numeric(grabBeta[which(status == "Control")]))) * 100, digits = 1)
  pVal <- format(mytrunc$P.Value[k], digits = 3)

  p2 <- ggplot(data = toplot, aes(x = (Position), y = (Beta), color = factor(Status))) +
    geom_point(size = 2) +
    scale_colour_manual(values = c("dodgerblue", "black")) +
    theme(legend.direction = "vertical", legend.position = c(0.1, 0.95), legend.title = element_blank(), panel.background = element_blank()) +
    ylab("Percent Methylation") +
    xlab(paste0(rownames(mytrunc)[k], ":", " Delta M = ", meanDiff, " at p-val ", pVal)) +
    scale_x_continuous(limits = c(xmin, xmax)) +
    theme(axis.text.x = element_blank()) +
    scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.25), labels = c("0", "25", "50", "75", "100")) +
    geom_line(data = casedat, aes(x = x, y = y), color = "red") +
    geom_line(data = contdat, aes(x = x, y = y), color = "chartreuse")

  #+stat_summary(fun.y=mean,geom="line",size=1)
  print(p2)
}

rm(pos.ss, chr.ss, chr.ss.num, formanhattan, pre.beta)

Gene ontology analysis

# Need to pick a significance cutoff for inclusion. Here is 1x10-5, but may need to be flexible with data.
# gene.ontology <- gometh( as.character( rownames( ss.hits )[ ss.hits$P.Value < 1e-5 ] ), all.cpg = as.character( rownames( ss.hits ) ), plot.bias = FALSE, prior.prob = TRUE )

# memory requirements of missMethyl package may be too much for RStudio cloud, load the premade gene.ontology object:

load(file.path(data_dir, "Premade_Intermediate_Files/gene-ontology.rda"))

dim(gene.ontology)
## [1] 22710     6
summary(gene.ontology$P.DE)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0004689 1.0000000 1.0000000 0.9545232 1.0000000 1.0000000
summary(gene.ontology$FDR)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
gene.ontology <- gene.ontology[order(gene.ontology$P.DE), ]
head(gene.ontology)
##            ONTOLOGY                                    TERM   N DE         P.DE
## GO:0060997       BP           dendritic spine morphogenesis  56  3 0.0004689500
## GO:0097061       BP            dendritic spine organization  78  3 0.0009737338
## GO:0106027       BP          neuron projection organization  86  3 0.0010624276
## GO:0060422       MF peptidyl-dipeptidase inhibitor activity   1  1 0.0011992584
## GO:0060996       BP             dendritic spine development  93  3 0.0015877015
## GO:0045598       BP  regulation of fat cell differentiation 131  3 0.0016976997
##            FDR
## GO:0060997   1
## GO:0097061   1
## GO:0106027   1
## GO:0060422   1
## GO:0060996   1
## GO:0045598   1
# Some functions that will be useful in making the plot
wrap.it <- function(x, len) {
  sapply(x, function(y) {
    paste(strwrap(y, len),
      collapse = "\n"
    )
  },
  USE.NAMES = FALSE
  )
}
# Call this function with a list or vector
wrap.labels <- function(x, len) {
  if (is.list(x)) {
    lapply(x, wrap.it, len)
  } else {
    wrap.it(x, len)
  }
}

par(mai = c(1, 4, 1, 1))
barplot(abs(log(as.numeric(gene.ontology$P.DE[1:10]), base = 10)),
  main = "Gene Ontology", horiz = TRUE, names.arg = wrap.labels(gene.ontology$TERM[1:10], 50),
  xlab = "-log10(P value)", col = "dodgerblue", las = 1, cex.axis = 1.2, cex.main = 1.4, cex.lab = 1, cex.names = 1, space = 0.4, xlim = c(0, 5)
)