Setup

Load relevant packages

library(minfi)
library(limma)
library(matrixStats)
library(MASS)
library(sva)
library(Hmisc)
#library( missMethyl )
library(ggplot2)
library(qqman)

# Set up color palettes for plotting
myColors <- c("dodgerblue", "firebrick1", "seagreen3")
graphColors <- c(
  "#023FA5", "#7D87B9", "#BEC1D4", "#D6BCC0", "#BB7784", "#D33F6A", "#11C638", "#8DD593", "#C6DEC7", "#EAD3C6",
  "#F0B98D", "#EF9708", "#0FCFC0", "#9CDED6", "#D5EAE7", "#F3E1EB", "#F6C4E1", "#F79CD4", "#4A6FE3", "#8595E1",
  "#B5BBE3", "#E6AFB9", "#E07B91"
)

Setting file paths for data

# Setting file paths for data

# The rest of this script assumes that your data are in a folder called "project" on the Cloud.
# As you work on your own computer, you will need to specify the folder locations.

# Folder location of the data files
data_dir <- "/cloud/project/Data/"
data_dir

## [1] "/cloud/project/Data/"

Read in the data

# Need noob, combat.beta, pd.complete files
load(file.path(data_dir, "noob.rda"))
load(file.path(data_dir, "combat-beta.rda"))
load(file.path(data_dir, "pd-complete.rda"))
pd <- pData(noob)

# Make sure these output to TRUE
identical(colnames(combat.beta), colnames(noob))

## [1] TRUE

identical(rownames(pd.complete), colnames(noob))

## [1] TRUE

identical(rownames(noob), rownames(combat.beta))

## [1] TRUE

Map DNA methylation data to the genome

mapped <- mapToGenome(noob)

## Loading required package: IlluminaHumanMethylation450kanno.ilmn12.hg19

noob <- noob[match(rownames(mapped), rownames(noob)), ]
combat.beta <- combat.beta[match(rownames(mapped), rownames(combat.beta)), ]
save(mapped, file = file.path(data_dir, "mapped-noob.rda"))

pre.beta <- getBeta(noob)
chrnames <- as.character(seqnames(mapped))
pos <- as.numeric(start(mapped))

rm(noob, mapped)

# Compute PCs for cell type
cellscores <- prcomp((pd.complete[, c("CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran")]), center = T, scale. = F)
round((cellscores$sdev^2 / sum(cellscores$sdev^2)) * 100, digits = 2) # Variance explained by each PC

## [1] 86.72  7.28  3.08  2.38  0.51  0.02

pd.complete <- data.frame(pd.complete, cellscores$x)

save(pd.complete, file = file.path(data_dir, "pd-complete.rda"))

Box Plots and t-statistics for each cell type

# Box Plots and t-statistics for each cell type
celltypes <- c("CD8T", "CD4T", "NK", "Bcell", "Mono", "Gran")

for (i in celltypes) {
  myttest <- t.test(pd.complete[which(pd.complete$casestatus == "RA"), i], pd.complete[which(pd.complete$casestatus == "Control"), i])
  thisplot <- data.frame(pd.complete[, i], pd.complete$casestatus)
  colnames(thisplot) <- c("CellComp", "Status")
  p1 <- ggplot(thisplot, aes(factor(Status), CellComp)) +
    geom_boxplot() +
    xlab("Case Status") +
    labs(title = paste("Cell Type ", i, ", p-value of ", format(myttest$p.value, digits = 3, scientific = TRUE), sep = "")) +
    scale_y_continuous(limits = c(0, 1))
  print(p1)
}

Single site association testing

# Construct the model matrix
mod <- model.matrix(~ factor(pd.complete$casestatus) + pd.complete$age + factor(pd.complete$gender) + factor(pd.complete$smoking) + cellscores$x[, 1])

# Run the single site association model
out <- lmFit(combat.beta, mod)
out <- eBayes(out)
ss.hits <- topTable(out, coef = 2, number = nrow(combat.beta))

ss.hits[1:10, ]

##                logFC   AveExpr        t      P.Value adj.P.Val        B
## cg11442732 0.3699116 0.4046268 14.50118 3.568283e-07 0.1138831 6.895382
## cg02291365 0.4012610 0.1423684 13.33762 6.950165e-07 0.1138831 6.370619
## cg03390207 0.0909542 0.6345475 11.92089 1.688981e-06 0.1138831 5.635448
## cg03456771 0.3348556 0.7288353 11.87544 1.740492e-06 0.1138831 5.609896
## cg26374305 0.2796207 0.4984591 11.77432 1.861498e-06 0.1138831 5.552577
## cg09532691 0.1391447 0.7897108 11.00648 3.156028e-06 0.1138831 5.095187
## cg04313735 0.1199025 0.5233330 10.93230 3.326886e-06 0.1138831 5.048834
## cg14409166 0.1816781 0.8039799 10.88448 3.442475e-06 0.1138831 5.018742
## cg03039243 0.1014243 0.5349671 10.76574 3.749334e-06 0.1138831 4.943294
## cg21960303 0.1077232 0.7877348 10.75663 3.774096e-06 0.1138831 4.937465

rm(mod, out)

QQ plot

# Make a qq plot of our data
qq(ss.hits$P.Value)

Calculate a lambda statistic

# Calculate a lambda statistic
observed <- -log10(sort(ss.hits$P.Value, decreasing = F))
expected <- -log10(ppoints(length(ss.hits$P.Value)))
lambda <- median(observed) / median(expected)
lambda

## [1] 1.824154

Make a manhattan plot of our data

# Grab the correct position and chromosome vectors
pos.ss <- pos[match(rownames(ss.hits), rownames(combat.beta))]
chr.ss <- chrnames[match(rownames(ss.hits), rownames(combat.beta))]
chr.ss.num <- as.numeric(unlist(lapply(chr.ss, function(x) {
  strsplit(x, "chr", fixed = TRUE)[[1]][2]
})))

## Warning: NAs introduced by coercion

# Construct data frame for function
formanhattan <- data.frame(pos.ss, chr.ss.num, ss.hits$P.Value)
colnames(formanhattan) <- c("BP", "CHR", "P")

formanhattan <- formanhattan[-which(is.na(formanhattan$CHR)), ]

# Call function
manhattan(formanhattan)

## Warning in manhattan(formanhattan): No SNP column found. OK unless you're trying
## to highlight.

Plot the top 6 hits and examine case vs control beta differences

# Create a mini result list of the first 6 hits
mytrunc <- ss.hits[1:6, ]

# Create a results object to populate with the case-control differences
mymeans <- matrix(0, nrow(mytrunc), 2)


# Plot the results: here we are formatting the data in a certain way in order to fit
# the ggplot2 plotting function. We have also shifted the position of the controls by 10bp
# so that their methylation distribution is easier to see. We are also populating our object
# "mymeans" with the case (first column) and control (second column) means at each site.
for (k in 1:nrow(mytrunc)) {
  probe <- which(rownames(pre.beta) == rownames(mytrunc)[k])
  grabBeta <- pre.beta[probe, ]

  formbeta <- c()
  for (leonard in 1:length(grabBeta)) {
    tempbeta <- grabBeta[leonard]
    formbeta <- c(formbeta, tempbeta)
  }
  reppos <- rep(pos.ss[k], each = ncol(pre.beta))
  # reppos.real<-pos[reppos]
  xmin <- min(reppos) - 10
  xmax <- max(reppos) + 20
  status <- pd.complete$casestatus
  reppos.real.shift <- ifelse(status == "Control", reppos + 10, reppos)
  toplot <- data.frame((as.numeric(formbeta)), jitter(reppos.real.shift), status)
  rownames(toplot) <- c()
  # toplot<-data.frame(toplot)
  colnames(toplot) <- c("Beta", "Position", "Status")

  casemean <- mean(as.numeric(grabBeta[which(status == "RA")]))
  contmean <- mean(as.numeric(grabBeta[which(status == "Control")]))
  casedat <- data.frame(x = (pos.ss[k] - 2):(pos.ss[k] + 2), y = mean(as.numeric(grabBeta[which(status == "RA")])), color = "red")
  contdat <- data.frame(x = (pos.ss[k] + 8):(pos.ss[k] + 12), y = mean(as.numeric(grabBeta[which(status == "Control")])), color = "chartreuse")

  mymeans[k, 1] <- mean(as.numeric(grabBeta[which(status == "RA")]))
  mymeans[k, 2] <- mean(as.numeric(grabBeta[which(status == "Control")]))

  meanDiff <- round((mean(as.numeric(grabBeta[which(status == "RA")])) - mean(as.numeric(grabBeta[which(status == "Control")]))) * 100, digits = 1)
  pVal <- format(mytrunc$P.Value[k], digits = 3)

  p2 <- ggplot(data = toplot, aes(x = (Position), y = (Beta), color = factor(Status))) +
    geom_point(size = 2) +
    scale_colour_manual(values = c("dodgerblue", "black")) +
    theme(legend.direction = "vertical", legend.position = c(0.1, 0.95), legend.title = element_blank(), panel.background = element_blank()) +
    ylab("Percent Methylation") +
    xlab(paste0(rownames(mytrunc)[k], ":", " Delta M = ", meanDiff, " at p-val ", pVal)) +
    scale_x_continuous(limits = c(xmin, xmax)) +
    theme(axis.text.x = element_blank()) +
    scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.25), labels = c("0", "25", "50", "75", "100")) +
    geom_line(data = casedat, aes(x = x, y = y), color = "red") +
    geom_line(data = contdat, aes(x = x, y = y), color = "chartreuse")

  #+stat_summary(fun.y=mean,geom="line",size=1)
  print(p2)
}

rm(pos.ss, chr.ss, chr.ss.num, formanhattan, pre.beta)

Gene ontology analysis

# Need to pick a significance cutoff for inclusion. Here is 1x10-5, but may need to be flexible with data.
# gene.ontology <- gometh( as.character( rownames( ss.hits )[ ss.hits$P.Value < 1e-5 ] ), all.cpg = as.character( rownames( ss.hits ) ), plot.bias = FALSE, prior.prob = TRUE )

# memory requirements of missMethyl package may be too much for RStudio cloud, load the premade gene.ontology object:

load(file.path(data_dir, "Premade_Intermediate_Files/gene-ontology.rda"))

dim(gene.ontology)

## [1] 22710     6

summary(gene.ontology$P.DE)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0004689 1.0000000 1.0000000 0.9545232 1.0000000 1.0000000

summary(gene.ontology$FDR)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1

gene.ontology <- gene.ontology[order(gene.ontology$P.DE), ]
head(gene.ontology)

##            ONTOLOGY                                    TERM   N DE         P.DE
## GO:0060997       BP           dendritic spine morphogenesis  56  3 0.0004689500
## GO:0097061       BP            dendritic spine organization  78  3 0.0009737338
## GO:0106027       BP          neuron projection organization  86  3 0.0010624276
## GO:0060422       MF peptidyl-dipeptidase inhibitor activity   1  1 0.0011992584
## GO:0060996       BP             dendritic spine development  93  3 0.0015877015
## GO:0045598       BP  regulation of fat cell differentiation 131  3 0.0016976997
##            FDR
## GO:0060997   1
## GO:0097061   1
## GO:0106027   1
## GO:0060422   1
## GO:0060996   1
## GO:0045598   1

# Some functions that will be useful in making the plot
wrap.it <- function(x, len) {
  sapply(x, function(y) {
    paste(strwrap(y, len),
      collapse = "\n"
    )
  },
  USE.NAMES = FALSE
  )
}
# Call this function with a list or vector
wrap.labels <- function(x, len) {
  if (is.list(x)) {
    lapply(x, wrap.it, len)
  } else {
    wrap.it(x, len)
  }
}

par(mai = c(1, 4, 1, 1))
barplot(abs(log(as.numeric(gene.ontology$P.DE[1:10]), base = 10)),
  main = "Gene Ontology", horiz = TRUE, names.arg = wrap.labels(gene.ontology$TERM[1:10], 50),
  xlab = "-log10(P value)", col = "dodgerblue", las = 1, cex.axis = 1.2, cex.main = 1.4, cex.lab = 1, cex.names = 1, space = 0.4, xlim = c(0, 5)
)

Epigenomics for Social Scientists

03 Single Site, Gene Ontology Association Analysis

Kelly Bakulski, Shan Andrews, John Dou, Jonah Fisher, Erin Ware

January 2021

Setup

Load relevant packages

Setting file paths for data

Read in the data

Map DNA methylation data to the genome

Box Plots and t-statistics for each cell type

Single site association testing

QQ plot

Calculate a lambda statistic

Make a manhattan plot of our data

Plot the top 6 hits and examine case vs control beta differences

Gene ontology analysis