Exploratory analysis of raw data

  1. “Dealing with” NAs
  2. Determine threshold and counts
  3. Count of methylated and unmethylated probes
  4. Compare control samples across assays (scatterplots of beta-values)
  5. Identify probes with difference > 0.2 between controls

Load data and packages

library(knitr)
library(plyr)
library(ggplot2)
library(latticeExtra)
library(xtable)
library(directlabels)  # allows for clearer labelling
library(lattice)

load("All_3_metasets.Rdata")
load("All_3_sets.Rdata")
dataList <- list(ALL = ALL.dat, APL = APL.dat, CTRL = CTRL.dat)

1) Counting and plotting NAs by probe

na_per_samp <- lapply(dataList, function(x) apply(x, 2, function(y) sum(is.na(y))))
na_per_samp <- lapply(na_per_samp, sort)

par(mai = c(2,1,1,0.5))
plot(unlist(na_per_samp), xaxt = 'n', ylab = '# NA readings (/485577)', 
     xlab = 'Sample', col = rep(c('red', 'green', 'blue'), laply(na_per_samp, length)), 
     main = 'Number of NA readings per sample')
axis(1, at = 1:length(unlist(na_per_samp)), 
     labels = do.call(c, lapply(na_per_samp, names)), las = 2, cex = 0.8)
legend('topleft', legend = names(na_per_samp), fill = c('red', 'green', 'blue'))

plot of chunk NAplot


Tables of NA counts per probe:

na_per_probe <- lapply(dataList, function(x) apply(x, 1, function(x) sum(is.na(x))))
llply(na_per_probe, table)
## $ALL
## 
##      0      1      2      3      4      5      6      7      8      9 
## 462250  20220   2358    478    142     55     30     14      7      4 
##     10     11     12     13     14     16     18     20     30 
##      5      5      3      1      1      1      1      1      1 
## 
## $APL
## 
##      0      1      2      3      4      5      7 
## 482070   3440     58      2      3      3      1 
## 
## $CTRL
## 
##      0      1      2      3      4 
## 483502   2008     52     12      3

Not a huge amount of probes to worry about, so we left this alone.

2) Finding methylation thresholds

### Calculate means across samples for each probe, and reshape for
### plotting
dat.probeMeans <- unlist(lapply(dataList, function(x) apply(x, 1, function(y) mean(y, 
    na.rm = T))))
plotDat <- data.frame(Beta = dat.probeMeans, Data = rep(c("ALL", "APL", "CTRL"), 
    each = nrow(ALL.dat)))

direct.label(densityplot(~Beta, data = plotDat, groups = Data, plot.points = F, 
    lwd = 2, panel = function(...) {
        panel.densityplot(...)
        panel.abline(v = 0.2, col = "red")
        panel.abline(v = 0.7, col = "red")
    }, xlab = "Beta", main = "Density of Beta-Values for 3 arrays"))

plot of chunk densities

3) Count of methylated probes per sample (\( \beta > 0.7 \))

### Percentages: (excluding na's)
meth_probes <- llply(dataList, function(x) apply(x, 2, function(y) sum(y > 0.7, 
    na.rm = T)/length(y)))
meth_probes <- llply(meth_probes, sort)
meth <- unlist(meth_probes)
names(meth) <- my.strsplit(names(meth), "\\.", i = 2)

## Plot these:
colors <- rep(c("red", "green", "blue"), laply(meth_probes, length))
par(mai = c(2, 1, 1, 0.5))
barplot(meth, col = colors, las = 2, ylab = "% Meth", main = "Proportion methylated (beta>0.8) probes per sample")
legend("topleft", legend = names(meth_probes), fill = c("red", "green", "blue"), 
    cex = 0.7)

plot of chunk MethProbesCount

4) Count of unmethylated probes per sample (\( \beta < 0.2 \))

### Percentages: (excluding na's)
unmeth_probes <- llply(dataList, function(x) apply(x, 2, function(y) sum(y < 
    0.2, na.rm = T)/length(y)))
unmeth_probes <- llply(unmeth_probes, sort)
unmeth <- unlist(unmeth_probes)
names(unmeth) <- my.strsplit(names(unmeth), "\\.", i = 2)

## Plot these:
colors <- rep(c("red", "green", "blue"), laply(unmeth_probes, length))
par(mai = c(2, 1, 1, 0.5))
barplot(unmeth, col = colors, las = 2, ylab = "% Unmeth", main = "Proportion unmethylated (beta<0.2) probes per sample")
legend("topleft", legend = names(unmeth_probes), fill = c("red", "green", "blue"), 
    cex = 0.7)

plot of chunk MethProbes

5) Ctrl sample scatterplots for probe filtering (\( \Delta \beta < 0.2 \))

Function for custom pairs plotting with upper triangular matrix of correlations

my.pairs <- function(..., textCex = 2, textCol = "red", method = c("pearson", 
    "spearman")) {
    method <- match.arg(method)
    cor.panel <- function(x, y) text(mean(range(x)), mean(range(y)), labels = format(cor(x, 
        y, method = method), digits = 3), cex = textCex, col = textCol)
    pairs(..., upper.panel = cor.panel)
}

All 5 types of control samples from all data sets:

pbmc <- names(CTRL.dat)[1:3]
lcl <- names(CTRL.dat)[4:6]
nbc <- names(CTRL.dat)[7:9]
abc <- names(ALL.dat)[grep("HBC", names(ALL.dat))]
hbm <- names(APL.dat)[grep("HBM", names(APL.dat))]

ctrls.samps.list <- list(PBMC = pbmc, LCL = lcl, NBC = nbc, ABC = abc, HBM = hbm)
ctrls.samps <- unlist(ctrls.samps.list)

Some data cleaning for plotting

## Bind all data sets into one data frame:
whole.dat <- cbind(ALL.dat, APL.dat, CTRL.dat)

### Calculate average beta-value over all sets of controls:
AVG.CTRL.dat <- lapply(ctrls.samps.list, function(x) apply(whole.dat[, x], 1, 
    function(x) mean(x, na.rm = T)))
AVG.CTRL.dat <- do.call(cbind, AVG.CTRL.dat)

#### There are still na's (when I try to calculate correlation)
#### Temporarily remove probes with any NAs for calculation of
#### correlations:
AVG.noNA.dat <- AVG.CTRL.dat[apply(AVG.CTRL.dat, 1, function(x) !any(is.na(x))), 
    ]
my.pairs(AVG.noNA.dat, lower.panel = function(...) {
    smoothScatter(..., add = T)
    abline(-0.2, 1, lty = 3, col = "red")
    abline(0.2, 1, lty = 3, col = "red")
}, method = "spearman")

plot of chunk pairsPlot