Some variables are missing: * age when the samples (adult and infant) are taken * Calculate BMI * Finish pre-processing with the compiled metadata

Setup

# R version 3.2.3 (2015-12-10)
# Platform: x86_64-pc-linux-gnu (64-bit)
# Running under: Ubuntu 16.04.1 LTS
# CRAN packages
library(reshape2) #1.4.2
library(dplyr) #0.5.0
library(RCurl) #1.95-4.8
library(RPMM) #1.20
library(lme4) #1.1-12
library(scales) #0.4.1
library(RColorBrewer) #1.1-2
library(gridExtra) #2.2.1
library(ggplot2) #2.2.0
library(ggsci) #2.7
library(ggdendro) #0.1-20
# Bioconductor packages
library(lumi) #2.22.1
library(methylumi) #2.16.0
library(wateRmelon) #1.10.0
library(FlowSorted.Blood.450k) #1.8.0
library(sva) #3.18.0
library(limma) #3.26.9

Functions

# from (https://stackoverflow.com/questions/13673894/suppress-nas-in-paste)
paste5 <- function(..., sep = " ", collapse = NULL, na.rm = F) {
  if (na.rm == F)
    paste(..., sep = sep, collapse = collapse)
  else
    if (na.rm == T) {
      paste.na <- function(x, sep) {
        x <- gsub("^\\s+|\\s+$", "", x)
        ret <- paste(na.omit(x), collapse = sep)
        is.na(ret) <- ret == ""
        return(ret)
      }
      df <- data.frame(..., stringsAsFactors = F)
      ret <- apply(df, 1, FUN = function(x) paste.na(x, sep))

      if (is.null(collapse))
        ret
      else {
        paste.na(ret, sep = collapse)
      }
    }
}

Load the data from Genome Studio and make methylumi objects

allFile <- ("raw/DBC_alldata.txt") 
qcFile <- ("raw/DBC_qcfile.txt") 

DBC.2 <- methylumiR(allFile, qcfile = qcFile)
save(DBC.2, file="1-DBC_raw.RData")
head(fData(DBC.2))
head(pData(DBC.2))
dim(DBC.2)

There are 865,918 probes for 176 samples in total

Meta data

# Importing sample file
sampleFile <- ("raw/DBC_samplefile.txt")
meta_DBC <- read.delim(sampleFile, nrows = 176)
head(meta_DBC)
# Change the Sample Label column to represent the individual
meta_DBC$Sample_Label <- gsub("_REP[0123456789]", "", meta_DBC$Sample_Label)

# Importing interview data
meta1 <- readRDS("meta/dbc_int1.rds")
meta2 <- readRDS("meta/dbc_int2.rds")
meta3 <- readRDS("meta/dbc_int3.rds")
meta4 <- readRDS("meta/dbc_int4.rds")
meta7 <- readRDS("meta/dbc_int7.rds")

meta3 <- meta3[, c(1:3, 131, 5:47, 129, 48:79, 130, 80:97, 4, 98:128)]
# Organizing interview data into categories
meta_physical <- as.data.frame(c(meta1[, c(1:10, 264)], meta2[, c(4, 5, 8:10, 127)], meta3[, c(4:6, 8:14, 51:55)], meta4[, c(4, 11:22)]))
meta_health <- as.data.frame(c(meta1[, c(1:4, 13:193, 252, 262, 263, 265:276)], meta2[, c(4, 20:82, 102:125)], meta3[, c(4, 16:50)]))
meta_wellbeing <- as.data.frame(c(meta1[, c(1:4, 11, 12, 230:235)], meta2[, c(4, 6, 7, 83:101, 126)], meta3[, c(4, 15, 64:99, 112:131)]))
meta_ses <- as.data.frame(c(meta1[, c(1:4, 194:229, 236:251, 253:261)], meta2[, c(4, 11:19)], meta3[, c(4, 7, 56:63, 100:111)], meta4[, c(4:10)]))
save(meta_DBC, meta_physical, meta_health, meta_wellbeing, meta_ses, file = "0-DBC_interview.RData")

Sample Filtering

P-values of samples on each array

# Add p-values and calculate means to assess samples qualities
avgPval <- colMeans(pvals(DBC.2))
meta_DBC$Det_pval <- avgPval

save(meta_DBC, file="0-DBC_meta.RData")
load("1-DBC_raw.RData")
load("0-DBC_meta.RData")

ggplot(meta_DBC)+
    geom_boxplot(aes(as.factor(Sentrix_ID), Det_pval, fill=as.factor(Sentrix_ID)), outlier.shape = NA)+
    geom_point(aes(as.factor(Sentrix_ID), Det_pval, group=Sample_ID, fill=as.factor(Sentrix_ID)), shape=21, color="black", position = position_jitter(w = 0.25))+
    theme_bw()

ggplot(meta_DBC)+
    geom_boxplot(aes(as.factor(Sentrix_ID), Det_pval, fill=as.factor(Sentrix_ID)), outlier.shape = NA)+
    geom_point(aes(as.factor(Sentrix_ID), Det_pval, group=Sample_ID, fill=as.factor(Sentrix_ID)), shape=21, color="black", position = position_jitter(w = 0.25))+ 
    ylim(0, 0.001) + 
    theme_bw() 
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing missing values (geom_point).

# 12 samples not shown, y limit chosen arbitrarily
# Find samples with relatively higher pvals
sf_pval <- meta_DBC[meta_DBC$Det_pval >= 0.001, c(1, 10)]

Beta distribtuions

DBC_betas <- betas(DBC.2)

## for speed with only plot a sample of probes
Beta_melted <- melt(DBC_betas[sample(1:nrow(DBC_betas),10000),])
#remove NAs before plotting (otherwise get many non-inifnite warnings)
Beta_Plot <- Beta_melted[which(!(is.na(Beta_melted$value))),]

# Add meta
colnames(Beta_Plot) <- c("CpG","Sample_ID","Beta")
Beta_Plot <- merge(Beta_Plot,meta_DBC, by="Sample_ID")

ggplot(Beta_Plot, aes(Beta, group=as.character(Sample_ID), color=as.character(Sentrix_ID)))+
    geom_density()+
    theme_bw()+
    xlab("DNAm Beta Value")

# Looks beautiful

Quality check via DNAm clustering

# Plot clustering with color function
plotHclustColors <- function(matrix,leafcolor) {
  colnames(matrix) <- leafcolor
  d <- dist(t(matrix))
  hc <- hclust(d, method = "complete") #single, complete, average, ward
  color<-rep(brewer.pal(12,"Paired"),ceiling(length(unique(leafcolor))/12))
  labelColors <- color[sample(1:length(color),length(unique(leafcolor)))]
  colLab <- function(n) {
    if (is.leaf(n)) {
      a <- attributes(n)
      labCol <- labelColors[which(unique(leafcolor) == a$label)]
      attr(n, "nodePar") <- c(a$nodePar, lab.col=labCol)
    }
    n
  }
  clusDendro <- dendrapply(as.dendrogram(hc), colLab)
  plot(clusDendro, horiz = TRUE)
}


plotHclustColors <- function(matrix,leafcolor) {
  colnames(matrix) <- leafcolor
  d <- dist(t(matrix))
  hc <- hclust(d, method = "complete") #single, complete, average, ward
  ggdendrogram(hc, rotate = TRUE) + 
      theme(axis.text.x = element_text(aes(colour = leafcolor), axis.text.y = element_blank())
}

# Clustering By Any Meta Data variable
# remove rows with NAs
DBC_cluster <- DBC_betas[complete.cases(DBC_betas),]
meta_DBC$Sentrix_ID<-as.character(meta_DBC$Sentrix_ID)

# Plot
plotHclustColors(DBC_cluster, meta_DBC$Sample_Label) 
# all the technical replicates clustered perfectly

#Genotyping Probes (no samples from the same individual so useless in SGA project)
CpGs<- rownames(DBC_betas)
SNP_Probes<-CpGs[grep("rs", CpGs)]# 59 SNP probes on EPIC
SNPs<-DBC_betas[CpGs%in%SNP_Probes,]
# Remove NAs (if any)
SNPs<-SNPs[complete.cases(SNPs),]# 58 cause one was all NA

plotHclustColors(SNPs, meta_DBC$Sample_ID)

plotSampleRelation(SNPs, method = "cluster", cex = 0.5)

Hcluster

Sample Correlation

# Correlation between technical replicates
(cor <- cor.test(SNPs[,"54169104_REP2"], SNPs[,"54169104_REP1"]))
(cor <- cor.test(SNPs[,"261680_REP1"], SNPs[,"261680_REP2"]))
(cor <- cor.test(SNPs[,"261702_REP1"], SNPs[,"261702_REP2"]))

# Correlation between samples (two time points)
# 50537129 missing newborn data
for 
(cor <- cor.test(SNPs[,"50585014"], SNPs[,"261689"]))
(cor <- cor.test(SNPs[,"54227514"], SNPs[,"160004"]))
# 52715808, 52861215, 52869358 missing newborn data
(cor <- cor.test(SNPs[,"52872464"], SNPs[,"261692"]))
(cor <- cor.test(SNPs[,"52958335"], SNPs[,"160764"]))
(cor <- cor.test(SNPs[,"52974168"], SNPs[,"261747"]))
(cor <- cor.test(SNPs[,"52971877"], SNPs[,"261687"]))
(cor <- cor.test(SNPs[,"53022912"], SNPs[,"261697"]))
(cor <- cor.test(SNPs[,"53036674"], SNPs[,"261680_REP1"]))
(cor <- cor.test(SNPs[,"261681"], SNPs[,"261680_REP2"]))
(cor <- cor.test(SNPs[,"53037893"], SNPs[,"261714"])) # 261714 missing
(cor <- cor.test(SNPs[,"52950285"], SNPs[,"261699"]))
(cor <- cor.test(SNPs[,"53048943"], SNPs[,"261675"]))
(cor <- cor.test(SNPs[,"53087294"], SNPs[,"261711"]))
(cor <- cor.test(SNPs[,"53073273"], SNPs[,"261701"]))
(cor <- cor.test(SNPs[,"53127976"], SNPs[,"261683"]))
# 54144712 missing newborn data
(cor <- cor.test(SNPs[,"53190246"], SNPs[,"261730"]))
(cor <- cor.test(SNPs[,"52799136"], SNPs[,"261731"]))
(cor <- cor.test(SNPs[,"53273604"], SNPs[,"212567"]))
# 53275271 missing newborn data
(cor <- cor.test(SNPs[,"53331960"], SNPs[,"261713"])) # 53331960 missing
(cor <- cor.test(SNPs[,"53192417"], SNPs[,"261704"]))
(cor <- cor.test(SNPs[,"53378934"], SNPs[,"261737"]))
(cor <- cor.test(SNPs[,"53343642"], SNPs[,"261685"]))
# 53580184 missing newborn data
(cor <- cor.test(SNPs[,"53422316"], SNPs[,"261702_REP1"]))
(cor <- cor.test(SNPs[,"53399120"], SNPs[,"261686"]))
(cor <- cor.test(SNPs[,"53494696"], SNPs[,"261684"]))
# 53414519 missing newborn data
(cor <- cor.test(SNPs[,"53523495"], SNPs[,"261716"]))
(cor <- cor.test(SNPs[,"53432285"], SNPs[,"261722"]))
(cor <- cor.test(SNPs[,"53489005"], SNPs[,"261744"])) #35
Sample Pair Correlation P-Value
50585014/261689 0.998 2.2e-16
54227514/160004 0.998 2.2e-16

Probe Filtering

Removing SNP Control Probes

There are 59 control probes which measure single nucleotide polymorphisms which we use for quality control. We remove them from further analysis.

DBCPF <- DBC.2
DBC.rs <- DBC.2[substring(featureNames(DBC.2),1,2) == "rs", ]
DBCPF <- DBC.2[substring(featureNames(DBC.2),1,2) != "rs", ]
dim(DBC.rs) # 59, 176
dim(DBCPF) # 865859, 176

DBC.rs object contains only the SNP probes. The remaining object DBCPF has 865,859 probes.

Removing Sex Probes

For this cohort (DBC) the mothers are all females whereas the children are a mixture of sex. The probes on the sex chromosomes will be removed from further analysis.

DBC.xy <- DBC.2[fData(DBC.2)$CHR%in%c("X","Y"),]
dim (DBC.xy) # 19627, 176
DBCPF <- DBCPF[!fData(DBCPF)$CHR%in%c("X","Y"),] 
dim(DBCPF) # 846232, 176

There are 19,627 sex probes in DBC.xy. The remaining object DBCPF has 846,232 probes.

Removal of Polymorphic / Cross-reactive Probes

cross_reactive <- read.csv("../Pidsley_cross_reactive.csv", stringsAsFactors = F)
DBCPF <- DBCPF[which(!(featureNames(DBCPF)%in%cross_reactive$X)),]
dim(DBCPF) # 804459, 176

polymorphic <- read.csv("../Pidsley_Polymorphic_CpGs.csv",stringsAsFactors = F)
length(unique(polymorphic$PROBE))
baseext <- read.csv("../Pidsley_single_base_extension.csv",stringsAsFactors = F)
length(unique(baseext$PROBE))

DBCPF <- DBCPF[which(!(featureNames(DBCPF)%in%c(polymorphic$PROBE, baseext$PROBE))),]
dim(DBCPF) # 792939, 176

After removal of poorly designed probes on the EPIC array according to Pidsley et al., the remaining object DBCPF has 792,939 probes.

Watermelon Filtering

# waterMelon Bad probe filtration
DBC.pf <- pfilter(DBCPF)
# 0 samples having 1 % of sites with a detection p-value greater than 0.05 were removed 
# Samples removed:  
# 304 sites were removed as beadcount <3 in 5 % of samples 
# 5247 sites having 1 % of samples with a detection p-value greater than 0.05 were removed 
dim(DBC.pf) # 787447, 176 

DBCPF <- DBC.pf
save(DBCPF, file = "2-DBC_filtered.RData")

Probe Attrition Plot

df<-data.frame(sample_num_remaining=c(865918, 865859, 846232, 804459, 792939, 792694, 787447), filter=c("EPIC Probe Number", "Removal of SNP Probes","Removal of X and Y chromosome probes", "Removal of Pidsley Cross Reactive Probes", "Removal of Pidsley Polymorphic Probes", "Removal of Probes with Beadcount <3\nin 5 % of Samples", "Removal of Probes with 1 % of samples\nwith a detection p-value greater than 0.05"))
df$sample_num_lost <- c(0,sapply(2:nrow(df), function(x) df$sample_num_remaining[x-1]-df$sample_num_remaining[x]))

df$filter <- factor(df$filter, rev(df$filter))

ggplot(df)+
  geom_bar(aes(filter,-sample_num_remaining), stat="identity", fill="grey70", color="black")+
  geom_bar(aes(filter,sample_num_lost), stat="identity",fill="darkred", color="black")+
  geom_text(aes(x=filter, y=-min(sample_num_remaining)/2,  label=comma(sample_num_remaining)))+
  geom_text(aes(x=filter, y=max(sample_num_lost)/1.5,  label=comma(sample_num_lost)))+
  geom_hline(yintercept=0)+
  coord_flip()+theme_bw()+ylab("")+xlab("")+
  theme(axis.line = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(colour = "grey20", size=12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank()) +
  scale_x_discrete(position = "top") 

Normalization

Qunatile Normalization with equalized type I and II backgrounds

# dasen
DBC_dasen <- dasen(DBCPF) 
dim(DBC_dasen) # 787447, 176

DBC_norm_betas <- betas(DBC_dasen)
save(DBC_norm_betas, file = "3-DBC_normalized.RData")

DNAm distribution

Distribution of normalized data should be tighter

load("3-DBC_normalized.RData")

## Beta distribtuions plot
Beta_melted <- DBC_norm_betas[sample(1:nrow(DBC_norm_betas), 10000), ] %>% melt()
#remove NAs before plotting (otherwise get many non-inifnite warnings)
Beta_Plot <- Beta_melted[which(!(is.na(Beta_melted$value))),]

#add meta
colnames(Beta_Plot) <- c("CpG","Sample_ID","Beta")
Beta_Plot <- merge(Beta_Plot,meta_DBC, by="Sample_ID")

ggplot(Beta_Plot, aes(Beta, group = as.character(Sample_ID), color=as.character(Sentrix_ID))) +
    geom_density() + 
    theme_bw() + 
    xlab("DNAm Beta Value")

Blood composition correction

source("../ECC5.R")

## load DBC data
DBC_betas_df <- as.data.frame(DBC_norm_betas)
meta_DBC <- meta_DBC[which(meta_DBC$Sample_ID%in%colnames(DBC_betas_df)),]
meta_DBC <- meta_DBC[match(colnames(DBC_betas_df), meta_DBC$Sample_ID),]
dim(meta_DBC)
identical(colnames(DBC_betas_df), as.character(meta_DBC$Sample_ID))

## Predict cell compostition
predictions<- ECC5(DBC_betas_df)
counts<- as.data.frame(predictions$counts)

save(counts, file="DBC_cell_compositions.RData")

# Impute NAs
imputeMedianv3<-function(x) apply(x, 1, function(x){x[is.na(x)]<-median(x, na.rm=T); x}) #impute with row mean
DBC_betas_imputed <- t(imputeMedianv3(DBC_betas_df))

# deconvolute
avebeta.lm<-apply(DBC_betas_imputed, 1, function(x){
counts[colnames(DBC_betas_imputed),] -> blood
lm(x~CD8T+CD4T+NK+Bcell+Mono,data=blood)
})
residuals <- t(sapply(avebeta.lm, function(x)residuals(summary(x))))
colnames(residuals) <- colnames(DBC_betas_imputed)
adj.residuals <- residuals+matrix(apply(DBC_betas_imputed, 1, mean), nrow=nrow(residuals), ncol=ncol(residuals))

save(adj.residuals, file="4-DBC_adjusted_betas.RData")

Do composition again to see the normalization effect after adjustment

source("../ECC5.R")
load("4-DBC_adjusted_betas.RData")

adj.residuals_df <- as.data.frame(adj.residuals)
predictions<- ECC5(adj.residuals_df)
counts_adjusted<- as.data.frame(predictions$counts)
save(counts_adjusted, file="DBC_cell_compositions_adjusted.RData")

Blood composition predition before and after adjustment

#Blood plot
load("DBC_cell_compositions.RData")
load("DBC_cell_compositions_adjusted.RData")

counts$Sample_ID<-rownames(counts)
cellprop_melt<-melt(counts, id="Sample_ID")
cellprop_melt$Data<-"Original Predicated Proportions"
counts_adjusted$Sample_ID<-rownames(counts_adjusted)
cellprop_melt_adjusted<-melt(counts_adjusted, id="Sample_ID")
cellprop_melt_adjusted$Data<-"Proportions After Deconvolution"
cellprop_melt_blood<-rbind(cellprop_melt, cellprop_melt_adjusted)
cellprop_melt_blood$Data <- factor(cellprop_melt_blood$Data,c("Original Predicated Proportions", "Proportions After Deconvolution"))

ggplot(cellprop_melt_blood, aes(variable, value, fill = as.factor(variable)))+
    geom_point(shape = 21, color = "black", position = position_jitter(w = 0.2))+
    theme_bw()+
    xlab("Blood Cell Type")+
    ylab("Cell Type Proportion")+
    facet_grid(~Data, scales = "free_x")+
    scale_fill_manual(values = c("#fed976", "#feb24c", "#fd8d3c", "#fc4e2a", "#e31a1c", "#bd0026"), guide=FALSE)

Blood composition association with variables

load("DBC_cell_compositions.RData")

meta_DBC <- meta_DBC[match(rownames(counts), meta_DBC$Sample_ID), ]

#cell_comp_pvalues <- sapply(1:6, function(x) t.test(counts[,x]~as.character(meta_DBC$Sentrix_ID))$p.value)
# should be no difference

counts$Sample_ID <- rownames(counts)
counts_plot <- melt(counts)
## Using Sample_ID as id variables
plt <- merge(counts_plot, meta_DBC, by="Sample_ID")

ggplot(plt, aes(as.factor(Sentrix_ID), value))+
    geom_boxplot()+
    geom_point(shape=21,aes(group=Sample_ID, fill=as.factor(Sentrix_ID)), position=position_jitter(w=0.1))+
    facet_wrap(~variable)+
    xlab("Sentrix ID")+
    ylab("Cell Proportion in PBMC Sample")+
    theme_bw()

Beta distribution after cell type correction

load("DBC_meta.RData")
load("4-DBC_adjusted_betas.RData")

DBC_betas <- adj.residuals

## Beta distribtuions plot
Beta_melted <- DBC_betas[sample(1:nrow(DBC_betas), 10000), ] %>% melt()
#remove NAs before plotting (otherwise get many non-inifnite warnings)
Beta_Plot <- Beta_melted[which(!(is.na(Beta_melted$value))),]

#add meta
colnames(Beta_Plot) <- c("CpG","Sample_ID","Beta")
Beta_Plot <- merge(Beta_Plot,meta_DBC, by="Sample_ID")

ggplot(Beta_Plot, aes(Beta, group = as.character(Sample_ID), color=as.character(Sentrix_ID))) +
    geom_density() + 
    theme_bw() + 
    xlab("DNAm Beta Value")

Correct for batch effect

Heat scree plot

source("../heat_scree_plot.R")

DBC_betas <- as.data.frame(adj.residuals)

meta_DBC <- meta_DBC[which(meta_DBC$Sample_ID%in%colnames(DBC_betas)),]
meta_DBC <- meta_DBC[match(colnames(DBC_betas), meta_DBC$Sample_ID),]
dim(meta_DBC)
## [1] 176  10
# Restructure meta
meta_DBC$Sentrix_ID <- as.factor(meta_DBC$Sentrix_ID)
meta_DBC$Sentrix_row <- sapply(1:nrow(meta_DBC), function(x) strsplit(as.character(meta_DBC$Sentrix_Position[x]),split="C")[[1]][1])
meta_DBC$Sentrix_row <- as.factor(meta_DBC$Sentrix_row)
levels(meta_DBC$Sentrix_row) <- c(1,2,3,4,5,6,7,8)
meta_DBC$Sentrix_row <- as.numeric(meta_DBC$Sentrix_row)
meta_DBC$Batch <- as.factor(meta_DBC$Batch)
meta_DBC$Dummy1 <- sample(1:1000, 176, replace = FALSE)
meta_DBC$Dummy2 <- sample(1:1000, 176, replace = FALSE)

# input column numbers in meta that contain categorical variables
meta_categorical <- meta_DBC[, c("Sentrix_ID","Sentrix_row", "Batch")]  
# input column numbers in meta that contain continuous variables
meta_continuous <- meta_DBC[, c("Dummy1", "Dummy2")] 
ord <- 1:length(c(colnames(meta_categorical), colnames(meta_continuous)))

# PCA
PCA_full <- prcomp(na.omit(DBC_betas))
Loadings <- as.data.frame(unclass(PCA_full$rotation))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)

PCs_to_view<-20

heat_scree_plot(Loadings, Importance)

ComBat (correct for Sentrix ID)

Mval <- function(beta) log2(beta/(1-beta))

pheno <- meta_DBC
edata <- apply(DBC_betas, 1, Mval) # need mvalues for combat
edata <- as.data.frame(edata)
edata <- t(edata)

pheno$Sentrix_ID <- as.factor(pheno$Sentrix_ID)

# Correct for batch effect from Sentrix_ID
batch = pheno$Sentrix_ID
combat_DBC_betas_ID <- ComBat(dat = edata, batch = batch, mod = NULL, par.prior = TRUE)
# Found 22 batches
# Adjusting for 0 covariate(s) or covariate level(s)
# Found 4379 Missing Data Values

save(combat_DBC_betas_ID, file="5-DBC_combat_ID.RData")
load("5-DBC_combat_ID.RData")

# PCA
PCA_full <- prcomp(na.omit(combat_DBC_betas_ID))
Loadings <- as.data.frame(unclass(PCA_full$rotation))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
heat_scree_plot(Loadings, Importance)

ComBat (correct for row effect)

#row
batch <- pheno$Sentrix_row
edata <- combat_DBC_betas_ID
combat_DBC_betas_row <- ComBat(dat = edata, batch = batch, mod = NULL, par.prior = TRUE)
# Found 8 batches
# Adjusting for 0 covariate(s) or covariate level(s)
# Found 4379 Missing Data Values

save(combat_DBC_betas_row, file="5-DBC_combat_row.RData")
# another heat scree
load("5-DBC_combat_row.RData")

# PCA
PCA_full <- prcomp(na.omit(combat_DBC_betas_row))
Loadings <- as.data.frame(unclass(PCA_full$rotation))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
heat_scree_plot(Loadings, Importance)

#Back to betas
betas <- function(M) 2^M/((2^M)+1)
combat_DBC_betas2 <- apply(combat_DBC_betas_row, 1, betas) 
combat_DBC_betas2 <- as.data.frame(combat_DBC_betas2)
combat_DBC_betas2 <- t(combat_DBC_betas2)
combat_DBC_betas <- combat_DBC_betas2

identical(colnames(combat_DBC_betas), as.character(meta_DBC$Sample_ID))

save(combat_DBC_betas, file="5-DBC_combat.RData")

check the beta value distribution again

load("5-DBC_combat.RData")

## Beta distribtuions plot
Beta_melted <- combat_DBC_betas[sample(1:nrow(combat_DBC_betas), 10000), ] %>% melt()
#remove NAs before plotting (otherwise get many non-inifnite warnings)
Beta_Plot <- Beta_melted[which(!(is.na(Beta_melted$value))),]

#add meta
colnames(Beta_Plot) <- c("CpG","Sample_ID","Beta")
Beta_Plot <- merge(Beta_Plot,meta_DBC, by="Sample_ID")

ggplot(Beta_Plot, aes(Beta, group = as.character(Sample_ID), color=as.character(Sentrix_ID))) +
    geom_density() + 
    theme_bw() + 
    xlab("DNAm Beta Value")

Remove Blood invariable CpGs

x <- getURL("https://raw.githubusercontent.com/redgar598/Tissue_Invariable_450K_CpGs/master/Invariant_Blood_CpGs.csv")
y <- read.csv(text = x)

DBC_blood_invariable <- combat_DBC_betas[which(rownames(combat_DBC_betas)%in%y$CpG),] #102265/120009 of the independent invariable sites are in DBC

# Call varibility in PAWS
Variation <- function(x) {quantile(x, c(0.9), na.rm=T)[[1]]-quantile(x, c(0.1), na.rm=T)[[1]]}
DBC_ref_range <- sapply(1:nrow(DBC_blood_invariable), function(x) Variation(DBC_blood_invariable[x,]))
Invariable_in_DBC <- DBC_blood_invariable[which(DBC_ref_range < 0.05), ]

# Which CpGs are invariable in PAWS and the independent data
invar_independent <- intersect(y$CpG, rownames(Invariable_in_DBC)) #101656/102265 (99.4%)
DBC_betas_variable <- combat_DBC_betas[which(!(rownames(combat_DBC_betas)%in%invar_independent)),] #685791 
DBC_betas <- DBC_betas_variable

identical(colnames(DBC_betas), as.character(meta_DBC$Sample_ID))
save(DBC_betas, file="6-DBC_cleaned_invariable.RData")

Probe Attrition Plot

df<-data.frame(sample_num_remaining=c(865918, 865859, 846232, 804459, 792939, 792694, 787447, 685791), filter=c("EPIC Probe Number", "Removal of SNP Probes","Removal of X and Y chromosome probes", "Removal of Pidsley Cross Reactive Probes", "Removal of Pidsley Polymorphic Probes", "Removal of Probes with Beadcount <3\nin 5 % of Samples", "Removal of Probes with 1 % of samples\nwith a detection p-value greater than 0.05","Non-variable CpG Filtering"))
df$sample_num_lost <- c(0,sapply(2:nrow(df), function(x) df$sample_num_remaining[x-1]-df$sample_num_remaining[x]))

df$filter <- factor(df$filter, rev(df$filter))

ggplot(df)+
  geom_bar(aes(filter,-sample_num_remaining), stat="identity", fill="grey70", color="black")+
  geom_bar(aes(filter,sample_num_lost), stat="identity",fill="darkred", color="black")+
  geom_text(aes(x=filter, y=-min(sample_num_remaining)/2,  label=comma(sample_num_remaining)))+
  geom_text(aes(x=filter, y=max(sample_num_lost)/1.5,  label=comma(sample_num_lost)))+
  geom_hline(yintercept=0)+
  coord_flip()+theme_bw()+ylab("")+xlab("")+
  theme(axis.line = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(colour = "grey20", size=12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank()) +
  scale_x_discrete(position = "top")