Some variables are missing: * age when the samples (adult and infant) are taken * Calculate BMI * Finish pre-processing with the compiled metadata
# R version 3.2.3 (2015-12-10)
# Platform: x86_64-pc-linux-gnu (64-bit)
# Running under: Ubuntu 16.04.1 LTS
# CRAN packages
library(reshape2) #1.4.2
library(dplyr) #0.5.0
library(RCurl) #1.95-4.8
library(RPMM) #1.20
library(lme4) #1.1-12
library(scales) #0.4.1
library(RColorBrewer) #1.1-2
library(gridExtra) #2.2.1
library(ggplot2) #2.2.0
library(ggsci) #2.7
library(ggdendro) #0.1-20
# Bioconductor packages
library(lumi) #2.22.1
library(methylumi) #2.16.0
library(wateRmelon) #1.10.0
library(FlowSorted.Blood.450k) #1.8.0
library(sva) #3.18.0
library(limma) #3.26.9
# from (https://stackoverflow.com/questions/13673894/suppress-nas-in-paste)
paste5 <- function(..., sep = " ", collapse = NULL, na.rm = F) {
if (na.rm == F)
paste(..., sep = sep, collapse = collapse)
else
if (na.rm == T) {
paste.na <- function(x, sep) {
x <- gsub("^\\s+|\\s+$", "", x)
ret <- paste(na.omit(x), collapse = sep)
is.na(ret) <- ret == ""
return(ret)
}
df <- data.frame(..., stringsAsFactors = F)
ret <- apply(df, 1, FUN = function(x) paste.na(x, sep))
if (is.null(collapse))
ret
else {
paste.na(ret, sep = collapse)
}
}
}
allFile <- ("raw/DBC_alldata.txt")
qcFile <- ("raw/DBC_qcfile.txt")
DBC.2 <- methylumiR(allFile, qcfile = qcFile)
save(DBC.2, file="1-DBC_raw.RData")
head(fData(DBC.2))
head(pData(DBC.2))
dim(DBC.2)
There are 865,918 probes for 176 samples in total
# Importing sample file
sampleFile <- ("raw/DBC_samplefile.txt")
meta_DBC <- read.delim(sampleFile, nrows = 176)
head(meta_DBC)
# Change the Sample Label column to represent the individual
meta_DBC$Sample_Label <- gsub("_REP[0123456789]", "", meta_DBC$Sample_Label)
# Importing interview data
meta1 <- readRDS("meta/dbc_int1.rds")
meta2 <- readRDS("meta/dbc_int2.rds")
meta3 <- readRDS("meta/dbc_int3.rds")
meta4 <- readRDS("meta/dbc_int4.rds")
meta7 <- readRDS("meta/dbc_int7.rds")
meta3 <- meta3[, c(1:3, 131, 5:47, 129, 48:79, 130, 80:97, 4, 98:128)]
# Organizing interview data into categories
meta_physical <- as.data.frame(c(meta1[, c(1:10, 264)], meta2[, c(4, 5, 8:10, 127)], meta3[, c(4:6, 8:14, 51:55)], meta4[, c(4, 11:22)]))
meta_health <- as.data.frame(c(meta1[, c(1:4, 13:193, 252, 262, 263, 265:276)], meta2[, c(4, 20:82, 102:125)], meta3[, c(4, 16:50)]))
meta_wellbeing <- as.data.frame(c(meta1[, c(1:4, 11, 12, 230:235)], meta2[, c(4, 6, 7, 83:101, 126)], meta3[, c(4, 15, 64:99, 112:131)]))
meta_ses <- as.data.frame(c(meta1[, c(1:4, 194:229, 236:251, 253:261)], meta2[, c(4, 11:19)], meta3[, c(4, 7, 56:63, 100:111)], meta4[, c(4:10)]))
save(meta_DBC, meta_physical, meta_health, meta_wellbeing, meta_ses, file = "0-DBC_interview.RData")
# Add p-values and calculate means to assess samples qualities
avgPval <- colMeans(pvals(DBC.2))
meta_DBC$Det_pval <- avgPval
save(meta_DBC, file="0-DBC_meta.RData")
load("1-DBC_raw.RData")
load("0-DBC_meta.RData")
ggplot(meta_DBC)+
geom_boxplot(aes(as.factor(Sentrix_ID), Det_pval, fill=as.factor(Sentrix_ID)), outlier.shape = NA)+
geom_point(aes(as.factor(Sentrix_ID), Det_pval, group=Sample_ID, fill=as.factor(Sentrix_ID)), shape=21, color="black", position = position_jitter(w = 0.25))+
theme_bw()
ggplot(meta_DBC)+
geom_boxplot(aes(as.factor(Sentrix_ID), Det_pval, fill=as.factor(Sentrix_ID)), outlier.shape = NA)+
geom_point(aes(as.factor(Sentrix_ID), Det_pval, group=Sample_ID, fill=as.factor(Sentrix_ID)), shape=21, color="black", position = position_jitter(w = 0.25))+
ylim(0, 0.001) +
theme_bw()
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing missing values (geom_point).
# 12 samples not shown, y limit chosen arbitrarily
# Find samples with relatively higher pvals
sf_pval <- meta_DBC[meta_DBC$Det_pval >= 0.001, c(1, 10)]
DBC_betas <- betas(DBC.2)
## for speed with only plot a sample of probes
Beta_melted <- melt(DBC_betas[sample(1:nrow(DBC_betas),10000),])
#remove NAs before plotting (otherwise get many non-inifnite warnings)
Beta_Plot <- Beta_melted[which(!(is.na(Beta_melted$value))),]
# Add meta
colnames(Beta_Plot) <- c("CpG","Sample_ID","Beta")
Beta_Plot <- merge(Beta_Plot,meta_DBC, by="Sample_ID")
ggplot(Beta_Plot, aes(Beta, group=as.character(Sample_ID), color=as.character(Sentrix_ID)))+
geom_density()+
theme_bw()+
xlab("DNAm Beta Value")
# Looks beautiful
# Plot clustering with color function
plotHclustColors <- function(matrix,leafcolor) {
colnames(matrix) <- leafcolor
d <- dist(t(matrix))
hc <- hclust(d, method = "complete") #single, complete, average, ward
color<-rep(brewer.pal(12,"Paired"),ceiling(length(unique(leafcolor))/12))
labelColors <- color[sample(1:length(color),length(unique(leafcolor)))]
colLab <- function(n) {
if (is.leaf(n)) {
a <- attributes(n)
labCol <- labelColors[which(unique(leafcolor) == a$label)]
attr(n, "nodePar") <- c(a$nodePar, lab.col=labCol)
}
n
}
clusDendro <- dendrapply(as.dendrogram(hc), colLab)
plot(clusDendro, horiz = TRUE)
}
plotHclustColors <- function(matrix,leafcolor) {
colnames(matrix) <- leafcolor
d <- dist(t(matrix))
hc <- hclust(d, method = "complete") #single, complete, average, ward
ggdendrogram(hc, rotate = TRUE) +
theme(axis.text.x = element_text(aes(colour = leafcolor), axis.text.y = element_blank())
}
# Clustering By Any Meta Data variable
# remove rows with NAs
DBC_cluster <- DBC_betas[complete.cases(DBC_betas),]
meta_DBC$Sentrix_ID<-as.character(meta_DBC$Sentrix_ID)
# Plot
plotHclustColors(DBC_cluster, meta_DBC$Sample_Label)
# all the technical replicates clustered perfectly
#Genotyping Probes (no samples from the same individual so useless in SGA project)
CpGs<- rownames(DBC_betas)
SNP_Probes<-CpGs[grep("rs", CpGs)]# 59 SNP probes on EPIC
SNPs<-DBC_betas[CpGs%in%SNP_Probes,]
# Remove NAs (if any)
SNPs<-SNPs[complete.cases(SNPs),]# 58 cause one was all NA
plotHclustColors(SNPs, meta_DBC$Sample_ID)
plotSampleRelation(SNPs, method = "cluster", cex = 0.5)
# Correlation between technical replicates
(cor <- cor.test(SNPs[,"54169104_REP2"], SNPs[,"54169104_REP1"]))
(cor <- cor.test(SNPs[,"261680_REP1"], SNPs[,"261680_REP2"]))
(cor <- cor.test(SNPs[,"261702_REP1"], SNPs[,"261702_REP2"]))
# Correlation between samples (two time points)
# 50537129 missing newborn data
for
(cor <- cor.test(SNPs[,"50585014"], SNPs[,"261689"]))
(cor <- cor.test(SNPs[,"54227514"], SNPs[,"160004"]))
# 52715808, 52861215, 52869358 missing newborn data
(cor <- cor.test(SNPs[,"52872464"], SNPs[,"261692"]))
(cor <- cor.test(SNPs[,"52958335"], SNPs[,"160764"]))
(cor <- cor.test(SNPs[,"52974168"], SNPs[,"261747"]))
(cor <- cor.test(SNPs[,"52971877"], SNPs[,"261687"]))
(cor <- cor.test(SNPs[,"53022912"], SNPs[,"261697"]))
(cor <- cor.test(SNPs[,"53036674"], SNPs[,"261680_REP1"]))
(cor <- cor.test(SNPs[,"261681"], SNPs[,"261680_REP2"]))
(cor <- cor.test(SNPs[,"53037893"], SNPs[,"261714"])) # 261714 missing
(cor <- cor.test(SNPs[,"52950285"], SNPs[,"261699"]))
(cor <- cor.test(SNPs[,"53048943"], SNPs[,"261675"]))
(cor <- cor.test(SNPs[,"53087294"], SNPs[,"261711"]))
(cor <- cor.test(SNPs[,"53073273"], SNPs[,"261701"]))
(cor <- cor.test(SNPs[,"53127976"], SNPs[,"261683"]))
# 54144712 missing newborn data
(cor <- cor.test(SNPs[,"53190246"], SNPs[,"261730"]))
(cor <- cor.test(SNPs[,"52799136"], SNPs[,"261731"]))
(cor <- cor.test(SNPs[,"53273604"], SNPs[,"212567"]))
# 53275271 missing newborn data
(cor <- cor.test(SNPs[,"53331960"], SNPs[,"261713"])) # 53331960 missing
(cor <- cor.test(SNPs[,"53192417"], SNPs[,"261704"]))
(cor <- cor.test(SNPs[,"53378934"], SNPs[,"261737"]))
(cor <- cor.test(SNPs[,"53343642"], SNPs[,"261685"]))
# 53580184 missing newborn data
(cor <- cor.test(SNPs[,"53422316"], SNPs[,"261702_REP1"]))
(cor <- cor.test(SNPs[,"53399120"], SNPs[,"261686"]))
(cor <- cor.test(SNPs[,"53494696"], SNPs[,"261684"]))
# 53414519 missing newborn data
(cor <- cor.test(SNPs[,"53523495"], SNPs[,"261716"]))
(cor <- cor.test(SNPs[,"53432285"], SNPs[,"261722"]))
(cor <- cor.test(SNPs[,"53489005"], SNPs[,"261744"])) #35
| Sample Pair | Correlation | P-Value |
|---|---|---|
| 50585014/261689 | 0.998 | 2.2e-16 |
| 54227514/160004 | 0.998 | 2.2e-16 |
There are 59 control probes which measure single nucleotide polymorphisms which we use for quality control. We remove them from further analysis.
DBCPF <- DBC.2
DBC.rs <- DBC.2[substring(featureNames(DBC.2),1,2) == "rs", ]
DBCPF <- DBC.2[substring(featureNames(DBC.2),1,2) != "rs", ]
dim(DBC.rs) # 59, 176
dim(DBCPF) # 865859, 176
DBC.rs object contains only the SNP probes. The remaining object DBCPF has 865,859 probes.
For this cohort (DBC) the mothers are all females whereas the children are a mixture of sex. The probes on the sex chromosomes will be removed from further analysis.
DBC.xy <- DBC.2[fData(DBC.2)$CHR%in%c("X","Y"),]
dim (DBC.xy) # 19627, 176
DBCPF <- DBCPF[!fData(DBCPF)$CHR%in%c("X","Y"),]
dim(DBCPF) # 846232, 176
There are 19,627 sex probes in DBC.xy. The remaining object DBCPF has 846,232 probes.
cross_reactive <- read.csv("../Pidsley_cross_reactive.csv", stringsAsFactors = F)
DBCPF <- DBCPF[which(!(featureNames(DBCPF)%in%cross_reactive$X)),]
dim(DBCPF) # 804459, 176
polymorphic <- read.csv("../Pidsley_Polymorphic_CpGs.csv",stringsAsFactors = F)
length(unique(polymorphic$PROBE))
baseext <- read.csv("../Pidsley_single_base_extension.csv",stringsAsFactors = F)
length(unique(baseext$PROBE))
DBCPF <- DBCPF[which(!(featureNames(DBCPF)%in%c(polymorphic$PROBE, baseext$PROBE))),]
dim(DBCPF) # 792939, 176
After removal of poorly designed probes on the EPIC array according to Pidsley et al., the remaining object DBCPF has 792,939 probes.
# waterMelon Bad probe filtration
DBC.pf <- pfilter(DBCPF)
# 0 samples having 1 % of sites with a detection p-value greater than 0.05 were removed
# Samples removed:
# 304 sites were removed as beadcount <3 in 5 % of samples
# 5247 sites having 1 % of samples with a detection p-value greater than 0.05 were removed
dim(DBC.pf) # 787447, 176
DBCPF <- DBC.pf
save(DBCPF, file = "2-DBC_filtered.RData")
df<-data.frame(sample_num_remaining=c(865918, 865859, 846232, 804459, 792939, 792694, 787447), filter=c("EPIC Probe Number", "Removal of SNP Probes","Removal of X and Y chromosome probes", "Removal of Pidsley Cross Reactive Probes", "Removal of Pidsley Polymorphic Probes", "Removal of Probes with Beadcount <3\nin 5 % of Samples", "Removal of Probes with 1 % of samples\nwith a detection p-value greater than 0.05"))
df$sample_num_lost <- c(0,sapply(2:nrow(df), function(x) df$sample_num_remaining[x-1]-df$sample_num_remaining[x]))
df$filter <- factor(df$filter, rev(df$filter))
ggplot(df)+
geom_bar(aes(filter,-sample_num_remaining), stat="identity", fill="grey70", color="black")+
geom_bar(aes(filter,sample_num_lost), stat="identity",fill="darkred", color="black")+
geom_text(aes(x=filter, y=-min(sample_num_remaining)/2, label=comma(sample_num_remaining)))+
geom_text(aes(x=filter, y=max(sample_num_lost)/1.5, label=comma(sample_num_lost)))+
geom_hline(yintercept=0)+
coord_flip()+theme_bw()+ylab("")+xlab("")+
theme(axis.line = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(colour = "grey20", size=12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
scale_x_discrete(position = "top")
# dasen
DBC_dasen <- dasen(DBCPF)
dim(DBC_dasen) # 787447, 176
DBC_norm_betas <- betas(DBC_dasen)
save(DBC_norm_betas, file = "3-DBC_normalized.RData")
Distribution of normalized data should be tighter
load("3-DBC_normalized.RData")
## Beta distribtuions plot
Beta_melted <- DBC_norm_betas[sample(1:nrow(DBC_norm_betas), 10000), ] %>% melt()
#remove NAs before plotting (otherwise get many non-inifnite warnings)
Beta_Plot <- Beta_melted[which(!(is.na(Beta_melted$value))),]
#add meta
colnames(Beta_Plot) <- c("CpG","Sample_ID","Beta")
Beta_Plot <- merge(Beta_Plot,meta_DBC, by="Sample_ID")
ggplot(Beta_Plot, aes(Beta, group = as.character(Sample_ID), color=as.character(Sentrix_ID))) +
geom_density() +
theme_bw() +
xlab("DNAm Beta Value")
source("../ECC5.R")
## load DBC data
DBC_betas_df <- as.data.frame(DBC_norm_betas)
meta_DBC <- meta_DBC[which(meta_DBC$Sample_ID%in%colnames(DBC_betas_df)),]
meta_DBC <- meta_DBC[match(colnames(DBC_betas_df), meta_DBC$Sample_ID),]
dim(meta_DBC)
identical(colnames(DBC_betas_df), as.character(meta_DBC$Sample_ID))
## Predict cell compostition
predictions<- ECC5(DBC_betas_df)
counts<- as.data.frame(predictions$counts)
save(counts, file="DBC_cell_compositions.RData")
# Impute NAs
imputeMedianv3<-function(x) apply(x, 1, function(x){x[is.na(x)]<-median(x, na.rm=T); x}) #impute with row mean
DBC_betas_imputed <- t(imputeMedianv3(DBC_betas_df))
# deconvolute
avebeta.lm<-apply(DBC_betas_imputed, 1, function(x){
counts[colnames(DBC_betas_imputed),] -> blood
lm(x~CD8T+CD4T+NK+Bcell+Mono,data=blood)
})
residuals <- t(sapply(avebeta.lm, function(x)residuals(summary(x))))
colnames(residuals) <- colnames(DBC_betas_imputed)
adj.residuals <- residuals+matrix(apply(DBC_betas_imputed, 1, mean), nrow=nrow(residuals), ncol=ncol(residuals))
save(adj.residuals, file="4-DBC_adjusted_betas.RData")
Do composition again to see the normalization effect after adjustment
source("../ECC5.R")
load("4-DBC_adjusted_betas.RData")
adj.residuals_df <- as.data.frame(adj.residuals)
predictions<- ECC5(adj.residuals_df)
counts_adjusted<- as.data.frame(predictions$counts)
save(counts_adjusted, file="DBC_cell_compositions_adjusted.RData")
#Blood plot
load("DBC_cell_compositions.RData")
load("DBC_cell_compositions_adjusted.RData")
counts$Sample_ID<-rownames(counts)
cellprop_melt<-melt(counts, id="Sample_ID")
cellprop_melt$Data<-"Original Predicated Proportions"
counts_adjusted$Sample_ID<-rownames(counts_adjusted)
cellprop_melt_adjusted<-melt(counts_adjusted, id="Sample_ID")
cellprop_melt_adjusted$Data<-"Proportions After Deconvolution"
cellprop_melt_blood<-rbind(cellprop_melt, cellprop_melt_adjusted)
cellprop_melt_blood$Data <- factor(cellprop_melt_blood$Data,c("Original Predicated Proportions", "Proportions After Deconvolution"))
ggplot(cellprop_melt_blood, aes(variable, value, fill = as.factor(variable)))+
geom_point(shape = 21, color = "black", position = position_jitter(w = 0.2))+
theme_bw()+
xlab("Blood Cell Type")+
ylab("Cell Type Proportion")+
facet_grid(~Data, scales = "free_x")+
scale_fill_manual(values = c("#fed976", "#feb24c", "#fd8d3c", "#fc4e2a", "#e31a1c", "#bd0026"), guide=FALSE)
load("DBC_cell_compositions.RData")
meta_DBC <- meta_DBC[match(rownames(counts), meta_DBC$Sample_ID), ]
#cell_comp_pvalues <- sapply(1:6, function(x) t.test(counts[,x]~as.character(meta_DBC$Sentrix_ID))$p.value)
# should be no difference
counts$Sample_ID <- rownames(counts)
counts_plot <- melt(counts)
## Using Sample_ID as id variables
plt <- merge(counts_plot, meta_DBC, by="Sample_ID")
ggplot(plt, aes(as.factor(Sentrix_ID), value))+
geom_boxplot()+
geom_point(shape=21,aes(group=Sample_ID, fill=as.factor(Sentrix_ID)), position=position_jitter(w=0.1))+
facet_wrap(~variable)+
xlab("Sentrix ID")+
ylab("Cell Proportion in PBMC Sample")+
theme_bw()
load("DBC_meta.RData")
load("4-DBC_adjusted_betas.RData")
DBC_betas <- adj.residuals
## Beta distribtuions plot
Beta_melted <- DBC_betas[sample(1:nrow(DBC_betas), 10000), ] %>% melt()
#remove NAs before plotting (otherwise get many non-inifnite warnings)
Beta_Plot <- Beta_melted[which(!(is.na(Beta_melted$value))),]
#add meta
colnames(Beta_Plot) <- c("CpG","Sample_ID","Beta")
Beta_Plot <- merge(Beta_Plot,meta_DBC, by="Sample_ID")
ggplot(Beta_Plot, aes(Beta, group = as.character(Sample_ID), color=as.character(Sentrix_ID))) +
geom_density() +
theme_bw() +
xlab("DNAm Beta Value")
source("../heat_scree_plot.R")
DBC_betas <- as.data.frame(adj.residuals)
meta_DBC <- meta_DBC[which(meta_DBC$Sample_ID%in%colnames(DBC_betas)),]
meta_DBC <- meta_DBC[match(colnames(DBC_betas), meta_DBC$Sample_ID),]
dim(meta_DBC)
## [1] 176 10
# Restructure meta
meta_DBC$Sentrix_ID <- as.factor(meta_DBC$Sentrix_ID)
meta_DBC$Sentrix_row <- sapply(1:nrow(meta_DBC), function(x) strsplit(as.character(meta_DBC$Sentrix_Position[x]),split="C")[[1]][1])
meta_DBC$Sentrix_row <- as.factor(meta_DBC$Sentrix_row)
levels(meta_DBC$Sentrix_row) <- c(1,2,3,4,5,6,7,8)
meta_DBC$Sentrix_row <- as.numeric(meta_DBC$Sentrix_row)
meta_DBC$Batch <- as.factor(meta_DBC$Batch)
meta_DBC$Dummy1 <- sample(1:1000, 176, replace = FALSE)
meta_DBC$Dummy2 <- sample(1:1000, 176, replace = FALSE)
# input column numbers in meta that contain categorical variables
meta_categorical <- meta_DBC[, c("Sentrix_ID","Sentrix_row", "Batch")]
# input column numbers in meta that contain continuous variables
meta_continuous <- meta_DBC[, c("Dummy1", "Dummy2")]
ord <- 1:length(c(colnames(meta_categorical), colnames(meta_continuous)))
# PCA
PCA_full <- prcomp(na.omit(DBC_betas))
Loadings <- as.data.frame(unclass(PCA_full$rotation))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
PCs_to_view<-20
heat_scree_plot(Loadings, Importance)
Mval <- function(beta) log2(beta/(1-beta))
pheno <- meta_DBC
edata <- apply(DBC_betas, 1, Mval) # need mvalues for combat
edata <- as.data.frame(edata)
edata <- t(edata)
pheno$Sentrix_ID <- as.factor(pheno$Sentrix_ID)
# Correct for batch effect from Sentrix_ID
batch = pheno$Sentrix_ID
combat_DBC_betas_ID <- ComBat(dat = edata, batch = batch, mod = NULL, par.prior = TRUE)
# Found 22 batches
# Adjusting for 0 covariate(s) or covariate level(s)
# Found 4379 Missing Data Values
save(combat_DBC_betas_ID, file="5-DBC_combat_ID.RData")
load("5-DBC_combat_ID.RData")
# PCA
PCA_full <- prcomp(na.omit(combat_DBC_betas_ID))
Loadings <- as.data.frame(unclass(PCA_full$rotation))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
heat_scree_plot(Loadings, Importance)
#row
batch <- pheno$Sentrix_row
edata <- combat_DBC_betas_ID
combat_DBC_betas_row <- ComBat(dat = edata, batch = batch, mod = NULL, par.prior = TRUE)
# Found 8 batches
# Adjusting for 0 covariate(s) or covariate level(s)
# Found 4379 Missing Data Values
save(combat_DBC_betas_row, file="5-DBC_combat_row.RData")
# another heat scree
load("5-DBC_combat_row.RData")
# PCA
PCA_full <- prcomp(na.omit(combat_DBC_betas_row))
Loadings <- as.data.frame(unclass(PCA_full$rotation))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
heat_scree_plot(Loadings, Importance)
#Back to betas
betas <- function(M) 2^M/((2^M)+1)
combat_DBC_betas2 <- apply(combat_DBC_betas_row, 1, betas)
combat_DBC_betas2 <- as.data.frame(combat_DBC_betas2)
combat_DBC_betas2 <- t(combat_DBC_betas2)
combat_DBC_betas <- combat_DBC_betas2
identical(colnames(combat_DBC_betas), as.character(meta_DBC$Sample_ID))
save(combat_DBC_betas, file="5-DBC_combat.RData")
check the beta value distribution again
load("5-DBC_combat.RData")
## Beta distribtuions plot
Beta_melted <- combat_DBC_betas[sample(1:nrow(combat_DBC_betas), 10000), ] %>% melt()
#remove NAs before plotting (otherwise get many non-inifnite warnings)
Beta_Plot <- Beta_melted[which(!(is.na(Beta_melted$value))),]
#add meta
colnames(Beta_Plot) <- c("CpG","Sample_ID","Beta")
Beta_Plot <- merge(Beta_Plot,meta_DBC, by="Sample_ID")
ggplot(Beta_Plot, aes(Beta, group = as.character(Sample_ID), color=as.character(Sentrix_ID))) +
geom_density() +
theme_bw() +
xlab("DNAm Beta Value")
x <- getURL("https://raw.githubusercontent.com/redgar598/Tissue_Invariable_450K_CpGs/master/Invariant_Blood_CpGs.csv")
y <- read.csv(text = x)
DBC_blood_invariable <- combat_DBC_betas[which(rownames(combat_DBC_betas)%in%y$CpG),] #102265/120009 of the independent invariable sites are in DBC
# Call varibility in PAWS
Variation <- function(x) {quantile(x, c(0.9), na.rm=T)[[1]]-quantile(x, c(0.1), na.rm=T)[[1]]}
DBC_ref_range <- sapply(1:nrow(DBC_blood_invariable), function(x) Variation(DBC_blood_invariable[x,]))
Invariable_in_DBC <- DBC_blood_invariable[which(DBC_ref_range < 0.05), ]
# Which CpGs are invariable in PAWS and the independent data
invar_independent <- intersect(y$CpG, rownames(Invariable_in_DBC)) #101656/102265 (99.4%)
DBC_betas_variable <- combat_DBC_betas[which(!(rownames(combat_DBC_betas)%in%invar_independent)),] #685791
DBC_betas <- DBC_betas_variable
identical(colnames(DBC_betas), as.character(meta_DBC$Sample_ID))
save(DBC_betas, file="6-DBC_cleaned_invariable.RData")
df<-data.frame(sample_num_remaining=c(865918, 865859, 846232, 804459, 792939, 792694, 787447, 685791), filter=c("EPIC Probe Number", "Removal of SNP Probes","Removal of X and Y chromosome probes", "Removal of Pidsley Cross Reactive Probes", "Removal of Pidsley Polymorphic Probes", "Removal of Probes with Beadcount <3\nin 5 % of Samples", "Removal of Probes with 1 % of samples\nwith a detection p-value greater than 0.05","Non-variable CpG Filtering"))
df$sample_num_lost <- c(0,sapply(2:nrow(df), function(x) df$sample_num_remaining[x-1]-df$sample_num_remaining[x]))
df$filter <- factor(df$filter, rev(df$filter))
ggplot(df)+
geom_bar(aes(filter,-sample_num_remaining), stat="identity", fill="grey70", color="black")+
geom_bar(aes(filter,sample_num_lost), stat="identity",fill="darkred", color="black")+
geom_text(aes(x=filter, y=-min(sample_num_remaining)/2, label=comma(sample_num_remaining)))+
geom_text(aes(x=filter, y=max(sample_num_lost)/1.5, label=comma(sample_num_lost)))+
geom_hline(yintercept=0)+
coord_flip()+theme_bw()+ylab("")+xlab("")+
theme(axis.line = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(colour = "grey20", size=12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
scale_x_discrete(position = "top")