Load required packages
#load packages
library("limma")
library("dplyr")
library("lattice")
library("ggplot2")
library("tidyr")
#set working directory
setwd("~/Desktop/SCHOOL/THESIS/Chapter 2 - Methyl/PAPER/Gitlab uploads/")
#load data on shrew
df <- read.csv("SampleSheetAgeN90final.csv")
#subset samples to only use ones that can be used for ageing (basically remove ones that didn't pass QC)
samples <- subset(df, CanBeUsedForAgingStudies == 'yes')
#look at structure of data frame
str(df)
#load probe data
load("all_probes_sesame_normalized.Rdata")
#match the probe data with your samples info
data <- normalized_betas_sesame %>% tibble::column_to_rownames(var = "CGid")%>%
dplyr::select(samples$Basename)
#look at structure of data frame
str(data)
#load my mapped probes
probe_marie_cinereusshrewn90 <- read.csv("aln_2.csv")
### Data refinement of CpG Sites, based on shared probe mappings
#add new column called "qname" to be able to match CGids with probe data set
data_1<- cbind(qname = rownames(data), data)
#subset columns that have matching CGids (removes about 8 thousand cols)
data_2 <- subset(data_1, qname %in% probe_marie_cinereusshrewn90$qname)
#remove that new column because we need only numeric values in table for matrix step coming up in a bit
data <- data_2[ -c(1)]
#remove fetus from "df" dataset
df_nofet=subset(df, df$Age>0.1)
#remove fetus from "data" based on the "df" dataframe
data_nofet <- subset(data, select = df$Age>0.1)
#add new "Location_1" column to separate Bon Portage Island samples from all others ("yes" = from BPI)
df_nofet<- mutate(df_nofet, Location_1 = ifelse(df_nofet$Location == "Bon Portage Island Nova Scotia", "yes", "no"))
Code inspired by https://rstudio-pubs-static.s3.amazonaws.com/13450_cca08b93a0d94bc2b45c9ed94e230d02.html
#make matrix with variables to be able to run EWAS
#I want my model to include sex, age, tissue type and BPI/others location
(df_nofetMat <- model.matrix(~Location_1 + Age + Sex + Tissue, df_nofet))
#Estimate the correlation between samples from same individual to be able to include and account for it in regression model
cor <- duplicateCorrelation(data_nofet, df_nofetMat, block=df_nofet$AnimalID)
cor$consensus.correlation
#use limma package to run regressions
#need probe dataset (data_nofet), matrix with sample info (df_nofet), and info on duplicate correlation from above
Fit <- lmFit(data_nofet, df_nofetMat, block = df_nofet$AnimalID, correlation = cor$consensus.correlation)
#use eBayes() to moderate the estimated error variances
EbFit <- eBayes(Fit)
#add "CGid" column based on rownames
EbFit$CGid <- rownames(EbFit)
#view info on top significant CpGs
topTable(EbFit)
## Location_1yes Age SexM TissueTail AveExpr
## cg06523030 0.4175807439 0.011054763 0.0007761812 -0.017445389 0.2068136
## cg13102615 0.0158779679 0.005051293 -0.5827512656 0.031988272 0.2825970
## cg13324140 0.0175684687 0.015260792 -0.4806779171 0.026281575 0.2399096
## cg11115577 0.0111941089 0.005109338 0.4096539246 -0.032348835 0.7662909
## cg15281901 0.0009163823 -0.008132844 -0.3669961121 0.016661764 0.2199308
## cg10876267 0.0127565094 -0.006252519 0.6301947558 0.004923112 0.6398498
## cg03178290 0.0064034548 0.014931974 -0.3859389668 0.007419707 0.2593505
## cg06382285 0.0220263697 0.016068915 -0.4471342484 0.024474292 0.2092987
## cg25476308 -0.0367942823 0.012334982 0.0045571277 0.613616435 0.7375965
## cg11584409 0.0046753530 0.010849478 -0.0080296441 0.347239108 0.3667963
## F P.Value adj.P.Val
## cg06523030 1369.2294 2.470725e-42 7.315322e-38
## cg13102615 1274.4949 1.030654e-41 1.525780e-37
## cg13324140 1207.7976 3.005383e-41 2.966112e-37
## cg11115577 1107.5363 1.685741e-40 1.159245e-36
## cg15281901 1099.2404 1.957655e-40 1.159245e-36
## cg10876267 976.6089 2.055283e-39 1.014214e-35
## cg03178290 763.5188 2.713047e-37 1.144063e-33
## cg06382285 758.5046 3.091227e-37 1.144063e-33
## cg25476308 731.9475 6.260289e-37 2.059496e-33
## cg11584409 702.0200 1.430119e-36 4.234295e-33
#save output to computer
write.csv(EbFit, file="~/Desktop/SCHOOL/THESIS/Chapter 2 - Methyl/PAPER/Gitlab uploads/EWAS/EbFit_BPI_effect.csv")
#view data for each variable/coefficient (outputs in order of most significant)
#Location
coeff_2 <- topTable(EbFit,coef=2)
CpG_2 <- rownames(coeff_2)
coeff_2
## logFC AveExpr t P.Value adj.P.Val B
## cg06523030 0.4175807 0.2068136 66.83982 8.574010e-43 2.538593e-38 83.56436
## cg20625795 -0.3536041 0.7265886 -32.99225 1.033809e-30 1.530451e-26 59.50666
## cg03673751 -0.3565916 0.7992837 -18.69703 2.003040e-21 1.976867e-17 38.73787
## cg25476805 -0.3005787 0.7448504 -15.81354 7.599710e-19 5.625305e-15 32.82549
## cg23036219 -0.8375173 0.5987181 -15.63745 1.120608e-18 6.635790e-15 32.43763
## cg17246436 -0.3730393 0.3640227 -14.90609 5.820051e-18 2.872001e-14 30.79087
## cg00903722 -0.2884190 0.7350563 -14.15895 3.324609e-17 1.406215e-13 29.04678
## cg21401008 0.3391333 0.1627410 13.68873 1.028141e-16 3.494350e-13 27.91580
## cg19929340 -0.2392314 0.2552236 -13.67531 1.062184e-16 3.494350e-13 27.88315
## cg21330564 -0.3120941 0.7702570 -13.50446 1.611491e-16 4.670393e-13 27.46539
#Age
coeff_3 <- topTable(EbFit,coef=3)
CpG_3 <- rownames(coeff_3)
coeff_3
## logFC AveExpr t P.Value adj.P.Val B
## cg08938156 0.11340415 0.1773514 10.881687 1.517831e-13 4.493995e-09 20.69216
## cg10427592 -0.11495189 0.9407453 -10.259683 8.759640e-13 7.886733e-09 18.94027
## cg07537139 -0.14071212 0.7344573 -10.236734 9.353850e-13 7.886733e-09 18.87466
## cg26393176 0.06387102 0.1248493 10.118423 1.313445e-12 7.886733e-09 18.53531
## cg00029553 -0.14891122 0.5646610 -10.113584 1.331858e-12 7.886733e-09 18.52139
## cg02034167 -0.11737840 0.8602233 -10.013581 1.777074e-12 8.769267e-09 18.23307
## cg26301436 0.11575889 0.1897811 9.656597 5.027537e-12 2.126505e-08 17.19329
## cg16518996 -0.19061787 0.7793890 -9.485316 8.328317e-12 3.082310e-08 16.68863
## cg15469181 -0.17731380 0.5853877 -9.357266 1.217531e-11 4.005406e-08 16.30893
## cg17157676 -0.24687063 0.6842645 -9.179215 2.071446e-11 5.601522e-08 15.77760
#Sex
coeff_4 <-topTable(EbFit,coef=4)
CpG_4 <- rownames(coeff_4)
coeff_4
## logFC AveExpr t P.Value adj.P.Val B
## cg13102615 -0.5827513 0.2825970 -67.75294 4.995337e-43 1.479019e-38 84.76998
## cg13324140 -0.4806779 0.2399096 -66.32011 1.169843e-42 1.731836e-38 84.09651
## cg11115577 0.4096539 0.7662909 62.71912 1.078167e-41 9.771414e-38 82.30892
## cg15281901 -0.3669961 0.2199308 -62.40058 1.320105e-41 9.771414e-38 82.14390
## cg10876267 0.6301948 0.6398498 59.12479 1.125095e-40 6.662365e-37 80.37689
## cg03178290 -0.3859390 0.2593505 -52.79713 1.001889e-38 4.927059e-35 76.56245
## cg06382285 -0.4471342 0.2092987 -52.59656 1.164868e-38 4.927059e-35 76.43193
## cg04927657 -0.4700621 0.2819531 -50.22955 7.204824e-38 2.666505e-34 74.84231
## cg21064897 -0.4806909 0.2703235 -48.48715 2.908073e-37 9.566915e-34 73.61098
## cg13842554 -0.3021386 0.1864508 -46.52258 1.488135e-36 4.406071e-33 72.15566
#Re-arrange data for plotting
#make matrix
data_nofet_M <- as.matrix(data_nofet)
#transpose data (basically swap rows and columns to have CGid as column and sample ID as rows)
data_nofet_M <- t(data_nofet_M)
#add new column at the start for Basename (copying the actual rownames and making a column out of it)
data_nofet_M <- cbind(Basename = rownames(data_nofet_M), data_nofet_M)
#change back to dataframe
data_nofet_M <- as.data.frame(data_nofet_M)
#bind the re-arranged probe data with the sample data frame (df_nofet) using the "Basename" column
allData <- cbind(df_nofet, Basename = data_nofet_M)
#put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long <- gather(allData, CPG, Value, Basename.cg08938156, Basename.cg10427592, Basename.cg07537139, Basename.cg26393176, Basename.cg00029553, Basename.cg02034167, Basename.cg26301436, Basename.cg16518996, Basename.cg15469181, Basename.cg17157676, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long$Value <- as.numeric(allData_long$Value)
#plot linear regression of DNAm level vs age for top 10 significant CpGs
baseplot <- ggplot(allData_long, aes(allData_long$Age, allData_long$Value, color=allData_long$CPG)) + geom_point(size = 1)
#add title and axis labels
baseplot <- baseplot + ggtitle("CpG methylation over age") + xlab("Age (years)") + ylab("DNAm level")
#make each CpG have a unique color and change legend names
baseplot <- baseplot + scale_colour_discrete(name="CpG", breaks=c("Basename.cg08938156", "Basename.cg10427592", "Basename.cg07537139", "Basename.cg26393176", "Basename.cg00029553", "Basename.cg02034167", "Basename.cg26301436", "Basename.cg16518996", "Basename.cg15469181", "Basename.cg17157676"), labels=c("cg08938156", "cg10427592", "cg07537139", "cg26393176", "cg00029553", "cg02034167", "cg26301436", "cg16518996", "cg15469181", "cg17157676"))
#add regression line
DNAm_Age_plot <- baseplot + geom_smooth(method = "lm")
#view plot
DNAm_Age_plot
#need to update and put CpGs of interest (top 10)in the same column using gather (change wide format to tall)
allData_long_2 <- gather(allData, CPG_2, Value_2, Basename.cg06523030, Basename.cg20625795, Basename.cg03673751, Basename.cg25476805, Basename.cg23036219, Basename.cg17246436, Basename.cg00903722, Basename.cg21401008, Basename.cg19929340, Basename.cg21330564, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long_2$Value_2 <- as.numeric(allData_long_2$Value_2)
##change CpG names for legend, got the info on specific genes from Manhattan plot scripts
allData_long_2$CPG_2 <- factor(allData_long_2$CPG_2, levels = c("Basename.cg06523030", "Basename.cg20625795", "Basename.cg03673751", "Basename.cg25476805", "Basename.cg23036219", "Basename.cg17246436", "Basename.cg00903722", "Basename.cg21401008", "Basename.cg19929340", "Basename.cg21330564"),
labels = c("cg06523030", "cg20625795 SMO_Exon", "cg03673751 ZO1_Exon", "cg25476805 PRP8_Promoter", "cg23036219 CBX5_Intergenic", "cg17246436", "cg00903722 ZBT20_Intergenic", "cg21401008 NR6A1_Intergenic", "cg19929340", "cg21330564 EDNRA_Intergenic"))
library(ggrepel)
#plot boxplots of DNAm level vs location for top 10 significant CpGs
DNAm_Location_plot <- ggplot(allData_long_2, aes(x=allData_long_2$CPG_2, allData_long_2$Value_2, fill=allData_long_2$Location_1)) +
geom_boxplot() +
#put boxplot for each CpG in its own panel, each panel has its own range of values for the y axis, put 5 side by side per row and edit labeller to avoid cropped text
facet_wrap(~allData_long_2$CPG_2, scale="free", ncol =5, labeller = label_wrap_gen(10)) +
#change title and axis names
ggtitle("CpG Methylation Levels") + xlab("") + ylab("DNAm Level") +
#remove x axis labels
scale_x_discrete(labels = NULL, breaks = NULL) +
#select colors (red and blue), specify legend labels
scale_fill_manual(values=c("#fb6869", "#66b1ff"), name="Location", breaks=c("no", "yes"), labels=c("Other", "BPI")) +
#use classic theme
theme_classic() +
#select size for all of the text on the plot
theme(plot.title = element_text(hjust = 0.5, size = 13, face="bold"),
strip.text = element_text(size=11),
axis.text = element_text(size =10),
axis.title = element_text(size = 11),
legend.title = element_text(size=11),
legend.text = element_text(size=10))
#view plot
DNAm_Location_plot
#need to update and put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long_3 <- gather(allData, CPG_3, Value_3, Basename.cg13102615, Basename.cg13324140, Basename.cg11115577, Basename.cg15281901, Basename.cg10876267, Basename.cg03178290, Basename.cg06382285, Basename.cg04927657, Basename.cg21064897, Basename.cg13842554, factor_key=TRUE)
#change cpg methylation value to numeric
allData_long_3$Value_3 <- as.numeric(allData_long_3$Value_3)
#change CpG names for legend
allData_long_3$CPG_3 <- factor(allData_long_3$CPG_3, levels=c("Basename.cg13102615", "Basename.cg13324140", "Basename.cg11115577", "Basename.cg15281901", "Basename.cg10876267", "Basename.cg03178290", "Basename.cg06382285", "Basename.cg04927657", "Basename.cg21064897", "Basename.cg13842554"),labels = c("cg13102615", "cg13324140", "cg11115577", "cg15281901", "cg10876267", "cg03178290", "cg06382285", "cg04927657", "cg21064897", "cg13842554"))
#plot boxplots of DNAm level vs sex for top 10 significant CpGs
DNAm_Sex_plot <- ggplot(allData_long_3, aes(allData_long_3$CPG_3, allData_long_3$Value_3, fill=allData_long_3$Sex)) +
geom_boxplot() +
#put boxplot for each CpG in its own panel, each panel has its own range of values for the y axis and put 5 side by side per row
facet_wrap(~allData_long_3$CPG_3, scale="free", ncol =5) +
#change title and axis names
ggtitle("CpG methylation by sex") + xlab("") + ylab("DNAm level") +
#remove x axis labels
scale_x_discrete(labels = NULL, breaks = NULL) +
#specification for color fill
labs(fill = "Sex") +
#select colors (red and blue)
scale_fill_manual(values=c("#fb6869", "#66b1ff")) +
#use theme classic
theme_classic() +
#select size for all of the text on the plot
theme(plot.title = element_text(hjust = 0.5, size = 13, face="bold"),
strip.text = element_text(size=11),
axis.text = element_text(size =10),
axis.title = element_text(size = 11),
legend.title = element_text(size=11),
legend.text = element_text(size=10))
#view plot
DNAm_Sex_plot