EWAS Island Effect

Load required packages

#load packages
library("limma")
library("dplyr")
library("lattice")
library("ggplot2")
library("tidyr")

#set working directory
setwd("~/Desktop/SCHOOL/THESIS/Chapter 2 - Methyl/PAPER/Gitlab uploads/")

1. Load data with info for all samples and the methylation probe data

#load data on shrew
df <- read.csv("SampleSheetAgeN90final.csv")
#subset samples to only use ones that can be used for ageing (basically remove ones that didn't pass QC)
samples <- subset(df, CanBeUsedForAgingStudies == 'yes') 
#look at structure of data frame
str(df)

#load probe data
load("all_probes_sesame_normalized.Rdata")
#match the probe data with your samples info
data <- normalized_betas_sesame %>% tibble::column_to_rownames(var = "CGid")%>%
  dplyr::select(samples$Basename)
#look at structure of data frame
str(data)

#load my mapped probes
probe_marie_cinereusshrewn90 <- read.csv("aln_2.csv")

### Data refinement of CpG Sites, based on shared probe mappings
#add new column called "qname" to be able to match CGids with probe data set 
data_1<- cbind(qname = rownames(data), data)
#subset columns that have matching CGids (removes about 8 thousand cols)
data_2 <- subset(data_1, qname %in% probe_marie_cinereusshrewn90$qname)
#remove that new column because we need only numeric values in table for matrix step coming up in a bit
data <- data_2[ -c(1)]

#remove fetus from "df" dataset
df_nofet=subset(df, df$Age>0.1)
#remove fetus from "data" based on the "df" dataframe
data_nofet <- subset(data, select = df$Age>0.1)
#add new "Location_1" column to separate Islands from all others ("yes" = from BPI and Long Island)
df_nofet<- mutate(df_nofet, Location_1 = ifelse(df_nofet$Location == "Bon Portage Island Nova Scotia" | df_nofet$Location == "Long Island Nova Scotia", "yes", "no"))

2. EWAS of island effect

Code inspired by https://rstudio-pubs-static.s3.amazonaws.com/13450_cca08b93a0d94bc2b45c9ed94e230d02.html

#make matrix with variables to be able to run EWAS
#I want my model to include  sex, age, tissue type and island/mainland location
(df_nofetMat <- model.matrix(~Location_1 + Age + Sex + Tissue, df_nofet))

#Estimate the correlation between samples from same individual to be able to include and account for it in regression model
cor <- duplicateCorrelation(data_nofet, df_nofetMat, block=df_nofet$AnimalID)
cor$consensus.correlation

#use limma package to run regressions
#need probe dataset (data_nofet), matrix with sample info (df_nofet), and info on duplicat correlation form above
Fit <- lmFit(data_nofet, df_nofetMat, block = df_nofet$AnimalID, correlation = cor$consensus.correlation)
#use eBayes() to moderate the estimated error variances
EbFit <- eBayes(Fit)
#add "CGid" column based on rownames
EbFit$CGid <- rownames(EbFit)
#view info on top significant CpGs
topTable(EbFit)

##            Location_1yes          Age          SexM   TissueTail   AveExpr
## cg13324140   0.023898685  0.000455792 -0.4780987927  0.029949026 0.2399096
## cg13102615   0.012650773 -0.001167773 -0.5821757698  0.036474903 0.2825970
## cg15281901   0.003643584 -0.010671132 -0.3668983853  0.016464649 0.2199308
## cg11115577   0.005463240  0.003082504  0.4098490152 -0.029035604 0.7662909
## cg10876267   0.021014780 -0.020828952  0.6320186474  0.006889635 0.6398498
## cg03178290   0.012859004  0.006276269 -0.3846935256  0.008175150 0.2593505
## cg11584409  -0.008190076  0.018880259 -0.0093752371  0.350460757 0.3667963
## cg20468818   0.026795743 -0.008501356  0.0016643980 -0.351333776 0.1515623
## cg06382285   0.020073997  0.005272771 -0.4453586035  0.030834126 0.2092987
## cg25476308  -0.032068098  0.029921197  0.0003450775  0.602861388 0.7375965
##                    F      P.Value    adj.P.Val
## cg13324140 1249.2828 1.469106e-41 4.349729e-37
## cg13102615 1181.2618 4.480771e-41 6.633333e-37
## cg15281901 1077.3673 2.800252e-40 2.763662e-36
## cg11115577 1008.8788 1.034070e-39 7.654188e-36
## cg10876267  989.6848 1.514960e-39 8.970985e-36
## cg03178290  757.8907 3.024636e-37 1.492557e-33
## cg11584409  745.4742 4.195771e-37 1.774691e-33
## cg20468818  695.9869 1.634980e-36 5.848164e-33
## cg06382285  693.0504 1.777677e-36 5.848164e-33
## cg25476308  688.4863 2.025995e-36 5.998565e-33

#save output to computer
write.csv(EbFit, file="~/Desktop/SCHOOL/THESIS/Chapter 2 - Methyl/PAPER/Gitlab uploads/EWAS/EbFit_Island_effect.csv")

#view data for each variable/coefficient (outputs in order of most significant)
#Location
coeff_2 <- topTable(EbFit,coef=2)
CpG_2 <- rownames(coeff_2)
coeff_2

##                 logFC   AveExpr         t      P.Value    adj.P.Val        B
## cg11703808  0.1546308 0.7561761  18.42500 3.345181e-21 9.904411e-17 38.19944
## cg09227150  0.4427736 0.3073579  17.26261 3.443455e-20 5.097691e-16 35.90139
## cg00152260  0.4083112 0.2873600  16.65221 1.230189e-19 1.214115e-15 34.64297
## cg22063749 -0.3683613 0.6958420 -16.19470 3.270096e-19 2.420525e-15 33.67522
## cg25115601 -0.2423548 0.7050174 -13.23223 3.121381e-16 1.848357e-12 26.85343
## cg24102071 -0.4622002 0.5973457 -12.94702 6.368567e-16 3.142675e-12 26.14201
## cg11099953 -0.6953243 0.3970198 -12.61113 1.493760e-15 6.318178e-12 25.29104
## cg11065917 -0.2987741 0.7062629 -12.15911 4.809244e-15 1.779901e-11 24.12318
## cg14198079 -0.2525598 0.7462261 -11.80913 1.210301e-14 3.981621e-11 23.20082
## cg08887457 -0.3467049 0.5162934 -11.76437 1.363482e-14 4.036998e-11 23.08169

#Age
coeff_3 <- topTable(EbFit,coef=3)
CpG_3 <- rownames(coeff_3)
coeff_3

##                  logFC   AveExpr         t      P.Value    adj.P.Val         B
## cg10427592 -0.12768122 0.9407453 -9.100419 2.610087e-11 7.727947e-07 15.669645
## cg09313626 -0.10710763 0.5218670 -8.140531 4.935685e-10 5.416711e-06 12.754053
## cg00505445 -0.17455241 0.7352356 -8.106482 5.488426e-10 5.416711e-06 12.648774
## cg07302051 -0.13610351 0.7062077 -7.742303 1.720632e-09 1.273612e-05 11.515670
## cg08938156  0.09366198 0.1773514  7.628787 2.463132e-09 1.458569e-05 11.160001
## cg15647667 -0.11548521 0.8475586 -7.528962 3.379922e-09 1.667879e-05 10.846337
## cg00029553 -0.13603649 0.5646610 -7.336819 6.229197e-09 2.634772e-05 10.240401
## cg08255799 -0.15688131 0.5347054 -7.141075 1.164682e-08 4.012822e-05  9.620430
## cg18560935 -0.10303860 0.6753129 -7.126649 1.219785e-08 4.012822e-05  9.574643
## cg27021764 -0.16001788 0.5927151 -7.043503 1.592625e-08 4.626779e-05  9.310503

#Sex
coeff_4 <-topTable(EbFit,coef=4)
CpG_4 <- rownames(coeff_4)
coeff_4

##                 logFC   AveExpr         t      P.Value    adj.P.Val        B
## cg13324140 -0.4780988 0.2399096 -65.91993 1.426329e-42 4.223075e-38 83.61169
## cg13102615 -0.5821758 0.2825970 -64.03502 4.527003e-42 6.701775e-38 82.70143
## cg15281901 -0.3668984 0.2199308 -60.71107 3.773130e-41 3.723828e-37 80.99960
## cg11115577  0.4098490 0.7662909  58.99824 1.176794e-40 7.688931e-37 80.07099
## cg10876267  0.6320186 0.6398498  58.85233 1.298455e-40 7.688931e-37 79.99018
## cg03178290 -0.3846935 0.2593505 -51.51575 2.553312e-38 1.259974e-34 75.54299
## cg06382285 -0.4453586 0.2092987 -49.21557 1.556168e-37 6.582145e-34 73.97562
## cg04927657 -0.4694932 0.2819531 -47.72019 5.268118e-37 1.949731e-33 72.90624
## cg21064897 -0.4781476 0.2703235 -47.16746 8.346024e-37 2.635556e-33 72.50036
## cg13842554 -0.2996719 0.1864508 -47.09055 8.901499e-37 2.635556e-33 72.44341

#Re-arrange data for plotting
#make matrix
data_nofet_M <- as.matrix(data_nofet)
#transpose data (basically swap rows and columns to have CGid as column and sample ID as rows)
data_nofet_M <- t(data_nofet_M)
#add new column at the start for Basename (copying the actual rownames and making a column out of it)
data_nofet_M <- cbind(Basename = rownames(data_nofet_M), data_nofet_M)
#change back to dataframe
data_nofet_M <- as.data.frame(data_nofet_M)
#bind the re-arranged probe data with the sample data frame (df_nofet) using the "Basename" column
allData <- cbind(df_nofet, Basename = data_nofet_M)

3. Plotting significant CpGs

1. Linear regression of DNAm level vs age for top 10 significant CpGs

#put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long <- gather(allData, CPG, Value, Basename.cg10427592, Basename.cg09313626, Basename.cg00505445, Basename.cg07302051, Basename.cg08938156, Basename.cg15647667, Basename.cg00029553, Basename.cg08255799, Basename.cg18560935, Basename.cg27021764, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long$Value <- as.numeric(allData_long$Value)
#plot linear regression of DNAm level vs age for top 10 significant CpGs
baseplot <- ggplot(allData_long, aes(allData_long$Age, allData_long$Value, color=allData_long$CPG)) + geom_point(size = 1)
#add title and axis labels
baseplot <- baseplot + ggtitle("CpG methylation over age") + xlab("Age (years)") + ylab("DNAm level")
#make each CpG have a unique color and change legend names
baseplot <- baseplot + scale_colour_discrete(name="CpG", breaks=c("Basename.cg10427592", "Basename.cg09313626", "Basename.cg00505445", "Basename.cg07302051", "Basename.cg08938156", "Basename.cg15647667", "Basename.cg00029553", "Basename.cg08255799", "Basename.cg18560935", "Basename.cg27021764"), labels=c("cg10427592", "cg09313626", "cg00505445", "cg07302051", "cg08938156", "cg15647667", "cg00029553", "cg08255799", "cg18560935", "cg27021764"))
#add regression line
DNAm_Age_plot <- baseplot + geom_smooth(method = "lm") 
#view plot
DNAm_Age_plot

2. Boxplots DNAm level vs location for top 10 significant CpGs

#need to update and put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long_2 <- gather(allData, CPG_2, Value_2, Basename.cg11703808, Basename.cg09227150, Basename.cg00152260, Basename.cg22063749, Basename.cg25115601, Basename.cg24102071, Basename.cg11099953, Basename.cg11065917, Basename.cg14198079, Basename.cg08887457, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long_2$Value_2 <- as.numeric(allData_long_2$Value_2)
#change CpG names for legend, got the info on specific genes from Manhattan plot scripts
allData_long_2$CPG_2 <- factor(allData_long_2$CPG_2, levels=c("Basename.cg11703808", "Basename.cg09227150", "Basename.cg00152260", "Basename.cg22063749", "Basename.cg25115601", "Basename.cg24102071", "Basename.cg11099953", "Basename.cg11065917", "Basename.cg14198079", "Basename.cg08887457"), labels=c("cg11703808 EWS_Exon", "cg09227150" , "cg00152260 PO4F3_Intron", "cg22063749 UBP54_Intron", "cg25115601 TRI47_Exon", "cg24102071 ATPA_Exon"," cg11099953 TSH3_Intergenic", "cg11065917 COE4_Exon", "cg14198079", "cg08887457"))

library(ggrepel)
#plot boxplots of DNAm level vs location for top 10 significant CpGs
DNAm_Location_plot <- ggplot(allData_long_2, aes(x=allData_long_2$CPG_2, allData_long_2$Value_2, fill=allData_long_2$Location_1)) +
  geom_boxplot() + 
  #put boxplot for each CpG in its own panel, each panel has its own range of values for the y axis, put 5 side by side per row and   edit labeller to avoid cropped text
  facet_wrap(~allData_long_2$CPG_2, scale="free", ncol =5, labeller = label_wrap_gen(10)) +
  #change title and axis names
  ggtitle("CpG Methylation Levels") + xlab("") + ylab("DNAm Level") +
  #remove x axis labels
  scale_x_discrete(labels = NULL, breaks = NULL) + 
  #select colors (red and blue), specify legend labels
  scale_fill_manual(values=c("#fb6869", "#66b1ff"), name="Location", breaks=c("no", "yes"), labels=c("Mainland", "Island")) +
  #use classic theme
  theme_classic() +
  #select size for all of the text on the plot
  theme(plot.title = element_text(hjust = 0.5, size = 13, face="bold"),
        strip.text = element_text(size=11),
        axis.text = element_text(size =10),
        axis.title = element_text(size = 11),
        legend.title = element_text(size=11),
        legend.text = element_text(size=10))
#view plot
DNAm_Location_plot

3. Boxplots DNAm level vs sex for top 10 significant CpGs

#need to update and put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long_3 <- gather(allData, CPG_3, Value_3, Basename.cg13324140, Basename.cg13102615, Basename.cg15281901, Basename.cg11115577, Basename.cg10876267, Basename.cg03178290, Basename.cg06382285, Basename.cg04927657, Basename.cg21064897, Basename.cg13842554, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long_3$Value_3 <- as.numeric(allData_long_3$Value_3)
#change CpG names for legend
allData_long_3$CPG_3 <- factor(allData_long_3$CPG_3, levels=c("Basename.cg13324140", "Basename.cg13102615", "Basename.cg15281901", "Basename.cg11115577", "Basename.cg10876267", "Basename.cg03178290", "Basename.cg06382285", "Basename.cg04927657", "Basename.cg21064897", "Basename.cg13842554"),labels = c("cg13324140", "cg13102615", "cg15281901", "cg11115577", "cg10876267", "cg03178290", "cg06382285", "cg04927657", "cg21064897", "cg13842554"))

#plot boxplots of DNAm level vs sex for top 10 significant CpGs
DNAm_Sex_plot <- ggplot(allData_long_3, aes(allData_long_3$CPG_3, allData_long_3$Value_3, fill=allData_long_3$Sex)) + 
  geom_boxplot() + 
  #put boxplot for each CpG in its own panel, each panel has its own range of values for the y axis and put 5 side by side per row 
  facet_wrap(~allData_long_3$CPG_3, scale="free", ncol =5) +
  #change title and axis names
  ggtitle("CpG methylation by sex") + xlab("") + ylab("DNAm level") +
  #remove x axis labels
  scale_x_discrete(labels = NULL, breaks = NULL) + 
  #specification for color fill 
  labs(fill = "Sex") +
  #select colors (red and blue)
  scale_fill_manual(values=c("#fb6869", "#66b1ff")) +
  #use theme classic
  theme_classic() +
  #select size for all of the text on the plot
  theme(plot.title = element_text(hjust = 0.5, size = 13, face="bold"),
        strip.text = element_text(size=11),
        axis.text = element_text(size =10),
        axis.title = element_text(size = 11),
        legend.title = element_text(size=11),
        legend.text = element_text(size=10))
#view plot
DNAm_Sex_plot

EWAS Island Effect

Marie-Laurence Cossette

18/07/2022

1. Load data with info for all samples and the methylation probe data

2. EWAS of island effect

3. Plotting significant CpGs

1. Linear regression of DNAm level vs age for top 10 significant CpGs

2. Boxplots DNAm level vs location for top 10 significant CpGs

3. Boxplots DNAm level vs sex for top 10 significant CpGs