EWAS Bon Portage Island Effect

Load required packages

#load packages
library("limma")
library("dplyr")
library("lattice")
library("ggplot2")
library("tidyr")

#set working directory
setwd("~/Desktop/SCHOOL/THESIS/Chapter 2 - Methyl/PAPER/Gitlab uploads/")

1. Load data with info for all samples and the methylation probe data

#load data on shrew
df <- read.csv("SampleSheetAgeN90final.csv")
#subset samples to only use ones that can be used for ageing (basically remove ones that didn't pass QC)
samples <- subset(df, CanBeUsedForAgingStudies == 'yes') 
#look at structure of data frame
str(df)

#load probe data
load("all_probes_sesame_normalized.Rdata")
#match the probe data with your samples info
data <- normalized_betas_sesame %>% tibble::column_to_rownames(var = "CGid")%>%
  dplyr::select(samples$Basename)
#look at structure of data frame
str(data)

#load my mapped probes
probe_marie_cinereusshrewn90 <- read.csv("aln_2.csv")

### Data refinement of CpG Sites, based on shared probe mappings
#add new column called "qname" to be able to match CGids with probe data set 
data_1<- cbind(qname = rownames(data), data)
#subset columns that have matching CGids (removes about 8 thousand cols)
data_2 <- subset(data_1, qname %in% probe_marie_cinereusshrewn90$qname)
#remove that new column because we need only numeric values in table for matrix step coming up in a bit
data <- data_2[ -c(1)]

#remove fetus from "df" dataset
df_nofet=subset(df, df$Age>0.1)
#remove fetus from "data" based on the "df" dataframe
data_nofet <- subset(data, select = df$Age>0.1)
#add new "Location_1" column to separate Bon Portage Island samples from all others ("yes" = from BPI)
df_nofet<- mutate(df_nofet, Location_1 = ifelse(df_nofet$Location == "Bon Portage Island Nova Scotia", "yes", "no"))

2. EWAS of island effect

Code inspired by https://rstudio-pubs-static.s3.amazonaws.com/13450_cca08b93a0d94bc2b45c9ed94e230d02.html

#make matrix with variables to be able to run EWAS
#I want my model to include sex, age, tissue type and BPI/others location
(df_nofetMat <- model.matrix(~Location_1 + Age + Sex + Tissue, df_nofet))

#Estimate the correlation between samples from same individual to be able to include and account for it in regression model
cor <- duplicateCorrelation(data_nofet, df_nofetMat, block=df_nofet$AnimalID)
cor$consensus.correlation

#use limma package to run regressions
#need probe dataset (data_nofet), matrix with sample info (df_nofet), and info on duplicate correlation from above
Fit <- lmFit(data_nofet, df_nofetMat, block = df_nofet$AnimalID, correlation = cor$consensus.correlation)
#use eBayes() to moderate the estimated error variances
EbFit <- eBayes(Fit)
#add "CGid" column based on rownames
EbFit$CGid <- rownames(EbFit)
#view info on top significant CpGs
topTable(EbFit)

##            Location_1yes          Age          SexM   TissueTail   AveExpr
## cg06523030  0.4175807439  0.011054763  0.0007761812 -0.017445389 0.2068136
## cg13102615  0.0158779679  0.005051293 -0.5827512656  0.031988272 0.2825970
## cg13324140  0.0175684687  0.015260792 -0.4806779171  0.026281575 0.2399096
## cg11115577  0.0111941089  0.005109338  0.4096539246 -0.032348835 0.7662909
## cg15281901  0.0009163823 -0.008132844 -0.3669961121  0.016661764 0.2199308
## cg10876267  0.0127565094 -0.006252519  0.6301947558  0.004923112 0.6398498
## cg03178290  0.0064034548  0.014931974 -0.3859389668  0.007419707 0.2593505
## cg06382285  0.0220263697  0.016068915 -0.4471342484  0.024474292 0.2092987
## cg25476308 -0.0367942823  0.012334982  0.0045571277  0.613616435 0.7375965
## cg11584409  0.0046753530  0.010849478 -0.0080296441  0.347239108 0.3667963
##                    F      P.Value    adj.P.Val
## cg06523030 1369.2294 2.470725e-42 7.315322e-38
## cg13102615 1274.4949 1.030654e-41 1.525780e-37
## cg13324140 1207.7976 3.005383e-41 2.966112e-37
## cg11115577 1107.5363 1.685741e-40 1.159245e-36
## cg15281901 1099.2404 1.957655e-40 1.159245e-36
## cg10876267  976.6089 2.055283e-39 1.014214e-35
## cg03178290  763.5188 2.713047e-37 1.144063e-33
## cg06382285  758.5046 3.091227e-37 1.144063e-33
## cg25476308  731.9475 6.260289e-37 2.059496e-33
## cg11584409  702.0200 1.430119e-36 4.234295e-33

#save output to computer
write.csv(EbFit, file="~/Desktop/SCHOOL/THESIS/Chapter 2 - Methyl/PAPER/Gitlab uploads/EWAS/EbFit_BPI_effect.csv")

#view data for each variable/coefficient (outputs in order of most significant)
#Location
coeff_2 <- topTable(EbFit,coef=2)
CpG_2 <- rownames(coeff_2)
coeff_2

##                 logFC   AveExpr         t      P.Value    adj.P.Val        B
## cg06523030  0.4175807 0.2068136  66.83982 8.574010e-43 2.538593e-38 83.56436
## cg20625795 -0.3536041 0.7265886 -32.99225 1.033809e-30 1.530451e-26 59.50666
## cg03673751 -0.3565916 0.7992837 -18.69703 2.003040e-21 1.976867e-17 38.73787
## cg25476805 -0.3005787 0.7448504 -15.81354 7.599710e-19 5.625305e-15 32.82549
## cg23036219 -0.8375173 0.5987181 -15.63745 1.120608e-18 6.635790e-15 32.43763
## cg17246436 -0.3730393 0.3640227 -14.90609 5.820051e-18 2.872001e-14 30.79087
## cg00903722 -0.2884190 0.7350563 -14.15895 3.324609e-17 1.406215e-13 29.04678
## cg21401008  0.3391333 0.1627410  13.68873 1.028141e-16 3.494350e-13 27.91580
## cg19929340 -0.2392314 0.2552236 -13.67531 1.062184e-16 3.494350e-13 27.88315
## cg21330564 -0.3120941 0.7702570 -13.50446 1.611491e-16 4.670393e-13 27.46539

#Age
coeff_3 <- topTable(EbFit,coef=3)
CpG_3 <- rownames(coeff_3)
coeff_3

##                  logFC   AveExpr          t      P.Value    adj.P.Val        B
## cg08938156  0.11340415 0.1773514  10.881687 1.517831e-13 4.493995e-09 20.69216
## cg10427592 -0.11495189 0.9407453 -10.259683 8.759640e-13 7.886733e-09 18.94027
## cg07537139 -0.14071212 0.7344573 -10.236734 9.353850e-13 7.886733e-09 18.87466
## cg26393176  0.06387102 0.1248493  10.118423 1.313445e-12 7.886733e-09 18.53531
## cg00029553 -0.14891122 0.5646610 -10.113584 1.331858e-12 7.886733e-09 18.52139
## cg02034167 -0.11737840 0.8602233 -10.013581 1.777074e-12 8.769267e-09 18.23307
## cg26301436  0.11575889 0.1897811   9.656597 5.027537e-12 2.126505e-08 17.19329
## cg16518996 -0.19061787 0.7793890  -9.485316 8.328317e-12 3.082310e-08 16.68863
## cg15469181 -0.17731380 0.5853877  -9.357266 1.217531e-11 4.005406e-08 16.30893
## cg17157676 -0.24687063 0.6842645  -9.179215 2.071446e-11 5.601522e-08 15.77760

#Sex
coeff_4 <-topTable(EbFit,coef=4)
CpG_4 <- rownames(coeff_4)
coeff_4

##                 logFC   AveExpr         t      P.Value    adj.P.Val        B
## cg13102615 -0.5827513 0.2825970 -67.75294 4.995337e-43 1.479019e-38 84.76998
## cg13324140 -0.4806779 0.2399096 -66.32011 1.169843e-42 1.731836e-38 84.09651
## cg11115577  0.4096539 0.7662909  62.71912 1.078167e-41 9.771414e-38 82.30892
## cg15281901 -0.3669961 0.2199308 -62.40058 1.320105e-41 9.771414e-38 82.14390
## cg10876267  0.6301948 0.6398498  59.12479 1.125095e-40 6.662365e-37 80.37689
## cg03178290 -0.3859390 0.2593505 -52.79713 1.001889e-38 4.927059e-35 76.56245
## cg06382285 -0.4471342 0.2092987 -52.59656 1.164868e-38 4.927059e-35 76.43193
## cg04927657 -0.4700621 0.2819531 -50.22955 7.204824e-38 2.666505e-34 74.84231
## cg21064897 -0.4806909 0.2703235 -48.48715 2.908073e-37 9.566915e-34 73.61098
## cg13842554 -0.3021386 0.1864508 -46.52258 1.488135e-36 4.406071e-33 72.15566

#Re-arrange data for plotting
#make matrix
data_nofet_M <- as.matrix(data_nofet)
#transpose data (basically swap rows and columns to have CGid as column and sample ID as rows)
data_nofet_M <- t(data_nofet_M)
#add new column at the start for Basename (copying the actual rownames and making a column out of it)
data_nofet_M <- cbind(Basename = rownames(data_nofet_M), data_nofet_M)
#change back to dataframe
data_nofet_M <- as.data.frame(data_nofet_M)
#bind the re-arranged probe data with the sample data frame (df_nofet) using the "Basename" column
allData <- cbind(df_nofet, Basename = data_nofet_M)

3. Plotting significant CpGs

1. Linear regression of DNAm level vs age for top 10 significant CpGs

#put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long <- gather(allData, CPG, Value, Basename.cg08938156, Basename.cg10427592, Basename.cg07537139, Basename.cg26393176, Basename.cg00029553, Basename.cg02034167, Basename.cg26301436, Basename.cg16518996, Basename.cg15469181, Basename.cg17157676, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long$Value <- as.numeric(allData_long$Value)
#plot linear regression of DNAm level vs age for top 10 significant CpGs
baseplot <- ggplot(allData_long, aes(allData_long$Age, allData_long$Value, color=allData_long$CPG)) + geom_point(size = 1)
#add title and axis labels
baseplot <- baseplot + ggtitle("CpG methylation over age") + xlab("Age (years)") + ylab("DNAm level")
#make each CpG have a unique color and change legend names
baseplot <- baseplot + scale_colour_discrete(name="CpG", breaks=c("Basename.cg08938156", "Basename.cg10427592", "Basename.cg07537139", "Basename.cg26393176", "Basename.cg00029553", "Basename.cg02034167", "Basename.cg26301436", "Basename.cg16518996", "Basename.cg15469181", "Basename.cg17157676"), labels=c("cg08938156", "cg10427592", "cg07537139", "cg26393176", "cg00029553", "cg02034167", "cg26301436", "cg16518996", "cg15469181", "cg17157676"))
#add regression line
DNAm_Age_plot <- baseplot + geom_smooth(method = "lm") 
#view plot
DNAm_Age_plot

2. Boxplots DNAm level vs location for top 10 significant CpGs

#need to update and put CpGs of interest (top 10)in the same column using gather (change wide format to tall)
allData_long_2 <- gather(allData, CPG_2, Value_2, Basename.cg06523030, Basename.cg20625795, Basename.cg03673751, Basename.cg25476805, Basename.cg23036219, Basename.cg17246436, Basename.cg00903722,  Basename.cg21401008, Basename.cg19929340, Basename.cg21330564, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long_2$Value_2 <- as.numeric(allData_long_2$Value_2)
##change CpG names for legend, got the info on specific genes from Manhattan plot scripts
allData_long_2$CPG_2 <- factor(allData_long_2$CPG_2, levels = c("Basename.cg06523030", "Basename.cg20625795", "Basename.cg03673751", "Basename.cg25476805", "Basename.cg23036219", "Basename.cg17246436", "Basename.cg00903722",  "Basename.cg21401008", "Basename.cg19929340", "Basename.cg21330564"), 
labels = c("cg06523030", "cg20625795 SMO_Exon", "cg03673751 ZO1_Exon", "cg25476805 PRP8_Promoter", "cg23036219 CBX5_Intergenic", "cg17246436", "cg00903722 ZBT20_Intergenic", "cg21401008 NR6A1_Intergenic", "cg19929340", "cg21330564 EDNRA_Intergenic"))

library(ggrepel)
#plot boxplots of DNAm level vs location for top 10 significant CpGs 
DNAm_Location_plot <- ggplot(allData_long_2, aes(x=allData_long_2$CPG_2, allData_long_2$Value_2, fill=allData_long_2$Location_1)) +
  geom_boxplot() + 
  #put boxplot for each CpG in its own panel, each panel has its own range of values for the y axis, put 5 side by side per row and edit labeller to avoid cropped text
  facet_wrap(~allData_long_2$CPG_2, scale="free", ncol =5, labeller = label_wrap_gen(10)) +
  #change title and axis names
  ggtitle("CpG Methylation Levels") + xlab("") + ylab("DNAm Level") +
  #remove x axis labels
  scale_x_discrete(labels = NULL, breaks = NULL) + 
   #select colors (red and blue), specify legend labels
  scale_fill_manual(values=c("#fb6869", "#66b1ff"), name="Location", breaks=c("no", "yes"), labels=c("Other", "BPI")) +
  #use classic theme
  theme_classic() +
  #select size for all of the text on the plot
  theme(plot.title = element_text(hjust = 0.5, size = 13, face="bold"),
        strip.text = element_text(size=11),
        axis.text = element_text(size =10),
        axis.title = element_text(size = 11),
        legend.title = element_text(size=11),
        legend.text = element_text(size=10))
#view plot
DNAm_Location_plot

3. Boxplots DNAm level vs sex for top 10 significant CpGs

#need to update and put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long_3 <- gather(allData, CPG_3, Value_3, Basename.cg13102615, Basename.cg13324140, Basename.cg11115577, Basename.cg15281901, Basename.cg10876267, Basename.cg03178290, Basename.cg06382285, Basename.cg04927657, Basename.cg21064897, Basename.cg13842554, factor_key=TRUE)
#change cpg methylation value to numeric
allData_long_3$Value_3 <- as.numeric(allData_long_3$Value_3)
#change CpG names for legend
allData_long_3$CPG_3 <- factor(allData_long_3$CPG_3, levels=c("Basename.cg13102615", "Basename.cg13324140", "Basename.cg11115577", "Basename.cg15281901", "Basename.cg10876267", "Basename.cg03178290", "Basename.cg06382285", "Basename.cg04927657", "Basename.cg21064897", "Basename.cg13842554"),labels = c("cg13102615", "cg13324140", "cg11115577", "cg15281901", "cg10876267", "cg03178290", "cg06382285", "cg04927657", "cg21064897", "cg13842554"))

#plot boxplots of DNAm level vs sex for top 10 significant CpGs
DNAm_Sex_plot <- ggplot(allData_long_3, aes(allData_long_3$CPG_3, allData_long_3$Value_3, fill=allData_long_3$Sex)) + 
  geom_boxplot() + 
  #put boxplot for each CpG in its own panel, each panel has its own range of values for the y axis and put 5 side by side per row
  facet_wrap(~allData_long_3$CPG_3, scale="free", ncol =5) +
  #change title and axis names
  ggtitle("CpG methylation by sex") + xlab("") + ylab("DNAm level") +
   #remove x axis labels
  scale_x_discrete(labels = NULL, breaks = NULL) + 
  #specification for color fill 
  labs(fill = "Sex") + 
  #select colors (red and blue)
  scale_fill_manual(values=c("#fb6869", "#66b1ff")) +
  #use theme classic
  theme_classic() +
  #select size for all of the text on the plot
  theme(plot.title = element_text(hjust = 0.5, size = 13, face="bold"),
        strip.text = element_text(size=11),
        axis.text = element_text(size =10),
        axis.title = element_text(size = 11),
        legend.title = element_text(size=11),
        legend.text = element_text(size=10))
#view plot
DNAm_Sex_plot

EWAS Bon Portage Island Effect

Marie-Laurence Cossette

23/07/2022

1. Load data with info for all samples and the methylation probe data

2. EWAS of island effect

3. Plotting significant CpGs

1. Linear regression of DNAm level vs age for top 10 significant CpGs

2. Boxplots DNAm level vs location for top 10 significant CpGs

3. Boxplots DNAm level vs sex for top 10 significant CpGs