EWAS Long Island Effect

Load required packages

#load packages
library("limma")
library("dplyr")
library("lattice")
library("ggplot2")
library("tidyr")

#set working directory
setwd("~/Desktop/SCHOOL/THESIS/Chapter 2 - Methyl/PAPER/Gitlab uploads/")

1. Load data with info for all samples and the methylation probe data

#load data on shrew
df <- read.csv("SampleSheetAgeN90final.csv")
#subset samples to only use ones that can be used for ageing (basically remove ones that didn't pass QC)
samples <- subset(df, CanBeUsedForAgingStudies == 'yes') 
#look at structure of data frame
str(df)

#load probe data
load("all_probes_sesame_normalized.Rdata")
#match the probe data with your samples info
data <- normalized_betas_sesame %>% tibble::column_to_rownames(var = "CGid")%>%
  dplyr::select(samples$Basename)
#look at structure of data frame
str(data)

#load my mapped probes
probe_marie_cinereusshrewn90 <- read.csv("aln_2.csv")

### Data refinement of CpG Sites, based on shared probe mappings
#add new column called "qname" to be able to match CGids with probe data set 
data_1<- cbind(qname = rownames(data), data)
#subset columns that have matching CGids (removes about 8 thousand cols)
data_2 <- subset(data_1, qname %in% probe_marie_cinereusshrewn90$qname)
#remove that new column because we need only numeric values in table for matrix step coming up in a bit
data <- data_2[ -c(1)]

#remove fetus from "df" dataset
df_nofet=subset(df, df$Age>0.1)
#remove fetus from "data" based on the "df" dataframe
data_nofet <- subset(data, select = df$Age>0.1)
#remove BPI shrews from both data sets
df_LI= subset(df_nofet, df_nofet$Location != "Bon Portage Island Nova Scotia")
data_LI <- subset(data_nofet, select = df_nofet$Location != "Bon Portage Island Nova Scotia")
#add new "Location_1" column to separate Long Island samples from all others ("yes" = from Long Island)
df_LI<- mutate(df_LI, Location_1 = ifelse(df_LI$Location == "Long Island Nova Scotia", "yes", "no"))

2. EWAS of island effect

Code inspired by https://rstudio-pubs-static.s3.amazonaws.com/13450_cca08b93a0d94bc2b45c9ed94e230d02.html

#make matrix with variables to be able to run EWAS
#I want my model to include sex, age, tissue type and Long Island/others location
(df_LIMat <- model.matrix(~Location_1 + Age + Sex + Tissue, df_LI))

#Estimate the correlation between samples from same individual to be able to include and account for it in regression model
cor <- duplicateCorrelation(data_LI, df_LIMat, block=df_LI$AnimalID)
cor$consensus.correlation

#use limma package to run regressions
#need probe dataset (data_LI), matrix with sample info (df_LI), and info on duplicate correlation from above
Fit <- lmFit(data_LI, df_LIMat, block = df_LI$AnimalID, correlation = cor$consensus.correlation)
#use eBayes() to moderate the estimated error variances
EbFit <- eBayes(Fit)
#add "CGid" column based on rownames
EbFit$CGid <- rownames(EbFit)
#view info on top significant CpGs
topTable(EbFit)

##            Location_1yes          Age         SexM   TissueTail   AveExpr
## cg10876267  -0.003345980  0.015349762  0.658177260  0.004722296 0.6225154
## cg25476308   0.009269547 -0.007161175  0.015273830  0.614045977 0.6625765
## cg21064897   0.025013169  0.001894787 -0.471260158  0.013639004 0.2708862
## cg13324140   0.043934973 -0.036122928 -0.459417309  0.029548103 0.2383797
## cg15894722   0.015668478 -0.015050080 -0.001596506  0.652299735 0.4370756
## cg04927657  -0.001923276  0.009815394 -0.467404142  0.016454185 0.2843306
## cg20468818   0.018825048  0.004235476  0.006536476 -0.351620355 0.1959083
## cg11115577  -0.012804726  0.020385455  0.409916218 -0.032813146 0.7585485
## cg13102615   0.011377825 -0.012669210 -0.564337286  0.032296339 0.2833316
## cg15281901   0.009351362 -0.017961904 -0.357455145  0.017260205 0.2247608
##                    F      P.Value    adj.P.Val
## cg10876267 1538.5560 1.045109e-28 3.094360e-24
## cg25476308 1143.7852 3.695930e-27 3.269027e-23
## cg21064897 1129.0700 4.318161e-27 3.269027e-23
## cg13324140 1126.9580 4.416410e-27 3.269027e-23
## cg15894722 1104.8885 5.601059e-27 3.316723e-23
## cg04927657  957.0216 3.144815e-26 1.551861e-22
## cg20468818  922.2551 4.903143e-26 2.073890e-22
## cg11115577  906.5209 6.027703e-26 2.143255e-22
## cg13102615  900.6676 6.514893e-26 2.143255e-22
## cg15281901  766.4547 4.510445e-25 1.335453e-21

#save output to computer
write.csv(EbFit, file="~/Desktop/SCHOOL/THESIS/Chapter 2 - Methyl/PAPER/Gitlab uploads/EWAS/EbFit_LongIsland_effect.csv")

#view data for each variable/coefficient (outputs in order of most significant)
#Location
coeff_2 <- topTable(EbFit,coef=2)
CpG_2 <- rownames(coeff_2)
coeff_2

##                  logFC   AveExpr          t      P.Value    adj.P.Val         B
## cg01463565  0.44691660 0.2095297  30.557483 7.911992e-21 2.342582e-16 35.563113
## cg13695149  0.41374336 0.2260077  26.275982 2.742564e-19 4.060092e-15 32.760656
## cg09711113 -0.88177994 0.6637395 -12.270279 7.134340e-12 7.041118e-08 17.255494
## cg09227150  0.45371331 0.2080255   9.101776 2.804366e-09 2.075792e-05 11.393451
## cg11703808  0.14104021 0.7176595   8.915627 4.138356e-09 2.450569e-05 11.008175
## cg17205049  0.26337387 0.6828091   8.448678 1.120593e-08 5.529755e-05 10.020607
## cg12694663 -0.07695597 0.8828349  -8.142657 2.186667e-08 8.659750e-05  9.356973
## cg22063749 -0.37677851 0.7846924  -8.092490 2.442839e-08 8.659750e-05  9.246943
## cg20521676 -0.26277742 0.6049661  -8.048038 2.695551e-08 8.659750e-05  9.149157
## cg00152260  0.38649331 0.1825423   8.011270 2.924801e-08 8.659750e-05  9.068069

#Age
coeff_3 <- topTable(EbFit,coef=3)
CpG_3 <- rownames(coeff_3)
coeff_3

##                  logFC    AveExpr         t      P.Value   adj.P.Val        B
## cg00059486  0.06989609 0.05981315  7.042727 2.677890e-07 0.007928696 6.961184
## cg19873240  0.14527563 0.27220537  6.472941 1.041808e-06 0.009800135 5.629925
## cg03886239  0.05102320 0.08765046  6.440024 1.128200e-06 0.009800135 5.551811
## cg03789886  0.04143492 0.10755669  6.241532 1.828704e-06 0.009800135 5.078183
## cg02898094  0.06356100 0.09858640  6.193479 2.056870e-06 0.009800135 4.962874
## cg26301436  0.09847957 0.17594923  6.151352 2.280663e-06 0.009800135 4.861585
## cg15647667 -0.12393298 0.85347838 -6.110753 2.519800e-06 0.009800135 4.763795
## cg08938156  0.10660786 0.16490517  6.090578 2.647969e-06 0.009800135 4.715139
## cg13753818 -0.18072545 0.35123732 -5.725985 6.536032e-06 0.018824234 3.829169
## cg15944967  0.05349242 0.12065243  5.629385 8.320887e-06 0.018824234 3.592512

#Sex
coeff_4 <-topTable(EbFit,coef=4)
CpG_4 <- rownames(coeff_4)
coeff_4

##                 logFC   AveExpr         t      P.Value    adj.P.Val        B
## cg10876267  0.6581773 0.6225154  71.37679 1.253549e-29 3.711506e-25 53.30361
## cg21064897 -0.4712602 0.2708862 -61.65042 4.243907e-28 6.282679e-24 51.02109
## cg13324140 -0.4594173 0.2383797 -60.58374 6.454200e-28 6.369865e-24 50.73296
## cg04927657 -0.4674041 0.2843306 -56.90593 2.902901e-27 2.148728e-23 49.67215
## cg13102615 -0.5643373 0.2833316 -54.72721 7.405973e-27 3.928814e-23 48.99026
## cg11115577  0.4099162 0.7585485  54.56237 7.961661e-27 3.928814e-23 48.93693
## cg15281901 -0.3574551 0.2247608 -50.17499 5.936447e-26 2.510948e-22 47.41970
## cg13842554 -0.2959779 0.1885673 -47.74552 1.947604e-25 7.208084e-22 46.49086
## cg01042193 -0.3536074 0.2070621 -46.32614 4.008618e-25 1.318746e-21 45.91564
## cg07121495  0.3774155 0.7866066  45.71868 5.495975e-25 1.627248e-21 45.66167

#Re-arrange data for plotting
#make matrix
data_LI_M <- as.matrix(data_LI)
#transpose data (basically swap rows and columns to have CGid as column and sample ID as rows)
data_LI_M <- t(data_LI_M)
#add new column at the start for Basename (copying the actual rownames and making a column out of it)
data_LI_M <- cbind(Basename = rownames(data_LI_M), data_LI_M)
#change back to dataframe
data_LI_M <- as.data.frame(data_LI_M)
#bind the re-arranged probe data with the sample data frame (df_LI) using the "Basename" column
allData <- cbind(df_LI, Basename = data_LI_M)

3. Plotting significant CpGs

1. Linear regression of DNAm level vs age for top 10 significant CpGs

#put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long <- gather(allData, CPG, Value, Basename.cg00059486, Basename.cg19873240, Basename.cg03886239, Basename.cg03789886, Basename.cg02898094, Basename.cg26301436, Basename.cg15647667, Basename.cg08938156, Basename.cg13753818, Basename.cg15944967, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long$Value <- as.numeric(allData_long$Value)
#plot linear regression of DNAm level vs age for top 10 significant CpGs
baseplot <- ggplot(allData_long, aes(allData_long$Age, allData_long$Value, color=allData_long$CPG)) + geom_point(size = 1)
#add title and axis labels
baseplot <- baseplot + ggtitle("CpG methylation over age") + xlab("Age (years)") + ylab("DNAm level")
#make each CpG have a unique color and change legend names
baseplot <- baseplot + scale_colour_discrete(name="CpG", breaks=c("Basename.cg00059486", "Basename.cg19873240", "Basename.cg03886239", "Basename.cg03789886", "Basename.cg02898094", "Basename.cg26301436", "Basename.cg15647667", "Basename.cg08938156", "Basename.cg13753818", "Basename.cg15944967"), labels=c("cg00059486", "cg19873240", "cg03886239", "cg03789886", "cg02898094", "cg26301436", "cg15647667", "cg08938156", "cg13753818", "cg15944967"))
#add regression line
DNAm_Age_plot <- baseplot + geom_smooth(method = "lm") 
#view plot
DNAm_Age_plot

2. Boxplots DNAm level vs location for top 10 significant CpGs

#need to update and put CpGs of interest (top 10)in the same column using gather (change wide format to tall)
allData_long_2 <- gather(allData, CPG_2, Value_2, Basename.cg01463565, Basename.cg13695149, Basename.cg09711113, Basename.cg09227150, Basename.cg11703808, Basename.cg17205049, Basename.cg12694663, Basename.cg22063749, Basename.cg20521676, Basename.cg00152260, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long_2$Value_2 <- as.numeric(allData_long_2$Value_2)
##change CpG names for legend, got the info on specific genes from Manhattan plot scripts
allData_long_2$CPG_2 <- factor(allData_long_2$CPG_2, levels = c("Basename.cg01463565", "Basename.cg13695149", "Basename.cg09711113", "Basename.cg09227150", "Basename.cg11703808", "Basename.cg17205049", "Basename.cg12694663", "Basename.cg22063749", "Basename.cg20521676", "Basename.cg00152260"), 
labels = c("cg01463565 NET1_Promoter", "cg13695149 NET1_Promoter", "cg09711113 TM256_Promoter", "cg09227150", "cg11703808 EWS_Exon", "cg17205049", "cg12694663", "cg22063749 UBP54_Intron", "cg20521676 PARVA_Exon", "cg00152260 PO4F3_Intron"))

library(ggrepel)
#plot boxplots of DNAm level vs location for top 10 significant CpGs 
DNAm_Location_plot <- ggplot(allData_long_2, aes(x=allData_long_2$CPG_2, allData_long_2$Value_2, fill=allData_long_2$Location_1)) +
  geom_boxplot() + 
  #put boxplot for each CpG in its own panel, each panel has its own range of values for the y axis, put 5 side by side per row and edit labeller to avoid cropped text
  facet_wrap(~allData_long_2$CPG_2, scale="free", ncol =5, labeller = label_wrap_gen(10)) +
  #change title and axis names
  ggtitle("CpG Methylation Levels") + xlab("") + ylab("DNAm Level") +
  #remove x axis labels
  scale_x_discrete(labels = NULL, breaks = NULL) + 
   #select colors (red and blue), specify legend labels
  scale_fill_manual(values=c("#fb6869", "#66b1ff"), name="Location", breaks=c("no", "yes"), labels=c("Other", "Long Island")) +
  #use classic theme
  theme_classic() +
  #select size for all of the text on the plot
  theme(plot.title = element_text(hjust = 0.5, size = 13, face="bold"),
        strip.text = element_text(size=11),
        axis.text = element_text(size =10),
        axis.title = element_text(size = 11),
        legend.title = element_text(size=11),
        legend.text = element_text(size=10))
#view plot
DNAm_Location_plot

3. Boxplots DNAm level vs sex for top 10 significant CpGs

#need to update and put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long_3 <- gather(allData, CPG_3, Value_3, Basename.cg10876267, Basename.cg21064897, Basename.cg13324140, Basename.cg04927657, Basename.cg13102615, Basename.cg11115577, Basename.cg15281901, Basename.cg13842554, Basename.cg01042193, Basename.cg07121495, factor_key=TRUE)
#change cpg methylation value to numeric
allData_long_3$Value_3 <- as.numeric(allData_long_3$Value_3)
#change CpG names for legend
allData_long_3$CPG_3 <- factor(allData_long_3$CPG_3, levels=c("Basename.cg10876267", "Basename.cg21064897", "Basename.cg13324140", "Basename.cg04927657", "Basename.cg13102615", "Basename.cg11115577", "Basename.cg15281901", "Basename.cg13842554", "Basename.cg01042193", "Basename.cg07121495"),labels = c("cg10876267", "cg21064897", "cg13324140", "cg04927657", "cg13102615", "cg11115577", "cg15281901", "cg13842554", "cg01042193", "cg07121495"))

#plot boxplots of DNAm level vs sex for top 10 significant CpGs
DNAm_Sex_plot <- ggplot(allData_long_3, aes(allData_long_3$CPG_3, allData_long_3$Value_3, fill=allData_long_3$Sex)) + 
  geom_boxplot() + 
  #put boxplot for each CpG in its own panel, each panel has its own range of values for the y axis and put 5 side by side per row
  facet_wrap(~allData_long_3$CPG_3, scale="free", ncol =5) +
  #change title and axis names
  ggtitle("CpG methylation by sex") + xlab("") + ylab("DNAm level") +
   #remove x axis labels
  scale_x_discrete(labels = NULL, breaks = NULL) + 
  #specification for color fill 
  labs(fill = "Sex") + 
  #select colors (red and blue)
  scale_fill_manual(values=c("#fb6869", "#66b1ff")) +
  #use theme classic
  theme_classic() +
  #select size for all of the text on the plot
  theme(plot.title = element_text(hjust = 0.5, size = 13, face="bold"),
        strip.text = element_text(size=11),
        axis.text = element_text(size =10),
        axis.title = element_text(size = 11),
        legend.title = element_text(size=11),
        legend.text = element_text(size=10))
#view plot
DNAm_Sex_plot

EWAS Long Island Effect

Marie-Laurence Cossette

23/07/2022

1. Load data with info for all samples and the methylation probe data

2. EWAS of island effect

3. Plotting significant CpGs

1. Linear regression of DNAm level vs age for top 10 significant CpGs

2. Boxplots DNAm level vs location for top 10 significant CpGs

3. Boxplots DNAm level vs sex for top 10 significant CpGs