Load required packages

#load packages
library("limma")
library("dplyr")
library("lattice")
library("ggplot2")
library("tidyr")

#set working directory
setwd("~/Desktop/SCHOOL/THESIS/Chapter 2 - Methyl/PAPER/Gitlab uploads/")

1. Load data with info for all samples and the methylation probe data

#load data on shrew
df <- read.csv("SampleSheetAgeN90final.csv")
#subset samples to only use ones that can be used for ageing (basically remove ones that didn't pass QC)
samples <- subset(df, CanBeUsedForAgingStudies == 'yes') 
#look at structure of data frame
str(df)

#load probe data
load("all_probes_sesame_normalized.Rdata")
#match the probe data with your samples info
data <- normalized_betas_sesame %>% tibble::column_to_rownames(var = "CGid")%>%
  dplyr::select(samples$Basename)
#look at structure of data frame
str(data)

#load my mapped probes
probe_marie_cinereusshrewn90 <- read.csv("aln_2.csv")

### Data refinement of CpG Sites, based on shared probe mappings
#add new column called "qname" to be able to match CGids with probe data set 
data_1<- cbind(qname = rownames(data), data)
#subset columns that have matching CGids (removes about 8 thousand cols)
data_2 <- subset(data_1, qname %in% probe_marie_cinereusshrewn90$qname)
#remove that new column because we need only numeric values in table for matrix step coming up in a bit
data <- data_2[ -c(1)]

#remove fetus from "df" dataset
df_nofet=subset(df, df$Age>0.1)
#remove fetus from "data" based on the "df" dataframe
data_nofet <- subset(data, select = df$Age>0.1)
#add new "Location_1" column to separate Islands from all others ("yes" = from BPI and Long Island)
df_nofet<- mutate(df_nofet, Location_1 = ifelse(df_nofet$Location == "Bon Portage Island Nova Scotia" | df_nofet$Location == "Long Island Nova Scotia", "yes", "no"))  

2. EWAS of island effect

Code inspired by https://rstudio-pubs-static.s3.amazonaws.com/13450_cca08b93a0d94bc2b45c9ed94e230d02.html

#make matrix with variables to be able to run EWAS
#I want my model to include  sex, age, tissue type and island/mainland location
(df_nofetMat <- model.matrix(~Location_1 + Age + Sex + Tissue, df_nofet))
#Estimate the correlation between samples from same individual to be able to include and account for it in regression model
cor <- duplicateCorrelation(data_nofet, df_nofetMat, block=df_nofet$AnimalID)
cor$consensus.correlation
#use limma package to run regressions
#need probe dataset (data_nofet), matrix with sample info (df_nofet), and info on duplicat correlation form above
Fit <- lmFit(data_nofet, df_nofetMat, block = df_nofet$AnimalID, correlation = cor$consensus.correlation)
#use eBayes() to moderate the estimated error variances
EbFit <- eBayes(Fit)
#add "CGid" column based on rownames
EbFit$CGid <- rownames(EbFit)
#view info on top significant CpGs
topTable(EbFit)
##            Location_1yes          Age          SexM   TissueTail   AveExpr
## cg13324140   0.023898685  0.000455792 -0.4780987927  0.029949026 0.2399096
## cg13102615   0.012650773 -0.001167773 -0.5821757698  0.036474903 0.2825970
## cg15281901   0.003643584 -0.010671132 -0.3668983853  0.016464649 0.2199308
## cg11115577   0.005463240  0.003082504  0.4098490152 -0.029035604 0.7662909
## cg10876267   0.021014780 -0.020828952  0.6320186474  0.006889635 0.6398498
## cg03178290   0.012859004  0.006276269 -0.3846935256  0.008175150 0.2593505
## cg11584409  -0.008190076  0.018880259 -0.0093752371  0.350460757 0.3667963
## cg20468818   0.026795743 -0.008501356  0.0016643980 -0.351333776 0.1515623
## cg06382285   0.020073997  0.005272771 -0.4453586035  0.030834126 0.2092987
## cg25476308  -0.032068098  0.029921197  0.0003450775  0.602861388 0.7375965
##                    F      P.Value    adj.P.Val
## cg13324140 1249.2828 1.469106e-41 4.349729e-37
## cg13102615 1181.2618 4.480771e-41 6.633333e-37
## cg15281901 1077.3673 2.800252e-40 2.763662e-36
## cg11115577 1008.8788 1.034070e-39 7.654188e-36
## cg10876267  989.6848 1.514960e-39 8.970985e-36
## cg03178290  757.8907 3.024636e-37 1.492557e-33
## cg11584409  745.4742 4.195771e-37 1.774691e-33
## cg20468818  695.9869 1.634980e-36 5.848164e-33
## cg06382285  693.0504 1.777677e-36 5.848164e-33
## cg25476308  688.4863 2.025995e-36 5.998565e-33
#save output to computer
write.csv(EbFit, file="~/Desktop/SCHOOL/THESIS/Chapter 2 - Methyl/PAPER/Gitlab uploads/EWAS/EbFit_Island_effect.csv")
#view data for each variable/coefficient (outputs in order of most significant)
#Location
coeff_2 <- topTable(EbFit,coef=2)
CpG_2 <- rownames(coeff_2)
coeff_2
##                 logFC   AveExpr         t      P.Value    adj.P.Val        B
## cg11703808  0.1546308 0.7561761  18.42500 3.345181e-21 9.904411e-17 38.19944
## cg09227150  0.4427736 0.3073579  17.26261 3.443455e-20 5.097691e-16 35.90139
## cg00152260  0.4083112 0.2873600  16.65221 1.230189e-19 1.214115e-15 34.64297
## cg22063749 -0.3683613 0.6958420 -16.19470 3.270096e-19 2.420525e-15 33.67522
## cg25115601 -0.2423548 0.7050174 -13.23223 3.121381e-16 1.848357e-12 26.85343
## cg24102071 -0.4622002 0.5973457 -12.94702 6.368567e-16 3.142675e-12 26.14201
## cg11099953 -0.6953243 0.3970198 -12.61113 1.493760e-15 6.318178e-12 25.29104
## cg11065917 -0.2987741 0.7062629 -12.15911 4.809244e-15 1.779901e-11 24.12318
## cg14198079 -0.2525598 0.7462261 -11.80913 1.210301e-14 3.981621e-11 23.20082
## cg08887457 -0.3467049 0.5162934 -11.76437 1.363482e-14 4.036998e-11 23.08169
#Age
coeff_3 <- topTable(EbFit,coef=3)
CpG_3 <- rownames(coeff_3)
coeff_3
##                  logFC   AveExpr         t      P.Value    adj.P.Val         B
## cg10427592 -0.12768122 0.9407453 -9.100419 2.610087e-11 7.727947e-07 15.669645
## cg09313626 -0.10710763 0.5218670 -8.140531 4.935685e-10 5.416711e-06 12.754053
## cg00505445 -0.17455241 0.7352356 -8.106482 5.488426e-10 5.416711e-06 12.648774
## cg07302051 -0.13610351 0.7062077 -7.742303 1.720632e-09 1.273612e-05 11.515670
## cg08938156  0.09366198 0.1773514  7.628787 2.463132e-09 1.458569e-05 11.160001
## cg15647667 -0.11548521 0.8475586 -7.528962 3.379922e-09 1.667879e-05 10.846337
## cg00029553 -0.13603649 0.5646610 -7.336819 6.229197e-09 2.634772e-05 10.240401
## cg08255799 -0.15688131 0.5347054 -7.141075 1.164682e-08 4.012822e-05  9.620430
## cg18560935 -0.10303860 0.6753129 -7.126649 1.219785e-08 4.012822e-05  9.574643
## cg27021764 -0.16001788 0.5927151 -7.043503 1.592625e-08 4.626779e-05  9.310503
#Sex
coeff_4 <-topTable(EbFit,coef=4)
CpG_4 <- rownames(coeff_4)
coeff_4
##                 logFC   AveExpr         t      P.Value    adj.P.Val        B
## cg13324140 -0.4780988 0.2399096 -65.91993 1.426329e-42 4.223075e-38 83.61169
## cg13102615 -0.5821758 0.2825970 -64.03502 4.527003e-42 6.701775e-38 82.70143
## cg15281901 -0.3668984 0.2199308 -60.71107 3.773130e-41 3.723828e-37 80.99960
## cg11115577  0.4098490 0.7662909  58.99824 1.176794e-40 7.688931e-37 80.07099
## cg10876267  0.6320186 0.6398498  58.85233 1.298455e-40 7.688931e-37 79.99018
## cg03178290 -0.3846935 0.2593505 -51.51575 2.553312e-38 1.259974e-34 75.54299
## cg06382285 -0.4453586 0.2092987 -49.21557 1.556168e-37 6.582145e-34 73.97562
## cg04927657 -0.4694932 0.2819531 -47.72019 5.268118e-37 1.949731e-33 72.90624
## cg21064897 -0.4781476 0.2703235 -47.16746 8.346024e-37 2.635556e-33 72.50036
## cg13842554 -0.2996719 0.1864508 -47.09055 8.901499e-37 2.635556e-33 72.44341
#Re-arrange data for plotting
#make matrix
data_nofet_M <- as.matrix(data_nofet)
#transpose data (basically swap rows and columns to have CGid as column and sample ID as rows)
data_nofet_M <- t(data_nofet_M)
#add new column at the start for Basename (copying the actual rownames and making a column out of it)
data_nofet_M <- cbind(Basename = rownames(data_nofet_M), data_nofet_M)
#change back to dataframe
data_nofet_M <- as.data.frame(data_nofet_M)
#bind the re-arranged probe data with the sample data frame (df_nofet) using the "Basename" column
allData <- cbind(df_nofet, Basename = data_nofet_M)

3. Plotting significant CpGs

1. Linear regression of DNAm level vs age for top 10 significant CpGs

#put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long <- gather(allData, CPG, Value, Basename.cg10427592, Basename.cg09313626, Basename.cg00505445, Basename.cg07302051, Basename.cg08938156, Basename.cg15647667, Basename.cg00029553, Basename.cg08255799, Basename.cg18560935, Basename.cg27021764, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long$Value <- as.numeric(allData_long$Value)
#plot linear regression of DNAm level vs age for top 10 significant CpGs
baseplot <- ggplot(allData_long, aes(allData_long$Age, allData_long$Value, color=allData_long$CPG)) + geom_point(size = 1)
#add title and axis labels
baseplot <- baseplot + ggtitle("CpG methylation over age") + xlab("Age (years)") + ylab("DNAm level")
#make each CpG have a unique color and change legend names
baseplot <- baseplot + scale_colour_discrete(name="CpG", breaks=c("Basename.cg10427592", "Basename.cg09313626", "Basename.cg00505445", "Basename.cg07302051", "Basename.cg08938156", "Basename.cg15647667", "Basename.cg00029553", "Basename.cg08255799", "Basename.cg18560935", "Basename.cg27021764"), labels=c("cg10427592", "cg09313626", "cg00505445", "cg07302051", "cg08938156", "cg15647667", "cg00029553", "cg08255799", "cg18560935", "cg27021764"))
#add regression line
DNAm_Age_plot <- baseplot + geom_smooth(method = "lm") 
#view plot
DNAm_Age_plot

2. Boxplots DNAm level vs location for top 10 significant CpGs

#need to update and put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long_2 <- gather(allData, CPG_2, Value_2, Basename.cg11703808, Basename.cg09227150, Basename.cg00152260, Basename.cg22063749, Basename.cg25115601, Basename.cg24102071, Basename.cg11099953, Basename.cg11065917, Basename.cg14198079, Basename.cg08887457, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long_2$Value_2 <- as.numeric(allData_long_2$Value_2)
#change CpG names for legend, got the info on specific genes from Manhattan plot scripts
allData_long_2$CPG_2 <- factor(allData_long_2$CPG_2, levels=c("Basename.cg11703808", "Basename.cg09227150", "Basename.cg00152260", "Basename.cg22063749", "Basename.cg25115601", "Basename.cg24102071", "Basename.cg11099953", "Basename.cg11065917", "Basename.cg14198079", "Basename.cg08887457"), labels=c("cg11703808 EWS_Exon", "cg09227150" , "cg00152260 PO4F3_Intron", "cg22063749 UBP54_Intron", "cg25115601 TRI47_Exon", "cg24102071 ATPA_Exon"," cg11099953 TSH3_Intergenic", "cg11065917 COE4_Exon", "cg14198079", "cg08887457"))

library(ggrepel)
#plot boxplots of DNAm level vs location for top 10 significant CpGs
DNAm_Location_plot <- ggplot(allData_long_2, aes(x=allData_long_2$CPG_2, allData_long_2$Value_2, fill=allData_long_2$Location_1)) +
  geom_boxplot() + 
  #put boxplot for each CpG in its own panel, each panel has its own range of values for the y axis, put 5 side by side per row and   edit labeller to avoid cropped text
  facet_wrap(~allData_long_2$CPG_2, scale="free", ncol =5, labeller = label_wrap_gen(10)) +
  #change title and axis names
  ggtitle("CpG Methylation Levels") + xlab("") + ylab("DNAm Level") +
  #remove x axis labels
  scale_x_discrete(labels = NULL, breaks = NULL) + 
  #select colors (red and blue), specify legend labels
  scale_fill_manual(values=c("#fb6869", "#66b1ff"), name="Location", breaks=c("no", "yes"), labels=c("Mainland", "Island")) +
  #use classic theme
  theme_classic() +
  #select size for all of the text on the plot
  theme(plot.title = element_text(hjust = 0.5, size = 13, face="bold"),
        strip.text = element_text(size=11),
        axis.text = element_text(size =10),
        axis.title = element_text(size = 11),
        legend.title = element_text(size=11),
        legend.text = element_text(size=10))
#view plot
DNAm_Location_plot

3. Boxplots DNAm level vs sex for top 10 significant CpGs

#need to update and put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long_3 <- gather(allData, CPG_3, Value_3, Basename.cg13324140, Basename.cg13102615, Basename.cg15281901, Basename.cg11115577, Basename.cg10876267, Basename.cg03178290, Basename.cg06382285, Basename.cg04927657, Basename.cg21064897, Basename.cg13842554, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long_3$Value_3 <- as.numeric(allData_long_3$Value_3)
#change CpG names for legend
allData_long_3$CPG_3 <- factor(allData_long_3$CPG_3, levels=c("Basename.cg13324140", "Basename.cg13102615", "Basename.cg15281901", "Basename.cg11115577", "Basename.cg10876267", "Basename.cg03178290", "Basename.cg06382285", "Basename.cg04927657", "Basename.cg21064897", "Basename.cg13842554"),labels = c("cg13324140", "cg13102615", "cg15281901", "cg11115577", "cg10876267", "cg03178290", "cg06382285", "cg04927657", "cg21064897", "cg13842554"))

#plot boxplots of DNAm level vs sex for top 10 significant CpGs
DNAm_Sex_plot <- ggplot(allData_long_3, aes(allData_long_3$CPG_3, allData_long_3$Value_3, fill=allData_long_3$Sex)) + 
  geom_boxplot() + 
  #put boxplot for each CpG in its own panel, each panel has its own range of values for the y axis and put 5 side by side per row 
  facet_wrap(~allData_long_3$CPG_3, scale="free", ncol =5) +
  #change title and axis names
  ggtitle("CpG methylation by sex") + xlab("") + ylab("DNAm level") +
  #remove x axis labels
  scale_x_discrete(labels = NULL, breaks = NULL) + 
  #specification for color fill 
  labs(fill = "Sex") +
  #select colors (red and blue)
  scale_fill_manual(values=c("#fb6869", "#66b1ff")) +
  #use theme classic
  theme_classic() +
  #select size for all of the text on the plot
  theme(plot.title = element_text(hjust = 0.5, size = 13, face="bold"),
        strip.text = element_text(size=11),
        axis.text = element_text(size =10),
        axis.title = element_text(size = 11),
        legend.title = element_text(size=11),
        legend.text = element_text(size=10))
#view plot
DNAm_Sex_plot