Load required packages

#load packages
library("limma")
library("dplyr")
library("lattice")
library("ggplot2")
library("tidyr")

#set working directory
setwd("~/Desktop/SCHOOL/THESIS/Chapter 2 - Methyl/PAPER/Gitlab uploads/")

1. Load data with info for all samples and the methylation probe data

#load data on shrew
df <- read.csv("SampleSheetAgeN90final.csv")
#subset samples to only use ones that can be used for ageing (basically remove ones that didn't pass QC)
samples <- subset(df, CanBeUsedForAgingStudies == 'yes') 
#look at structure of data frame
str(df)

#load probe data
load("all_probes_sesame_normalized.Rdata")
#match the probe data with your samples info
data <- normalized_betas_sesame %>% tibble::column_to_rownames(var = "CGid")%>%
  dplyr::select(samples$Basename)
#look at structure of data frame
str(data)

#load my mapped probes
probe_marie_cinereusshrewn90 <- read.csv("aln_2.csv")

### Data refinement of CpG Sites, based on shared probe mappings
#add new column called "qname" to be able to match CGids with probe data set 
data_1<- cbind(qname = rownames(data), data)
#subset columns that have matching CGids (removes about 8 thousand cols)
data_2 <- subset(data_1, qname %in% probe_marie_cinereusshrewn90$qname)
#remove that new column because we need only numeric values in table for matrix step coming up in a bit
data <- data_2[ -c(1)]

#remove fetus from "df" dataset
df_nofet=subset(df, df$Age>0.1)
#remove fetus from "data" based on the "df" dataframe
data_nofet <- subset(data, select = df$Age>0.1)
#remove BPI shrews from both data sets
df_LI= subset(df_nofet, df_nofet$Location != "Bon Portage Island Nova Scotia")
data_LI <- subset(data_nofet, select = df_nofet$Location != "Bon Portage Island Nova Scotia")
#add new "Location_1" column to separate Long Island samples from all others ("yes" = from Long Island)
df_LI<- mutate(df_LI, Location_1 = ifelse(df_LI$Location == "Long Island Nova Scotia", "yes", "no"))  

2. EWAS of island effect

Code inspired by https://rstudio-pubs-static.s3.amazonaws.com/13450_cca08b93a0d94bc2b45c9ed94e230d02.html

#make matrix with variables to be able to run EWAS
#I want my model to include sex, age, tissue type and Long Island/others location
(df_LIMat <- model.matrix(~Location_1 + Age + Sex + Tissue, df_LI))
#Estimate the correlation between samples from same individual to be able to include and account for it in regression model
cor <- duplicateCorrelation(data_LI, df_LIMat, block=df_LI$AnimalID)
cor$consensus.correlation
#use limma package to run regressions
#need probe dataset (data_LI), matrix with sample info (df_LI), and info on duplicate correlation from above
Fit <- lmFit(data_LI, df_LIMat, block = df_LI$AnimalID, correlation = cor$consensus.correlation)
#use eBayes() to moderate the estimated error variances
EbFit <- eBayes(Fit)
#add "CGid" column based on rownames
EbFit$CGid <- rownames(EbFit)
#view info on top significant CpGs
topTable(EbFit)
##            Location_1yes          Age         SexM   TissueTail   AveExpr
## cg10876267  -0.003345980  0.015349762  0.658177260  0.004722296 0.6225154
## cg25476308   0.009269547 -0.007161175  0.015273830  0.614045977 0.6625765
## cg21064897   0.025013169  0.001894787 -0.471260158  0.013639004 0.2708862
## cg13324140   0.043934973 -0.036122928 -0.459417309  0.029548103 0.2383797
## cg15894722   0.015668478 -0.015050080 -0.001596506  0.652299735 0.4370756
## cg04927657  -0.001923276  0.009815394 -0.467404142  0.016454185 0.2843306
## cg20468818   0.018825048  0.004235476  0.006536476 -0.351620355 0.1959083
## cg11115577  -0.012804726  0.020385455  0.409916218 -0.032813146 0.7585485
## cg13102615   0.011377825 -0.012669210 -0.564337286  0.032296339 0.2833316
## cg15281901   0.009351362 -0.017961904 -0.357455145  0.017260205 0.2247608
##                    F      P.Value    adj.P.Val
## cg10876267 1538.5560 1.045109e-28 3.094360e-24
## cg25476308 1143.7852 3.695930e-27 3.269027e-23
## cg21064897 1129.0700 4.318161e-27 3.269027e-23
## cg13324140 1126.9580 4.416410e-27 3.269027e-23
## cg15894722 1104.8885 5.601059e-27 3.316723e-23
## cg04927657  957.0216 3.144815e-26 1.551861e-22
## cg20468818  922.2551 4.903143e-26 2.073890e-22
## cg11115577  906.5209 6.027703e-26 2.143255e-22
## cg13102615  900.6676 6.514893e-26 2.143255e-22
## cg15281901  766.4547 4.510445e-25 1.335453e-21
#save output to computer
write.csv(EbFit, file="~/Desktop/SCHOOL/THESIS/Chapter 2 - Methyl/PAPER/Gitlab uploads/EWAS/EbFit_LongIsland_effect.csv")
#view data for each variable/coefficient (outputs in order of most significant)
#Location
coeff_2 <- topTable(EbFit,coef=2)
CpG_2 <- rownames(coeff_2)
coeff_2
##                  logFC   AveExpr          t      P.Value    adj.P.Val         B
## cg01463565  0.44691660 0.2095297  30.557483 7.911992e-21 2.342582e-16 35.563113
## cg13695149  0.41374336 0.2260077  26.275982 2.742564e-19 4.060092e-15 32.760656
## cg09711113 -0.88177994 0.6637395 -12.270279 7.134340e-12 7.041118e-08 17.255494
## cg09227150  0.45371331 0.2080255   9.101776 2.804366e-09 2.075792e-05 11.393451
## cg11703808  0.14104021 0.7176595   8.915627 4.138356e-09 2.450569e-05 11.008175
## cg17205049  0.26337387 0.6828091   8.448678 1.120593e-08 5.529755e-05 10.020607
## cg12694663 -0.07695597 0.8828349  -8.142657 2.186667e-08 8.659750e-05  9.356973
## cg22063749 -0.37677851 0.7846924  -8.092490 2.442839e-08 8.659750e-05  9.246943
## cg20521676 -0.26277742 0.6049661  -8.048038 2.695551e-08 8.659750e-05  9.149157
## cg00152260  0.38649331 0.1825423   8.011270 2.924801e-08 8.659750e-05  9.068069
#Age
coeff_3 <- topTable(EbFit,coef=3)
CpG_3 <- rownames(coeff_3)
coeff_3
##                  logFC    AveExpr         t      P.Value   adj.P.Val        B
## cg00059486  0.06989609 0.05981315  7.042727 2.677890e-07 0.007928696 6.961184
## cg19873240  0.14527563 0.27220537  6.472941 1.041808e-06 0.009800135 5.629925
## cg03886239  0.05102320 0.08765046  6.440024 1.128200e-06 0.009800135 5.551811
## cg03789886  0.04143492 0.10755669  6.241532 1.828704e-06 0.009800135 5.078183
## cg02898094  0.06356100 0.09858640  6.193479 2.056870e-06 0.009800135 4.962874
## cg26301436  0.09847957 0.17594923  6.151352 2.280663e-06 0.009800135 4.861585
## cg15647667 -0.12393298 0.85347838 -6.110753 2.519800e-06 0.009800135 4.763795
## cg08938156  0.10660786 0.16490517  6.090578 2.647969e-06 0.009800135 4.715139
## cg13753818 -0.18072545 0.35123732 -5.725985 6.536032e-06 0.018824234 3.829169
## cg15944967  0.05349242 0.12065243  5.629385 8.320887e-06 0.018824234 3.592512
#Sex
coeff_4 <-topTable(EbFit,coef=4)
CpG_4 <- rownames(coeff_4)
coeff_4
##                 logFC   AveExpr         t      P.Value    adj.P.Val        B
## cg10876267  0.6581773 0.6225154  71.37679 1.253549e-29 3.711506e-25 53.30361
## cg21064897 -0.4712602 0.2708862 -61.65042 4.243907e-28 6.282679e-24 51.02109
## cg13324140 -0.4594173 0.2383797 -60.58374 6.454200e-28 6.369865e-24 50.73296
## cg04927657 -0.4674041 0.2843306 -56.90593 2.902901e-27 2.148728e-23 49.67215
## cg13102615 -0.5643373 0.2833316 -54.72721 7.405973e-27 3.928814e-23 48.99026
## cg11115577  0.4099162 0.7585485  54.56237 7.961661e-27 3.928814e-23 48.93693
## cg15281901 -0.3574551 0.2247608 -50.17499 5.936447e-26 2.510948e-22 47.41970
## cg13842554 -0.2959779 0.1885673 -47.74552 1.947604e-25 7.208084e-22 46.49086
## cg01042193 -0.3536074 0.2070621 -46.32614 4.008618e-25 1.318746e-21 45.91564
## cg07121495  0.3774155 0.7866066  45.71868 5.495975e-25 1.627248e-21 45.66167
#Re-arrange data for plotting
#make matrix
data_LI_M <- as.matrix(data_LI)
#transpose data (basically swap rows and columns to have CGid as column and sample ID as rows)
data_LI_M <- t(data_LI_M)
#add new column at the start for Basename (copying the actual rownames and making a column out of it)
data_LI_M <- cbind(Basename = rownames(data_LI_M), data_LI_M)
#change back to dataframe
data_LI_M <- as.data.frame(data_LI_M)
#bind the re-arranged probe data with the sample data frame (df_LI) using the "Basename" column
allData <- cbind(df_LI, Basename = data_LI_M)

3. Plotting significant CpGs

1. Linear regression of DNAm level vs age for top 10 significant CpGs

#put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long <- gather(allData, CPG, Value, Basename.cg00059486, Basename.cg19873240, Basename.cg03886239, Basename.cg03789886, Basename.cg02898094, Basename.cg26301436, Basename.cg15647667, Basename.cg08938156, Basename.cg13753818, Basename.cg15944967, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long$Value <- as.numeric(allData_long$Value)
#plot linear regression of DNAm level vs age for top 10 significant CpGs
baseplot <- ggplot(allData_long, aes(allData_long$Age, allData_long$Value, color=allData_long$CPG)) + geom_point(size = 1)
#add title and axis labels
baseplot <- baseplot + ggtitle("CpG methylation over age") + xlab("Age (years)") + ylab("DNAm level")
#make each CpG have a unique color and change legend names
baseplot <- baseplot + scale_colour_discrete(name="CpG", breaks=c("Basename.cg00059486", "Basename.cg19873240", "Basename.cg03886239", "Basename.cg03789886", "Basename.cg02898094", "Basename.cg26301436", "Basename.cg15647667", "Basename.cg08938156", "Basename.cg13753818", "Basename.cg15944967"), labels=c("cg00059486", "cg19873240", "cg03886239", "cg03789886", "cg02898094", "cg26301436", "cg15647667", "cg08938156", "cg13753818", "cg15944967"))
#add regression line
DNAm_Age_plot <- baseplot + geom_smooth(method = "lm") 
#view plot
DNAm_Age_plot

2. Boxplots DNAm level vs location for top 10 significant CpGs

#need to update and put CpGs of interest (top 10)in the same column using gather (change wide format to tall)
allData_long_2 <- gather(allData, CPG_2, Value_2, Basename.cg01463565, Basename.cg13695149, Basename.cg09711113, Basename.cg09227150, Basename.cg11703808, Basename.cg17205049, Basename.cg12694663, Basename.cg22063749, Basename.cg20521676, Basename.cg00152260, factor_key=TRUE)
#change CpG methylation value to numeric
allData_long_2$Value_2 <- as.numeric(allData_long_2$Value_2)
##change CpG names for legend, got the info on specific genes from Manhattan plot scripts
allData_long_2$CPG_2 <- factor(allData_long_2$CPG_2, levels = c("Basename.cg01463565", "Basename.cg13695149", "Basename.cg09711113", "Basename.cg09227150", "Basename.cg11703808", "Basename.cg17205049", "Basename.cg12694663", "Basename.cg22063749", "Basename.cg20521676", "Basename.cg00152260"), 
labels = c("cg01463565 NET1_Promoter", "cg13695149 NET1_Promoter", "cg09711113 TM256_Promoter", "cg09227150", "cg11703808 EWS_Exon", "cg17205049", "cg12694663", "cg22063749 UBP54_Intron", "cg20521676 PARVA_Exon", "cg00152260 PO4F3_Intron"))

library(ggrepel)
#plot boxplots of DNAm level vs location for top 10 significant CpGs 
DNAm_Location_plot <- ggplot(allData_long_2, aes(x=allData_long_2$CPG_2, allData_long_2$Value_2, fill=allData_long_2$Location_1)) +
  geom_boxplot() + 
  #put boxplot for each CpG in its own panel, each panel has its own range of values for the y axis, put 5 side by side per row and edit labeller to avoid cropped text
  facet_wrap(~allData_long_2$CPG_2, scale="free", ncol =5, labeller = label_wrap_gen(10)) +
  #change title and axis names
  ggtitle("CpG Methylation Levels") + xlab("") + ylab("DNAm Level") +
  #remove x axis labels
  scale_x_discrete(labels = NULL, breaks = NULL) + 
   #select colors (red and blue), specify legend labels
  scale_fill_manual(values=c("#fb6869", "#66b1ff"), name="Location", breaks=c("no", "yes"), labels=c("Other", "Long Island")) +
  #use classic theme
  theme_classic() +
  #select size for all of the text on the plot
  theme(plot.title = element_text(hjust = 0.5, size = 13, face="bold"),
        strip.text = element_text(size=11),
        axis.text = element_text(size =10),
        axis.title = element_text(size = 11),
        legend.title = element_text(size=11),
        legend.text = element_text(size=10))
#view plot
DNAm_Location_plot

3. Boxplots DNAm level vs sex for top 10 significant CpGs

#need to update and put CpGs of interest (top 10) in the same column using gather (change wide format to tall)
allData_long_3 <- gather(allData, CPG_3, Value_3, Basename.cg10876267, Basename.cg21064897, Basename.cg13324140, Basename.cg04927657, Basename.cg13102615, Basename.cg11115577, Basename.cg15281901, Basename.cg13842554, Basename.cg01042193, Basename.cg07121495, factor_key=TRUE)
#change cpg methylation value to numeric
allData_long_3$Value_3 <- as.numeric(allData_long_3$Value_3)
#change CpG names for legend
allData_long_3$CPG_3 <- factor(allData_long_3$CPG_3, levels=c("Basename.cg10876267", "Basename.cg21064897", "Basename.cg13324140", "Basename.cg04927657", "Basename.cg13102615", "Basename.cg11115577", "Basename.cg15281901", "Basename.cg13842554", "Basename.cg01042193", "Basename.cg07121495"),labels = c("cg10876267", "cg21064897", "cg13324140", "cg04927657", "cg13102615", "cg11115577", "cg15281901", "cg13842554", "cg01042193", "cg07121495"))

#plot boxplots of DNAm level vs sex for top 10 significant CpGs
DNAm_Sex_plot <- ggplot(allData_long_3, aes(allData_long_3$CPG_3, allData_long_3$Value_3, fill=allData_long_3$Sex)) + 
  geom_boxplot() + 
  #put boxplot for each CpG in its own panel, each panel has its own range of values for the y axis and put 5 side by side per row
  facet_wrap(~allData_long_3$CPG_3, scale="free", ncol =5) +
  #change title and axis names
  ggtitle("CpG methylation by sex") + xlab("") + ylab("DNAm level") +
   #remove x axis labels
  scale_x_discrete(labels = NULL, breaks = NULL) + 
  #specification for color fill 
  labs(fill = "Sex") + 
  #select colors (red and blue)
  scale_fill_manual(values=c("#fb6869", "#66b1ff")) +
  #use theme classic
  theme_classic() +
  #select size for all of the text on the plot
  theme(plot.title = element_text(hjust = 0.5, size = 13, face="bold"),
        strip.text = element_text(size=11),
        axis.text = element_text(size =10),
        axis.title = element_text(size = 11),
        legend.title = element_text(size=11),
        legend.text = element_text(size=10))
#view plot
DNAm_Sex_plot