Sorghum Post-transcriptional modification Profile

Post-transcriptional modifications statistics (PTMs)

Load packages

library(dplyr)
library(tidyr)
library(ggplot2)
library(ggVennDiagram)
library(ggvenn)
library(ggpubr)
library(UpSetR)
library(pheatmap)
library(corrplot)
library(stringr)
library(stringr)
library(zFPKM)
library(RColorBrewer)
library(viridis)
library(matrixStats)
library(ggplot2)
library(ggpubr)
library(Hmisc)
library(patchwork)
library(gridExtra)
library(reshape2)

Define two vairables

RMF: Relative modification frequency
RMA: Relative modification abundance
RML: Relative modification length-normalized frequency

Load mapping depth for comparison

### load meta information
setwd("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/11_MODs_depth/data/")
merge_DP.list <- read.table("sample.list")
colnames(merge_DP.list) <- "depth_ID"

### generate the additional column
merge_DP.list$H_correlation <- "fill"
merge_DP.list$H_Pvalue <- "fill"
merge_DP.list$M_correlation <- "fill"
merge_DP.list$M_Pvalue <- "fill"

head(merge_DP.list)

Define the functions

### define the correlation extraction function
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

### define function
fun_length <- function(x){
  return(data.frame(y=median(x),label= paste0("", length(x))))
}

Exam the correlation of modified ratio of HAMR/Modtect derived dataset

### load each depth samples
for (i in 1:nrow(merge_DP.list)){
  DP.df <- unique(read.table(paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/11_MODs_depth/data/",merge_DP.list[i,1]), header = F))
  colnames(DP.df) <- c("Chr", "pos1", "pos2", "Index", "Nucleotides",
                       "M_ratio1", "M_ratio2", "H_nonref1", "H_ref1", "H_total1",
                       "H_nonref2", "H_ref2", "H_total2", "H_mods1", "H_mod2")
  ### calculate the HAMR retio
  DP.df$H_ratio1 <- DP.df$H_nonref1/DP.df$H_total1
  DP.df$H_ratio2 <- DP.df$H_nonref2/DP.df$H_total2
  
  DP.df$M_ratio <- (DP.df$M_ratio1 + DP.df$M_ratio2)/2
  DP.df$H_ratio <- (DP.df$H_ratio1 + DP.df$H_ratio2)/2
  ### Plot the pearson correlation
  #corrplot.mixed(cor(DP.df[,c("H_ratio1","H_ratio2","M_ratio1", "M_ratio2")]), order = 'AOE')
  
  ### Take the HAMR for scatter plot
  Plot.DP1 <- DP.df[,c("H_ratio1","H_ratio2")]
  res <- rcorr(Plot.DP1[,1], Plot.DP1[,2],  type=c("pearson"))
  merge_DP.list[i,2:3] <- flattenCorrMatrix(res$r, res$P)[,3:4]
  
  
  Plot.DP2 <- DP.df[,c("M_ratio1","M_ratio2")]
  res <- rcorr(Plot.DP2[,1], Plot.DP2[,2],  type=c("pearson"))
  merge_DP.list[i,4:5] <- flattenCorrMatrix(res$r, res$P)[,3:4]
  
  
 #print(ggscatter(Plot.DP1, x = "H_ratio1", y = "H_ratio2", 
          #add = "reg.line", conf.int = TRUE, 
          #cor.coef = TRUE, cor.method = "pearson",
          #xlab = "HAMR_ratio1", ylab = "HAMR_ratio2",
          #color = "black", shape = 21, size = 2, 
          #add.params = list(color = "blue", fill = "lightgray"),
          #title= paste0("Ratio correlation of ",merge_DP.list[i,1])))
  
  assign(merge_DP.list[i,1], DP.df)
}

Compare the corelation from two methods regarding the RMA

temp1 <- merge_DP.list[,c(1,2)]
temp1$type <- "HAMR"
colnames(temp1) <- c("depth_ID", "correlation", "type")
head(temp1, 30)

temp2 <- merge_DP.list[,c(1,4)]
temp2$type <- "Modtect"
colnames(temp2) <- c("depth_ID", "correlation", "type")
head(temp2, 30)

Plot_bind <-rbind(temp1, temp2)
Plot_bind$correlation <- as.numeric(Plot_bind$correlation)
head(Plot_bind, 40)

###Plot the combined correlation 
#ggplot(Plot_bind, aes(x=type, y=correlation, color = type)) +
  #geom_boxplot() + theme_bw() +
  #geom_jitter() +
  #ylab("Pearson-correlation of modified ratio between two replicates")

Build the comparaitve table for each sample

comp.df <- data.frame("depth_ID" = merge_DP.list[,1])
### Calculate the WL-WW ratio changes using log2Ratio
comp.df <- data.frame("WL_sample" = comp.df[grepl("WL",comp.df$depth_ID),],
           "WW_sample" = comp.df[grepl("WW",comp.df$depth_ID),])
comp.df$comp_set <- str_replace(comp.df[,1], "WL_", "") 
comp.df$comp_set <- str_replace(comp.df$comp_set, ".merge.depth.mods", "_comp_ratio") 
head(comp.df, 50)

Load the TPM data and logFC data from two conditions

### load the TPM and logFC data for comparison
TPM_mean <- read.csv("//Users/leon/OneDrive - Cornell University/Projects/9_Sorghum/3_FeatureCounts/Sorghum_TPM_mean.20221025.csv", row.names = 1)
head(TPM_mean, 20)

### Load the logFC data for comparison
Gene_logFC <- read.csv("/Users/leon/OneDrive - Cornell University/Projects/9_Sorghum/4_DEGs/All_DEGs.csv")
Gene_logFC <- Gene_logFC %>% separate("Sample", c("Accession", "Time"))
head(Gene_logFC, 30)

RMA correlation across two conditions (pair-wise: WL relative to WW)

Plot the RMA over paried genes across two conditions

### combine the ratio for each
for (i in 1:nrow(comp.df)){
  df1 <- get(comp.df[i,1])
  df1 <- df1[,c("Index", "Nucleotides","H_ratio1", "H_ratio2","H_ratio","H_total1","H_total2")]
  colnames(df1) <- c("Index", "Nucleotids_WL","H_WL_ratio1", "H_WL_ratio2","H_ratio_WL","H_total1_WL","H_total2_WL")
  
  df2 <- get(comp.df[i,2])
  df2 <- df2[,c("Index", "Nucleotides","H_ratio1","H_ratio2","H_ratio","H_total1","H_total2")]
  colnames(df2) <- c("Index", "Nucleotids_WW","H_WW_ratio1", "H_WW_ratio2","H_ratio_WW","H_total1_WW","H_total2_WW")
  
  ### Perform the t.test and logFC calculation (common PTMs)
  join_df <- inner_join(df1,df2, by = "Index")
  join_df$log2FC <- log2(join_df$H_ratio_WL/join_df$H_ratio_WW)
  #join_df$P.value <- apply(join_df,1,function(x){t.test(as.numeric(x[3:4]),as.numeric(x[7:8]), paired = T)$p.value})
  join_df$set <- comp.df[i,3]
  join_df <- join_df %>% separate(set, c("Accession", "Time"))
  
 ### Plot the average of ratio between two conditions for common mods
  #print(ggscatter(join_df, x = "H_ratio_WL", y = "H_ratio_WW", 
    #add = "reg.line", conf.int = TRUE, 
    #cor.coef = TRUE, cor.method = "pearson",
    #xlab = "H_ratio_WL", ylab = "H_ratio_WW",
    #color = "black", shape = 21, size = 2, 
    #add.params = list(color = "blue", fill = "lightgray"),
    #title= paste0("Ratio correlation of ",comp.df[1,3])))
    #assign(paste0(comp.df[i,3], "_pairedGgenes"), join_df)
}

head(join_df, 40)

Plot the RMA over all genes across two conditions

All genes: Inclusion of paired, WL-specific, and WW-specific modified sites

### combine the ratio for each
for (i in 1:nrow(comp.df)){
  df1 <- get(comp.df[i,1])
  df1 <- df1[,c("Index", "Nucleotides","H_ratio1", "H_ratio2","H_ratio","H_total1","H_total2")]
  colnames(df1) <- c("Index", "Nucleotids_WL","H_WL_ratio1", "H_WL_ratio2","H_ratio_WL","H_total1_WL","H_total2_WL")
  
  df2 <- get(comp.df[i,2])
  df2 <- df2[,c("Index", "Nucleotides","H_ratio1","H_ratio2","H_ratio","H_total1","H_total2")]
  colnames(df2) <- c("Index", "Nucleotids_WW","H_WW_ratio1", "H_WW_ratio2","H_ratio_WW","H_total1_WW","H_total2_WW")

  ### Perform the t.test and logFC calculation (all PTMs)
  all_df <- full_join(df1,df2, by = "Index")
  all_df[is.na(all_df)] <- 0
  all_df$set <- comp.df[i,3]
  all_df$group <- "fill"
  all_df[all_df$H_ratio_WL == 0,15] <- "WW"
  all_df[all_df$H_ratio_WW == 0,15] <- "WL"
  all_df[all_df$H_ratio_WW != 0 & all_df$H_ratio_WL != 0,15] <- "both"
  all_df$Delta <- all_df$H_ratio_WL-all_df$H_ratio_WW
  all_df <- all_df %>% separate(set, c("Accession", "Time"))
  
  ### Plot the average of ratio with condition-specific PTMs included
  #print(ggscatter(all_df, x = "H_ratio_WL", y = "H_ratio_WW", fill = "group",
          #add = "reg.line", conf.int = TRUE, 
          #cor.coef = TRUE, cor.method = "pearson",
          #xlab = "H_ratio_WL", ylab = "H_ratio_WW",
          #color = "black", shape = 21, size = 2, 
          #add.params = list(color = "blue", fill = "lightgray"),
          #palette = c(WW="steelblue", WL="tomato", both="grey60"),
          #title= paste0("Ratio correlation of ",comp.df[i,3])))

  assign(paste0(comp.df[i,3], "_all_genes"), all_df)
}

head(all_df, 40)

Combine The correaltion statistics

### extract the unique index over all samples
comp_list <- lapply(ls(pattern = "_comp_ratio_all_genes"), get)
comp_combine <- bind_rows(comp_list)
nrow(comp_combine)

## [1] 154588

head(comp_combine, 60)

write.csv(comp_combine, "/Users/leon/Documents/Project/Sorghum/PTMs/RMA_comparison_allSites_20230206.csv", row.names = F, quote = F)

Extract non-duplicated modified sites across all samples

Index.df <- data.frame("Index" = unique(comp_combine$Index))
head(Index.df, 80)

write.csv(Index.df, "/Users/leon/Documents/Project/Sorghum/PTMs/RMA_allSites_Index.csv", row.names = F, quote = F)

Plot the scatter plot of the RMA to describe the level of modification (all genes)

RMA_cor <- read.csv("/Users/leon/Documents/Project/Sorghum/PTMs/RMA_comparison_allSites_20230206.csv", header = T)
head(RMA_cor)

RMA_cor %>%
  ggplot(aes(x=H_ratio_WL, y=H_ratio_WW, color=Time))+
  geom_point(size = 1)+ 
  scale_color_manual(values = c(TP1="#D8DAEB", TP3="#B2ABD2", TP5="#8073AC", TP7 ="#542788"))+
  theme_bw()+ 
  ggtitle("Comparison of RMF from two conditions of across accessions") +
  facet_wrap(~Accession,scale='free_x') +
  stat_smooth(aes(color= Time), size = 0.5, level = 0.95) +
  geom_abline(slope=1, intercept=0, shape = 2) +
  stat_cor()

RMA_cor %>%
  ggplot(aes(x=H_ratio_WL, y=H_ratio_WW, color=Accession))+
  geom_point(size = 1, alpha = 0.5)+ 
  scale_color_manual(values = c(A="#1B9E77", B="#D95F02", C="#666666", D="#E7298A", E="#66A61E", F="#E6AB02"))+
  theme_bw()+ 
  ggtitle("Comparison of RMF from two conditions of across accessions") +
  facet_wrap(~Time) +
  stat_smooth(aes(color= Accession), size = 0.5, level = 0.95) +
  stat_cor()

Plot the boxplot of the Delta to describe the level of modification (all genes)

ggplot(RMA_cor, (aes(x=Accession, y=Delta, fill = Time))) +
  geom_boxplot(outlier.size = 0.2, outlier.shape = 21) +
  theme_bw() +
  ylab("Delta of RMA between conditions across time")+
  scale_fill_manual(values = c(TP1="#D8DAEB", TP3="#B2ABD2", TP5="#8073AC", TP7 ="#542788")) +
  stat_summary(fun = mean, color = "darkred", position = position_dodge(0.75),
             geom = "point", shape = 4, size = 3,
             show.legend = FALSE) +
  stat_compare_means(aes(group = Time), method = "kruskal.test", label = "p.signif") +
  stat_summary(fun.data = fun_length, geom = "text",vjust = -20, position = position_dodge(0.75), size = 1, color = "red")


ggplot(RMA_cor, (aes(x=Time, y=Delta, fill = Accession))) +
  geom_boxplot(outlier.size = 0.2, outlier.shape = 21) +
  theme_bw() +
  ylab("Delta of RMA between conditions across time")+
  scale_fill_manual(values = c(A="#1B9E77", B="#D95F02", C="#666666", D="#E7298A", E="#66A61E", F="#E6AB02")) +
  stat_summary(fun = mean, color = "darkred", position = position_dodge(0.75),
             geom = "point", shape = 4, size = 3,
             show.legend = FALSE) +
  stat_compare_means(aes(group = Accession), method = "kruskal.test", label = "p.signif") +
  stat_summary(fun.data = fun_length, geom = "text",vjust = -20, position = position_dodge(0.75), size = 1, color = "red")

Match moditifed sites to corresponded transcripts

Load transcripts associated with modified sites

MODs_counts_list <- 
  read.table("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/11_MODs_profile/1_MODs_count/Intersection/sample.list", header = F)
colnames(MODs_counts_list) <- "transcripts.sample"
MODs_counts_list[,1] <- str_replace(MODs_counts_list[,1], ".anno.gene.clean", "")  
MODs_counts_list

load gene list from annotations

Gene_frame <-  read.table("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/11_MODs_profile/1_MODs_count/WGCNA_transcripts.txt", header = T)
colnames(Gene_frame) <- c("transcripts", "length")
head(Gene_frame)

Collapse the modified sites to corresponded genes

for (i in 1:nrow(MODs_counts_list)){
  PATH <- "/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/11_MODs_profile/1_MODs_count/Intersection/"
  sample.df <- read.table(paste0(PATH,MODs_counts_list[i,1],".anno.gene.clean"), header = F)
  colnames(sample.df) <- c("transcripts", "interval", "Chr", "Site","MOD1", "MOD2")
  ### to be exported and join with the depth file
  sample.df$Index <- paste0(sample.df$Chr,";",sample.df$Site)
  DP.df <- get(merge_DP.list[i,1])
  DP2TP <- inner_join(DP.df,sample.df, by = "Index")
  
  ### transform into log2
  DP2TP$log2H_nonref1 <- log2(DP2TP$H_nonref1)
  DP2TP$log2H_total1 <- log2(DP2TP$H_total1)
  
  ### plot the correlation at the nucleotide level
  #print(ggscatter(DP2TP, x = "log2H_nonref1", y = "log2H_total1", 
   #add = "reg.line", conf.int = TRUE, 
   #cor.coef = TRUE, cor.method = "pearson",
   #ylab = "log2 total read counts", xlab = "log2 modified read counts",
   #color = "black", shape = 21, size = 2, 
   #add.params = list(color = "blue", fill = "lightgray"),
   #title= paste0("Correlation of RNA abundance and modification level of ",MODs_counts_list[i,1])))

    DP2TP_clean <- DP2TP[,c("Index","Nucleotides", "MOD1", "MOD2", "transcripts", "M_ratio", "H_ratio")]
    DP2TP_clean <- DP2TP_clean %>% group_by(transcripts) %>% mutate(transcripts.count = n())
    DP2TP_clean <- left_join(Gene_frame,DP2TP_clean,by = "transcripts")
    DP2TP_clean$RML <- DP2TP_clean$transcripts.count/DP2TP_clean$length
    count.df <- unique(DP2TP_clean[, c("transcripts", "RML")])
    
    count.df[is.na(count.df)] <- 0
    MODs.df <- as.data.frame(count.df[,2])
    colnames(MODs.df)<- MODs_counts_list[i,1]
    
    assign(paste(MODs_counts_list[i,1], "_RML"),MODs.df) 
    assign(paste0(MODs_counts_list[i,1], "_DP2TP"), DP2TP_clean)
}

correlate the RNA adundance (TPM) with RMA score over samples

DP_set <- ls(pattern = "_DP2TP") %>% str_replace("_DP2TP","")
TPM_mean$transcripts <- rownames(TPM_mean)

for (i in 1:length(DP_set)){
  DP.df <- get(paste0(DP_set[1], "_DP2TP"))
  
  ### modify here if M_ratio needed
  DP.TPM <- left_join(DP.df, TPM_mean, by = "transcripts")
  DP.TPM.clean <- DP.TPM[,c("Index", "Nucleotides", "MOD1", "MOD2", "transcripts", "transcripts.count", "H_ratio", DP_set[i])]
  DP.TPM.clean$H_ratio <- log2(DP.TPM.clean$H_ratio)
  DP.TPM.clean[,8] <- log2(DP.TPM.clean[,8])
  
  ### Plot transcripts with one modified nucleotides only 
 # print(ggscatter(DP.TPM.clean[DP.TPM.clean$transcripts.count == 1,], y = "H_ratio", x = DP_set[i], 
         # add = "reg.line", conf.int = TRUE, 
         # cor.coef = TRUE, cor.method = "pearson",
         # ylab = "log2(H_ratio)", xlab = "log2(TPM)",
         # color = "black", shape = 21, size = 2, 
          #add.params = list(color = "blue", fill = "lightgray"),
          #title= paste0("Correlation of RNA abundance and modification level of ",DP_set[i])))
}

Combine the relarive modifiction length-normalzied counts (RML)

### collapse the RML into dataframe
RML_list <- lapply(ls(pattern = " _RML"), get)
RML_combine <- bind_cols(RML_list)
rownames(RML_combine) <- Gene_frame$transcripts
write.csv(RML_combine, "/Users/leon/Documents/Project/Sorghum/Modification_Pattern/RML.csv")
head(RML_combine, 50)

generate RML for upset plot with binary matrix

RML_combine_upset <- RML_combine
for (i in 1:ncol(RML_combine_upset)){
RML_combine_upset[,i] <-  replace(x = RML_combine_upset [,i], 
                   RML_combine_upset [,i] != 0.000000   , 
                   values =  1)}
head(RML_combine_upset, 40)

Combine the site2gene full list for comparison

### bind the DP2TP to extract site2genes
DP2TP_list <- lapply(ls(pattern = "_DP2TP"), get)
DP2TP_combine <- bind_rows(DP2TP_list)
head(na.omit(DP2TP_combine), 50)

sites2gene <- unique(DP2TP_combine[c("Index","transcripts", "Nucleotides", "MOD1", "MOD2")]) %>% na.omit()
head(sites2gene, 50)

write.csv(sites2gene, "/Users/leon/Documents/Project/Sorghum/Modification_Pattern/sites2gene.csv")

Compare the RML relative to TPM (expression over time)

Generate the comparative dataframe

comp.frame <- data.frame("sample" = str_replace(colnames(RML_combine), "WL_", "") %>% str_replace("WW_", "") %>% unique())
comp.frame$sample2 <- comp.frame$sample
comp.frame <- comp.frame %>% separate(sample2, c("Accession", "Time"))
comp.frame

Merge RML to gene expression profile

### combine the information 
for (i in 1:nrow(comp.frame)){
  RML.temp <- RML_combine[,grepl(comp.frame[i,2], colnames(RML_combine))] 
  RML.temp <- RML.temp[,grepl(comp.frame[i,3], colnames(RML.temp))] 
  colnames(RML.temp) <- c("WL_RML", "WW_RML")
  RML.temp$Accession <- comp.frame[i,2]
  RML.temp$Time <- comp.frame[i,3]
  RML.temp$total <- RML.temp$WL_RML + RML.temp$WW_RML
  RML.temp <- RML.temp[RML.temp$total != 0,c("WL_RML", "WW_RML", "Accession", "Time")]
  RML.temp$transcripts <- rownames(RML.temp)
  rownames(RML.temp) <- NULL
  
  ### reformat TPM to RML table
  EXT1 <- TPM_mean[,grepl(comp.frame[i,2],colnames(TPM_mean))] 
  EXT2 <- EXT1[, grepl(comp.frame[i,3],colnames(EXT1))]
  EXT2$transcripts <- TPM_mean$transcripts
  
  ### reformat logFC to RML table
  logFC_temp <- Gene_logFC[Gene_logFC$Accession == comp.frame[i,2] & Gene_logFC$Time == comp.frame[i,3],
                           c("transcripts", "log2FoldChange", "Accession", "Time")]
  
  
  RML_temp2 <- left_join(RML.temp, EXT2, by = "transcripts")
  RML_join <- left_join(RML_temp2, logFC_temp, by = "transcripts")
  colnames(RML_join) <- c("WL_RML", "WW_RML", "Accession.x", "Time.x", "transcripts", "WW_TPM", "WL_TPM", "log2FoldChange", "Accession", "Time")
  RML_join_clean <- RML_join[, c("WL_RML", "WW_RML", "transcripts", "WW_TPM", "WL_TPM", "log2FoldChange", "Accession", "Time")]
    
  #colnames(RML_join)
  assign(paste0(comp.frame[i,1], "_RML_stats"), RML_join_clean)
}

head(RML_join_clean, 40)

Combine all RML statistics comparison files

RML_merge <- lapply(ls(pattern = "_RML_stats"), get)
RML_all <- bind_rows(RML_merge)
head(RML_all, 50)

### Classify groups based on RML and TPM
RML_all$group <- "both"
RML_all[RML_all$WL_RML == 0,"group"] <- "WW"
RML_all[RML_all$WW_RML == 0,"group"] <- "WL"

nrow(RML_all[RML_all$group == "both",])

## [1] 26196

nrow(RML_all[RML_all$group != "both",])

## [1] 17286

Classify total numbers of mods and total genes been modified

Generate the comparative table

Acc <- c("A","B","C","D","E","F")
Time <- c("TP1","TP3","TP5","TP7")

### define the summary table
IS_summary <- data.frame("Accession"=rep(Acc, times = 4),
                                   "Time"= rep(Time, each = 6))
IS_summary$WL_specifc <- "Fill"
IS_summary$WW_specifc <- "Fill"
IS_summary$Overlap <- "Fill"

Summarize shared, WL-specific, WW-specific modified transcripts

for (i in 1:nrow(IS_summary)){
  temp <- RML_combine_upset[,grepl(IS_summary[i,1],colnames(RML_combine_upset ))]
  temp.Time <- temp[,grepl(IS_summary[i,2],colnames(temp))]
  
  temp.Time$count <- rowSums(temp.Time)
  IS_summary[i,3] <- nrow(temp.Time[temp.Time[,1] == 1 & temp.Time[,3] == 1,])
  IS_summary[i,4] <- nrow(temp.Time[temp.Time[,2] == 1 & temp.Time[,3] == 1,])
  IS_summary[i,5] <- nrow(temp.Time[temp.Time$count == 2,])
  
  temp.WL <- data.frame("WL_Sgene" = rownames(temp.Time[temp.Time[,1] == 1 & temp.Time[,3] == 1,]))
  temp.WL$type <- paste0(IS_summary[i,1], "_", IS_summary[i,2], "_WL")
  
  temp.WW <- data.frame("WW_Sgene" = rownames(temp.Time[temp.Time[,2] == 1 & temp.Time[,3] == 1,]))
  temp.WW$type <- paste0(IS_summary[i,1], "_", IS_summary[i,2], "_WW")
  
  assign(paste0(IS_summary[i,1], "_", IS_summary[i,2], "_WL_specific"), temp.WL)
  assign(paste0(IS_summary[i,1], "_", IS_summary[i,2], "_WW_specific"), temp.WW)
}

IS_summary

Reshape the plot table

IS_plot <- melt(IS_summary, id.vars=c("Accession", "Time"))
IS_plot$value <- as.numeric(IS_plot$value)
IS_plot$Index <- paste0(IS_plot$Accession, IS_plot$variable)
IS_plot$variable <- as.character(IS_plot$variable)
IS_plot

Generate the line graph

IS_plot %>% ggplot(aes(x=Time, y = value, color= Accession, group = Index)) +
  geom_line(alpha = 0.8,size = 1, aes(shape = variable))+ 
  theme_bw()+
  scale_color_manual(values = c(A="#1B9E77", B="#D95F02", C="#666666", D="#E7298A", E="#66A61E", F="#E6AB02")) +
  ylab("Numbers of modified transcripts")

## Warning in geom_line(alpha = 0.8, size = 1, aes(shape = variable)): Ignoring
## unknown aesthetics: shape

merge_LS <- lapply(ls(pattern = "_WL_specific"), get)
WL_specific_combine <- bind_rows(merge_LS)
head(WL_specific_combine)

merge_WS <- lapply(ls(pattern = "_WW_specific"), get)
WW_specific_combine <- bind_rows(merge_WS)
head(WW_specific_combine)

Build up the upset plot table of WL and WW specific modified genes

Observe the WL and WW-specific modified transcripts shift over time or over accessions

WL_meta_gene <- data.frame("WL_Sgene"=unique(WL_specific_combine[,1]))
WW_meta_gene <- data.frame("WW_Sgene"=unique(WW_specific_combine[,1]))

WL_sample <- ls(pattern = "_WL_specific")
WW_sample <- ls(pattern = "_WW_specific")
WL_sample

##  [1] "A_TP1_WL_specific" "A_TP3_WL_specific" "A_TP5_WL_specific"
##  [4] "A_TP7_WL_specific" "B_TP1_WL_specific" "B_TP3_WL_specific"
##  [7] "B_TP5_WL_specific" "B_TP7_WL_specific" "C_TP1_WL_specific"
## [10] "C_TP3_WL_specific" "C_TP5_WL_specific" "C_TP7_WL_specific"
## [13] "D_TP1_WL_specific" "D_TP3_WL_specific" "D_TP5_WL_specific"
## [16] "D_TP7_WL_specific" "E_TP1_WL_specific" "E_TP3_WL_specific"
## [19] "E_TP5_WL_specific" "E_TP7_WL_specific" "F_TP1_WL_specific"
## [22] "F_TP3_WL_specific" "F_TP5_WL_specific" "F_TP7_WL_specific"

WW_sample

##  [1] "A_TP1_WW_specific" "A_TP3_WW_specific" "A_TP5_WW_specific"
##  [4] "A_TP7_WW_specific" "B_TP1_WW_specific" "B_TP3_WW_specific"
##  [7] "B_TP5_WW_specific" "B_TP7_WW_specific" "C_TP1_WW_specific"
## [10] "C_TP3_WW_specific" "C_TP5_WW_specific" "C_TP7_WW_specific"
## [13] "D_TP1_WW_specific" "D_TP3_WW_specific" "D_TP5_WW_specific"
## [16] "D_TP7_WW_specific" "E_TP1_WW_specific" "E_TP3_WW_specific"
## [19] "E_TP5_WW_specific" "E_TP7_WW_specific" "F_TP1_WW_specific"
## [22] "F_TP3_WW_specific" "F_TP5_WW_specific" "F_TP7_WW_specific"

Generate matrix for Upset plot

### For WL-sample
for (i in 1:length(WL_sample)){
  
  temp.df <- get(WL_sample[i])
  temp.df$type <- 1
  MODs.df <- left_join(WL_meta_gene,temp.df,by="WL_Sgene")
  MODs.df <- as.data.frame(MODs.df[,2])
  colnames(MODs.df)<- WL_sample[i]
  
  assign(paste0(WL_sample[i], "_WL_bind"),MODs.df) 
}
col_LS <- lapply(ls(pattern = "_WL_bind"), get)
WL_specific_bind <- bind_cols(col_LS)
rownames(WL_specific_bind) <- WL_meta_gene[,1]
WL_specific_bind[is.na(WL_specific_bind)] <- 0
head(WL_specific_bind, 20)

### For WW-sample
for (i in 1:length(WW_sample)){
  
  temp.df <- get(WW_sample[i])
  temp.df$type <- 1
  MODs.df <- left_join(WW_meta_gene,temp.df,by="WW_Sgene")
  MODs.df <- as.data.frame(MODs.df[,2])
  colnames(MODs.df)<- WW_sample[i]
  
  assign(paste0(WW_sample[i], "_WW_bind"),MODs.df) 
}
col_LS <- lapply(ls(pattern = "_WW_bind"), get)
WW_specific_bind <- bind_cols(col_LS)
rownames(WW_specific_bind) <- WW_meta_gene[,1]
WW_specific_bind[is.na(WW_specific_bind)] <- 0
head(WW_specific_bind, 20)

Plot results by time point per accessions using Upset plot

Acc <- c("A","B","C","D","E","F")

for (i in 1:length(Acc)){
  print(upset(WL_specific_bind[,grepl(paste0(Acc[i], "_"),colnames(WL_specific_bind))], 
    nintersects = 200,
    nsets = 4,  
    order.by = c("freq", "degree"), 
    decreasing = c(TRUE,TRUE),
    keep.order = T,
    mb.ratio = c(0.5, 0.5),
    number.angles = 0, 
    text.scale = 1, 
    point.size = 2, 
    line.size = 1,
    matrix.color = "steelblue",
    sets.bar.color = "tomato"))
}

for (i in 1:length(Acc)){
  print(upset(WW_specific_bind[,grepl(paste0(Acc[i], "_"),colnames(WW_specific_bind))], 
    nintersects = 200,
    nsets = 4,  
    order.by = c("freq", "degree"), 
    decreasing = c(TRUE,TRUE),
    keep.order = T,
    mb.ratio = c(0.5, 0.5),
    number.angles = 0, 
    text.scale = 1, 
    point.size = 2, 
    line.size = 1,
    matrix.color = "steelblue",
    sets.bar.color = "tomato"))
}

Plot results by time point per time point using Upset plot

Time <- c("TP1","TP3","TP5","TP7")

### Plot upset plot with per accession over time
for (i in 1:length(Time)){
  print(upset(WL_specific_bind[,grepl(paste0(Time[i], "_"),colnames(WL_specific_bind))], 
    nintersects = 200,
    nsets = 6,  
    order.by = c("freq", "degree"), 
    decreasing = c(TRUE,TRUE),
    keep.order = T,
    mb.ratio = c(0.5, 0.5),
    number.angles = 0, 
    text.scale = 1, 
    point.size = 2, 
    line.size = 1,
    matrix.color = "steelblue",
    sets.bar.color = "tomato"))
}

for (i in 1:length(Time)){
  print(upset(WW_specific_bind[,grepl(paste0(Time[i], "_"),colnames(WW_specific_bind))], 
    nintersects = 200,
    nsets = 6,  
    order.by = c("freq", "degree"), 
    decreasing = c(TRUE,TRUE),
    keep.order = T,
    mb.ratio = c(0.5, 0.5),
    number.angles = 0, 
    text.scale = 1, 
    point.size = 2, 
    line.size = 1,
    matrix.color = "steelblue",
    sets.bar.color = "tomato"))
}