Comparative Sub-genome analysis of lncRNAs and protein-coding genes in Brassicaceae

Load libraries
Question1: From genome-wide, Does the C.sativa lncRNA from each sub-genome (paralogous lncRNAs) expressed differently compared with diploid species?
Transform data into log2
Random sampling 2000 genes from all expressed genes
Plot comparison of TPM across four samples
Question2: How genes and lncRNAs expressed in each sub-genome from C.sativa ?
Plot boxplot for sub-genome comparisons under the C.sativa
Question3: Does the C.sativa lncRNA from each sub-genome (paralogous lncRNAs) performs weaker or stronger compared with correponded diploid speceis ?
Generate PheatMap for lncRNAs from three speceis and orthogroups
Generate Boxplot for side-by-side comparison of all orthologous groups for lncRNAs
Generate Boxplot for side-by-side comparison of all syntenic genes

Load libraries

### Install or load packages 
library(DESeq2)
library(pheatmap)
library(dplyr)
library(ggplot2)
library(knitr)
library(RColorBrewer)
library(reshape2)
library(ggpubr)

Load TPM data

#Load melt files for fout samples comparisons
TPM <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/4_TPM/all_TPM.csv",header = T)
head(TPM, 20)

Remove genes and lncRNAs has less then 1 feature counts across reps

Both filters for lincRNAs and genes in this dataset were summarized as follow:
Camelina protein-coding genes and lncRNAs that expressed (on average) more than 1 count per experiment across all of the experiments are used in this anaylsis (61,497 combined, 3,272 for lncRNAs, 58,225for proteins coding genes)

Load final ID list for lncRNAs and genes

final.list <- read.table("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/4_TPM/final.list", header = F)
colnames(final.list) <- "Gene.ID"
head(final.list, 20)

#merge TPM with final list (63,105 items remained)
all_TPM <- left_join(final.list,TPM,by = "Gene.ID")
#Remove lncRNA and genes from scaffold
all_TPM <- na.omit(all_TPM)

head(all_TPM,20)

Question1: From genome-wide, Does the C.sativa lncRNA from each sub-genome (paralogous lncRNAs) expressed differently compared with diploid species?

Estimated output counts (gene and lncRNA combined)

C_hispida_Csativa_rep: 21,913
C_hispida_rep: 21,913

C_neglecta_rep: 20,316
C_neglecta_Csativa_rep: 20,316

#calculate the average across samples
C_hispida_all_TPM <- all_TPM [all_TPM $species=="C_hispida",]
C_neglecta_all_TPM <- all_TPM [all_TPM $species=="C_neglecta",]
C_neglecta_like_all_TPM <- all_TPM [all_TPM $species=="C_neglecta_like",]

#subtracting each TPM data with assigned subgenome lncRNA
C_hispida_rep <- C_hispida_all_TPM[,c(1,2,3,7,8,18,19,20,24,25,26)]
C_hispida_Csativa_rep<- C_hispida_all_TPM[,c(1,2,3,4,5,6,9,10,11,31,32,36,37,38)]
  
C_neglecta_rep <- C_neglecta_all_TPM[,c(1,2,3,27,28,29)]
C_neglecta_Csativa_rep <- C_neglecta_all_TPM[,c(1,2,3,4,5,6,9,10,11,31,32,36,37,38)]


#cALCULATE THE AVERAGE FOR EACH REP
C_hispida_rep$ave <- rowMeans(C_hispida_rep[,4:11])
C_hispida_Csativa_rep$ave <- rowMeans(C_hispida_Csativa_rep[,4:14])
C_hispida_Csativa_rep$species <- "C_hispida.Csativa"

C_neglecta_rep$ave <- rowMeans(C_neglecta_rep[,4:6])
C_neglecta_Csativa_rep$ave <- rowMeans(C_neglecta_Csativa_rep[,4:14])
C_neglecta_Csativa_rep$species <- "C_neglecta.Csativa"


TPM_comp_combine <- bind_rows(C_hispida_rep[,c("species","Gene.ID","ave")],
                              C_hispida_Csativa_rep[,c("species","Gene.ID","ave")],
                              C_neglecta_rep[,c("species","Gene.ID","ave")],
                              C_neglecta_Csativa_rep[,c("species","Gene.ID","ave")])


TPM_comp_combine_clean <- na.omit(TPM_comp_combine)

TPM_lncRNA_clean <- TPM_comp_combine_clean%>% filter(grepl('CONS', Gene.ID))
TPM_gene_clean <- TPM_comp_combine_clean%>% filter(grepl('Csa', Gene.ID))

head(TPM_gene_clean,20)

head(TPM_lncRNA_clean, 20)

Transform data into log2

TPM_lncRNA_clean$Log2 <- log2(TPM_lncRNA_clean$ave + 1)
TPM_gene_clean$Log2 <- log2(TPM_gene_clean$ave + 1)

Random sampling 2000 genes from all expressed genes

### Define species list
species.list <- c("C_hispida","C_hispida.Csativa","C_neglecta","C_neglecta.Csativa")

#radom sample 2000 genes out of the list
for (i in 1:length(species.list)){
  sample.df <- TPM_gene_clean[TPM_gene_clean$species==species.list[i],][sample(nrow(TPM_gene_clean[TPM_gene_clean$species==species.list[i],]), 2000), ]
  assign(paste0(species.list[i],"_sample"), sample.df)
}

#build merge list and merge 4 samples
Gene_sample_combine <- bind_rows(C_hispida_sample,C_hispida.Csativa_sample,C_neglecta_sample,C_neglecta.Csativa_sample)
head(Gene_sample_combine, 20)

Plot comparison of TPM across four samples

my_comparisons <- list(c("C_hispida","C_hispida.Csativa"), c("C_neglecta","C_neglecta.Csativa"))

fun_length <- function(x){
  return(data.frame(y=median(x),label= paste0("N=", length(x))))
}

#Plot all lncRNAs
ggboxplot(TPM_lncRNA_clean , x= "species", y ="Log2", fill = "species", outlier.size = 0.01) +
  #add custom color
  scale_fill_manual(values = c("steelblue", "steelblue", "tomato", "tomato"))+ 
  #add summary information
  stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE) +
  # Add numbers of observations
  stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
  # Add pairwise comparisons p-value
  stat_compare_means(comparisons = my_comparisons,test=t.test, label.y = c(10,10)) + 
  theme_classic() + theme(legend.position = "none") +
  ylab("log2 transformed TPM of lncRNAs") +
  xlab("") +
  scale_y_continuous(breaks = c(0,2,4,6,8,10,12), limits = c(0,13))

#Plot all genes
ggboxplot(Gene_sample_combine, 
          x= "species", y ="Log2", fill = "species", 
          outlier.size = 0.01) +
  scale_fill_manual(values = c("steelblue", "steelblue", "tomato", "tomato"))+ 
  stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE)+
  # Add numbers of observations
  stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
  # Add pairwise comparisons p-value
  stat_compare_means(comparisons = my_comparisons,test=t.test, label.y = c(10,10)) +
  
  theme_classic() + theme(legend.position = "none") +
  ylab("log2 transformed TPM of protein-coding genes") +
  xlab("") +
  scale_y_continuous(breaks = c(0,2,4,6,8,10,12), limits = c(0,13))

## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

## Warning: Removed 4 rows containing non-finite values (stat_summary).
## Removed 4 rows containing non-finite values (stat_summary).

## Warning: Removed 4 rows containing non-finite values (stat_signif).

Question2: How genes and lncRNAs expressed in each sub-genome from C.sativa ?

Reshape the table contating lncRNAs and genes from 3 sub-genomes

#Subtract all suggenomes under C.sativa reps

Cs_all <- all_TPM[,c(1,2,3,4,5,6,9,10,11,31,32,36,37,38)]
Cs_all$ave <- rowMeans(Cs_all[,4:14])
Cs_all$Log2 <- log2(Cs_all$ave + 1)

#Split dataframe
Cs_lncRNA <- Cs_all%>% filter(grepl('CONS', Gene.ID))
Cs_gene <- Cs_all%>% filter(grepl('Csa', Gene.ID))

#rename class to differentiate lncRNAs and genes
Cs_lncRNA$Class <- "lncRNAs"
Cs_gene$Class <- "Genes"

Create randome sampling for genes from each sub-genome

sub_genome.list <- c("C_hispida","C_neglecta_like","C_neglecta")

#random sampling 2000 genes out of the list for each subgenome
for (i in 1:length(sub_genome.list)){
  sub.df <- Cs_gene[Cs_gene$species==sub_genome.list[i],][sample(nrow(Cs_gene[Cs_gene$species==sub_genome.list[i],]), 2000), ]
  assign(paste0("Cs_",sub_genome.list[i],"_subgenome"), sub.df)
}


#build merge list and merge 3 samples
Cs_sample <- bind_rows(Cs_C_hispida_subgenome, Cs_C_neglecta_like_subgenome, Cs_C_neglecta_subgenome)
head(Cs_sample, 40)

Plot boxplot for sub-genome comparisons under the C.sativa

#build list of comparisons for expression level 
my_comp <- list(c("C_neglecta_like","C_hispida"), c("C_neglecta","C_neglecta_like"), c("C_neglecta","C_hispida"))

fun_length <- function(x){
  return(data.frame(y=median(x),label= paste0("N=", length(x))))
}

#Build comparisons of lncRNAs
ggboxplot(Cs_lncRNA, x= "species", y ="Log2", fill = "steelblue", outlier.size = 0.01) + 
  #add stats value 
  stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE) +
  stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
  # Add pairwise comparisons p-value
  stat_compare_means(comparisons = my_comp,test=t.test, label.y = c(12,10,11)) + 
  theme_classic()  +
  ylab("log2 transformed TPM value of lncRNAs") +
  xlab("Sub-genome of C.sativa")

ggboxplot(Cs_sample, x= "species", y ="Log2", fill = "tomato", outlier.size = 0.01) +
  #add stats value
  stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE) +
  stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
  # Add pairwise comparisons p-value
  stat_compare_means(comparisons = my_comp,test=t.test, label.y = c(12,13,14)) + 
  theme_classic()  +
  ylab("log2 transformed TPM of expressed Genes") +
  xlab("Sub-genome of C.sativa")+
  scale_y_continuous(breaks = c(0,2,4,6,8,10,12,14), limits = c(0,15))

## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1 rows containing non-finite values (stat_summary).
## Removed 1 rows containing non-finite values (stat_summary).

## Warning: Removed 1 rows containing non-finite values (stat_signif).

Question3: Does the C.sativa lncRNA from each sub-genome (paralogous lncRNAs) performs weaker or stronger compared with correponded diploid speceis ?

load paralogous table for all lncRNAs across three sub-genomes

orthogroups <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/0_Csat_lincRNA_orthogroups.csv", header = T)
synteny <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/3_Cs-Cs-syntenic_gene_subgenome_assign.csv", header = T)

lncRNA_TPM <- all_TPM%>% filter(grepl('CONS', Gene.ID))
gene_TPM <- all_TPM%>% filter(grepl('Csa', Gene.ID))

#Select samples belong to Csativa
Cs_lncRNA_TPM <- lncRNA_TPM[,c(1,2,3,4,5,6,9,10,11,31,32,36,37,38)]
Chis_lncRNA_TPM <- lncRNA_TPM [,c(1,2,3,7,8,18,19,20,24,25,26)]
Cneg_lncRNA_TPM <- lncRNA_TPM [,c(1,2,3,27,28,29)]
Cs_gene_TPM <- gene_TPM[,c(1,2,3,4,5,6,9,10,11,31,32,36,37,38)]

#Match each ID to orthologous table 
for (i in 1:ncol(orthogroups)){
  synteny.df <- left_join(data.frame(Gene.ID = orthogroups[,i]),Cs_lncRNA_TPM, by = "Gene.ID")
  assign(paste0("sub_orthogroups_",i),synteny.df)
}

#extract neglecta data
Cneg_sub_synteny <- left_join(data.frame(Gene.ID = orthogroups[,2]),Cneg_lncRNA_TPM, by = "Gene.ID")
#extract hispida data
Chis_sub_synteny <- left_join(data.frame(Gene.ID = orthogroups[,4]),Chis_lncRNA_TPM, by = "Gene.ID")

#Match each ID to synteny table
for (i in 1:ncol(synteny)){
  synteny.df <- left_join(data.frame(Gene.ID = synteny[,i]),Cs_gene_TPM, by = "Gene.ID")
  assign(paste0("sub_synteny_",i),synteny.df)
}

#build the list for each subgroup 
lncRNA_list <- lapply(ls(pattern = "sub_orthogroups_"), get)
gene_list <- lapply(ls(pattern = "sub_synteny_"), get)

orthogroups_combine <- bind_cols(lncRNA_list )
syntney_combine <- bind_cols(gene_list)
head(syntney_combine,50)

orthogroups_combine_speceis <- bind_cols(orthogroups_combine,Cneg_sub_synteny,Chis_sub_synteny )
head(orthogroups_combine_speceis,40)

#write.csv(orthogroups_combine_speceis,"/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/1_orthogroup_table.csv", row.names = F)
#write.csv(syntney_combine ,"/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/1_synteny_table.csv", row.names = F)

Generate PheatMap for lncRNAs from three speceis and orthogroups

lncRNA_pheat1 <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/Results/Log2/2_orthogroup_pheatmap_three-speceis.csv", header = T)
lncRNA_pheat2 <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/Results/Log2/2_orthogroup_pheatmap_Cs.csv", header = T)

#Check the header of two files
head(lncRNA_pheat1,20)

head(lncRNA_pheat2,20)

#Transform the column with gene ID into rownames
rownames(lncRNA_pheat1)=lncRNA_pheat1$lncRNA.ID
lncRNA_pheat_ph1 <- lncRNA_pheat1[,-1]

rownames(lncRNA_pheat2)=lncRNA_pheat2$lncRNA.ID
lncRNA_pheat_ph2 <- lncRNA_pheat2[,-1]

#Plot the Pheatmap
pheatmap(lncRNA_pheat_ph1, fontsize_col = 8, fontsize_row = 3)

pheatmap(lncRNA_pheat_ph2, fontsize_col = 8, fontsize_row = 3)

Generate Boxplot for side-by-side comparison of all orthologous groups for lncRNAs

#Melt the dataframe used for Pheatmap
lncRNA_pheat2_melt <- melt(lncRNA_pheat2, id.vars=c("lncRNA.ID"))
lncRNA_pheat1_melt <- melt(lncRNA_pheat1, id.vars=c("lncRNA.ID"))

#Plot the box plot of lncRNAs
sub_comp1 <- list(c("C.Neglecta.like","C.Hispida"), c("C.Neglecta","C.Neglecta.like"), c("C.Neglecta","C.Hispida"))
sub_comp2 <- list(c("C.sativa.Neglecta","C.Neglecta"),c("C.sativa.Hispida","C.Hispida"))

fun_length <- function(x){
  return(data.frame(y=median(x),label= paste0("N=", length(x))))
}

ggboxplot(lncRNA_pheat2_melt , x= "variable", y ="value", fill = "steelblue", outlier.size = 0.01) +
  #add stats value
  stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE) +
  stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
  # Add pairwise comparisons p-value
  stat_compare_means(comparisons = sub_comp1,test=t.test, label.y = c(8,9,10)) + 
  theme_classic()  +
  ylab("log2 transformed TPM of orthologous lncRNAs") +
  xlab("Sub-genome of C.sativa")+
  scale_y_continuous(breaks = c(0,1,2,3,4,5,6,7,8,9,10,11), limits = c(0,11))

lncRNA_pheat1_melt$variable <- factor(lncRNA_pheat1_melt$variable , levels=c("C.sativa.Neglecta","C.Neglecta", "C.sativa.Hispida","C.Hispida"))

ggboxplot(lncRNA_pheat1_melt,
          x= "variable", y ="value", fill = "steelblue", outlier.size = 0.01) +
  #add stats value
  stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE) +
  stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
  # Add pairwise comparisons p-value
  stat_compare_means(comparisons = sub_comp2,test=t.test, label.y = c(7,7)) + 
  theme_classic()  +
  ylab("log2 transformed TPM of orthologous lncRNAs") +
  xlab("Sub-genome of C.sativa")+
  scale_y_continuous(breaks = c(0,1,2,3,4,5,6,7,8,9), limits = c(0,9))

## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1 rows containing non-finite values (stat_summary).
## Removed 1 rows containing non-finite values (stat_summary).

## Warning: Removed 1 rows containing non-finite values (stat_signif).

Generate Boxplot for side-by-side comparison of all syntenic genes

Each row contains syntenic genes from three sub-genomes
Calculation is based on average and then log2 transformed

#Load log2 transformed TPM for syntenic genes
gene_pheat <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/Results/Log2/2_synteny_pheatmap_log2.csv", header = T)

#random sampling 500 sets of syntenic genes out of the list for each subgenome
#gene_pheat_sample <- gene_pheat[sample(nrow(gene_pheat), 500), ]
gene_pheat_melt <- melt(gene_pheat , id.vars=c("Gene.ID"))


#Plot all syntenic genes across sub-genome
sub_comp3 <- list(c("C.Neglecta.like","C.Hispida"), c("C.Neglecta","C.Neglecta.like"), c("C.Neglecta","C.Hispida"))
fun_length <- function(x){
  return(data.frame(y=median(x),label= paste0("N=", length(x))))
}


ggboxplot(gene_pheat_melt , x= "variable", y ="value", fill = "tomato", outlier.size = 0.01) +
  #add stats value
  stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE) +
  stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
  # Add pairwise comparisons p-value
  stat_compare_means(comparisons = sub_comp3,test=t.test, label.y = c(4,4.5,5)) + 
  theme_classic()  +
  ylab("log2 transformed TPM of protein-coding genes") +
  xlab("Sub-genome of C.sativa")+
  scale_y_continuous(breaks = c(0,1,2,3,4,5,6), limits = c(0,6))