### Install or load packages
library(DESeq2)
library(pheatmap)
library(dplyr)
library(ggplot2)
library(knitr)
library(RColorBrewer)
library(reshape2)
library(ggpubr)
#Load melt files for fout samples comparisons
TPM <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/4_TPM/all_TPM.csv",header = T)
head(TPM, 20)
Both filters for lincRNAs and genes in this dataset were
summarized as follow:
Camelina protein-coding genes and lncRNAs that expressed (on average)
more than 1 count per experiment across all of the experiments are used
in this anaylsis (61,497Â combined, 3,272 for lncRNAs, 58,225for proteins
coding genes)
Load final ID list for lncRNAs and genes
final.list <- read.table("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/4_TPM/final.list", header = F)
colnames(final.list) <- "Gene.ID"
head(final.list, 20)
#merge TPM with final list (63,105 items remained)
all_TPM <- left_join(final.list,TPM,by = "Gene.ID")
#Remove lncRNA and genes from scaffold
all_TPM <- na.omit(all_TPM)
head(all_TPM,20)
Estimated output counts (gene and lncRNA
combined)
C_hispida_Csativa_rep: 21,913
C_hispida_rep: 21,913
C_neglecta_rep: 20,316
C_neglecta_Csativa_rep: 20,316
#calculate the average across samples
C_hispida_all_TPM <- all_TPM [all_TPM $species=="C_hispida",]
C_neglecta_all_TPM <- all_TPM [all_TPM $species=="C_neglecta",]
C_neglecta_like_all_TPM <- all_TPM [all_TPM $species=="C_neglecta_like",]
#subtracting each TPM data with assigned subgenome lncRNA
C_hispida_rep <- C_hispida_all_TPM[,c(1,2,3,7,8,18,19,20,24,25,26)]
C_hispida_Csativa_rep<- C_hispida_all_TPM[,c(1,2,3,4,5,6,9,10,11,31,32,36,37,38)]
C_neglecta_rep <- C_neglecta_all_TPM[,c(1,2,3,27,28,29)]
C_neglecta_Csativa_rep <- C_neglecta_all_TPM[,c(1,2,3,4,5,6,9,10,11,31,32,36,37,38)]
#cALCULATE THE AVERAGE FOR EACH REP
C_hispida_rep$ave <- rowMeans(C_hispida_rep[,4:11])
C_hispida_Csativa_rep$ave <- rowMeans(C_hispida_Csativa_rep[,4:14])
C_hispida_Csativa_rep$species <- "C_hispida.Csativa"
C_neglecta_rep$ave <- rowMeans(C_neglecta_rep[,4:6])
C_neglecta_Csativa_rep$ave <- rowMeans(C_neglecta_Csativa_rep[,4:14])
C_neglecta_Csativa_rep$species <- "C_neglecta.Csativa"
TPM_comp_combine <- bind_rows(C_hispida_rep[,c("species","Gene.ID","ave")],
C_hispida_Csativa_rep[,c("species","Gene.ID","ave")],
C_neglecta_rep[,c("species","Gene.ID","ave")],
C_neglecta_Csativa_rep[,c("species","Gene.ID","ave")])
TPM_comp_combine_clean <- na.omit(TPM_comp_combine)
TPM_lncRNA_clean <- TPM_comp_combine_clean%>% filter(grepl('CONS', Gene.ID))
TPM_gene_clean <- TPM_comp_combine_clean%>% filter(grepl('Csa', Gene.ID))
head(TPM_gene_clean,20)
head(TPM_lncRNA_clean, 20)
TPM_lncRNA_clean$Log2 <- log2(TPM_lncRNA_clean$ave + 1)
TPM_gene_clean$Log2 <- log2(TPM_gene_clean$ave + 1)
### Define species list
species.list <- c("C_hispida","C_hispida.Csativa","C_neglecta","C_neglecta.Csativa")
#radom sample 2000 genes out of the list
for (i in 1:length(species.list)){
sample.df <- TPM_gene_clean[TPM_gene_clean$species==species.list[i],][sample(nrow(TPM_gene_clean[TPM_gene_clean$species==species.list[i],]), 2000), ]
assign(paste0(species.list[i],"_sample"), sample.df)
}
#build merge list and merge 4 samples
Gene_sample_combine <- bind_rows(C_hispida_sample,C_hispida.Csativa_sample,C_neglecta_sample,C_neglecta.Csativa_sample)
head(Gene_sample_combine, 20)
my_comparisons <- list(c("C_hispida","C_hispida.Csativa"), c("C_neglecta","C_neglecta.Csativa"))
fun_length <- function(x){
return(data.frame(y=median(x),label= paste0("N=", length(x))))
}
#Plot all lncRNAs
ggboxplot(TPM_lncRNA_clean , x= "species", y ="Log2", fill = "species", outlier.size = 0.01) +
#add custom color
scale_fill_manual(values = c("steelblue", "steelblue", "tomato", "tomato"))+
#add summary information
stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE) +
# Add numbers of observations
stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
# Add pairwise comparisons p-value
stat_compare_means(comparisons = my_comparisons,test=t.test, label.y = c(10,10)) +
theme_classic() + theme(legend.position = "none") +
ylab("log2 transformed TPM of lncRNAs") +
xlab("") +
scale_y_continuous(breaks = c(0,2,4,6,8,10,12), limits = c(0,13))
#Plot all genes
ggboxplot(Gene_sample_combine,
x= "species", y ="Log2", fill = "species",
outlier.size = 0.01) +
scale_fill_manual(values = c("steelblue", "steelblue", "tomato", "tomato"))+
stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE)+
# Add numbers of observations
stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
# Add pairwise comparisons p-value
stat_compare_means(comparisons = my_comparisons,test=t.test, label.y = c(10,10)) +
theme_classic() + theme(legend.position = "none") +
ylab("log2 transformed TPM of protein-coding genes") +
xlab("") +
scale_y_continuous(breaks = c(0,2,4,6,8,10,12), limits = c(0,13))
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## Warning: Removed 4 rows containing non-finite values (stat_summary).
## Removed 4 rows containing non-finite values (stat_summary).
## Warning: Removed 4 rows containing non-finite values (stat_signif).
Reshape the table contating lncRNAs and genes from 3 sub-genomes
#Subtract all suggenomes under C.sativa reps
Cs_all <- all_TPM[,c(1,2,3,4,5,6,9,10,11,31,32,36,37,38)]
Cs_all$ave <- rowMeans(Cs_all[,4:14])
Cs_all$Log2 <- log2(Cs_all$ave + 1)
#Split dataframe
Cs_lncRNA <- Cs_all%>% filter(grepl('CONS', Gene.ID))
Cs_gene <- Cs_all%>% filter(grepl('Csa', Gene.ID))
#rename class to differentiate lncRNAs and genes
Cs_lncRNA$Class <- "lncRNAs"
Cs_gene$Class <- "Genes"
sub_genome.list <- c("C_hispida","C_neglecta_like","C_neglecta")
#random sampling 2000 genes out of the list for each subgenome
for (i in 1:length(sub_genome.list)){
sub.df <- Cs_gene[Cs_gene$species==sub_genome.list[i],][sample(nrow(Cs_gene[Cs_gene$species==sub_genome.list[i],]), 2000), ]
assign(paste0("Cs_",sub_genome.list[i],"_subgenome"), sub.df)
}
#build merge list and merge 3 samples
Cs_sample <- bind_rows(Cs_C_hispida_subgenome, Cs_C_neglecta_like_subgenome, Cs_C_neglecta_subgenome)
head(Cs_sample, 40)
#build list of comparisons for expression level
my_comp <- list(c("C_neglecta_like","C_hispida"), c("C_neglecta","C_neglecta_like"), c("C_neglecta","C_hispida"))
fun_length <- function(x){
return(data.frame(y=median(x),label= paste0("N=", length(x))))
}
#Build comparisons of lncRNAs
ggboxplot(Cs_lncRNA, x= "species", y ="Log2", fill = "steelblue", outlier.size = 0.01) +
#add stats value
stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE) +
stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
# Add pairwise comparisons p-value
stat_compare_means(comparisons = my_comp,test=t.test, label.y = c(12,10,11)) +
theme_classic() +
ylab("log2 transformed TPM value of lncRNAs") +
xlab("Sub-genome of C.sativa")
ggboxplot(Cs_sample, x= "species", y ="Log2", fill = "tomato", outlier.size = 0.01) +
#add stats value
stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE) +
stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
# Add pairwise comparisons p-value
stat_compare_means(comparisons = my_comp,test=t.test, label.y = c(12,13,14)) +
theme_classic() +
ylab("log2 transformed TPM of expressed Genes") +
xlab("Sub-genome of C.sativa")+
scale_y_continuous(breaks = c(0,2,4,6,8,10,12,14), limits = c(0,15))
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing non-finite values (stat_summary).
## Removed 1 rows containing non-finite values (stat_summary).
## Warning: Removed 1 rows containing non-finite values (stat_signif).
load paralogous table for all lncRNAs across three sub-genomes
orthogroups <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/0_Csat_lincRNA_orthogroups.csv", header = T)
synteny <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/3_Cs-Cs-syntenic_gene_subgenome_assign.csv", header = T)
lncRNA_TPM <- all_TPM%>% filter(grepl('CONS', Gene.ID))
gene_TPM <- all_TPM%>% filter(grepl('Csa', Gene.ID))
#Select samples belong to Csativa
Cs_lncRNA_TPM <- lncRNA_TPM[,c(1,2,3,4,5,6,9,10,11,31,32,36,37,38)]
Chis_lncRNA_TPM <- lncRNA_TPM [,c(1,2,3,7,8,18,19,20,24,25,26)]
Cneg_lncRNA_TPM <- lncRNA_TPM [,c(1,2,3,27,28,29)]
Cs_gene_TPM <- gene_TPM[,c(1,2,3,4,5,6,9,10,11,31,32,36,37,38)]
#Match each ID to orthologous table
for (i in 1:ncol(orthogroups)){
synteny.df <- left_join(data.frame(Gene.ID = orthogroups[,i]),Cs_lncRNA_TPM, by = "Gene.ID")
assign(paste0("sub_orthogroups_",i),synteny.df)
}
#extract neglecta data
Cneg_sub_synteny <- left_join(data.frame(Gene.ID = orthogroups[,2]),Cneg_lncRNA_TPM, by = "Gene.ID")
#extract hispida data
Chis_sub_synteny <- left_join(data.frame(Gene.ID = orthogroups[,4]),Chis_lncRNA_TPM, by = "Gene.ID")
#Match each ID to synteny table
for (i in 1:ncol(synteny)){
synteny.df <- left_join(data.frame(Gene.ID = synteny[,i]),Cs_gene_TPM, by = "Gene.ID")
assign(paste0("sub_synteny_",i),synteny.df)
}
#build the list for each subgroup
lncRNA_list <- lapply(ls(pattern = "sub_orthogroups_"), get)
gene_list <- lapply(ls(pattern = "sub_synteny_"), get)
orthogroups_combine <- bind_cols(lncRNA_list )
syntney_combine <- bind_cols(gene_list)
head(syntney_combine,50)
orthogroups_combine_speceis <- bind_cols(orthogroups_combine,Cneg_sub_synteny,Chis_sub_synteny )
head(orthogroups_combine_speceis,40)
#write.csv(orthogroups_combine_speceis,"/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/1_orthogroup_table.csv", row.names = F)
#write.csv(syntney_combine ,"/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/1_synteny_table.csv", row.names = F)
lncRNA_pheat1 <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/Results/Log2/2_orthogroup_pheatmap_three-speceis.csv", header = T)
lncRNA_pheat2 <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/Results/Log2/2_orthogroup_pheatmap_Cs.csv", header = T)
#Check the header of two files
head(lncRNA_pheat1,20)
head(lncRNA_pheat2,20)
#Transform the column with gene ID into rownames
rownames(lncRNA_pheat1)=lncRNA_pheat1$lncRNA.ID
lncRNA_pheat_ph1 <- lncRNA_pheat1[,-1]
rownames(lncRNA_pheat2)=lncRNA_pheat2$lncRNA.ID
lncRNA_pheat_ph2 <- lncRNA_pheat2[,-1]
#Plot the Pheatmap
pheatmap(lncRNA_pheat_ph1, fontsize_col = 8, fontsize_row = 3)
pheatmap(lncRNA_pheat_ph2, fontsize_col = 8, fontsize_row = 3)
#Melt the dataframe used for Pheatmap
lncRNA_pheat2_melt <- melt(lncRNA_pheat2, id.vars=c("lncRNA.ID"))
lncRNA_pheat1_melt <- melt(lncRNA_pheat1, id.vars=c("lncRNA.ID"))
#Plot the box plot of lncRNAs
sub_comp1 <- list(c("C.Neglecta.like","C.Hispida"), c("C.Neglecta","C.Neglecta.like"), c("C.Neglecta","C.Hispida"))
sub_comp2 <- list(c("C.sativa.Neglecta","C.Neglecta"),c("C.sativa.Hispida","C.Hispida"))
fun_length <- function(x){
return(data.frame(y=median(x),label= paste0("N=", length(x))))
}
ggboxplot(lncRNA_pheat2_melt , x= "variable", y ="value", fill = "steelblue", outlier.size = 0.01) +
#add stats value
stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE) +
stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
# Add pairwise comparisons p-value
stat_compare_means(comparisons = sub_comp1,test=t.test, label.y = c(8,9,10)) +
theme_classic() +
ylab("log2 transformed TPM of orthologous lncRNAs") +
xlab("Sub-genome of C.sativa")+
scale_y_continuous(breaks = c(0,1,2,3,4,5,6,7,8,9,10,11), limits = c(0,11))
lncRNA_pheat1_melt$variable <- factor(lncRNA_pheat1_melt$variable , levels=c("C.sativa.Neglecta","C.Neglecta", "C.sativa.Hispida","C.Hispida"))
ggboxplot(lncRNA_pheat1_melt,
x= "variable", y ="value", fill = "steelblue", outlier.size = 0.01) +
#add stats value
stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE) +
stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
# Add pairwise comparisons p-value
stat_compare_means(comparisons = sub_comp2,test=t.test, label.y = c(7,7)) +
theme_classic() +
ylab("log2 transformed TPM of orthologous lncRNAs") +
xlab("Sub-genome of C.sativa")+
scale_y_continuous(breaks = c(0,1,2,3,4,5,6,7,8,9), limits = c(0,9))
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing non-finite values (stat_summary).
## Removed 1 rows containing non-finite values (stat_summary).
## Warning: Removed 1 rows containing non-finite values (stat_signif).
Each row contains syntenic genes from three sub-genomes
Calculation is based on average and then log2 transformed
#Load log2 transformed TPM for syntenic genes
gene_pheat <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/4_LncRNA/3_subgenome/Results/Log2/2_synteny_pheatmap_log2.csv", header = T)
#random sampling 500 sets of syntenic genes out of the list for each subgenome
#gene_pheat_sample <- gene_pheat[sample(nrow(gene_pheat), 500), ]
gene_pheat_melt <- melt(gene_pheat , id.vars=c("Gene.ID"))
#Plot all syntenic genes across sub-genome
sub_comp3 <- list(c("C.Neglecta.like","C.Hispida"), c("C.Neglecta","C.Neglecta.like"), c("C.Neglecta","C.Hispida"))
fun_length <- function(x){
return(data.frame(y=median(x),label= paste0("N=", length(x))))
}
ggboxplot(gene_pheat_melt , x= "variable", y ="value", fill = "tomato", outlier.size = 0.01) +
#add stats value
stat_summary(fun=mean, color ="darkblue", geom="point", shape=4, size=5,show.legend = TRUE) +
stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
# Add pairwise comparisons p-value
stat_compare_means(comparisons = sub_comp3,test=t.test, label.y = c(4,4.5,5)) +
theme_classic() +
ylab("log2 transformed TPM of protein-coding genes") +
xlab("Sub-genome of C.sativa")+
scale_y_continuous(breaks = c(0,1,2,3,4,5,6), limits = c(0,6))