Correlation between gene and traits of cotton

Plot correlation between gene expression and traits
Correaltion analysis between Log2FC and phenotype ratio (WL/WW) conditions

Plot correlation between gene expression and traits

Load packages and transformed tables

library(dplyr)
library(ggplot2)
library(knitr)
library(RColorBrewer)
library(ggpubr)
library(tidyr)
library(cowplot)
library(knitr)

Trait Tables

Trait <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/10_phenotype/FIBER_yield_and_quality_2019_BLUEs_TIPO_removed.csv", header = T)
rownames(Trait) <- Trait$genotype
head(Trait)

Metabolites_BLUE <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/10_phenotype/Metabolites_2019_BLUEs.csv", header = T, row.names = 1)
Metabolites_BLUE$genotype <- rownames(Metabolites_BLUE) 
head(Metabolites_BLUE)

Gene expression Table

TPM_bind <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/3_FeatureCounts/Gh_WGCNA-TPM-ave_bind.csv", header = T, row.names = 1)
TPM_corr = as.data.frame(t(TPM_bind))
head(TPM_corr)

Zscore_bind <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/14_MapMan/turquoise_Zscore.csv", header = T, row.names = 1)
Zscore_corr = as.data.frame(t(Zscore_bind))
head(Zscore_corr)

Gene information List

GenePlot <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/17_Cistrome/03_DL-prediction/Plot_GeneList.csv", header = T)
head(GenePlot)

Reformat the QTls with TPM data

### import lint related genes
yield_gene <- GenePlot[GenePlot$Class == "QTLs", ]
yield_gene

for (i in 1:nrow(yield_gene)){
  
  yield_df <- TPM_corr[,yield_gene[i,1], drop = F]
  yield_df$Gene <- yield_gene[i,7]
  yield_df$Accessions <- rownames(yield_df)
  rownames(yield_df) <- NULL
  colnames(yield_df) <- c("TPM", "Gene", "genotype")
  
  assign(paste0("yield_QTLs_", yield_gene[i,1]), yield_df)
}

### Merge QTLs into same datafroma
QTLs_combine <- bind_rows(lapply(ls(pattern = "yield_QTLs_"), get))
QTLs_combine$TPM <- as.numeric(QTLs_combine$TPM)

## Warning: NAs introduced by coercion

head(QTLs_combine)

#rm(list=ls(pattern="yield_QTLs_"))

Reformat the QTls paralogs with TPM data

### for QTLs paralogs
QTLs_paralogs <- GenePlot[GenePlot$Class == "QTLs_paralogs", ]
QTLs_paralogs

for (i in 1:nrow(QTLs_paralogs)){
  
  QTLs_paralogs_df <- TPM_corr[,QTLs_paralogs[i,1], drop = F]
  QTLs_paralogs_df$Gene <- QTLs_paralogs[i,7]
  QTLs_paralogs_df$Accessions <- rownames(QTLs_paralogs_df)
  rownames(QTLs_paralogs_df) <- NULL
  colnames(QTLs_paralogs_df) <- c("TPM", "Gene", "genotype")
  
  assign(paste0("Paralogs-yield_QTLs_", QTLs_paralogs[i,1]), QTLs_paralogs_df)
}

### Merge QTLs into same datafroma
QTLs_paralogs_combine <- bind_rows(lapply(ls(pattern = "Paralogs-yield_QTLs_"), get))
QTLs_paralogs_combine$TPM <- as.numeric(QTLs_paralogs_combine$TPM)

## Warning: NAs introduced by coercion

head(QTLs_paralogs_combine)

Reformat the QTls with Z-score data

### import lint related genes
yield_gene <- GenePlot[GenePlot$Class == "QTLs", ]
yield_gene

for (i in 1:nrow(yield_gene)){
  
  yield_df <- Zscore_corr[,yield_gene[i,1], drop = F]
  yield_df$Gene <- yield_gene[i,7]
  yield_df$Accessions <- rownames(yield_df)
  rownames(yield_df) <- NULL
  colnames(yield_df) <- c("Zscore", "Gene", "genotype")
  
  assign(paste0("Zsore_yieldQTLs-", yield_gene[i,1]), yield_df)
}

### Merge QTLs into same datafroma
QTLs_Zscore <- bind_rows(lapply(ls(pattern = "Zsore_yieldQTLs-"), get))
QTLs_Zscore$Zscore <- as.numeric(QTLs_Zscore$Zscore)
head(QTLs_Zscore)

#rm(list=ls(pattern="yield_QTLs_"))

Combine traits and phenotype data

### Extract values of selected traits
Select_traits <- Trait[,c("genotype", "Lint_yield", "treatment")]
rownames(Select_traits) <- NULL

### combine two sets
Select_Comp <- left_join(QTLs_combine, Select_traits, by = "genotype")
head(Select_Comp)

Select_Zscore <- left_join(QTLs_Zscore, Select_traits, by = "genotype")
head(Select_Zscore)

### combine two sets
QTLs_paralogs_combine_Comp <- left_join(QTLs_paralogs_combine, Select_traits, by = "genotype")
head(QTLs_paralogs_combine_Comp)

Select_Zscore <- left_join(QTLs_Zscore, Select_traits, by = "genotype")
head(Select_Zscore)

Plot QTLs correlation with TPM

Plot_WL <- Select_Comp[Select_Comp$treatment == "WL",]

### Plot each QTLs
for (i in 1:nrow(yield_gene)){
  print(ggscatter(Plot_WL[Plot_WL$Gene == yield_gene[i,7],], 
          x = "TPM", y = "Lint_yield", 
          add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson",
          xlab = paste0("TPM of ", yield_gene[i,7]), ylab = "Lint yield",
          shape = 21, size = 3, 
          add.params = list(color = "blue", fill = "lightgray")))
}

Plot QTLs paralogs correlation with TPM

Plot_WL_paralogs <- QTLs_paralogs_combine_Comp[QTLs_paralogs_combine_Comp$treatment == "WL",]

### Plot each QTLs
for (i in 1:nrow(QTLs_paralogs)){
  print(ggscatter(Plot_WL_paralogs[Plot_WL_paralogs$Gene == QTLs_paralogs[i,7],], 
          x = "TPM", y = "Lint_yield", 
          add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson",
          xlab = paste0("TPM of ", QTLs_paralogs[i,7]), ylab = "Lint yield",
          shape = 21, size = 3, 
          add.params = list(color = "blue", fill = "lightgray")))
}

### Plot QTLs paralogs and QTLs correlation with TPM side by side

Plot_WL_paralogs$Class <- "QTL_paralogs"
Plot_WL$Class <- "QTL"

### combined two dataset
QTLs_comparison <- bind_rows(Plot_WL_paralogs, Plot_WL) 
head(QTLs_comparison)

QTLs_comparison[QTLs_comparison$Gene == "API5",]%>%
  ggplot(aes(x=TPM, y=Lint_yield, color=Class))+
  geom_point(size = 3, alpha = 0.7)+ 
  scale_color_manual(values = c(QTL_paralogs="steelblue", QTL="tomato"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and API5 TPM") +
  geom_smooth(method = "lm", aes(color= Class), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("TPM of API5") +
  ylab("Lint yieled (Kg)") + 
  theme(legend.position = c(0.8, 0.2))

QTLs_comparison[QTLs_comparison$Gene == "IPS",]%>%
  ggplot(aes(x=TPM, y=Lint_yield, color=Class))+
  geom_point(size = 3, alpha = 0.7)+ 
  scale_color_manual(values = c(QTL_paralogs="steelblue", QTL="tomato"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and IPS TPM") +
  geom_smooth(method = "lm", aes(color= Class), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("TPM of IPS") +
  ylab("Lint yieled (Kg)") + 
  theme(legend.position = c(0.8, 0.2))

QTLs_comparison[QTLs_comparison$Gene == "ABP",]%>%
  ggplot(aes(x=TPM, y=Lint_yield, color=Class))+
  geom_point(size = 3, alpha = 0.7)+ 
  scale_color_manual(values = c(QTL_paralogs="steelblue", QTL="tomato"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and ABP TPM") +
  geom_smooth(method = "lm", aes(color= Class), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("TPM of ABP") +
  ylab("Lint yieled (Kg)") + 
  theme(legend.position = c(0.8, 0.2))

QTLs_comparison[QTLs_comparison$Gene == "DBC",]%>%
  ggplot(aes(x=TPM, y=Lint_yield, color=Class))+
  geom_point(size = 3, alpha = 0.7)+ 
  scale_color_manual(values = c(QTL_paralogs="steelblue", QTL="tomato"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and DBC TPM") +
  geom_smooth(method = "lm", aes(color= Class), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("TPM of DBC") +
  ylab("Lint yieled (Kg)") + 
  theme(legend.position = c(0.8, 0.2))

QTLs_comparison[QTLs_comparison$Gene == "L7T",]%>%
  ggplot(aes(x=TPM, y=Lint_yield, color=Class))+
  geom_point(size = 3, alpha = 0.7)+ 
  scale_color_manual(values = c(QTL_paralogs="steelblue", QTL="tomato"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and L7T TPM") +
  geom_smooth(method = "lm", aes(color= Class), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("TPM of L7T") +
  ylab("Lint yieled (Kg)") + 
  theme(legend.position = c(0.8, 0.2))

Plot with normalized Z-score for QTLs

Zscore_WL <- Select_Zscore [Select_Zscore$treatment == "WL",]
head(Zscore_WL)

### Plot each QTLs
Zscore_WL %>%
  ggplot(aes(x=Zscore, y=Lint_yield, color=Gene))+
  geom_point(size = 3, alpha = 0.5)+ 
  scale_color_manual(values = c(IPS="#1B9E77", ABP="tomato", L7T="#E7298A", DBC="steelblue", API5="#E6AB02"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and 5 QTLs Zscore") +
  geom_smooth(method = "lm", aes(color= Gene), size = 0.5, level = 0.95) +
  xlab("Zscore of lint yield QTLs") +
  ylab("Lint yieled (Kg)") +
  stat_cor(label.y.npc = "top") +
  theme(legend.position = c(0.8, 0.2))

Reformat the TFs with TPM data for correlation analysis

### import lint related genes
TF_gene_all <- GenePlot[GenePlot$Class == "TFs" | GenePlot$Class == "TFs_paralogs", ]
TF_gene_all

for (i in 1:nrow(TF_gene_all)){
  
  TF_df <- TPM_corr[,TF_gene_all[i,1], drop = F]
  TF_df$Gene <- TF_gene_all[i,1]
  TF_df$Accessions <- rownames(TF_df)
  rownames(TF_df) <- NULL
  colnames(TF_df) <- c("TPM", "Gene", "genotype")
  
  assign(paste0("yield_TFs_", TF_gene_all[i,1]), TF_df)
}

### Merge QTLs into same datafroma
TFs_combine <- bind_rows(lapply(ls(pattern = "yield_TFs_"), get))
TFs_combine$TPM <- as.numeric(TFs_combine$TPM)

## Warning: NAs introduced by coercion

head(TFs_combine)

Reformat the TFs with Zscore data for correlation analysis

TF_gene <- GenePlot[GenePlot$Class == "TFs", ]
TF_gene

for (i in 1:nrow(TF_gene)){
  
  TF_df <- Zscore_corr[,TF_gene[i,1], drop = F]
  TF_df$Gene <- TF_gene[i,1]
  TF_df$Accessions <- rownames(TF_df)
  rownames(TF_df) <- NULL
  colnames(TF_df) <- c("Zscore", "Gene", "genotype")
  
  assign(paste0("Zscore_yield_TFs-", TF_gene[i,1]), TF_df)
}

### Merge QTLs into same datafroma
TFs_Zscore <- bind_rows(lapply(ls(pattern = "Zscore_yield_TFs-"), get))
TFs_Zscore$Zscore <- as.numeric(TFs_Zscore$Zscore)
head(TFs_Zscore)

Combine with selected traits data

### combine two sets
Select_TF <- left_join(TFs_combine, Select_traits, by = "genotype")
head(Select_TF)

Select_Zscore_TF <- left_join(TFs_Zscore, Select_traits, by = "genotype")
head(Select_Zscore_TF)

Plot TFs correlation with TPM

Plot_WL_TF <- Select_TF[Select_TF$treatment == "WL",] %>% na.omit()
Plot_WL_TF

### Plot each QTLs
for (i in 1:nrow(TF_gene_all)){
  print(ggscatter(Plot_WL_TF[Plot_WL_TF$Gene == TF_gene_all[i,1],], 
          x = "TPM", y = "Lint_yield", 
          add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson",
          xlab = paste0("TPM of ", TF_gene_all[i,1]), 
          ylab = "Lint yield (Kg)",
          shape = 21, size = 3, 
          add.params = list(color = "blue", fill = "lightgray"),
          title = paste0(TF_gene_all[i,7], " and lint yield correlation")))
}

Plot selected TFs with paralogs side by side

HSFB2A

Plot_WL_TF[Plot_WL_TF$Gene == "Gohir.D11G009300" | Plot_WL_TF$Gene =="Gohir.A11G010000",] %>%
  ggplot(aes(x=TPM, y=Lint_yield, color=Gene))+
  geom_point(size = 3, alpha = 0.7)+ 
  scale_color_manual(values = c(Gohir.D11G009300="steelblue", Gohir.A11G010000="tomato"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and HSFB2A TPM") +
  geom_smooth(method = "lm", aes(color= Gene), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("TPM of HSFB2A") +
  ylab("Lint yieled (Kg)") + 
  theme(legend.position = c(0.8, 0.2))

## `geom_smooth()` using formula = 'y ~ x'

DREB2C

Plot_WL_TF[Plot_WL_TF$Gene == "Gohir.A13G021700" | Plot_WL_TF$Gene =="Gohir.D13G022300",] %>%
  ggplot(aes(x=TPM, y=Lint_yield, color=Gene))+
  geom_point(size = 3, alpha = 0.7)+ 
  scale_color_manual(values = c(Gohir.A13G021700="steelblue", Gohir.D13G022300="tomato"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and GhDREB2C TPM") +
  geom_smooth(method = "lm", aes(color= Gene), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("TPM of GhDREB2C") +
  ylab("Lint yieled (Kg)") + 
  theme(legend.position = c(0.8, 0.2))

## `geom_smooth()` using formula = 'y ~ x'

HSFB2B

Plot_WL_TF[Plot_WL_TF$Gene == "Gohir.A03G014700" | Plot_WL_TF$Gene =="Gohir.D03G153000" |
           Plot_WL_TF$Gene == "Gohir.D08G198000" | Plot_WL_TF$Gene =="Gohir.A08G179400",] %>%
  ggplot(aes(x=TPM, y=Lint_yield, color=Gene))+
  geom_point(size = 3, alpha = 0.7)+ 
  scale_color_manual(values = c(Gohir.A03G014700="steelblue", Gohir.D03G153000="tomato",
                                Gohir.D08G198000="black", Gohir.A08G179400="#E6AB02"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and GhHSFB2B TPM") +
  geom_smooth(method = "lm", aes(color= Gene), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("TPM of GhHSFB2B") +
  ylab("Lint yieled (Kg)") + 
  theme(legend.position = c(0.8, 0.2))

## `geom_smooth()` using formula = 'y ~ x'

HSFA6B

Plot_WL_TF[Plot_WL_TF$Gene == "Gohir.A08G064100" | Plot_WL_TF$Gene =="Gohir.D08G072600" ,] %>%
  ggplot(aes(x=TPM, y=Lint_yield, color=Gene))+
  geom_point(size = 3, alpha = 0.7)+ 
  scale_color_manual(values = c(Gohir.A08G064100="steelblue", Gohir.D08G072600="tomato"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and GhHSFA6B TPM") +
  geom_smooth(method = "lm", aes(color= Gene), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("TPM of GhHSFA6B") +
  ylab("Lint yieled (Kg)") + 
  theme(legend.position = c(0.8, 0.2))

## `geom_smooth()` using formula = 'y ~ x'

Plot with normalized Z-score for TFs

Zscore_WL_TF <- Select_Zscore_TF[Select_Zscore_TF$treatment == "WL",]
head(Zscore_WL_TF)

MBFC1

### Plot each QTLs
MBFC_Zscore <- Zscore_WL_TF[Zscore_WL_TF$Gene == "Gohir.A06G061400" | Zscore_WL_TF$Gene == "Gohir.D06G059700", ] 

MBFC_Zscore %>%
  ggplot(aes(x=Zscore, y=Lint_yield, color=Gene))+
  geom_point(size = 2, alpha = 0.7)+ 
  scale_color_manual(values = c(Gohir.A06G061400="steelblue", Gohir.D06G059700="tomato"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and GhMBFC1 Zscore") +
  geom_smooth(method = "lm", aes(color= Gene), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("Zscore of TFs") +
  ylab("Lint yieled (Kg)") + 
  theme(legend.position = c(0.8, 0.2))

DREB2C

### Plot each QTLs
DREB2C_Zscore <- Zscore_WL_TF[Zscore_WL_TF$Gene == "Gohir.A13G021700" | Zscore_WL_TF$Gene == "Gohir.D13G022300", ] 

DREB2C_Zscore %>%
  ggplot(aes(x=Zscore, y=Lint_yield, color=Gene))+
  geom_point(size = 2, alpha = 0.7)+ 
  scale_color_manual(values = c(Gohir.A13G021700="steelblue", Gohir.D13G022300="tomato"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and GhDREB2C Zscore") +
  geom_smooth(method = "lm", aes(color= Gene), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("Zscore of TFs") +
  ylab("Lint yieled (Kg)") +
  theme(legend.position = c(0.8, 0.2))

HSFA6B

### Plot each QTLs
HSFA6B_Zscore <- Zscore_WL_TF[Zscore_WL_TF$Gene == "Gohir.A08G064100" | Zscore_WL_TF$Gene == "Gohir.D08G072600",] 

HSFA6B_Zscore %>%
  ggplot(aes(x=Zscore, y=Lint_yield, color=Gene))+
  geom_point(size = 2, alpha = 0.7)+ 
  scale_color_manual(values = c(Gohir.A08G064100="steelblue", Gohir.D08G072600="tomato"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and GhHSFA6B expression") +
  geom_smooth(method = "lm", aes(color= Gene), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("Zscore of TFs") +
  ylab("Lint yieled (Kg)") +
  theme(legend.position = c(0.8, 0.2))

HSFB2B

### Plot each QTLs
HSF7_Zscore <- Zscore_WL_TF[Zscore_WL_TF$Gene == "Gohir.D03G153000" | Zscore_WL_TF$Gene == "Gohir.D08G198000" |
                              Zscore_WL_TF$Gene == "Gohir.A08G179400",] 
HSF7_Zscore %>%
  ggplot(aes(x=Zscore, y=Lint_yield, color=Gene))+
  geom_point(size = 2, alpha = 0.7)+ 
  scale_color_manual(values = c(Gohir.A08G179400="steelblue", Gohir.D08G198000="tomato", Gohir.D03G153000 = "black"))+
  theme_bw()+ 
  ggtitle("Correaltion between lint yield and GhHSFB2B Zscore") +
  geom_smooth(method = "lm", aes(color= Gene), size = 0.5, level = 0.95) +
  stat_cor(label.y.npc = "top") +
  xlab("Zscore of TFs") +
  ylab("Lint yieled (Kg)") +
  theme(legend.position = c(0.8, 0.2))

Correaltion analysis between Log2FC and phenotype ratio (WL/WW) conditions

Load packages and envs

library(Hmisc)
library(dplyr)
library(reshape2)
library(ggpubr)
library(ggVennDiagram)
library(pheatmap)
library(pheatmap)
library(stringr)
library(RColorBrewer)
library(viridis)
library(ggplot2)
library(tidyr)
library(cowplot)
library(knitr)

load("/Users/Leon/OneDrive - Cornell University/Projects/6_Cotton/2_Pipeline/LogFC.RData")

Define correlation function

flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

Load Log2FC data derived

LOG <- read.csv("/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/7_LogFC_ratio/Ratio_LogFC.csv", header = T)
row.names(LOG) = LOG[,1]
head(LOG)

Ratio <- read.csv("/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/7_LogFC_ratio/LogFC_meta_table.csv", header = T)
row.names(Ratio) = Ratio$Gene
head(Ratio)

define new dataframe for calculation output (10 traits from df1 to df10)

df1 <- data.frame(matrix(ncol = 7 , nrow = 67413))
df2 <- data.frame(matrix(ncol = 7 , nrow = 67413))
df3 <- data.frame(matrix(ncol = 7 , nrow = 67413))
df4 <- data.frame(matrix(ncol = 7 , nrow = 67413))
df5 <- data.frame(matrix(ncol = 7 , nrow = 67413))
df6 <- data.frame(matrix(ncol = 7 , nrow = 67413))
df7 <- data.frame(matrix(ncol = 7 , nrow = 67413))
df8 <- data.frame(matrix(ncol = 7 , nrow = 67413))
df9 <- data.frame(matrix(ncol = 7 , nrow = 67413))
df10 <- data.frame(matrix(ncol = 7 , nrow = 67413))


colnames(df1) <- c("row", "column", "R", "P-value", "Intercept", "Effects", "R.square")
colnames(df2) <- c("row", "column", "R", "P-value", "Intercept", "Effects", "R.square")
colnames(df3) <- c("row", "column", "R", "P-value", "Intercept", "Effects", "R.square")
colnames(df4) <- c("row", "column", "R", "P-value", "Intercept", "Effects", "R.square")
colnames(df5) <- c("row", "column", "R", "P-value", "Intercept", "Effects", "R.square")
colnames(df6) <- c("row", "column", "R", "P-value", "Intercept", "Effects", "R.square")
colnames(df7) <- c("row", "column", "R", "P-value", "Intercept", "Effects", "R.square")
colnames(df8) <- c("row", "column", "R", "P-value", "Intercept", "Effects", "R.square")
colnames(df9) <- c("row", "column", "R", "P-value", "Intercept", "Effects", "R.square")
colnames(df10) <- c("row", "column", "R", "P-value", "Intercept", "Effects", "R.square")

Calculate the correlation between gene expression and phenotype (production traits)

for (i in 1:(nrow(Ratio))){
  res <- rcorr(t(rbind(LOG[1,-1], Ratio[i,-1])))
  df1[i,1:4] <- flattenCorrMatrix(res$r, res$P)
  temp.df = data.frame(t(rbind(LOG[1,-1], Ratio[i,-1])))
  sm = summary(lm(Lint_yield ~ get(Ratio[i,1]), temp.df))
  df1[i,5] = sm$coefficients[1]
  df1[i,6] = sm$coefficients[2]
  df1[i,7] = sm$r.squared
}

for (i in 1:(nrow(Ratio))){
  res <- rcorr(t(rbind(LOG[2,-1], Ratio[i,-1])))
  df2[i,1:4] <- flattenCorrMatrix(res$r, res$P)
  temp.df = data.frame(t(rbind(LOG[2,-1], Ratio[i,-1])))
  sm = summary(lm(Mic ~ get(Ratio[i,1]), temp.df))
  df2[i,5] = sm$coefficients[1]
  df2[i,6] = sm$coefficients[2]
  df2[i,7] = sm$r.squared
}

for (i in 1:(nrow(Ratio))){
  res <- rcorr(t(rbind(LOG[3,-1], Ratio[i,-1])))
  df3[i,1:4] <- flattenCorrMatrix(res$r, res$P)
  temp.df = data.frame(t(rbind(LOG[3,-1], Ratio[i,-1])))
  sm = summary(lm(UHM ~ get(Ratio[i,1]), temp.df))
  df3[i,5] = sm$coefficients[1]
  df3[i,6] = sm$coefficients[2]
  df3[i,7] = sm$r.squared
}

for (i in 1:(nrow(Ratio))){
  res <- rcorr(t(rbind(LOG[4,-1], Ratio[i,-1])))
  df4[i,1:4] <- flattenCorrMatrix(res$r, res$P)
  temp.df = data.frame(t(rbind(LOG[4,-1], Ratio[i,-1])))
  sm = summary(lm(UI ~ get(Ratio[i,1]), temp.df))
  df4[i,5] = sm$coefficients[1]
  df4[i,6] = sm$coefficients[2]
  df4[i,7] = sm$r.squared
}

for (i in 1:(nrow(Ratio))){
  res <- rcorr(t(rbind(LOG[5,-1], Ratio[i,-1])))
  df5[i,1:4] <- flattenCorrMatrix(res$r, res$P)
  temp.df = data.frame(t(rbind(LOG[5,-1], Ratio[i,-1])))
  sm = summary(lm(Str ~ get(Ratio[i,1]), temp.df))
  df5[i,5] = sm$coefficients[1]
  df5[i,6] = sm$coefficients[2]
  df5[i,7] = sm$r.squared
}

for (i in 1:(nrow(Ratio))){
  res <- rcorr(t(rbind(LOG[6,-1], Ratio[i,-1])))
  df6[i,1:4] <- flattenCorrMatrix(res$r, res$P)
  temp.df = data.frame(t(rbind(LOG[6,-1], Ratio[i,-1])))
  sm = summary(lm(Elo ~ get(Ratio[i,1]), temp.df))
  df6[i,5] = sm$coefficients[1]
  df6[i,6] = sm$coefficients[2]
  df6[i,7] = sm$r.squared
}

Calculate the correlation between gene expression and phenotype (vegetative indices)

### for vegetative indices
for (i in 1:(nrow(Ratio))){
  res <- rcorr(t(rbind(LOG[7,-1], Ratio[i,-1])))
  df7[i,1:4] <- flattenCorrMatrix(res$r, res$P)
  temp.df = data.frame(t(rbind(LOG[7,-1], Ratio[i,-1])))
  sm = summary(lm(NDVI ~ get(Ratio[i,1]), temp.df))
  df7[i,5] = sm$coefficients[1]
  df7[i,6] = sm$coefficients[2]
  df7[i,7] = sm$r.squared
}

for (i in 1:(nrow(Ratio))){
  res <- rcorr(t(rbind(LOG[8,-1], Ratio[i,-1])))
  df8[i,1:4] <- flattenCorrMatrix(res$r, res$P)
  temp.df = data.frame(t(rbind(LOG[8,-1], Ratio[i,-1])))
  sm = summary(lm(sPRI ~ get(Ratio[i,1]), temp.df))
  df8[i,5] = sm$coefficients[1]
  df8[i,6] = sm$coefficients[2]
  df8[i,7] = sm$r.squared
}


for (i in 1:(nrow(Ratio))){
  res <- rcorr(t(rbind(LOG[9,-1], Ratio[i,-1])))
  df9[i,1:4] <- flattenCorrMatrix(res$r, res$P)
  temp.df = data.frame(t(rbind(LOG[9,-1], Ratio[i,-1])))
  sm = summary(lm(CRI ~ get(Ratio[i,1]), temp.df))
  df9[i,5] = sm$coefficients[1]
  df9[i,6] = sm$coefficients[2]
  df9[i,7] = sm$r.squared
}

for (i in 1:(nrow(Ratio))){
  res <- rcorr(t(rbind(LOG[10,-1], Ratio[i,-1])))
  df10[i,1:4] <- flattenCorrMatrix(res$r, res$P)
  temp.df = data.frame(t(rbind(LOG[10,-1], Ratio[i,-1])))
  sm = summary(lm(WI.NDVI ~ get(Ratio[i,1]), temp.df))
  df10[i,5] = sm$coefficients[1]
  df10[i,6] = sm$coefficients[2]
  df10[i,7] = sm$r.squared
}

Filter and the correlation

### Combined all this datasets
combined <- bind_rows(df1, df2, df3, df4, df5, df6, df7, df8, df9, df10)
head(combine)

##                                                                        
## 1 function (...)                                                       
## 2 {                                                                    
## 3     lifecycle::deprecate_warn("1.0.0", "combine()", "vctrs::vec_c()")
## 4     args <- list2(...)                                               
## 5     if (length(args) == 1 && is.list(args[[1]])) {                   
## 6         args <- args[[1]]

combine_filter <- combined[combined$`P-value` < 0.05,]
combine_filter$Type <- "fill"
combine_filter[combine_filter$R < 0,8] = "N correlated"
combine_filter[combine_filter$R > 0,8] = "Pcorrelated"

combine_filter$ABS = abs(combine_filter$Effects)
head(combine_filter)

write.csv(combine_filter,"/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/7_LogFC_ratio/Combined_file.csv", row.names = F, quote = F)
write.csv(combined,"/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/7_LogFC_ratio/Combined-non-filter_file.csv", row.names = F, quote = F)

Statistics of correlation

combine_stats <- combine_filter %>% group_by(row, Type) %>% mutate(counts = n())
combine_stats <- unique(combine_stats[,c(1,8,10)])
combine_stats <- dcast(combine_stats, row~Type,sum)
combine_stats

write.csv(combine_stats,"/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/7_LogFC_ratio/Combined_stats.csv", row.names = F, quote = F)

Plot clustering heatmap of metaboltites assocaited with effects

### Load metabolties Z-score
Zscore <- read.csv("/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/11_Metabolties/Zscores_Met_2019_BLUEs_WW_and_WL_plot.csv", header = T)
sample_col <- read.csv("/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/11_Metabolties/sample_col.csv", header = T)
sample_row <- read.csv("/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/11_Metabolties/sample_row.csv", header = T)

rownames(sample_row) <- sample_row[,1]
rownames(sample_col) <- sample_col[,1]
rownames(Zscore) <- Zscore[,1]

head(Zscore)

head(sample_col)

head(sample_row)

Plot the metaboliets with Genotype variations

annotation_colors = list(
  treatment = c(WL="red", WW="royalblue"),
  Group = c(Wild="pink", Commercial="slateblue1", Elite="palegreen"))

Metablites_select <- Zscore[Zscore$Class == "Genotype",3:44]
head(Metablites_select)

pheatmap(Metablites_select,
         annotation_col = sample_col[,2:3], 
         annotation_row = sample_row[sample_row$Type == "Genotype",2:3], 
         color = inferno(30),
         annotation_colors = annotation_colors, 
         clustering_distance_r = "manhattan", 
         show_colnames =F,show_rownames = F,
         cluster_rows = T, cluster_cols = T,
         fontsize_row = 4, fontsize_col = 4, 
         angle_col = 90, border_color = NA)

Plot the metaboliets with Contion variations

annotation_colors = list(
  treatment = c(WL="red", WW="royalblue"),
  Group = c(Wild="pink", Commercial="slateblue1", Elite="palegreen"))

Metablites_select <- Zscore[Zscore$Class == "Env",3:44]
head(Metablites_select)

pheatmap(Metablites_select,
         annotation_col = sample_col[,2:3], 
         annotation_row = sample_row[sample_row$Type == "Env",2:3], 
         color = inferno(30),
         annotation_colors = annotation_colors, 
         clustering_distance_r = "manhattan", 
         show_colnames =F,show_rownames = F,
         cluster_rows = T, cluster_cols = T,
         fontsize_row = 4, fontsize_col = 4, 
         angle_col = 90, border_color = NA)

Plot the metaboliets with Interaction variations

annotation_colors = list(
  treatment = c(WL="red", WW="royalblue"),
  Group = c(Wild="pink", Commercial="slateblue1", Elite="palegreen"))

Metablites_select <- Zscore[Zscore$Class == "Interaction",3:44]
head(Metablites_select)

pheatmap(Metablites_select,
         annotation_col = sample_col[,2:3], 
         annotation_row = sample_row[sample_row$Type == "Interaction",2:3], 
         color = inferno(30),
         annotation_colors = annotation_colors, 
         clustering_distance_r = "manhattan", 
         show_colnames =F,show_rownames = F,
         cluster_rows = T, cluster_cols = T,
         fontsize_row = 4, fontsize_col = 4, 
         angle_col = 90, border_color = NA)

Pair-wise comparison of metaboltes ratio to phenotype ratio

Ratio <- read.csv("/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/7_LogFC_ratio/Metabolites_WWWL_ratio.csv", header = T)
row.names(Ratio) =Ratio$Metabolites
head(Ratio)

LOG <- read.csv("/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/7_LogFC_ratio/Ratio_LogFC.csv", header = T)
row.names(LOG) = LOG[,1]
head(LOG)

PM_combine: Phenotype-metabolites

###Define DF first for
df <- data.frame(matrix(ncol = 7 , nrow = 451))
colnames(df) <- c("row", "column", "R", "P-value", "Intercept", "Effects", "R.square")

### Loop analysis nested in metabolites
for (n in 1:nrow(LOG)){
  for (i in 1:(nrow(Ratio))){
    res <- rcorr(t(rbind(LOG[n,-1], Ratio[i,-1])))
    
    df[i,1:4] = flattenCorrMatrix(res$r, res$P)
    temp.df = data.frame(t(rbind(LOG[n,-1], Ratio[i,-1])))
    sm = summary(lm(get(LOG[n,1]) ~ get(Ratio[i,1]), temp.df))
    df[i,5] = sm$coefficients[1]
    df[i,6] = sm$coefficients[2]
    df[i,7] = sm$r.squared
  }
  assign(paste0("Metabolites_df",n), df)
}

merge_list <- lapply(ls(pattern = "Metabolites_df"), get)
PM_combine <- bind_rows(merge_list)
PM_combine_clean <- PM_combine[PM_combine$`P-value` < 0.05, ]
head(PM_combine_clean)

write.csv(PM_combine_clean, "/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/7_LogFC_ratio/Phenotype-metabolites.correlation-Feb15-2022.csv", quote = F)

Pair-wise comparison of gene expression to metabolites level

Ratio <- read.csv("/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/7_LogFC_ratio/LogFC_meta_table.csv", header = T)
row.names(Ratio) = Ratio$Gene
head(Ratio)

LOG <- read.csv("/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/7_LogFC_ratio/Metabolites_WWWL_ratio.csv", header = T)
row.names(LOG) =LOG$Metabolites
head(LOG)

GM_combine: Genotype-metabolites

###Define DF first for
df <- data.frame(matrix(ncol = 7 , nrow = 451))
colnames(df) <- c("row", "column", "R", "P-value", "Intercept", "Effects", "R.square")

### Loop analysis nested in metabolites
for (n in 1:nrow(LOG)){
  for (i in 1:(nrow(Ratio))){
    res <- rcorr(t(rbind(LOG[n,-1], Ratio[i,-1])))
    
    df[i,1:4] = flattenCorrMatrix(res$r, res$P)
    temp.df = data.frame(t(rbind(LOG[n,-1], Ratio[i,-1])))
    sm = summary(lm(get(LOG[n,1]) ~ get(Ratio[i,1]), temp.df))
    df[i,5] = sm$coefficients[1]
    df[i,6] = sm$coefficients[2]
    df[i,7] = sm$r.squared
  }
  assign(paste0("GM_df",n), df)
}

merge_list <- lapply(ls(pattern = "GM_df"), get)
GM_combine <- bind_rows(merge_list)
GM_combine_clean <- GM_combine[GM_combine$`P-value` < 0.05, ]
head(GM_combine_clean)

write.csv(GM_combine_clean, "/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/LogFC_ratio/Gene-metabolites.correlation-Feb15-2022.csv", quote = F)

Plot effects distribution

### PLot the density plot of 10 traits and 451 metabolites
combine_filter = read.csv("/Users/Leon/OneDrive - Cornell University/3_Manuscript/Manuscript8/7_LogFC_ratio/Plot_meta_Feb2022.csv", header = T)
combine_filter$abs.effect = abs(combine_filter$Effects)
head(combine_filter)

### define length function
fun_length <- function(x){
  return(data.frame(y=median(x),label= paste0("N=", length(x))))
}

### Compare effects across the three types of phenotype (density plot)
ggplot(data = combine_filter, aes(y=log2(abs.effect), fill = Type))+
  geom_density(alpha = 0.2) +
  facet_wrap(~ Trait, ncol=3) + theme_bw() +
  xlab("Log2 transformed Abosolute value of effects") +
  ylab("Density")

### Compare R square across thre types of phenotypes
ggplot(data = combine_filter, aes(x=R.square, color = Type))+
  facet_wrap(~ Trait, ncol=3) +
  geom_density(alpha = 0.5) + theme_bw() +
  xlab("R square of linear regression fit") +
  ylab("Density")

### Compare effects across the three types of phenotype (boxplot)
level_order = c("Metabolites", "Production", "Index")
ggplot(data = combine_filter, aes(x = factor(Trait, level = level_order), y = log2(abs.effect)))+
  geom_boxplot(alpha = 0.3, outlier.size = 0.02, fill = "grey") +
  theme_bw() +
  ylab("Log2 transformed value of absolute effect") +
  stat_summary(fun = mean, color = "darkred", position = position_dodge(1),
             geom = "point", shape = 4, size = 6,
             show.legend = F) + 
  stat_summary(fun.data = fun_length, geom = "text", vjust = -18, size = 3, color = "red") +
  stat_compare_means(method = "kruskal") +
  theme(legend.position="none")

Plot effects distribution in venn plot

venn_combine <- list(Meatbolites = combine_filter[combine_filter$Trait == "Metabolites",2] , 
                      Index = combine_filter[combine_filter$Trait == "Index",2],
                     Production = combine_filter[combine_filter$Trait == "Production",2]) 

ggVennDiagram(venn_combine, label_alpha = 2, set_color = "black", color = "red")