Phenotype GxE analysis pipeline (R)

—– Setup —–

# If needed, uncomment to install packages:
# pkgs <- c("tidyverse","janitor","GGally","factoextra","ggfortify","pheatmap",
#           "cluster","NbClust","randomForest","ranger","caret","e1071",
#           "lme4","lmerTest","broom.mixed","performance","patchwork")
#install.packages(setdiff(pkgs, rownames(installed.packages())), dependencies = TRUE)

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(GGally)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggfortify)
library(pheatmap)
library(cluster)
library(NbClust)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ranger)
## 
## Attaching package: 'ranger'
## 
## The following object is masked from 'package:randomForest':
## 
##     importance
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## 
## The following object is masked from 'package:lme4':
## 
##     lmer
## 
## The following object is masked from 'package:stats':
## 
##     step
library(broom.mixed)
library(performance)
library(patchwork)
raw <- read.csv("Trait.Absolute_MMJ.csv")
raw

let’s reshape it first too

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
raw2 <- dcast(Genotype + Temperature + CO2 + Condition ~ Trait, data = raw)
raw2

Coerce factors; trim white spaces

dat <- raw2 |>
  mutate(
    Genotype   = factor(str_squish(as.character(Genotype))),
    Condition  = factor(str_squish(as.character(Condition))),
    Temperature= factor(str_squish(as.character(Temperature))),
    CO2        = factor(str_squish(as.character(CO2)))
  )
dat

Identify trait columns = all numeric columns not in metadata

meta_cols <- c("Genotype","Condition","Temperature","CO2")
trait_cols <- names(dat)[sapply(dat, is.numeric) & !names(dat) %in% meta_cols]
dat

Composite environment factor

dat <- dat |>
  mutate(environment = paste(dat$Condition, ".T.",dat$Temperature,".CO2.", dat$CO2, sep = "")) |>
  mutate(environment = factor(environment))
dat

Make scaled trait matrix

X <- scale(dat[, trait_cols, drop = FALSE])
X_df <- as.data.frame(X)
X_df

—– Quick EDA —–

Pairwise plot for traits (keeps file small)

pdf("Correlation_CvsD.pdf", width= 10, height = 10)
ggpairs(dat, columns = 5:13, aes(color = Condition))
dev.off()
## quartz_off_screen 
##                 2
pdf("Correlation_CO2.pdf", width= 10, height = 10)
ggpairs(dat, columns = 5:13, aes(color = CO2))
dev.off()
## quartz_off_screen 
##                 2
pdf("Correlation_Temp.pdf", width= 10, height = 10)
ggpairs(dat, columns = 5:13, aes(color = Temperature))
dev.off()
## quartz_off_screen 
##                 2

PCA

pca <- prcomp(X_df, scale = TRUE)
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.0063 1.6813 1.0338 0.75574 0.53402 0.36070 0.27105
## Proportion of Variance 0.4473 0.3141 0.1188 0.06346 0.03169 0.01446 0.00816
## Cumulative Proportion  0.4473 0.7614 0.8801 0.94357 0.97526 0.98972 0.99788
##                            PC8     PC9
## Standard deviation     0.13662 0.02070
## Proportion of Variance 0.00207 0.00005
## Cumulative Proportion  0.99995 1.00000
pca
## Standard deviations (1, .., p=9):
## [1] 2.00631513 1.68133788 1.03383118 0.75574180 0.53402354 0.36070496 0.27105073
## [8] 0.13661537 0.02069903
## 
## Rotation (n x k) = (9 x 9):
##               PC1          PC2         PC3          PC4         PC5         PC6
## AriIdx -0.3005591 -0.179666192  0.57174937 -0.519507378  0.30275678  0.19022160
## ChlIdx  0.4304852  0.191361585  0.23329407 -0.105840306 -0.04685558 -0.76669375
## Fq.Fm   0.4598663  0.007295212  0.35596262 -0.003994782  0.02331185  0.20612269
## Fv.Fm   0.2696364 -0.413971205 -0.24980928  0.259419456  0.61196625  0.02678297
## GR     -0.2945924 -0.003241439  0.52641188  0.785434031 -0.04897422 -0.05926574
## NPQ    -0.3013932  0.458956854  0.07076547 -0.131106497  0.16070366 -0.15180967
## npq.t   0.2133399  0.507899956 -0.09298904  0.077490731 -0.23269077  0.49302599
## phiNO  -0.3738561 -0.323450000 -0.25952132 -0.058045657 -0.43181256 -0.21125264
## phiNPQ -0.2779586  0.431788440 -0.26729565  0.090143543  0.51211867 -0.13613095
##                PC7          PC8           PC9
## AriIdx  0.37952729 -0.090551762 -0.0226383013
## ChlIdx  0.34615026 -0.048597429  0.0154419724
## Fq.Fm  -0.30719929  0.131055623 -0.7121568787
## Fv.Fm   0.33579126  0.371778411 -0.0050928807
## GR      0.11517599 -0.003452863 -0.0006229628
## NPQ    -0.17383073  0.772417026  0.0246383465
## npq.t   0.61379926  0.087137209 -0.0176036854
## phiNO   0.32732487  0.199885073 -0.5538101559
## phiNPQ  0.04047378 -0.435725814 -0.4294562618
fviz_pca_biplot(pca, label = "var", habillage = dat$Temperature,
                addEllipses=TRUE, ellipse.level=0.95)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_pca_biplot(pca, label = "var", habillage = dat$CO2,
                addEllipses=TRUE, ellipse.level=0.95)

fviz_pca_biplot(pca, label = "var", habillage = dat$Condition,
                addEllipses=TRUE, ellipse.level=0.95)

fviz_pca_ind(pca, label = "var", habillage = dat$Temperature,
                addEllipses=TRUE, ellipse.level=0.95)

dat$Temp_and_CO2 <- paste("Temp.", dat$Temperature, ".CO2.", dat$CO2, sep="")

dat$environment
##  [1] Control.T.ambient.CO2.ambient Drought.T.ambient.CO2.ambient
##  [3] Control.T.ambient.CO2.high    Drought.T.ambient.CO2.high   
##  [5] Control.T.high.CO2.ambient    Drought.T.high.CO2.ambient   
##  [7] Control.T.high.CO2.high       Drought.T.high.CO2.high      
##  [9] Control.T.ambient.CO2.ambient Drought.T.ambient.CO2.ambient
## [11] Control.T.ambient.CO2.high    Drought.T.ambient.CO2.high   
## [13] Control.T.high.CO2.ambient    Drought.T.high.CO2.ambient   
## [15] Control.T.high.CO2.high       Drought.T.high.CO2.high      
## [17] Control.T.ambient.CO2.ambient Drought.T.ambient.CO2.ambient
## [19] Control.T.ambient.CO2.high    Drought.T.ambient.CO2.high   
## [21] Control.T.high.CO2.ambient    Drought.T.high.CO2.ambient   
## [23] Control.T.high.CO2.high       Drought.T.high.CO2.high      
## [25] Control.T.ambient.CO2.ambient Drought.T.ambient.CO2.ambient
## [27] Control.T.ambient.CO2.high    Drought.T.ambient.CO2.high   
## [29] Control.T.high.CO2.ambient    Drought.T.high.CO2.ambient   
## [31] Control.T.high.CO2.high       Drought.T.high.CO2.high      
## [33] Control.T.ambient.CO2.ambient Drought.T.ambient.CO2.ambient
## [35] Control.T.ambient.CO2.high    Drought.T.ambient.CO2.high   
## [37] Control.T.high.CO2.ambient    Drought.T.high.CO2.ambient   
## [39] Control.T.high.CO2.high       Drought.T.high.CO2.high      
## [41] Control.T.ambient.CO2.ambient Drought.T.ambient.CO2.ambient
## [43] Control.T.ambient.CO2.high    Drought.T.ambient.CO2.high   
## [45] Control.T.high.CO2.ambient    Drought.T.high.CO2.ambient   
## [47] Control.T.high.CO2.high       Drought.T.high.CO2.high      
## 8 Levels: Control.T.ambient.CO2.ambient ... Drought.T.high.CO2.high
fviz_pca_ind(pca, label = "var", habillage = dat$Temp_and_CO2,
                addEllipses=TRUE, ellipse.level=0.95)

fviz_pca_ind(pca, label = "var", habillage = dat$environment,
                addEllipses=TRUE, ellipse.level=0.8)

pdf("PCA_all_env.pdf")
fviz_pca_ind(pca, label = "var", habillage = dat$environment,
                addEllipses=TRUE, ellipse.level=0.8)
dev.off()
## quartz_off_screen 
##                 2
pdf("PCA_Temp_and_CO2.pdf")
fviz_pca_ind(pca, label = "var", habillage = dat$Temp_and_CO2,
                addEllipses=TRUE, ellipse.level=0.8)
dev.off()
## quartz_off_screen 
##                 2
pdf("PCA_CO2.pdf")
fviz_pca_biplot(pca, label = "var", habillage = dat$CO2,
                addEllipses=TRUE, ellipse.level=0.8)
dev.off()
## quartz_off_screen 
##                 2
pdf("PCA_Temp.pdf")
fviz_pca_biplot(pca, label = "var", habillage = dat$Temperature,
                addEllipses=TRUE, ellipse.level=0.8)
dev.off()
## quartz_off_screen 
##                 2
pdf("PCA_Drought.pdf")
fviz_pca_biplot(pca, label = "var", habillage = dat$Condition,
                addEllipses=TRUE, ellipse.level=0.8)
dev.off()
## quartz_off_screen 
##                 2
fviz_pca_var(pca, col.var="steelblue")+
 theme_minimal()
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_pca_var(pca, col.var="contrib")+
 scale_color_gradient2(low="white",
           high="red", midpoint=8) +
 theme_minimal()

# ANOVA

anova_res <- aov(as.matrix(X_df$AriIdx) ~ Condition * Temperature * CO2, data = dat)
summary(anova_res)
##                           Df Sum Sq Mean Sq F value   Pr(>F)    
## Condition                  1  0.076   0.076   0.601    0.443    
## Temperature                1 13.594  13.594 107.490 6.75e-13 ***
## CO2                        1 19.844  19.844 156.906 2.00e-15 ***
## Condition:Temperature      1  0.027   0.027   0.216    0.645    
## Condition:CO2              1  0.016   0.016   0.128    0.722    
## Temperature:CO2            1  8.330   8.330  65.869 5.51e-10 ***
## Condition:Temperature:CO2  1  0.053   0.053   0.420    0.521    
## Residuals                 40  5.059   0.126                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_res <- aov(as.matrix(X_df$ChlIdx) ~ Condition * Temperature * CO2, data = dat)
summary(anova_res)
##                           Df Sum Sq Mean Sq F value   Pr(>F)    
## Condition                  1   0.45    0.45   4.660   0.0369 *  
## Temperature                1  40.32   40.32 418.869  < 2e-16 ***
## CO2                        1   0.37    0.37   3.807   0.0581 .  
## Condition:Temperature      1   0.01    0.01   0.113   0.7385    
## Condition:CO2              1   0.10    0.10   1.041   0.3137    
## Temperature:CO2            1   1.90    1.90  19.744 6.84e-05 ***
## Condition:Temperature:CO2  1   0.00    0.00   0.041   0.8401    
## Residuals                 40   3.85    0.10                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_res <- aov(as.matrix(X_df$Fq.Fm) ~ Condition * Temperature * CO2, data = dat)
summary(anova_res)
##                           Df Sum Sq Mean Sq F value   Pr(>F)    
## Condition                  1   0.00    0.00   0.006  0.93623    
## Temperature                1  41.61   41.61 735.173  < 2e-16 ***
## CO2                        1   0.52    0.52   9.108  0.00441 ** 
## Condition:Temperature      1   0.09    0.09   1.672  0.20346    
## Condition:CO2              1   0.45    0.45   7.882  0.00768 ** 
## Temperature:CO2            1   1.66    1.66  29.296 3.15e-06 ***
## Condition:Temperature:CO2  1   0.41    0.41   7.286  0.01013 *  
## Residuals                 40   2.26    0.06                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_res <- aov(as.matrix(X_df$Fv.Fm) ~ Condition * Temperature * CO2, data = dat)
summary(anova_res)
##                           Df Sum Sq Mean Sq F value   Pr(>F)    
## Condition                  1  0.019   0.019   0.322    0.573    
## Temperature                1  5.933   5.933  99.861 1.97e-12 ***
## CO2                        1 17.094  17.094 287.688  < 2e-16 ***
## Condition:Temperature      1  0.000   0.000   0.003    0.959    
## Condition:CO2              1  0.020   0.020   0.331    0.568    
## Temperature:CO2            1 21.520  21.520 362.179  < 2e-16 ***
## Condition:Temperature:CO2  1  0.038   0.038   0.634    0.431    
## Residuals                 40  2.377   0.059                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_res <- aov(as.matrix(X_df$GR) ~ Condition * Temperature * CO2, data = dat)
summary(anova_res)
##                           Df Sum Sq Mean Sq F value   Pr(>F)    
## Condition                  1  5.648   5.648   9.165 0.004303 ** 
## Temperature                1  8.380   8.380  13.596 0.000673 ***
## CO2                        1  5.270   5.270   8.551 0.005663 ** 
## Condition:Temperature      1  0.762   0.762   1.237 0.272703    
## Condition:CO2              1  2.119   2.119   3.438 0.071081 .  
## Temperature:CO2            1  0.001   0.001   0.002 0.969196    
## Condition:Temperature:CO2  1  0.167   0.167   0.270 0.605997    
## Residuals                 40 24.653   0.616                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_res <- aov(as.matrix(X_df$NPQ) ~ Condition * Temperature * CO2, data = dat)
summary(anova_res)
##                           Df Sum Sq Mean Sq F value   Pr(>F)    
## Condition                  1  0.095   0.095   0.520    0.475    
## Temperature                1  9.936   9.936  54.144 5.97e-09 ***
## CO2                        1  7.323   7.323  39.906 1.69e-07 ***
## Condition:Temperature      1  0.397   0.397   2.166    0.149    
## Condition:CO2              1  0.003   0.003   0.015    0.902    
## Temperature:CO2            1 21.578  21.578 117.581 1.78e-13 ***
## Condition:Temperature:CO2  1  0.326   0.326   1.777    0.190    
## Residuals                 40  7.341   0.184                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_res <- aov(as.matrix(X_df$npq.t) ~ Condition * Temperature * CO2, data = dat)
summary(anova_res)
##                           Df Sum Sq Mean Sq F value   Pr(>F)    
## Condition                  1  0.000   0.000   0.000    0.998    
## Temperature                1 13.251  13.251  45.815 3.94e-08 ***
## CO2                        1  0.019   0.019   0.066    0.799    
## Condition:Temperature      1  0.058   0.058   0.202    0.655    
## Condition:CO2              1  0.002   0.002   0.006    0.940    
## Temperature:CO2            1 22.095  22.095  76.390 8.08e-11 ***
## Condition:Temperature:CO2  1  0.005   0.005   0.018    0.893    
## Residuals                 40 11.569   0.289                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_res <- aov(as.matrix(X_df$phiNO) ~ Condition * Temperature * CO2, data = dat)
summary(anova_res)
##                           Df Sum Sq Mean Sq F value   Pr(>F)    
## Condition                  1  0.145   0.145   0.605   0.4413    
## Temperature                1 31.515  31.515 131.542 3.21e-14 ***
## CO2                        1  0.725   0.725   3.024   0.0897 .  
## Condition:Temperature      1  0.991   0.991   4.135   0.0487 *  
## Condition:CO2              1  0.386   0.386   1.610   0.2119    
## Temperature:CO2            1  2.866   2.866  11.961   0.0013 ** 
## Condition:Temperature:CO2  1  0.790   0.790   3.297   0.0769 .  
## Residuals                 40  9.583   0.240                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_res <- aov(as.matrix(X_df$phiNPQ) ~ Condition * Temperature * CO2, data = dat)
summary(anova_res)
##                           Df Sum Sq Mean Sq F value   Pr(>F)    
## Condition                  1  0.309   0.309   0.970    0.331    
## Temperature                1 11.569  11.569  36.326 4.33e-07 ***
## CO2                        1  0.017   0.017   0.053    0.819    
## Condition:Temperature      1  0.724   0.724   2.273    0.139    
## Condition:CO2              1  0.103   0.103   0.325    0.572    
## Temperature:CO2            1 21.507  21.507  67.534 4.01e-10 ***
## Condition:Temperature:CO2  1  0.032   0.032   0.102    0.751    
## Residuals                 40 12.739   0.318                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

MANOVA

dat
manova_res <- manova(as.matrix(X_df) ~ Condition * Temperature * CO2, data = dat)
summary(manova_res)
##                           Df  Pillai approx F num Df den Df  Pr(>F)    
## Condition                  1 0.30681     1.57      9     32 0.16549    
## Temperature                1 0.99131   405.55      9     32 < 2e-16 ***
## CO2                        1 0.96805   107.71      9     32 < 2e-16 ***
## Condition:Temperature      1 0.15800     0.67      9     32 0.73178    
## Condition:CO2              1 0.33556     1.80      9     32 0.10793    
## Temperature:CO2            1 0.96053    86.52      9     32 < 2e-16 ***
## Condition:Temperature:CO2  1 0.34761     1.89      9     32 0.08902 .  
## Residuals                 40                                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Hierarchival clustering:

dist_matrix <- dist(X_df)
cluster <- hclust(dist_matrix)
plot(cluster)

Random Forest

data <- cbind(dat$Condition, X_df)
colnames(data)[1] <- "Condition"
data
rf <- randomForest(Condition ~ ., data, importance = TRUE)
varImpPlot(rf)

Genotype specific responses:

for (g in unique(dat$Genotype)) {
  cat("\n### PCA for genotype:", g, "\n")
  pca_g <- prcomp(subset(X_df, dat$Genotype == g), scale = TRUE)
  print(fviz_pca_biplot(pca_g))
}
## 
## ### PCA for genotype: Australia

## 
## ### PCA for genotype: Mexico

## 
## ### PCA for genotype: Pakistan

## 
## ### PCA for genotype: S.Africa

## 
## ### PCA for genotype: Texas

## 
## ### PCA for genotype: Vietnam