This is based on the script shared by Bernhard Schmid and contained
also this repo (analysis/TianDiMay23_BS.R
).
Read full species-level data
df <- read_csv("~/data/LeafNP_tiandi/Global_total_leaf_N_P_Di/leafnp_data_covariates_20210702.csv") %>%
mutate(grass = tree_shrub_Herb == "H")
## Rows: 36414 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): FunGroups, Dc_Db_Ec_Eb_Hf_Hg, tree_shrub_Herb, Family_New, Family,...
## dbl (56): lon, lat, leafN, leafP, LeafNP, Lat_Di_check_final, Lon_Di_check_f...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Define predictor sets.
trgts <- c("leafN", "leafP", "LeafNP")
## predictors excluding PHO, and TS (too many missing)
preds <- c("elv", "mat", "matgs", "tmonthmin", "tmonthmax", "ndaysgs", "mai", "maigs", "map", "pmonthmin", "mapgs", "mavgs", "mav", "alpha", "vcmax25", "jmax25", "gs_accl", "aet", "ai", "cwdx80", "gti", "ndep", "co2", "T_BULK_DENSITY", "AWC_CLASS", "T_CLAY", "T_SILT", "T_SAND", "T_GRAVEL", "T_PH_H2O", "T_TEB", "T_BS", "T_CEC_SOIL", "T_CEC_CLAY", "T_ECE", "T_ESP", "T_CACO3", "T_OC", "ORGC", "TOTN", "CNrt", "ALSA", "PBR", "TP", "TK")
Filter to use only data from species that were recorded in at least five different sites.
# yields 398 species
use_species <- df %>%
dplyr::select(sitename, Species) %>%
distinct() %>%
group_by(Species) %>%
summarise(n = n()) %>%
dplyr::filter(n >= 5) %>%
pull(Species)
Additionally, analogous to how it’s done for the trait gradient analysis, use data only for sites where at least M species are present.
use_sites <- df %>%
dplyr::select(sitename, Species) %>%
distinct() %>%
group_by(sitename) %>%
summarise(n = n()) %>%
dplyr::filter(n >= 5) %>%
pull(sitename)
Apply filtering.
df <- df %>%
dplyr::filter(Species %in% use_species & sitename %in% use_sites)
Prepare date for linear mixed effects modelling. Do data pre-processing as Di Tian implemented it. Transformations are required, but not for Random Forest modelling. Therefore, define separate data objects for the two model types.
Make factors and numeric. No capping to positive numbers done here (as opposed to Di Tian’s).
df_lmm <- df %>%
# drop_na() %>%
rename(
SITE = sitename,
SP = Species,
FG = FunGroups,
TG = tree_shrub_Herb,
FGID = Dc_Db_Ec_Eb_Hf_Hg,
FA = Family,
GE = Genus,
ID = id
) %>%
mutate(across(all_of(c("SITE", "FG", "FGID", "TG", "FA", "GE", "SP", "ID")), factor)) %>%
mutate(across(all_of(preds), as.numeric))
# lmm2 <- lme4::lmer(
# log(leafN) ~ SP + ALSA + mav + elv + co2 + tmonthmin + mai + ndep +
# (1|SITE),
# data = df_lmm |>
# drop_na()
# )
lmm2 <- lm(
terms(log(leafN) ~ SP + (ALSA + mav + elv + co2 + tmonthmin + mai + ndep),
keep.order = TRUE),
data = df_lmm |>
drop_na())
df_ss2 <- as_tibble(anova(lmm2), rownames = "Predictor") |>
select(Predictor, `Species first`=`Sum Sq`)
df_ss2_sum <- df_ss2 |>
summarise(`Species first` = sum(`Species first`))
df_ss2 <- df_ss2 |>
mutate(`Species first (%)` = `Species first`/df_ss2_sum$`Species first`)
# lmm3 <- lm(
# log(leafN) ~ ALSA + mav + elv + co2 + tmonthmin + mai + ndep + SP,
# data = df_lmm |>
# drop_na()
# )
lmm3 <- lm(
terms(log(leafN) ~ (ALSA + mav + elv + co2 + tmonthmin + mai + ndep) + SP,
keep.order = TRUE),
data = df_lmm |>
drop_na())
df_ss3 <- as_tibble(anova(lmm3), rownames = "Predictor") |>
select(Predictor, `Species last`=`Sum Sq`)
df_ss3_sum <- df_ss3 |>
summarise(`Species last` = sum(`Species last`))
df_ss3 <- df_ss3 |>
mutate(`Species last (%)` = `Species last`/df_ss3_sum$`Species last`)
lmm4 <- lme4::lmer(
log(leafN) ~ ALSA + mav + elv + co2 + tmonthmin + mai + ndep +
(1|SP),
data = df_lmm |>
drop_na()
)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
# lmm4 <- lme(
#
# log(leafN) ~ ALSA + mav + elv + co2 + tmonthmin + mai + ndep,
# random = ~1|SP,
# data = df_lmm |>
# drop_na()
#
# )
df_ss4 <- as_tibble(anova(lmm4), rownames = "Predictor") |>
select(Predictor, `Species as random`=`Sum Sq`)
df_ss4_sum <- df_ss4 |>
summarise(`Species as random` = sum(`Species as random`))
df_ss4 <- df_ss4 |>
mutate(`Species as random (%)` = `Species as random`/df_ss4_sum$`Species as random`)
Sum of squares for each predictor in the three different models.
df_ss2 |>
left_join(
df_ss3,
by = "Predictor"
) |>
left_join(
df_ss4,
by = "Predictor"
) |>
mutate(
`Species first` = format(`Species first`, digits = 2),
`Species last` = format(`Species last`, digits = 2),
`Species as random` = format(`Species as random`, digits = 2)
) |>
select(c(1,2,4,6)) |>
knitr::kable()
Predictor | Species first | Species last | Species as random |
---|---|---|---|
SP | 164.069 | 132.89 | NA |
ALSA | 0.792 | 0.13 | 0.519 |
mav | 1.240 | 1.48 | 0.637 |
elv | 0.012 | 0.81 | 0.072 |
co2 | 1.725 | 17.80 | 2.212 |
tmonthmin | 0.075 | 14.52 | 2.116 |
mai | 1.557 | 0.34 | 1.622 |
ndep | 0.018 | 1.53 | 0.014 |
Residuals | 68.665 | 68.67 | NA |