knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(broom)
library(rbeni)
##
## Attaching package: 'rbeni'
## The following object is masked _by_ '.GlobalEnv':
##
## add_alpha
do_eval <- FALSE
This is to investigate main points using own analysis and code.
Read all files.
vec_files <- list.files("~/data/pep/PEP725_2020-12/", pattern = "pep725_2020-12_", full.names = TRUE)
list_df <- map(as.list(vec_files), ~read.delim(., sep = ";"))
names(list_df) <- vec_files
df <- list_df %>%
bind_rows(.id = "filnam") %>%
as_tibble() %>%
mutate(date = ymd(date))
Zani et al.:
Phenology definitions followed the BBCH (Biologische Bundesanstalt, Bundessortenamt und Chemische Industrie) codes (see table S1). For the five deciduous angiosperms, spring leaf-out was defined as the date when the first or 50% of leaf stalks are visible (BBCH11 or BBCH13, respectively), for Larix decidua leaf-out was defined as the date when the first leaves separated (mouse ear stage = BBCH10). Leaf senescence was defined as the date when 50% of leaves had lost their green color (BBCH94) or had fallen (BCCH95).
BBCH94 is not available in this dataset. Using BCCH205 instead (autumnal colouration >=50%)
This information is given in column phase_id
.
df_sub <- df %>%
filter(phase_id %in% c(10, 11, 13, 95, 205)) %>%
mutate(id = paste0("i", 1:n())) %>%
mutate(pheno = ifelse(phase_id %in% c(10, 11, 13), "on", ifelse(phase_id %in% c(95, 205), "off", NA))) %>%
mutate(date = ymd(date)) %>%
mutate(lon = round(lon, digits = 4), lat = round(lat, digits = 4)) %>%
na_if(-9999) %>%
mutate(elv = ifelse(is.na(alt), alt_dem, alt))
Determine unique “sites”.
df_sites <- df_sub %>%
select(lon, lat, elv) %>%
distinct() %>%
mutate(id_site = paste0("site_i", 1:n()))
df_sub <- df_sub %>%
left_join(df_sites, by = c("lon", "lat", "elv"))
Determine unique species x “sites”.
df_species_sites <- df_sub %>%
select(lon, lat, elv, species) %>%
distinct() %>%
mutate(id_species_site = paste0("species_site_i", 1:n()))
df_sub <- df_sub %>%
left_join(df_species_sites, by = c("lon", "lat", "elv", "species"))
Determine unique species x “sites” x pheno-event.
df_pheno_species_sites <- df_sub %>%
select(lon, lat, elv, species, pheno) %>%
distinct() %>%
mutate(id_pheno_species_site = paste0("pheno_species_site_i", 1:n()))
df_sub <- df_sub %>%
left_join(df_pheno_species_sites, by = c("lon", "lat", "elv", "species", "pheno"))
Remove individual time series with less than 15 years of leaf-out and leaf senescence observations,
n_years <- function(vec){length(unique(vec))}
df_retain <- df_sub %>%
group_by(id_species_site) %>%
summarise(n_years = n_years(year)) %>%
filter(n_years >= 15)
df_sub <- df_sub %>%
filter(id_species_site %in% unique(df_retain$id_species_site))
Remove dates deviating from an individual’s median more than 3 times the median absolute deviation.
medabsdev <- function(vec){abs(vec - median(vec)) / median(abs(vec - median(vec)))}
df_retain <- df_sub %>%
group_by(id_pheno_species_site, pheno) %>%
summarise(medabsdev = medabsdev(day)) %>%
filter(medabsdev < 3)
df_sub <- df_sub %>%
filter(id_pheno_species_site %in% unique(df_retain$id_pheno_species_site))
Remove time series for which the standard deviation of phenological observations across years was higher than 15 for leaf-out and 20 for leaf senescence.
df_retain <- df_sub %>%
group_by(id_pheno_species_site, pheno) %>%
summarise(sd = sd(day)) %>%
filter((sd <= 15.0 & pheno == "on") | (sd <= 20.0 & pheno == "off"))
df_sub <- df_sub %>%
filter(id_pheno_species_site %in% unique(df_retain$id_pheno_species_site))
Save intermediate file.
write_csv(df_sub, path = "data/df_sub.csv")
Read if not yet available.
df_sub <- read_csv("data/df_sub.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## filnam = col_character(),
## genus = col_character(),
## species = col_character(),
## subspecies = col_logical(),
## date = col_date(format = ""),
## id = col_character(),
## pheno = col_character(),
## id_site = col_character(),
## id_species_site = col_character(),
## id_pheno_species_site = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 6172 parsing failures.
## row col expected actual file
## 34837 subspecies 1/0/T/F/TRUE/FALSE Prunus domestica Besztercei 'data/df_sub.csv'
## 35790 subspecies 1/0/T/F/TRUE/FALSE Solanum tuberosum Desire 'data/df_sub.csv'
## 36354 subspecies 1/0/T/F/TRUE/FALSE Solanum tuberosum Desire 'data/df_sub.csv'
## 36355 subspecies 1/0/T/F/TRUE/FALSE Prunus avium Germersdorfer 'data/df_sub.csv'
## 37847 subspecies 1/0/T/F/TRUE/FALSE Vitis vinifera CHASSELAS BLANC 'data/df_sub.csv'
## ..... .......... .................. .............................. .................
## See problems(...) for more details.
## this creates duplicates - problem!!!
df_anl <- df_sub %>%
select(id_species_site, pheno, day, year) %>%
pivot_wider(names_from = "pheno", values_from = "day")
## solution, but reduces from 2.6 to 2.2 mio obs.
df_anl <- df_sub %>%
distinct(id_pheno_species_site, year, .keep_all = TRUE) %>%
select(id_site, id_species_site, pheno, day, year) %>%
pivot_wider(names_from = "pheno", values_from = "day") %>%
filter(!is.na(on) & !is.na(off))
write_csv(df_anl, path = "data/df_anl.csv")
Laura: For the mixed effects modelling, you may best work with df_anl
(I think).
Coverage of years in the data.
df_anl <- read_csv("data/df_anl.csv")
## Parsed with column specification:
## cols(
## id_site = col_character(),
## id_species_site = col_character(),
## year = col_double(),
## on = col_double(),
## off = col_double()
## )
df_anl %>%
ggplot(aes(year, ..count..)) +
geom_histogram() +
labs(title = "Number of data points per year", subtitle = "By sites and species, providing both EOS and SOS")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Get correlation between on and off for withing species-sites => interannual temporal variation.
df_anl <- read_csv("data/df_anl.csv")
get_coef_on <- function(df){df %>% filter(term == "on") %>% pull(estimate)}
get_p_on <- function(df){df %>% filter(term == "on") %>% pull(p.value)}
df_temporal <- df_anl %>%
# ungroup() %>%
# slice(1:10) %>%
group_by(id_species_site) %>%
nest() %>%
mutate(linmod = purrr::map(data, ~lm(off ~ on, data = .))) %>%
mutate(summ = purrr::map(linmod, ~summary(.))) %>%
mutate(df_coef = purrr::map(linmod, ~tidy(.))) %>%
mutate(coef_on = purrr::map_dbl(df_coef, ~get_coef_on(.))) %>%
mutate(p_value_on = purrr::map_dbl(df_coef, ~get_p_on(.))) %>%
select(-linmod) %>%
mutate(rsq = purrr::map_dbl(summ, "r.squared"),
adj_rsq = purrr::map_dbl(summ, "adj.r.squared")) %>%
select(-summ)
save(df_temporal, file = "data/df_temporal.RData")
Get temporal trends for on and off for each site and species.
get_coef_year <- function(df){df %>% filter(term == "year") %>% pull(estimate)}
get_p_year <- function(df){df %>% filter(term == "year") %>% pull(p.value)}
df_temporal <- df_temporal %>%
## SOS
mutate(linmod = purrr::map(data, ~lm(on ~ year, data = .))) %>%
mutate(df_coef = purrr::map(linmod, ~tidy(.))) %>%
mutate(coef_year_on = purrr::map_dbl(df_coef, ~get_coef_year(.))) %>%
mutate(p_value_year_on = purrr::map_dbl(df_coef, ~get_p_year(.))) %>%
select(-linmod, -df_coef) %>%
## EOS
mutate(linmod = purrr::map(data, ~lm(off ~ year, data = .))) %>%
mutate(df_coef = purrr::map(linmod, ~tidy(.))) %>%
mutate(coef_year_off = purrr::map_dbl(df_coef, ~get_coef_year(.))) %>%
mutate(p_value_year_off = purrr::map_dbl(df_coef, ~get_p_year(.))) %>%
select(-linmod, -df_coef)
save(df_temporal, file = "data/df_temporal.RData")
load("data/df_temporal.RData")
df_temporal %>%
# filter(p_value_on < 0.05) %>%
ggplot(aes(coef_on, ..density..)) +
geom_histogram() +
xlim(-2.5, 2.5) +
geom_vline(xintercept = 0, linetype = "dotted") +
geom_vline(xintercept = median(df_temporal$coef_on, na.rm = TRUE), color = "red", linetype = "dashed") +
# geom_vline(xintercept = mean(df_temporal$coef_on, na.rm = TRUE), color = "red") +
labs(title = "Slope of EOS ~ SOS", subtitle = paste(nrow(df_temporal), "sites and species"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 92 rows containing non-finite values (stat_bin).
df_temporal %>%
# filter(p_value_on < 0.05) %>%
ggplot(aes(coef_year_on, ..density..)) +
geom_histogram() +
xlim(-2.5, 2.5) +
geom_vline(xintercept = 0, linetype = "dotted") +
geom_vline(xintercept = median(df_temporal$coef_year_on, na.rm = TRUE), color = "red", linetype = "dashed") +
# geom_vline(xintercept = mean(df_temporal$coef_year_on, na.rm = TRUE), color = "red") +
labs(title = "Temporal trend of SOS", subtitle = paste(nrow(df_temporal), "sites and species"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 305 rows containing non-finite values (stat_bin).
df_temporal %>%
# filter(p_value_on < 0.05) %>%
ggplot(aes(coef_year_off, ..density..)) +
geom_histogram() +
xlim(-2.5, 2.5) +
geom_vline(xintercept = 0, linetype = "dotted") +
geom_vline(xintercept = median(df_temporal$coef_year_off, na.rm = TRUE), color = "red", linetype = "dashed") +
# geom_vline(xintercept = mean(df_temporal$coef_year_off, na.rm = TRUE), color = "red") +
labs(title = "Temporal trend of EOS", subtitle = paste(nrow(df_temporal), "sites and species"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 391 rows containing non-finite values (stat_bin).
out <- df_temporal %>%
analyse_modobs2("coef_year_on", "coef_year_off")
## Loading required package: LSD
##
## Attaching package: 'LSD'
## The following objects are masked from 'package:rbeni':
##
## colorpalette, complementarycolor, convertcolor, convertgrey,
## daltonize, disco, distinctcolors, heatpairs, heatscatter,
## heatscatterpoints
## Loading required package: ggthemes
## Loading required package: RColorBrewer
out$gg +
labs(title = "Trend in EOS vs. trend in SOS")
## `geom_smooth()` using formula 'y ~ x'
Get correlation between on and off for withing species-sites => long-term temporal variation (5 bins).
df_temporal_longterm <- df_anl %>%
# mutate(yearbin = ntile(year, n = 5)) %>%
mutate(yearbin = cut(year, breaks = 5, labels = FALSE)) %>%
# group_by(id_species_site, yearbin) %>%
group_by(id_site, yearbin) %>%
summarise(on = mean(on, na.rm = TRUE), off = mean(off, na.rm = TRUE)) %>%
ungroup() %>%
# group_by(id_species_site) %>%
group_by(id_site) %>%
nest() %>%
mutate(linmod = purrr::map(data, ~lm(off ~ on, data = .))) %>%
mutate(summ = purrr::map(linmod, ~summary(.))) %>%
mutate(df_coef = purrr::map(linmod, ~tidy(.))) %>%
mutate(coef_on = purrr::map_dbl(df_coef, ~get_coef_on(.))) %>%
mutate(p_value_on = purrr::map_dbl(df_coef, ~get_p_on(.))) %>%
select(-linmod) %>%
mutate(rsq = purrr::map_dbl(summ, "r.squared"),
adj_rsq = purrr::map_dbl(summ, "adj.r.squared")) %>%
select(-summ)
save(df_temporal_longterm, file = "data/df_temporal_longterm.RData")
## years in bins overview
df_anl %>%
# mutate(yearbin = ntile(year, n = 5)) %>%
mutate(yearbin = cut(year, breaks = 5, labels = FALSE)) %>%
select(year, yearbin) %>%
ggplot(aes(year, yearbin)) +
geom_point()
load("data/df_temporal_longterm.RData")
df_temporal_longterm %>%
# filter(p_value_on < 0.05) %>%
ggplot(aes(coef_on, ..density..)) +
geom_histogram() +
xlim(-20,20) +
geom_vline(xintercept = 0, linetype = "dotted") +
geom_vline(xintercept = median(df_temporal_longterm$coef_on, na.rm = TRUE), color = "red", linetype = "dashed")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 666 rows containing non-finite values (stat_bin).
# geom_vline(xintercept = mean(df_temporal_longterm$coef_on, na.rm = TRUE), color = "red")
Get correlation between on and off for withing species-sites => long-term temporal variation (30 bins).
df_spatial <- df_anl %>%
group_by(id_site) %>%
summarise(on = mean(on, na.rm = TRUE), off = mean(off, na.rm = TRUE))
save(df_spatial, file = "data/df_spatial.RData")
load("data/df_spatial.RData")
out_modobs <- df_spatial %>% analyse_modobs2("on", "off", type = "heat")
out_modobs$gg
## `geom_smooth()` using formula 'y ~ x'
Get correlation between on and off for withing species-sites => long-term temporal variation (30 bins).
df_spatial_species <- df_anl %>%
left_join(select(df_sub, id_species_site, species), by = c("id_species_site")) %>%
group_by(id_site, species) %>%
summarise(on = mean(on, na.rm = TRUE), off = mean(off, na.rm = TRUE)) %>%
ungroup() %>%
group_by(species) %>%
nest() %>%
mutate(linmod = purrr::map(data, ~lm(off ~ on, data = .))) %>%
mutate(summ = purrr::map(linmod, ~summary(.))) %>%
mutate(df_coef = purrr::map(linmod, ~tidy(.))) %>%
mutate(coef_on = purrr::map_dbl(df_coef, ~get_coef_on(.))) %>%
mutate(p_value_on = purrr::map_dbl(df_coef, ~get_p_on(.))) %>%
select(-linmod) %>%
mutate(rsq = purrr::map_dbl(summ, "r.squared"),
adj_rsq = purrr::map_dbl(summ, "adj.r.squared")) %>%
select(-summ)
save(df_spatial_species, file = "data/df_spatial_species.RData")
load("data/df_spatial_species.RData")
df_spatial_species %>%
# filter(p_value_on < 0.05) %>%
ggplot(aes(coef_on, ..density..)) +
geom_histogram(binwidth = 0.5) +
xlim(-2.5, 2.5) +
geom_vline(xintercept = 0, linetype = "dotted") +
# geom_vline(xintercept = mean(df_spatial_species$coef_on, na.rm = TRUE), color = "springgreen3") +
geom_vline(xintercept = median(df_spatial_species$coef_on, na.rm = TRUE), color = "springgreen3", linetype = "dashed")
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
Dependence of mean slope vs. aggregation time scale (number of bins). With Increasing aggregation (from right to left) reversal from a positive to a negative slope. Median slopes are plotted.
bins <- c(25, 15, 10, 7, 6, 5, 4, 3, 2)
list_temporal_longterm <- list()
list_median_coef <- list()
list_gg <- list()
for (ibreaks in bins){
print(paste0("breaks_", ibreaks))
list_temporal_longterm[[paste0("breaks_", ibreaks)]] <- df_anl %>%
# mutate(yearbin = ntile(year, n = ibreaks)) %>%
mutate(yearbin = cut(year, breaks = ibreaks, labels = FALSE)) %>%
# group_by(id_species_site, yearbin) %>%
group_by(id_site, yearbin) %>%
summarise(on = mean(on, na.rm = TRUE), off = mean(off, na.rm = TRUE)) %>%
ungroup() %>%
# group_by(id_species_site) %>%
group_by(id_site) %>%
nest() %>%
mutate(linmod = purrr::map(data, ~lm(off ~ on, data = .))) %>%
mutate(summ = purrr::map(linmod, ~summary(.))) %>%
mutate(df_coef = purrr::map(linmod, ~tidy(.))) %>%
mutate(coef_on = purrr::map_dbl(df_coef, ~get_coef_on(.))) %>%
mutate(p_value_on = purrr::map_dbl(df_coef, ~get_p_on(.))) %>%
select(-linmod) %>%
mutate(rsq = purrr::map_dbl(summ, "r.squared"),
adj_rsq = purrr::map_dbl(summ, "adj.r.squared")) %>%
select(-summ)
list_median_coef[[paste0("breaks_", ibreaks)]] <- median(list_temporal_longterm[[paste0("breaks_", ibreaks)]]$coef_on, na.rm = TRUE)
list_gg[[paste0("breaks_", ibreaks)]] <- list_temporal_longterm[[paste0("breaks_", ibreaks)]] %>%
# filter(p_value_on < 0.05) %>%
ggplot(aes(coef_on, ..density..)) +
geom_histogram() +
xlim(-20,20) +
geom_vline(xintercept = 0, linetype = "dotted") +
geom_vline(xintercept = median(list_temporal_longterm[[paste0("breaks_", ibreaks)]]$coef_on, na.rm = TRUE), color = "red", linetype = "dashed")
# geom_vline(xintercept = mean(list_temporal_longterm[[paste0("breaks_", ibreaks)]]$coef_on, na.rm = TRUE), color = "red")
}
save(list_gg, file = "data/list_gg.RData")
save(list_median_coef, file = "data/list_median_coef.RData")
save(list_temporal_longterm, file = "data/list_temporal_longterm.RData")
bins <- c(25, 15, 10, 7, 6, 5, 4, 3, 2)
load("data/list_gg.RData")
load("data/list_median_coef.RData")
load("data/list_temporal_longterm.RData")
list_gg
## $breaks_25
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 21 rows containing non-finite values (stat_bin).
##
## $breaks_15
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 62 rows containing non-finite values (stat_bin).
##
## $breaks_10
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 111 rows containing non-finite values (stat_bin).
##
## $breaks_7
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 357 rows containing non-finite values (stat_bin).
##
## $breaks_6
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 480 rows containing non-finite values (stat_bin).
##
## $breaks_5
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 666 rows containing non-finite values (stat_bin).
##
## $breaks_4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 940 rows containing non-finite values (stat_bin).
##
## $breaks_3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1402 rows containing non-finite values (stat_bin).
##
## $breaks_2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2834 rows containing non-finite values (stat_bin).
vec_slopes <- c(list_median_coef %>% unlist(), out_modobs$results %>% pull(slope))
tibble(nbins = c(bins, 1), median_slope = vec_slopes) %>%
ggplot(aes(nbins, median_slope)) +
geom_point(size = 2) +
geom_hline(yintercept = median(df_temporal$coef_on, na.rm = TRUE), color = "red") +
geom_hline(yintercept = 0, linetype = "dotted")
df_temporal %>%
# filter(p_value_on < 0.05) %>%
ggplot(aes(coef_on, ..density..)) +
geom_histogram() +
xlim(-2.5, 2.5) +
geom_vline(xintercept = 0, linetype = "dotted") +
geom_vline(xintercept = median(df_temporal$coef_on, na.rm = TRUE), color = "red") +
geom_vline(xintercept = median(df_spatial_species$coef_on, na.rm = TRUE), color = "springgreen3") +
geom_vline(xintercept = out_modobs$df_metrics %>% filter(.metric == "slope") %>% pull(.estimate), color = "royalblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 92 rows containing non-finite values (stat_bin).