PEP725 explorations

knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)

## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(broom)
library(rbeni)

## 
## Attaching package: 'rbeni'

## The following object is masked _by_ '.GlobalEnv':
## 
##     add_alpha

do_eval <- FALSE

Explorations

This is to investigate main points using own analysis and code.

Read all files.

vec_files <- list.files("~/data/pep/PEP725_2020-12/", pattern = "pep725_2020-12_", full.names = TRUE)
list_df <- map(as.list(vec_files), ~read.delim(., sep = ";"))
names(list_df) <- vec_files

df <- list_df %>% 
  bind_rows(.id = "filnam") %>% 
  as_tibble() %>%  
  mutate(date = ymd(date))

Interpretation of phenological states

Zani et al.:

Phenology definitions followed the BBCH (Biologische Bundesanstalt, Bundessortenamt und Chemische Industrie) codes (see table S1). For the five deciduous angiosperms, spring leaf-out was defined as the date when the first or 50% of leaf stalks are visible (BBCH11 or BBCH13, respectively), for Larix decidua leaf-out was defined as the date when the first leaves separated (mouse ear stage = BBCH10). Leaf senescence was defined as the date when 50% of leaves had lost their green color (BBCH94) or had fallen (BCCH95).

BBCH94 is not available in this dataset. Using BCCH205 instead (autumnal colouration >=50%)

This information is given in column phase_id.

df_sub <- df %>% 
  filter(phase_id %in% c(10, 11, 13, 95, 205)) %>% 
  mutate(id = paste0("i", 1:n())) %>% 
  mutate(pheno = ifelse(phase_id %in% c(10, 11, 13), "on", ifelse(phase_id %in% c(95, 205), "off", NA))) %>% 
  mutate(date = ymd(date)) %>% 
  mutate(lon = round(lon, digits = 4), lat = round(lat, digits = 4)) %>% 
  na_if(-9999) %>% 
  mutate(elv = ifelse(is.na(alt), alt_dem, alt))

Determine unique “sites”.

df_sites <- df_sub %>% 
  select(lon, lat, elv) %>% 
  distinct() %>% 
  mutate(id_site = paste0("site_i", 1:n()))

df_sub <- df_sub %>% 
  left_join(df_sites, by = c("lon", "lat", "elv"))

Determine unique species x “sites”.

df_species_sites <- df_sub %>% 
  select(lon, lat, elv, species) %>% 
  distinct() %>% 
  mutate(id_species_site = paste0("species_site_i", 1:n()))

df_sub <- df_sub %>% 
  left_join(df_species_sites, by = c("lon", "lat", "elv", "species"))

Determine unique species x “sites” x pheno-event.

df_pheno_species_sites <- df_sub %>% 
  select(lon, lat, elv, species, pheno) %>% 
  distinct() %>% 
  mutate(id_pheno_species_site = paste0("pheno_species_site_i", 1:n()))

df_sub <- df_sub %>% 
  left_join(df_pheno_species_sites, by = c("lon", "lat", "elv", "species", "pheno"))

Cleaning

Remove individual time series with less than 15 years of leaf-out and leaf senescence observations,

n_years <- function(vec){length(unique(vec))}
df_retain <- df_sub %>% 
  group_by(id_species_site) %>% 
  summarise(n_years = n_years(year)) %>% 
  filter(n_years >= 15)

df_sub <- df_sub %>% 
  filter(id_species_site %in% unique(df_retain$id_species_site))

Remove dates deviating from an individual’s median more than 3 times the median absolute deviation.

medabsdev <- function(vec){abs(vec - median(vec)) / median(abs(vec - median(vec)))}

df_retain <- df_sub %>% 
  group_by(id_pheno_species_site, pheno) %>% 
  summarise(medabsdev = medabsdev(day)) %>% 
  filter(medabsdev < 3)

df_sub <- df_sub %>% 
  filter(id_pheno_species_site %in% unique(df_retain$id_pheno_species_site))

Remove time series for which the standard deviation of phenological observations across years was higher than 15 for leaf-out and 20 for leaf senescence.

df_retain <- df_sub %>% 
  group_by(id_pheno_species_site, pheno) %>% 
  summarise(sd = sd(day)) %>% 
  filter((sd <= 15.0 & pheno == "on") | (sd <= 20.0 & pheno == "off"))

df_sub <- df_sub %>% 
  filter(id_pheno_species_site %in% unique(df_retain$id_pheno_species_site))

Save intermediate file.

write_csv(df_sub, path = "data/df_sub.csv")

Analysis

Read if not yet available.

df_sub <- read_csv("data/df_sub.csv")

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   filnam = col_character(),
##   genus = col_character(),
##   species = col_character(),
##   subspecies = col_logical(),
##   date = col_date(format = ""),
##   id = col_character(),
##   pheno = col_character(),
##   id_site = col_character(),
##   id_species_site = col_character(),
##   id_pheno_species_site = col_character()
## )

## See spec(...) for full column specifications.

## Warning: 6172 parsing failures.
##   row        col           expected                         actual              file
## 34837 subspecies 1/0/T/F/TRUE/FALSE Prunus domestica Besztercei    'data/df_sub.csv'
## 35790 subspecies 1/0/T/F/TRUE/FALSE Solanum tuberosum Desire       'data/df_sub.csv'
## 36354 subspecies 1/0/T/F/TRUE/FALSE Solanum tuberosum Desire       'data/df_sub.csv'
## 36355 subspecies 1/0/T/F/TRUE/FALSE Prunus avium Germersdorfer     'data/df_sub.csv'
## 37847 subspecies 1/0/T/F/TRUE/FALSE Vitis vinifera CHASSELAS BLANC 'data/df_sub.csv'
## ..... .......... .................. .............................. .................
## See problems(...) for more details.

## this creates duplicates - problem!!!
df_anl <- df_sub %>% 
  select(id_species_site, pheno, day, year) %>% 
  pivot_wider(names_from = "pheno", values_from = "day")

## solution, but reduces from 2.6 to 2.2 mio obs.
df_anl <- df_sub %>% 
  distinct(id_pheno_species_site, year, .keep_all = TRUE) %>% 
  select(id_site, id_species_site, pheno, day, year) %>% 
  pivot_wider(names_from = "pheno", values_from = "day") %>% 
  filter(!is.na(on) & !is.na(off))

write_csv(df_anl, path = "data/df_anl.csv")

Laura: For the mixed effects modelling, you may best work with df_anl (I think).

Coverage of years in the data.

df_anl <- read_csv("data/df_anl.csv")

## Parsed with column specification:
## cols(
##   id_site = col_character(),
##   id_species_site = col_character(),
##   year = col_double(),
##   on = col_double(),
##   off = col_double()
## )

df_anl %>% 
  ggplot(aes(year, ..count..)) +
  geom_histogram() +
  labs(title = "Number of data points per year", subtitle = "By sites and species, providing both EOS and SOS")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Interannual

Get correlation between on and off for withing species-sites => interannual temporal variation.

df_anl <- read_csv("data/df_anl.csv")
get_coef_on <- function(df){df %>% filter(term == "on") %>% pull(estimate)}
get_p_on <- function(df){df %>% filter(term == "on") %>% pull(p.value)}

df_temporal <- df_anl %>% 
  # ungroup() %>% 
  # slice(1:10) %>% 
  group_by(id_species_site) %>% 
  nest() %>% 
  mutate(linmod = purrr::map(data, ~lm(off ~ on, data = .))) %>% 
  mutate(summ = purrr::map(linmod, ~summary(.))) %>% 
  mutate(df_coef = purrr::map(linmod, ~tidy(.))) %>% 
  mutate(coef_on = purrr::map_dbl(df_coef, ~get_coef_on(.))) %>% 
  mutate(p_value_on = purrr::map_dbl(df_coef, ~get_p_on(.))) %>% 
  select(-linmod) %>% 
  mutate(rsq = purrr::map_dbl(summ, "r.squared"),
         adj_rsq = purrr::map_dbl(summ, "adj.r.squared")) %>% 
  select(-summ)

save(df_temporal, file = "data/df_temporal.RData")

Get temporal trends for on and off for each site and species.

get_coef_year <- function(df){df %>% filter(term == "year") %>% pull(estimate)}
get_p_year <- function(df){df %>% filter(term == "year") %>% pull(p.value)}

df_temporal <- df_temporal %>% 
  
  ## SOS
  mutate(linmod = purrr::map(data, ~lm(on ~ year, data = .))) %>% 
  mutate(df_coef = purrr::map(linmod, ~tidy(.))) %>% 
  mutate(coef_year_on = purrr::map_dbl(df_coef, ~get_coef_year(.))) %>% 
  mutate(p_value_year_on = purrr::map_dbl(df_coef, ~get_p_year(.))) %>% 
  select(-linmod, -df_coef) %>% 

  ## EOS
  mutate(linmod = purrr::map(data, ~lm(off ~ year, data = .))) %>% 
  mutate(df_coef = purrr::map(linmod, ~tidy(.))) %>% 
  mutate(coef_year_off = purrr::map_dbl(df_coef, ~get_coef_year(.))) %>% 
  mutate(p_value_year_off = purrr::map_dbl(df_coef, ~get_p_year(.))) %>% 
  select(-linmod, -df_coef)
  
save(df_temporal, file = "data/df_temporal.RData")

load("data/df_temporal.RData")

df_temporal %>% 
  # filter(p_value_on < 0.05) %>% 
  ggplot(aes(coef_on, ..density..)) +
  geom_histogram() +
  xlim(-2.5, 2.5) +
  geom_vline(xintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = median(df_temporal$coef_on, na.rm = TRUE), color = "red", linetype = "dashed") +
  # geom_vline(xintercept = mean(df_temporal$coef_on, na.rm = TRUE), color = "red") +
  labs(title = "Slope of EOS ~ SOS", subtitle = paste(nrow(df_temporal), "sites and species"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 92 rows containing non-finite values (stat_bin).

df_temporal %>% 
  # filter(p_value_on < 0.05) %>% 
  ggplot(aes(coef_year_on, ..density..)) +
  geom_histogram() +
  xlim(-2.5, 2.5) +
  geom_vline(xintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = median(df_temporal$coef_year_on, na.rm = TRUE), color = "red", linetype = "dashed") +
  # geom_vline(xintercept = mean(df_temporal$coef_year_on, na.rm = TRUE), color = "red") +
  labs(title = "Temporal trend of SOS", subtitle = paste(nrow(df_temporal), "sites and species"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 305 rows containing non-finite values (stat_bin).

df_temporal %>% 
  # filter(p_value_on < 0.05) %>% 
  ggplot(aes(coef_year_off, ..density..)) +
  geom_histogram() +
  xlim(-2.5, 2.5) +
  geom_vline(xintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = median(df_temporal$coef_year_off, na.rm = TRUE), color = "red", linetype = "dashed") +
  # geom_vline(xintercept = mean(df_temporal$coef_year_off, na.rm = TRUE), color = "red") +
  labs(title = "Temporal trend of EOS", subtitle = paste(nrow(df_temporal), "sites and species"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 391 rows containing non-finite values (stat_bin).

out <- df_temporal %>% 
  analyse_modobs2("coef_year_on", "coef_year_off")

## Loading required package: LSD

## 
## Attaching package: 'LSD'

## The following objects are masked from 'package:rbeni':
## 
##     colorpalette, complementarycolor, convertcolor, convertgrey,
##     daltonize, disco, distinctcolors, heatpairs, heatscatter,
##     heatscatterpoints

## Loading required package: ggthemes

## Loading required package: RColorBrewer

out$gg +
  labs(title = "Trend in EOS vs. trend in SOS")

## `geom_smooth()` using formula 'y ~ x'

Long-term

Get correlation between on and off for withing species-sites => long-term temporal variation (5 bins).

df_temporal_longterm <- df_anl %>% 
  # mutate(yearbin = ntile(year, n = 5)) %>% 
  mutate(yearbin = cut(year, breaks = 5, labels = FALSE)) %>%
  # group_by(id_species_site, yearbin) %>% 
  group_by(id_site, yearbin) %>% 
  summarise(on = mean(on, na.rm = TRUE), off = mean(off, na.rm = TRUE)) %>% 
  ungroup() %>% 
  # group_by(id_species_site) %>% 
  group_by(id_site) %>% 
  nest() %>% 
  mutate(linmod = purrr::map(data, ~lm(off ~ on, data = .))) %>% 
  mutate(summ = purrr::map(linmod, ~summary(.))) %>% 
  mutate(df_coef = purrr::map(linmod, ~tidy(.))) %>% 
  mutate(coef_on = purrr::map_dbl(df_coef, ~get_coef_on(.))) %>% 
  mutate(p_value_on = purrr::map_dbl(df_coef, ~get_p_on(.))) %>% 
  select(-linmod) %>% 
  mutate(rsq = purrr::map_dbl(summ, "r.squared"),
         adj_rsq = purrr::map_dbl(summ, "adj.r.squared")) %>% 
  select(-summ)

save(df_temporal_longterm, file = "data/df_temporal_longterm.RData")

## years in bins overview
df_anl %>% 
  # mutate(yearbin = ntile(year, n = 5)) %>% 
  mutate(yearbin = cut(year, breaks = 5, labels = FALSE)) %>%
  select(year, yearbin) %>% 
  ggplot(aes(year, yearbin)) +
  geom_point()

load("data/df_temporal_longterm.RData")
df_temporal_longterm %>% 
  # filter(p_value_on < 0.05) %>% 
  ggplot(aes(coef_on, ..density..)) +
  geom_histogram() +
  xlim(-20,20) +
  geom_vline(xintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = median(df_temporal_longterm$coef_on, na.rm = TRUE), color = "red", linetype = "dashed")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 666 rows containing non-finite values (stat_bin).

  # geom_vline(xintercept = mean(df_temporal_longterm$coef_on, na.rm = TRUE), color = "red")

Spatial

Get correlation between on and off for withing species-sites => long-term temporal variation (30 bins).

df_spatial <- df_anl %>% 
  group_by(id_site) %>% 
  summarise(on = mean(on, na.rm = TRUE), off = mean(off, na.rm = TRUE))

save(df_spatial, file = "data/df_spatial.RData")

load("data/df_spatial.RData")
out_modobs <- df_spatial %>% analyse_modobs2("on", "off", type = "heat")
out_modobs$gg

## `geom_smooth()` using formula 'y ~ x'

Spatial by species

Get correlation between on and off for withing species-sites => long-term temporal variation (30 bins).

df_spatial_species <- df_anl %>% 
  left_join(select(df_sub, id_species_site, species), by = c("id_species_site")) %>% 
  group_by(id_site, species) %>% 
  summarise(on = mean(on, na.rm = TRUE), off = mean(off, na.rm = TRUE)) %>% 
  
  ungroup() %>% 
  group_by(species) %>% 
  nest() %>% 
  mutate(linmod = purrr::map(data, ~lm(off ~ on, data = .))) %>% 
  mutate(summ = purrr::map(linmod, ~summary(.))) %>% 
  mutate(df_coef = purrr::map(linmod, ~tidy(.))) %>% 
  mutate(coef_on = purrr::map_dbl(df_coef, ~get_coef_on(.))) %>% 
  mutate(p_value_on = purrr::map_dbl(df_coef, ~get_p_on(.))) %>% 
  select(-linmod) %>% 
  mutate(rsq = purrr::map_dbl(summ, "r.squared"),
         adj_rsq = purrr::map_dbl(summ, "adj.r.squared")) %>% 
  select(-summ)

save(df_spatial_species, file = "data/df_spatial_species.RData")

load("data/df_spatial_species.RData")
df_spatial_species %>% 
  # filter(p_value_on < 0.05) %>% 
  ggplot(aes(coef_on, ..density..)) +
  geom_histogram(binwidth = 0.5) +
  xlim(-2.5, 2.5) +
  geom_vline(xintercept = 0, linetype = "dotted") +
  # geom_vline(xintercept = mean(df_spatial_species$coef_on, na.rm = TRUE), color = "springgreen3") +
  geom_vline(xintercept = median(df_spatial_species$coef_on, na.rm = TRUE), color = "springgreen3", linetype = "dashed")

## Warning: Removed 3 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

Varying number of bins

Dependence of mean slope vs. aggregation time scale (number of bins). With Increasing aggregation (from right to left) reversal from a positive to a negative slope. Median slopes are plotted.

Aggregating species

bins <- c(25, 15, 10, 7, 6, 5, 4, 3, 2)

list_temporal_longterm <- list()
list_median_coef <- list()
list_gg <- list()

for (ibreaks in bins){
  
  print(paste0("breaks_", ibreaks))

  list_temporal_longterm[[paste0("breaks_", ibreaks)]] <- df_anl %>%
    # mutate(yearbin = ntile(year, n = ibreaks)) %>%
    mutate(yearbin = cut(year, breaks = ibreaks, labels = FALSE)) %>%
    # group_by(id_species_site, yearbin) %>%
    group_by(id_site, yearbin) %>%
    summarise(on = mean(on, na.rm = TRUE), off = mean(off, na.rm = TRUE)) %>%
    ungroup() %>%
    # group_by(id_species_site) %>%
    group_by(id_site) %>%
    nest() %>%
    mutate(linmod = purrr::map(data, ~lm(off ~ on, data = .))) %>%
    mutate(summ = purrr::map(linmod, ~summary(.))) %>%
    mutate(df_coef = purrr::map(linmod, ~tidy(.))) %>%
    mutate(coef_on = purrr::map_dbl(df_coef, ~get_coef_on(.))) %>%
    mutate(p_value_on = purrr::map_dbl(df_coef, ~get_p_on(.))) %>%
    select(-linmod) %>%
    mutate(rsq = purrr::map_dbl(summ, "r.squared"),
           adj_rsq = purrr::map_dbl(summ, "adj.r.squared")) %>%
    select(-summ)
  
  list_median_coef[[paste0("breaks_", ibreaks)]] <- median(list_temporal_longterm[[paste0("breaks_", ibreaks)]]$coef_on, na.rm = TRUE)
  
  list_gg[[paste0("breaks_", ibreaks)]] <- list_temporal_longterm[[paste0("breaks_", ibreaks)]] %>% 
    # filter(p_value_on < 0.05) %>% 
    ggplot(aes(coef_on, ..density..)) +
    geom_histogram() +
    xlim(-20,20) +
    geom_vline(xintercept = 0, linetype = "dotted") +
    geom_vline(xintercept = median(list_temporal_longterm[[paste0("breaks_", ibreaks)]]$coef_on, na.rm = TRUE), color = "red", linetype = "dashed")
    # geom_vline(xintercept = mean(list_temporal_longterm[[paste0("breaks_", ibreaks)]]$coef_on, na.rm = TRUE), color = "red")
  
}
save(list_gg, file = "data/list_gg.RData")
save(list_median_coef, file = "data/list_median_coef.RData")
save(list_temporal_longterm, file = "data/list_temporal_longterm.RData")

bins <- c(25, 15, 10, 7, 6, 5, 4, 3, 2)

load("data/list_gg.RData")
load("data/list_median_coef.RData")
load("data/list_temporal_longterm.RData")

list_gg

## $breaks_25

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 21 rows containing non-finite values (stat_bin).

## 
## $breaks_15

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 62 rows containing non-finite values (stat_bin).

## 
## $breaks_10

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 111 rows containing non-finite values (stat_bin).

## 
## $breaks_7

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 357 rows containing non-finite values (stat_bin).

## 
## $breaks_6

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 480 rows containing non-finite values (stat_bin).

## 
## $breaks_5

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 666 rows containing non-finite values (stat_bin).

## 
## $breaks_4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 940 rows containing non-finite values (stat_bin).

## 
## $breaks_3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1402 rows containing non-finite values (stat_bin).

## 
## $breaks_2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2834 rows containing non-finite values (stat_bin).

vec_slopes <- c(list_median_coef %>% unlist(), out_modobs$results %>% pull(slope))
tibble(nbins =  c(bins, 1), median_slope = vec_slopes) %>% 
  ggplot(aes(nbins, median_slope)) +
  geom_point(size = 2) +
  geom_hline(yintercept = median(df_temporal$coef_on, na.rm = TRUE), color = "red") +
  geom_hline(yintercept = 0, linetype = "dotted")

Summary

df_temporal %>% 
  # filter(p_value_on < 0.05) %>% 
  ggplot(aes(coef_on, ..density..)) +
  geom_histogram() +
  xlim(-2.5, 2.5) +
  geom_vline(xintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = median(df_temporal$coef_on, na.rm = TRUE), color = "red") +
  geom_vline(xintercept = median(df_spatial_species$coef_on, na.rm = TRUE), color = "springgreen3") +
  geom_vline(xintercept = out_modobs$df_metrics %>% filter(.metric == "slope") %>% pull(.estimate), color = "royalblue")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 92 rows containing non-finite values (stat_bin).