Time series and z-scores

Generate time series

With autocorrelation
22 ensemble members
1000 time steps (e.g., years)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(patchwork)
set.seed(123)

create_ar_series <- function(idx, len, amplitude = 2){
  
  # generate a auto-regressive time series from white noise plus a sine wave
  dfa <- tibble(
    idx = idx,
    year = seq(len),
    ar_series = stats::filter(rnorm(len), filter = 0.7, method = "recursive"),
    sine_wave = amplitude * sin(seq(len)*2*pi/len)
    ) |> 
    
    # synthetic PCWD time series as the sum of the AR t series plus sine wave
    # plus 10 to have all values positive
    mutate(
      pcwd = ar_series + sine_wave + 10
    )
}

ztransform <- function(df, mymean, mysd){
  # z-transform pcwd values
  df |> 
    mutate(pcwd_zscore = (pcwd - mymean)/mysd)

}

df_list <- purrr::map(
  seq(1:22),
  ~create_ar_series(., 1000)
)

# get mean and sd across all ensemble members for z-transformation
mysumm <- df_list |> 
  bind_rows() |> # make flat
  dplyr::filter(year %in% 1:30) |> # reference period
  ungroup() |> 
  summarise(
    pcwd_mean = mean(pcwd, na.rm = TRUE),
    pcwd_sd = sd(pcwd, na.rm = TRUE)
  )

df_list <- df_list |> 
  purrr::map(~ztransform(., mysumm$pcwd_mean, mysumm$pcwd_sd))

Visualise.

gg1 <- ggplot(
  aes(year, pcwd, group = idx),
  data = bind_rows(df_list)
) +
  geom_line(alpha = 0.2)

gg2 <- ggplot(
  aes(year, pcwd_zscore, group = idx),
  data = bind_rows(df_list)
) +
  geom_line(alpha = 0.2) +
  
  # mark reference period
  geom_vline(xintercept = 30, linetype = "dotted") +
  
  # mark the interval of +/- 1.96 SD - where ~95% of observations are expected to fall into
  # based on distribution of annual values during reference period
  geom_hline(yintercept = c(-1.960, 1.960), linetype = "dotted") 

gg1 / gg2

30-year means

# Define the 30-year moving average filter
get_rollmean <- function(df, window_size){
  weights <- rep(1/window_size, window_size)  # Equal weights for simple moving average
  df |> 
    dplyr::select(idx, year, pcwd) |> 
    mutate(pcwd = stats::filter(pcwd, filter = weights, sides = 1))
}

df30_list <- df_list |> 
  purrr::map(~get_rollmean(., 30))

Add z-score of 30-year means.

# get mean and sd across all ensemble members for z-transformation
mysumm <- df30_list |> 
  bind_rows() |> # make flat
  dplyr::filter(year %in% 1:30) |> # reference period
  tidyr::drop_na(pcwd) |>  # this yields 22 values
  ungroup() |> 
  summarise(
    pcwd_mean = mean(pcwd, na.rm = TRUE),
    pcwd_sd = sd(pcwd, na.rm = TRUE)
  )

df30_list <- df30_list |> 
  purrr::map(~ztransform(., mysumm$pcwd_mean, mysumm$pcwd_sd))

Visualise for one ensemble member (red is 30-year rolling mean as trailing mean).

ggplot() +
  geom_line(aes(year, pcwd), data = df_list[[1]], color = "grey") +
  geom_line(aes(year, pcwd), data = df30_list[[1]], color = "red")

## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

## Warning: Removed 29 rows containing missing values or values outside the scale range
## (`geom_line()`).

Visualise.

gg1_30 <- ggplot(
  aes(year, pcwd, group = idx),
  data = bind_rows(df30_list)
) +
  geom_line(alpha = 0.2)

gg2_30 <- ggplot(
  aes(year, pcwd_zscore, group = idx),
  data = bind_rows(df30_list)
) +
  geom_line(alpha = 0.2) +
  
  # mark reference period
  geom_vline(xintercept = 30, linetype = "dotted") +
  
  # mark the interval of +/- 1.96 SD - where ~95% of observations are expected to fall into
  # based on distribution of annual values during reference period
  geom_hline(yintercept = c(-1.960, 1.960), linetype = "dotted") 

gg1_30 / gg2_30

## Warning: Removed 638 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 638 rows containing missing values or values outside the scale range
## (`geom_line()`).

For direct comparison again: z-transformed values

gg2 <- gg2 + ylim(-10, 10) + labs(title = "Z-scores from annual values")
gg2_30 <- gg2_30 + ylim(-10, 10) + labs(title = "Z-scores from 30-year means")
gg2  / gg2_30

## Warning: Removed 638 rows containing missing values or values outside the scale range
## (`geom_line()`).

This demonstrates that the long-term variability is more clearly detected from 30-year means.

Time series and z-scores

Beni Stocker

2025-02-17

Generate time series

30-year means