1 Overview

This RPubs-ready R Markdown note demonstrates dynamic-systems thinking through three representative regimes:

  1. Agriculture (real data): a long-run annual yield system with shocks and uncertainty-aware forecasting.
  2. Biology (counts): a seasonal counting process with a transient outbreak regime and time-respecting anomaly screening.
  3. Astronomy-like (irregular sampling): heteroskedastic noise and transient flare detection via smoothing and residual screening.

A key design goal is reproducibility: the script auto-creates a timestamped run folder and saves all figures and tables (CSV + HTML) automatically.


2 0. Reproducible setup and output management

# ============================================================
# Dynamic Systems RPubs: End-to-End Reproducible Pipeline
# Saves all plots/tables automatically into a timestamped folder
# ============================================================

# ---------- Packages ----------
pkgs <- c(
  "tidyverse","lubridate","fs","scales",
  "patchwork","viridis","knitr","kableExtra",
  "forecast","agridat","zoo"
)
to_install <- pkgs[!pkgs %in% installed.packages()[,"Package"]]
if(length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(pkgs, library, character.only = TRUE))

# ---------- Create a timestamped run folder ----------
run_id <- format(Sys.time(), "%Y%m%d_%H%M%S")
ROOT <- paste0("DynamicSystems_RPubs_Run_", run_id)
dirs <- c("data","figs","tables","models","logs")
purrr::walk(dirs, ~fs::dir_create(fs::path(ROOT, .x)))

# ---------- Logging (NO glue evaluation) ----------
log_file <- fs::path(ROOT, "logs", "run_log.txt")
log_line <- function(...) {
  msg <- paste0(..., collapse = "")
  cat(msg, "\n")
  cat(msg, "\n", file = log_file, append = TRUE)
}

# ---------- Plot saver ----------
save_plot <- function(p, name, width = 9, height = 5, dpi = 300) {
  out <- fs::path(ROOT, "figs", paste0(name, ".png"))
  ggplot2::ggsave(out, plot = p, width = width, height = height, dpi = dpi)
  log_line("Saved plot: ", out)
  invisible(out)
}

# ---------- Table saver (CSV + HTML) ----------
save_table <- function(df, name, digits = 3) {
  csv_path  <- fs::path(ROOT, "tables", paste0(name, ".csv"))
  html_path <- fs::path(ROOT, "tables", paste0(name, ".html"))

  readr::write_csv(df, csv_path)

  html <- df %>%
    knitr::kable(format = "html", digits = digits, escape = TRUE) %>%
    kableExtra::kable_styling(
      bootstrap_options = c("striped","hover","condensed","responsive"),
      full_width = FALSE, position = "left"
    )

  kableExtra::save_kable(html, file = html_path)

  log_line("Saved table: ", csv_path)
  log_line("Saved table: ", html_path)

  invisible(list(csv = csv_path, html = html_path))
}

set.seed(123)
log_line("Run folder: ", ROOT)
## Run folder: DynamicSystems_RPubs_Run_20260206_021745
ROOT
## [1] "DynamicSystems_RPubs_Run_20260206_021745"

3 1. Case Study A: Agriculture (real dataset: agridat::nass.corn)

We start with a real agricultural dataset and construct a clean annual time series for Illinois, obtained by averaging yields within year.

3.1 1.1 Build the Illinois yield series

data("nass.corn", package = "agridat")
corn <- agridat::nass.corn %>%
  tibble::as_tibble() %>%
  mutate(year = as.integer(year)) %>%
  filter(!is.na(yield))
log_line("corn rows: ", nrow(corn))
## corn rows: 6381
# Pick one state to create a clean dynamic series
corn_il <- corn %>%
  filter(state == "Illinois") %>%
  group_by(year) %>%
  summarise(yield = mean(yield, na.rm = TRUE), .groups = "drop") %>%
  arrange(year)

save_table(corn_il, "agri_corn_il_series")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_corn_il_series.csv 
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_corn_il_series.html
corn_il %>%
  head(12) %>%
  knitr::kable(caption = "Table A1. First 12 rows of the Illinois annual corn-yield series (yearly mean yield).") %>%
  kableExtra::kable_styling(full_width = FALSE)
Table A1. First 12 rows of the Illinois annual corn-yield series (yearly mean yield).
year yield
1866 29.0
1867 27.0
1868 34.2
1869 23.5
1870 40.0
1871 33.0
1872 39.8
1873 21.0
1874 24.0
1875 34.3
1876 30.0
1877 27.0
p1 <- ggplot(corn_il, aes(year, yield)) +
  geom_line(linewidth = 1.1, color = "#2c7fb8") +
  geom_point(size = 2.2, color = "#f03b20") +
  scale_x_continuous(breaks = scales::pretty_breaks(10)) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Agriculture: Illinois Corn Yield as a Dynamic System",
    subtitle = "Yearly average yield (county-aggregated)",
    x = "Year", y = "Yield"
  ) +
  theme_minimal(base_size = 13)

save_plot(p1, "agri_corn_il_timeseries")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_corn_il_timeseries.png
p1
Figure A1. Illinois corn yield as a dynamic system. The annual series captures long-run variation and potential shocks typical of agricultural production.

Figure A1. Illinois corn yield as a dynamic system. The annual series captures long-run variation and potential shocks typical of agricultural production.

3.2 1.2 Baseline forecasting: ARIMA + prediction intervals

ARIMA provides a transparent statistical baseline and produces prediction intervals that summarize forecast uncertainty.

y_ts <- ts(corn_il$yield, start = min(corn_il$year), frequency = 1)
fit_arima <- forecast::auto.arima(y_ts)
fc <- forecast::forecast(fit_arima, h = 8)

df_fc <- tibble(
  year = (max(corn_il$year) + 1):(max(corn_il$year) + 8),
  mean = as.numeric(fc$mean),
  lo80 = as.numeric(fc$lower[, "80%"]),
  hi80 = as.numeric(fc$upper[, "80%"]),
  lo95 = as.numeric(fc$lower[, "95%"]),
  hi95 = as.numeric(fc$upper[, "95%"])
)

save_table(df_fc, "agri_arima_forecast_table")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_arima_forecast_table.csv 
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_arima_forecast_table.html
df_fc %>%
  knitr::kable(caption = "Table A2. ARIMA 8-step forecasts with 80% and 95% prediction intervals.") %>%
  kableExtra::kable_styling(full_width = FALSE)
Table A2. ARIMA 8-step forecasts with 80% and 95% prediction intervals.
year mean lo80 hi80 lo95 hi95
2012 176.7126 162.3151 191.1101 154.6935 198.7316
2013 175.0607 160.6561 189.4653 153.0308 197.0907
2014 174.3813 159.8069 188.9557 152.0916 196.6709
2015 174.3110 159.3623 189.2596 151.4489 197.1730
2016 174.6222 159.1433 190.1011 150.9492 198.2952
2017 175.1725 159.0664 191.2785 150.5404 199.8045
2018 175.8725 159.0883 192.6566 150.2034 201.5415
2019 176.6662 159.1835 194.1489 149.9288 203.4037
p2 <- ggplot() +
  geom_line(data = corn_il, aes(year, yield), linewidth = 1.05, color = "#1b9e77") +
  geom_ribbon(data = df_fc, aes(year, ymin = lo95, ymax = hi95),
              fill = "#7570b3", alpha = 0.18) +
  geom_ribbon(data = df_fc, aes(year, ymin = lo80, ymax = hi80),
              fill = "#7570b3", alpha = 0.30) +
  geom_line(data = df_fc, aes(year, mean), linewidth = 1.1, color = "#d95f02") +
  geom_point(data = df_fc, aes(year, mean), size = 2.2, color = "#d95f02") +
  labs(
    title = "Agriculture: ARIMA Forecast with Uncertainty Bands",
    subtitle = "80% and 95% prediction intervals",
    x = "Year", y = "Yield"
  ) +
  theme_minimal(base_size = 13)

save_plot(p2, "agri_arima_forecast_bands")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_arima_forecast_bands.png
p2
Figure A2. ARIMA forecast for Illinois yield with 80% and 95% prediction intervals. Interval width increases with horizon, reflecting escalating uncertainty.

Figure A2. ARIMA forecast for Illinois yield with 80% and 95% prediction intervals. Interval width increases with horizon, reflecting escalating uncertainty.


4 2. Case Study B: Biology (counts with seasonality + outbreak regime)

To mimic biological surveillance streams, we simulate a Poisson count process driven by a seasonal baseline intensity and a transient outbreak regime.

n <- 200
t <- 1:n
lambda <- 18 + 6*sin(2*pi*t/52) + ifelse(t > 120 & t < 150, 20, 0)
y <- rpois(n, pmax(lambda, 0.1))
bio <- tibble(t = t, y = y, lambda = lambda)

save_table(bio %>% summarise(n=n(), mean=mean(y), var=var(y)),
           "bio_basic_stats")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/bio_basic_stats.csv 
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/bio_basic_stats.html
(bio %>% summarise(n=n(), mean=mean(y), var=var(y))) %>%
  knitr::kable(caption = "Table B1. Basic summary statistics for the simulated count series.") %>%
  kableExtra::kable_styling(full_width = FALSE)
Table B1. Basic summary statistics for the simulated count series.
n mean var
200 20.91 70.51447
p3 <- ggplot(bio, aes(t, y)) +
  geom_line(linewidth = 1.0, color = "#377eb8") +
  geom_point(size = 1.6, color = "#ff7f00", alpha = 0.85) +
  geom_line(aes(y = lambda), linewidth = 1.0, linetype = 2, color = "#4daf4a") +
  labs(
    title = "Biology: Count Dynamics (Seasonality + Outbreak Regime)",
    subtitle = "Dashed line = latent intensity (simulated truth)",
    x = "Time index", y = "Count"
  ) +
  theme_minimal(base_size = 13)

save_plot(p3, "bio_counts_outbreak")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/bio_counts_outbreak.png
p3
Figure B1. Biology-inspired count dynamics with seasonality and an outbreak regime. Points show observed counts; the dashed curve shows the latent intensity used for simulation.

Figure B1. Biology-inspired count dynamics with seasonality and an outbreak regime. Points show observed counts; the dashed curve shows the latent intensity used for simulation.

4.1 2.1 Rolling anomaly screening (time-respecting z-scores)

We compute rolling mean and SD using a right-aligned window (past-only), producing a time-respecting anomaly score.

bio2 <- bio %>%
  mutate(
    roll_mean = zoo::rollapply(y, width = 12, FUN = mean, fill = NA, align = "right"),
    roll_sd   = zoo::rollapply(y, width = 12, FUN = sd,   fill = NA, align = "right"),
    z = (y - roll_mean) / roll_sd
  )

top_anoms <- bio2 %>%
  filter(!is.na(z)) %>%
  mutate(absz = abs(z)) %>%
  arrange(desc(absz)) %>%
  slice(1:12)

save_table(top_anoms, "bio_top_anomalies")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/bio_top_anomalies.csv 
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/bio_top_anomalies.html
top_anoms %>%
  knitr::kable(caption = "Table B2. Top 12 candidate anomalies ranked by absolute rolling z-score.") %>%
  kableExtra::kable_styling(full_width = FALSE)
Table B2. Top 12 candidate anomalies ranked by absolute rolling z-score.
t y lambda roll_mean roll_sd z absz
122 58 42.93790 29.41667 11.106577 2.573550 2.573550
163 32 22.49106 18.25000 5.412528 2.540403 2.540403
198 24 12.38990 13.75000 4.287932 2.390430 2.390430
54 28 19.43589 16.08333 5.212892 2.285999 2.285999
178 33 20.78834 24.33333 4.007569 2.162575 2.162575
107 28 20.12763 17.00000 5.257030 2.092436 2.092436
150 13 14.02126 29.83333 8.054737 -2.089867 2.089867
46 19 14.02126 12.00000 3.437758 2.036211 2.036211
190 8 13.06210 16.25000 4.114829 -2.004944 2.004944
48 21 15.21166 12.75000 4.180583 1.973409 1.973409
78 15 18.00000 20.91667 3.117643 -1.897801 1.897801
76 17 19.43589 21.66667 2.461830 -1.895609 1.895609
p4 <- ggplot(bio2, aes(t, z)) +
  geom_hline(yintercept = c(-3, 3), linetype = 2, color = "#e41a1c") +
  geom_line(linewidth = 1.05, color = "#984ea3") +
  labs(
    title = "Biology: Rolling Anomaly Score (z)",
    subtitle = "Rule-of-thumb flags when |z| > 3",
    x = "Time index", y = "Rolling z-score"
  ) +
  theme_minimal(base_size = 13)

save_plot(p4, "bio_rolling_anomaly_z")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/bio_rolling_anomaly_z.png
p4
Figure B2. Rolling anomaly z-score for the biological count series. Dashed lines at ±3 provide a practical screening threshold; exceedances tend to cluster around the outbreak interval.

Figure B2. Rolling anomaly z-score for the biological count series. Dashed lines at ±3 provide a practical screening threshold; exceedances tend to cluster around the outbreak interval.


5 3. Case Study C: Astronomy-like (irregular sampling + heteroskedastic noise + flare)

Astronomical time series often arrive with irregular sampling and heteroskedastic errors. We simulate a quasi-periodic signal with a transient flare and noise SD that varies over time.

set.seed(7)
n <- 350
time <- sort(runif(n, 0, 200))
signal <- sin(2*pi*time/18) + 0.5*cos(2*pi*time/40)
flare  <- ifelse(time > 120 & time < 135, 3*exp(-(time-120)/4), 0)
true_flux <- signal + flare
noise_sd <- 0.15 + 0.25*(sin(time/30)^2)
flux <- true_flux + rnorm(n, 0, noise_sd)
astro <- tibble(time = time, flux = flux, true = true_flux, sd = noise_sd)

save_table(astro %>% summarise(n=n(), mean_flux=mean(flux), sd_flux=sd(flux)),
           "astro_basic_stats")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/astro_basic_stats.csv 
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/astro_basic_stats.html
(astro %>% summarise(n=n(), mean_flux=mean(flux), sd_flux=sd(flux))) %>%
  knitr::kable(caption = "Table C1. Basic summary statistics for the simulated astronomy-like series.") %>%
  kableExtra::kable_styling(full_width = FALSE)
Table C1. Basic summary statistics for the simulated astronomy-like series.
n mean_flux sd_flux
350 0.062857 0.8771605
p5 <- ggplot(astro, aes(time, flux, color = sd)) +
  geom_point(size = 1.9, alpha = 0.88) +
  scale_color_viridis_c(option = "C", end = 0.95) +
  labs(
    title = "Astronomy-like: Irregular Time Series with Heteroskedastic Noise",
    subtitle = "Color encodes measurement noise standard deviation",
    x = "Time", y = "Observed flux", color = "Noise SD"
  ) +
  theme_minimal(base_size = 13)

save_plot(p5, "astro_lightcurve_scatter")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/astro_lightcurve_scatter.png
p5
Figure C1. Irregularly sampled light curve with heteroskedastic measurement noise. Point color encodes the observation-specific noise standard deviation.

Figure C1. Irregularly sampled light curve with heteroskedastic measurement noise. Point color encodes the observation-specific noise standard deviation.

5.1 3.1 Trend extraction (LOESS) + flare candidate screening

We use LOESS smoothing to estimate a smooth trend and then rank the largest positive residuals as flare candidates.

fit_loess <- loess(flux ~ time, data = astro, span = 0.12)
astro2 <- astro %>%
  mutate(fit = predict(fit_loess), resid = flux - fit)

flare_candidates <- astro2 %>%
  arrange(desc(resid)) %>%
  slice(1:15)

save_table(flare_candidates, "astro_top_residuals_flare_candidates")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/astro_top_residuals_flare_candidates.csv 
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/astro_top_residuals_flare_candidates.html
flare_candidates %>%
  knitr::kable(caption = "Table C2. Top 15 flare candidates ranked by LOESS residual (largest positive residuals).") %>%
  kableExtra::kable_styling(full_width = FALSE)
Table C2. Top 15 flare candidates ranked by LOESS residual (largest positive residuals).
time flux true sd fit resid
120.238306 3.0451558 2.4215646 0.2951499 1.3119860 1.7331698
60.769103 1.1733900 0.2060203 0.3517497 0.0902021 1.0831879
7.411651 1.0858600 0.7241998 0.1649511 0.0233301 1.0625299
57.743578 1.1466900 0.4964224 0.3699597 0.1300438 1.0166462
40.531790 2.0890557 1.4981950 0.3881220 1.1227816 0.9662741
21.432380 0.7569778 0.4439613 0.2573132 -0.1018896 0.8588674
8.542348 0.6675341 0.2725580 0.1697280 -0.1739655 0.8414996
112.552681 1.7225488 1.1949887 0.2320826 0.9079535 0.8145954
5.550041 1.1604567 1.2553008 0.1584592 0.4268232 0.7336335
121.265413 1.9584974 1.6799115 0.3035433 1.2464270 0.7120704
145.915160 0.9382812 0.3205657 0.3943095 0.2663556 0.6719256
37.227352 1.3330067 0.8687643 0.3737667 0.6615852 0.6714215
128.649511 1.5737568 1.2489599 0.3576721 0.9034660 0.6702908
153.412144 0.7181279 0.1119715 0.3618461 0.0838860 0.6342419
125.833728 1.6994392 0.9441510 0.3387230 1.0672949 0.6321443
p6 <- ggplot(astro2, aes(time, flux)) +
  geom_point(size = 1.4, alpha = 0.55, color = "#1f78b4") +
  geom_line(aes(y = fit), linewidth = 1.2, color = "#e31a1c") +
  labs(
    title = "Astronomy-like: Trend Extraction under Irregular Sampling",
    subtitle = "LOESS fit overlays the noisy observations",
    x = "Time", y = "Flux"
  ) +
  theme_minimal(base_size = 13)

save_plot(p6, "astro_loess_fit")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/astro_loess_fit.png
p6
Figure C2. LOESS trend extraction under irregular sampling. The fitted smooth trend (red) overlays noisy observations; large positive residuals can indicate transient events (e.g., flares).

Figure C2. LOESS trend extraction under irregular sampling. The fitted smooth trend (red) overlays noisy observations; large positive residuals can indicate transient events (e.g., flares).


6 4. Consolidated key results

key_results <- tibble(
  CaseStudy = c("Agriculture (nass.corn)", "Biology (counts)", "Astronomy-like (light curve)"),
  DynamicFeature = c("Trend + long memory + shocks", "Seasonality + outbreak regime shift", "Irregular sampling + heteroskedastic noise + flare"),
  BaselineMethod = c("ARIMA + prediction intervals", "Rolling anomaly z-score", "LOESS smoothing + residual screening"),
  SavedOutputs = c("Time plot + forecast bands + tables", "Dynamics plot + anomaly plot + tables", "Scatter + LOESS + tables")
)

save_table(key_results, "key_results_summary")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/key_results_summary.csv 
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/key_results_summary.html
key_results %>%
  knitr::kable(caption = "Table K1. Summary of dynamic regimes, baseline methods, and saved outputs.") %>%
  kableExtra::kable_styling(full_width = FALSE)
Table K1. Summary of dynamic regimes, baseline methods, and saved outputs.
CaseStudy DynamicFeature BaselineMethod SavedOutputs
Agriculture (nass.corn) Trend + long memory + shocks ARIMA + prediction intervals Time plot + forecast bands + tables
Biology (counts) Seasonality + outbreak regime shift Rolling anomaly z-score Dynamics plot + anomaly plot + tables
Astronomy-like (light curve) Irregular sampling + heteroskedastic noise + flare LOESS smoothing + residual screening Scatter + LOESS + tables
log_line("DONE. All outputs saved under: ", ROOT)
## DONE. All outputs saved under: DynamicSystems_RPubs_Run_20260206_021745
cat("\nDONE. Outputs folder:\n", ROOT, "\n")
## 
## DONE. Outputs folder:
##  DynamicSystems_RPubs_Run_20260206_021745

7 5. Add-on (robust): ML forecasting + rolling-origin CV + conformal prediction intervals

This add-on avoids tidymodels entirely to remain robust to version mismatches (e.g., rlang). We construct lag features and evaluate Random Forest vs XGBoost using time-respecting rolling-origin evaluation. Finally, we produce distribution-free conformal intervals for 1-step-ahead predictions.

7.1 5.1 Packages (ML add-on)

ml_pkgs2 <- c("ranger","xgboost")
to_install_ml <- ml_pkgs2[!ml_pkgs2 %in% installed.packages()[,"Package"]]
if(length(to_install_ml)) install.packages(to_install_ml, dependencies = TRUE)
invisible(lapply(ml_pkgs2, library, character.only = TRUE))
set.seed(123)

7.2 5.2 Utilities (lags, metrics, rolling-origin slices)

make_lags_df <- function(df, y = "y", time = "t", lags = 1:6) {
  df <- df[order(df[[time]]), , drop = FALSE]
  for (L in lags) df[[paste0("lag", L)]] <- dplyr::lag(df[[y]], L)
  df <- tidyr::drop_na(df)
  df
}
rmse <- function(truth, pred) sqrt(mean((truth - pred)^2, na.rm = TRUE))
mae  <- function(truth, pred) mean(abs(truth - pred), na.rm = TRUE)

rolling_origin_slices <- function(n, initial, assess = 1, skip = 1) {
  # returns list of list(train_idx, test_idx)
  stopifnot(initial >= 5, assess >= 1, n > initial + assess)
  out <- list()
  k <- 1
  start_test <- initial + 1
  while ((start_test + assess - 1) <= n) {
    train_idx <- 1:(start_test - 1)
    test_idx  <- start_test:(start_test + assess - 1)
    out[[k]] <- list(train = train_idx, test = test_idx)
    k <- k + 1
    start_test <- start_test + skip
  }
  out
}

7.3 5.3 Build agriculture supervised dataset (lag features)

agri_ml <- corn_il %>%
  dplyr::transmute(year = year, y = yield) %>%
  dplyr::mutate(t = dplyr::row_number())

agri_lag <- make_lags_df(agri_ml, y = "y", time = "t", lags = 1:6)
save_table(head(agri_lag, 12), "agri_ml_lagged_head_v2")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_ml_lagged_head_v2.csv 
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_ml_lagged_head_v2.html
# Rolling CV settings (series is short, be conservative)
nA <- nrow(agri_lag)
initialA <- max(12, floor(0.65 * nA))
assessA  <- 1
skipA    <- 1
slicesA  <- rolling_origin_slices(nA, initial = initialA, assess = assessA, skip = skipA)
log_line("Agriculture rolling slices (manual): ", length(slicesA))
## Agriculture rolling slices (manual): 49
X_cols <- grep("^lag", names(agri_lag), value = TRUE)

head(agri_lag, 12) %>%
  knitr::kable(caption = "Table M1. First 12 rows of the lag-feature dataset used for ML forecasting.") %>%
  kableExtra::kable_styling(full_width = FALSE)
Table M1. First 12 rows of the lag-feature dataset used for ML forecasting.
year y t lag1 lag2 lag3 lag4 lag5 lag6
1872 39.8 7 33.0 40.0 23.5 34.2 27.0 29.0
1873 21.0 8 39.8 33.0 40.0 23.5 34.2 27.0
1874 24.0 9 21.0 39.8 33.0 40.0 23.5 34.2
1875 34.3 10 24.0 21.0 39.8 33.0 40.0 23.5
1876 30.0 11 34.3 24.0 21.0 39.8 33.0 40.0
1877 27.0 12 30.0 34.3 24.0 21.0 39.8 33.0
1878 29.0 13 27.0 30.0 34.3 24.0 21.0 39.8
1879 36.1 14 29.0 27.0 30.0 34.3 24.0 21.0
1880 31.0 15 36.1 29.0 27.0 30.0 34.3 24.0
1881 21.0 16 31.0 36.1 29.0 27.0 30.0 34.3
1882 25.0 17 21.0 31.0 36.1 29.0 27.0 30.0
1883 27.3 18 25.0 21.0 31.0 36.1 29.0 27.0

7.4 5.4 ML models: Random Forest vs XGBoost

# Model 1: Random Forest (ranger)
rf_predict_one <- function(train_df, test_df) {
  fit <- ranger::ranger(
    formula = as.formula(paste("y ~", paste(X_cols, collapse = " + "))),
    data = train_df,
    num.trees = 800,
    mtry = max(1, floor(sqrt(length(X_cols)))),
    min.node.size = 3,
    importance = "permutation",
    seed = 123
  )
  pred <- predict(fit, data = test_df)$predictions
  list(pred = as.numeric(pred), fit = fit)
}

# Model 2: XGBoost (xgboost)
xgb_fit_predict <- function(train_df, test_df, params = list()) {
  dtrain <- xgboost::xgb.DMatrix(
    data = as.matrix(train_df[, X_cols, drop=FALSE]),
    label = train_df$y
  )
  dtest <- xgboost::xgb.DMatrix(
    data = as.matrix(test_df[, X_cols, drop=FALSE])
  )

  # Sensible defaults (stable for small series)
  p <- modifyList(list(
    objective = "reg:squarederror",
    eval_metric = "rmse",
    max_depth = 3,
    eta = 0.05,
    subsample = 0.9,
    colsample_bytree = 0.9,
    min_child_weight = 1
  ), params)

  fit <- xgboost::xgb.train(
    params = p,
    data = dtrain,
    nrounds = 600,
    verbose = 0
  )

  pred <- predict(fit, dtest)
  list(pred = as.numeric(pred), fit = fit)
}

7.5 5.5 Rolling-origin CV: predictions + metrics

predA <- tibble::tibble(
  t = agri_lag$t,
  y = agri_lag$y,
  rf = NA_real_,
  xgb = NA_real_
)

rf_fits <- list()
xgb_fits <- list()

for (i in seq_along(slicesA)) {
  tr <- slicesA[[i]]$train
  te <- slicesA[[i]]$test

  train_df <- agri_lag[tr, , drop=FALSE]
  test_df  <- agri_lag[te, , drop=FALSE]

  rf_out <- rf_predict_one(train_df, test_df)
  xg_out <- xgb_fit_predict(train_df, test_df)

  predA$rf[te]  <- rf_out$pred
  predA$xgb[te] <- xg_out$pred

  # store last fits (useful for importance plotting)
  rf_fits[[i]] <- rf_out$fit
  xgb_fits[[i]] <- xg_out$fit
}

predA_eval <- predA %>% tidyr::drop_na(rf, xgb)

agri_ml_metrics <- tibble::tibble(
  model = c("Random Forest (ranger)", "XGBoost"),
  RMSE = c(rmse(predA_eval$y, predA_eval$rf), rmse(predA_eval$y, predA_eval$xgb)),
  MAE  = c(mae(predA_eval$y, predA_eval$rf),  mae(predA_eval$y, predA_eval$xgb))
) %>% dplyr::arrange(RMSE)

save_table(agri_ml_metrics, "agri_ml_metrics_manual_cv")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_ml_metrics_manual_cv.csv 
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_ml_metrics_manual_cv.html
log_line("Agriculture ML metrics saved.")
## Agriculture ML metrics saved.
agri_ml_metrics %>%
  knitr::kable(caption = "Table M2. Rolling-origin cross-validation metrics (1-step ahead).") %>%
  kableExtra::kable_styling(full_width = FALSE)
Table M2. Rolling-origin cross-validation metrics (1-step ahead).
model RMSE MAE
Random Forest (ranger) 20.21592 16.43237
XGBoost 23.92128 18.49081
p_mlA <- ggplot(predA_eval, aes(t, y)) +
  geom_line(color="#1f78b4", linewidth=0.9) +
  geom_line(aes(y = rf), color="#33a02c", linewidth=0.9) +
  geom_line(aes(y = xgb), color="#e31a1c", linewidth=0.9) +
  labs(
    title="Agriculture: Rolling-Origin ML Forecasts (1-step ahead)",
    subtitle="Blue: observed | Green: RF | Red: XGBoost (time-respecting CV)",
    x="Time index", y="Yield"
  ) +
  theme_minimal(base_size=13)

save_plot(p_mlA, "agri_ml_rolling_forecasts")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_ml_rolling_forecasts.png
p_mlA
Figure M1. Rolling-origin 1-step-ahead ML forecasts (time-respecting). Blue: observed; green: Random Forest; red: XGBoost. Predictions are generated only from past information.

Figure M1. Rolling-origin 1-step-ahead ML forecasts (time-respecting). Blue: observed; green: Random Forest; red: XGBoost. Predictions are generated only from past information.

p_metsA <- agri_ml_metrics %>%
  tidyr::pivot_longer(c(RMSE, MAE), names_to="metric", values_to="value") %>%
  ggplot(aes(reorder(model, value), value, fill=model)) +
  geom_col(alpha=0.9, show.legend=FALSE) +
  facet_wrap(~metric, scales="free_x") +
  coord_flip() +
  scale_fill_viridis_d(end=0.9) +
  labs(title="Agriculture: ML Model Comparison (Rolling CV)", x=NULL, y="Error") +
  theme_minimal(base_size=13)

save_plot(p_metsA, "agri_ml_metric_comparison", width=9, height=4.8)
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_ml_metric_comparison.png
p_metsA
Figure M2. ML model comparison under rolling-origin evaluation. Lower values indicate better predictive accuracy.

Figure M2. ML model comparison under rolling-origin evaluation. Lower values indicate better predictive accuracy.

7.6 5.6 Feature importance (RF + XGB)

# Feature importance (RF + XGB) using LAST trained model
last_rf <- rf_fits[[length(rf_fits)]]
rf_imp <- ranger::importance(last_rf) %>%
  tibble::enframe(name="feature", value="importance") %>%
  dplyr::arrange(desc(importance))

save_table(rf_imp, "agri_rf_feature_importance")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_rf_feature_importance.csv 
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_rf_feature_importance.html
p_rfimp <- rf_imp %>%
  ggplot(aes(reorder(feature, importance), importance, fill=feature)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_viridis_d(end=0.9) +
  labs(title="Agriculture: Random Forest Feature Importance", x=NULL, y="Permutation importance") +
  theme_minimal(base_size=13)

save_plot(p_rfimp, "agri_rf_importance", width=8.5, height=4.8)
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_rf_importance.png
last_xgb <- xgb_fits[[length(xgb_fits)]]
xgb_imp <- xgboost::xgb.importance(model = last_xgb) %>%
  as_tibble() %>%
  dplyr::select(Feature, Gain, Cover, Frequency) %>%
  dplyr::arrange(desc(Gain))

save_table(xgb_imp, "agri_xgb_feature_importance")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_xgb_feature_importance.csv 
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_xgb_feature_importance.html
p_ximp <- xgb_imp %>%
  slice_head(n=10) %>%
  ggplot(aes(reorder(Feature, Gain), Gain, fill=Feature)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_viridis_d(end=0.9) +
  labs(title="Agriculture: XGBoost Feature Importance (Top 10)", x=NULL, y="Gain") +
  theme_minimal(base_size=13)

save_plot(p_ximp, "agri_xgb_importance_top10", width=8.5, height=4.8)
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_xgb_importance_top10.png
# Print in-document (top rows)
rf_imp %>% head(10) %>%
  knitr::kable(caption = "Table M3. Random Forest permutation importance (top 10).") %>%
  kableExtra::kable_styling(full_width = FALSE)
Table M3. Random Forest permutation importance (top 10).
feature importance
lag5 393.2981
lag4 386.2044
lag1 345.2860
lag6 337.7214
lag3 245.0324
lag2 243.1737
xgb_imp %>% head(10) %>%
  knitr::kable(caption = "Table M4. XGBoost importance by gain (top 10).") %>%
  kableExtra::kable_styling(full_width = FALSE)
Table M4. XGBoost importance by gain (top 10).
Feature Gain Cover Frequency
lag1 0.3257660 0.2145725 0.2119879
lag5 0.2602731 0.1705030 0.1542480
lag4 0.1989658 0.1416740 0.1429750
lag2 0.1020557 0.1367274 0.1630465
lag3 0.0614475 0.1761518 0.1770690
lag6 0.0514919 0.1603712 0.1506736

7.7 5.7 Distribution-free conformal prediction intervals (rolling calibration)

rolling_conformal_from_preds <- function(df_pred, pred_col = "xgb", alpha = 0.1, calib = 20) {
  # df_pred must have columns: t, y, pred_col
  df <- df_pred %>% dplyr::select(t, y, pred = all_of(pred_col)) %>% dplyr::arrange(t)
  n <- nrow(df)
  lo <- hi <- rep(NA_real_, n)

  for (i in seq_len(n)) {
    if (i <= calib) next
    cal_idx <- (i - calib):(i - 1)
    cal_res <- abs(df$y[cal_idx] - df$pred[cal_idx])
    q <- as.numeric(stats::quantile(cal_res, probs = 1 - alpha, na.rm = TRUE, type = 8))
    lo[i] <- df$pred[i] - q
    hi[i] <- df$pred[i] + q
  }

  out <- df %>% mutate(lo = lo, hi = hi) %>% tidyr::drop_na(lo, hi)
  out
}

# Choose best ML model for conformal
bestA <- agri_ml_metrics$model[1]
best_col <- if(grepl("XGBoost", bestA)) "xgb" else "rf"
log_line("Agriculture conformal using: ", bestA)
## Agriculture conformal using: Random Forest (ranger)
agri_conf <- rolling_conformal_from_preds(predA_eval, pred_col = best_col, alpha = 0.1, calib = 20)

covA <- mean(agri_conf$y >= agri_conf$lo & agri_conf$y <= agri_conf$hi)
widthA <- mean(agri_conf$hi - agri_conf$lo)

save_table(
  tibble::tibble(
    model = bestA,
    alpha = 0.10,
    target_coverage = 0.90,
    empirical_coverage = covA,
    avg_interval_width = widthA
  ),
  "agri_conformal_summary_manual"
)
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_conformal_summary_manual.csv 
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_conformal_summary_manual.html
(tibble::tibble(
  model = bestA,
  alpha = 0.10,
  target_coverage = 0.90,
  empirical_coverage = covA,
  avg_interval_width = widthA
)) %>%
  knitr::kable(caption = "Table C-ML1. Conformal interval summary under rolling calibration.") %>%
  kableExtra::kable_styling(full_width = FALSE)
Table C-ML1. Conformal interval summary under rolling calibration.
model alpha target_coverage empirical_coverage avg_interval_width
Random Forest (ranger) 0.1 0.9 0.8275862 77.25496
p_confA <- ggplot(agri_conf, aes(t, y)) +
  geom_ribbon(aes(ymin = lo, ymax = hi), fill="#6a3d9a", alpha=0.18) +
  geom_line(aes(y = pred), color="#e31a1c", linewidth=1.0) +
  geom_line(color="#1f78b4", linewidth=0.9) +
  labs(
    title="Agriculture: Distribution-Free Conformal Prediction Intervals",
    subtitle=paste0(bestA, " | rolling calibration | target coverage 90% (empirical = ", round(covA, 3), ")"),
    x="Time index", y="Yield"
  ) +
  theme_minimal(base_size=13)

save_plot(p_confA, "agri_conformal_intervals_manual")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_conformal_intervals_manual.png
p_confA
Figure C-ML1. Distribution-free conformal prediction intervals for 1-step-ahead forecasts (rolling calibration). The purple band targets 90% marginal coverage; empirical coverage is reported in the subtitle.

Figure C-ML1. Distribution-free conformal prediction intervals for 1-step-ahead forecasts (rolling calibration). The purple band targets 90% marginal coverage; empirical coverage is reported in the subtitle.

log_line("ADD-ON completed: ML forecasting + conformal intervals (manual, robust).")
## ADD-ON completed: ML forecasting + conformal intervals (manual, robust).
cat("\nADD-ON completed. Check outputs under:\n", ROOT, "\n")
## 
## ADD-ON completed. Check outputs under:
##  DynamicSystems_RPubs_Run_20260206_021745