agridat::nass.corn)
This RPubs-ready R Markdown note demonstrates dynamic-systems thinking through three representative regimes:
A key design goal is reproducibility: the script auto-creates a timestamped run folder and saves all figures and tables (CSV + HTML) automatically.
# ============================================================
# Dynamic Systems RPubs: End-to-End Reproducible Pipeline
# Saves all plots/tables automatically into a timestamped folder
# ============================================================
# ---------- Packages ----------
pkgs <- c(
"tidyverse","lubridate","fs","scales",
"patchwork","viridis","knitr","kableExtra",
"forecast","agridat","zoo"
)
to_install <- pkgs[!pkgs %in% installed.packages()[,"Package"]]
if(length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(pkgs, library, character.only = TRUE))
# ---------- Create a timestamped run folder ----------
run_id <- format(Sys.time(), "%Y%m%d_%H%M%S")
ROOT <- paste0("DynamicSystems_RPubs_Run_", run_id)
dirs <- c("data","figs","tables","models","logs")
purrr::walk(dirs, ~fs::dir_create(fs::path(ROOT, .x)))
# ---------- Logging (NO glue evaluation) ----------
log_file <- fs::path(ROOT, "logs", "run_log.txt")
log_line <- function(...) {
msg <- paste0(..., collapse = "")
cat(msg, "\n")
cat(msg, "\n", file = log_file, append = TRUE)
}
# ---------- Plot saver ----------
save_plot <- function(p, name, width = 9, height = 5, dpi = 300) {
out <- fs::path(ROOT, "figs", paste0(name, ".png"))
ggplot2::ggsave(out, plot = p, width = width, height = height, dpi = dpi)
log_line("Saved plot: ", out)
invisible(out)
}
# ---------- Table saver (CSV + HTML) ----------
save_table <- function(df, name, digits = 3) {
csv_path <- fs::path(ROOT, "tables", paste0(name, ".csv"))
html_path <- fs::path(ROOT, "tables", paste0(name, ".html"))
readr::write_csv(df, csv_path)
html <- df %>%
knitr::kable(format = "html", digits = digits, escape = TRUE) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped","hover","condensed","responsive"),
full_width = FALSE, position = "left"
)
kableExtra::save_kable(html, file = html_path)
log_line("Saved table: ", csv_path)
log_line("Saved table: ", html_path)
invisible(list(csv = csv_path, html = html_path))
}
set.seed(123)
log_line("Run folder: ", ROOT)
## Run folder: DynamicSystems_RPubs_Run_20260206_021745
ROOT
## [1] "DynamicSystems_RPubs_Run_20260206_021745"
agridat::nass.corn)We start with a real agricultural dataset and construct a clean annual time series for Illinois, obtained by averaging yields within year.
data("nass.corn", package = "agridat")
corn <- agridat::nass.corn %>%
tibble::as_tibble() %>%
mutate(year = as.integer(year)) %>%
filter(!is.na(yield))
log_line("corn rows: ", nrow(corn))
## corn rows: 6381
# Pick one state to create a clean dynamic series
corn_il <- corn %>%
filter(state == "Illinois") %>%
group_by(year) %>%
summarise(yield = mean(yield, na.rm = TRUE), .groups = "drop") %>%
arrange(year)
save_table(corn_il, "agri_corn_il_series")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_corn_il_series.csv
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_corn_il_series.html
corn_il %>%
head(12) %>%
knitr::kable(caption = "Table A1. First 12 rows of the Illinois annual corn-yield series (yearly mean yield).") %>%
kableExtra::kable_styling(full_width = FALSE)
| year | yield |
|---|---|
| 1866 | 29.0 |
| 1867 | 27.0 |
| 1868 | 34.2 |
| 1869 | 23.5 |
| 1870 | 40.0 |
| 1871 | 33.0 |
| 1872 | 39.8 |
| 1873 | 21.0 |
| 1874 | 24.0 |
| 1875 | 34.3 |
| 1876 | 30.0 |
| 1877 | 27.0 |
p1 <- ggplot(corn_il, aes(year, yield)) +
geom_line(linewidth = 1.1, color = "#2c7fb8") +
geom_point(size = 2.2, color = "#f03b20") +
scale_x_continuous(breaks = scales::pretty_breaks(10)) +
scale_y_continuous(labels = scales::comma) +
labs(
title = "Agriculture: Illinois Corn Yield as a Dynamic System",
subtitle = "Yearly average yield (county-aggregated)",
x = "Year", y = "Yield"
) +
theme_minimal(base_size = 13)
save_plot(p1, "agri_corn_il_timeseries")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_corn_il_timeseries.png
p1
Figure A1. Illinois corn yield as a dynamic system. The annual series captures long-run variation and potential shocks typical of agricultural production.
ARIMA provides a transparent statistical baseline and produces prediction intervals that summarize forecast uncertainty.
y_ts <- ts(corn_il$yield, start = min(corn_il$year), frequency = 1)
fit_arima <- forecast::auto.arima(y_ts)
fc <- forecast::forecast(fit_arima, h = 8)
df_fc <- tibble(
year = (max(corn_il$year) + 1):(max(corn_il$year) + 8),
mean = as.numeric(fc$mean),
lo80 = as.numeric(fc$lower[, "80%"]),
hi80 = as.numeric(fc$upper[, "80%"]),
lo95 = as.numeric(fc$lower[, "95%"]),
hi95 = as.numeric(fc$upper[, "95%"])
)
save_table(df_fc, "agri_arima_forecast_table")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_arima_forecast_table.csv
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_arima_forecast_table.html
df_fc %>%
knitr::kable(caption = "Table A2. ARIMA 8-step forecasts with 80% and 95% prediction intervals.") %>%
kableExtra::kable_styling(full_width = FALSE)
| year | mean | lo80 | hi80 | lo95 | hi95 |
|---|---|---|---|---|---|
| 2012 | 176.7126 | 162.3151 | 191.1101 | 154.6935 | 198.7316 |
| 2013 | 175.0607 | 160.6561 | 189.4653 | 153.0308 | 197.0907 |
| 2014 | 174.3813 | 159.8069 | 188.9557 | 152.0916 | 196.6709 |
| 2015 | 174.3110 | 159.3623 | 189.2596 | 151.4489 | 197.1730 |
| 2016 | 174.6222 | 159.1433 | 190.1011 | 150.9492 | 198.2952 |
| 2017 | 175.1725 | 159.0664 | 191.2785 | 150.5404 | 199.8045 |
| 2018 | 175.8725 | 159.0883 | 192.6566 | 150.2034 | 201.5415 |
| 2019 | 176.6662 | 159.1835 | 194.1489 | 149.9288 | 203.4037 |
p2 <- ggplot() +
geom_line(data = corn_il, aes(year, yield), linewidth = 1.05, color = "#1b9e77") +
geom_ribbon(data = df_fc, aes(year, ymin = lo95, ymax = hi95),
fill = "#7570b3", alpha = 0.18) +
geom_ribbon(data = df_fc, aes(year, ymin = lo80, ymax = hi80),
fill = "#7570b3", alpha = 0.30) +
geom_line(data = df_fc, aes(year, mean), linewidth = 1.1, color = "#d95f02") +
geom_point(data = df_fc, aes(year, mean), size = 2.2, color = "#d95f02") +
labs(
title = "Agriculture: ARIMA Forecast with Uncertainty Bands",
subtitle = "80% and 95% prediction intervals",
x = "Year", y = "Yield"
) +
theme_minimal(base_size = 13)
save_plot(p2, "agri_arima_forecast_bands")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_arima_forecast_bands.png
p2
Figure A2. ARIMA forecast for Illinois yield with 80% and 95% prediction intervals. Interval width increases with horizon, reflecting escalating uncertainty.
To mimic biological surveillance streams, we simulate a Poisson count process driven by a seasonal baseline intensity and a transient outbreak regime.
n <- 200
t <- 1:n
lambda <- 18 + 6*sin(2*pi*t/52) + ifelse(t > 120 & t < 150, 20, 0)
y <- rpois(n, pmax(lambda, 0.1))
bio <- tibble(t = t, y = y, lambda = lambda)
save_table(bio %>% summarise(n=n(), mean=mean(y), var=var(y)),
"bio_basic_stats")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/bio_basic_stats.csv
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/bio_basic_stats.html
(bio %>% summarise(n=n(), mean=mean(y), var=var(y))) %>%
knitr::kable(caption = "Table B1. Basic summary statistics for the simulated count series.") %>%
kableExtra::kable_styling(full_width = FALSE)
| n | mean | var |
|---|---|---|
| 200 | 20.91 | 70.51447 |
p3 <- ggplot(bio, aes(t, y)) +
geom_line(linewidth = 1.0, color = "#377eb8") +
geom_point(size = 1.6, color = "#ff7f00", alpha = 0.85) +
geom_line(aes(y = lambda), linewidth = 1.0, linetype = 2, color = "#4daf4a") +
labs(
title = "Biology: Count Dynamics (Seasonality + Outbreak Regime)",
subtitle = "Dashed line = latent intensity (simulated truth)",
x = "Time index", y = "Count"
) +
theme_minimal(base_size = 13)
save_plot(p3, "bio_counts_outbreak")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/bio_counts_outbreak.png
p3
Figure B1. Biology-inspired count dynamics with seasonality and an outbreak regime. Points show observed counts; the dashed curve shows the latent intensity used for simulation.
We compute rolling mean and SD using a right-aligned window (past-only), producing a time-respecting anomaly score.
bio2 <- bio %>%
mutate(
roll_mean = zoo::rollapply(y, width = 12, FUN = mean, fill = NA, align = "right"),
roll_sd = zoo::rollapply(y, width = 12, FUN = sd, fill = NA, align = "right"),
z = (y - roll_mean) / roll_sd
)
top_anoms <- bio2 %>%
filter(!is.na(z)) %>%
mutate(absz = abs(z)) %>%
arrange(desc(absz)) %>%
slice(1:12)
save_table(top_anoms, "bio_top_anomalies")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/bio_top_anomalies.csv
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/bio_top_anomalies.html
top_anoms %>%
knitr::kable(caption = "Table B2. Top 12 candidate anomalies ranked by absolute rolling z-score.") %>%
kableExtra::kable_styling(full_width = FALSE)
| t | y | lambda | roll_mean | roll_sd | z | absz |
|---|---|---|---|---|---|---|
| 122 | 58 | 42.93790 | 29.41667 | 11.106577 | 2.573550 | 2.573550 |
| 163 | 32 | 22.49106 | 18.25000 | 5.412528 | 2.540403 | 2.540403 |
| 198 | 24 | 12.38990 | 13.75000 | 4.287932 | 2.390430 | 2.390430 |
| 54 | 28 | 19.43589 | 16.08333 | 5.212892 | 2.285999 | 2.285999 |
| 178 | 33 | 20.78834 | 24.33333 | 4.007569 | 2.162575 | 2.162575 |
| 107 | 28 | 20.12763 | 17.00000 | 5.257030 | 2.092436 | 2.092436 |
| 150 | 13 | 14.02126 | 29.83333 | 8.054737 | -2.089867 | 2.089867 |
| 46 | 19 | 14.02126 | 12.00000 | 3.437758 | 2.036211 | 2.036211 |
| 190 | 8 | 13.06210 | 16.25000 | 4.114829 | -2.004944 | 2.004944 |
| 48 | 21 | 15.21166 | 12.75000 | 4.180583 | 1.973409 | 1.973409 |
| 78 | 15 | 18.00000 | 20.91667 | 3.117643 | -1.897801 | 1.897801 |
| 76 | 17 | 19.43589 | 21.66667 | 2.461830 | -1.895609 | 1.895609 |
p4 <- ggplot(bio2, aes(t, z)) +
geom_hline(yintercept = c(-3, 3), linetype = 2, color = "#e41a1c") +
geom_line(linewidth = 1.05, color = "#984ea3") +
labs(
title = "Biology: Rolling Anomaly Score (z)",
subtitle = "Rule-of-thumb flags when |z| > 3",
x = "Time index", y = "Rolling z-score"
) +
theme_minimal(base_size = 13)
save_plot(p4, "bio_rolling_anomaly_z")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/bio_rolling_anomaly_z.png
p4
Figure B2. Rolling anomaly z-score for the biological count series. Dashed lines at ±3 provide a practical screening threshold; exceedances tend to cluster around the outbreak interval.
Astronomical time series often arrive with irregular sampling and heteroskedastic errors. We simulate a quasi-periodic signal with a transient flare and noise SD that varies over time.
set.seed(7)
n <- 350
time <- sort(runif(n, 0, 200))
signal <- sin(2*pi*time/18) + 0.5*cos(2*pi*time/40)
flare <- ifelse(time > 120 & time < 135, 3*exp(-(time-120)/4), 0)
true_flux <- signal + flare
noise_sd <- 0.15 + 0.25*(sin(time/30)^2)
flux <- true_flux + rnorm(n, 0, noise_sd)
astro <- tibble(time = time, flux = flux, true = true_flux, sd = noise_sd)
save_table(astro %>% summarise(n=n(), mean_flux=mean(flux), sd_flux=sd(flux)),
"astro_basic_stats")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/astro_basic_stats.csv
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/astro_basic_stats.html
(astro %>% summarise(n=n(), mean_flux=mean(flux), sd_flux=sd(flux))) %>%
knitr::kable(caption = "Table C1. Basic summary statistics for the simulated astronomy-like series.") %>%
kableExtra::kable_styling(full_width = FALSE)
| n | mean_flux | sd_flux |
|---|---|---|
| 350 | 0.062857 | 0.8771605 |
p5 <- ggplot(astro, aes(time, flux, color = sd)) +
geom_point(size = 1.9, alpha = 0.88) +
scale_color_viridis_c(option = "C", end = 0.95) +
labs(
title = "Astronomy-like: Irregular Time Series with Heteroskedastic Noise",
subtitle = "Color encodes measurement noise standard deviation",
x = "Time", y = "Observed flux", color = "Noise SD"
) +
theme_minimal(base_size = 13)
save_plot(p5, "astro_lightcurve_scatter")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/astro_lightcurve_scatter.png
p5
Figure C1. Irregularly sampled light curve with heteroskedastic measurement noise. Point color encodes the observation-specific noise standard deviation.
We use LOESS smoothing to estimate a smooth trend and then rank the largest positive residuals as flare candidates.
fit_loess <- loess(flux ~ time, data = astro, span = 0.12)
astro2 <- astro %>%
mutate(fit = predict(fit_loess), resid = flux - fit)
flare_candidates <- astro2 %>%
arrange(desc(resid)) %>%
slice(1:15)
save_table(flare_candidates, "astro_top_residuals_flare_candidates")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/astro_top_residuals_flare_candidates.csv
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/astro_top_residuals_flare_candidates.html
flare_candidates %>%
knitr::kable(caption = "Table C2. Top 15 flare candidates ranked by LOESS residual (largest positive residuals).") %>%
kableExtra::kable_styling(full_width = FALSE)
| time | flux | true | sd | fit | resid |
|---|---|---|---|---|---|
| 120.238306 | 3.0451558 | 2.4215646 | 0.2951499 | 1.3119860 | 1.7331698 |
| 60.769103 | 1.1733900 | 0.2060203 | 0.3517497 | 0.0902021 | 1.0831879 |
| 7.411651 | 1.0858600 | 0.7241998 | 0.1649511 | 0.0233301 | 1.0625299 |
| 57.743578 | 1.1466900 | 0.4964224 | 0.3699597 | 0.1300438 | 1.0166462 |
| 40.531790 | 2.0890557 | 1.4981950 | 0.3881220 | 1.1227816 | 0.9662741 |
| 21.432380 | 0.7569778 | 0.4439613 | 0.2573132 | -0.1018896 | 0.8588674 |
| 8.542348 | 0.6675341 | 0.2725580 | 0.1697280 | -0.1739655 | 0.8414996 |
| 112.552681 | 1.7225488 | 1.1949887 | 0.2320826 | 0.9079535 | 0.8145954 |
| 5.550041 | 1.1604567 | 1.2553008 | 0.1584592 | 0.4268232 | 0.7336335 |
| 121.265413 | 1.9584974 | 1.6799115 | 0.3035433 | 1.2464270 | 0.7120704 |
| 145.915160 | 0.9382812 | 0.3205657 | 0.3943095 | 0.2663556 | 0.6719256 |
| 37.227352 | 1.3330067 | 0.8687643 | 0.3737667 | 0.6615852 | 0.6714215 |
| 128.649511 | 1.5737568 | 1.2489599 | 0.3576721 | 0.9034660 | 0.6702908 |
| 153.412144 | 0.7181279 | 0.1119715 | 0.3618461 | 0.0838860 | 0.6342419 |
| 125.833728 | 1.6994392 | 0.9441510 | 0.3387230 | 1.0672949 | 0.6321443 |
p6 <- ggplot(astro2, aes(time, flux)) +
geom_point(size = 1.4, alpha = 0.55, color = "#1f78b4") +
geom_line(aes(y = fit), linewidth = 1.2, color = "#e31a1c") +
labs(
title = "Astronomy-like: Trend Extraction under Irregular Sampling",
subtitle = "LOESS fit overlays the noisy observations",
x = "Time", y = "Flux"
) +
theme_minimal(base_size = 13)
save_plot(p6, "astro_loess_fit")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/astro_loess_fit.png
p6
Figure C2. LOESS trend extraction under irregular sampling. The fitted smooth trend (red) overlays noisy observations; large positive residuals can indicate transient events (e.g., flares).
key_results <- tibble(
CaseStudy = c("Agriculture (nass.corn)", "Biology (counts)", "Astronomy-like (light curve)"),
DynamicFeature = c("Trend + long memory + shocks", "Seasonality + outbreak regime shift", "Irregular sampling + heteroskedastic noise + flare"),
BaselineMethod = c("ARIMA + prediction intervals", "Rolling anomaly z-score", "LOESS smoothing + residual screening"),
SavedOutputs = c("Time plot + forecast bands + tables", "Dynamics plot + anomaly plot + tables", "Scatter + LOESS + tables")
)
save_table(key_results, "key_results_summary")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/key_results_summary.csv
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/key_results_summary.html
key_results %>%
knitr::kable(caption = "Table K1. Summary of dynamic regimes, baseline methods, and saved outputs.") %>%
kableExtra::kable_styling(full_width = FALSE)
| CaseStudy | DynamicFeature | BaselineMethod | SavedOutputs |
|---|---|---|---|
| Agriculture (nass.corn) | Trend + long memory + shocks | ARIMA + prediction intervals | Time plot + forecast bands + tables |
| Biology (counts) | Seasonality + outbreak regime shift | Rolling anomaly z-score | Dynamics plot + anomaly plot + tables |
| Astronomy-like (light curve) | Irregular sampling + heteroskedastic noise + flare | LOESS smoothing + residual screening | Scatter + LOESS + tables |
log_line("DONE. All outputs saved under: ", ROOT)
## DONE. All outputs saved under: DynamicSystems_RPubs_Run_20260206_021745
cat("\nDONE. Outputs folder:\n", ROOT, "\n")
##
## DONE. Outputs folder:
## DynamicSystems_RPubs_Run_20260206_021745
This add-on avoids tidymodels entirely to remain robust
to version mismatches (e.g., rlang). We construct lag
features and evaluate Random Forest vs
XGBoost using time-respecting rolling-origin
evaluation. Finally, we produce distribution-free conformal
intervals for 1-step-ahead predictions.
ml_pkgs2 <- c("ranger","xgboost")
to_install_ml <- ml_pkgs2[!ml_pkgs2 %in% installed.packages()[,"Package"]]
if(length(to_install_ml)) install.packages(to_install_ml, dependencies = TRUE)
invisible(lapply(ml_pkgs2, library, character.only = TRUE))
set.seed(123)
make_lags_df <- function(df, y = "y", time = "t", lags = 1:6) {
df <- df[order(df[[time]]), , drop = FALSE]
for (L in lags) df[[paste0("lag", L)]] <- dplyr::lag(df[[y]], L)
df <- tidyr::drop_na(df)
df
}
rmse <- function(truth, pred) sqrt(mean((truth - pred)^2, na.rm = TRUE))
mae <- function(truth, pred) mean(abs(truth - pred), na.rm = TRUE)
rolling_origin_slices <- function(n, initial, assess = 1, skip = 1) {
# returns list of list(train_idx, test_idx)
stopifnot(initial >= 5, assess >= 1, n > initial + assess)
out <- list()
k <- 1
start_test <- initial + 1
while ((start_test + assess - 1) <= n) {
train_idx <- 1:(start_test - 1)
test_idx <- start_test:(start_test + assess - 1)
out[[k]] <- list(train = train_idx, test = test_idx)
k <- k + 1
start_test <- start_test + skip
}
out
}
agri_ml <- corn_il %>%
dplyr::transmute(year = year, y = yield) %>%
dplyr::mutate(t = dplyr::row_number())
agri_lag <- make_lags_df(agri_ml, y = "y", time = "t", lags = 1:6)
save_table(head(agri_lag, 12), "agri_ml_lagged_head_v2")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_ml_lagged_head_v2.csv
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_ml_lagged_head_v2.html
# Rolling CV settings (series is short, be conservative)
nA <- nrow(agri_lag)
initialA <- max(12, floor(0.65 * nA))
assessA <- 1
skipA <- 1
slicesA <- rolling_origin_slices(nA, initial = initialA, assess = assessA, skip = skipA)
log_line("Agriculture rolling slices (manual): ", length(slicesA))
## Agriculture rolling slices (manual): 49
X_cols <- grep("^lag", names(agri_lag), value = TRUE)
head(agri_lag, 12) %>%
knitr::kable(caption = "Table M1. First 12 rows of the lag-feature dataset used for ML forecasting.") %>%
kableExtra::kable_styling(full_width = FALSE)
| year | y | t | lag1 | lag2 | lag3 | lag4 | lag5 | lag6 |
|---|---|---|---|---|---|---|---|---|
| 1872 | 39.8 | 7 | 33.0 | 40.0 | 23.5 | 34.2 | 27.0 | 29.0 |
| 1873 | 21.0 | 8 | 39.8 | 33.0 | 40.0 | 23.5 | 34.2 | 27.0 |
| 1874 | 24.0 | 9 | 21.0 | 39.8 | 33.0 | 40.0 | 23.5 | 34.2 |
| 1875 | 34.3 | 10 | 24.0 | 21.0 | 39.8 | 33.0 | 40.0 | 23.5 |
| 1876 | 30.0 | 11 | 34.3 | 24.0 | 21.0 | 39.8 | 33.0 | 40.0 |
| 1877 | 27.0 | 12 | 30.0 | 34.3 | 24.0 | 21.0 | 39.8 | 33.0 |
| 1878 | 29.0 | 13 | 27.0 | 30.0 | 34.3 | 24.0 | 21.0 | 39.8 |
| 1879 | 36.1 | 14 | 29.0 | 27.0 | 30.0 | 34.3 | 24.0 | 21.0 |
| 1880 | 31.0 | 15 | 36.1 | 29.0 | 27.0 | 30.0 | 34.3 | 24.0 |
| 1881 | 21.0 | 16 | 31.0 | 36.1 | 29.0 | 27.0 | 30.0 | 34.3 |
| 1882 | 25.0 | 17 | 21.0 | 31.0 | 36.1 | 29.0 | 27.0 | 30.0 |
| 1883 | 27.3 | 18 | 25.0 | 21.0 | 31.0 | 36.1 | 29.0 | 27.0 |
# Model 1: Random Forest (ranger)
rf_predict_one <- function(train_df, test_df) {
fit <- ranger::ranger(
formula = as.formula(paste("y ~", paste(X_cols, collapse = " + "))),
data = train_df,
num.trees = 800,
mtry = max(1, floor(sqrt(length(X_cols)))),
min.node.size = 3,
importance = "permutation",
seed = 123
)
pred <- predict(fit, data = test_df)$predictions
list(pred = as.numeric(pred), fit = fit)
}
# Model 2: XGBoost (xgboost)
xgb_fit_predict <- function(train_df, test_df, params = list()) {
dtrain <- xgboost::xgb.DMatrix(
data = as.matrix(train_df[, X_cols, drop=FALSE]),
label = train_df$y
)
dtest <- xgboost::xgb.DMatrix(
data = as.matrix(test_df[, X_cols, drop=FALSE])
)
# Sensible defaults (stable for small series)
p <- modifyList(list(
objective = "reg:squarederror",
eval_metric = "rmse",
max_depth = 3,
eta = 0.05,
subsample = 0.9,
colsample_bytree = 0.9,
min_child_weight = 1
), params)
fit <- xgboost::xgb.train(
params = p,
data = dtrain,
nrounds = 600,
verbose = 0
)
pred <- predict(fit, dtest)
list(pred = as.numeric(pred), fit = fit)
}
predA <- tibble::tibble(
t = agri_lag$t,
y = agri_lag$y,
rf = NA_real_,
xgb = NA_real_
)
rf_fits <- list()
xgb_fits <- list()
for (i in seq_along(slicesA)) {
tr <- slicesA[[i]]$train
te <- slicesA[[i]]$test
train_df <- agri_lag[tr, , drop=FALSE]
test_df <- agri_lag[te, , drop=FALSE]
rf_out <- rf_predict_one(train_df, test_df)
xg_out <- xgb_fit_predict(train_df, test_df)
predA$rf[te] <- rf_out$pred
predA$xgb[te] <- xg_out$pred
# store last fits (useful for importance plotting)
rf_fits[[i]] <- rf_out$fit
xgb_fits[[i]] <- xg_out$fit
}
predA_eval <- predA %>% tidyr::drop_na(rf, xgb)
agri_ml_metrics <- tibble::tibble(
model = c("Random Forest (ranger)", "XGBoost"),
RMSE = c(rmse(predA_eval$y, predA_eval$rf), rmse(predA_eval$y, predA_eval$xgb)),
MAE = c(mae(predA_eval$y, predA_eval$rf), mae(predA_eval$y, predA_eval$xgb))
) %>% dplyr::arrange(RMSE)
save_table(agri_ml_metrics, "agri_ml_metrics_manual_cv")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_ml_metrics_manual_cv.csv
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_ml_metrics_manual_cv.html
log_line("Agriculture ML metrics saved.")
## Agriculture ML metrics saved.
agri_ml_metrics %>%
knitr::kable(caption = "Table M2. Rolling-origin cross-validation metrics (1-step ahead).") %>%
kableExtra::kable_styling(full_width = FALSE)
| model | RMSE | MAE |
|---|---|---|
| Random Forest (ranger) | 20.21592 | 16.43237 |
| XGBoost | 23.92128 | 18.49081 |
p_mlA <- ggplot(predA_eval, aes(t, y)) +
geom_line(color="#1f78b4", linewidth=0.9) +
geom_line(aes(y = rf), color="#33a02c", linewidth=0.9) +
geom_line(aes(y = xgb), color="#e31a1c", linewidth=0.9) +
labs(
title="Agriculture: Rolling-Origin ML Forecasts (1-step ahead)",
subtitle="Blue: observed | Green: RF | Red: XGBoost (time-respecting CV)",
x="Time index", y="Yield"
) +
theme_minimal(base_size=13)
save_plot(p_mlA, "agri_ml_rolling_forecasts")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_ml_rolling_forecasts.png
p_mlA
Figure M1. Rolling-origin 1-step-ahead ML forecasts (time-respecting). Blue: observed; green: Random Forest; red: XGBoost. Predictions are generated only from past information.
p_metsA <- agri_ml_metrics %>%
tidyr::pivot_longer(c(RMSE, MAE), names_to="metric", values_to="value") %>%
ggplot(aes(reorder(model, value), value, fill=model)) +
geom_col(alpha=0.9, show.legend=FALSE) +
facet_wrap(~metric, scales="free_x") +
coord_flip() +
scale_fill_viridis_d(end=0.9) +
labs(title="Agriculture: ML Model Comparison (Rolling CV)", x=NULL, y="Error") +
theme_minimal(base_size=13)
save_plot(p_metsA, "agri_ml_metric_comparison", width=9, height=4.8)
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_ml_metric_comparison.png
p_metsA
Figure M2. ML model comparison under rolling-origin evaluation. Lower values indicate better predictive accuracy.
# Feature importance (RF + XGB) using LAST trained model
last_rf <- rf_fits[[length(rf_fits)]]
rf_imp <- ranger::importance(last_rf) %>%
tibble::enframe(name="feature", value="importance") %>%
dplyr::arrange(desc(importance))
save_table(rf_imp, "agri_rf_feature_importance")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_rf_feature_importance.csv
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_rf_feature_importance.html
p_rfimp <- rf_imp %>%
ggplot(aes(reorder(feature, importance), importance, fill=feature)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_fill_viridis_d(end=0.9) +
labs(title="Agriculture: Random Forest Feature Importance", x=NULL, y="Permutation importance") +
theme_minimal(base_size=13)
save_plot(p_rfimp, "agri_rf_importance", width=8.5, height=4.8)
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_rf_importance.png
last_xgb <- xgb_fits[[length(xgb_fits)]]
xgb_imp <- xgboost::xgb.importance(model = last_xgb) %>%
as_tibble() %>%
dplyr::select(Feature, Gain, Cover, Frequency) %>%
dplyr::arrange(desc(Gain))
save_table(xgb_imp, "agri_xgb_feature_importance")
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_xgb_feature_importance.csv
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_xgb_feature_importance.html
p_ximp <- xgb_imp %>%
slice_head(n=10) %>%
ggplot(aes(reorder(Feature, Gain), Gain, fill=Feature)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_fill_viridis_d(end=0.9) +
labs(title="Agriculture: XGBoost Feature Importance (Top 10)", x=NULL, y="Gain") +
theme_minimal(base_size=13)
save_plot(p_ximp, "agri_xgb_importance_top10", width=8.5, height=4.8)
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_xgb_importance_top10.png
# Print in-document (top rows)
rf_imp %>% head(10) %>%
knitr::kable(caption = "Table M3. Random Forest permutation importance (top 10).") %>%
kableExtra::kable_styling(full_width = FALSE)
| feature | importance |
|---|---|
| lag5 | 393.2981 |
| lag4 | 386.2044 |
| lag1 | 345.2860 |
| lag6 | 337.7214 |
| lag3 | 245.0324 |
| lag2 | 243.1737 |
xgb_imp %>% head(10) %>%
knitr::kable(caption = "Table M4. XGBoost importance by gain (top 10).") %>%
kableExtra::kable_styling(full_width = FALSE)
| Feature | Gain | Cover | Frequency |
|---|---|---|---|
| lag1 | 0.3257660 | 0.2145725 | 0.2119879 |
| lag5 | 0.2602731 | 0.1705030 | 0.1542480 |
| lag4 | 0.1989658 | 0.1416740 | 0.1429750 |
| lag2 | 0.1020557 | 0.1367274 | 0.1630465 |
| lag3 | 0.0614475 | 0.1761518 | 0.1770690 |
| lag6 | 0.0514919 | 0.1603712 | 0.1506736 |
rolling_conformal_from_preds <- function(df_pred, pred_col = "xgb", alpha = 0.1, calib = 20) {
# df_pred must have columns: t, y, pred_col
df <- df_pred %>% dplyr::select(t, y, pred = all_of(pred_col)) %>% dplyr::arrange(t)
n <- nrow(df)
lo <- hi <- rep(NA_real_, n)
for (i in seq_len(n)) {
if (i <= calib) next
cal_idx <- (i - calib):(i - 1)
cal_res <- abs(df$y[cal_idx] - df$pred[cal_idx])
q <- as.numeric(stats::quantile(cal_res, probs = 1 - alpha, na.rm = TRUE, type = 8))
lo[i] <- df$pred[i] - q
hi[i] <- df$pred[i] + q
}
out <- df %>% mutate(lo = lo, hi = hi) %>% tidyr::drop_na(lo, hi)
out
}
# Choose best ML model for conformal
bestA <- agri_ml_metrics$model[1]
best_col <- if(grepl("XGBoost", bestA)) "xgb" else "rf"
log_line("Agriculture conformal using: ", bestA)
## Agriculture conformal using: Random Forest (ranger)
agri_conf <- rolling_conformal_from_preds(predA_eval, pred_col = best_col, alpha = 0.1, calib = 20)
covA <- mean(agri_conf$y >= agri_conf$lo & agri_conf$y <= agri_conf$hi)
widthA <- mean(agri_conf$hi - agri_conf$lo)
save_table(
tibble::tibble(
model = bestA,
alpha = 0.10,
target_coverage = 0.90,
empirical_coverage = covA,
avg_interval_width = widthA
),
"agri_conformal_summary_manual"
)
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_conformal_summary_manual.csv
## Saved table: DynamicSystems_RPubs_Run_20260206_021745/tables/agri_conformal_summary_manual.html
(tibble::tibble(
model = bestA,
alpha = 0.10,
target_coverage = 0.90,
empirical_coverage = covA,
avg_interval_width = widthA
)) %>%
knitr::kable(caption = "Table C-ML1. Conformal interval summary under rolling calibration.") %>%
kableExtra::kable_styling(full_width = FALSE)
| model | alpha | target_coverage | empirical_coverage | avg_interval_width |
|---|---|---|---|---|
| Random Forest (ranger) | 0.1 | 0.9 | 0.8275862 | 77.25496 |
p_confA <- ggplot(agri_conf, aes(t, y)) +
geom_ribbon(aes(ymin = lo, ymax = hi), fill="#6a3d9a", alpha=0.18) +
geom_line(aes(y = pred), color="#e31a1c", linewidth=1.0) +
geom_line(color="#1f78b4", linewidth=0.9) +
labs(
title="Agriculture: Distribution-Free Conformal Prediction Intervals",
subtitle=paste0(bestA, " | rolling calibration | target coverage 90% (empirical = ", round(covA, 3), ")"),
x="Time index", y="Yield"
) +
theme_minimal(base_size=13)
save_plot(p_confA, "agri_conformal_intervals_manual")
## Saved plot: DynamicSystems_RPubs_Run_20260206_021745/figs/agri_conformal_intervals_manual.png
p_confA
Figure C-ML1. Distribution-free conformal prediction intervals for 1-step-ahead forecasts (rolling calibration). The purple band targets 90% marginal coverage; empirical coverage is reported in the subtitle.
log_line("ADD-ON completed: ML forecasting + conformal intervals (manual, robust).")
## ADD-ON completed: ML forecasting + conformal intervals (manual, robust).
cat("\nADD-ON completed. Check outputs under:\n", ROOT, "\n")
##
## ADD-ON completed. Check outputs under:
## DynamicSystems_RPubs_Run_20260206_021745