Dr. Debashis Chatterjee
Assistant Professor, Department of Statistics, Siksha‑Bhavana,
Visva‑Bharati University
Guest lecture at – Geological Studies Unit, Indian Statistical
Institute, Kolkata (2 May 2025)
GSU invites you to demonstrate hands‑on quantitative tools for deciphering paleoclimate.
# ---- Package bootstrap --------------------------------------------------
# 1. Set a permanent CRAN mirror *before* any install attempts so that
# non‑interactive knitting (e.g. RPubs) will not throw the
# “trying to use CRAN without setting a mirror” error.
# 2. Only attempt installation for *missing* packages; otherwise we just load.
# 3. Pass an explicit `repos` argument inside `install.packages()` so the
# mirror is honoured even if the global option is overridden elsewhere.
# 0. Environment Setup: Reproducible Science
options(repos = c(CRAN = "https://cloud.r-project.org"))
pkgs <- c("tidyverse", "lubridate", "janitor", "patchwork",
"broom", "changepoint", "ggfortify", "factoextra",
"purrr", "GGally", "WaveletComp", "randomForest",
"xgboost", "pROC")
# Ensure rlang is up‑to‑date *before* anything that depends on it.
if (!requireNamespace("rlang", quietly = TRUE) || packageVersion("rlang") < "1.1.5") {
install.packages("rlang", repos = "https://cloud.r-project.org", quiet = TRUE)
}
## package 'rlang' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'rlang'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\User\AppData\Local\R\win-library\4.4\00LOCK\rlang\libs\x64\rlang.dll
## to C:\Users\User\AppData\Local\R\win-library\4.4\rlang\libs\x64\rlang.dll:
## Permission denied
## Warning: restored 'rlang'
install_if_missing <- setdiff(pkgs, rownames(installed.packages()))
if (length(install_if_missing)) {
install.packages(install_if_missing, repos = "https://cloud.r-project.org", quiet = TRUE)
}
invisible(lapply(pkgs, library, character.only = TRUE))
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'janitor' was built under R version 4.4.3
## Warning: package 'broom' was built under R version 4.4.3
## Warning: package 'changepoint' was built under R version 4.4.3
## Warning: package 'ggfortify' was built under R version 4.4.3
## Warning: package 'factoextra' was built under R version 4.4.1
## Warning: package 'GGally' was built under R version 4.4.1
## Warning: package 'WaveletComp' was built under R version 4.4.3
## Warning: package 'randomForest' was built under R version 4.4.1
## Warning: package 'xgboost' was built under R version 4.4.1
## Warning: package 'pROC' was built under R version 4.4.1
Tip 🔧 – if you encounter a namespace ‘rlang’ … ≥ 1.1.5 is required error, run
install.packages("rlang")
before loading other packages.
:
\[\delta^{18}O \propto
-\text{Temperature}\]
More negative values = colder periods (glacials).
:
\[\begin{align*}
\delta^{18}O(t) &= -35 + \underbrace{0.4\sin\left(\frac{2\pi
t}{100}\right)}_{\text{Eccentricity}} +
\underbrace{0.3\sin\left(\frac{2\pi t}{41}\right)}_{\text{Obliquity}}
\\
&+ \underbrace{0.2\sin\left(\frac{2\pi
t}{23}\right)}_{\text{Precession}} + \underbrace{X_t}_{\text{Red
noise}}
\end{align*}\]
where red noise follows AR(1) process:
\[X_t = 0.8X_{t-1} + \epsilon_t, \quad
\epsilon_t \sim \mathcal{N}(0, 0.2^2)\]
: Simulates Vostok/GISP2 ice cores dominated by Milankovitch cycles.
set.seed(20250402)
ages_kyr <- seq(0, 800, by = 1) # kyr BP
orbital <- 0.4*sin(2*pi*ages_kyr/100) +
0.3*sin(2*pi*ages_kyr/41) +
0.2*sin(2*pi*ages_kyr/23)
red_noise <- arima.sim(list(ar = 0.8), n = length(ages_kyr), sd = 0.2)
d18O <- -35 + orbital + red_noise # ‰ V‑SMOW
synth_core <- tibble(age_kyr = ages_kyr, d18O)
:
\[\text{Observed Age} = \text{True Age} +
\epsilon, \quad \epsilon \sim \mathcal{N}(0, \sigma_t^2)\]
with \(\sigma_t = 0.8\) kyr
(user-adjustable).
# 20 stochastic age models (σ = 0.8 kyr)
A <- 20
age_ens <- map_dfr(1:A, \(i) {
tibble(realisation = i, age_real = ages_kyr + rnorm(length(ages_kyr), 0, 0.8), d18O)
})
age_ens %>%
ggplot(aes(age_real, d18O, group = realisation)) +
geom_line(alpha = .2) + scale_x_reverse() +
labs(x = "Age (kyr BP)", y = expression(delta^{18}*"O (‰)"),
title = "δ¹⁸O Ensembles with Chronological Uncertainty")
:
\[\mathcal{P}(\omega) = \left| \int
\delta^{18}O(t) e^{-i\omega t} dt \right|^2\]
Identifies dominant cycles: \
- Peak at \(\frac{1}{100}\) kyr\(^{-1}\) = Eccentricity \
- Peak at \(\frac{1}{41}\) kyr\(^{-1}\) = Obliquity
lo <- loess(d18O ~ age_kyr, synth_core, span = 0.1)
synth_core <- mutate(synth_core, loess = predict(lo))
p_raw <- ggplot(synth_core) +
geom_line(aes(age_kyr, d18O), alpha = .4) +
geom_line(aes(age_kyr, loess), colour = "red") +
scale_x_reverse() + labs(x = "Age (kyr BP)", y = "δ18O (‰)")
spec <- spectrum(synth_core$d18O, plot = FALSE)
p_spec <- tibble(freq = spec$freq, spec = spec$spec) %>%
ggplot(aes(freq, spec)) + geom_line() +
labs(x = "Frequency (cycles kyr⁻¹)", y = "Spectral Density")
p_raw + p_spec
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
:
\[L(t) = 0.6\sin(2\pi t/100) + 0.4\sin(2\pi
t/41)\]
Represents unobserved temperature influencing all proxies.
latent <- 0.6*sin(2*pi*ages_kyr/100) + 0.4*sin(2*pi*ages_kyr/41)
set.seed(42)
proxy_defs <- tribble(
~proxy, ~beta, ~ar_phi, ~sd,
"ice", 1.0, 0.6, 0.3,
"speleothem",0.7, 0.7, 0.25,
"tree", -0.5, 0.5, 0.35,
"leafwax", 0.6, 0.4, 0.3,
"mgca", -0.8, 0.7, 0.25,
"benthic", 0.9, 0.6, 0.2,
"tex86", 1.2, 0.5, 0.3
)
multi_proxies <- proxy_defs %>%
mutate(data = pmap(list(beta, ar_phi, sd), \(b, phi, s) {
b*latent + arima.sim(list(ar = phi), n = length(latent), sd = s)
})) %>%
select(proxy, data) %>%
unnest_longer(data) %>%
mutate(age_kyr = rep(ages_kyr, times = n()/length(ages_kyr)))
multi_proxies %>%
ggplot(aes(age_kyr, data, colour = proxy)) + geom_line(alpha = .7) +
scale_x_reverse() + facet_wrap(~proxy, ncol = 2, scales = "free_y") +
labs(x = "Age (kyr BP)", y = "Proxy Value", title = "Synthetic Multi‑Proxy Record (7 proxies)")
: \
In real data (e.g., PAGES 2k), PC1 often correlates with temperature
(\(r > 0.7\)).
wide <- multi_proxies %>% select(age_kyr, proxy, data) %>%
pivot_wider(names_from = proxy, values_from = data)
GGally::ggcorr(wide %>% select(-age_kyr), label = TRUE, label_round = 2) +
ggtitle("Proxy–Proxy Correlation Matrix")
pca <- prcomp(wide %>% select(-age_kyr), scale. = TRUE)
factoextra::fviz_eig(pca, addlabels = TRUE)
:
Minimizes cost function:
\[\sum_{i=1}^{m+1}
\mathcal{C}(y_{t_{i-1}:t_i) + \beta m\]
Identifies Pleistocene terminations and Dansgaard-Oeschger events.
# Wavelet on δ18O
library(WaveletComp)
wt <- analyze.wavelet(synth_core, "d18O", dt = 1, make.pval = FALSE)
## Smoothing the time series...
## Starting wavelet transformation...
## Class attributes are accessible through following names:
## series loess.span dt dj Wave Phase Ampl Power Power.avg Power.pval Power.avg.pval Ridge Period Scale nc nr coi.1 coi.2 axis.1 axis.2 date.format date.tz
wt.image(wt, main = "Wavelet Power Spectrum – δ¹⁸O")
# Changepoint detection on PC1
pc1 <- pca$x[,1]
cpts <- cpt.meanvar(pc1, method = "PELT")
plot(cpts, main = "Changepoints in PC1 (PELT)")
:
Minimizes logistic loss:
\[\mathcal{L} = -\sum [y_i \log(p_i) +
(1-y_i)\log(1-p_i)]\]
Perfect AUC=1.0 in synthetic data.
wide$state <- factor(ifelse(latent < -0.2, "Glacial", "Interglacial"))
# Random Forest
set.seed(123)
rf <- randomForest(state ~ ., data = wide %>% select(-age_kyr), ntree = 500)
print(rf)
##
## Call:
## randomForest(formula = state ~ ., data = wide %>% select(-age_kyr), ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 7.74%
## Confusion matrix:
## Glacial Interglacial class.error
## Glacial 244 38 0.13475177
## Interglacial 24 495 0.04624277
varImpPlot(rf, main = "RF Variable Importance")
# XGBoost
Xmat <- as.matrix(wide %>% select(-age_kyr, -state))
y <- as.numeric(wide$state) - 1
xgb <- xgboost(data = Xmat, label = y, nrounds = 400, objective = "binary:logistic",
params = list(eta = .05, max_depth = 4, subsample = .7, colsample_bytree = .7),
verbose = 0)
roc_xgb <- pROC::roc(y, predict(xgb, Xmat))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(roc_xgb$auc)
## Area under the curve: 1
https://www.ncei.noaa.gov/pub/data/paleo/icecore/greenland/summit/greenland-summit-firn-air.txt
url <- "https://www.ncei.noaa.gov/pub/data/paleo/icecore/greenland/summit/greenland-summit-firn-air.txt"
raw <- read_lines(url)
start <- which(str_detect(raw, "^\\d"))[1] - 1 # first numeric row index (skip header)
pdp
on XGB to
visualise non‑linear effects.changepoint
.Contact: debashis.chatterjee@visva‑bharati.ac.in
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Calcutta
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] pROC_1.18.5 xgboost_1.7.8.1 randomForest_4.7-1.1
## [4] WaveletComp_1.2 GGally_2.2.1 factoextra_1.0.7
## [7] ggfortify_0.4.17 changepoint_2.3 zoo_1.8-12
## [10] broom_1.0.8 patchwork_1.2.0 janitor_2.2.1
## [13] lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
## [16] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [19] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.2
## [22] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.5 xfun_0.50 bslib_0.8.0 rstatix_0.7.2
## [5] ggrepel_0.9.5 lattice_0.22-6 tzdb_0.4.0 vctrs_0.6.5
## [9] tools_4.4.0 generics_0.1.3 curl_5.2.1 parallel_4.4.0
## [13] fansi_1.0.6 highr_0.11 pkgconfig_2.0.3 Matrix_1.7-0
## [17] data.table_1.16.2 RColorBrewer_1.1-3 lifecycle_1.0.4 farver_2.1.2
## [21] compiler_4.4.0 munsell_0.5.1 carData_3.0-5 snakecase_0.11.1
## [25] htmltools_0.5.8.1 sass_0.4.9 yaml_2.3.10 crayon_1.5.3
## [29] car_3.1-2 ggpubr_0.6.0 pillar_1.9.0 jquerylib_0.1.4
## [33] cachem_1.1.0 abind_1.4-5 ggstats_0.6.0 tidyselect_1.2.1
## [37] digest_0.6.35 stringi_1.8.4 labeling_0.4.3 fastmap_1.2.0
## [41] grid_4.4.0 colorspace_2.1-1 cli_3.6.2 magrittr_2.0.3
## [45] utf8_1.2.4 withr_3.0.1 scales_1.3.0 backports_1.5.0
## [49] bit64_4.0.5 timechange_0.3.0 rmarkdown_2.27 bit_4.0.5
## [53] gridExtra_2.3 ggsignif_0.6.4 hms_1.1.3 evaluate_0.24.0
## [57] knitr_1.48 rlang_1.1.4 Rcpp_1.0.12 glue_1.7.0
## [61] vroom_1.6.5 rstudioapi_0.17.1 jsonlite_1.8.8 R6_2.5.1
## [65] plyr_1.8.9