Rattlesnake energetics project

Analysis of internal body temperatures

Updated

July 7, 2025

1 Data import

To review/streamline Nicole’s work on data cleaning and Allen’s work on DAYMET integration here after directory/file cleanup.

1.1 Internal body temperatures

1.2 DAYMET weather data

2 Quality control

To translate Allen’s work on outlier detection for this section.

3 Active season identification

Inference of active season dates relies only on location and internal body temperature data.

Code

# read in data and clean up columns
raw_season <- raw |> select(loc, snake_id, datetime, tb)

# preview
raw_season |>
  head() |>
  pander("Table: example data rows for season identification.")

Table: example data rows for season identification.
loc	snake_id	datetime	tb
Eatonton	Arnold	2021-10-08 23:16:06	22.01
Eatonton	Arnold	2021-10-09 00:00:06	21.89
Eatonton	Arnold	2021-10-09 01:00:06	22.01
Eatonton	Arnold	2021-10-09 02:00:06	22.01
Eatonton	Arnold	2021-10-09 03:00:06	22.01
Eatonton	Arnold	2021-10-09 04:00:06	22.01

3.1 Diurnal variation

On days when snakes are inactive, the variance in internal body temperatures is expected to be low. Series of daily standard deviations over the days of the year are shown for each snake by location below. The locations are ordered by latitude.

Code

# compute daily variance per snake and convert to functional data
dvar_indiv <- raw_season |>
  group_by(snake_id, 
           loc, 
           yr = year(datetime), 
           day = yday(datetime)) |>
  summarize(tb.sd = sd(tb), .groups = 'drop') |>
  mutate(fid = consecutive_id(snake_id, loc, yr)) |>
  tf_nest(tb.sd, .id = 'fid', .arg = 'day') 

# plot daily variance series
dvar_indiv |>
  ggplot(aes(y = tb.sd)) +
  geom_spaghetti(alpha = 0.6, linewidth = 0.2) +
  facet_wrap(~loc) +
  labs(y = 'SD(temperature)', x = 'day of year') +
  ggthm

Some series are quite short and data are noisy. Snakes with fewer than 30 days’ worth of data are removed and the series are smoothed using a simple 7-day rolling average.

Code

# drop cases with fewer than 30d and smooth
dvar_indiv |>
  mutate(tfe = tf_count(tb.sd)) |>
  unnest(tfe) |>
  filter(tfe > 30) |>
  mutate(tb.sd = tf_smooth(tb.sd, 'rollmean', k = 7)) |>
  ggplot(aes(y = tb.sd)) +
  geom_spaghetti(alpha = 0.6, linewidth = 0.2) +
  facet_wrap(~loc) +
  labs(y = 'SD(temperature)', x = 'day of year') +
  ggthm

3.2 Location averages

The smoothed series are aggregated by location (i.e., across snake and year). The resulting series show median diurnal variation in body temperature at a given locale on a given day of the year. Note that missing data are replaced by linear interpolations.

Code

# same as above then aggregate by location
dvar_indiv |>
  mutate(tfe = tf_count(tb.sd)) |>
  unnest(tfe) |>
  filter(tfe > 30) |>
  mutate(tb.sd = tf_smooth(tb.sd, 'rollmean', k = 7)) |>
  group_by(loc) |>
  mutate(tb.sd = mean(tb.sd, depth = 'pointwise', na.rm = T)) |>
  ggplot(aes(y = tb.sd)) +
  geom_spaghetti() +
  facet_wrap(~loc) +
  labs(y = 'median SD(temperature)', 
       x = 'day of year') +
  ggthm

3.3 Changepoint detection

Mean changepoint detection via binary segmentation does a fairly good job at identifying apparent start and end dates for active seasons by location. There is some noise in the data that produces inference of superfluous changepoints. Nonetheless, a simple thresholding of inferred means at 1.5 (dashed horizontal line) adequately identifies start and end dates.

Code

# store series
dvar_loc <- dvar_indiv |>
  mutate(tfe = tf_count(tb.sd)) |>
  unnest(tfe) |>
  filter(tfe > 30) |>
  select(-tfe) |>
  mutate(tb.sd = tf_smooth(tb.sd, 'rollmean', k = 7)) |>
  group_by(loc) |>
  mutate(tb.sd = mean(tb.sd, depth = 'pointwise', na.rm = T)) |>
  distinct(loc, tb.sd) |>
  ungroup() |>
  tf_unnest(tb.sd) |>
  rename(day = tb.sd_arg) |>
  nest(c(day, tb.sd_value))

# # for testing
# j <- 6
# .vals <- sd_series$data[[j]]$tb.sd_value
# .args <- sd_series$data[[j]]$day

# wrapper around changepoint detection
cp.fn <- function(.args, .vals, return = 'means', q = 4){
  
  # estimate changepoints via binary segmentation
  cpt.out <- cpt.mean(.vals,
                      method = 'BinSeg', Q = q) 
  cp.ix <- cpts(cpt.out) 
  cp.x <- cpt.out@param.est$mean
  
  # option 1: output estimated mean for each day
  if(return == 'means'){
    out <- cut(.args, breaks = .args[c(1, cp.ix, length(.args))],
               labels = round(cp.x, 5), right = F, include.lowest = T) |>
      as.character() |>
      as.numeric()
  }
  
  # option 2: output days of year for each change and change type
  if(return == 'points'){
    cp.delta <- diff(cp.x)
    out <- tibble(day = .args[cp.ix],
                  delta = cp.delta,
                  type = if_else(cp.delta > 0, 'increase', 'decrease'))
  }
  return(out)
}

# # for testing
# cp.fn(.args, .vals)

# compute changepoints
changepts <- dvar_loc |>
  mutate(tb.sd_mean = map(data, ~cp.fn(.x$day, .x$tb.sd_value, q = 3))) |>
  unnest(everything())

# inspect changepoints and threshold means
changepts |>
  ggplot(aes(x = day, y = tb.sd_value)) +
  geom_path() +
  geom_path(aes(y = tb.sd_mean), 
            color = 'red', 
            linewidth = 0.5, 
            alpha = 0.5) +
  facet_wrap(~loc) +
  geom_hline(aes(yintercept = 1.5), 
             linetype = 'dashed', 
             linewidth = 0.25) +
  labs(y = 'median SD(temperature)', 
       x = 'day of year') +
  ggthm

Comments:

In Lordsburg, there is a changepoint in the middle of the end-of-season transition period; the threshold was set to exclude the tapering off. This is easily changed by reducing the threshold to 1.25.
See plots below. In Sedgwick and Montana de Oro, inferred start dates are unreliable due to missing/sparse data.

Code

# inspect sedgwick and mdo
cpt_means <- changepts |>
  filter(loc %in% c('Montana', 'Sedgwick')) |>
  distinct(loc, day, tb.sd_mean) |>
  tf_nest(tb.sd_mean, .id = 'loc', .arg = 'day')

dvar_indiv |>
  filter(loc %in% c('Montana', 'Sedgwick')) |>
  mutate(tfe = tf_count(tb.sd)) |>
  unnest(tfe) |>
  filter(tfe > 30) |>
  mutate(tb.sd = tf_smooth(tb.sd, 'rollmean', k = 7)) |>
  select(-tfe) |>
  ggplot(aes(y = tb.sd)) +
  geom_spaghetti(linewidth = 0.25, 
                 alpha = 0.5) +
  geom_spaghetti(aes(y = tb.sd_mean), 
                 data = cpt_means, 
                 color = 'red', 
                 linewidth = 0.5, 
                 alpha = 0.5) +
  facet_wrap(~loc) +
  geom_hline(aes(yintercept = 1.5), 
             linetype = 'dashed', 
             linewidth = 0.25) +
  labs(y = 'SD(temperature)', 
       x = 'day of year') +
  ggthm

Figure: Inferred changepoints and individual-level diurnal variation series for MdO and Sedgwick.

In Montana de Oro, data were collected spanning 2017-2018 (from mid April to late February); a single snake appeared to be active for about a week from approximately 2/2/18 until 2/9/18, and then reverted to inactivity until the study endpoint on 2/21/18. The inferred changepoint is simply the start of data collection, and ignores the single active snake from early 2018.

In Sedgwick, data were collected spanning 2015-2016 (from mid May to late March), but only one snake was recorded during the 2016 calendar year; the inferred changepoint is the time at which that snake became active. It is notable that in a later year (2017), data are available for a single snake starting in mid April, and appear to show that snake as inactive until late April.

Code

# inspect start and end dates by year for mdo and sedgwick
raw_season |>
  filter(loc %in% c('Montana', 'Sedgwick')) |>
  group_by(loc, yr = year(datetime)) |>
  summarize(first.obs = min(datetime) |> date(),
            last.obs = max(datetime) |> date()) |>
  pander(caption = "Table: Sampling dates for MdO and Sedgwick.")

Table: Sampling dates for MdO and Sedgwick.
loc	yr	first.obs	last.obs
Sedgwick	2015	2015-05-16	2015-12-31
Sedgwick	2016	2016-01-01	2016-03-24
Sedgwick	2017	2017-04-15	2017-12-11
Montana	2014	2014-04-16	2014-10-07
Montana	2017	2017-04-15	2017-12-31
Montana	2018	2018-01-01	2018-02-21

Options in these locations:

manually specify season start dates
use inferred changepoints as heuristic only

Tentatively, active season classifications are assigned using option (2). These are shown below for all locations with inferred changes (i.e., excluding Rodeo, McIntyre, and Chimineas, at which locations all data was classified as observed during active season). Note again that locations are ordered by (increasing) latitude.

Code

# classify days of year as active/inactive
seasons <- dvar_loc |>
  mutate(tb.sd_mean = map(data, ~cp.fn(.x$day, .x$tb.sd_value, q = 3))) |>
  unnest(everything()) |>
  mutate(season = if_else(tb.sd_mean > 1.5, 'active', 'inactive')) |>
  select(loc, day, season)

# display as table
seasons |>
  mutate(lseason = lag(season)) |>
  drop_na() |>
  filter(season != lseason, day > 1) |>
  mutate(date = parse_date_time(day, orders = 'j'),
         date = paste(month(date, label = T, abbr = T), mday(date))) |>
  select(loc, date, season) |>
  group_by(loc) |>
  add_count() |>
  filter(n > 1) |>
  select(-n) |>
  spread(season, date) |>
  rename(start = active, end = inactive, location = loc) |>
  pander(caption = 'Table: Inferred change points.')

Table: Inferred change points.
location	start	end
Youngblood	Mar 26	Nov 10
Indio	Mar 20	Nov 11
Lordsburg	Mar 30	Nov 12
Eatonton	Mar 15	Oct 26
Sedgwick	Feb 17	Oct 25
Vandenberg	May 6	Oct 16
Montana	Apr 19	Oct 27
Rattlesnake	May 4	Sep 22
Kamloops	Mar 26	Sep 27
AB	May 1	Sep 8

4 Statistical modeling

Note: under development!

52 snakes were identified as having multiple observations recorded at the same time for at least one day. Such days are tentatively removed, but this issue should be resolved during data cleanup.

The analysis developed here uses latitude, elevation, and temperature range as site-level attributes and internal body temperature and mass as snake-level attributes. Internal temperatures are converted to daily functional observations on the 0-24h domain. For each snake, days with sparse data (fewer than 10 observations) are removed from the analysis.

Code

# simultaneous observations to remove
remove <- raw |>
  mutate(yr = year(datetime),
         day = yday(datetime),
         t = hour(datetime) + minute(datetime)/60) |>
  count(snake_id, yr, day, t) |>
  filter(n > 1) |>
  distinct(snake_id, yr, day, t)

# subset data for analysis
raw_analysis <- raw |>
  mutate(elev = (elevation_low + elevation_high)/2) |>
  rename(mass = mass_grams) |>
  select(loc, lat, elev, snake_id, mass, datetime, tmin, tmax, tb) |>
  mutate(yr = year(datetime),
         mo = month(datetime, label = T, abbr = T),
         day = yday(datetime), 
         t = hour(datetime) + minute(datetime)/60) |>
  anti_join(remove, join_by(snake_id, yr, day, t)) |>
  mutate(fid = consecutive_id(snake_id, loc, yr, day)) |>
  select(-datetime) |>
  tf_nest(tb, .id = 'fid', .arg = 't', domain = c(0, 24)) |>
  filter(tf_count(tb) > 10)

# plot with season classification
alldata_panels <- raw_analysis |>
  left_join(seasons, join_by(loc, day)) |>
  filter(!is.na(season)) |>
  ggplot(aes(y = tb, color = season)) +
  facet_grid(mo~loc) +
  geom_spaghetti(alpha = 0.25, linewidth = 0.1) +
  scale_x_continuous(breaks = seq(0, 24, by = 6)) +
  scale_color_manual(values = c('red', 'black')) +
  labs(x = NULL, y = NULL) +
  guides(color = guide_legend(position = 'top', 
                              override.aes = list(alpha = 1, linewidth = 0.5))) +
  ggthm 

# save
ggsave(alldata_panels, filename = here('img/all-panels.png'), 
       width = 12, height = 9, dpi = 400, units = 'in')

# display
alldata_panels

Figure: Diurnal fluctuations in body temperatures across all locations and months; each curve represents data on a single snake on a single day.

Recall that season classification is determined by location, not individually; some observations appear mismatched because snakes are either active or inactive outside of the usual time for the corresponding location. While we consider whether and if so how to incorporate season classifications into the analysis, models below use data from July, in which there is no mixing of activity and inactivity.

Code

# filter to month of july for example modeling
analysis <- raw_analysis |>
  filter(mo == 'Jul')

# plot
analysis |>
  left_join(seasons, join_by(loc, day)) |>
  ggplot(aes(y = tb, color = season)) +
  facet_wrap(~loc, nrow = 2) +
  geom_spaghetti(alpha = 0.2, linewidth = 0.1) +
  scale_x_continuous(breaks = seq(0, 24, by = 6)) +
  scale_color_manual(values = c('red', 'black')) +
  labs(x = NULL, y = NULL) +
  guides(color = guide_none()) +
  ggthm

4.1 Model development

Preliminary modeling follows the MIMS profiles case study in Chapter 5 of FDA with R illustrating function-on-scalar regression.

4.1.1 Latitude as categorical

For a starting point tracking closely with the reference above, consider binning latitude at below/above 40 degrees and fitting a functional linear model with a single (scalar) categorical predictor.

\[ Y_i(t) = \beta_0(t) + \beta_1(t)\mathbf{I}\{\text{latitude}_i > 40\} + \epsilon_i(t) \qquad t \in [0, 24) \]

Here $Y_i(t)$ denotes the internal body temperature of the $i$th observation (snake and day) at time $t$. Errors are assumed independent for $i = 1, \dots, n$.

While this approach accounts for residual correlation across the function domain (e.g., correlation between $\epsilon_i(t_{i1})$ and $\epsilon_i(t_{i2})$), it does not account for within-snake or within-location correlations arising from the multilevel structure of the data. The standard errors returned by the model are thus likely severely underestimated, so the visualization below shows $\hat{\beta}_j(t)\pm 10SE$ as a crude inflation of uncertainty.

Code

# format for model fitting
gam_data <- analysis |>
  mutate(lat.fct = cut(lat, breaks = c(30, 40, 51)),
         lat.ind = as.numeric(lat.fct) - 1) |>
  select(fid, lat.ind, tb) |>
  tf_unnest(tb) |>
  rename(lat = lat.ind, t = tb_arg, tb = tb_value)

# fit gam
fit1 <- gam(tb ~ s(t) + s(t, by = lat), data = gam_data)

# residual fpca
fpca_out <- gam_data |>
  select(-lat, -tb) |>
  bind_cols(resid = resid(fit1)) |>
  tf_nest(resid, .id = 'fid', .arg = t) %>%
  refundr::rfr_fpca('resid', ., center = F, lambda = 50)

# extract first fpc
tvals <- gam_data |> distinct(t) |> pull(t) |> sort()
fpc_df <- tibble(resid = tfd(fpca_out$efunctions[,1])) |>
  tf_unnest(resid) |>
  bind_cols(t = tvals) |>
  rename(fpc = resid_value) |>
  select(t, fpc)

# append to gam data
gamm_data <- gam_data |>
  left_join(fpc_df, join_by(t))

# fit full model
fit2 <- bam(tb ~ s(t, fx = T) + s(t, by = lat, fx = T) + s(fid, bs = 're', by = fpc),
            data = gamm_data, discrete = T, method = 'fREML')

# visualize coefficients
beta.grid <- fpc_df |>
  mutate(lat = 1, fid = 267) %>%
  predict(fit2, newdata = ., type = 'terms', se.fit = T)
beta.grid.tfd <- beta.grid$fit[, 1:2] |> t() |> tfd(arg = tvals)
beta.grid.se.tfd <- beta.grid$se.fit[, 1:2] |> t() |> tfd(arg = tvals)
tibble(term = c('intercept', 'latitude'),
       coef = beta.grid.tfd + append(coef(fit2)[1], 0),
       se = beta.grid.se.tfd) |>
  mutate(upr = coef + 10*se, 
         lwr = coef - 10*se) |>
  ggplot() +
  facet_wrap(~term, scales = 'free') +
  geom_spaghetti(aes(y = coef), linewidth = 0.3, color = 'black', alpha = 1) +
  geom_errorband(aes(ymax = upr, ymin = lwr), fill = 'black', alpha = 0.3) +
  scale_x_continuous(breaks = seq(0, 24, by = 6)) +
  labs(y = expr(paste(hat(beta)[j], '(t)', sep = ''))) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major = element_line(color = 'black', linewidth = 0.1))

Here the coefficients are interpreted pointwise in $t$ much like in a scalar linear model: the intercept gives the mean for low-latitude sites and the slope gives the mean difference between high-latitude and low-latitude sites; however, unlike in the linear model, both are estimated as continuous functions of time of day. For example:

the estimated mean internal body temperature at 6am ($t = 6$) is 23.79 degrees centigrade
the estimated mean internal body temperature at 2pm ($t = 14$) is 28.90 degrees centigrade
mean internal body temperature high-latitude sites is estimated to be 3.94 degrees centigrade lower than at low-altitude sites at 6am ($t = 6$)
mean internal body temperature high-latitude sites is estimated to be 0.37 degrees centigrade higher than at low-altitude sites at 6am ($t = 14$)

Code

tibble(term = c('intercept', 'latitude'),
       coef = beta.grid.tfd + append(coef(fit2)[1], 0),
       se = beta.grid.se.tfd) |>
  tf_unnest(everything()) |>
  filter(coef_arg %in% c(6, 14)) |>
  select(-se_arg) |>
  arrange(coef_arg) |>
  pander(digits = 4, caption = 'Table: example pointwise coefficient estimates at 6am and 1pm.')

Table: example pointwise coefficient estimates at 6am and 1pm.
term	coef_arg	coef_value	se_value
intercept	6	23.79	0.02833
latitude	6	-3.942	0.0925
intercept	14	28.9	0.02973
latitude	14	0.3713	0.09305

4.1.2 Latitude as continuous

Treating latitude as a continuous scalar variable gives the model:

\[ Y_i(t) = \beta_0(t) + \beta_1(t)\text{latitude}_i + \epsilon_i(t) \]

Specification is otherwise identical to that above, and still does not account for multilevel structure in the data. For interpretability, latitude is centered and scaled across locations before model fitting.

Code

# unique latitudes
lats <- analysis |> 
  group_by(loc) |>
  summarize(lat = mean(lat)) |>
  mutate(lat.z = scale(lat))

# format for model fitting
gam_data <- analysis |>
  select(fid, loc, tb) |>
  left_join(lats, join_by(loc)) |>
  select(-lat) |>
  tf_unnest(tb) |>
  rename(t = tb_arg, tb = tb_value)

# fit gam
fit1 <- gam(tb ~ s(t) + s(t, by = lat.z), data = gam_data)

# residual fpca
fpca_out <- gam_data |>
  select(-lat.z, -tb) |>
  bind_cols(resid = resid(fit1)) |>
  tf_nest(resid, .id = 'fid', .arg = t) %>%
  refundr::rfr_fpca('resid', ., center = F, lambda = 50)

# extract first fpc
tvals <- gam_data |> distinct(t) |> pull(t) |> sort()
fpc_df <- tibble(resid = tfd(fpca_out$efunctions[,1])) |>
  tf_unnest(resid) |>
  bind_cols(t = tvals) |>
  rename(fpc = resid_value) |>
  select(t, fpc)

# append to gam data
gamm_data <- gam_data |>
  left_join(fpc_df, join_by(t))

# fit full model
fit2 <- bam(tb ~ s(t, fx = T) + s(t, by = lat.z, fx = T) + s(fid, bs = 're', by = fpc),
            data = gamm_data, discrete = T, method = 'fREML')

In this model, the intercept parameter $\beta_0(t)$ now gives the mean internal body temperature at the mean latitude 36.197 as a function of time of day, and the slope parameter $\beta_1(t)$ scales with latitude to give the change in mean internal body temperature associated with a shift in latitude.

Code

# latitude summary stats
lat.m <- attr(lats$lat.z, 'scaled:center')
lat.sd <- attr(lats$lat.z, 'scaled:scale')

# visualize coefficients
beta.grid.in <- tibble(lat.z = seq_range(lats$lat.z, n = 6), 
                       fid = 267, 
                       data = list(fpc_df)) |>
  unnest(data)

beta.grid.out <- beta.grid.in %>%
  predict(fit2, newdata = ., type = 'terms', se.fit = T)

beta.grid.tfd <- beta.grid.in |>
  bind_cols(beta.grid.out$fit) |>
  select(t, lat.z, `s(t)`, `s(t):lat.z`) |>
  tf_nest(starts_with('s'), .id = 'lat.z', .arg = 't') |>
  mutate(lat = lat.sd*lat.z + lat.m,
         `s(t)` = `s(t)` + coef(fit2)[1]) |>
  select(-lat.z) |>
  pivot_longer(starts_with('s')) |>
  distinct(name, value, .keep_all = T) |>
  mutate(term = factor(name, labels = c('intercept', 'latitude')) |> as.character()) |>
  select(term, lat, value) |>
  rename(coef = value)

beta.grid.se.tfd <- beta.grid.in |>
  bind_cols(beta.grid.out$se.fit) |>
  select(t, lat.z, `s(t)`, `s(t):lat.z`) |>
  tf_nest(starts_with('s'), .id = 'lat.z', .arg = 't') |>
  mutate(lat = lat.sd*lat.z + lat.m) |>
  select(-lat.z) |>
  pivot_longer(starts_with('s')) |>
  distinct(name, value, .keep_all = T) |>
  mutate(term = factor(name, labels = c('intercept', 'latitude')) |> as.character()) |>
  select(term, lat, value) |>
  rename(se = value)

p1 <- inner_join(beta.grid.tfd, beta.grid.se.tfd, join_by(term, lat)) |>
  mutate(upr = coef + 4*se, 
         lwr = coef - 4*se) |>
  filter(term == 'latitude') |>
  ggplot() +
  geom_spaghetti(aes(y = coef, color = lat), linewidth = 0.5, alpha = 1) +
  geom_errorband(aes(ymax = upr, ymin = lwr, fill = lat), alpha = 0.3) +
  scale_x_continuous(breaks = seq(0, 24, by = 6)) +
  scale_color_gradient2(low = 'blue', mid = 'lightgrey', high = 'red', 
                        midpoint = lat.m, limits = lat.m + 3*c(-1, 1)*lat.sd) +
  scale_fill_gradient2(low = 'blue', mid = 'lightgrey', high = 'red', 
                       midpoint = lat.m, limits = lat.m + 3*c(-1, 1)*lat.sd) +
  labs(y = expr(hat(beta)[1](t) %*% 'latitude'[i])) +
  ggthm +
  guides(color = guide_colorbar(title = 'latitude'),
         fill = guide_colorbar(title = 'latitude'))

p2 <- inner_join(beta.grid.tfd, beta.grid.se.tfd, join_by(term, lat)) |>
  mutate(upr = coef + 10*se, 
         lwr = coef - 10*se) |>
  filter(term == 'intercept') |>
  ggplot() +
  geom_spaghetti(aes(y = coef), linewidth = 0.5, alpha = 1) +
  geom_errorband(aes(ymax = upr, ymin = lwr), alpha = 0.3) +
  scale_x_continuous(breaks = seq(0, 24, by = 6)) +
  labs(y = expr(paste(hat(beta)[0], '(t)', sep = ''))) +
  ggthm

p2 + p1 + plot_layout(nrow = 1)

--- title: "Rattlesnake energetics project" subtitle: "Analysis of internal body temperatures" date: today published-title: "Updated" format: html: code-fold: true code-tools: true toc: true number-sections: true execute: warning: false message: false echo: true cache: true html-math-method: method: mathjax url: "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js" --- ```{r, echo = F} # packages library(tidyverse) library(lubridate) library(tidyfun) library(modelr) library(mgcv) library(changepoint) library(here) library(pander) library(patchwork) # theming for ggplot ggthm <- theme_bw() + theme(panel.grid = element_blank(), panel.grid.major.y = element_line(color = 'black', linewidth = 0.1)) ``` # Data import To review/streamline Nicole's work on data cleaning and Allen's work on DAYMET integration here after directory/file cleanup. ```{r, echo = F} # read in data and clean up columns raw <- readRDS(here("data/big_dataset_checkpoint.rds")) |> rename_with(~gsub(" .*", "", .x)) |> rename(tb = bodytemp_celsius, loc = locality) |> select(loc, lat, long, elevation_low, elevation_high, tmin, tmax, snake_id, species, sex, mass_grams, datetime, tb) |> mutate(loc = str_replace(loc, ',', ' ') |> str_replace('-', ' ') %>% gsub(' .*', '', .) |> factor() |> fct_reorder(lat)) ``` ## Internal body temperatures ## DAYMET weather data # Quality control To translate Allen's work on outlier detection for this section. # Active season identification Inference of active season dates relies only on location and internal body temperature data. ```{r} # read in data and clean up columns raw_season <- raw |> select(loc, snake_id, datetime, tb) # preview raw_season |> head() |> pander("Table: example data rows for season identification.") ``` ## Diurnal variation On days when snakes are inactive, the variance in internal body temperatures is expected to be low. Series of daily standard deviations over the days of the year are shown for each snake by location below. The locations are ordered by latitude. ```{r} #| fig-align: center # compute daily variance per snake and convert to functional data dvar_indiv <- raw_season |> group_by(snake_id, loc, yr = year(datetime), day = yday(datetime)) |> summarize(tb.sd = sd(tb), .groups = 'drop') |> mutate(fid = consecutive_id(snake_id, loc, yr)) |> tf_nest(tb.sd, .id = 'fid', .arg = 'day') # plot daily variance series dvar_indiv |> ggplot(aes(y = tb.sd)) + geom_spaghetti(alpha = 0.6, linewidth = 0.2) + facet_wrap(~loc) + labs(y = 'SD(temperature)', x = 'day of year') + ggthm ``` Some series are quite short and data are noisy. Snakes with fewer than 30 days' worth of data are removed and the series are smoothed using a simple 7-day rolling average. ```{r} #| fig-align: center # drop cases with fewer than 30d and smooth dvar_indiv |> mutate(tfe = tf_count(tb.sd)) |> unnest(tfe) |> filter(tfe > 30) |> mutate(tb.sd = tf_smooth(tb.sd, 'rollmean', k = 7)) |> ggplot(aes(y = tb.sd)) + geom_spaghetti(alpha = 0.6, linewidth = 0.2) + facet_wrap(~loc) + labs(y = 'SD(temperature)', x = 'day of year') + ggthm ``` ## Location averages The smoothed series are aggregated by location (*i.e.*, across snake and year). The resulting series show median diurnal variation in body temperature at a given locale on a given day of the year. Note that missing data are replaced by linear interpolations. ```{r} #| fig-align: center # same as above then aggregate by location dvar_indiv |> mutate(tfe = tf_count(tb.sd)) |> unnest(tfe) |> filter(tfe > 30) |> mutate(tb.sd = tf_smooth(tb.sd, 'rollmean', k = 7)) |> group_by(loc) |> mutate(tb.sd = mean(tb.sd, depth = 'pointwise', na.rm = T)) |> ggplot(aes(y = tb.sd)) + geom_spaghetti() + facet_wrap(~loc) + labs(y = 'median SD(temperature)', x = 'day of year') + ggthm ``` ## Changepoint detection Mean changepoint detection via binary segmentation does a fairly good job at identifying apparent start and end dates for active seasons by location. There is some noise in the data that produces inference of superfluous changepoints. Nonetheless, a simple thresholding of inferred means at 1.5 (dashed horizontal line) adequately identifies start and end dates. ```{r} #| fig-align: center # store series dvar_loc <- dvar_indiv |> mutate(tfe = tf_count(tb.sd)) |> unnest(tfe) |> filter(tfe > 30) |> select(-tfe) |> mutate(tb.sd = tf_smooth(tb.sd, 'rollmean', k = 7)) |> group_by(loc) |> mutate(tb.sd = mean(tb.sd, depth = 'pointwise', na.rm = T)) |> distinct(loc, tb.sd) |> ungroup() |> tf_unnest(tb.sd) |> rename(day = tb.sd_arg) |> nest(c(day, tb.sd_value)) # # for testing # j <- 6 # .vals <- sd_series$data[[j]]$tb.sd_value # .args <- sd_series$data[[j]]$day # wrapper around changepoint detection cp.fn <- function(.args, .vals, return = 'means', q = 4){ # estimate changepoints via binary segmentation cpt.out <- cpt.mean(.vals, method = 'BinSeg', Q = q) cp.ix <- cpts(cpt.out) cp.x <- cpt.out@param.est$mean # option 1: output estimated mean for each day if(return == 'means'){ out <- cut(.args, breaks = .args[c(1, cp.ix, length(.args))], labels = round(cp.x, 5), right = F, include.lowest = T) |> as.character() |> as.numeric() } # option 2: output days of year for each change and change type if(return == 'points'){ cp.delta <- diff(cp.x) out <- tibble(day = .args[cp.ix], delta = cp.delta, type = if_else(cp.delta > 0, 'increase', 'decrease')) } return(out) } # # for testing # cp.fn(.args, .vals) # compute changepoints changepts <- dvar_loc |> mutate(tb.sd_mean = map(data, ~cp.fn(.x$day, .x$tb.sd_value, q = 3))) |> unnest(everything()) # inspect changepoints and threshold means changepts |> ggplot(aes(x = day, y = tb.sd_value)) + geom_path() + geom_path(aes(y = tb.sd_mean), color = 'red', linewidth = 0.5, alpha = 0.5) + facet_wrap(~loc) + geom_hline(aes(yintercept = 1.5), linetype = 'dashed', linewidth = 0.25) + labs(y = 'median SD(temperature)', x = 'day of year') + ggthm ``` Comments: 1. In Lordsburg, there is a changepoint in the middle of the end-of-season transition period; the threshold was set to exclude the tapering off. This is easily changed by reducing the threshold to 1.25. 2. See plots below. In Sedgwick and Montana de Oro, inferred start dates are unreliable due to missing/sparse data. ```{r} #| fig-width: 8 #| fig-height: 3 #| fig-align: center #| fig-cap: "Figure: Inferred changepoints and individual-level diurnal variation series for MdO and Sedgwick." # inspect sedgwick and mdo cpt_means <- changepts |> filter(loc %in% c('Montana', 'Sedgwick')) |> distinct(loc, day, tb.sd_mean) |> tf_nest(tb.sd_mean, .id = 'loc', .arg = 'day') dvar_indiv |> filter(loc %in% c('Montana', 'Sedgwick')) |> mutate(tfe = tf_count(tb.sd)) |> unnest(tfe) |> filter(tfe > 30) |> mutate(tb.sd = tf_smooth(tb.sd, 'rollmean', k = 7)) |> select(-tfe) |> ggplot(aes(y = tb.sd)) + geom_spaghetti(linewidth = 0.25, alpha = 0.5) + geom_spaghetti(aes(y = tb.sd_mean), data = cpt_means, color = 'red', linewidth = 0.5, alpha = 0.5) + facet_wrap(~loc) + geom_hline(aes(yintercept = 1.5), linetype = 'dashed', linewidth = 0.25) + labs(y = 'SD(temperature)', x = 'day of year') + ggthm ``` In Montana de Oro, data were collected spanning 2017-2018 (from mid April to late February); a single snake appeared to be active for about a week from approximately 2/2/18 until 2/9/18, and then reverted to inactivity until the study endpoint on 2/21/18. The inferred changepoint is simply the start of data collection, and ignores the single active snake from early 2018. In Sedgwick, data were collected spanning 2015-2016 (from mid May to late March), but only one snake was recorded during the 2016 calendar year; the inferred changepoint is the time at which that snake became active. It is notable that in a later year (2017), data are available for a single snake starting in mid April, and appear to show that snake as *inactive* until late April. ```{r} # inspect start and end dates by year for mdo and sedgwick raw_season |> filter(loc %in% c('Montana', 'Sedgwick')) |> group_by(loc, yr = year(datetime)) |> summarize(first.obs = min(datetime) |> date(), last.obs = max(datetime) |> date()) |> pander(caption = "Table: Sampling dates for MdO and Sedgwick.") ``` Options in these locations: 1. manually specify season start dates 2. use inferred changepoints as heuristic only Tentatively, active season classifications are assigned using option (2). These are shown below for all locations with inferred changes (*i.e.*, excluding Rodeo, McIntyre, and Chimineas, at which locations all data was classified as observed during active season). Note again that locations are ordered by (increasing) latitude. ```{r} # classify days of year as active/inactive seasons <- dvar_loc |> mutate(tb.sd_mean = map(data, ~cp.fn(.x$day, .x$tb.sd_value, q = 3))) |> unnest(everything()) |> mutate(season = if_else(tb.sd_mean > 1.5, 'active', 'inactive')) |> select(loc, day, season) # display as table seasons |> mutate(lseason = lag(season)) |> drop_na() |> filter(season != lseason, day > 1) |> mutate(date = parse_date_time(day, orders = 'j'), date = paste(month(date, label = T, abbr = T), mday(date))) |> select(loc, date, season) |> group_by(loc) |> add_count() |> filter(n > 1) |> select(-n) |> spread(season, date) |> rename(start = active, end = inactive, location = loc) |> pander(caption = 'Table: Inferred change points.') ``` ```{r, echo = F} # export write_rds(seasons, file = here('data/seasons.rds')) ``` # Statistical modeling **Note: under development!** 52 snakes were identified as having multiple observations recorded at the same time for at least one day. Such days are tentatively removed, but this issue should be resolved during data cleanup. The analysis developed here uses latitude, elevation, and temperature range as site-level attributes and internal body temperature and mass as snake-level attributes. Internal temperatures are converted to daily functional observations on the 0-24h domain. For each snake, days with sparse data (fewer than 10 observations) are removed from the analysis. ```{r} #| fig-align: center #| fig-width: 12 #| fig-height: 9 #| fig-dpi: 300 #| fig-cap: "Figure: Diurnal fluctuations in body temperatures across all locations and months; each curve represents data on a single snake on a single day." # simultaneous observations to remove remove <- raw |> mutate(yr = year(datetime), day = yday(datetime), t = hour(datetime) + minute(datetime)/60) |> count(snake_id, yr, day, t) |> filter(n > 1) |> distinct(snake_id, yr, day, t) # subset data for analysis raw_analysis <- raw |> mutate(elev = (elevation_low + elevation_high)/2) |> rename(mass = mass_grams) |> select(loc, lat, elev, snake_id, mass, datetime, tmin, tmax, tb) |> mutate(yr = year(datetime), mo = month(datetime, label = T, abbr = T), day = yday(datetime), t = hour(datetime) + minute(datetime)/60) |> anti_join(remove, join_by(snake_id, yr, day, t)) |> mutate(fid = consecutive_id(snake_id, loc, yr, day)) |> select(-datetime) |> tf_nest(tb, .id = 'fid', .arg = 't', domain = c(0, 24)) |> filter(tf_count(tb) > 10) # plot with season classification alldata_panels <- raw_analysis |> left_join(seasons, join_by(loc, day)) |> filter(!is.na(season)) |> ggplot(aes(y = tb, color = season)) + facet_grid(mo~loc) + geom_spaghetti(alpha = 0.25, linewidth = 0.1) + scale_x_continuous(breaks = seq(0, 24, by = 6)) + scale_color_manual(values = c('red', 'black')) + labs(x = NULL, y = NULL) + guides(color = guide_legend(position = 'top', override.aes = list(alpha = 1, linewidth = 0.5))) + ggthm # save ggsave(alldata_panels, filename = here('img/all-panels.png'), width = 12, height = 9, dpi = 400, units = 'in') # display alldata_panels ``` Recall that season classification is determined by location, not individually; some observations appear mismatched because snakes are either active or inactive outside of the usual time for the corresponding location. While we consider whether and if so how to incorporate season classifications into the analysis, models below use data from July, in which there is no mixing of activity and inactivity. ```{r} #| fig-width: 8 #| fig-height: 4 #| fig-align: center #| fig-cap: "Figure: July data across locations." # filter to month of july for example modeling analysis <- raw_analysis |> filter(mo == 'Jul') # plot analysis |> left_join(seasons, join_by(loc, day)) |> ggplot(aes(y = tb, color = season)) + facet_wrap(~loc, nrow = 2) + geom_spaghetti(alpha = 0.2, linewidth = 0.1) + scale_x_continuous(breaks = seq(0, 24, by = 6)) + scale_color_manual(values = c('red', 'black')) + labs(x = NULL, y = NULL) + guides(color = guide_none()) + ggthm ``` ## Model development Preliminary modeling follows the MIMS profiles case study in [Chapter 5 of FDA with R](https://functionaldataanalysis.org/chapter_05.html) illustrating function-on-scalar regression. ### Latitude as categorical For a starting point tracking closely with the reference above, consider binning latitude at below/above 40 degrees and fitting a functional linear model with a single (scalar) categorical predictor. $$ Y_i(t) = \beta_0(t) + \beta_1(t)\mathbf{I}\{\text{latitude}_i > 40\} + \epsilon_i(t) \qquad t \in [0, 24) $$ Here $Y_i(t)$ denotes the internal body temperature of the $i$th observation (snake and day) at time $t$. Errors are assumed independent for $i = 1, \dots, n$. While this approach accounts for residual correlation across the function domain (*e.g.*, correlation between $\epsilon_i(t_{i1})$ and $\epsilon_i(t_{i2})$), it **does not account for within-snake or within-location correlations arising from the multilevel structure of the data**. The standard errors returned by the model are thus likely severely underestimated, so the visualization below shows $\hat{\beta}_j(t)\pm 10SE$ as a crude inflation of uncertainty. ```{r} #| fig-width: 8 #| fig-height: 4 #| fig-align: center #| fig-cap: "Figure: Coefficient estimates." # format for model fitting gam_data <- analysis |> mutate(lat.fct = cut(lat, breaks = c(30, 40, 51)), lat.ind = as.numeric(lat.fct) - 1) |> select(fid, lat.ind, tb) |> tf_unnest(tb) |> rename(lat = lat.ind, t = tb_arg, tb = tb_value) # fit gam fit1 <- gam(tb ~ s(t) + s(t, by = lat), data = gam_data) # residual fpca fpca_out <- gam_data |> select(-lat, -tb) |> bind_cols(resid = resid(fit1)) |> tf_nest(resid, .id = 'fid', .arg = t) %>% refundr::rfr_fpca('resid', ., center = F, lambda = 50) # extract first fpc tvals <- gam_data |> distinct(t) |> pull(t) |> sort() fpc_df <- tibble(resid = tfd(fpca_out$efunctions[,1])) |> tf_unnest(resid) |> bind_cols(t = tvals) |> rename(fpc = resid_value) |> select(t, fpc) # append to gam data gamm_data <- gam_data |> left_join(fpc_df, join_by(t)) # fit full model fit2 <- bam(tb ~ s(t, fx = T) + s(t, by = lat, fx = T) + s(fid, bs = 're', by = fpc), data = gamm_data, discrete = T, method = 'fREML') # visualize coefficients beta.grid <- fpc_df |> mutate(lat = 1, fid = 267) %>% predict(fit2, newdata = ., type = 'terms', se.fit = T) beta.grid.tfd <- beta.grid$fit[, 1:2] |> t() |> tfd(arg = tvals) beta.grid.se.tfd <- beta.grid$se.fit[, 1:2] |> t() |> tfd(arg = tvals) tibble(term = c('intercept', 'latitude'), coef = beta.grid.tfd + append(coef(fit2)[1], 0), se = beta.grid.se.tfd) |> mutate(upr = coef + 10*se, lwr = coef - 10*se) |> ggplot() + facet_wrap(~term, scales = 'free') + geom_spaghetti(aes(y = coef), linewidth = 0.3, color = 'black', alpha = 1) + geom_errorband(aes(ymax = upr, ymin = lwr), fill = 'black', alpha = 0.3) + scale_x_continuous(breaks = seq(0, 24, by = 6)) + labs(y = expr(paste(hat(beta)[j], '(t)', sep = ''))) + theme_bw() + theme(panel.grid.minor = element_blank(), panel.grid.major = element_line(color = 'black', linewidth = 0.1)) ``` Here the coefficients are interpreted pointwise in $t$ much like in a scalar linear model: the intercept gives the mean for low-latitude sites and the slope gives the mean difference between high-latitude and low-latitude sites; however, unlike in the linear model, both are estimated as continuous functions of time of day. For example: - the estimated mean internal body temperature at 6am ($t = 6$) is 23.79 degrees centigrade - the estimated mean internal body temperature at 2pm ($t = 14$) is 28.90 degrees centigrade - mean internal body temperature high-latitude sites is estimated to be 3.94 degrees centigrade *lower* than at low-altitude sites at 6am ($t = 6$) - mean internal body temperature high-latitude sites is estimated to be 0.37 degrees centigrade *higher* than at low-altitude sites at 6am ($t = 14$) ```{r} tibble(term = c('intercept', 'latitude'), coef = beta.grid.tfd + append(coef(fit2)[1], 0), se = beta.grid.se.tfd) |> tf_unnest(everything()) |> filter(coef_arg %in% c(6, 14)) |> select(-se_arg) |> arrange(coef_arg) |> pander(digits = 4, caption = 'Table: example pointwise coefficient estimates at 6am and 1pm.') ``` ### Latitude as continuous Treating latitude as a continuous scalar variable gives the model: $$ Y_i(t) = \beta_0(t) + \beta_1(t)\text{latitude}_i + \epsilon_i(t) $$ Specification is otherwise identical to that above, and still does not account for multilevel structure in the data. For interpretability, latitude is centered and scaled across locations before model fitting. ```{r} # unique latitudes lats <- analysis |> group_by(loc) |> summarize(lat = mean(lat)) |> mutate(lat.z = scale(lat)) # format for model fitting gam_data <- analysis |> select(fid, loc, tb) |> left_join(lats, join_by(loc)) |> select(-lat) |> tf_unnest(tb) |> rename(t = tb_arg, tb = tb_value) # fit gam fit1 <- gam(tb ~ s(t) + s(t, by = lat.z), data = gam_data) # residual fpca fpca_out <- gam_data |> select(-lat.z, -tb) |> bind_cols(resid = resid(fit1)) |> tf_nest(resid, .id = 'fid', .arg = t) %>% refundr::rfr_fpca('resid', ., center = F, lambda = 50) # extract first fpc tvals <- gam_data |> distinct(t) |> pull(t) |> sort() fpc_df <- tibble(resid = tfd(fpca_out$efunctions[,1])) |> tf_unnest(resid) |> bind_cols(t = tvals) |> rename(fpc = resid_value) |> select(t, fpc) # append to gam data gamm_data <- gam_data |> left_join(fpc_df, join_by(t)) # fit full model fit2 <- bam(tb ~ s(t, fx = T) + s(t, by = lat.z, fx = T) + s(fid, bs = 're', by = fpc), data = gamm_data, discrete = T, method = 'fREML') ``` In this model, the intercept parameter $\beta_0(t)$ now gives the mean internal body temperature at the mean latitude `r mean(lats$lat) |> round(3)` as a function of time of day, and the slope parameter $\beta_1(t)$ scales with latitude to give the change in mean internal body temperature associated with a shift in latitude. ```{r} #| fig-width: 8 #| fig-height: 4 #| fig-align: center #| fig-cap: "Figure: Coefficient estimates." # latitude summary stats lat.m <- attr(lats$lat.z, 'scaled:center') lat.sd <- attr(lats$lat.z, 'scaled:scale') # visualize coefficients beta.grid.in <- tibble(lat.z = seq_range(lats$lat.z, n = 6), fid = 267, data = list(fpc_df)) |> unnest(data) beta.grid.out <- beta.grid.in %>% predict(fit2, newdata = ., type = 'terms', se.fit = T) beta.grid.tfd <- beta.grid.in |> bind_cols(beta.grid.out$fit) |> select(t, lat.z, `s(t)`, `s(t):lat.z`) |> tf_nest(starts_with('s'), .id = 'lat.z', .arg = 't') |> mutate(lat = lat.sd*lat.z + lat.m, `s(t)` = `s(t)` + coef(fit2)[1]) |> select(-lat.z) |> pivot_longer(starts_with('s')) |> distinct(name, value, .keep_all = T) |> mutate(term = factor(name, labels = c('intercept', 'latitude')) |> as.character()) |> select(term, lat, value) |> rename(coef = value) beta.grid.se.tfd <- beta.grid.in |> bind_cols(beta.grid.out$se.fit) |> select(t, lat.z, `s(t)`, `s(t):lat.z`) |> tf_nest(starts_with('s'), .id = 'lat.z', .arg = 't') |> mutate(lat = lat.sd*lat.z + lat.m) |> select(-lat.z) |> pivot_longer(starts_with('s')) |> distinct(name, value, .keep_all = T) |> mutate(term = factor(name, labels = c('intercept', 'latitude')) |> as.character()) |> select(term, lat, value) |> rename(se = value) p1 <- inner_join(beta.grid.tfd, beta.grid.se.tfd, join_by(term, lat)) |> mutate(upr = coef + 4*se, lwr = coef - 4*se) |> filter(term == 'latitude') |> ggplot() + geom_spaghetti(aes(y = coef, color = lat), linewidth = 0.5, alpha = 1) + geom_errorband(aes(ymax = upr, ymin = lwr, fill = lat), alpha = 0.3) + scale_x_continuous(breaks = seq(0, 24, by = 6)) + scale_color_gradient2(low = 'blue', mid = 'lightgrey', high = 'red', midpoint = lat.m, limits = lat.m + 3*c(-1, 1)*lat.sd) + scale_fill_gradient2(low = 'blue', mid = 'lightgrey', high = 'red', midpoint = lat.m, limits = lat.m + 3*c(-1, 1)*lat.sd) + labs(y = expr(hat(beta)[1](t) %*% 'latitude'[i])) + ggthm + guides(color = guide_colorbar(title = 'latitude'), fill = guide_colorbar(title = 'latitude')) p2 <- inner_join(beta.grid.tfd, beta.grid.se.tfd, join_by(term, lat)) |> mutate(upr = coef + 10*se, lwr = coef - 10*se) |> filter(term == 'intercept') |> ggplot() + geom_spaghetti(aes(y = coef), linewidth = 0.5, alpha = 1) + geom_errorband(aes(ymax = upr, ymin = lwr), alpha = 0.3) + scale_x_continuous(breaks = seq(0, 24, by = 6)) + labs(y = expr(paste(hat(beta)[0], '(t)', sep = ''))) + ggthm p2 + p1 + plot_layout(nrow = 1) ```