Rattlesnake energetics project

Analysis of internal body temperatures

Updated

July 7, 2025

1 Data import

To do: review and streamline Nicole’s work on data cleaning and Allen’s work on DAYMET integration here after the directory/file cleanup.

1.1 Internal body temperatures
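
Pending that review, a minimal import sketch that would produce the `raw` object used in later sections; the file path and file format are assumptions, and the column names (loc, snake_id, datetime, tb, and the site/snake attributes used in Section 4) are assumed to be present in the cleaned file.

Code
# placeholder import sketch (path and format are assumptions pending cleanup);
# readr parses the ISO-format datetime column automatically
library(tidyverse)
library(here)

raw <- read_csv(here('data/internal-body-temps.csv')) |>
  arrange(loc, snake_id, datetime)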

1.2 DAYMET weather data
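
Pending review of Allen’s integration, a sketch of one way to pull daily DAYMET minimum and maximum temperatures per site with the daymetr package; the site longitude column, the year range, and the join back to `raw` are assumptions and may differ from the existing workflow.

Code
# placeholder sketch: daily DAYMET tmin/tmax per site via daymetr
# (assumes one row per site with lat/lon; year range is illustrative)
library(daymetr)

sites <- raw |> distinct(loc, lat, lon)

daymet <- sites |>
  mutate(dl = pmap(list(loc, lat, lon),
                   \(s, la, lo) download_daymet(site = s, lat = la, lon = lo,
                                                start = 2014, end = 2021,
                                                internal = TRUE)$data)) |>
  select(loc, dl) |>
  unnest(dl) |>
  transmute(loc, yr = year, day = yday,
            tmin = tmin..deg.c., tmax = tmax..deg.c.)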

2 Quality control

To do: translate Allen’s work on outlier detection for this section.
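
As a placeholder until that translation, a simple rule-based sketch (not Allen’s method) that flags implausible readings and abrupt jumps; the thresholds are illustrative only.

Code
# placeholder QC sketch (not Allen's method): flag implausible values and jumps
qc_flags <- raw |>
  arrange(snake_id, datetime) |>
  group_by(snake_id) |>
  mutate(out_range = tb < -5 | tb > 45,          # illustrative plausible range, deg C
         out_jump = abs(tb - lag(tb)) > 10) |>   # illustrative jump threshold, deg C
  ungroup() |>
  filter(out_range | out_jump)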

3 Active season identification

Inference of active season dates relies only on location and internal body temperature data.

Code
# read in data and clean up columns
raw_season <- raw |> select(loc, snake_id, datetime, tb)

# preview
raw_season |>
  head() |>
  pander("Table: example data rows for season identification.")
Table: example data rows for season identification.
loc        snake_id   datetime               tb
---------  ---------  -------------------  ------
Eatonton   Arnold     2021-10-08 23:16:06   22.01
Eatonton   Arnold     2021-10-09 00:00:06   21.89
Eatonton   Arnold     2021-10-09 01:00:06   22.01
Eatonton   Arnold     2021-10-09 02:00:06   22.01
Eatonton   Arnold     2021-10-09 03:00:06   22.01
Eatonton   Arnold     2021-10-09 04:00:06   22.01

3.1 Diurnal variation

On days when snakes are inactive, the variance in internal body temperatures is expected to be low. Daily standard deviation series across the days of the year are shown below for each snake, faceted by location; locations are ordered by latitude.

Code
# compute daily variance per snake and convert to functional data
dvar_indiv <- raw_season |>
  group_by(snake_id, 
           loc, 
           yr = year(datetime), 
           day = yday(datetime)) |>
  summarize(tb.sd = sd(tb), .groups = 'drop') |>
  mutate(fid = consecutive_id(snake_id, loc, yr)) |>
  tf_nest(tb.sd, .id = 'fid', .arg = 'day') 

# plot daily variance series
dvar_indiv |>
  ggplot(aes(y = tb.sd)) +
  geom_spaghetti(alpha = 0.6, linewidth = 0.2) +
  facet_wrap(~loc) +
  labs(y = 'SD(temperature)', x = 'day of year') +
  ggthm

Some series are quite short and the data are noisy. Snakes with 30 or fewer days of data are removed, and the remaining series are smoothed with a simple 7-day rolling average.

Code
# drop cases with fewer than 30d and smooth
dvar_indiv |>
  mutate(tfe = tf_count(tb.sd)) |>
  unnest(tfe) |>
  filter(tfe > 30) |>
  mutate(tb.sd = tf_smooth(tb.sd, 'rollmean', k = 7)) |>
  ggplot(aes(y = tb.sd)) +
  geom_spaghetti(alpha = 0.6, linewidth = 0.2) +
  facet_wrap(~loc) +
  labs(y = 'SD(temperature)', x = 'day of year') +
  ggthm

3.2 Location averages

The smoothed series are aggregated by location (i.e., across snakes and years). The resulting series show the median diurnal variation in body temperature at a given locale on a given day of the year. Note that missing data are replaced by linear interpolation.

Code
# same as above then aggregate by location
dvar_indiv |>
  mutate(tfe = tf_count(tb.sd)) |>
  unnest(tfe) |>
  filter(tfe > 30) |>
  mutate(tb.sd = tf_smooth(tb.sd, 'rollmean', k = 7)) |>
  group_by(loc) |>
  mutate(tb.sd = median(tb.sd, depth = 'pointwise', na.rm = T)) |>
  ggplot(aes(y = tb.sd)) +
  geom_spaghetti() +
  facet_wrap(~loc) +
  labs(y = 'median SD(temperature)', 
       x = 'day of year') +
  ggthm

3.3 Changepoint detection

Mean changepoint detection via binary segmentation does a fairly good job of identifying apparent start and end dates for the active season at each location. Noise in the data produces some superfluous changepoints; nonetheless, a simple threshold on the inferred means at 1.5 (dashed horizontal line) adequately identifies start and end dates.

Code
# store series
dvar_loc <- dvar_indiv |>
  mutate(tfe = tf_count(tb.sd)) |>
  unnest(tfe) |>
  filter(tfe > 30) |>
  select(-tfe) |>
  mutate(tb.sd = tf_smooth(tb.sd, 'rollmean', k = 7)) |>
  group_by(loc) |>
  mutate(tb.sd = median(tb.sd, depth = 'pointwise', na.rm = T)) |>
  distinct(loc, tb.sd) |>
  ungroup() |>
  tf_unnest(tb.sd) |>
  rename(day = tb.sd_arg) |>
  nest(data = c(day, tb.sd_value))

# # for testing
# j <- 6
# .vals <- dvar_loc$data[[j]]$tb.sd_value
# .args <- dvar_loc$data[[j]]$day

# wrapper around changepoint detection
cp.fn <- function(.args, .vals, return = 'means', q = 4){
  
  # estimate changepoints via binary segmentation
  cpt.out <- cpt.mean(.vals,
                      method = 'BinSeg', Q = q) 
  cp.ix <- cpts(cpt.out) 
  cp.x <- cpt.out@param.est$mean
  
  # option 1: output estimated mean for each day
  if(return == 'means'){
    out <- cut(.args, breaks = .args[c(1, cp.ix, length(.args))],
               labels = round(cp.x, 5), right = F, include.lowest = T) |>
      as.character() |>
      as.numeric()
  }
  
  # option 2: output days of year for each change and change type
  if(return == 'points'){
    cp.delta <- diff(cp.x)
    out <- tibble(day = .args[cp.ix],
                  delta = cp.delta,
                  type = if_else(cp.delta > 0, 'increase', 'decrease'))
  }
  return(out)
}

# # for testing
# cp.fn(.args, .vals)

# compute changepoints
changepts <- dvar_loc |>
  mutate(tb.sd_mean = map(data, ~cp.fn(.x$day, .x$tb.sd_value, q = 3))) |>
  unnest(everything())

# inspect changepoints and threshold means
changepts |>
  ggplot(aes(x = day, y = tb.sd_value)) +
  geom_path() +
  geom_path(aes(y = tb.sd_mean), 
            color = 'red', 
            linewidth = 0.5, 
            alpha = 0.5) +
  facet_wrap(~loc) +
  geom_hline(aes(yintercept = 1.5), 
             linetype = 'dashed', 
             linewidth = 0.25) +
  labs(y = 'median SD(temperature)', 
       x = 'day of year') +
  ggthm

Comments:

  1. In Lordsburg, there is a changepoint in the middle of the end-of-season transition period; the threshold was set to exclude the tapering off. This is easily changed by reducing the threshold to 1.25.

  2. See plots below. In Sedgwick and Montana de Oro, inferred start dates are unreliable due to missing/sparse data.

Code
# inspect sedgwick and mdo
cpt_means <- changepts |>
  filter(loc %in% c('Montana', 'Sedgwick')) |>
  distinct(loc, day, tb.sd_mean) |>
  tf_nest(tb.sd_mean, .id = 'loc', .arg = 'day')

dvar_indiv |>
  filter(loc %in% c('Montana', 'Sedgwick')) |>
  mutate(tfe = tf_count(tb.sd)) |>
  unnest(tfe) |>
  filter(tfe > 30) |>
  mutate(tb.sd = tf_smooth(tb.sd, 'rollmean', k = 7)) |>
  select(-tfe) |>
  ggplot(aes(y = tb.sd)) +
  geom_spaghetti(linewidth = 0.25, 
                 alpha = 0.5) +
  geom_spaghetti(aes(y = tb.sd_mean), 
                 data = cpt_means, 
                 color = 'red', 
                 linewidth = 0.5, 
                 alpha = 0.5) +
  facet_wrap(~loc) +
  geom_hline(aes(yintercept = 1.5), 
             linetype = 'dashed', 
             linewidth = 0.25) +
  labs(y = 'SD(temperature)', 
       x = 'day of year') +
  ggthm

Figure: Inferred changepoints and individual-level diurnal variation series for MdO and Sedgwick.

In Montana de Oro, data were collected from mid-April 2017 to late February 2018 (with an earlier deployment in 2014); a single snake appeared to be active for about a week, from approximately 2/2/18 until 2/9/18, and then reverted to inactivity until the study endpoint on 2/21/18. The inferred changepoint is simply the start of data collection and ignores the single active snake from early 2018.

In Sedgwick, data were collected from mid-May 2015 to late March 2016, but only one snake was recorded during the 2016 calendar year; the inferred changepoint is the time at which that snake became active. Notably, in a later year (2017), data are available for a single snake starting in mid-April, and appear to show that snake as inactive until late April.

Code
# inspect start and end dates by year for mdo and sedgwick
raw_season |>
  filter(loc %in% c('Montana', 'Sedgwick')) |>
  group_by(loc, yr = year(datetime)) |>
  summarize(first.obs = min(datetime) |> date(),
            last.obs = max(datetime) |> date()) |>
  pander(caption = "Table: Sampling dates for MdO and Sedgwick.")
Table: Sampling dates for MdO and Sedgwick.
loc       yr    first.obs   last.obs
--------  ----  ----------  ----------
Sedgwick  2015  2015-05-16  2015-12-31
Sedgwick  2016  2016-01-01  2016-03-24
Sedgwick  2017  2017-04-15  2017-12-11
Montana   2014  2014-04-16  2014-10-07
Montana   2017  2017-04-15  2017-12-31
Montana   2018  2018-01-01  2018-02-21

Options in these locations:

  1. manually specify season start dates (see the sketch below)
  2. use inferred changepoints as heuristic only
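
For reference, option (1) could look roughly like the sketch below, which replaces the inferred classification with hand-specified season windows for the problem locations; the start/end days shown are placeholders, not recommendations.

Code
# sketch of option (1): hand-specified active-season windows (placeholder values)
manual_seasons <- tribble(
  ~loc,       ~start_day, ~end_day,
  'Sedgwick',        105,      298,
  'Montana',         110,      300
)

seasons_manual <- manual_seasons |>
  expand_grid(day = 1:365) |>
  mutate(season = if_else(day >= start_day & day <= end_day, 'active', 'inactive')) |>
  select(loc, day, season)

If used, these rows would replace the corresponding locations in the `seasons` table constructed below.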

Tentatively, active season classifications are assigned using option (2). These are shown below for all locations with inferred changes (i.e., excluding Rodeo, McIntyre, and Chimineas, where all data were classified as observed during the active season). Note again that locations are ordered by (increasing) latitude.

Code
# classify days of year as active/inactive
seasons <- dvar_loc |>
  mutate(tb.sd_mean = map(data, ~cp.fn(.x$day, .x$tb.sd_value, q = 3))) |>
  unnest(everything()) |>
  mutate(season = if_else(tb.sd_mean > 1.5, 'active', 'inactive')) |>
  select(loc, day, season)

# display as table
seasons |>
  mutate(lseason = lag(season)) |>
  drop_na() |>
  filter(season != lseason, day > 1) |>
  mutate(date = parse_date_time(day, orders = 'j'),
         date = paste(month(date, label = T, abbr = T), mday(date))) |>
  select(loc, date, season) |>
  group_by(loc) |>
  add_count() |>
  filter(n > 1) |>
  select(-n) |>
  spread(season, date) |>
  rename(start = active, end = inactive, location = loc) |>
  pander(caption = 'Table: Inferred change points.')
Table: Inferred change points.
location     start   end
-----------  ------  ------
Youngblood   Mar 26  Nov 10
Indio        Mar 20  Nov 11
Lordsburg    Mar 30  Nov 12
Eatonton     Mar 15  Oct 26
Sedgwick     Feb 17  Oct 25
Vandenberg   May 6   Oct 16
Montana      Apr 19  Oct 27
Rattlesnake  May 4   Sep 22
Kamloops     Mar 26  Sep 27
AB           May 1   Sep 8

4 Statistical modeling

Note: under development!

Fifty-two snakes were identified as having multiple observations recorded at the same timestamp on at least one day. Such days are tentatively removed, but this issue should be resolved during data cleanup.

The analysis developed here uses latitude, elevation, and temperature range as site-level attributes and internal body temperature and mass as snake-level attributes. Internal temperatures are converted to daily functional observations on the 0-24h domain. For each snake, days with sparse data (10 or fewer observations) are removed from the analysis.

Code
# simultaneous observations to remove
remove <- raw |>
  mutate(yr = year(datetime),
         day = yday(datetime),
         t = hour(datetime) + minute(datetime)/60) |>
  count(snake_id, yr, day, t) |>
  filter(n > 1) |>
  distinct(snake_id, yr, day, t)

# subset data for analysis
raw_analysis <- raw |>
  mutate(elev = (elevation_low + elevation_high)/2) |>
  rename(mass = mass_grams) |>
  select(loc, lat, elev, snake_id, mass, datetime, tmin, tmax, tb) |>
  mutate(yr = year(datetime),
         mo = month(datetime, label = T, abbr = T),
         day = yday(datetime), 
         t = hour(datetime) + minute(datetime)/60) |>
  anti_join(remove, join_by(snake_id, yr, day, t)) |>
  mutate(fid = consecutive_id(snake_id, loc, yr, day)) |>
  select(-datetime) |>
  tf_nest(tb, .id = 'fid', .arg = 't', domain = c(0, 24)) |>
  filter(tf_count(tb) > 10)

# plot with season classification
alldata_panels <- raw_analysis |>
  left_join(seasons, join_by(loc, day)) |>
  filter(!is.na(season)) |>
  ggplot(aes(y = tb, color = season)) +
  facet_grid(mo~loc) +
  geom_spaghetti(alpha = 0.25, linewidth = 0.1) +
  scale_x_continuous(breaks = seq(0, 24, by = 6)) +
  scale_color_manual(values = c('red', 'black')) +
  labs(x = NULL, y = NULL) +
  guides(color = guide_legend(position = 'top', 
                              override.aes = list(alpha = 1, linewidth = 0.5))) +
  ggthm 

# save
ggsave(alldata_panels, filename = here('img/all-panels.png'), 
       width = 12, height = 9, dpi = 400, units = 'in')

# display
alldata_panels

Figure: Diurnal fluctuations in body temperatures across all locations and months; each curve represents data on a single snake on a single day.

Recall that season classification is determined by location, not individually; some observations appear mismatched because individual snakes are active or inactive outside of the usual window for their location. While we consider whether, and if so how, to incorporate season classifications into the analysis, the models below use data from July, during which there is no mixing of activity and inactivity.

Code
# filter to month of july for example modeling
analysis <- raw_analysis |>
  filter(mo == 'Jul')

# plot
analysis |>
  left_join(seasons, join_by(loc, day)) |>
  ggplot(aes(y = tb, color = season)) +
  facet_wrap(~loc, nrow = 2) +
  geom_spaghetti(alpha = 0.2, linewidth = 0.1) +
  scale_x_continuous(breaks = seq(0, 24, by = 6)) +
  scale_color_manual(values = c('red', 'black')) +
  labs(x = NULL, y = NULL) +
  guides(color = guide_none()) +
  ggthm 

Figure: July data across locations.

4.1 Model development

Preliminary modeling follows the MIMS profiles case study in Chapter 5 of Functional Data Analysis with R, which illustrates function-on-scalar regression.

4.1.1 Latitude as categorical

As a starting point that tracks closely with the reference above, consider binning latitude as below/above 40 degrees and fitting a functional linear model with a single (scalar) categorical predictor.

\[ Y_i(t) = \beta_0(t) + \beta_1(t)\mathbf{I}\{\text{latitude}_i > 40\} + \epsilon_i(t) \qquad t \in [0, 24) \]

Here \(Y_i(t)\) denotes the internal body temperature of the \(i\)th observation (snake and day) at time \(t\). Errors are assumed independent for \(i = 1, \dots, n\).

This approach accounts for residual correlation across the function domain (e.g., correlation between \(\epsilon_i(t_{i1})\) and \(\epsilon_i(t_{i2})\)): in the chunk below, the leading functional principal component of the residuals from an initial fit is included as an observation-level random effect in a refit. It does not, however, account for within-snake or within-location correlations arising from the multilevel structure of the data. The standard errors returned by the model are thus likely severely underestimated, so the visualization below shows \(\hat{\beta}_j(t)\pm 10SE\) as a crude inflation of uncertainty.

Code
# format for model fitting
gam_data <- analysis |>
  mutate(lat.fct = cut(lat, breaks = c(30, 40, 51)),
         lat.ind = as.numeric(lat.fct) - 1) |>
  select(fid, lat.ind, tb) |>
  tf_unnest(tb) |>
  rename(lat = lat.ind, t = tb_arg, tb = tb_value)

# fit gam
fit1 <- gam(tb ~ s(t) + s(t, by = lat), data = gam_data)

# residual fpca
fpca_out <- gam_data |>
  select(-lat, -tb) |>
  bind_cols(resid = resid(fit1)) |>
  tf_nest(resid, .id = 'fid', .arg = t) %>%
  refundr::rfr_fpca('resid', ., center = F, lambda = 50)

# extract first fpc
tvals <- gam_data |> distinct(t) |> pull(t) |> sort()
fpc_df <- tibble(resid = tfd(fpca_out$efunctions[,1])) |>
  tf_unnest(resid) |>
  bind_cols(t = tvals) |>
  rename(fpc = resid_value) |>
  select(t, fpc)

# append to gam data
gamm_data <- gam_data |>
  left_join(fpc_df, join_by(t))

# fit full model
fit2 <- bam(tb ~ s(t, fx = T) + s(t, by = lat, fx = T) + s(fid, bs = 're', by = fpc),
            data = gamm_data, discrete = T, method = 'fREML')

# visualize coefficients
beta.grid <- fpc_df |>
  mutate(lat = 1, fid = 267) %>%
  predict(fit2, newdata = ., type = 'terms', se.fit = T)
beta.grid.tfd <- beta.grid$fit[, 1:2] |> t() |> tfd(arg = tvals)
beta.grid.se.tfd <- beta.grid$se.fit[, 1:2] |> t() |> tfd(arg = tvals)
tibble(term = c('intercept', 'latitude'),
       coef = beta.grid.tfd + append(coef(fit2)[1], 0),
       se = beta.grid.se.tfd) |>
  mutate(upr = coef + 10*se, 
         lwr = coef - 10*se) |>
  ggplot() +
  facet_wrap(~term, scales = 'free') +
  geom_spaghetti(aes(y = coef), linewidth = 0.3, color = 'black', alpha = 1) +
  geom_errorband(aes(ymax = upr, ymin = lwr), fill = 'black', alpha = 0.3) +
  scale_x_continuous(breaks = seq(0, 24, by = 6)) +
  labs(y = expr(paste(hat(beta)[j], '(t)', sep = ''))) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major = element_line(color = 'black', linewidth = 0.1))

Figure: Coefficient estimates.

Here the coefficients are interpreted pointwise in \(t\) much like in a scalar linear model: the intercept gives the mean for low-latitude sites and the slope gives the mean difference between high-latitude and low-latitude sites; however, unlike in the linear model, both are estimated as continuous functions of time of day. For example:

  • the estimated mean internal body temperature at low-latitude sites at 6am (\(t = 6\)) is 23.79 degrees centigrade
  • the estimated mean internal body temperature at low-latitude sites at 2pm (\(t = 14\)) is 28.90 degrees centigrade
  • mean internal body temperature at high-latitude sites is estimated to be 3.94 degrees centigrade lower than at low-latitude sites at 6am (\(t = 6\))
  • mean internal body temperature at high-latitude sites is estimated to be 0.37 degrees centigrade higher than at low-latitude sites at 2pm (\(t = 14\))
Code
tibble(term = c('intercept', 'latitude'),
       coef = beta.grid.tfd + append(coef(fit2)[1], 0),
       se = beta.grid.se.tfd) |>
  tf_unnest(everything()) |>
  filter(coef_arg %in% c(6, 14)) |>
  select(-se_arg) |>
  arrange(coef_arg) |>
  pander(digits = 4, caption = 'Table: example pointwise coefficient estimates at 6am and 2pm.')
Table: example pointwise coefficient estimates at 6am and 2pm.
term       coef_arg  coef_value  se_value
---------  --------  ----------  --------
intercept         6      23.79    0.02833
latitude          6     -3.942    0.0925
intercept        14      28.9     0.02973
latitude         14      0.3713   0.09305

4.1.2 Latitude as continuous

Treating latitude as a continuous scalar variable gives the model:

\[ Y_i(t) = \beta_0(t) + \beta_1(t)\text{latitude}_i + \epsilon_i(t) \]

Specification is otherwise identical to that above, and still does not account for multilevel structure in the data. For interpretability, latitude is centered and scaled across locations before model fitting.

Code
# unique latitudes
lats <- analysis |> 
  group_by(loc) |>
  summarize(lat = mean(lat)) |>
  mutate(lat.z = scale(lat))

# format for model fitting
gam_data <- analysis |>
  select(fid, loc, tb) |>
  left_join(lats, join_by(loc)) |>
  select(-lat) |>
  tf_unnest(tb) |>
  rename(t = tb_arg, tb = tb_value)

# fit gam
fit1 <- gam(tb ~ s(t) + s(t, by = lat.z), data = gam_data)

# residual fpca
fpca_out <- gam_data |>
  select(-lat.z, -tb) |>
  bind_cols(resid = resid(fit1)) |>
  tf_nest(resid, .id = 'fid', .arg = t) %>%
  refundr::rfr_fpca('resid', ., center = F, lambda = 50)

# extract first fpc
tvals <- gam_data |> distinct(t) |> pull(t) |> sort()
fpc_df <- tibble(resid = tfd(fpca_out$efunctions[,1])) |>
  tf_unnest(resid) |>
  bind_cols(t = tvals) |>
  rename(fpc = resid_value) |>
  select(t, fpc)

# append to gam data
gamm_data <- gam_data |>
  left_join(fpc_df, join_by(t))

# fit full model
fit2 <- bam(tb ~ s(t, fx = T) + s(t, by = lat.z, fx = T) + s(fid, bs = 're', by = fpc),
            data = gamm_data, discrete = T, method = 'fREML')

In this model, the intercept parameter \(\beta_0(t)\) now gives the mean internal body temperature at the mean latitude (36.197 degrees) as a function of time of day, and the slope parameter \(\beta_1(t)\) gives the change in mean internal body temperature associated with a one standard deviation increase in latitude (recall that latitude was centered and scaled before fitting).
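
Because the model uses standardized latitude, \(\hat{\beta}_1(t)\) is the effect of a one standard deviation increase; dividing by the latitude standard deviation converts it to a per-degree effect. A minimal sketch, reusing fit2, lats, fpc_df, and tvals from the surrounding chunks (the prediction grid mirrors the chunk below; the object name per_degree is just for illustration):

Code
# sketch: convert the standardized slope to a per-degree-of-latitude effect
# (scale() divides by the SD when centering, so sd(lat) recovers the scale factor)
lat.sd <- sd(lats$lat)

per_degree <- tibble(t = tvals, fpc = fpc_df$fpc, lat.z = 1, fid = 267) %>%
  predict(fit2, newdata = ., type = 'terms') %>%
  as_tibble() %>%
  transmute(t = tvals, beta1_per_degree = `s(t):lat.z` / lat.sd)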

Code
# latitude summary stats
lat.m <- attr(lats$lat.z, 'scaled:center')
lat.sd <- attr(lats$lat.z, 'scaled:scale')

# visualize coefficients
beta.grid.in <- tibble(lat.z = seq_range(lats$lat.z, n = 6), 
                       fid = 267, 
                       data = list(fpc_df)) |>
  unnest(data)

beta.grid.out <- beta.grid.in %>%
  predict(fit2, newdata = ., type = 'terms', se.fit = T)

beta.grid.tfd <- beta.grid.in |>
  bind_cols(beta.grid.out$fit) |>
  select(t, lat.z, `s(t)`, `s(t):lat.z`) |>
  tf_nest(starts_with('s'), .id = 'lat.z', .arg = 't') |>
  mutate(lat = lat.sd*lat.z + lat.m,
         `s(t)` = `s(t)` + coef(fit2)[1]) |>
  select(-lat.z) |>
  pivot_longer(starts_with('s')) |>
  distinct(name, value, .keep_all = T) |>
  mutate(term = factor(name, labels = c('intercept', 'latitude')) |> as.character()) |>
  select(term, lat, value) |>
  rename(coef = value)

beta.grid.se.tfd <- beta.grid.in |>
  bind_cols(beta.grid.out$se.fit) |>
  select(t, lat.z, `s(t)`, `s(t):lat.z`) |>
  tf_nest(starts_with('s'), .id = 'lat.z', .arg = 't') |>
  mutate(lat = lat.sd*lat.z + lat.m) |>
  select(-lat.z) |>
  pivot_longer(starts_with('s')) |>
  distinct(name, value, .keep_all = T) |>
  mutate(term = factor(name, labels = c('intercept', 'latitude')) |> as.character()) |>
  select(term, lat, value) |>
  rename(se = value)

p1 <- inner_join(beta.grid.tfd, beta.grid.se.tfd, join_by(term, lat)) |>
  mutate(upr = coef + 4*se, 
         lwr = coef - 4*se) |>
  filter(term == 'latitude') |>
  ggplot() +
  geom_spaghetti(aes(y = coef, color = lat), linewidth = 0.5, alpha = 1) +
  geom_errorband(aes(ymax = upr, ymin = lwr, fill = lat), alpha = 0.3) +
  scale_x_continuous(breaks = seq(0, 24, by = 6)) +
  scale_color_gradient2(low = 'blue', mid = 'lightgrey', high = 'red', 
                        midpoint = lat.m, limits = lat.m + 3*c(-1, 1)*lat.sd) +
  scale_fill_gradient2(low = 'blue', mid = 'lightgrey', high = 'red', 
                       midpoint = lat.m, limits = lat.m + 3*c(-1, 1)*lat.sd) +
  labs(y = expr(hat(beta)[1](t) %*% 'latitude'[i])) +
  ggthm +
  guides(color = guide_colorbar(title = 'latitude'),
         fill = guide_colorbar(title = 'latitude'))

p2 <- inner_join(beta.grid.tfd, beta.grid.se.tfd, join_by(term, lat)) |>
  mutate(upr = coef + 10*se, 
         lwr = coef - 10*se) |>
  filter(term == 'intercept') |>
  ggplot() +
  geom_spaghetti(aes(y = coef), linewidth = 0.5, alpha = 1) +
  geom_errorband(aes(ymax = upr, ymin = lwr), alpha = 0.3) +
  scale_x_continuous(breaks = seq(0, 24, by = 6)) +
  labs(y = expr(paste(hat(beta)[0], '(t)', sep = ''))) +
  ggthm

p2 + p1 + plot_layout(nrow = 1)

Figure: Coefficient estimates.