library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(glue)
library(here)
here() starts at /Users/visuallearninglab/Documents/babyview-developmental-analysis
library(lmerTest)
Loading required package: lme4
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack


Attaching package: 'lmerTest'

The following object is masked from 'package:lme4':

    lmer

The following object is masked from 'package:stats':

    step

Data import

recordings <- read_csv(here("data/recordings_proceessed.csv"))
Rows: 5687 Columns: 40
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (15): subject_id, gopro_video_id, status, supersed_gcp_name_dec24, supe...
dbl  (11): duration_sec, duration_hrs, hours_of_data, weeks_old_at_onboardin...
lgl   (6): date_is_inferred, blackout_region, release, superseded_blackout_c...
dttm  (1): date_time
date  (6): date, logging_date, pipeline_run_date, monday_of_recording_week, ...
time  (1): start_time

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_locations_avg <- read_csv(here("intermediates/merged_locations_harmonic.csv"))
Rows: 3821320 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): superseded_gcp_name_feb25, frame_num, location_options, location_pr...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_objects_loc_cleaned <- read_csv(here("intermediates/objects_cleaned.csv"))
Rows: 25778515 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): superseded_gcp_name_feb25, time_in_extended_iso, class_name, origin...
dbl (8): ...1, frame_id, xmin, ymin, xmax, ymax, confidence, masked_pixel_count
lgl (1): frame_number

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Constants and helpers

Setting a minimum number of continuous frames to count a location as a ‘stable’ location. 10 frames = 10 seconds.

LOCATION_LENGTH <- 10

avg_age <- function(df) {
  df |> mutate(age_avg = str_extract_all(age_bin, "\\d+") %>% 
      lapply(as.numeric) %>% 
      sapply(function(x) mean(x)))
}

bin_age <- function(df) {
  df |> mutate(age_bin = case_when(
    age < 12*30 ~ "5-12",
    age < 18*30 ~ "12-18",
    age < 24*30 ~ "18-24",
    age < 30*30 ~ "24-30",
    age < 36*30 ~ "30-36")) |> group_by(age_bin, subject_id) |> filter(!is.na(age_bin)) |>
  mutate(total_count = n(), 
         age_bin = factor(age_bin, levels = c("5-12", "12-18", "18-24", "24-30", "30-36")))
}

weighted_ci_normal_df <- function(df, value_col, weight_col, group_col = NULL, conf_level = 0.95) {
  z <- qnorm(1 - (1 - conf_level) / 2)
  
  # Group if needed, otherwise treat as single group
  if (!is.null(group_col)) {
    df <- df %>% group_by(.data[[group_col]])
  }
  
  df %>%
    summarise(
      weighted_mean = weighted.mean(.data[[value_col]], .data[[weight_col]], na.rm = TRUE),
      w_var = sum(.data[[weight_col]] * (.data[[value_col]] - weighted_mean)^2, na.rm = TRUE) / sum(.data[[weight_col]], na.rm = TRUE),
      w_se = sqrt(w_var / n()),
      ci_lower = weighted_mean - z * w_se,
      ci_upper = weighted_mean + z * w_se,
      n_group = n(),
      .groups = 'drop'
    )
}

summarized_data <- function(data, x_var, y_var, group_var) {
  return(data %>%
           group_by_at(c(x_var, group_var)) %>%
           summarise(mean_value = mean(.data[[y_var]], na.rm = TRUE),
                     sd_value = sd(.data[[y_var]], na.rm = TRUE),
                     n = n(),
                     se = sd_value / sqrt(n()),
                     ci_lower = mean_value - qt(1 - (0.05 / 2), n - 1) * se,
                     ci_upper = mean_value + qt(1 - (0.05 / 2), n - 1) * se,
                     .groups = 'drop')
  )
}

plot_subject_breakdown <- function(df, subject_df, x_var, y_var, group, input_title, x_lab, y_lab, use_size = TRUE, use_line=TRUE) {
  p <- ggplot(data = df, aes(x = .data[[x_var]], y = weighted_mean, group = 1))
  
  if (use_size && "total_hours" %in% names(subject_df)) {
    p <- p + geom_jitter(
      data = subject_df,
      aes(x = .data[[x_var]], y = .data[[y_var]], color = .data[[group]], size = total_hours),
      width = 0.1, height = 0, alpha = 0.5
    )
  } else {
    p <- p + geom_jitter(
      data = subject_df,
      aes(x = .data[[x_var]], y = .data[[y_var]], color = .data[[group]]),
      width = 0.1, height = 0, alpha = 0.5
    )
  }
  
  if (use_line) {
    p <- p + geom_line()
  }
  
  p  +
    geom_point(size = 3) +
    geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.1) +
    labs(
      title = input_title,
      x = x_lab,
      y = y_lab
    ) +
    theme_minimal() +
    guides(color = "none")
}

Location switches

locations_age <- all_locations_avg |>
   left_join(recordings) |>
  filter(!is.na(location)) |>
  select(-location) |>
  rename(location = locations_weighted_harmonic) |>
  bin_age() |>
  rename(total_frame_count = total_count)
Joining with `by = join_by(superseded_gcp_name_feb25)`

prop of locations across age

num_hours_loc_age <- locations_age |>
  group_by(subject_id, location, age_bin) |>
  summarize(total_hours = n()/3600, .groups = "drop")

location_props <- locations_age |>
  ungroup() |>
  summarize(prop = n() / first(total_frame_count), .by = c(subject_id, age_bin, location))

location_summary <- location_props |>
  left_join(num_hours_loc_age, by = c("subject_id", "location", "age_bin")) |>
  group_by(age_bin, location) |>
  group_modify(~ weighted_ci_normal_df(
    .x,
    value_col = "prop",
    weight_col = "total_hours",
    group_col = NULL
  )) |>
  ungroup()

ggplot(location_summary, aes(x = age_bin, y = weighted_mean, color = location)) +
  geom_smooth(method="lm", aes(group = location),
    alpha = 0.5, se=FALSE) +
    geom_point(position = position_dodge(width = 0.3)) +
  geom_errorbar(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = 0.3),
    width = 0.2
  ) +
  labs(
    x = "Age Bin",
    y = "Proportion",
    color = "Location",
    title = "Location Proportions Across Age Bins"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

Cleaning data

Getting location run lengths

Getting the lengths for every stable location span. This is really important for frame detections since location switches are very frequent. Noting that the location in a given frame is already smoothed using probabilities of sliding window of size 5.

rle_id <- function(x) {
  with(rle(x), rep(seq_along(lengths), lengths))
}

loc_lengths <- locations_age |>
arrange(superseded_gcp_name_feb25, frame_num) |>
  group_by(superseded_gcp_name_feb25) |>
  mutate(location_run = rle_id(location), video_frame_count = n()) |>
  group_by(superseded_gcp_name_feb25, location_run, .add = TRUE) |>
  mutate(run_length = n()) |>
  ungroup() |>
  group_by(
    superseded_gcp_name_feb25,
    location_run,
    age_bin,
    run_length,
    subject_id,
    location, 
    total_frame_count,
    video_frame_count
  ) |>
  summarize(lowest_frame_num = min(frame_num), .groups = "drop")

Sanity checking

check_df <- loc_lengths %>%
  distinct(subject_id, age_bin, video_frame_count, superseded_gcp_name_feb25, total_frame_count) %>%
  group_by(subject_id, age_bin) %>%
  summarize(
    sum_video_frame_count = sum(video_frame_count),
    total_frame_count = first(total_frame_count),  # constant per group
    .groups = "drop"
  ) %>%
  mutate(match = sum_video_frame_count == total_frame_count)
stopifnot(all(check_df$match))
loc_participant_hours <- locations_age |>
  group_by(subject_id, location) |>
  summarize(total_hours = n()/3600, .groups = "drop")

loc_lengths_by_subject <- summarized_data(loc_lengths, x_var="location", y_var="run_length", group_var="subject_id")
Warning: There were 14 warnings in `summarise()`.
The first warning was:
ℹ In argument: `ci_lower = mean_value - qt(1 - (0.05/2), n - 1) * se`.
ℹ In group 73: `location = "car"` `subject_id = "recUFk9OvT2aezPX4"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 13 remaining warnings.
loc_lengths_summarized <- summarized_data(loc_lengths, x_var="location", y_var="run_length", group_var = "location") |> rename(weighted_mean = mean_value)

plot_subject_breakdown(loc_lengths_summarized, loc_lengths_by_subject |> filter(mean_value < 400), x_var = "location", y_var = "mean_value", group="subject_id", input_title="Avg location length (points represent individual subjects)", x_lab="Location", y_lab = "Avg seconds", use_size=FALSE, use_line=FALSE)

loc_lengths_capped <- loc_lengths %>%
  mutate(run_length_capped = ifelse(run_length > 100, 100, run_length))

ggplot(loc_lengths_capped, aes(x = run_length_capped)) +
  geom_histogram(bins = 20, fill = "skyblue", color = "black") +
  scale_x_continuous(breaks = seq(0, 100, by = 5)) +
  labs(
    title = "Histogram of Location Run Lengths (Capped at 100 seconds)",
    x = "Seconds",
    y = "Count"
  ) +
  theme_minimal()

Getting locatin switch info

loc_switches_raw <- loc_lengths |>
  group_by(superseded_gcp_name_feb25) |>
  arrange(location_run) |>
  mutate(
    prior_location = lag(location),
    prior_run_length = lag(run_length),
    curr_run_length = run_length
  ) |>
  filter(location != prior_location) |>
  transmute(
    superseded_gcp_name_feb25,
    frame_num=lowest_frame_num,
    prior_location,
    video_frame_count,
    total_frame_count,
    age_bin,
    curr_location = location,
    num_previous = prior_run_length,
    num_future = curr_run_length,
    subject_id,
    unique_pair = glue("{pmin(prior_location, curr_location)}_{pmax(prior_location, curr_location)}")
  ) |> ungroup()
ggplot(loc_switches_raw |> filter(num_previous < 250 & num_future < 250), aes(x=num_previous, y=num_future, color=curr_location)) +
  geom_jitter(alpha=0.3) +
  xlab("Number of seconds in previous location") +
  ylab("Number of seconds in new location") +
  labs(title="Location switches")

loc_switches_plot_by_subject <- function(loc_switches, title="Location switches across age", y="Proportion of location switches") {
  num_loc_switches <- loc_switches |>
  summarize(prop = n()/first(num_distinct_locations), .by=c(age_bin, subject_id)) |>
    group_by(subject_id) |>
    filter(n() > 1) |>
    ungroup() 
  
  ggplot(num_loc_switches, aes(x = age_bin, y = prop, group =subject_id, color=subject_id)) +
  geom_line() +
  geom_point() +
  labs(
    title = title,
    x = "Age Bin (months)",
    y = y
  ) +
  theme_minimal() +
  guides(color="none")
}

Stabilizing locations to be a mininum continuous length

Setting a minimum number of continuous frames to count a location as a ‘stable’ location. 10 frames = 10 seconds. Location switches only count if the child was in the previous location and the new location for at least x seconds. Locations are counted as distinct locations with switching opportunities (the denominator of our proportion calculation) if the child is in that location for at least 10 seconds.

stable_loc_counts <- loc_lengths |>
   filter(run_length >= LOCATION_LENGTH)

stable_video_counts <- loc_lengths |>
  group_by(superseded_gcp_name_feb25) |>
  filter(all(run_length >= LOCATION_LENGTH))

stable_loc_hours <- stable_loc_counts |>
  group_by(age_bin, subject_id, location) |>
  summarize(total_hours = sum(run_length)/3600, .groups="drop")

stable_loc_counts_by_age <- stable_loc_counts |>
  group_by(age_bin) |>
  summarise(num_distinct_locations = n(), avg_location_length = mean(run_length), .groups = "drop")

stable_loc_counts_by_participant <- stable_loc_counts |>
  group_by(age_bin, subject_id) |>
  summarise(num_distinct_locations = n(), avg_location_length = mean(run_length), .groups = "drop")

stable_switches_raw <- loc_switches_raw |> filter(num_previous >= LOCATION_LENGTH & num_future >= LOCATION_LENGTH) |> ungroup()

How much data did we lose?

paste("Total number of frames:",sum(loc_lengths$run_length))
[1] "Total number of frames: 3161209"
paste0("Stabilized frames (continuous frames>", LOCATION_LENGTH, "): ", sum(stable_loc_counts$run_length))
[1] "Stabilized frames (continuous frames>10): 2401506"
paste0("Stabilized frames from videos with only stable frames (continuous frames>", LOCATION_LENGTH, "): ", sum(stable_video_counts$run_length))
[1] "Stabilized frames from videos with only stable frames (continuous frames>10): 104471"
paste0("Number of stable location switches: ", nrow(stable_switches_raw), " and number of total location switches: ", nrow(loc_switches_raw))
[1] "Number of stable location switches: 10489 and number of total location switches: 304044"

Just using stabilized frames

Quick descriptive plots

loc_switches_raw_by_video <- stable_switches_raw |> ungroup() |> summarize(
    n = n(),
    location = names(which.max(table(prior_location))),
    .by = c(superseded_gcp_name_feb25, video_frame_count, subject_id)
  )
loc_switches_raw_by_participant <- summarized_data(loc_switches_raw_by_video, "subject_id", "n", "subject_id")
Warning: There were 2 warnings in `summarise()`.
The first warning was:
ℹ In argument: `ci_lower = mean_value - qt(1 - (0.05/2), n - 1) * se`.
ℹ In group 10: `subject_id = "recLQGWprQ9Qe7hMz"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
ggplot(loc_switches_raw_by_video, aes(x = subject_id, y = n, size = video_frame_count, color = location)) +
  geom_jitter(width = 0.2, height = 0, alpha = 0.2) +
  geom_point(
    data = loc_switches_raw_by_participant,
    aes(x = subject_id, y = mean_value),
    inherit.aes = FALSE,
    size = 2
  ) +
  geom_errorbar(
    data = loc_switches_raw_by_participant,
    aes(x = subject_id, ymin = ci_lower, ymax = ci_upper),
    inherit.aes = FALSE,
    width = 0.1
  ) +
  scale_size_continuous(name = "Video Frame Count") +
  labs(
    title = "Switches per Video by Participant",
    x = "Participant ID",
    y = "Number of Switches"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_cartesian(ylim = c(0, 30))

hist((
  stable_loc_counts |>
    group_by(superseded_gcp_name_feb25) |>
    summarize(num_distinct_loc = n_distinct(location))
)$num_distinct_loc, breaks=8, main = "Distinct Locations per video", xlab="number of locations")

hist((
  stable_loc_counts |>
    group_by(superseded_gcp_name_feb25) |>
    summarize(num_distinct_loc = n())
)$num_distinct_loc, breaks=20, main = "Locations moved per video", xlab="number of locations")

Plotting across age

loc_switches_plot_by_subject(stable_switches_raw |> left_join(stable_loc_counts_by_participant), title=paste0("Location switches across age and subject (location > ",LOCATION_LENGTH, " sec)"), y="Prop. location switches from location")
Joining with `by = join_by(age_bin, subject_id)`

prop_switches <- stable_switches_raw |>
  left_join(stable_loc_counts_by_participant) |>
  summarize(prop = n()/first(num_distinct_locations), .by=c(age_bin, subject_id)) |>
    group_by(subject_id) |>
    ungroup() |>
  avg_age() |> left_join(stable_loc_hours |> summarize(total_hours = sum(total_hours), .by=c(subject_id, age_bin)))
Joining with `by = join_by(age_bin, subject_id)`
Joining with `by = join_by(age_bin, subject_id)`
switches_weighted <- weighted_ci_normal_df(prop_switches, value_col = "prop", weight_col = "total_hours", group_col = "age_bin")

plot_subject_breakdown(switches_weighted, prop_switches, x_var="age_bin", y_var=
"prop", group="subject_id", input_title="Location switches across age (weighted by total hours of data, points represent subjects)",
                                                                      x_lab = "Age Bin (months)", y_lab = "Proportion of switches")

Lmers

loc_switch_model <- lmer(scale(prop) ~ scale(age_avg) + scale(total_hours) +  (1 | subject_id), data = prop_switches)
summary(loc_switch_model)
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: scale(prop) ~ scale(age_avg) + scale(total_hours) + (1 | subject_id)
   Data: prop_switches

REML criterion at convergence: 198.5

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.47558 -0.55003  0.06068  0.49426  2.72896 

Random effects:
 Groups     Name        Variance Std.Dev.
 subject_id (Intercept) 0.3829   0.6188  
 Residual               0.6580   0.8111  
Number of obs: 70, groups:  subject_id, 31

Fixed effects:
                   Estimate Std. Error       df t value Pr(>|t|)
(Intercept)        -0.06635    0.15163 22.15043  -0.438    0.666
scale(age_avg)      0.06291    0.11400 61.04342   0.552    0.583
scale(total_hours) -0.10640    0.11320 59.44095  -0.940    0.351

Correlation of Fixed Effects:
            (Intr) scl(g_)
scale(g_vg) 0.072         
scl(ttl_hr) 0.074  0.134  

Accounting for the variability of affordances offered across locations

stable_loc_counts_by_location_age <- stable_loc_counts |>
  group_by(age_bin, subject_id, location) |>
  summarise(num_distinct_locations = n(), total_run_length = sum(run_length), avg_location_length = mean(run_length), .groups = "drop")

prop_switches_by_loc <- stable_switches_raw |>
  ungroup() |>
  rename(location = prior_location) |>
  left_join(stable_loc_counts_by_location_age) |>
  summarize(prop = n()/first(num_distinct_locations), num_distinct_locations=first(num_distinct_locations), mean_prev_loc_length=mean(num_previous), num_switches = n(), .by=c(age_bin, subject_id, location)) |>
  avg_age() |>left_join(stable_loc_hours) |> filter(total_hours > 1)
Joining with `by = join_by(location, age_bin, subject_id)`
Joining with `by = join_by(age_bin, subject_id, location)`
loc_switch_model <- lmer(scale(prop) ~ scale(age_avg) + scale(total_hours) + (1|subject_id) + (1 | location), data = prop_switches_by_loc)
summary(loc_switch_model)
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: 
scale(prop) ~ scale(age_avg) + scale(total_hours) + (1 | subject_id) +  
    (1 | location)
   Data: prop_switches_by_loc

REML criterion at convergence: 380.2

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.61637 -0.66732 -0.03672  0.53849  2.82290 

Random effects:
 Groups     Name        Variance Std.Dev.
 subject_id (Intercept) 0.2113   0.4596  
 location   (Intercept) 0.5509   0.7422  
 Residual               0.5641   0.7511  
Number of obs: 147, groups:  subject_id, 26; location, 7

Fixed effects:
                    Estimate Std. Error        df t value Pr(>|t|)    
(Intercept)         -0.15558    0.33041   4.45069  -0.471  0.65989    
scale(age_avg)      -0.08693    0.07632 134.17442  -1.139  0.25669    
scale(total_hours)  -0.26725    0.07008 131.72251  -3.813  0.00021 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr) scl(g_)
scale(g_vg) 0.018         
scl(ttl_hr) 0.092  0.059  

GLMER

glmer_model <- glmer(
  cbind(num_switches, num_distinct_locations - num_switches) ~ scale(age_avg) + 
    (1 | subject_id) + (1 | location),
  data = prop_switches_by_loc,
  family = binomial
)
summary(glmer_model)
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: cbind(num_switches, num_distinct_locations - num_switches) ~  
    scale(age_avg) + (1 | subject_id) + (1 | location)
   Data: prop_switches_by_loc

      AIC       BIC    logLik -2*log(L)  df.resid 
   1472.2    1484.2    -732.1    1464.2       143 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.4116 -1.1527 -0.2141  1.3352  7.4871 

Random effects:
 Groups     Name        Variance Std.Dev.
 subject_id (Intercept) 0.04246  0.2061  
 location   (Intercept) 0.07108  0.2666  
Number of obs: 147, groups:  subject_id, 26; location, 7

Fixed effects:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -1.75235    0.12230 -14.328   <2e-16 ***
scale(age_avg) -0.02912    0.01779  -1.636    0.102    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr)
scale(g_vg) 0.034 

Using total time for the proportion of location switches instead of just location visit

prop_switches_by_loc <- stable_switches_raw |>
  ungroup() |>
  rename(location = prior_location) |>
  left_join(stable_loc_counts_by_location_age) |>
  summarize(prop = n()/first(total_run_length), n=n(), num_distinct_locations=first(num_distinct_locations), mean_prev_loc_length=mean(num_previous), total_hours=first(total_run_length)/3600, .by=c(age_bin, subject_id, location)) |>
  avg_age() 
Joining with `by = join_by(location, age_bin, subject_id)`
loc_switch_model <- lmer(scale(prop) ~ scale(age_avg) + scale(total_hours) + (1|subject_id) + (1 | location), data = prop_switches_by_loc)
summary(loc_switch_model)
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: 
scale(prop) ~ scale(age_avg) + scale(total_hours) + (1 | subject_id) +  
    (1 | location)
   Data: prop_switches_by_loc

REML criterion at convergence: 1153.6

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.8265 -0.4527 -0.1508  0.1561  7.3714 

Random effects:
 Groups     Name        Variance  Std.Dev.
 subject_id (Intercept) 0.0007366 0.02714 
 location   (Intercept) 0.2158247 0.46457 
 Residual               0.7946265 0.89142 
Number of obs: 430, groups:  subject_id, 31; location, 10

Fixed effects:
                    Estimate Std. Error        df t value Pr(>|t|)   
(Intercept)          0.16657    0.15514   8.20040   1.074  0.31352   
scale(age_avg)      -0.02169    0.04343  96.85919  -0.499  0.61867   
scale(total_hours)  -0.15801    0.04996 415.28873  -3.163  0.00168 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr) scl(g_)
scale(g_vg) -0.009        
scl(ttl_hr)  0.051  0.037 

Just predicting the raw number of location switches

loc_switch_model <- lmer(scale(n) ~ scale(age_avg) + scale(total_hours) + (1|subject_id) + (1 | location), data = prop_switches_by_loc)
summary(loc_switch_model)
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: scale(n) ~ scale(age_avg) + scale(total_hours) + (1 | subject_id) +  
    (1 | location)
   Data: prop_switches_by_loc

REML criterion at convergence: 718.3

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-6.3398 -0.4867 -0.0587  0.2928  5.5877 

Random effects:
 Groups     Name        Variance Std.Dev.
 subject_id (Intercept) 0.04127  0.2031  
 location   (Intercept) 0.06094  0.2469  
 Residual               0.26779  0.5175  
Number of obs: 430, groups:  subject_id, 31; location, 10

Fixed effects:
                    Estimate Std. Error        df t value Pr(>|t|)    
(Intercept)         -0.13020    0.09245  13.21030  -1.408   0.1821    
scale(age_avg)      -0.06338    0.03031 331.08436  -2.091   0.0373 *  
scale(total_hours)   0.71782    0.03008 421.37010  23.867   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr) scl(g_)
scale(g_vg) 0.051         
scl(ttl_hr) 0.081  0.085  

Types of location switches

ggplot(loc_switches_raw |> filter(num_previous >= LOCATION_LENGTH & num_future >= LOCATION_LENGTH), aes(x = unique_pair)) +
  geom_bar(alpha=0.5) +
   theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Location Pair", y = "Count", title="Number of location switches by unique location switch pair")

stable_loc_counts_by_location <- stable_loc_counts |>
  group_by(location, subject_id) |>
  summarise(num_distinct_locations = n(), total_run_length=sum(run_length), avg_location_length = mean(run_length), .groups = "drop") 

prop_switches_by_loc <- stable_switches_raw |> ungroup() |>
  rename(location = prior_location) |>
  left_join(stable_loc_counts_by_location) |>
  summarize(prop = n()/first(num_distinct_locations), total_hours=(first(avg_location_length)*first(num_distinct_locations))/3600, .by=c(location, subject_id)) 
Joining with `by = join_by(location, subject_id)`
weighted_props <- weighted_ci_normal_df(prop_switches_by_loc, value_col="prop", weight_col="total_hours", group_col="location") 
plot_subject_breakdown(weighted_props, prop_switches_by_loc, x_var="location", y_var="prop", input_title="Proportion of location switches by starting location", group="subject_id", x_lab="Starting location", y_lab="Proportion of switches", use_line=FALSE)

# Calculate transition probabilities 
transition_probs <- stable_loc_counts |>
  # Get total time spent at each location 
  group_by(location) |>
  summarise(total_time_at_location = sum(run_length), num_distinct_locations = n(), .groups = "drop") |>
  # Add switches from each location
  left_join(
    stable_switches_raw |> 
      rename(location = prior_location) |>
      group_by(location, curr_location) |>
      summarise(num_switches = n(), .groups = "drop"),
    by = c("location")
  ) |>
  # Fill in missing combinations with 0 switches
  complete(location, curr_location, fill = list(num_switches = 0)) |>
  # Fill in missing values for combinations that didn't exist in original data
  group_by(location) |>
  fill(total_time_at_location, num_distinct_locations, .direction = "downup") |>
  # Calculate probability of switching to each location
  mutate(
    # Probability = number of switches to destination / number of visits to origin location
    switch_prob = ifelse(is.na(num_distinct_locations) | num_distinct_locations == 0, 
                        0, 
                        num_switches / num_distinct_locations)
  ) 

# Create the heatmap
ggplot(transition_probs, aes(x = curr_location, y = location, fill = switch_prob)) +
  geom_tile(color = "white", size = 0.5) +
  scale_fill_gradient2(
    low = "white", 
    mid = "lightblue", 
    high = "darkblue",
    midpoint = max(transition_probs$switch_prob, na.rm = TRUE) / 2,
    name = "Transition\nProbability"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    axis.text.y = element_text(hjust = 1),
    panel.grid = element_blank(),
    axis.ticks = element_blank()
  ) +
  labs(
    x = "Destination Location", 
    y = "Origin Location", 
    title = "Location Transition Probabilities",
    subtitle = "Probability of moving from origin location to destination location when in origin location"
  ) +
  # Add text labels with the probability values
  geom_text(
    aes(label = sprintf("%.3f", switch_prob)), 
    color = ifelse(transition_probs$switch_prob > max(transition_probs$switch_prob, na.rm = TRUE) * 0.6, "white", "black"),
    size = 3
  )
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Door detections

Cleaning object detections

objects_age <- all_objects_loc_cleaned |> 
  left_join(recordings) |>
  bin_age() |>
  rename(total_object_count = total_count)
Joining with `by = join_by(superseded_gcp_name_feb25)`
age_participant_hours <- objects_age |>
  distinct(subject_id, age_bin, superseded_gcp_name_feb25, .keep_all = TRUE) |>
  group_by(subject_id, age_bin) |>
  summarize(total_hours = sum(duration_hrs), .groups = "drop")

objects_age_prop_by_participant <- objects_age |>
  group_by(age_bin, class_name, subject_id, total_object_count) |>
  summarize(object_count = n(), class_prop = n()/first(total_object_count)) |>
  left_join(age_participant_hours)
`summarise()` has grouped output by 'age_bin', 'class_name', 'subject_id'. You
can override using the `.groups` argument.
Joining with `by = join_by(age_bin, subject_id)`
objects_age_prop <- objects_age_prop_by_participant |> 
  group_by(age_bin, class_name) |>
  summarize(class_prop = sum(object_count) / sum(total_object_count))
`summarise()` has grouped output by 'age_bin'. You can override using the
`.groups` argument.
doors_by_participant  <- objects_age_prop_by_participant |>
   filter(class_name == "door")

doors <- objects_age_prop |>
  filter(class_name == "door")

Sanity checks:

paste("Number of hours:", sum(doors_by_participant$total_hours))
[1] "Number of hours: 878.000269444445"
summary(lm(total_object_count ~ total_hours, data=doors_by_participant))

Call:
lm(formula = total_object_count ~ total_hours, data = doors_by_participant)

Residuals:
    Min      1Q  Median      3Q     Max 
-154786  -22402   -8901   22692  205246 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)    10043       9565    1.05    0.297    
total_hours    22526        591   38.12   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 53370 on 70 degrees of freedom
Multiple R-squared:  0.954, Adjusted R-squared:  0.9534 
F-statistic:  1453 on 1 and 70 DF,  p-value: < 2.2e-16

Plotting by participant across age

ggplot((doors_by_participant |> group_by(subject_id) |> filter(n() > 1)), aes(x=age_bin, y=log(class_prop), group=subject_id, color=subject_id)) +
  geom_line() +
  geom_point() + 
  labs(title = "Doors across age and participant (participants with > 1 age bin)",
       x = "Age Bin (months)",
       y = "Log proportion of Doors") +
  guides(color="none")

Quick lmer model

door_model <- lmer(scale(class_prop)~ scale(age_avg) + scale(total_hours) + (1|subject_id), data=doors_by_participant |> avg_age())
summary(door_model)
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: scale(class_prop) ~ scale(age_avg) + scale(total_hours) + (1 |  
    subject_id)
   Data: avg_age(doors_by_participant)

REML criterion at convergence: 188.1

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.0623 -0.4615 -0.1389  0.2990  2.5578 

Random effects:
 Groups     Name        Variance Std.Dev.
 subject_id (Intercept) 0.5589   0.7476  
 Residual               0.4153   0.6444  
Number of obs: 72, groups:  subject_id, 31

Fixed effects:
                    Estimate Std. Error        df t value Pr(>|t|)
(Intercept)        -0.046556   0.157846 30.107581  -0.295    0.770
scale(age_avg)     -0.003853   0.095432 58.933718  -0.040    0.968
scale(total_hours)  0.089867   0.096327 59.333673   0.933    0.355

Correlation of Fixed Effects:
            (Intr) scl(g_)
scale(g_vg) 0.080         
scl(ttl_hr) 0.086  0.236  

Plotting across age

doors_weighted <- weighted_ci_normal_df(doors_by_participant |> mutate(log_prop = log(class_prop)), value_col = "log_prop", weight_col = "total_hours", group_col = "age_bin")

plot_subject_breakdown(doors_weighted, doors_by_participant |> mutate(log_class_prop = log(class_prop)), x_var="age_bin", y_var=
"log_class_prop", group="subject_id", input_title="Doors across age (log proportion, weighted by total hours of data)",
                                                                      x_lab = "Age Bin (months)", y_lab = "log proportion of doors")

Looking very constant.