library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4          ✔ readr     2.1.5     
✔ forcats   1.0.0          ✔ stringr   1.5.1     
✔ ggplot2   3.5.1.9000     ✔ tibble    3.2.1     
✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
✔ purrr     1.0.4          
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the per-image location labels.
image_locations <- read.csv("image_locations_clean.csv")

# Derive the `superseded_gcp_name_feb25` key from each image path: take the
# path segment that immediately follows "sampled_frames/" and strip a trailing
# "_processed" suffix if present.
image_locations <- image_locations |>
  mutate(
    superseded_gcp_name_feb25 = sub(
      "_processed$",
      "",
      str_extract(image_path, "(?<=sampled_frames/)[^/]+")
    )
  )

# Load the recording table.
# NOTE(review): the file name is spelled "proceessed" -- confirm it matches the
# file on disk before correcting it.
recordings <- read.csv("recordings_proceessed.csv")
# Share of each location within every 5-month age bin, in wide format: one row
# per `rounded_age`, one column per `location_clean`, 0 where a location does
# not appear in a bin.
locations_data <- image_locations |>
  left_join(recordings, by = join_by(superseded_gcp_name_feb25)) |>
  # NOTE(review): rows are filtered on `age` but binned on `age_mo` -- confirm
  # both columns exist in the joined data and that this is intended.
  filter(!is.na(age)) |>
  # Round age to the nearest multiple of 5.
  mutate(rounded_age = round(age_mo / 5) * 5) |>
  group_by(rounded_age, location_clean) |>
  summarise(n = n(), .groups = "drop") |>
  # Sparse bin/location cells are removed *before* normalising, so the
  # proportions below sum to 1 over the retained cells only.
  filter(n > 30) |>
  mutate(prop = n / sum(n), .by = rounded_age) |>
  # `id_cols` restricts the identifier to `rounded_age`; the unused `n` column
  # is dropped by the pivot.
  pivot_wider(
    id_cols = rounded_age,
    names_from = location_clean,
    values_from = prop,
    values_fill = 0
  )
# Back to long format for plotting: one row per (rounded_age, location_clean)
# pair. Because the wide table was zero-filled, every location now has an
# explicit `prop` value in every age bin.
plot_data <- locations_data |>
  filter(!is.na(rounded_age)) |>
  pivot_longer(
    cols = !rounded_age,
    names_to = "location_clean",
    values_to = "prop"
  )

# Plot per-location proportions against rounded age, with one linear trend
# line per location.
ggplot(
  plot_data,
  aes(
    x = rounded_age,
    y = prop,
    colour = location_clean,
    group = location_clean
  )
) +
  # `se` is left at its default, so each fitted line is drawn together with
  # its confidence band.
  geom_smooth(method = "lm") +
  geom_point() +
  labs(
    title = "Proportion of Locations by Age",
    x = "Rounded Age (months)",
    y = "Proportion of Location",
    colour = "Location"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

# Share of each location within every (unrounded) `age` value, in wide format:
# one row per `age`, one column per `location_clean`, 0 where a location does
# not appear at that age.
locations_data <- image_locations |>
  # Name the join key explicitly instead of relying on dplyr's natural join
  # over whatever columns the two tables happen to share.
  left_join(recordings, by = "superseded_gcp_name_feb25") |>
  group_by(age, location_clean) |>
  summarise(n = n(), .groups = "drop") |>
  # Proportion within each age; `.by` leaves the result ungrouped.
  mutate(prop = n / sum(n), .by = age) |>
  # Without `id_cols`, the leftover `n` column would be treated as part of the
  # row identifier and each age would be split across several rows.
  pivot_wider(
    id_cols = age,
    names_from = location_clean,
    values_from = prop,
    values_fill = 0
  )
Joining with `by = join_by(superseded_gcp_name_feb25)`