Comparing drawings across contexts

Published

October 1, 2025

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(scales)

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor
library(viridis)
Loading required package: viridisLite

Attaching package: 'viridis'

The following object is masked from 'package:scales':

    viridis_pal
library(here)
here() starts at /Users/visuallearninglab/Documents/hybrid_draw
library(stringr)

CONSTANTS

# skipping any participants with an age below this
min_age = 4

Helpers

age_mapper = function(x) ifelse(x == 11, "adult", x)

# Helper function to summarize data
summarized_data <- function(data, x_var, y_var, group_var) {
  return(data %>%
           group_by_at(c(x_var, group_var)) %>%
           summarise(mean_value = mean(.data[[y_var]], na.rm = TRUE),
                     sd_value = sd(.data[[y_var]], na.rm = TRUE),
                     n = n(),
                     se = sd_value / sqrt(n()),
                     ci_lower = mean_value - qt(1 - (0.05 / 2), n - 1) * se,
                     ci_upper = mean_value + qt(1 - (0.05 / 2), n - 1) * se,
                     .groups = 'drop')
  )
}
age_plot <- function(df_participant, 
                     df_summary, 
                     x_var = "age", 
                     y_var, 
                     group_var = "location",
                     facet_var = NULL,
                     plot_title,
                     plot_subtitle = "Participant means and age means with 95% CIs",
                     y_label,
                     y_format = "number", # "percent" or "number"
                     smooth_method = "loess") {
  
  # Create base plot with participant-level data
if (!is.null(group_var)) {
  p <- ggplot(df_participant, aes(x = !!sym(x_var),
                                  y = !!sym(y_var),
                                  color = !!sym(group_var)))
} else {
  p <- ggplot(df_participant, aes(x = !!sym(x_var),
                                  y = !!sym(y_var)))
}
    # Participant-level means
 p <- p + geom_jitter(alpha = 0.3, size = 1.5, width = 0.1) +
    # Age-level means with 95% CI
    geom_pointrange(data = df_summary,
                    aes(x = .data[[x_var]], 
                        y = mean_value,
                        ymin = ci_lower, 
                        ymax = ci_upper), 
                    size = 0.8) +
    # Smoothed trend
    geom_smooth(aes_string(x = x_var, y = y_var),
                method = smooth_method, se = TRUE, alpha = 0.15) +
    scale_x_continuous(  breaks = seq(min(df_participant$age, na.rm = TRUE),
               max(df_participant$age, na.rm = TRUE),
               by = 1), labels=age_mapper)
  #+coord_cartesian(ylim = c(NA, 1))
  
  if (!is.null(facet_var)) {
    if (length(facet_var) == 1) {
      p <- p + facet_wrap(as.formula(paste("~", facet_var)), 
                          scales = "free_x")
    } else if (length(facet_var) == 2) {
      p <- p + facet_grid(as.formula(paste(facet_var[1], "~", facet_var[2])), 
                          scales = "free_x")
    }
  } else if (!is.null(group_var)) {
    p <- p + facet_wrap(as.formula(paste("~", group_var)))
  }
  
  # Format y-axis
  if (y_format == "percent") {
    p <- p + scale_y_continuous(labels = percent_format(accuracy = 1),limits = c(0, 1))
  } else {
    p <- p + scale_y_continuous(labels = number_format(accuracy = 0.01),  limits = c(0, 1))
  }
  
  # Add labels and theme
  p <- p +
    labs(
      title = plot_title,
      subtitle = plot_subtitle,
      x = "Age (years)",
      y = y_label
    ) +
    scale_color_viridis_d(begin = 0, end = 0.5) +
    theme_classic(base_size = 12)
  
  # Add correlation if single facet
  #if (is.null(facet_var) || length(facet_var) == 1) {
  #  p <- p + ggpubr::stat_cor(alpha = 0.4)
  #}
  
  return(p)
}
df <- read.csv(here("data/hybrid_recognizability.csv"))
filtered_df <- df |>
  filter(!(category %in% c("square", "shape"))) |>
  mutate(age = str_replace(age, "age", ""),
         age = ifelse(age == "adult", "11", age),
          age = as.integer(age)) |>
  filter(age >= min_age)

hybrid recognizability across age

across categories

only_correct_cat_df <- filtered_df |> filter(chosen_object == category) |> filter(category != "dog")

df_part <- only_correct_cat_df %>% 
  group_by(participant_id, age) %>%  # keep location for facets
  summarise(
    recognizability = mean(recognizability, na.rm = TRUE),
    across(starts_with("distance"), ~ mean(.x, na.rm = TRUE)),
    .groups = "drop"
  ) 

df_age_rec <- summarized_data(df_part, "age", "recognizability", "age")

# Create recognizability plots
rec.plot1 <- age_plot(
  df_participant = df_part,
  df_summary = df_age_rec,
  y_var = "recognizability",
  plot_title = "Drawing Recognizability by Age",
  y_label = "Recognizability",
  y_format = "number",
  group_var = NULL
)
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
rec.plot1
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).

faceted by category

note that dog accuracy is skewed by distractors and needs to be fixed

df_part_cat <- filtered_df |> filter(chosen_object == category) |>
  group_by(participant_id, age, category) |>
   summarise(recognizability = mean(recognizability, na.rm = TRUE),
            .groups = "drop") 

df_age_cat <- summarized_data(df_part_cat, "age", "recognizability", c("category"))
Warning: There were 2 warnings in `summarise()`.
The first warning was:
ℹ In argument: `ci_lower = mean_value - qt(1 - (0.05/2), n - 1) * se`.
ℹ In group 52: `age = 9` `category = "tiger frog"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
rec.plot2 <- age_plot(
  df_participant = df_part_cat,
  df_summary = df_age_cat,
  y_var = "recognizability",
  plot_title = "Hybrid drawing recognizability by age Across locations and categories",
  plot_subtitle = "Participant means and age means with 95% CIs",
  y_label = "Recognizability",
  facet_var=c("category"),
  group_var = NULL,
  y_format = "number"
)
rec.plot2
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_segment()`).
Removed 2 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf

ggsave(here("figures/rec_all.png"), rec.plot1, width=5, height=5)
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_segment()`).
ggsave(here("figures/rec_category.png"), rec.plot2, width=10, height=5)
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_segment()`).
Removed 2 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_segment()`).
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf

including subparts

note that dog accuracy is skewed by distractors and needs to be fixed

library(patchwork)

# Split data by category
categories <- unique(filtered_df$category)

# Create individual plots
plot_list <- lapply(categories, function(cat) {
  df_subset <- filtered_df %>% filter(category == cat)
  
  ggplot(data = df_subset, 
         aes(x = age, y = recognizability, color = chosen_object)) +
    geom_jitter(alpha = 0.3, size = 1.5, width = 0.1) +
    geom_smooth(method = "loess", se = TRUE, alpha = 0.15) +
    scale_y_continuous(labels = number_format(accuracy = 0.01)) +
    scale_color_viridis_d(name = "", 
                          begin = 0, end = 0.8) +
    labs(title = cat, x = "Age (years)", y = "Recognizablity") +
    theme_classic(base_size = 12) +
    theme(legend.position = "bottom")
})

# Combine plots
wrap_plots(plot_list, ncol = 4) +
  plot_annotation(
    title = "Drawing recognizability by parts"
  )
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

TODO: add urls to the dfs here, prettify