Comparing drawings across contexts

Published

September 18, 2025

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(scales)

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor
library(viridis)
Loading required package: viridisLite

Attaching package: 'viridis'

The following object is masked from 'package:scales':

    viridis_pal
library(here)
here() starts at /Users/visuallearninglab/Documents/kenya_draw
library(stringr)

Helpers

# Helper function to summarize data
summarized_data <- function(data, x_var, y_var, group_var) {
  return(data %>%
           group_by_at(c(x_var, group_var)) %>%
           summarise(mean_value = mean(.data[[y_var]], na.rm = TRUE),
                     sd_value = sd(.data[[y_var]], na.rm = TRUE),
                     n = n(),
                     se = sd_value / sqrt(n()),
                     ci_lower = mean_value - qt(1 - (0.05 / 2), n - 1) * se,
                     ci_upper = mean_value + qt(1 - (0.05 / 2), n - 1) * se,
                     .groups = 'drop')
  )
}

# Flexible age plot function
age_plot <- function(df_participant, 
                     df_summary, 
                     x_var = "age", 
                     y_var, 
                     group_var = "location",
                     facet_var = NULL,
                     plot_title,
                     plot_subtitle = "Participant means and age means with 95% CIs",
                     y_label,
                     y_format = "number", # "percent" or "number"
                     smooth_method = "loess") {
  
  # Create base plot with participant-level data
  p <- ggplot(data = df_participant, 
              aes_string(color = group_var, x = x_var, y = y_var)) +
    # Participant-level means
    geom_jitter(alpha = 0.3, size = 1.5, width = 0.1) +
    # Age-level means with 95% CI
    geom_pointrange(data = df_summary,
                    aes(x = .data[[x_var]], 
                        y = mean_value,
                        ymin = ci_lower, 
                        ymax = ci_upper), 
                    size = 0.8) +
    # Smoothed trend
    geom_smooth(aes_string(x = x_var, y = y_var),
                method = smooth_method, se = TRUE, alpha = 0.15)
  #+coord_cartesian(ylim = c(NA, 1))
  
  # Add faceting - all plots in single row
  if (!is.null(facet_var)) {
    if (length(facet_var) == 1) {
      p <- p + facet_wrap(as.formula(paste("~", facet_var)), 
                          scales = "free_x", nrow = 1)
    } else if (length(facet_var) == 2) {
      p <- p + facet_grid(as.formula(paste(facet_var[1], "~", facet_var[2])), 
                          scales = "free_x")
    }
  } else {
    p <- p + facet_wrap(as.formula(paste("~", group_var)), nrow = 1)
  }
  
  # Format y-axis
  if (y_format == "percent") {
    p <- p + scale_y_continuous(labels = percent_format(accuracy = 1))
  } else {
    p <- p + scale_y_continuous(labels = number_format(accuracy = 0.01))
  }
  
  # Add labels and theme
  p <- p +
    labs(
      title = plot_title,
      subtitle = plot_subtitle,
      x = "Age (years)",
      y = y_label
    ) +
    scale_color_viridis_d(begin = 0, end = 0.5) +
    theme_classic(base_size = 12)
  
  # Add correlation if single facet
  #if (is.null(facet_var) || length(facet_var) == 1) {
  #  p <- p + ggpubr::stat_cor(alpha = 0.4)
  #}
  
  return(p)
}
df <- read.csv(here("data/clip_recognizability.csv")) |>
  filter(drawing_category %in% c("airplane", "bike", "bird", "hat", "rabbit", "watch",
                                 "cat", "house", "cup", "chair", "tree", "car")) |>
  group_by(location, age) |>
  filter(n_distinct(participant_id) > 3) |>
  mutate(location = case_when(
    location == "India" ~ "New Delhi",
    location == "USA" ~ "San Jose",
    TRUE ~ location
  ))

Recognizability of drawings

Quantified using CLIP

library(dplyr)
library(ggplot2)
library(scales)   # percent_format

# 1) participant‑level means
df_part <- df %>% 
  group_by(participant_id, age, location) %>%      # keep location for facets
  summarise(recognizability = mean(recognizability, na.rm = TRUE),
            distance = mean(distance, na.rm=TRUE),
            distance_euclidean = mean(distance_euclidan, na.rm=TRUE),
            .groups = "drop") |> filter(age < 10)

# 2) age‑wise mean + CI within each location
df_age_rec <- summarized_data(df_part, "age", "recognizability", "location")

# Create recognizability plots
rec.plot1 <- age_plot(
  df_participant = df_part,
  df_summary = df_age_rec,
  y_var = "recognizability",
  plot_title = "Drawing Recognizability by Age Across Locations",
  y_label = "Recognizability",
  y_format = "number"
)
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
df_part_cat <- df |>
  group_by(participant_id, age, location, drawing_category) |>
   summarise(recognizability = mean(recognizability, na.rm = TRUE),
            distance = mean(distance, na.rm=TRUE),
            distance_euclidean = mean(distance_euclidan, na.rm=TRUE),
            .groups = "drop") |> filter(age < 10)

df_age_cat <- summarized_data(df_part_cat, "age", "recognizability", c("location", "drawing_category"))

rec.plot2 <- age_plot(
  df_participant = df_part_cat,
  df_summary = df_age_cat,
  y_var = "recognizability",
  plot_title = "Drawing Recognizability by Age Across Locations and Categories",
  plot_subtitle = "Participant means and age means with 95% CIs",
  y_label = "Recognizability",
  facet_var=c("location", "drawing_category"),
  y_format = "number"
)

Regognizability across categories

rec.plot1
`geom_smooth()` using formula = 'y ~ x'

ggsave(here("data/figures/recognizability.svg"),rec.plot1, bg="white", device="pdf")
Saving 7 x 5 in image
`geom_smooth()` using formula = 'y ~ x'

Recognizability within categories

rec.plot2
`geom_smooth()` using formula = 'y ~ x'

ggsave(here("data/figures/recognizability_by_category_sites.png"),rec.plot2,width = 20, height=10, bg="white")
`geom_smooth()` using formula = 'y ~ x'

Centroid distance

Quantifying how similar the drawings are one to another. Lower values indicate drawings are more consistent within that bucket.

# For distance plot with categories:
df_age_dist <- summarized_data(df_part, "age", "distance", c("location"))
df_age_dist_euclidean <- summarized_data(df_part, "age", "distance_euclidean", c("location"))
df_age_dist_cat <- summarized_data(df_part_cat, "age", "distance", c("location", "drawing_category"))

centroid.plot1 <- age_plot(
  df_participant = df_part,
  df_summary = df_age_dist_euclidean,
  y_var = "distance_euclidean",
  plot_title = "Drawing Euclidean distance from Centroid by Age\n Across Locations and Categories",
  y_label = "Euclidean distance",
  y_format = "number"
)

centroid.plot2 <- age_plot(
  df_participant = df_part,
  df_summary = df_age_dist,
  y_var = "distance",
  plot_title = "Drawing cosine dissimilarity from Centroid by Age\n Across Locations and Categories",
  y_label = "Cosine dissimilarity",
  y_format = "number"
)

centroid.plot3 <- age_plot(
  df_participant = df_part_cat,
  df_summary = df_age_dist_cat,
  y_var = "distance",
  plot_title = "Drawing dissimilarity from Centroid by Age\n Across Locations and Categories",
  plot_subtitle = "Participant means and age means with 95% CIs",
  y_label = "Cosine distance from centroid",
  facet_var=c("location", "drawing_category"),
  y_format = "number"
)

euclidean distance

centroid.plot1
`geom_smooth()` using formula = 'y ~ x'

cosine distance

centroid.plot2
`geom_smooth()` using formula = 'y ~ x'

ggsave(here("data/figures/centroid_distance_sites.png"),centroid.plot2, bg="white")
Saving 7 x 5 in image
`geom_smooth()` using formula = 'y ~ x'

cosine distance within categories

centroid.plot3
`geom_smooth()` using formula = 'y ~ x'

ggsave(here("data/figures/centroid_distance_by_category_sites.png"),centroid.plot3,width = 20, height=10, bg="white")
`geom_smooth()` using formula = 'y ~ x'