viz_options

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.3.0
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the summary table from df_summ.csv (path is relative to the working
# directory); readr guesses the column types.
df_summ <- readr::read_csv(file = "df_summ.csv")
Rows: 111669 Columns: 15
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (5): dataset_id, role, target, structure, language
dbl (10): trial_id, condition_id, room_num, stage_num, trial_num, rep_num, o...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

You may have “default” preferences

# Make theme_bw() the default theme for every plot drawn after this point.
theme_set(new = theme_bw())

Do something basic

# Baseline scatter: word count against trial number, one point per row.
df_summ |>
  ggplot(mapping = aes(x = trial_num, y = total_num_words)) +
  geom_point()

# Same scatter, with points coloured by dataset_id.
df_summ |>
  ggplot(
    mapping = aes(x = trial_num, y = total_num_words, colour = dataset_id)
  ) +
  geom_point()

What geom?

some to consider:

geom_point() geom_smooth() stat_summary()

# Word count against repetition number, coloured by dataset: a `glm` fit of
# y ~ x without a confidence ribbon, plus a per-x summary layer
# (stat_summary() is called with no function, so it uses mean_se()).
ggplot(df_summ, aes(x = rep_num, y = total_num_words, color = dataset_id)) +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_smooth(method = "glm", se = FALSE) +
  stat_summary()
`geom_smooth()` using formula = 'y ~ x'
No summary function supplied, defaulting to `mean_se()`

You can specify the details: which summary function to use, what functional form to fit, and how to group the data.

# One y ~ log(x) linear fit per dataset x stage combination, coloured by
# dataset, with the raw observations drawn on top of the fits.
df_summ |>
  ggplot(mapping = aes(x = rep_num, y = total_num_words)) +
  geom_smooth(
    mapping = aes(
      group = interaction(dataset_id, stage_num),
      colour = dataset_id
    ),
    method = "lm",
    formula = y ~ log(x)
  ) +
  geom_point()

What axes?

trial versus rep num

# Word count against repetition number: one y ~ log(x) linear fit per
# dataset x stage combination, coloured by dataset.
df_summ |>
  ggplot(
    mapping = aes(x = rep_num, y = total_num_words, colour = dataset_id)
  ) +
  geom_smooth(
    mapping = aes(group = interaction(dataset_id, stage_num)),
    method = "lm",
    formula = y ~ log(x)
  )

pct vs raw number vs pct change

# Build `pct_change`: for each row, the ratio of its word count to the word
# count of the previous row (by rep_num) within the same group.
# `change` < 1 means the description got shorter; > 1 means it got longer.
# NOTE(review): dataset_id is not one of the grouping keys -- presumably
# game_id is already unique across datasets; confirm.
pct_change <- df_summ |>
  filter(dataset_id != "mankewitz2025_compositional") |>
  group_by(
    role, room_num, target, stage_num, option_size,
    game_id, group_size, structure, language
  ) |>
  # Sort by repetition so that lag() below picks up the preceding repetition.
  arrange(rep_num) |>
  mutate(last_words = lag(total_num_words), last_trial = lag(trial_id)) |>
  # Drop rows with no previous row in their group (or a missing previous count).
  filter(!is.na(last_words)) |>
  mutate(change = total_num_words / last_words) |>
  # Drop the grouping so the stored tibble does not silently affect later
  # summarise()/mutate() calls.
  ungroup()

# Log of the length ratio against repetition number, with one y ~ log(x)
# linear fit per dataset x stage combination, coloured by dataset.
ggplot(
  data = pct_change,
  mapping = aes(x = rep_num, y = log(change), colour = dataset_id)
) +
  geom_smooth(
    mapping = aes(group = interaction(dataset_id, stage_num)),
    method = "lm",
    formula = y ~ log(x)
  )

Here’s a crazier one: how much shorter the next description is, as a function of the length of the previous one.

# Log of the length ratio against the previous description's word count,
# with one y ~ log(x) linear fit per dataset.
ggplot(
  data = pct_change,
  mapping = aes(x = last_words, y = log(change), colour = dataset_id)
) +
  geom_smooth(method = "lm", formula = y ~ log(x))

What data?

do you want stage 1 only? do you have any exclusions to apply? describer only?

are there categories that should be collapsed?

# Restrict to describer rows from stage 1, then smooth log(change) against
# log(last_words) separately for each dataset (default smoother).
pct_change |>
  filter(role == "describer", stage_num == 1) |>
  ggplot(
    mapping = aes(
      x = log(last_words),
      y = log(change),
      colour = dataset_id
    )
  ) +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Facets & groupings

# Describer rows from stage 1 only; one default smooth per group size, with
# group_size treated as a discrete colour variable.
pct_change |>
  filter(role == "describer", stage_num == 1) |>
  ggplot(
    mapping = aes(
      x = log(last_words),
      y = log(change),
      colour = as.factor(group_size)
    )
  ) +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Describer rows from stage 1 only; one default smooth per condition, with a
# separate panel for each group size.
# Alternative layout to try: facet_grid(group_size ~ option_size)
pct_change |>
  filter(role == "describer", stage_num == 1) |>
  ggplot(
    mapping = aes(
      x = log(last_words),
      y = log(change),
      colour = as.factor(condition_id)
    )
  ) +
  facet_wrap(~group_size) +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Think about color!

library(viridis)
Loading required package: viridisLite
# Describer rows from stage 1 only; one default smooth per group size, drawn
# with the discrete viridis palette instead of ggplot2's default hues.
pct_change |>
  filter(role == "describer") |>
  filter(stage_num == 1) |>
  ggplot(aes(x = log(last_words), y = log(change), color = as.factor(group_size))) +
  geom_smooth() +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  scale_color_viridis(discrete = TRUE)
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Add back some rawer data or representation of spread

# Add the raw observations back underneath the smooths. The points inherit
# the condition colour and are nearly transparent (alpha = 0.01).
pct_change |>
  filter(role == "describer", stage_num == 1) |>
  ggplot(
    mapping = aes(
      x = log(last_words),
      y = log(change),
      colour = as.factor(condition_id)
    )
  ) +
  facet_wrap(~group_size) +
  geom_point(alpha = 0.01) +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Same plot, but the raw points are a fixed grey so that only the smooths
# carry the condition colour.
pct_change |>
  filter(role == "describer", stage_num == 1) |>
  ggplot(
    mapping = aes(
      x = log(last_words),
      y = log(change),
      colour = as.factor(condition_id)
    )
  ) +
  facet_wrap(~group_size) +
  geom_point(alpha = 0.01, colour = "grey") +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Single panel: one default smooth per condition_id (via `group`), coloured
# by group size with the discrete viridis palette; raw points in faint grey.
pct_change |>
  filter(role == "describer") |>
  filter(stage_num == 1) |>
  ggplot(aes(x = log(last_words), y = log(change), group = condition_id, color = as.factor(group_size))) +
  geom_point(alpha = .01, color = "grey") +
  geom_smooth() +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  scale_color_viridis(discrete = TRUE)
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Annotations, label, themes

Are there any “zero-points” or baselines that should be marked?

# Mark the "no change" baseline: log(change) == 0 means the description is
# exactly as long as on the previous repetition.
pct_change |>
  filter(role == "describer", stage_num == 1) |>
  ggplot(
    mapping = aes(
      x = log(last_words),
      y = log(change),
      colour = as.factor(condition_id)
    )
  ) +
  facet_wrap(~group_size) +
  geom_point(alpha = 0.01, colour = "grey") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Here we are still using log scales, but labelling the axes with the actual (untransformed) values.

# Map the untransformed values and let the scales apply log10, so the axis
# ticks show real word counts and ratios. The baseline moves from 0 to 1
# because it is now stated on the ratio scale.
pct_change |>
  filter(role == "describer", stage_num == 1) |>
  ggplot(
    mapping = aes(
      x = last_words,
      y = change,
      colour = as.factor(condition_id)
    )
  ) +
  geom_point(alpha = 0.01, colour = "grey") +
  geom_hline(yintercept = 1, linetype = "dashed") +
  geom_smooth() +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~group_size)
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Good labels for things

# Same plot with readable axis titles; the condition legend is hidden.
pct_change |>
  filter(role == "describer", stage_num == 1) |>
  ggplot(
    mapping = aes(
      x = last_words,
      y = change,
      colour = as.factor(condition_id)
    )
  ) +
  geom_point(alpha = 0.01, colour = "grey") +
  geom_hline(yintercept = 1, linetype = "dashed") +
  geom_smooth() +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~group_size) +
  labs(
    x = "Words on previous repetition",
    y = "Multiplicative change in length"
  ) +
  theme(legend.position = "none")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Not a strict ordering, can revisit different things!

# Revisit the colour mapping: one default smooth per repetition number
# (discrete viridis palette), log10 axes labelled with raw values, a dashed
# baseline at ratio 1, and the legend moved below the plot.
pct_change |>
  filter(role == "describer") |>
  filter(stage_num == 1) |>
  ggplot(aes(x = last_words, y = change, color = as.factor(rep_num))) +
  scale_x_log10() +
  scale_y_log10() +
  geom_point(alpha = .01, color = "grey") +
  geom_hline(yintercept = 1, lty = "dashed") +
  geom_smooth() +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  scale_color_viridis(discrete = TRUE) +
  labs(
    x = "Words on previous repetition",
    y = "Multiplicative change in length",
    color = "Repetition number"
  ) +
  theme(legend.position = "bottom")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Same as the repetition-coloured plot, with a separate panel for each
# group size.
pct_change |>
  filter(role == "describer") |>
  filter(stage_num == 1) |>
  ggplot(aes(x = last_words, y = change, color = as.factor(rep_num))) +
  scale_x_log10() +
  scale_y_log10() +
  geom_point(alpha = .01, color = "grey") +
  geom_hline(yintercept = 1, lty = "dashed") +
  geom_smooth() +
  facet_wrap(~group_size) +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  scale_color_viridis(discrete = TRUE) +
  labs(
    x = "Words on previous repetition",
    y = "Multiplicative change in length",
    color = "Repetition number"
  ) +
  theme(legend.position = "bottom")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

For really nice graphs, you sometimes need to adjust the legend.

# Final version: the faceted, repetition-coloured plot with the legend keys
# overridden so they show as thick lines (linewidth = 5) with no fill.
pct_change |>
  filter(role == "describer") |>
  filter(stage_num == 1) |>
  ggplot(aes(x = last_words, y = change, color = as.factor(rep_num))) +
  scale_x_log10() +
  scale_y_log10() +
  geom_point(alpha = .01, color = "grey") +
  geom_hline(yintercept = 1, lty = "dashed") +
  geom_smooth() +
  facet_wrap(~group_size) +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  scale_color_viridis(discrete = TRUE) +
  # override.aes changes only how the legend keys are drawn, not the layers.
  guides(color = guide_legend(override.aes = list(linewidth = 5, fill = NA))) +
  labs(
    x = "Words on previous repetition",
    y = "Multiplicative change in length",
    color = "Repetition number"
  ) +
  theme(legend.position = "bottom")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'