library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4          ✔ readr     2.1.5     
✔ forcats   1.0.0          ✔ stringr   1.5.1     
✔ ggplot2   3.5.1.9000     ✔ tibble    3.2.1     
✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
✔ purrr     1.0.4          
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(here)

here() starts at /Users/visuallearninglab/Documents/vedi_survey/vedi

library(cowplot)


Attaching package: 'cowplot'

The following object is masked from 'package:lubridate':

    stamp

  library(grid)

Loading data

combined_normed_wide <- read_csv(file.path(here('data','main', 'processed'),"processed-responses.csv"))

Rows: 4008 Columns: 25
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr   (6): object, globalID, category, childSiblingAges, feedback, age_half
dbl  (16): aoa, sessionID, Frequency_normalized_val, TotalCount_normalized_v...
dttm  (3): startTimestamp, endTimestamp, session1_time

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

normed_variabilities <- read_csv(file.path(here('data','main', 'processed'),"normed-variabilities.csv"))

Rows: 49 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): object, category
dbl (4): mean_item, sd_item, margin_of_error, cv

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

##Session 1 to Session 2 split-half reliability
all_sessions_data <- combined_normed_wide |>
  mutate(
    InteractionCount_numeric_log = ifelse(InteractionCount_numeric_val > 0, log(InteractionCount_numeric_val), 0),
    TotalCount_numeric_log = ifelse(TotalCount_numeric_val > 0, log(TotalCount_numeric_val), 0)
  ) |>
  select(-TotalCount_numeric_val, -InteractionCount_numeric_val) |>
  pivot_wider(
    id_cols = c("globalID", "aoa", "object", "category"),
    names_from = sessionID,
    names_sep = "_",
    values_from = matches("numeric")
  )

sessions_data <- all_sessions_data|>
  # entry exists for both session 1 and 2
  filter(!is.na(Frequency_numeric_val_2) & !is.na(Frequency_numeric_val_1))

Checking how many participants we have

paste("Participant count:",nrow(sessions_data |> distinct(globalID)))

[1] "Participant count: 29"

Variability session 1 to session 2

sessions_long <- sessions_data |>
  # Convert to long format for sessions
  pivot_longer(
    cols = matches("_[12]$"),
    names_to = c("measure", "session"),
    names_pattern = "(.+)_([12])$",
    values_to = "value"
  ) |>
  mutate(
    session = paste("Session", session),
    measure = str_remove(measure, "_numeric.*")
  ) |>
  # Create position offset for session dots
  group_by(category) |>
  arrange(object) |>
  mutate(
    object_num = as.numeric(as.factor(object)),
    x_pos = case_when(
      session == "Session 1" ~ object_num - 0.35,
      session == "Session 2" ~ object_num + 0.35
    )
  ) |>
  mutate(
    x_label_pos = object_num  # midpoint between sessions for labeling
  ) |>
  ungroup()

summary_stats <- sessions_long |>
  group_by(category, measure, object, session) |>
  summarise(
    sd_val = sd(value, na.rm=TRUE),
    mean_val = mean(value, na.rm=TRUE),
    cv_val = round(sd_val / mean_val, 2),
    .groups = "drop"
  ) |>
  # Get object position for labeling
  left_join(
    sessions_long |> distinct(category, object, object_num),
    by = c("category", "object")
  ) |>
  mutate(
    x_pos = case_when(
      session == "Session 1" ~ object_num - 0.25,
      session == "Session 2" ~ object_num + 0.25
    ),
    stat_label = paste0(round(cv_val, 2))
  )

# Create the plot for each measure
create_facet_plot <- function(measure_name) {
  # Filter data for this measure
  measure_name_clean <- str_replace_all(
      str_replace_all(measure_name, "_", " "),
      "(?<=.)([A-Z])", " \\1"                    # camelCase
    )
  plot_data <- sessions_long |> filter(measure == measure_name)
  # jittering ahead of time since position_jitter doesn't work well with the custom axes thing going on 
  plot_data_jittered <- plot_data %>%
  mutate(
    x_jitter = x_pos + runif(n(), min = -0.05, max = 0.05),
    y_jitter = value + runif(n(), min = -0.1 * sd(value, na.rm = TRUE), max = 0.1 * sd(value, na.rm = TRUE))
  )
  # Get summary stats for this measure
  plot_stats <- summary_stats |> filter(measure == measure_name)
  buffer <- 0.1 * max(plot_data$value)
  ggplot(plot_data_jittered, aes(x = x_jitter, y = y_jitter, fill = session)) +
  geom_line(aes(group = interaction(globalID, object)), alpha = 0.6, size = 0.5) +
  geom_point(aes(color = session), size = 2, alpha = 0.4) +
    # Facet by category
    facet_wrap(~ category, scales = "free") +
    # Add summary statistics as text above each object/session
    geom_text(data = plot_stats, 
              aes(x = x_pos, y = Inf, label = stat_label),
              vjust = 1.2, 
              color = "black", size = 3,
              inherit.aes = FALSE) +
    # Customize axes
   geom_text(
  data = plot_data |> distinct(category, object, x_label_pos),
  aes(x = x_label_pos-0.35, y = min(plot_data$value, na.rm = TRUE) - buffer, label = object),
  vjust = 2, inherit.aes = FALSE, size = 4, angle = 45
) +
    scale_x_continuous() +
    scale_y_continuous(expand = expansion(mult = c(0.5, 0.2)),breaks = function(limits) {
    scales::pretty_breaks(n = 5)(limits) |> keep(~ .x >= 0)  # Remove any negative breaks
  }) +
    # Styling
    theme_minimal() +
    theme(
      axis.text.x = element_blank(),
      legend.position = "none",  # Too many participants for useful legend
      strip.text = element_text(face = "bold"),
      panel.grid.minor = element_blank(),
      plot.margin = margin(t = 10, r = 10, b = 40, l = 10)
    ) +
    labs(
      title = paste("Session Comparison:",  measure_name_clean),
      x = "Object",
      y = measure_name_clean,
      subtitle = "Lines connect same participant across sessions. Left dots = Session 1, Right dots = Session 2. Stat labels indicate CVs"
    ) +
    # Add subtle background to distinguish sessions
   scale_color_brewer(palette = "Set1")
}

frequency_plot <- create_facet_plot("Frequency")

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

formats_plot <- create_facet_plot("FormatsSeen")
interaction_plot <- create_facet_plot("InteractionCount")
total_plot <- create_facet_plot("TotalCount")

print(frequency_plot)

print(formats_plot)

print(interaction_plot)

print(total_plot)

Session 1 v Session 2 correlations

numeric_cols <- names(sessions_data)
base_names <- unique(gsub("_[12]$", "", numeric_cols[grepl("numeric_", numeric_cols)]))

# Create plots for each numeric column pair
plots <- map(base_names, function(base) {
  col1 <- paste0(base, "_1")
  col2 <- paste0(base, "_2")
  
  ggplot(sessions_data, aes_string(x = col1, y = col2)) +
    geom_jitter() +
    geom_smooth(method="lm") +
    labs(x = paste("Session 1"), y = paste("Session 2"), title = paste("Session 1 vs. 2:", base)) +
    theme_minimal() +
    ggpubr::stat_cor()
})

Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.

# View all plots in a grid
plot_grid(plotlist = plots, ncol = 3)

`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

Correlation by participant

numeric_data <- sessions_data |>
  select(globalID, object, category, contains("numeric")) |>
  pivot_longer(
    cols = contains("numeric"), 
    names_to = c("question", "session"), 
    names_pattern = "(.*)_(.*)"
  ) |>
  filter(!is.na(value))

wide_data <- numeric_data |>
  pivot_wider(
    names_from = session,
    values_from = value
  )

# Group by globalID and question, and compute correlation across objects
correlation_data <- wide_data |>
  group_by(globalID, question) |>
  summarise(
    n = n(),
    cor_test = list(cor.test(`1`, `2`, use = "complete.obs")),
    .groups = "drop"
  ) |>
  mutate(
    correlation_value = purrr::map_dbl(cor_test, ~ .x$estimate),
    p_value = purrr::map_dbl(cor_test, ~ .x$p.value),
    significant = p_value < 0.05,
    ci_low = map_dbl(cor_test, ~ .x$conf.int[1]),  
    ci_high = map_dbl(cor_test, ~ .x$conf.int[2]) 
  )

# Plot with custom y-axis breaks and significance labeling
ggplot(correlation_data, aes(x = globalID, y = correlation_value, fill = question)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +  # Adjust the dodge width
  geom_text(
    aes(label = ifelse(significant, "*", "")),
    vjust = -0.5,
    position = position_dodge(width = 0.8),  # Match dodge width
    size = 4,
    color = "red"
  ) +
  geom_errorbar(
    aes(ymin = ci_low, ymax = ci_high),
    width = 0.2,
    position = position_dodge(width = 0.8)  # Match dodge width
  ) +
  scale_y_continuous(
    breaks = seq(-1, 1, by = 0.2),
    limits = c(-1, 1),
    expand = c(0, 0)  # Remove space at the top and bottom
  ) +
  theme_minimal() +
  labs(
    title = "Session 1 vs 2 Correlation Across Objects by Participant",
    x = "Participant ID",
    y = "Session 1-Session 2 correlation",
    fill = "Question"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.ticks = element_blank()  # Remove axis ticks
  )

Correlation by object

group_counts <- wide_data |>
  group_by(object, question, category) |>
  summarise(n_pairs = sum(!is.na(`1`) & !is.na(`2`)), .groups = "drop") |>
  filter(n_pairs >= 5)

filtered_data <- wide_data |>
  inner_join(group_counts, by = c("object", "question", "category"))

cor_per_object_variable <- filtered_data |>
  group_by(object, question, category) |>
  summarise(
    n = n(),
    cor_test = list(cor.test(`1`, `2`, use = "complete.obs")),
    .groups = "drop"
  ) |>
  mutate(
    correlation_value = map_dbl(cor_test, ~ .x$estimate),
    p_value = map_dbl(cor_test, ~ .x$p.value),
    conf_low = map_dbl(cor_test, ~ .x$conf.int[1]),  
    conf_high = map_dbl(cor_test, ~ .x$conf.int[2]) 
  )

# Average per object, calculate CI for the mean of correlations
avg_correlation_per_object <- cor_per_object_variable |>
  group_by(object, category) |>
  summarise(
    avg_correlation = mean(correlation_value, na.rm = TRUE),
    sd_correlation = sd(correlation_value, na.rm = TRUE),
      avg_count = mean(n),
    n = n(),
    ci_low = mean(conf_low, na.rm = TRUE),  # Lower bound of CI
    ci_high = mean(conf_high, na.rm = TRUE),  # Upper bound of CI
    t_test = list(t.test(correlation_value)),
    .groups = "drop"
  ) |>
  mutate(
    p_value = map_dbl(t_test, ~ .x$p.value),
    significant = p_value < 0.05
  )

# Sort by average correlation and plot with CIs
avg_correlation_per_object <- avg_correlation_per_object |>
  arrange(desc(avg_correlation)) 

# |> filter(avg_correlation > 0.5 | avg_correlation < 0.3)
ggplot(avg_correlation_per_object, aes(x = reorder(object, -avg_correlation), y = avg_correlation)) +
  geom_bar(stat = "identity", aes(fill = category)) +
  geom_errorbar(
    aes(ymin = ci_low, ymax = ci_high),
    width = 0.2
  ) +
  geom_text(
    aes(label = ifelse(significant, "*", "")),
    vjust = -0.5,
    size = 4
  ) +
  scale_color_discrete(name = "Category") +
  scale_y_continuous(
    breaks = seq(-0.5, 1, by = 0.2),
    limits = c(-0.5, 1)
  ) +
  theme_minimal() +
  labs(
    title = "Average Correlation Across Session 1 and 2",
    x = "Object",
    y = "Average Correlation (95% CI)",
    color = "Category"  # Add a label for the color legend
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size=14))

Warning: Ignoring unknown labels:
• `colour = "Category"`

Checking for correlation with coefficient of variability

cor_var <- avg_correlation_per_object |> left_join(normed_variabilities)

Joining with `by = join_by(object, category)`

cor.test(cor_var$avg_correlation, cor_var$cv)


    Pearson's product-moment correlation

data:  cor_var$avg_correlation and cor_var$cv
t = -0.90851, df = 47, p-value = 0.3682
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.3978696  0.1555729
sample estimates:
       cor 
-0.1313707