Wrangle incoming data from Prolific

Author

Tarun Sepuri

Published

January 8, 2024

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4          ✔ readr     2.1.5     
✔ forcats   1.0.0          ✔ stringr   1.5.1     
✔ ggplot2   3.5.1.9000     ✔ tibble    3.2.1     
✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
✔ purrr     1.0.4          
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
here() starts at /Users/visuallearninglab/Documents/vedi_survey/vedi
library(knitr)
library(cowplot)

Attaching package: 'cowplot'

The following object is masked from 'package:lubridate':

    stamp
library(grid)
library(ggthemes)

Attaching package: 'ggthemes'

The following object is masked from 'package:cowplot':

    theme_map
library(mirt)
Loading required package: stats4
Loading required package: lattice
library(grid)
library(viridis)
Loading required package: viridisLite
#devtools::install_github("langcog/wordbankr")
library(wordbankr)
base_size_print <- 10
label_size <- 3

Helper functions

response_variability <- function(data, question) {
  response_counts <- data %>% summarize(n = n(), .by = "response")
  return(ggplot(response_counts, aes(x = response, y = n)) +
           geom_col() +
           labs(title = paste("Responses based on options for", question))
         )
}

response_grouping <- function(data, question) {
  return(data %>% filter(question_type==question) %>% summarize(n = n(), .by = c("response")))
}

# To create a plot where the grouping units (e.g., participants) are randomly shuffled into two halves whose mean responses are compared and correlated
split_half <- function(data, categories, question, category_name="ResponseId", input_group=c("category", "type"), input_color="type", input_label="category", response_val="numeric_val") {
  shuffled_categories <- sample(categories)


  # Split the shuffled categories into two halves
  half1_categories <- shuffled_categories[1:floor(length(shuffled_categories) / 2)]
  half2_categories <- shuffled_categories[(floor(length(shuffled_categories) / 2) + 1):length(shuffled_categories)]
  
  half1 <- data |>
    filter(!!sym(category_name) %in% half1_categories) |>
    # Ensuring that there is only one average per category before calculating mean counts
    mutate(!!sym(response_val) := mean(!!sym(response_val)), .by = c(all_of(category_name), all_of(input_group))) |>
    summarize(mean_count = mean(!!sym(response_val)), .by = all_of(input_group))

  half2 <- data |>
    filter(!!sym(category_name) %in% half2_categories) |>
    mutate(!!sym(response_val) := mean(!!sym(response_val)), .by = c(all_of(category_name), all_of(input_group))) |>
    summarize(mean_count = mean(!!sym(response_val)), .by = all_of(input_group))

  split_half_plot <- ggplot(data=inner_join(half1, half2, by = input_group), aes(x = mean_count.x, y = mean_count.y, color=!!sym(input_color))) +
    geom_point(alpha=0.6, size=2) +
    geom_smooth(method='lm', formula='y~x', color="gray") +
    labs(x = "Responses first half", y = "Responses second half", title=paste("Split-half reliability for", question)) +
    ggpubr::stat_cor(size = 2) +
    theme(plot.title=element_text(size=10), axis.title = element_text(size = 10))
  # If we're grouping based on item names, add labels for each item
  if ("object" %in% input_group) {
    split_half_plot <- split_half_plot + ggrepel::geom_label_repel(aes(label=!!sym(input_label)), segment.colour="grey", segment.alpha =.5)
  }
  return(split_half_plot)
}

questiontype_age_lineplot <- function(data, input_title, y_val="numeric_val") {
  summarized_by_id <- summarized_data(data, "childs_age", y_val, c("ResponseId", "question_type"))
  summarized_by_age <- summarized_data(summarized_by_id, "childs_age", "mean_value", "question_type")
  return(ggplot(summarized_by_id, aes(x = childs_age, y = mean_value, group = question_type, color = question_type)) +
           geom_line(data = summarized_by_age, linewidth = 0.8, show.legend = TRUE, alpha = 0.8) +
           geom_point(alpha = 0.5, position = position_jitter(width = .01, height = .01)) +
           labs(x = "Child age (months)", y = "Response value", title=input_title, color="Question Type") +
           geom_smooth(method = 'lm', formula='y~x', aes(color=question_type)) +
           scale_x_continuous(breaks = seq(5, 45, by = 5), limits = c(10, 50))
         )
}

coefficient_of_var <- function(data, value, group) {
  item_variability <- data |>
    summarize(mean_item = mean(!!sym(value)),
              sd_item = sd(!!sym(value)),
              # Calculating 95% CI
              margin_of_error = qt(0.975, df = n() - 1) * (sd_item / sqrt(n())),
              cv = (sd_item / mean_item) * 100, .by = all_of(group))
  return(item_variability)
}

item_variability_scatterplot <- function(item_vars, all_responses, input_title, group=c("category", "object"), y_val="numeric_val") {
  return(ggplot(item_vars, aes(x = cv, y = mean_item, label = object)) +
    geom_errorbar(
      aes(ymin = mean_item - margin_of_error, ymax = mean_item + margin_of_error), 
      width = 0.2, color = "black",
      alpha = 0.2
    ) + 
    geom_point(aes(color = category), size = 3, shape=17) +       # Add points with color based on category
    ggrepel::geom_label_repel(size = 4, force = 40) +
    geom_jitter(data=(all_responses |> left_join(item_vars, by = group)), aes(x=cv, y=!!sym(y_val), color = category), alpha = 0.01, height = 0.3, width = 0.3) +
    labs(
      title = input_title,
      y = "Mean of Numeric Value",
      x = "Coefficient of Variation (%)"
    ) +
    theme(legend.position = "bottom"))
}

babiness_correlation <- function(data, x_val, xlab_val, title="") {
 return(ggplot(data, aes(x=!!sym(x_val), y=babiness_mean)) +
    geom_jitter(height=.03, width=.03, alpha=.8, size=2) +
    ggthemes::theme_few(base_size=base_size_print) +
    geom_smooth(formula='y~x', span=10, alpha=.2, color='dark grey') +
    ggrepel::geom_label_repel(aes(label=object, color=category), segment.colour="grey", segment.alpha =.2, size = label_size) +
    xlab(xlab_val) +
    ylab('Babiness rating') +
    labs(title=title) +
    theme(legend.position='none', aspect.ratio=1) +
    ggpubr::stat_cor(method = "pearson") +
    coord_cartesian(clip='off'))
}

normalized_responses <- function(data) {
  return(data |> 
           group_by(question_type) |>
           mutate(
             normalized_val = (numeric_val - min(numeric_val, na.rm = TRUE)) / 
               (max(numeric_val, na.rm = TRUE) - min(numeric_val, na.rm = TRUE))
           ) |>
           ungroup()
         )
}

# To add a title to the top of a cowplot arrangement
cowplot_title <- function(title_text) {
  title <- ggdraw() + 
    draw_label(
      title_text,
      fontface = 'bold',
      x = 0,
      hjust = 0
    ) +
    theme(
      plot.margin = margin(0, 0, 0, 4)
    )
  return(title)
}

summarized_data <- function(data, x_var, y_var, group_var) {
  return(data %>%
           group_by(across(all_of(c(x_var, group_var)))) %>%
           summarise(mean_value = mean(.data[[y_var]], na.rm = TRUE),
                     sd_value = sd(.data[[y_var]], na.rm = TRUE),
                     n = n(),
                     se = sd_value / sqrt(n()),
                     ci_lower = mean_value - qt(1 - (0.05 / 2), n - 1) * se,
                     ci_upper = mean_value + qt(1 - (0.05 / 2), n - 1) * se,
                     .groups = 'drop')
  )
}

aoa_plot <- function(data, x_lab, x_var="mean_val", title="") {
  ggplot(data, aes(x=!!sym(x_var), y=aoa)) +
  geom_point(alpha=.8, size=2) +
  theme_few(base_size=base_size_print) +
  geom_smooth(formula='y~x', span=10, alpha=.2) + 
  ggrepel::geom_label_repel(aes(color=category, label=object),segment.colour="grey", segment.alpha =.2, size = 5) +
  ylim(14,30)+
  #annotation_custom(text_low,xmin=1,xmax=1,ymin=11,ymax=11) +
  #annotation_custom(text_high,xmin=7.5,xmax=7.5,ymin=11,ymax=11) +
  xlab(x_lab) +
  ylab('Avg. age-of-acquisition') +
  labs(title = title) +
  theme(legend.position='none', aspect.ratio=1) +
  coord_cartesian(clip='off') +
  ggpubr::stat_cor(method = "pearson")
}

aoa_plot_grouped <- function(data, title, group) {
  ggplot(data, aes(x=mean_val, y=aoa, label=object, color=!!sym(group))) +
  geom_point(alpha=.8, size=2) +
  theme_few(base_size=base_size_print) +
  geom_smooth(span=10, alpha=.2) +
  ylim(14,30)+
  xlab(title) +
  ylab('Avg. age-of-acquisition') +
  coord_cartesian(clip='off') +
  ggpubr::stat_cor(method = "pearson")
}

# Function to add age split
add_age_split <- function(data) {
  data |>
    mutate(
      median_age = median(childAge),
      age_half = ifelse(childAge > median_age, "older", "younger")
    )
}

# To extract data about the age-of-acquisition (AoA) of words using Wordbank
aoa_data <- function(language, form, input_measure) {
  instrument_data <- get_instrument_data(language, form, administration_info = TRUE) |>
    filter(!is.na(!!sym(input_measure)))
  aoa <- merge(x=fit_aoa(instrument_data, measure=input_measure),y=get_item_data(language, form),by="item_id") |>
    filter(!is.na(aoa)) |>
    # Only using the first phrase of the description
    mutate(item = str_extract(item_definition, "^[^/()]+")) |>
    # Only count the AoA as the last learned date of that word if there are multiple versions of the word
    filter(aoa == max(aoa, na.rm = TRUE), .by=item)
  return(aoa)
}

Load and preprocess data

# The Words and Sentences (WS) form does not include the "understands" measure, so we use "produces"
english_ws_aoa <- aoa_data("English (American)", "WS", "produces")

raw_experience_data = read_csv(here('data','main','trial_data.csv'))
Rows: 3006 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): object, category, FormatsSeen, Frequency, globalID
dbl (6): objectNumber, categoryNumber, trialIndex, reactionTime, TotalCount,...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cleaned_experience_data = raw_experience_data |>
  # TODO: Need to scale this within participant, calculate z-scores
  mutate(TotalCount = ifelse(TotalCount > 100, 100, TotalCount),
         InteractionCount = ifelse(InteractionCount > 100, 100, InteractionCount)) |>
  filter(!is.na(Frequency)) |>
  left_join(english_ws_aoa |> select(item, aoa), by=c("object"="item"))
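
A minimal sketch of the within-participant scaling mentioned in the TODO above (not yet applied to the analyses below; participants with zero variance on a measure would yield NaN):

# Sketch: z-score the count measures within each participant so that
# participants who use the response scale differently are put on a common footing
scaled_experience_data <- cleaned_experience_data |>
  mutate(
    TotalCount_z = as.numeric(scale(TotalCount)),
    InteractionCount_z = as.numeric(scale(InteractionCount)),
    .by = globalID
  )
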
participant_data = read_csv(here('data', 'main', 'participant_data.csv')) |>
  # Fixing the age of a single child whose parent entered it incorrectly; we messaged the parent to get the correct age
  mutate(childAge = ifelse(childAge == -599988, 7, childAge))
Rows: 55 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (3): childSiblingAges, globalID, feedback
dbl  (1): childAge
dttm (2): startTimestamp, endTimestamp

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
aoa_categories <- cleaned_experience_data |> filter(!is.na(aoa) & category != "favorite") |> distinct(object)
babiness <- read_csv(here::here('data', 'metadata', 'babiness.csv')) |>
  as_tibble() |>
  select(word, rating, babyAVG) |>
  filter(word %in% aoa_categories$object) |>
  mutate(object = word) |>
  summarize(babiness_mean = mean(babyAVG),
            rating_mean = mean(rating),
            n = n(), .by="object")
Rows: 22682 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): word, subjCode, task, lexicalCategory
dbl (8): rating, X30mos, phonemes, totalMorphemes, concreteness, logFreq, sy...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hist(participant_data$childAge)

# Counting trials per participant to identify those who, after the attention checks were added in, submitted complete data
full_non_pilot_data <- raw_experience_data |>
  group_by(globalID) |>
  mutate(num_trials = n()) 

time_taken <- full_non_pilot_data |>
  left_join(participant_data) |>
  mutate(
    time_taken = difftime(
      as.POSIXct(endTimestamp, format = "%Y-%m-%d %H:%M:%S"),
      as.POSIXct(startTimestamp, format = "%Y-%m-%d %H:%M:%S"),
      units = "mins"  # Adjust units as needed (e.g., "secs", "hours", etc.)
    )
  ) |> 
  ungroup() |>
  distinct(time_taken, globalID, num_trials) |>
  summarize(
    n = n(),
    mean_time = mean(as.numeric(time_taken), na.rm = TRUE),
    median_time = median(as.numeric(time_taken), na.rm = TRUE),
    .by = "num_trials"
  ) 
Joining with `by = join_by(globalID)`

Exclusions

participants_to_be_excluded = c()

Processing 4 built-in attention checks

ATTENTION_CHECKS <- tibble(
  correct_response = c('realLife', 'toy', 'realLife', 'toy'),
  object = c('unicorn', 'sunrise', 'typewriter', 'cloud')
)

attention_check_results <- raw_experience_data |> filter(category == "attention_check") |>
 left_join(ATTENTION_CHECKS) |>
  # A correct attention check response would have only included a single format, so we can directly remove '[' and ']' from any responses
  mutate(attention_check_response = gsub("\\[\\'", "", gsub("\\'\\]", "", FormatsSeen)),
    correctly_responded = attention_check_response == correct_response) |>
  mutate(correctly_responded = ifelse(is.na(correctly_responded), FALSE, correctly_responded))
Joining with `by = join_by(object)`
attention_checks_by_participant <- attention_check_results |>
  summarize(correct_attention_checks = sum(correctly_responded == TRUE),
            total_seen = n(),
            .by = globalID)

# Only excluding participants if they fail at least two attention checks, abiding by Prolific's attention check guidelines
attention_check_exclusions <- attention_checks_by_participant |>
  filter(total_seen - correct_attention_checks >= 2) |>
  pull(globalID)

participants_to_be_excluded = c(participants_to_be_excluded, attention_check_exclusions)

Processing attention checks that are inherent to the items we ask about: for example, it is not possible to have seen a real doll, and it is highly unlikely to have seen a toy beach. Participants who consistently fail these types of questions are excluded from our analyses.

# TODO: flesh out this section - avoid repeated code with attention check section
WITHIN_STUDY_ATTENTION_CHECKS <- tibble(
  incorrect_response = c('realLife', 'realLife', 'toy', 'toy', 'toy'),
  object = c('doll', 'penguin', 'beach', 'park', 'penny')
)

within_study_attention_check_results <- WITHIN_STUDY_ATTENTION_CHECKS |>
  left_join(cleaned_experience_data) |>
  mutate(correctly_responded = !str_detect(FormatsSeen, incorrect_response)) |>
  summarize(correct_attention_checks = sum(correctly_responded == TRUE),
            total_seen = n(),
            .by = globalID)
Joining with `by = join_by(object)`
# Excluding participants only if they fail all 5 of our within-study attention checks
within_study_attention_check_exclusions <- within_study_attention_check_results |>
  filter(total_seen - correct_attention_checks >= 5) |>
  pull(globalID)

participants_to_be_excluded = c(participants_to_be_excluded, within_study_attention_check_exclusions) |> unique()
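
One way to address the repeated-code TODO above: a shared scoring helper that both attention-check sections could call. A sketch, assuming a check is failed whenever the flagged response appears in FormatsSeen (the built-in checks would need the exact-match variant instead):

# Sketch: score any set of attention-check items; `checks` needs an `object`
# column and a `flagged_response` column holding the response that indicates failure
score_attention_checks <- function(data, checks) {
  checks |>
    left_join(data, by = "object") |>
    mutate(correctly_responded = !str_detect(FormatsSeen, flagged_response)) |>
    summarize(correct_attention_checks = sum(correctly_responded, na.rm = TRUE),
              total_seen = n(),
              .by = globalID)
}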

Processing feedback responses. Feedback questions are generally trickier, so we only exclude participants who could not answer all 4. We also process the responses to see which questions may have been more difficult for participants to fully understand.

# TODO: flesh out this section
cleaned_experience_data <- cleaned_experience_data |>
  filter(!(globalID %in% participants_to_be_excluded))
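
The feedback exclusion itself is still a TODO; a minimal sketch, assuming a hypothetical question-level table `feedback_data` with columns `globalID` and `feedback_response` (placeholder names, not the actual schema):

# Sketch: exclude participants who could not answer any of the 4 feedback questions
# `feedback_data` and `feedback_response` are hypothetical names
feedback_exclusions <- feedback_data |>
  summarize(n_answered = sum(!is.na(feedback_response) & feedback_response != ""),
            .by = globalID) |>
  filter(n_answered == 0) |>
  pull(globalID)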

Pivoting to longer question-level CSV

Summaries for numerical responses

total_count_summary <- cleaned_experience_data |>
  summarize(total_count_avg = mean(TotalCount), .by=c("object", "category"))

interaction_count_summary <- cleaned_experience_data |>
  filter(!is.na(InteractionCount)) |>
  summarize(interaction_count_avg = mean(InteractionCount),  .by=c("object", "category"))

Creating numerical value structure for text-based questions

frequency_responses <- cleaned_experience_data |>
  mutate(question_type = "Frequency",
         numeric_val = case_when(Frequency == 'Not at all' ~ 1, 
                                      Frequency == 'A little' ~ 2, 
                                      Frequency == 'A moderate amount' ~ 3, 
                                      Frequency == 'A lot' ~ 4, 
                                      Frequency == 'A great deal' ~ 5
                                      ))
frequency_summary <- frequency_responses |>
 summarize(frequency_avg = mean(numeric_val), .by=c("object", "category"))
# Number of formats an object is seen in
format_responses <- cleaned_experience_data %>%
  mutate(question_type = "FormatsSeen",
         seen_drawing = str_detect(FormatsSeen, 'drawing'),
         seen_toy = str_detect(FormatsSeen, 'toy'),
         seen_photo = str_detect(FormatsSeen, 'photo'),
         seen_video = str_detect(FormatsSeen, 'video'),
         seen_life = str_detect(FormatsSeen, 'realLife'),
         seen_never = str_detect(FormatsSeen, 'neverSeen'),
         # Logical values coerce to 0/1, so this counts the formats seen per row
         numeric_val = seen_drawing + seen_toy + seen_photo + seen_video + seen_life)

# Averaging each 'seen' variable per item
diff_formats_summary <- format_responses %>%
  group_by(object, category) %>%
  summarize(seen_toy_avg = mean(seen_toy),
            seen_drawing_avg = mean(seen_drawing),
            seen_photo_avg = mean(seen_photo),
            seen_video_avg = mean(seen_video),
            seen_real_life_avg = mean(seen_life)) %>%
  ungroup() %>%
  mutate(categories = fct_reorder(category, seen_real_life_avg, .desc = TRUE))
`summarise()` has grouped output by 'object'. You can override using the
`.groups` argument.
format_count <- format_responses %>%
  summarize(count_formats = sum(seen_drawing + seen_toy + seen_photo + seen_video + seen_life), .by=c("globalID", "category", "object")) %>%
  summarize(avg_num_formats = mean(count_formats), .by=c("category", "object"))

Combining numerical values of each question type

# Pivoting all the numerical survey question responses at the question level
numerical_responses_long <- cleaned_experience_data |>
  pivot_longer(c(TotalCount, InteractionCount), names_to = "question_type", values_to = "numeric_val")

question_cols = c("TotalCount", "InteractionCount", "Frequency", "FormatsSeen")
combined_responses <- rbind(
  frequency_responses |> select(-all_of(question_cols)),
  numerical_responses_long |> select(-any_of(question_cols)),
  format_responses |> select(-starts_with("seen")) |> select(-all_of(question_cols))
) |>
  filter(!is.na(numeric_val))

combined_normalized <- normalized_responses(combined_responses)

combined_responses_without_favorites <- combined_responses |> filter(category != "favorite") 

# Switching back to single row for each participant and adding participant data
combined_normed_wide <- combined_normalized |>
  filter(category != "favorite") |>
  pivot_wider(
    id_cols = c("object", "globalID", "category", "aoa"),
    names_from = question_type,
    values_from = c(normalized_val, numeric_val),
    names_glue = "{question_type}_{.value}"
  ) |>
  left_join(participant_data, by = "globalID") |>
  add_age_split()

combined_normed_summary <- combined_normed_wide |>
  select(-globalID) |>
  summarize(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)), .by=c("object", "category"))

combined_normed_summary_age_split <- combined_normed_wide |>
  select(-globalID) |>
  summarize(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)), .by = c("object", "category", "age_half"))

Reliability checks

Within-item split-half reliability

# split_half() expects the grouping units in `categories`; here they are participant IDs
categories <- unique(frequency_responses$globalID)
splithalf_title <- cowplot_title("Split-half reliabilities within-item")
# Creating a helper function to pass in common parameters from all within-item split-half reliability checks
withinitem_splithalf <- function(data, title, response_val="numeric_val") {
  return(split_half(data, categories, title, category_name = "globalID", input_group = "object", input_color = "", input_label="object", response_val = response_val))
}


splithalf_plots <- cowplot::plot_grid(
  withinitem_splithalf(combined_responses_without_favorites |> filter(question_type == "FormatsSeen"), "formats seen"),
  withinitem_splithalf(combined_responses_without_favorites |> filter(question_type == "InteractionCount"), "interaction count"),
  withinitem_splithalf(combined_responses_without_favorites |> filter(question_type == "TotalCount"), "total count"),
  withinitem_splithalf(combined_normalized |> filter(category != "favorite"), "all normalized responses", response_val = "normalized_val"),
  labels = "auto"
)
cowplot::plot_grid(splithalf_title, splithalf_plots, rel_heights = c(0.2, 1), ncol=1)
Warning: ggrepel: 44 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 39 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 43 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 45 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

cowplot::save_plot(here("data", "main", "figures", "larger_split_half_plots.png"), cowplot::plot_grid(splithalf_title, splithalf_plots, rel_heights = c(0.2, 1), ncol=1), base_width = 10, base_height = 12, bg="white")
Warning: ggrepel: 23 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 39 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Looking at the interaction count stats, the distribution is a little skewed within participants.

int_count_data <- combined_responses |> filter(question_type == "InteractionCount")
summary_interaction_count <- raw_experience_data |>
  group_by(object) |>
  summarize(variance = var(InteractionCount)) |>
  arrange(desc(variance))

raw_experience_data$object <- factor(raw_experience_data$object, levels = summary_interaction_count$object)

# Create the plot
ggplot(raw_experience_data, aes(x = globalID, y = InteractionCount)) +
  geom_boxplot(outlier.alpha = 0.6, fill = "skyblue") +
  geom_jitter(alpha = 0.3, color = "darkblue", width = 0.2) +
  labs(
    title = "Interaction count distributions by participant",
    x = "Global ID",
    y = "Interaction count"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )
Warning: Removed 888 rows containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 888 rows containing missing values or values outside the scale range
(`geom_point()`).

Within-participant split-half reliability

# TODO: add within-participant split-half reliability just for sanity
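
A minimal sketch of what this check could look like, assuming the combined_responses_without_favorites structure above (the item halves are re-randomized on each call):

# Sketch: split each participant's items into two random halves and correlate
# the per-participant half means across participants
within_participant_split_half <- function(data, response_val = "numeric_val") {
  objects <- unique(data$object)
  shuffled_objects <- sample(objects)
  half1_objects <- shuffled_objects[1:floor(length(shuffled_objects) / 2)]

  half_means <- data |>
    mutate(half = ifelse(object %in% half1_objects, "half1", "half2")) |>
    summarize(mean_val = mean(.data[[response_val]], na.rm = TRUE),
              .by = c(globalID, half)) |>
    pivot_wider(names_from = half, values_from = mean_val)

  cor.test(half_means$half1, half_means$half2)
}

# Example usage on the interaction count responses:
# within_participant_split_half(
#   combined_responses_without_favorites |> filter(question_type == "InteractionCount"))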

Analysis plots

Babiness correlations

experience_by_babiness <- combined_normed_summary |>
  left_join(babiness) |>
  filter(!is.na(babiness_mean))
Joining with `by = join_by(object)`
experience_by_babiness_age_split <- combined_normed_summary_age_split |>
  left_join(babiness) |>
  filter(!is.na(babiness_mean))
Joining with `by = join_by(object)`
# all kids, rating mean
babiness_correlation(experience_by_babiness, "rating_mean", "normalized responses")
`geom_smooth()` using method = 'loess'
Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

cor.test(experience_by_babiness$Frequency_numeric_val, experience_by_babiness$babiness_mean)

    Pearson's product-moment correlation

data:  experience_by_babiness$Frequency_numeric_val and experience_by_babiness$babiness_mean
t = 3.2775, df = 36, p-value = 0.002325
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.1886134 0.6928872
sample estimates:
      cor 
0.4793911 
cor.test(experience_by_babiness$FormatsSeen_numeric_val, experience_by_babiness$babiness_mean)

    Pearson's product-moment correlation

data:  experience_by_babiness$FormatsSeen_numeric_val and experience_by_babiness$babiness_mean
t = 0.37618, df = 36, p-value = 0.709
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.2623578  0.3747604
sample estimates:
       cor 
0.06257368 
younger_kids_babiness <- babiness_correlation(experience_by_babiness_age_split |> filter(age_half == "younger"), "InteractionCount_numeric_val", "interaction count", title="younger children (< 22 months)")
older_kids_babiness <- babiness_correlation(experience_by_babiness_age_split |> filter(age_half == "older"), "InteractionCount_numeric_val", "interaction count", title="older children (>= 22 months)")
cowplot::plot_grid(younger_kids_babiness, older_kids_babiness)
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
Warning: ggrepel: 16 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

AoA plots

cor.test(cleaned_experience_data$InteractionCount, cleaned_experience_data$aoa)

    Pearson's product-moment correlation

data:  cleaned_experience_data$InteractionCount and cleaned_experience_data$aoa
t = -6.9785, df = 1282, p-value = 4.773e-12
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.2434607 -0.1380400
sample estimates:
      cor 
-0.191302 
cor.test(cleaned_experience_data$TotalCount, cleaned_experience_data$aoa)

    Pearson's product-moment correlation

data:  cleaned_experience_data$TotalCount and cleaned_experience_data$aoa
t = -8.8973, df = 1489, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.2723381 -0.1759189
sample estimates:
       cor 
-0.2246784 
empty <- textGrob("", gp=gpar(fontsize=base_size_print))
# TODO: add labels back to the graphs, switch to RColorBrewer
text_low <- textGrob("Not at all", gp=gpar(fontsize=base_size_print))
text_high <- textGrob("A great deal", gp=gpar(fontsize=base_size_print))

experience_by_aoa <- combined_normed_summary |>
  filter(!is.na(aoa))

p1 <- aoa_plot(experience_by_aoa, "how often seen", "Frequency_numeric_val")

p2 <- aoa_plot(experience_by_aoa, "formats seen", "FormatsSeen_numeric_val")

p3 <- aoa_plot(experience_by_aoa, "interaction count", "InteractionCount_numeric_val")

p4 <- aoa_plot(experience_by_aoa, "number of exemplars seen", "TotalCount_numeric_val")
aoa_plots <- cowplot::plot_grid(cowplot_title("AoA plots"), cowplot::plot_grid(p1, p2, p3, p4), rel_heights = c(0.2, 1), ncol=1)
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
cowplot::save_plot(here("data", "main", "figures", "aoa_plots.png"), aoa_plots, base_width = 10, base_height = 12, bg="white")
Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 23 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
p3
`geom_smooth()` using method = 'loess'
Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

## Child age correlation
age_split_aoa <- combined_normed_summary_age_split |>
  filter(!is.na(aoa))

younger_kids_aoa <- aoa_plot(age_split_aoa |> filter(age_half == "younger"), "interaction count", "InteractionCount_numeric_val", "younger children (< 22 months)")
older_kids_aoa <- aoa_plot(age_split_aoa |> filter(age_half == "older"), "interaction count", "InteractionCount_numeric_val", "older children (>= 22 months)")

really_old_children_aoa <- combined_normed_wide |> filter(childAge > 30 & !is.na(aoa)) |>
  select(-globalID) |>
  summarize(n = n(), across(where(is.numeric), ~ mean(.x, na.rm = TRUE)), .by = c("object", "category"))
aoa_plot(really_old_children_aoa, "interaction count", "InteractionCount_numeric_val", "children 30-36 months")
`geom_smooth()` using method = 'loess'
Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

cowplot::plot_grid(younger_kids_aoa, older_kids_aoa)
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 27 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

By-item heatmaps

# Prepare the data for the correlation heatmaps

# Select only relevant numeric columns
numeric_vars <- c("Frequency_normalized_val", "TotalCount_normalized_val", "InteractionCount_normalized_val", "FormatsSeen_normalized_val")

# Aggregate data by category using mean
agg_data <- combined_normed_summary |>
  dplyr::select(all_of(numeric_vars))

agg_data_pivoted <- as.data.frame(t(agg_data[,-1]))  # Transpose numeric values: decided not to use this since it's really just the mean values
# TODO: I'll have to do PCA or FA to find item and participant clusters

agg_data_per_participant <- combined_normed_wide |>
  arrange(category, object) |>
  pivot_wider(names_from = "object", values_from = "InteractionCount_normalized_val", id_cols=globalID) |>
  dplyr::select(-globalID)
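
A minimal sketch of the PCA flagged in the TODO above, with per-item mean imputation for missing cells (a placeholder choice; factor analysis or a proper missing-data strategy may fit better):

# Sketch: PCA over the participant-by-item matrix of normalized interaction counts
# Items with zero variance after imputation would need to be dropped first
item_matrix <- agg_data_per_participant |>
  mutate(across(everything(), ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)))

item_pca <- prcomp(item_matrix, center = TRUE, scale. = TRUE)

# Variance explained by the leading components
summary(item_pca)$importance[, 1:5]

# Item loadings on the first component, for eyeballing clusters
head(sort(item_pca$rotation[, 1], decreasing = TRUE))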

# Create a category-color mapping
categories <- unique(combined_normed_wide$category) 
category_colors <- setNames(RColorBrewer::brewer.pal(length(categories), "Set2"), categories)
# Assign colors to objects based on category
object_categories <- combined_normed_wide |> 
  distinct(object, category) |> 
  arrange(category, object)
category_colors_mapped <- category_colors[object_categories$category]


# Compute correlation matrix
cor_matrix <- cor(agg_data_per_participant, use = "pairwise.complete.obs")

# Convert correlation matrix to long format for ggplot2
cor_melted <- reshape::melt(cor_matrix)
Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE
Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE
# Plot heatmap
ggplot(cor_melted, aes(X1, X2, fill = value)) +
  geom_tile() +
   scale_fill_viridis(option = "mako", direction = 1) + 
  theme_minimal() +
  labs(title = "Heatmap of Interaction Count correlations",
       fill = "Correlation") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, color = category_colors_mapped),
        axis.text.y = element_text(color = category_colors_mapped))
Warning: Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.
Warning: Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.

# TODO: add legend with category colors

# TODO: Analyze favorites data
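
A minimal sketch of where the favorites analysis could start, assuming the "favorite" rows share the same response structure as the other categories:

# Sketch: summarize responses for the "favorite" items that are filtered out elsewhere
favorites_summary <- combined_responses |>
  filter(category == "favorite") |>
  summarize(mean_val = mean(numeric_val, na.rm = TRUE),
            n = n(),
            .by = c(object, question_type))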

Item variability

question_based_item_variabilities <- coefficient_of_var(combined_responses_without_favorites, "numeric_val", c("question_type", "object", "category"))
normed_item_variabilities <- coefficient_of_var(combined_normalized |> filter(category != "favorite"), "normalized_val", c("object", "category"))


item_variability_scatterplot(question_based_item_variabilities |> filter(question_type == "FormatsSeen"), combined_responses_without_favorites, "Number of formats")
Warning: ggrepel: 36 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

item_variability_plots <- cowplot::plot_grid(item_variability_scatterplot(question_based_item_variabilities |> filter(question_type == "TotalCount"), combined_responses_without_favorites, "Number of exemplars seen"), item_variability_scatterplot(question_based_item_variabilities |> filter(question_type == "InteractionCount"), combined_responses_without_favorites, "Interaction count"), item_variability_scatterplot(question_based_item_variabilities |> filter(question_type == "Frequency"), combined_responses_without_favorites, "How often seen"), item_variability_scatterplot(normed_item_variabilities, combined_normalized |> filter(category != "favorite"), "Normalized", y_val="normalized_val"))

item_variability_title <- cowplot_title("Item variabilities across participants")

larger_item_vars <- cowplot::plot_grid(item_variability_title, item_variability_plots, rel_heights = c(0.2, 1), ncol=1)

cowplot::save_plot(here("data", "main", "figures","larger_item_var_plots.png"), larger_item_vars, base_width = 20, base_height = 24, bg="white")
Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
larger_item_vars
Warning: ggrepel: 47 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 47 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 46 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 48 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

# TODO: move analysis to separate file