Test pilot visual experience survey data for reliability and validity of questions and items

Author

Tarun Sepuri

Published

November 15, 2024

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(here)

here() starts at /Users/visuallearninglab/Documents/vedi_survey/vedi

library(knitr)
library(cowplot)


Attaching package: 'cowplot'

The following object is masked from 'package:lubridate':

    stamp

library(grid)
library(ggthemes)


Attaching package: 'ggthemes'

The following object is masked from 'package:cowplot':

    theme_map

library(mirt)

Loading required package: stats4
Loading required package: lattice

Helper functions

response_variability <- function(data, question) {
  filtered_data <- data %>% summarize(n = n(), .by = c("response"))
  return(ggplot(filtered_data, aes(x = response, y = n)) +
           geom_bar(stat = "identity") +
           labs(title=paste("Responses based on options for", question))
         )
}

response_grouping <- function(data, question) {
  return(data %>% filter(question_type==question) %>% summarize(n = n(), .by = c("response")))
}

# To create a graph where the response data is randomly shuffled into two halves and compared and correlated
split_half <- function(data, categories, question, category_name="ResponseId", input_group=c("category", "type"), input_color="type", input_label="category", response_val="numeric_val") {
  shuffled_categories <- sample(categories)
  # Split the shuffled categories into two halves
  half1_categories <- shuffled_categories[1:floor(length(shuffled_categories) / 2)]
  half2_categories <- shuffled_categories[(floor(length(shuffled_categories) / 2) + 1):length(shuffled_categories)]
  half1 <- data |>
    filter(!!sym(category_name) %in% half1_categories) |>
    # Ensuring that there is only one average per category before calculating mean counts
    mutate((!!sym(response_val)) := mean(!!sym(response_val)), .by=c(category_name, input_group)) |>
    summarize(mean_count = mean(!!sym(response_val)), .by=input_group)
  
  half2 <- data |>
    filter(!!sym(category_name) %in% half2_categories) |>
    mutate(!!sym(response_val) := mean(!!sym(response_val)), .by=c(category_name, input_group)) |>
    summarize(mean_count = mean(!!sym(response_val)), .by=input_group)
  
  split_half_plot <- ggplot(data=inner_join(half1, half2, by = input_group), color="purple", aes(x = mean_count.x, y = mean_count.y, color=!!sym(input_color), label=!!sym(input_label))) +
    geom_point(alpha=0.6, size=2) +
    geom_smooth(method='lm', color="gray") +
    labs(x = "Responses first half", y = "Responses second half", title=paste("Split-half reliability for", question)) +
    ggpubr::stat_cor(size = 2) +
    theme(plot.title=element_text(size=10), axis.title = element_text(size = 10))
  # If we're grouping based on item names, add labels for each item
  if ("category" %in% input_group) {
    split_half_plot <- split_half_plot + ggrepel::geom_label_repel(segment.colour="grey", segment.alpha =.5)
  }
  return(split_half_plot)
}

questiontype_age_lineplot <- function(data, input_title, y_val="numeric_val") {
  summarized_by_id <- summarized_data(data, "childs_age", y_val, c("ResponseId", "question_type"))
  summarized_by_age <- summarized_data(summarized_by_id, "childs_age", "mean_value", "question_type")
  return(ggplot(summarized_by_id, aes(x = childs_age, y = mean_value, group = question_type, color = question_type)) +
           geom_line(data = summarized_by_age, linewidth = 0.8, show.legend = TRUE, alpha = 0.8) +
           geom_point(alpha = 0.5, position = position_jitter(width = .01, height = .01)) +
           labs(x = "Child age (months)", y = "Response value", title=input_title, color="Question Type") +
           geom_smooth(method = 'lm', aes(color=question_type)) +
           scale_x_continuous(breaks = seq(5, 45, by = 5), limits = c(10, 50))
         )
}

coefficient_of_var <- function(data, value, group) {
  item_variability <- data |>
    summarize(mean_item = mean(!!sym(value)),
              sd_item = sd(!!sym(value)),
              # Calculating 95% CI
              margin_of_error = qt(0.975, df = n() - 1) * (sd_item / sqrt(n())),
              cv = (sd_item / mean_item) * 100, .by = group)
  return(item_variability)
}

item_variability_scatterplot <- function(item_vars, all_responses, input_title, group=c("category", "type", "question_type"), y_val="numeric_val") {
  return(ggplot(item_vars, aes(x = cv, y = mean_item, label = category)) +
    geom_errorbar(
      aes(ymin = mean_item - margin_of_error, ymax = mean_item + margin_of_error), 
      width = 0.2, color = "black",
      alpha = 0.2
    ) + 
    geom_point(aes(color = type), size = 3, shape=17) +       # Add points with color based on category
    ggrepel::geom_label_repel(size = 4, force = 40) +
    geom_jitter(data=(all_responses |> left_join(item_vars, by = group)), aes(x=cv, y=!!sym(y_val), color = type), alpha = 0.2, height = 0.3, width = 0.3) +
    labs(
      title = input_title,
      y = "Mean of Numeric Value",
      x = "Coefficient of Variation (%)"
    ) +
    theme(legend.position = "bottom"))
}

babiness_correlation <- function(data, x_val, xlab_val) {
 return(ggplot(data, aes(x=!!sym(x_val), y=babiness_mean, label=category)) +
    geom_jitter(height=.03, width=.03, alpha=.8, size=2) +
    ggthemes::theme_few(base_size=base_size_print) +
    geom_smooth(span=10, alpha=.2, color='dark grey') +
    ggrepel::geom_label_repel(segment.colour="grey", segment.alpha =.2, aes(color=type), size = label_size) +
    xlab(xlab_val) +
    ylab('Babiness rating') +
    theme(legend.position='none', aspect.ratio=1) +
    coord_cartesian(clip='off'))
}

normalized_responses <- function(data) {
  return(data |> 
           group_by(question_type) |>
           mutate(
             normalized_val = (numeric_val - min(numeric_val, na.rm = TRUE)) / 
               (max(numeric_val, na.rm = TRUE) - min(numeric_val, na.rm = TRUE))
           ) |>
           ungroup()
         )
}

# To add a title to the top of a cowplot arrangement
cowplot_title <- function(title_text) {
  title <- ggdraw() + 
    draw_label(
      title_text,
      fontface = 'bold',
      x = 0,
      hjust = 0
    ) +
    theme(
      plot.margin = margin(0, 0, 0, 7)
    )
  return(title)
}

summarized_data <- function(data, x_var, y_var, group_var) {
  return(data %>%
           group_by_at(c(x_var, group_var)) %>%
           summarise(mean_value = mean(.data[[y_var]], na.rm = TRUE),
                     sd_value = sd(.data[[y_var]], na.rm = TRUE),
                     n = n(),
                     se = sd_value / sqrt(n()),
                     ci_lower = mean_value - qt(1 - (0.05 / 2), n - 1) * se,
                     ci_upper = mean_value + qt(1 - (0.05 / 2), n - 1) * se,
                     .groups = 'drop')
  )
}

Load and preprocess data

raw_data = read_csv(here('data','pilot','Visual Experience Development Survey_March 30, 2021_17.49.csv'))

Rows: 38 Columns: 174
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (174): StartDate, EndDate, Status, IPAddress, Progress, Duration (in sec...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

meta <- read_csv(here::here('data/pilot/category_metadata.csv')) %>%
  as_tibble()

Rows: 50 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): category, type, category_short

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

aoa <- read_csv(here::here('data/pilot/VEDI_AoAs.csv')) %>%
  as_tibble() %>%
  select(aoa, definition) %>%
  rename(category = definition)

New names:
Rows: 49 Columns: 11
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(7): item_id, definition, type, category, lexical_category, lexical_clas... dbl
(3): ...1, num_item_id, aoa lgl (1): complexity_category
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`

babiness <- read_csv(here::here('data/pilot/babiness.csv')) |>
  as_tibble() |>
  select(word, rating, babyAVG) |>
  filter(word %in% aoa$category) |>
  mutate(category = word) |>
  summarize(babiness_mean = mean(babyAVG),
            rating_mean = mean(rating),
            n = n(), .by="category")

Rows: 22682 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): word, subjCode, task, lexicalCategory
dbl (8): rating, X30mos, phonemes, totalMorphemes, concreteness, logFreq, sy...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Look through question names and preprocess qualtrics data

question_names <- raw_data %>%
  select(contains('_Q')) %>%
  filter(row_number()==1)  

question_type <- question_names %>%
  gather(key = question_id, value = question_text) %>%
  mutate(question_type = case_when(str_detect(question_text,'How often has your child seen') ~ "how_often",
                                   str_detect(question_text,'In what formats has your child seen')  ~ "different_formats", 
                                   str_detect(question_text,'How many different')  ~ "how_many")) %>%
  mutate(category = str_split_fixed(question_text,' -',2)[,1]) %>%
  select(-question_text) %>%
  left_join(meta)

Joining with `by = join_by(category)`

cleaned_data <- raw_data %>%
  filter(DistributionChannel != 'preview') %>% # not preview
  filter(Finished == 'True') %>%
  filter(Q8 == "Yes") %>% # positive consent
  mutate(childs_age = Q16) %>%
  rowwise() %>%
  mutate(childs_age = ifelse(grepl(" months", childs_age), as.numeric(str_split_1(childs_age, " ")[1]), as.numeric(childs_age)*12)) %>%
  filter(childs_age != 0) %>% # all of these responses are NA
  select(childs_age, ResponseId,contains('_Q')) %>%
  gather(key = question_id, value = response, -childs_age, -ResponseId) %>%
  left_join(question_type, by = c("question_id")) %>%
  select(-category) %>%
  rename(category = category_short)

Data structure for how often seen

how_often_responses <- cleaned_data %>%
  filter(question_type == 'how_often') %>%
  filter(!is.na(response)) %>%
  mutate(numeric_val = case_when(response == 'Never' ~ 0, 
                                      response == 'Once or twice' ~ 1, 
                                      response == 'A few times' ~ 2, 
                                      response == 'Every few months' ~ 3, 
                                      response == 'Monthly' ~ 4, 
                                      response == 'Weekly' ~ 5, 
                                      response == 'Daily' ~ 6, 
                                      response == 'Multiple times a day' ~ 7
                                      ))
how_often_summary <- how_often_responses |>
 summarize(how_often_avg = mean(numeric_val), .by="category")

Data structure for how many exemplars seen

how_many_responses <- cleaned_data |>
  filter(question_type == 'how_many') |>
  filter(!is.na(response)) |>
  mutate(numeric_val = case_when(response == '0' ~ 0, 
                                      response == '1' ~ 1, 
                                      response == '2' ~ 2, 
                                      response == '<5' ~ 3, 
                                      response == '<10' ~ 7.5, 
                                      response == '<25' ~ 17.5, 
                                      response == '25 or more' ~ 40
                                      ))

how_many_summary <- how_many_responses %>%
  summarize(how_many_avg = mean(numeric_val), .by=c("category"))

Data structure for counting formats

format_responses <- cleaned_data %>%
  filter(question_type == 'different_formats') %>%
  filter(!is.na(response)) %>%
  mutate(seen_drawing = str_detect(response,'A drawing')) %>%
  mutate(seen_toy = str_detect(response,'A toy')) %>%
  mutate(seen_photo = str_detect(response,'A photo')) %>%
  mutate(seen_video = str_detect(response,'A video')) %>%
  mutate(seen_life = str_detect(response,'In real life')) %>%
  rowwise() %>%
  mutate(numeric_val = sum(seen_drawing + seen_toy + seen_photo + seen_video + seen_life)) %>%
  ungroup()

diff_formats_summary <- format_responses %>%
  group_by(category) %>%
  summarize(seen_toy_avg = mean(seen_toy), seen_drawing_avg = mean(seen_drawing), seen_photo_avg = mean(seen_photo), seen_video_avg = mean(seen_video), seen_real_life_avg = mean(seen_life)) %>%
  ungroup() %>%
  mutate(categories = fct_reorder(category, seen_real_life_avg, .desc=TRUE))
  
format_count <- format_responses %>%
  summarize(count_formats = sum(seen_drawing + seen_toy + seen_photo + seen_video + seen_life), .by=c("ResponseId", "category", "type")) %>%
  summarize(avg_num_formats = mean(count_formats), .by=c("category", "type"))

Creating a master combined and normalized responses data frame, across question types.

combined_responses <- rbind(how_many_responses, how_often_responses, format_responses |> select(-starts_with("seen")))
combined_normalized <- normalized_responses(combined_responses)

##Split-half reliability ###Within-participants

categories <- unique(how_many_responses$category)

# Creating a helper function to pass in common parameters from all within-participant split-half reliability checks
withinparticipant_splithalf <- function(data, title) {
  return(split_half(data, categories, title, category_name = "category", input_group = "ResponseId", input_color = "", input_label=""))
}
splithalf_title <- cowplot_title("Split-half reliabilities across question types")
splithalf_plots <- cowplot::plot_grid(withinparticipant_splithalf(how_many_responses, "number of exemplars seen"), withinparticipant_splithalf(format_responses, "number of formats"), withinparticipant_splithalf(how_often_responses, "how often seen"), labels = "auto")

Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(category_name)

  # Now:
  data %>% select(all_of(category_name))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.

Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(input_group)

  # Now:
  data %>% select(all_of(input_group))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.

`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

cowplot::plot_grid(splithalf_title, splithalf_plots, rel_heights = c(0.2, 1), ncol=1)

# Maybe using Cronbach's alpha, need some measure to look at reliability across items too

###Within-item

categories <- unique(how_many_responses$ResponseId)
splithalf_title <- cowplot_title("Split-half reliabilities within-item")
splithalf_plots <- cowplot::plot_grid(split_half(how_many_responses, categories, "number of exemplars seen"), split_half(format_responses, categories, "number of formats"), split_half(how_often_responses, categories, "how often seen"), labels = "auto", split_half(combined_normalized, categories, "all normalized responses", response_val = "normalized_val"))

`geom_smooth()` using formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

`geom_smooth()` using formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

`geom_smooth()` using formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

`geom_smooth()` using formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

cowplot::plot_grid(splithalf_title, splithalf_plots, rel_heights = c(0.2, 1), ncol=1)

Warning: ggrepel: 48 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Warning: ggrepel: 45 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Warning: ggrepel: 48 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggrepel: 48 unlabeled data points (too many overlaps). Consider increasing max.overlaps

cowplot::save_plot("larger_split_half_plots.png", cowplot::plot_grid(splithalf_title, splithalf_plots, rel_heights = c(0.2, 1), ncol=1), base_width = 10, base_height = 12)

Warning: ggrepel: 18 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Normalized responses graph is showing signs that are sample size is too small for more intensive analysis.

##Looking at item-based variability

item_variability <- combined_responses |>
  summarize(mean_item = mean(numeric_val),
         sd_item = sd(numeric_val),
         # Calculating 95% CI
         margin_of_error = qt(0.975, df = n() - 1) * (sd_item / sqrt(n())),
         cv = (sd_item / mean_item) * 100, .by = c("question_type", "type", "category"))

question_based_item_variabilities <- coefficient_of_var(combined_responses, "numeric_val", c("question_type", "type", "category"))

Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(group)

  # Now:
  data %>% select(all_of(group))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.

normed_item_variabilities <- coefficient_of_var(combined_normalized, "normalized_val", c("type", "category"))

item_variability_scatterplot(question_based_item_variabilities |> filter(question_type == "different_formats"), format_responses, "Number of formats")

Warning: ggrepel: 23 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

item_variability_plots <- cowplot::plot_grid(item_variability_scatterplot(question_based_item_variabilities |> filter(question_type == "how_many"), how_many_responses, "Number of exemplars seen"), item_variability_scatterplot(question_based_item_variabilities |> filter(question_type == "different_formats"), format_responses, "Number of formats"), item_variability_scatterplot(question_based_item_variabilities |> filter(question_type == "how_often"), how_often_responses, "How often seen"), item_variability_scatterplot(normed_item_variabilities, combined_normalized, "Normalized", group=c("category", "type"), y_val="normalized_val"))

item_variability_title <- cowplot_title("Item variabilities across participants")

cowplot::plot_grid(item_variability_title, item_variability_plots, rel_heights = c(0.2, 1), ncol=1)

Warning: ggrepel: 49 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Warning: ggrepel: 48 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Warning: ggrepel: 47 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Warning: ggrepel: 50 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

cowplot::save_plot("larger_item_var_plots.png", cowplot::plot_grid(item_variability_title, item_variability_plots, rel_heights = c(0.2, 1), ncol=1), base_width = 20, base_height = 24)

Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

IRT

combined_normalized_response_summary <- combined_normalized |>
  summarize(mean_norm = mean(normalized_val), .by=c("ResponseId", "category", "type"))
  
combined_normalized_item_space <-  combined_normalized_response_summary |>
  filter(!(category == "penguin" & type == "object")) |>
  pivot_wider(
    id_cols = ResponseId,
    names_from = category,
    values_from = mean_norm
) |>
  select(-ResponseId)

A little too exploratory for our current sample size.

hist(unname(unlist(combined_normalized_item_space)))

combined_normalized_irt <- combined_normalized_item_space |>
 mutate(across(where(is.numeric), ~ if_else(. > 0.5, 1, 0)))

irt_2pl <- mirt(combined_normalized_irt, 1, itemtype = "2PL")
irt_4pl <- mirt(combined_normalized_irt, 1, itemtype = "4PL")

Warning in EM.group(pars = pars, constrain = constrain, Ls = Ls, PrepList =
PrepList, : EM cycles terminated after 500 iterations.

##Correlation with babiness

combined_normed_wide <- combined_normalized |>
  pivot_wider(
    id_cols = c("category", "type", "ResponseId"),
    names_from = question_type,
    values_from = c(normalized_val, numeric_val),
    names_glue = "{question_type}_{.value}"
  )

combined_normed_summary <- combined_normed_wide |>
  select(-ResponseId) |>
  summarize(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)), .by=c("category", "type"))

experience_by_babiness <- combined_normed_summary |>
  left_join(babiness) %>%
  filter(!is.na(babiness_mean))

Joining with `by = join_by(category)`

cor.test(experience_by_babiness$how_many_numeric_val, experience_by_babiness$babiness_mean)


    Pearson's product-moment correlation

data:  experience_by_babiness$how_many_numeric_val and experience_by_babiness$babiness_mean
t = 1.8345, df = 39, p-value = 0.07422
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.02825714  0.54245779
sample estimates:
      cor 
0.2818435

cor.test(experience_by_babiness$how_often_numeric_val, experience_by_babiness$babiness_mean)


    Pearson's product-moment correlation

data:  experience_by_babiness$how_often_numeric_val and experience_by_babiness$babiness_mean
t = 1.8337, df = 39, p-value = 0.07433
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.02837004  0.54237804
sample estimates:
      cor 
0.2817394

cor.test(experience_by_babiness$different_formats_numeric_val, experience_by_babiness$babiness_mean)


    Pearson's product-moment correlation

data:  experience_by_babiness$different_formats_numeric_val and experience_by_babiness$babiness_mean
t = 2.7992, df = 39, p-value = 0.007924
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.1159604 0.6365671
sample estimates:
     cor 
0.409019

Plotting correlation with babiness

base_size_print=10
label_size=3

babiness_correlation(experience_by_babiness, "how_often_numeric_val", 'How often seen')

`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

babiness_correlation(experience_by_babiness, "different_formats_numeric_val", "Number of different formats")

`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

babiness_correlation(experience_by_babiness, "how_many_numeric_val", "Number of exemplars seen")

`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

##Age-based performance

age_title <- cowplot_title("Response values across age of child")
age_plots <- cowplot::plot_grid(questiontype_age_lineplot(how_many_responses, "Number of exemplars seen"), questiontype_age_lineplot(how_often_responses, "How often seen"), questiontype_age_lineplot(format_responses, "Number of formats seen"), questiontype_age_lineplot(combined_normalized, "Normed averages", y_val="normalized_val"), labels = "auto")

Warning: There were 8 warnings in `summarise()`.
The first warning was:
ℹ In argument: `ci_lower = mean_value - qt(1 - (0.05/2), n - 1) * se`.
ℹ In group 2: `childs_age = 18` and `question_type = "how_many"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 7 remaining warnings.

Warning: There were 8 warnings in `summarise()`.
The first warning was:
ℹ In argument: `ci_lower = mean_value - qt(1 - (0.05/2), n - 1) * se`.
ℹ In group 2: `childs_age = 18` and `question_type = "how_often"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 7 remaining warnings.

Warning: There were 8 warnings in `summarise()`.
The first warning was:
ℹ In argument: `ci_lower = mean_value - qt(1 - (0.05/2), n - 1) * se`.
ℹ In group 2: `childs_age = 18` and `question_type = "different_formats"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 7 remaining warnings.

Warning: There were 24 warnings in `summarise()`.
The first warning was:
ℹ In argument: `ci_lower = mean_value - qt(1 - (0.05/2), n - 1) * se`.
ℹ In group 4: `childs_age = 18` and `question_type = "different_formats"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 23 remaining warnings.

`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

cowplot::plot_grid(age_title, age_plots, rel_heights = c(0.2, 1), ncol=1)

##Age-of-acquisition graphs pulled from Bria’s analysis

Analyzing option choices for ‘how many’ question

response_variability(how_many_responses, "how_many")

print(head(how_many_responses |>
  group_by(response, ResponseId) |> 
  tally() %>%
  spread(response, n, fill = 0)))

# A tibble: 6 × 8
  ResponseId        `<10` `<25`  `<5`   `0`   `1`   `2` `25 or more`
  <chr>             <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>        <dbl>
1 R_11YNuidJ1noAsvh     8     3    23     1     3     9            3
2 R_1dLGCTto4UjrQ6r     5    12    11     2     0     0           20
3 R_1gI5RxsBLbmrjqq    13    15     6     0     0     0           16
4 R_1JDCr3G72hij1l3     7    21     3     0     0     0           19
5 R_1juEcK4yehmIfTI     4     1    13     3     8    20            1
6 R_1kHTcHNlPyWCbSw    10     0    22     0     4    10            4

response_summary <- how_many_responses %>%
  group_by(ResponseId) %>%
  summarize(mean_response = mean(numeric_val, na.rm = TRUE),
            mode_response = which.max(tabulate(numeric_val)))

# View the summary of mean responses per participant
head(response_summary)

# A tibble: 6 × 3
  ResponseId        mean_response mode_response
  <chr>                     <dbl>         <int>
1 R_11YNuidJ1noAsvh          6.45             3
2 R_1JDCr3G72hij1l3         23.8             17
3 R_1LMZj60uI67ebbS         14.5              7
4 R_1dLGCTto4UjrQ6r         21.6             40
5 R_1gI5RxsBLbmrjqq         20.4             40
6 R_1juEcK4yehmIfTI          3.49             2

# Visualizing the distribution of mean responses per participant
ggplot(response_summary, aes(x = mean_response)) +
  geom_histogram(bins = 20, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Mean Responses Per Participant",
       x = "Mean Response Value", y = "Frequency") +
  theme_minimal()