response_variability <-function(data, question) { filtered_data <- data %>%summarize(n =n(), .by =c("response"))return(ggplot(filtered_data, aes(x = response, y = n)) +geom_bar(stat ="identity") +labs(title=paste("Responses based on options for", question)) )}response_grouping <-function(data, question) {return(data %>%filter(question_type==question) %>%summarize(n =n(), .by =c("response")))}# To create a graph where the response data is randomly shuffled into two halves and compared and correlatedsplit_half <-function(data, categories, question, category_name="ResponseId", input_group=c("category", "type"), input_color="type", input_label="category", response_val="numeric_val") { shuffled_categories <-sample(categories)# Split the shuffled categories into two halves half1_categories <- shuffled_categories[1:floor(length(shuffled_categories) /2)] half2_categories <- shuffled_categories[(floor(length(shuffled_categories) /2) +1):length(shuffled_categories)] half1 <- data |>filter(!!sym(category_name) %in% half1_categories) |># Ensuring that there is only one average per category before calculating mean countsmutate((!!sym(response_val)) :=mean(!!sym(response_val)), .by=c(category_name, input_group)) |>summarize(mean_count =mean(!!sym(response_val)), .by=input_group,show_col_types =FALSE) half2 <- data |>filter(!!sym(category_name) %in% half2_categories) |>mutate(!!sym(response_val) :=mean(!!sym(response_val)), .by=c(category_name, input_group)) |>summarize(mean_count =mean(!!sym(response_val)), .by=input_group) split_half_plot <-ggplot(data=inner_join(half1, half2, by = input_group), aes(x = mean_count.x, y = mean_count.y, color=!!sym(input_color))) +geom_point(alpha=0.6, size=2) +geom_smooth(method='lm', formula='y~x', color="gray") +labs(x ="Responses first half", y ="Responses second half", title=paste("Split-half reliability for", question)) + ggpubr::stat_cor(size =2) +theme(plot.title=element_text(size=10), axis.title =element_text(size =10))# If we're grouping based on item names, add labels for each itemif ("object"%in% input_group) { split_half_plot <- split_half_plot + ggrepel::geom_label_repel(aes(label=!!sym(input_label)), segment.colour="grey", segment.alpha =.5) }return(split_half_plot)}questiontype_age_lineplot <-function(data, input_title, y_val="numeric_val") { summarized_by_id <-summarized_data(data, "childs_age", y_val, c("ResponseId", "question_type")) summarized_by_age <-summarized_data(summarized_by_id, "childs_age", "mean_value", "question_type")return(ggplot(summarized_by_id, aes(x = childs_age, y = mean_value, group = question_type, color = question_type)) +geom_line(data = summarized_by_age, linewidth =0.8, show.legend =TRUE, alpha =0.8) +geom_point(alpha =0.5, position =position_jitter(width = .01, height = .01)) +labs(x ="Child age (months)", y ="Response value", title=input_title, color="Question Type") +geom_smooth(method ='lm', formula='y~x', aes(color=question_type)) +scale_x_continuous(breaks =seq(5, 45, by =5), limits =c(10, 50)) )}coefficient_of_var <-function(data, value, group) { item_variability <- data |>summarize(mean_item =mean(!!sym(value)),sd_item =sd(!!sym(value)),# Calculating 95% CImargin_of_error =qt(0.975, df =n() -1) * (sd_item /sqrt(n())),cv = (sd_item / mean_item) *100, .by = group)return(item_variability)}item_variability_scatterplot <-function(item_vars, all_responses, input_title, group=c("category", "object"), y_val="numeric_val") {return(ggplot(item_vars, aes(x = cv, y = mean_item, label = object)) +geom_errorbar(aes(ymin = mean_item - margin_of_error, ymax = mean_item + margin_of_error), width =0.2, color ="black",alpha =0.2 ) +geom_point(aes(color = category), size =3, shape=17) +# Add points with color based on category ggrepel::geom_label_repel(size =4, force =40) +geom_jitter(data=(all_responses |>left_join(item_vars, by = group)), aes(x=cv, y=!!sym(y_val), color = category), alpha =0.01, height =0.3, width =0.3) +labs(title = input_title,y ="Mean of Numeric Value",x ="Coefficient of Variation (%)" ) +theme(legend.position ="bottom"))}babiness_correlation <-function(data, x_val, xlab_val, title="") {return(ggplot(data, aes(x=!!sym(x_val), y=babiness_mean)) +geom_jitter(height=.03, width=.03, alpha=.8, size=2) + ggthemes::theme_few(base_size=base_size_print) +geom_smooth(formula='y~x', span=10, alpha=.2, color='dark grey') + ggrepel::geom_label_repel(aes(label=object, color=category), segment.colour="grey", segment.alpha =.2, size = label_size) +xlab(xlab_val) +ylab('Babiness rating') +labs(title=title) +theme(legend.position='none', aspect.ratio=1) + ggpubr::stat_cor(method ="pearson") +coord_cartesian(clip='off'))}normalized_responses <-function(data) {return(data |>group_by(question_type) |>mutate(normalized_val = (numeric_val -min(numeric_val, na.rm =TRUE)) / (max(numeric_val, na.rm =TRUE) -min(numeric_val, na.rm =TRUE)) ) |>ungroup() )}# To add a title to the top of a cowplot arrangementcowplot_title <-function(title_text) { title <-ggdraw() +draw_label( title_text,fontface ='bold',x =0,hjust =0 ) +theme(plot.margin =margin(0, 0, 0, 4) )return(title)}summarized_data <-function(data, x_var, y_var, group_var) {return(data %>%group_by_at(c(x_var, group_var)) %>%summarise(mean_value =mean(.data[[y_var]], na.rm =TRUE),sd_value =sd(.data[[y_var]], na.rm =TRUE),n =n(),se = sd_value /sqrt(n()),ci_lower = mean_value -qt(1- (0.05/2), n -1) * se,ci_upper = mean_value +qt(1- (0.05/2), n -1) * se,.groups ='drop') )}aoa_plot <-function(data, x_lab, x_var="mean_val", title="") {ggplot(data, aes(x=!!sym(x_var), y=aoa)) +geom_point(alpha=.8, size=2) +theme_few(base_size=base_size_print) +geom_smooth(formula='y~x', span=10, alpha=.2) + ggrepel::geom_label_repel(aes(color=category, label=object),segment.colour="grey", segment.alpha =.2, size =5) +ylim(14,30)+#annotation_custom(text_low,xmin=1,xmax=1,ymin=11,ymax=11) +#annotation_custom(text_high,xmin=7.5,xmax=7.5,ymin=11,ymax=11) +xlab(x_lab) +ylab('Avg. age-of-acquisition') +labs(title = title) +theme(legend.position='none', aspect.ratio=1) +coord_cartesian(clip='off') + ggpubr::stat_cor(method ="pearson")}aoa_plot_grouped <-function(data, title, group) {ggplot(data, aes(x=mean_val, y=aoa, label=object, color=!!sym(group))) +geom_point(alpha=.8, size=2) +theme_few(base_size=base_size_print) +geom_smooth(span=10, alpha=.2) +ylim(14,30)+xlab(title) +ylab('Avg. age-of-acquisition') +coord_cartesian(clip='off') + ggpubr::stat_cor(method ="pearson")}# Function to add age splitadd_age_split <-function(data) { data |>mutate(median_age =median(childAge),age_half =ifelse(childAge > median_age, "older", "younger") )}# To extract data about the age-of-acquisition (AoA) of words using Wordbankaoa_data <-function(language, form, input_measure) { instrument_data <-get_instrument_data(language, form, administration_info =TRUE) |>filter(!is.na(!!sym(input_measure))) aoa <-merge(x=fit_aoa(instrument_data, measure=input_measure),y=get_item_data(language, form),by="item_id") |>filter(!is.na(aoa)) |># Only using the first phrase of the descriptionmutate(item =str_extract(item_definition, "^[^/()]+")) |># Only count the AoA as the last learned date of that word if there are multiple versions of the wordfilter(aoa ==max(aoa, na.rm =TRUE), .by=item)return(aoa)}
Load and preprocess data
# Words and Sentences dataset does not contain any words based on "understands" measureenglish_ws_aoa <-aoa_data("English (American)", "WS", "produces")raw_experience_data =read_csv(here('data','main','trial_data.csv'))
Rows: 3006 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): object, category, FormatsSeen, Frequency, globalID
dbl (6): objectNumber, categoryNumber, trialIndex, reactionTime, TotalCount,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cleaned_experience_data = raw_experience_data |># TODO: Need to scale this within participant, calculate z-scoresmutate(TotalCount =ifelse(TotalCount >100, 100, TotalCount),InteractionCount =ifelse(InteractionCount >100, 100, InteractionCount)) |>filter(!is.na(Frequency)) |>left_join(english_ws_aoa |>select(item, aoa), by=c("object"="item"))participant_data =read_csv(here('data', 'main', 'participant_data.csv')) |># Fixing age of a single child whose parent inputted their age incorrectly, messaged parent to get the right agemutate(childAge =ifelse(childAge ==-599988, 7, childAge))
Rows: 55 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): childSiblingAges, globalID, feedback
dbl (1): childAge
dttm (2): startTimestamp, endTimestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 22682 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): word, subjCode, task, lexicalCategory
dbl (8): rating, X30mos, phonemes, totalMorphemes, concreteness, logFreq, sy...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hist(participant_data$childAge)
# Filtering to participants who, after the attention checks were added in, submitted complete datafull_non_pilot_data <- raw_experience_data |>group_by(globalID) |>mutate(num_trials =n()) time_taken <- full_non_pilot_data |>left_join(participant_data) |>mutate(time_taken =difftime(as.POSIXct(endTimestamp, format ="%Y-%m-%d %H:%M:%S"),as.POSIXct(startTimestamp, format ="%Y-%m-%d %H:%M:%S"),units ="mins"# Adjust units as needed (e.g., "secs", "hours", etc.) ) ) |>ungroup() |>distinct(time_taken, globalID, num_trials) |>summarize(n =n(),mean_time =mean(as.numeric(time_taken), na.rm =TRUE),median_time =median(as.numeric(time_taken), na.rm =TRUE),.by ="num_trials" )
Joining with `by = join_by(globalID)`
Exclusions
participants_to_be_excluded =c()
Processing 4 built-in attention checks
ATTENTION_CHECKS <-tibble(correct_response =c('realLife', 'toy', 'realLife', 'toy'),object =c('unicorn', 'sunrise', 'typewriter', 'cloud'))attention_check_results <- raw_experience_data |>filter(category =="attention_check") |>left_join(ATTENTION_CHECKS) |># A correct attention check response would have only included a single format, so we can directly remove '[' and ']' from any responsesmutate(attention_check_response =gsub("\\[\\'", "", gsub("\\'\\]", "", FormatsSeen)),correctly_responded = attention_check_response == correct_response) |>mutate(correctly_responded =ifelse(is.na(correctly_responded), FALSE, correctly_responded))
Joining with `by = join_by(object)`
attention_checks_by_participant <- attention_check_results |>summarize(correct_attention_checks =sum(correctly_responded ==TRUE),total_seen =n(),.by = globalID)# Only excluding participants if they fail at least two attention checks, abiding by Prolific's attention check guidelinesattention_check_exclusions <- attention_checks_by_participant |>filter(total_seen - correct_attention_checks >=2) |>pull(globalID)participants_to_be_excluded =c(participants_to_be_excluded, attention_check_exclusions)
Processing attention checks that are inherent to the items we are asking, for example: it is not possible to have seen a real doll and highly unlikely to have seen a toy beach. If participants consistently fail these types of questions they are excluded from our analyses.
# Seeing if they fail at least 5 of our arbitrary attention checkswithin_study_attention_check_exclusions <- within_study_attention_check_results |>filter(total_seen - correct_attention_checks >=5) |>pull(globalID)participants_to_be_excluded =c(participants_to_be_excluded, within_study_attention_check_exclusions) |>unique()
Processing feedback responses. Feedback questions are generally trickier so only excluding participants who could not answer all 4. Also just processing to see stats about which questions may have been more difficult for participants to fully understand.
Creating numerical value structure for text-based questions
frequency_responses <- cleaned_experience_data |>mutate(question_type ="Frequency",numeric_val =case_when(Frequency =='Not at all'~1, Frequency =='A little'~2, Frequency =='A moderate amount'~3, Frequency =='A lot'~4, Frequency =='A great deal'~5 ))frequency_summary <- frequency_responses |>summarize(frequency_avg =mean(numeric_val), .by=c("object", "category"))
# Number of formats an object is seen informat_responses <- cleaned_experience_data %>%mutate(question_type ="FormatsSeen",seen_drawing =str_detect(FormatsSeen,'drawing'),seen_toy =str_detect(FormatsSeen,'toy'),seen_photo =str_detect(FormatsSeen,'photo'),seen_video =str_detect(FormatsSeen,'video'),seen_life =str_detect(FormatsSeen,'realLife'),seen_never =str_detect(FormatsSeen, 'neverSeen')) %>%rowwise() %>%mutate(numeric_val =sum(seen_drawing + seen_toy + seen_photo + seen_video + seen_life)) %>%ungroup()# Averaging each 'seen' variable per itemdiff_formats_summary <- format_responses %>%group_by(object, category) %>%summarize(seen_toy_avg =mean(seen_toy), seen_drawing_avg =mean(seen_drawing), seen_photo_avg =mean(seen_photo), seen_video_avg =mean(seen_video), seen_real_life_avg =mean(seen_life)) %>%ungroup() %>%mutate(categories =fct_reorder(category, seen_real_life_avg, .desc=TRUE))
`summarise()` has grouped output by 'object'. You can override using the
`.groups` argument.
# Pivoting all the numerical survey question responses at the question levelnumerical_responses_long <- cleaned_experience_data |>pivot_longer(c(TotalCount, InteractionCount), names_to ="question_type", values_to ="numeric_val")question_cols =c("TotalCount", "InteractionCount", "Frequency", "FormatsSeen")combined_responses <-rbind(frequency_responses |>select(-all_of(question_cols)), numerical_responses_long |>select(-any_of(question_cols)), format_responses |>select(-starts_with("seen")) |>select(-all_of(question_cols))) |>filter(!is.na(numeric_val))combined_normalized <-normalized_responses(combined_responses)combined_responses_without_favorites <- combined_responses |>filter(category !="favorite") # Switching back to single row for each participant and adding participant datacombined_normed_wide <- combined_normalized |>filter(category !="favorite") |>pivot_wider(id_cols =c("object", "globalID", "category", "aoa"),names_from = question_type,values_from =c(normalized_val, numeric_val),names_glue ="{question_type}_{.value}" ) |>left_join(participant_data, by=c("globalID"="globalID")) |>add_age_split()combined_normed_summary <- combined_normed_wide |>select(-globalID) |>summarize(across(where(is.numeric), ~mean(.x, na.rm =TRUE)), .by=c("object", "category"))combined_normed_summary_age_split <- combined_normed_wide |>select(-globalID) |>summarize(across(where(is.numeric), ~mean(.x, na.rm =TRUE)), .by=c("object", "category", "age_half"), show_col_types=FALSE)
Reliability checks
##Within-item split-half reliability
categories <-unique(frequency_responses$globalID)splithalf_title <-cowplot_title("Split-half reliabilities within-item")# Creating a helper function to pass in common parameters from all within-item split-half reliability checkswithinitem_splithalf <-function(data, title, response_val="numeric_val") {return(split_half(data, categories, title, category_name ="globalID", input_group ="object", input_color ="", input_label="object", response_val = response_val))}splithalf_plots <- cowplot::plot_grid(withinitem_splithalf(combined_responses_without_favorites |>filter(question_type =="FormatsSeen"), "formats seen"), withinitem_splithalf(combined_responses_without_favorites |>filter(question_type =="InteractionCount"), "interaction count"), withinitem_splithalf(combined_responses_without_favorites |>filter(question_type =="TotalCount"), "total count"), labels ="auto", withinitem_splithalf(combined_normalized |>filter(category !="favorite"), "all normalized responses", response_val ="normalized_val"))
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(category_name)
# Now:
data %>% select(all_of(category_name))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(input_group)
# Now:
data %>% select(all_of(input_group))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
Pearson's product-moment correlation
data: cleaned_experience_data$TotalCount and cleaned_experience_data$aoa
t = -8.8973, df = 1489, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.2723381 -0.1759189
sample estimates:
cor
-0.2246784
empty <-textGrob("", gp=gpar(fontsizep=base_size_print))# TODO: add labels back to the graphs, switch to RColorBrewertext_low <-textGrob("Not at all", gp=gpar(fontsize=base_size_print))text_high <-textGrob("A great deal", gp=gpar(fontsize=base_size_print))experience_by_aoa <- combined_normed_summary |>filter(!is.na(aoa))p1 <-aoa_plot(experience_by_aoa, "how often seen", "Frequency_numeric_val")p2 =aoa_plot(experience_by_aoa, "formats seen", "FormatsSeen_numeric_val")p3 =aoa_plot(experience_by_aoa, "interaction count", "InteractionCount_numeric_val")p4 =aoa_plot(experience_by_aoa, "number of exemplars seen", "TotalCount_numeric_val")aoa_plots <- cowplot::plot_grid(cowplot_title("AoA plots"), cowplot::plot_grid(p1, p2, p3, p4), rel_heights =c(0.2, 1), ncol=1)
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
`geom_smooth()` using method = 'loess'
Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 27 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
By-item heatmaps
# Compute correlation matrixdata <- combined_normed_wide# Select only relevant numeric columnsnumeric_vars <-c("Frequency_normalized_val", "TotalCount_normalized_val", "InteractionCount_normalized_val", "FormatsSeen_normalized_val")# Aggregate data by category using meanagg_data <- combined_normed_summary |> dplyr::select(all_of(numeric_vars))agg_data_pivoted <-as.data.frame(t(agg_data[,-1])) # Transpose numeric values: decided not to use this since it's really just the mean values# TODO: I'll have to do PCA or FA to find item and participant clustersagg_data_per_participant <- combined_normed_wide |>arrange(category, object) |>pivot_wider(names_from ="object", values_from ="InteractionCount_normalized_val", id_cols=globalID) |> dplyr::select(-globalID)# Create a category-color mappingcategories <-unique(combined_normed_wide$category) category_colors <-setNames(RColorBrewer::brewer.pal(length(categories), "Set2"), categories)# Assign colors to objects based on categoryobject_categories <- combined_normed_wide |>distinct(object, category) |>arrange(category, object)category_colors_mapped <- category_colors[object_categories$category]# Compute correlation matrixcor_matrix <-cor(agg_data_per_participant, use ="pairwise.complete.obs")# Convert correlation matrix to long format for ggplot2cor_melted <- reshape::melt(cor_matrix)
Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE
Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE
# Plot heatmapggplot(cor_melted, aes(X1, X2, fill = value)) +geom_tile() +scale_fill_viridis(option ="mako", direction =1) +theme_minimal() +labs(title ="Heatmap of Interaction Count correlations",fill ="Correlation") +theme(axis.text.x =element_text(angle =45, hjust =1, color = category_colors_mapped),axis.text.y =element_text(color = category_colors_mapped))
Warning: Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.
Warning: Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.
# TODO: add legend with category colors# TODO: Analyze favorites data
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(group)
# Now:
data %>% select(all_of(group))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.