response_variability <-function(data, question) { filtered_data <- data %>%summarize(n =n(), .by =c("response"))return(ggplot(filtered_data, aes(x = response, y = n)) +geom_bar(stat ="identity") +labs(title=paste("Responses based on options for", question)) )}response_grouping <-function(data, question) {return(data %>%filter(question_type==question) %>%summarize(n =n(), .by =c("response")))}# To create a graph where the response data is randomly shuffled into two halves and compared and correlatedsplit_half <-function(data, categories, question, category_name="ResponseId", input_group=c("category", "type"), input_color="type", input_label="category", response_val="numeric_val") { shuffled_categories <-sample(categories)# Split the shuffled categories into two halves half1_categories <- shuffled_categories[1:floor(length(shuffled_categories) /2)] half2_categories <- shuffled_categories[(floor(length(shuffled_categories) /2) +1):length(shuffled_categories)] half1 <- data |>filter(!!sym(category_name) %in% half1_categories) |># Ensuring that there is only one average per category before calculating mean countsmutate((!!sym(response_val)) :=mean(!!sym(response_val)), .by=c(category_name, input_group)) |>summarize(mean_count =mean(!!sym(response_val)), .by=input_group) half2 <- data |>filter(!!sym(category_name) %in% half2_categories) |>mutate(!!sym(response_val) :=mean(!!sym(response_val)), .by=c(category_name, input_group)) |>summarize(mean_count =mean(!!sym(response_val)), .by=input_group) split_half_plot <-ggplot(data=inner_join(half1, half2, by = input_group), color="purple", aes(x = mean_count.x, y = mean_count.y, color=!!sym(input_color), label=!!sym(input_label))) +geom_point(alpha=0.6, size=2) +geom_smooth(method='lm', color="gray") +labs(x ="Responses first half", y ="Responses second half", title=paste("Split-half reliability for", question)) + ggpubr::stat_cor(size =2) +theme(plot.title=element_text(size=10), axis.title =element_text(size =10))# If we're grouping based on item names, add labels for each itemif ("category"%in% input_group) { split_half_plot <- split_half_plot + ggrepel::geom_label_repel(segment.colour="grey", segment.alpha =.5) }return(split_half_plot)}questiontype_age_lineplot <-function(data, input_title, y_val="numeric_val") { summarized_by_id <-summarized_data(data, "childs_age", y_val, c("ResponseId", "question_type")) summarized_by_age <-summarized_data(summarized_by_id, "childs_age", "mean_value", "question_type")return(ggplot(summarized_by_id, aes(x = childs_age, y = mean_value, group = question_type, color = question_type)) +geom_line(data = summarized_by_age, linewidth =0.8, show.legend =TRUE, alpha =0.8) +geom_point(alpha =0.5, position =position_jitter(width = .01, height = .01)) +labs(x ="Child age (months)", y ="Response value", title=input_title, color="Question Type") +geom_smooth(method ='lm', aes(color=question_type)) +scale_x_continuous(breaks =seq(5, 45, by =5), limits =c(10, 50)) )}coefficient_of_var <-function(data, value, group) { item_variability <- data |>summarize(mean_item =mean(!!sym(value)),sd_item =sd(!!sym(value)),# Calculating 95% CImargin_of_error =qt(0.975, df =n() -1) * (sd_item /sqrt(n())),cv = (sd_item / mean_item) *100, .by = group)return(item_variability)}item_variability_scatterplot <-function(item_vars, all_responses, input_title, group=c("category", "type", "question_type"), y_val="numeric_val") {return(ggplot(item_vars, aes(x = cv, y = mean_item, label = category)) +geom_errorbar(aes(ymin = mean_item - margin_of_error, ymax = mean_item + margin_of_error), width =0.2, color ="black",alpha =0.2 ) +geom_point(aes(color = type), size =3, shape=17) +# Add points with color based on category ggrepel::geom_label_repel(size =4, force =40) +geom_jitter(data=(all_responses |>left_join(item_vars, by = group)), aes(x=cv, y=!!sym(y_val), color = type), alpha =0.2, height =0.3, width =0.3) +labs(title = input_title,y ="Mean of Numeric Value",x ="Coefficient of Variation (%)" ) +theme(legend.position ="bottom"))}babiness_correlation <-function(data, x_val, xlab_val) {return(ggplot(data, aes(x=!!sym(x_val), y=babiness_mean, label=category)) +geom_jitter(height=.03, width=.03, alpha=.8, size=2) + ggthemes::theme_few(base_size=base_size_print) +geom_smooth(span=10, alpha=.2, color='dark grey') + ggrepel::geom_label_repel(segment.colour="grey", segment.alpha =.2, aes(color=type), size = label_size) +xlab(xlab_val) +ylab('Babiness rating') +theme(legend.position='none', aspect.ratio=1) +coord_cartesian(clip='off'))}normalized_responses <-function(data) {return(data |>group_by(question_type) |>mutate(normalized_val = (numeric_val -min(numeric_val, na.rm =TRUE)) / (max(numeric_val, na.rm =TRUE) -min(numeric_val, na.rm =TRUE)) ) |>ungroup() )}# To add a title to the top of a cowplot arrangementcowplot_title <-function(title_text) { title <-ggdraw() +draw_label( title_text,fontface ='bold',x =0,hjust =0 ) +theme(plot.margin =margin(0, 0, 0, 7) )return(title)}summarized_data <-function(data, x_var, y_var, group_var) {return(data %>%group_by_at(c(x_var, group_var)) %>%summarise(mean_value =mean(.data[[y_var]], na.rm =TRUE),sd_value =sd(.data[[y_var]], na.rm =TRUE),n =n(),se = sd_value /sqrt(n()),ci_lower = mean_value -qt(1- (0.05/2), n -1) * se,ci_upper = mean_value +qt(1- (0.05/2), n -1) * se,.groups ='drop') )}
Load and preprocess data
raw_data =read_csv(here('data','pilot','Visual Experience Development Survey_March 30, 2021_17.49.csv'))
Rows: 38 Columns: 174
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (174): StartDate, EndDate, Status, IPAddress, Progress, Duration (in sec...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
meta <-read_csv(here::here('data/pilot/category_metadata.csv')) %>%as_tibble()
Rows: 50 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): category, type, category_short
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
New names:
Rows: 49 Columns: 11
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(7): item_id, definition, type, category, lexical_category, lexical_clas... dbl
(3): ...1, num_item_id, aoa lgl (1): complexity_category
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
Rows: 22682 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): word, subjCode, task, lexicalCategory
dbl (8): rating, X30mos, phonemes, totalMorphemes, concreteness, logFreq, sy...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Look through question names and preprocess qualtrics data
question_names <- raw_data %>%select(contains('_Q')) %>%filter(row_number()==1) question_type <- question_names %>%gather(key = question_id, value = question_text) %>%mutate(question_type =case_when(str_detect(question_text,'How often has your child seen') ~"how_often",str_detect(question_text,'In what formats has your child seen') ~"different_formats", str_detect(question_text,'How many different') ~"how_many")) %>%mutate(category =str_split_fixed(question_text,' -',2)[,1]) %>%select(-question_text) %>%left_join(meta)
Joining with `by = join_by(category)`
cleaned_data <- raw_data %>%filter(DistributionChannel !='preview') %>%# not previewfilter(Finished =='True') %>%filter(Q8 =="Yes") %>%# positive consentmutate(childs_age = Q16) %>%rowwise() %>%mutate(childs_age =ifelse(grepl(" months", childs_age), as.numeric(str_split_1(childs_age, " ")[1]), as.numeric(childs_age)*12)) %>%filter(childs_age !=0) %>%# all of these responses are NAselect(childs_age, ResponseId,contains('_Q')) %>%gather(key = question_id, value = response, -childs_age, -ResponseId) %>%left_join(question_type, by =c("question_id")) %>%select(-category) %>%rename(category = category_short)
Data structure for how often seen
how_often_responses <- cleaned_data %>%filter(question_type =='how_often') %>%filter(!is.na(response)) %>%mutate(numeric_val =case_when(response =='Never'~0, response =='Once or twice'~1, response =='A few times'~2, response =='Every few months'~3, response =='Monthly'~4, response =='Weekly'~5, response =='Daily'~6, response =='Multiple times a day'~7 ))how_often_summary <- how_often_responses |>summarize(how_often_avg =mean(numeric_val), .by="category")
categories <-unique(how_many_responses$category)# Creating a helper function to pass in common parameters from all within-participant split-half reliability checkswithinparticipant_splithalf <-function(data, title) {return(split_half(data, categories, title, category_name ="category", input_group ="ResponseId", input_color ="", input_label=""))}splithalf_title <-cowplot_title("Split-half reliabilities across question types")splithalf_plots <- cowplot::plot_grid(withinparticipant_splithalf(how_many_responses, "number of exemplars seen"), withinparticipant_splithalf(format_responses, "number of formats"), withinparticipant_splithalf(how_often_responses, "how often seen"), labels ="auto")
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(category_name)
# Now:
data %>% select(all_of(category_name))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(input_group)
# Now:
data %>% select(all_of(input_group))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
# Maybe using Cronbach's alpha, need some measure to look at reliability across items too
###Within-item
categories <-unique(how_many_responses$ResponseId)splithalf_title <-cowplot_title("Split-half reliabilities within-item")splithalf_plots <- cowplot::plot_grid(split_half(how_many_responses, categories, "number of exemplars seen"), split_half(format_responses, categories, "number of formats"), split_half(how_often_responses, categories, "how often seen"), labels ="auto", split_half(combined_normalized, categories, "all normalized responses", response_val ="normalized_val"))
`geom_smooth()` using formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
`geom_smooth()` using formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
`geom_smooth()` using formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
`geom_smooth()` using formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(group)
# Now:
data %>% select(all_of(group))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
Pearson's product-moment correlation
data: experience_by_babiness$different_formats_numeric_val and experience_by_babiness$babiness_mean
t = 2.7992, df = 39, p-value = 0.007924
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.1159604 0.6365671
sample estimates:
cor
0.409019
Plotting correlation with babiness
base_size_print=10label_size=3babiness_correlation(experience_by_babiness, "how_often_numeric_val", 'How often seen')
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
babiness_correlation(experience_by_babiness, "different_formats_numeric_val", "Number of different formats")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
babiness_correlation(experience_by_babiness, "how_many_numeric_val", "Number of exemplars seen")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation: label.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
##Age-based performance
age_title <-cowplot_title("Response values across age of child")age_plots <- cowplot::plot_grid(questiontype_age_lineplot(how_many_responses, "Number of exemplars seen"), questiontype_age_lineplot(how_often_responses, "How often seen"), questiontype_age_lineplot(format_responses, "Number of formats seen"), questiontype_age_lineplot(combined_normalized, "Normed averages", y_val="normalized_val"), labels ="auto")
Warning: There were 8 warnings in `summarise()`.
The first warning was:
ℹ In argument: `ci_lower = mean_value - qt(1 - (0.05/2), n - 1) * se`.
ℹ In group 2: `childs_age = 18` and `question_type = "how_many"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 7 remaining warnings.
Warning: There were 8 warnings in `summarise()`.
The first warning was:
ℹ In argument: `ci_lower = mean_value - qt(1 - (0.05/2), n - 1) * se`.
ℹ In group 2: `childs_age = 18` and `question_type = "how_often"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 7 remaining warnings.
Warning: There were 8 warnings in `summarise()`.
The first warning was:
ℹ In argument: `ci_lower = mean_value - qt(1 - (0.05/2), n - 1) * se`.
ℹ In group 2: `childs_age = 18` and `question_type = "different_formats"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 7 remaining warnings.
Warning: There were 24 warnings in `summarise()`.
The first warning was:
ℹ In argument: `ci_lower = mean_value - qt(1 - (0.05/2), n - 1) * se`.
ℹ In group 4: `childs_age = 18` and `question_type = "different_formats"`.
Caused by warning in `qt()`:
! NaNs produced
ℹ Run `dplyr::last_dplyr_warnings()` to see the 23 remaining warnings.
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
response_summary <- how_many_responses %>%group_by(ResponseId) %>%summarize(mean_response =mean(numeric_val, na.rm =TRUE),mode_response =which.max(tabulate(numeric_val)))# View the summary of mean responses per participanthead(response_summary)
# Visualizing the distribution of mean responses per participantggplot(response_summary, aes(x = mean_response)) +geom_histogram(bins =20, fill ="skyblue", color ="black") +labs(title ="Distribution of Mean Responses Per Participant",x ="Mean Response Value", y ="Frequency") +theme_minimal()