df.raw <- read.csv('kids_color.csv')

df <- df.raw %>%
  filter(exclude == 0)
# df <- df.raw %>%
#   filter(excl_training == 0) 
length(unique(df$subjid)) # 33 participants

## [1] 33

summary(df)

##     subjid              dob                dot                 age       
##  Length:14388       Length:14388       Length:14388       Min.   :2.100  
##  Class :character   Class :character   Class :character   1st Qu.:2.438  
##  Mode  :character   Mode  :character   Mode  :character   Median :2.710  
##                                                           Mean   :2.687  
##                                                           3rd Qu.:2.925  
##                                                           Max.   :3.230  
##                                                                          
##      sex                lang              session      test.site        
##  Length:14388       Length:14388       Min.   :1.00   Length:14388      
##  Class :character   Class :character   1st Qu.:1.75   Class :character  
##  Mode  :character   Mode  :character   Median :2.50   Mode  :character  
##                                        Mean   :2.50                     
##                                        3rd Qu.:3.25                     
##                                        Max.   :4.00                     
##                                                                         
##      exp                order           task              target         
##  Length:14388       Min.   :1.000   Length:14388       Length:14388      
##  Class :character   1st Qu.:1.000   Class :character   Class :character  
##  Mode  :character   Median :2.000   Mode  :character   Mode  :character  
##                     Mean   :1.515                                        
##                     3rd Qu.:2.000                                        
##                     Max.   :2.000                                        
##                                                                          
##    col_num            col_code            p_adult        response        
##  Length:14388       Length:14388       Min.   :0.000   Length:14388      
##  Class :character   Class :character   1st Qu.:0.000   Class :character  
##  Mode  :character   Mode  :character   Median :0.050   Mode  :character  
##                                        Mean   :0.329                     
##                                        3rd Qu.:0.745                     
##                                        Max.   :1.000                     
##                                        NA's   :8052                      
##     select           sel_order           correct            comments        
##  Length:14388       Length:14388       Length:14388       Length:14388      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     exclude  excl_reason        excl_training
##  Min.   :0   Length:14388       Min.   :0    
##  1st Qu.:0   Class :character   1st Qu.:0    
##  Median :0   Mode  :character   Median :0    
##  Mean   :0                      Mean   :0    
##  3rd Qu.:0                      3rd Qu.:0    
##  Max.   :0                      Max.   :0    
##

Error analysis

Can only do overextension analysis here…

proximal errors are underestimates:

blue: purple, green

green: blue, yellow

purple: blue, red

yellow: red, green

Errors in Comprehension Task

df.compre <- df %>%
  filter(task == "comprehension")

color_code <- tibble(
  col_code = c('n3.5', '5y8.14', '5p4.12', '5yr6.14', '5yr4.8', '5r4.14', 'n9.5', '10gy5.12', '5rp7.10', 'n0.5', '10b5.12'), 
  test_color = c('gray', 'yellow', 'purple', 'orange', 'brown',
    'red', 'white', 'green', 'pink', 'black', 'blue'), 
)

proximal_colors <- tibble(
  target = c('blue', 'green', 'purple', 'yellow'), 
  proximal_color_1 = c('purple', 'blue', 'blue', 'red'), 
  proximal_color_2 = c('green', 'yellow', 'red', 'green')
)

#1412 responses
df.compre_response <- df.compre %>%
  left_join(., color_code, by = 'col_code') %>%
  left_join(., proximal_colors, by = 'target') %>%
  filter(select == 1) %>%
  group_by(subjid, session, target) %>%
  mutate(selected_target = max(correct), 
         is_overextension = case_when(
           correct == 1 ~ NA, # if correct then not an error
           #did you select the target correctly somewhere else? if yes then overextension
           correct == 0 & selected_target == 1 ~ 1, 
           .default = 0
           )) %>%
  mutate(is_proximal = case_when(
          correct == 1 ~ NA, 
          correct == 0 & (test_color %in% c(proximal_color_1, proximal_color_2)) ~ 1,
          .default = 0
  ))
#936 errors in total

get_proximal_errors_binom_comprehension <- function(df.response){
  n_errors <- df.response %>%
    filter(correct == 0) %>%
    nrow()
  
  df.compre_errors_label <- df.response %>%
    filter(correct == 0) %>%
    #group by which label used, and which stimuli responding to
    group_by(target) %>%
    summarise(prop = n() / n_errors)
  
  df.compre_errors_referent <- df.response %>%
    filter(correct == 0) %>%
    #group by which label used, and which stimuli responding to
    group_by(test_color) %>%
    summarise(prop = n() / n_errors)
  
  get_product_comprehension <- function(label, referent) { 
    return(df.compre_errors_label[df.compre_errors_label$target == label, ]$prop * 
      df.compre_errors_referent[df.compre_errors_referent$test_color == referent, ]$prop)
  }

  base_rate <- proximal_colors %>%
    pivot_longer(cols = -c(target)) %>%
    select(-name) %>%
    rename(referent = value) %>%
    rowwise() %>%
    mutate(product = get_product_comprehension(target, referent))

  overall_prior <- sum(base_rate$product)

  binom.test(x=sum(df.response$is_proximal, na.rm = T),n=n_errors,p=overall_prior)
}

get_proximal_errors_binom_comprehension(df.compre_response)

## 
##  Exact binomial test
## 
## data:  sum(df.response$is_proximal, na.rm = T) and n_errors
## number of successes = 193, number of trials = 930, p-value = 0.002959
## alternative hypothesis: true probability of success is not equal to 0.1699665
## 95 percent confidence interval:
##  0.1818851 0.2350355
## sample estimates:
## probability of success 
##              0.2075269

sum(df.compre_response$is_overextension, na.rm = T) #805 overextension errors, 86%

## [1] 814

sum(df.compre_response$is_proximal, na.rm = T) #193 proximal errors 20%

## [1] 193

Errors in Production Task

df.production <- df %>%
  filter(task == "production")

#1412 responses
df.production$target <- trimws(tolower(df.production$target))
df.production$response <- trimws(tolower(df.production$response))

df.production_response <- df.production %>%
  left_join(., proximal_colors, by = 'target') %>%
  rowwise() %>%
  mutate(correct = ifelse(grepl(target, response), 1, 0)) %>%
  group_by(subjid, session) %>%
  mutate(correct_targets = list(target[correct == 1])) %>%
  mutate(is_overextension = case_when(
           correct == 1 ~ NA, # if correct then not an error
           correct == 0 & response %in% correct_targets ~ 1, 
           .default = 0
           )) %>%
  ungroup() %>%
  group_by(subjid, session, target) %>%
  mutate(is_proximal = case_when(
          correct == 1 ~ NA, 
          correct == 0 & (response %in% c(proximal_color_1, proximal_color_2)) ~ 1,
          .default = 0
  )) %>%
  ungroup()

get_proximal_errors_binom_production <- function(df.response) {
  n_errors <- df.response %>%
    filter(correct == 0) %>%
    nrow()
  
  df.prod_errors_label <- df.response %>%
    filter(correct == 0) %>%
    #group by which label used, and which stimuli responding to
    group_by(response) %>%
    summarise(prop = n() / n_errors)

  df.prod_errors_referent <- df.response %>%
    filter(correct == 0) %>%
    #group by which label used, and which stimuli responding to
    group_by(target) %>%
    summarise(prop = n() / n_errors)
  
  get_product_production <- function(label, referent) { 
    return(df.prod_errors_label[df.prod_errors_label$response == label, ]$prop * 
      df.prod_errors_referent[df.prod_errors_referent$target == referent, ]$prop)
  }

  base_rate_production <- proximal_colors %>%
    pivot_longer(cols = -c(target)) %>%
    select(-name) %>%
    rename(referent = value) %>%
    rowwise() %>%
    mutate(product = get_product_production(target, referent))

  overall_prior <- sum(base_rate_production$product)
  
 return (binom.test(x=sum(df.response$is_proximal, na.rm = T),n=n_errors,p=overall_prior))

}

get_proximal_errors_binom_production(df.production_response)

## 
##  Exact binomial test
## 
## data:  sum(df.response$is_proximal, na.rm = T) and n_errors
## number of successes = 21, number of trials = 435, p-value = 0.009795
## alternative hypothesis: true probability of success is not equal to 0.026413
## 95 percent confidence interval:
##  0.03012841 0.07285039
## sample estimates:
## probability of success 
##             0.04827586

Trajectory of Production

#when does a kid first start using a response / color word correctly? 
df.prod_trajectory <- df.production %>%
  filter(task == "production") %>%
  mutate(response = tolower(response)) %>%
  rowwise() %>%
  mutate(correct = ifelse(grepl(target, response), 1, 0)) %>%
  filter(!response %in% c('nr', 'no response')) %>%
  group_by(subjid, target) %>%
  filter(correct == 1) %>%
  summarise(first_correct_session = min(session)) %>% 
  ungroup()

## `summarise()` has grouped output by 'subjid'. You can override using the
## `.groups` argument.

#some sanity check
ggplot(df.prod_trajectory,
      aes(x = target, y = first_correct_session, 
          color = target)) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "pointrange")+ 
  geom_jitter(height = 0, 
              alpha = 0.5,
              shape = 21,
              size = 3,
              aes(fill = target), 
              color = 'black') +
  theme(legend.position = "none") + 
  labs(x = 'Target color', 
       y = 'First session with correct production')

Number / Proportion of Errors by Trajectory Time Point

With kids who never produced a color word coded as ‘before production’ for all sessions

df.production_response_trajectory <- df.production_response %>%
  left_join(., df.prod_trajectory, join_by(subjid, response == target)) %>%
  mutate(prod_period_strict = case_when(
    is.na(first_correct_session) ~ 'before production',
    session < first_correct_session ~ 'before production', 
    session == first_correct_session ~ 'at production',
    session > first_correct_session ~ 'after production'
  ), 
  prod_period_within = case_when(
    is.na(first_correct_session) ~ "no production",
    session < first_correct_session ~ 'before production', 
    session == first_correct_session ~ 'at production',
    session > first_correct_session ~ 'after production'
  ))  %>%
    mutate(prod_period_strict = factor(prod_period_strict, 
                                       levels = c("before production", 
                                        "at production", 
                                        "after production")), 
          prod_period_within = factor(prod_period_within, 
                                      levels = c("no production", 
                                                 "before production", 
                                                 "at production", 
                                        "after production")))

df.production_trajectory_error <- df.production_response_trajectory %>%
   group_by(prod_period_strict, target) %>%
  summarise(total_errors = sum(correct == 0, na.rm = T),
            sum_overextension = sum(is_overextension, na.rm = T), 
            prop_overextension = sum_overextension / total_errors, 
            sum_proximal = sum(is_proximal, na.rm = T), 
            prop_proximal = sum_proximal / total_errors) %>%
  mutate(task = 'production')

## `summarise()` has grouped output by 'prod_period_strict'. You can override
## using the `.groups` argument.

df.compre_response_trajectory <- df.compre_response %>%
  left_join(., df.prod_trajectory, join_by(subjid, test_color == target)) %>%
  mutate(prod_period_strict = case_when(
    is.na(first_correct_session) ~ 'before production',
    session < first_correct_session ~ 'before production', 
    session == first_correct_session ~ 'at production',
    session > first_correct_session ~ 'after production'
  ), 
  prod_period_within = case_when(
    is.na(first_correct_session) ~ "no production",
    session < first_correct_session ~ 'before production', 
    session == first_correct_session ~ 'at production',
    session > first_correct_session ~ 'after production'
  ))  %>%
    mutate(prod_period_strict = factor(prod_period_strict, levels = c("before production", 
                                        "at production", 
                                        "after production")), 
          prod_period_within = factor(prod_period_within, levels = c("no production", 
          "before production", 
                                        "at production", 
                                        "after production")))

df.compre_trajectory_error <- df.compre_response_trajectory %>%
  group_by(prod_period_strict, target) %>%
  summarise(total_errors = sum(correct == 0, na.rm = T),
            sum_overextension = sum(is_overextension, na.rm = T), 
            prop_overextension = sum_overextension / total_errors, 
            sum_proximal = sum(is_proximal, na.rm = T), 
            prop_proximal = sum_proximal / total_errors) %>%
  mutate(task = 'comprehension')

## `summarise()` has grouped output by 'prod_period_strict'. You can override
## using the `.groups` argument.

df.error_trajectory <- bind_rows(df.compre_trajectory_error, df.production_trajectory_error) %>%
  pivot_longer(cols = c(prop_overextension, prop_proximal, 
                        sum_overextension, sum_proximal))

ggplot(df.error_trajectory %>%
         filter(name %in% c("sum_overextension", 
                            "sum_proximal")) %>%
         mutate(name = factor(name, labels = c('Extension errors', 'Proximal errors'))),
       aes(x = prod_period_strict, y = value, 
           fill = name)) + 
  geom_col(position = position_dodge(0.9)) + 
  facet_grid(~task) + 
  labs(x = 'Time period relative to first correct production of color word', 
       y = 'Number of errors', 
       fill = 'Error type')

ggplot(df.error_trajectory %>%
         filter(name %in% c("prop_overextension", 
                            "prop_proximal")) %>%
         mutate(name = factor(name, labels = c('Extension errors', 'Proximal errors'))),
       aes(x = prod_period_strict, y = value, 
           fill = name)) + 
  geom_col(position = position_dodge(0.9)) + 
  facet_grid(~task) + 
  labs(x = 'Time period relative to first correct production of color word', 
       y = 'Proportion of errors', 
       fill = 'Error type')

ggplot(df.error_trajectory %>%
         filter(task == "comprehension") %>%
         filter(name %in% c("sum_overextension", 
                            "sum_proximal")) %>%
         mutate(name = factor(name, labels = c('Extension errors', 'Proximal errors'))),
       aes(x = prod_period_strict, y = value, 
           fill = name)) + 
  geom_col(position = position_dodge(0.9)) + 
  facet_grid(~target) + 
  labs(x = 'Time period relative to first correct production of color word', 
       y = 'Number of errors', 
       fill = 'Error type')

ggplot(df.error_trajectory %>%
         filter(task == "comprehension") %>%
         filter(name %in% c("prop_overextension", 
                            "prop_proximal")) %>%
         mutate(name = factor(name, labels = c('Extension errors', 'Proximal errors'))),
       aes(x = prod_period_strict, y = value, 
           fill = name)) + 
  geom_col(position = position_dodge(0.9)) + 
  facet_grid(~target) + 
  labs(x = 'Time period relative to first correct production of color word', 
       y = 'Proportion of errors', 
       fill = 'Error type')

ggplot(df.error_trajectory %>%
         filter(task == "production") %>%
         filter(name %in% c("sum_overextension", 
                            "sum_proximal")) %>%
         mutate(name = factor(name, labels = c('Extension errors', 'Proximal errors'))),
       aes(x = prod_period_strict, y = value, 
           fill = name)) + 
  geom_col(position = position_dodge(0.9)) + 
  facet_grid(~target) + 
  labs(x = 'Time period relative to first correct production of color word', 
       y = 'Number of errors', 
       fill = 'Error type')

Recode

With kids who never produced a color word coded as ‘no production’ for all sessions

df.production_trajectory_error_within <- df.production_response_trajectory %>%
   group_by(prod_period_within, target) %>%
  summarise(total_errors = sum(correct == 0, na.rm = T),
            sum_overextension = sum(is_overextension, na.rm = T), 
            prop_overextension = sum_overextension / total_errors, 
            sum_proximal = sum(is_proximal, na.rm = T), 
            prop_proximal = sum_proximal / total_errors) %>%
  mutate(task = 'production')

## `summarise()` has grouped output by 'prod_period_within'. You can override
## using the `.groups` argument.

df.compre_trajectory_error_within <- df.compre_response_trajectory %>%
  group_by(prod_period_within, target) %>%
  summarise(total_errors = sum(correct == 0, na.rm = T),
            sum_overextension = sum(is_overextension, na.rm = T), 
            prop_overextension = sum_overextension / total_errors, 
            sum_proximal = sum(is_proximal, na.rm = T), 
            prop_proximal = sum_proximal / total_errors) %>%
  mutate(task = 'comprehension')

## `summarise()` has grouped output by 'prod_period_within'. You can override
## using the `.groups` argument.

df.error_trajectory_within <- bind_rows(df.compre_trajectory_error_within, df.production_trajectory_error_within) %>%
  pivot_longer(cols = c(prop_overextension, prop_proximal, 
                        sum_overextension, sum_proximal))

ggplot(df.error_trajectory_within %>%
         filter(name %in% c("sum_overextension", 
                            "sum_proximal")) %>%
         mutate(name = factor(name, labels = c('Extension errors', 'Proximal errors'))),
       aes(x = prod_period_within, y = value, 
           fill = name)) + 
  geom_col(position = position_dodge(0.9)) + 
  facet_grid(~task) + 
  labs(x = 'Time period relative to first correct production of color word', 
       y = 'Number of errors', 
       fill = 'Error type') + 
  theme(axis.text.x=element_text(size=12))

ggplot(df.error_trajectory_within %>%
         filter(name %in% c("prop_overextension", 
                            "prop_proximal")) %>%
         mutate(name = factor(name, labels = c('Extension errors', 'Proximal errors'))),
       aes(x = prod_period_within, y = value, 
           fill = name)) + 
  geom_col(position = position_dodge(0.9)) + 
  facet_grid(~task) + 
  labs(x = 'Time period relative to first correct production of color word', 
       y = 'Proportion of errors', 
       fill = 'Error type') + 
  theme(axis.text.x=element_text(size=12))

ggplot(df.error_trajectory_within %>%
         filter(task == "comprehension") %>%
         filter(name %in% c("sum_overextension", 
                            "sum_proximal")) %>%
         mutate(name = factor(name, labels = c('Extension errors', 'Proximal errors'))),
       aes(x = prod_period_within, y = value, 
           fill = name)) + 
  geom_col(position = position_dodge(0.9)) + 
  facet_grid(~target) + 
  labs(x = 'Time period relative to first correct production of color word', 
       y = 'Number of errors', 
       fill = 'Error type')

ggplot(df.error_trajectory_within %>%
         filter(task == "comprehension") %>%
         filter(name %in% c("prop_overextension", 
                            "prop_proximal")) %>%
         mutate(name = factor(name, labels = c('Extension errors', 'Proximal errors'))),
       aes(x = prod_period_within, y = value, 
           fill = name)) + 
  geom_col(position = position_dodge(0.9)) + 
  facet_grid(~target) + 
  labs(x = 'Time period relative to first correct production of color word', 
       y = 'Proportion of errors', 
       fill = 'Error type')

ggplot(df.error_trajectory_within %>%
         filter(task == "production") %>%
         filter(name %in% c("sum_overextension", 
                            "sum_proximal")) %>%
         mutate(name = factor(name, labels = c('Extension errors', 'Proximal errors'))),
       aes(x = prod_period_within, y = value, 
           fill = name)) + 
  geom_col(position = position_dodge(0.9)) + 
  facet_grid(~target) + 
  labs(x = 'Time period relative to first correct production of color word', 
       y = 'Number of errors', 
       fill = 'Error type')

Category Size by Trajectory Time Point

#let's get comprehension stuff out as well
df.compre_cat_size <- df.compre_response_trajectory %>%
  mutate(select = as.numeric(select), 
         correct = as.numeric(correct)) %>% 
  group_by(subjid, session, task, target, 
           first_correct_session, prod_period_within, prod_period_strict) %>%
  summarise(compre_n_select = sum(select), # how many was selected per target
            compre_include_correct = ifelse(sum(correct, na.rm = T) >= 1, 1, 0),
            compre_correct_first = max(ifelse(correct == 1 & sel_order == 1, 1, 0))) # was the target chosen first?

## `summarise()` has grouped output by 'subjid', 'session', 'task', 'target',
## 'first_correct_session', 'prod_period_within'. You can override using the
## `.groups` argument.

df.spectrum_cat_size <- df %>%
  filter(task == "spectrum" & col_num != "total") %>%
  left_join(., df.prod_trajectory, join_by(subjid, target == target)) %>%
  mutate(prod_period_strict = case_when(
    is.na(first_correct_session) ~ 'before production',
    session < first_correct_session ~ 'before production', 
    session == first_correct_session ~ 'at production',
    session > first_correct_session ~ 'after production'
  ), 
  prod_period_within = case_when(
    is.na(first_correct_session) ~ "no production",
    session < first_correct_session ~ 'before production', 
    session == first_correct_session ~ 'at production',
    session > first_correct_session ~ 'after production'
  ))  %>%
    mutate(prod_period_strict = factor(prod_period_strict, levels = c("before production", 
                                        "at production", 
                                        "after production")), 
          prod_period_within = factor(prod_period_within, levels = c("no production", "before production", 
                                        "at production", 
                                        "after production"))) %>%
  mutate(select = as.numeric(select), 
         correct = as.numeric(correct)) %>% 
  group_by(subjid, session, task, target, 
           first_correct_session, prod_period_within, prod_period_strict) %>%
  summarise(spectrum_n_select = sum(select, na.rm = T), # how many was selected per target
            spectrum_include_correct = ifelse(sum(correct, na.rm = T) > 1, 1, 0))  # does it include the target?

## `summarise()` has grouped output by 'subjid', 'session', 'task', 'target',
## 'first_correct_session', 'prod_period_within'. You can override using the
## `.groups` argument.

df.cat_size <- bind_rows(df.compre_cat_size, df.spectrum_cat_size)

Comprehension Category Size

ggplot(df.cat_size, 
       aes(x = prod_period_strict, y = compre_n_select)) + 
  geom_violin() + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "pointrange") + 
  geom_jitter(height = 0, 
              alpha = 0.3) + 
  ylab("Number of squares selected as target") + 
  xlab("Comprehension task relative to first correct production of color word")

## Warning: Removed 264 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 264 rows containing non-finite outside the scale range
## (`stat_summary()`).

## Warning: Removed 264 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(df.cat_size %>%
         filter(task == "comprehension"), 
       aes(x = prod_period_strict, y = compre_n_select, 
           color = target)) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.5), 
               aes(width = 0.2)) + 
  stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.5)) + 
  stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = target), 
               position = position_dodge(width = 0.5)) + 
  scale_color_manual(values = c('blue', 'green', 'purple', 'gold')) + 
  facet_grid(~target) +
  theme(legend.position="none") +
  ylab("Number of squares selected as target") + 
  xlab("Comprehension task relative to first production")

ggplot(df.cat_size %>%
         filter(task == "comprehension"), 
       aes(x = prod_period_within, y = compre_n_select, 
           color = target)) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.5), 
               aes(width = 0.2)) + 
  stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.5)) + 
  stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = target), 
               position = position_dodge(width = 0.5)) + 
  scale_color_manual(values = c('blue', 'green', 'purple', 'gold')) + 
  facet_wrap(~target) + 
  theme(legend.position="none") +
  ylab("Number of squares selected as target") + 
  xlab("Comprehension task relative to first correct production of color word")

ggplot(df.cat_size %>%
         filter(task == "comprehension") %>%
         filter(!is.na(compre_include_correct)) %>%
         mutate(compre_include_correct = ifelse(
           compre_include_correct == 0, "did not select target", "selected target"
         )), 
       aes(x = prod_period_strict, y = compre_n_select, 
           color = target)) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.5), 
               aes(width = 0.2)) + 
  stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.5)) + 
  stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = target), 
               position = position_dodge(width = 0.5)) + 
  scale_color_manual(values = c('blue', 'green', 'purple', 'gold')) + 
  facet_grid(compre_include_correct~target) + 
  theme(legend.position="none") +
  ylab("Number of squares selected as target") + 
  xlab("Comprehension task relative to first correct production of color word")

Spectrum Category Size

ggplot(df.cat_size %>%
         filter(task == "spectrum"), 
       aes(x = prod_period_strict, y = spectrum_n_select)) + 
  geom_violin() + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "pointrange") + 
  geom_jitter(height = 0, 
              alpha = 0.3) + 
  ylab("Number of squares selected as target") + 
  xlab("Spectrum task relative to first correct production of color word")

ggplot(df.cat_size %>%
         filter(task == "spectrum"), 
       aes(x = prod_period_within, y = spectrum_n_select)) + 
  geom_violin() + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "pointrange") + 
  geom_jitter(height = 0, 
              alpha = 0.3) + 
  ylab("Number of squares selected as target") + 
  xlab("Spectrum task relative to first correct production of color word")

ggplot(df.cat_size %>%
         filter(task == "spectrum"), 
       aes(x = prod_period_strict, y = spectrum_n_select)) + 
  geom_violin() + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "pointrange") + 
  geom_jitter(height = 0, 
              alpha = 0.3) + 
  facet_grid(~target) + 
  ylab("Number of squares selected as target") + 
  xlab("Comprehension task relative to first correct production of color word")

ggplot(df.cat_size %>%
         filter(task == "spectrum"), 
       aes(x = prod_period_strict, y = spectrum_n_select, 
           color = target)) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.5), 
               aes(width = 0.2)) + 
  stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.5)) + 
  stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = target), 
               position = position_dodge(width = 0.5)) + 
  scale_color_manual(values = c('blue', 'green', 'purple', 'gold')) + 
  ylab("Number of squares selected as target") + 
  xlab("Spectrum task relative to first production")

ggplot(df.cat_size %>%
         filter(task == "spectrum"), 
       aes(x = prod_period_within, y = spectrum_n_select, 
           color = target)) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.5), 
               aes(width = 0.2)) + 
  stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.5)) + 
  stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = target), 
               position = position_dodge(width = 0.5)) + 
  scale_color_manual(values = c('blue', 'green', 'purple', 'gold')) + 
  facet_grid(~target) + 
  theme(legend.position="none") +
  ylab("Number of squares selected as target") + 
  xlab("Spectrum task relative to first correct production of color word")

ggplot(df.cat_size %>%
         filter(task == "spectrum") %>%
         filter(!is.na(spectrum_include_correct)) %>%
         mutate(spectrum_include_correct = ifelse(
           spectrum_include_correct == 0, "did not select focal", "selected focal"
         )), 
       aes(x = prod_period_strict, y = spectrum_n_select, 
           color = target)) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.5), 
               aes(width = 0.2)) + 
  stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.5)) + 
  stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = target), 
               position = position_dodge(width = 0.5)) + 
  scale_color_manual(values = c('blue', 'green', 'purple', 'gold')) + 
  facet_grid(spectrum_include_correct~target) + 
  theme(legend.position="none") +
  ylab("Number of squares selected as target") + 
  xlab("Spectrum task relative to first correct production of color word")

Number of unique words produced vs. average category size.

# plot number of unique words produced vs. average category size? 
df.unique_production <- df %>% 
  filter(task == "production") %>%
  group_by(subjid, session) %>%
  mutate(response = tolower(response)) %>%
  filter(!response %in% c('idk', 'nr', 'no response', '3') & !str_detect(response, " ")) %>%
  summarise(unique_words_produced = list(unique(response))) %>%
  rowwise() %>%
  mutate(n_unique_words_produced = length(unique_words_produced)) %>%
  mutate(contains_blue = ifelse('blue' %in% unique_words_produced, 1, 0), 
         contains_green = ifelse('green' %in% unique_words_produced, 1, 0), 
         contains_purple = ifelse('purple' %in% unique_words_produced, 1, 0), 
         contains_yellow = ifelse('yellow' %in% unique_words_produced, 1, 0)) %>%
  mutate(n_compre_target_produced = sum(across(starts_with("contains"))), 
         n_spectrum_target_produced = sum(contains_blue, contains_green))

## `summarise()` has grouped output by 'subjid'. You can override using the
## `.groups` argument.

df.unique_production_compre <- left_join(df.unique_production, df.cat_size)

## Joining with `by = join_by(subjid, session)`

df.unique_production_compre_per_ppt <- df.unique_production_compre %>%
  group_by(subjid, session, task, n_unique_words_produced, 
           n_compre_target_produced, n_spectrum_target_produced) %>%
  summarise(mean_compre_n_select = mean(compre_n_select), 
            #calculate a mean of just the categories that has been 'comprehended'
            mean_compre_n_select_include_target = 
              mean(ifelse(compre_include_correct, compre_n_select, NA), na.rm = T), 
            mean_spectrum_n_select = mean(spectrum_n_select), 
             #calculate a mean of just the categories that has been 'comprehended'
            mean_spectrum_n_select_include_target = 
              mean(ifelse(spectrum_include_correct, spectrum_n_select, NA), na.rm = T)) %>%
  mutate(mean_compre_n_select = ifelse(is.na(mean_compre_n_select), mean_spectrum_n_select, mean_compre_n_select), 
         mean_compre_n_select_include_target = ifelse(is.na(mean_compre_n_select_include_target), mean_spectrum_n_select_include_target, mean_compre_n_select_include_target)) %>%
  rename(mean_select = mean_compre_n_select, 
         mean_select_include_target = mean_compre_n_select_include_target) %>%
  select(-mean_spectrum_n_select, 
         -mean_spectrum_n_select_include_target)

## `summarise()` has grouped output by 'subjid', 'session', 'task',
## 'n_unique_words_produced', 'n_compre_target_produced'. You can override using
## the `.groups` argument.

ggplot(df.unique_production_compre_per_ppt,
       aes(x = n_unique_words_produced, y = mean_select, 
           fill = task)) + 
  stat_summary(fun = "mean", 
               geom = "bar", 
               aes(fill = task), 
               position = position_dodge(width = 0.9)
               ) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.9), 
               aes(width = 0.2)) + 
  stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.9)
               ) + 
    stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = task), 
               position = position_dodge(width = 0.9)
               ) + 
  scale_x_continuous(breaks = seq(0, 12, 1)) + 
  facet_wrap(~task) +
  ylab("Number of squares selected as target (per task per session)") + 
  xlab("Number of unique color words produced (per session)") + 
  theme(legend.position = "none")

ggplot(df.unique_production_compre_per_ppt,
       aes(x = n_unique_words_produced, y = mean_select_include_target, 
           fill = task)) + 
  stat_summary(fun = "mean", 
               geom = "bar", 
               aes(fill = task), 
               position = position_dodge(width = 0.9)
               ) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.9), 
               aes(width = 0.2)) + 
  stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.9)
               ) + 
    stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = task), 
               position = position_dodge(width = 0.9)
               ) + 
  scale_x_continuous(breaks = seq(0, 12, 1)) + 
  facet_wrap(~task) +
  ylab("Number of squares selected as CORRECT target") + 
  xlab("Number of unique color words produced (per session)") + 
  theme(legend.position = "none")

## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_summary()`).
## Removed 18 rows containing non-finite outside the scale range
## (`stat_summary()`).
## Removed 18 rows containing non-finite outside the scale range
## (`stat_summary()`).
## Removed 18 rows containing non-finite outside the scale range
## (`stat_summary()`).

#ONLY counting the production of targets from the comprehension task...
ggplot(df.unique_production_compre_per_ppt %>%
         filter(task == "comprehension"),
       aes(x = n_compre_target_produced, y = mean_select)) + 
  stat_summary(fun = "mean", 
               geom = "bar", 
               aes(fill = task), 
               position = position_dodge(width = 0.9)
               ) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.9), 
               aes(width = 0.2)) + 
  stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.9)
               ) + 
    stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = task), 
               position = position_dodge(width = 0.9)
               ) + 
  facet_wrap(~task) +
  ylab("Number of squares selected as target (per task per session)") + 
  xlab("Number of target color words (blue/green/purple/yellow) produced (per session)") + 
  theme(legend.position = "none")

# only counting production of blue / green
ggplot(df.unique_production_compre_per_ppt %>%
         filter(task == "spectrum"),
       aes(x = n_spectrum_target_produced, y = mean_select)) + 
  stat_summary(fun = "mean", 
               geom = "bar", 
               aes(fill = task), 
               position = position_dodge(width = 0.9)
               ) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.9), 
               aes(width = 0.2)) + 
  stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.9)
               ) + 
    stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = task), 
               position = position_dodge(width = 0.9)
               ) + 
  scale_x_continuous(breaks = seq(0, 4, 1)) + 
  facet_wrap(~task) +
  ylab("Number of squares selected as target (per task per session)") + 
  xlab("Number of unique color words produced (per session)") + 
  theme(legend.position = "none")

# okay, but maybe what we really care about is only the categories that they succeeded in the 
# comprehension tasks!

ggplot(df.unique_production_compre %>%
         filter(task == "comprehension"),
       aes(x = n_unique_words_produced, y = compre_n_select, 
           color = I(target))) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.5), 
               aes(width = 0.2)) + 
    stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.5)) + 
  stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = target), 
               position = position_dodge(width = 0.5)) + 
  facet_wrap(~target) + 
  ylab("Number of squares selected as target (per task per session)") + 
  xlab("Number of comprehension target words (blue, green, purple, yellow) produced (per session)")

ggplot(df.unique_production_compre %>%
         filter(task == "spectrum"), 
       aes(x = n_unique_words_produced, y = spectrum_n_select, 
           color = I(target))) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.5), 
               aes(width = 0.2)) + 
  stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.5)) + 
  stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = target), 
               position = position_dodge(width = 0.5)) + 
  scale_color_manual(values = c('blue', 'green', 'purple', 'gold')) + 
  ylab("Number of squares selected as target") + 
  xlab("Number of unique color words produced")

df.blue <- df %>%
  filter(target == "blue" | target == "green")

df.blue_prod <- df.blue %>%
  filter(task == "production")

df.blue_compre <- df.blue %>%
  filter(task == "comprehension") %>% 
  group_by(subjid, target) %>%
  mutate(select = as.numeric(select)) %>% 
  group_by(subjid, session, target) %>%
  summarise(compre_n_select = sum(select), # how many was select per target
            compre_include_correct = sum(correct, na.rm = T), # does it include the target?
            compre_correct_first = max(ifelse(correct == 1 & sel_order == 1, 1, 0))) # was the target chosen first?

df.blue_spectrum <- df.blue %>%
  filter(task == "spectrum" & col_num != "total") %>% 
  group_by(subjid, target) %>%
  mutate(select = as.numeric(select)) %>% 
  group_by(subjid, session, target) %>%
  summarise(spectrum_n_select = sum(select, na.rm = T), # how many was select per target
            spectrum_include_correct = sum(correct, na.rm = T))  # does it include the target?

df.blue_sum <- left_join(df.blue_prod, df.blue_compre) %>%
  left_join(., df.blue_spectrum) %>%
  select(-c(col_num, col_code, p_adult, select, sel_order,
            comments, exclude, excl_reason))

# let's just look at the answers for blue for now, and we want to just look at kids
# who began producing it during the process

# 6 kids that learned blue over 4 sessions
df.blue_prod_change <- df.blue_sum %>%
  filter(target == "blue") %>%
  group_by(subjid) %>%
  filter(sum(correct) < 4 & sum(correct) > 0) %>%
  ungroup()

ggplot(df.blue_prod_change, 
       aes(x = session, y = compre_n_select, color = subjid)) +
  geom_point(height = 0, 
             aes(color = I(ifelse(correct == 1, 'blue', 'black')))) + 
  geom_line(aes(group = subjid)) + 
  facet_wrap(~subjid) +
  geom_text(aes(label = ifelse(compre_include_correct > 0, '*', '')), 
            nudge_x = 0.1,
            size = 7)

ggplot(df.blue_prod_change, 
       aes(x = session, y = compre_n_select, color = subjid)) +
  geom_point(height = 0, 
             aes(color = I(ifelse(correct == 1, 'blue', 'black')))) + 
  geom_line(aes(group = subjid)) + 
  facet_wrap(~subjid) +
  geom_text(aes(label = ifelse(compre_correct_first > 0, '*', '')), 
            nudge_x = 0.1,
            size = 7)

ggplot(df.blue_prod_change, 
       aes(x = session, y = spectrum_n_select, color = subjid)) +
  geom_point(height = 0, 
             aes(color = I(ifelse(correct == 1, 'blue', 'black')))) + 
  geom_line(aes(group = subjid)) + 
  facet_wrap(~subjid) + 
  geom_text(aes(label = ifelse(spectrum_include_correct > 0, '*', '')), 
            nudge_x = 0.1,
            size = 7)

# how about green?

# 8 kids that learned blue over 4 sessions
df.green_prod_change <- df.blue_sum %>%
  filter(target == "green") %>%
  group_by(subjid) %>%
  filter(sum(correct) < 4 & sum(correct) > 0) %>%
  ungroup()

ggplot(df.green_prod_change, 
       aes(x = session, y = compre_n_select, color = subjid)) +
  geom_point(height = 0, 
             aes(color = I(ifelse(correct == 1, 'green', 'black')))) + 
  geom_line(aes(group = subjid)) + 
  facet_wrap(~subjid) + 
  geom_text(aes(label = ifelse(compre_include_correct > 0, '*', '')), 
            nudge_x = 0.1,
            size = 7)

ggplot(df.green_prod_change, 
       aes(x = session, y = spectrum_n_select, color = subjid)) +
  geom_point(height = 0, 
             aes(color = I(ifelse(correct == 1, 'green', 'black')))) + 
  geom_line(aes(group = subjid)) + 
  facet_wrap(~subjid) + 
  geom_text(aes(label = ifelse(spectrum_include_correct > 0, '*', '')), 
            nudge_x = 0.1,
            size = 7)

df.spectrum <- df.learned_words %>% # 76 of this observations....
  filter(task == "spectrum") %>%
  filter(first_correct_session > 1) %>%
  mutate(spectrum_period = case_when(
    session < first_correct_session ~ 'before production', 
    session == first_correct_session ~ 'at production', 
    session > first_correct_session ~ 'after production'
  )) %>% 
  mutate(spectrum_period = factor(spectrum_period, levels = c('before production', 'at production',
                                                          'after production')))

ggplot(df.spectrum, 
       aes(x = spectrum_period, y = spectrum_n_select)) + 
  geom_violin() + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "pointrange")

ggplot(df.spectrum, 
       aes(x = spectrum_period, y = spectrum_n_select, 
           color = target)) + 
  stat_summary(fun.data = "mean_cl_boot", 
               geom = "errorbar",
               position = position_dodge(width = 0.5), 
               aes(width = 0.2)) + 
  stat_summary(fun = "mean", 
               geom = "point", 
               position = position_dodge(width = 0.5)) + 
  stat_summary(fun = "mean", 
               geom = "line", 
               aes(group = target), 
               position = position_dodge(width = 0.5)) + 
  scale_color_manual(values = c('blue', 'green')) + 
  ylab("Number of squares selected as target") + 
  xlab("Spectrum task relative to first production")

Color Word Analysis

07 March, 2025