Typicality by participant type

Typicality
Wiki similarity
- Cos dist
- Noun scaled cos dist

Typicality

# Path to the utterance-level file with the Bergey typicality measures.
UTTS_WITH_TYPICALITY <- here("exploratory_analyses/01_adj_typicality/data/utts_with_bergey_typicality.csv")

utts_with_typicality <- read_csv(UTTS_WITH_TYPICALITY)

# Mothers' utterances only, reshaped to one row per utterance x measure.
# The original `participant_id` column is kept as `transcript_id`; the new
# `participant_id` is the group label pasted onto the number parsed from it.
# Open question: should repetitions within the same utterance be collapsed
# first, e.g. distinct(group, participant_id, noun, adj, measure, value)?
by_participant_data <- utts_with_typicality %>%
  filter(speaker == "mother") %>%
  pivot_longer(
    cols = turker_judgment:ldp_similarity,
    names_to = "measure"
  ) %>%
  rename(transcript_id = participant_id) %>%
  mutate(participant_id = paste0(group, parse_number(transcript_id))) %>%
  mutate(log_value = log(value))

# Rows per participant: counted per measure, then de-duplicated so that
# identical (group, participant_id, n) combinations appear once.
num_pairs_by_kid <- by_participant_data %>%
  count(group, participant_id, measure) %>%
  distinct(group, participant_id, n) %>%
  data.frame()

Utterances from 36 kids overlap with the Bergey/Morris norms, for a total of 521 utterances.

# Average the log-transformed values within each participant (separately for
# each group and measure) so every participant contributes one value per
# measure to the group-level summary.
mean_typicality_by_participant <- by_participant_data %>%
  group_by(group, measure, participant_id) %>%
  summarize(log_value = mean(log_value))

# Summarise the participant means per group and measure.
# multi_boot_standard() supplies the `mean`, `ci_lower` and `ci_upper`
# columns that the plot below reads.
mean_typicality <- mean_typicality_by_participant %>%
  group_by(group, measure) %>%
  multi_boot_standard(col = "log_value", na.rm = TRUE)

# One panel per measure, each with its own y axis. `scales` is spelled out in
# full rather than relying on partial matching of `scale`.
# Note: the plotted means are on the log scale (they summarise `log_value`).
ggplot(mean_typicality, aes(x = group, y = mean)) +
  facet_wrap(~ measure, scales = "free_y") +
  ylab("mean typicality") +
  geom_point() +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = .9))

Model

# Mixed model on the turker_judgment measure only: log value predicted by
# group, with a random intercept per participant.
df <- filter(by_participant_data, measure == "turker_judgment")

summary(
  lmer(log_value ~ group + (1 | participant_id), data = df)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: log_value ~ group + (1 | participant_id)
##    Data: df
## 
## REML criterion at convergence: 613
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.67783 -0.58752 -0.03114  0.80423  2.09671 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  participant_id (Intercept) 0.009298 0.09643 
##  Residual                   0.180840 0.42525 
## Number of obs: 521, groups:  participant_id, 36
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  1.26847    0.03926  32.306
## groupTD     -0.07664    0.05342  -1.435
## 
## Correlation of Fixed Effects:
##         (Intr)
## groupTD -0.735

# Mixed model on the wiki_similarity measure only: log value predicted by
# group, with a random intercept per participant.
df <- filter(by_participant_data, measure == "wiki_similarity")

summary(
  lmer(log_value ~ group + (1 | participant_id), data = df)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: log_value ~ group + (1 | participant_id)
##    Data: df
## 
## REML criterion at convergence: 101.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.7701 -0.5369 -0.2130  0.3031  3.2633 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  participant_id (Intercept) 0.02005  0.1416  
##  Residual                   0.06328  0.2516  
## Number of obs: 521, groups:  participant_id, 36
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) -0.79091    0.03917 -20.193
## groupTD     -0.04626    0.05524  -0.837
## 
## Correlation of Fixed Effects:
##         (Intr)
## groupTD -0.709

Next step: calculate wiki similarity for all noun-adj pairs, and do this for the VI group as well? Is there a better embedding similarity measure?

Wiki similarity

All noun-adj pairs, based on wiki cosine similarity; pairs are identified as a noun and an adjective occurring in the same utterance; part of speech (POS) is identified based on the SUBTLEX-US dominant POS.

# Path to the utterance-level file with wiki-based similarity scores.
UTTS_WITH_WIKI_SIM <- here("exploratory_analyses/01_adj_typicality/data/utts_with_wiki_sim.csv")

utts_with_wiki_sim <- read_csv(UTTS_WITH_WIKI_SIM)

# Mothers' utterances only. The original `participant_id` column is kept as
# `transcript_id`; the new `participant_id` is the group label pasted onto
# the number parsed from it.
# Open question: should repetitions within the same utterance be collapsed
# first, e.g. distinct(group, participant_id, noun, adj, cos_dist,
# scale_cos_dist)?
by_participant_data <- utts_with_wiki_sim %>%
  filter(speaker == "mother") %>%
  rename(transcript_id = participant_id) %>%
  mutate(
    participant_id = paste0(group, parse_number(transcript_id)),
    # Age is the second run of digits in the transcript id. map_dbl() returns
    # a plain numeric vector directly, so no list juggling / unlist() is
    # needed. NOTE(review): presumably age in days -- confirm.
    age = map_dbl(
      str_extract_all(transcript_id, "[0-9]+"),
      ~ as.numeric(.x[[2]])
    ),
    # Three equal-width age bins. A fixed split was also considered:
    # case_when(age < 730 ~ "young", TRUE ~ "old").
    age_bin = cut(age, breaks = 3)
  )

# Number of rows per participant within each group and age bin.
num_pairs_by_kid <- count(by_participant_data, group, age_bin, participant_id) %>%
  distinct(group, participant_id, n, age_bin) %>%
  data.frame()

Cos dist

# Mean cosine distance per participant, one row per (group, participant).
mean_typicality_by_participant <- summarise(
  group_by(by_participant_data, group, participant_id),
  cos_dist = mean(cos_dist)
)

# Group-level summary of the participant means; provides the `mean`,
# `ci_lower` and `ci_upper` columns plotted below.
mean_typicality <- multi_boot_standard(
  group_by(mean_typicality_by_participant, group),
  col = "cos_dist",
  na.rm = TRUE
)

# Point estimate with interval per group (points rather than bars).
ggplot(data = mean_typicality, mapping = aes(x = group, y = mean)) +
  geom_point() +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = .9)
  ) +
  labs(y = "mean typicality")

# Utterance-level model: cosine distance by group, with random intercepts
# for participant and noun.
summary(
  lmer(cos_dist ~ group + (1 | participant_id) + (1 | noun),
       data = by_participant_data)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: cos_dist ~ group + (1 | participant_id) + (1 | noun)
##    Data: by_participant_data
## 
## REML criterion at convergence: -14462
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.0413 -0.4012  0.0106  0.3484  6.2880 
## 
## Random effects:
##  Groups         Name        Variance  Std.Dev.
##  noun           (Intercept) 4.544e-03 0.067408
##  participant_id (Intercept) 4.562e-05 0.006755
##  Residual                   3.692e-03 0.060759
## Number of obs: 5639, groups:  noun, 675; participant_id, 40
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  0.245306   0.003558  68.939
## groupTD     -0.004542   0.002842  -1.598
## 
## Correlation of Fixed Effects:
##         (Intr)
## groupTD -0.394

by age:

# Mean cosine distance per participant within each age bin.
mean_typicality_by_participant <- summarise(
  group_by(by_participant_data, group, participant_id, age_bin),
  cos_dist = mean(cos_dist)
)

# Summary of the participant means per group and age bin; provides the
# `mean`, `ci_lower` and `ci_upper` columns plotted below.
mean_typicality <- multi_boot_standard(
  group_by(mean_typicality_by_participant, group, age_bin),
  col = "cos_dist",
  na.rm = TRUE
)

# View 1: age bin on the x axis, one coloured line per group.
ggplot(
  data = mean_typicality,
  mapping = aes(x = age_bin, y = mean, color = group, group = group)
) +
  geom_point() +
  geom_line() +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  labs(y = "mean typicality")

# View 2: group on the x axis, one panel per age bin (points rather than
# bars).
ggplot(data = mean_typicality, mapping = aes(x = group, y = mean)) +
  facet_wrap(~age_bin) +
  geom_point() +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = .9)
  ) +
  labs(y = "mean typicality")

# Utterance-level model: cosine distance by group and continuous age, with a
# random intercept per participant.
summary(
  lmer(cos_dist ~ group + age + (1 | participant_id),
       data = by_participant_data)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: cos_dist ~ group + age + (1 | participant_id)
##    Data: by_participant_data
## 
## REML criterion at convergence: -9112.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.7121 -0.7351 -0.1914  0.7705  3.9084 
## 
## Random effects:
##  Groups         Name        Variance  Std.Dev.
##  participant_id (Intercept) 0.0005709 0.02389 
##  Residual                   0.0113965 0.10675 
## Number of obs: 5639, groups:  participant_id, 40
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  2.965e-01  7.374e-03  40.212
## groupTD     -9.935e-03  8.226e-03  -1.208
## age         -5.684e-06  6.546e-06  -0.868
## 
## Correlation of Fixed Effects:
##         (Intr) gropTD
## groupTD -0.529       
## age     -0.651  0.020

Noun scaled cos dist

# Mean of the noun-scaled cosine distance per participant. The summary
# column is still called `cos_dist`, which is the name passed on below.
mean_typicality_by_participant_scaled <- summarise(
  group_by(by_participant_data, group, participant_id),
  cos_dist = mean(scale_cos_dist)
)

# Group-level summary of the participant means; provides the `mean`,
# `ci_lower` and `ci_upper` columns plotted below.
mean_typicality <- multi_boot_standard(
  group_by(mean_typicality_by_participant_scaled, group),
  col = "cos_dist",
  na.rm = TRUE
)

# Point estimate with interval per group (points rather than bars).
ggplot(data = mean_typicality, mapping = aes(x = group, y = mean)) +
  geom_point() +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = .9)
  ) +
  labs(y = "mean typicality")

# Utterance-level model: scaled cosine distance by group, with random
# intercepts for participant, noun and adjective.
summary(
  lmer(scale_cos_dist ~ group + (1 | participant_id) + (1 | noun) + (1 | adj),
       data = by_participant_data)
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: scale_cos_dist ~ group + (1 | participant_id) + (1 | noun) +  
##     (1 | adj)
##    Data: by_participant_data
## 
## REML criterion at convergence: 11823.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.1494 -0.3519  0.0284  0.3166  6.9727 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  noun           (Intercept) 0.55719  0.74645 
##  adj            (Intercept) 1.30008  1.14021 
##  participant_id (Intercept) 0.00143  0.03782 
##  Residual                   0.34371  0.58626 
## Number of obs: 5639, groups:  noun, 675; adj, 227; participant_id, 40
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  0.85262    0.08744   9.751
## groupTD     -0.02964    0.02179  -1.360
## 
## Correlation of Fixed Effects:
##         (Intr)
## groupTD -0.130