# Input: utterances paired with typicality measures.
UTTS_WITH_TYPICALITY <- here(
  "exploratory_analyses/01_adj_typicality/data/utts_with_bergey_typicality.csv"
)
utts_with_typicality <- read_csv(UTTS_WITH_TYPICALITY)

# Mothers' utterances only, reshaped to one row per utterance x measure
# (every column from `turker_judgment` through `ldp_similarity` is stacked
# into `measure` / `value`).
by_participant_data <- utts_with_typicality %>%
  filter(speaker == "mother") %>%
  pivot_longer(
    cols = turker_judgment:ldp_similarity,
    names_to = "measure"
  ) %>%
  # distinct(group, participant_id, noun, adj, measure, value) %>% # get rid of repetitions in the same utterance?
  # The raw id is kept as `transcript_id`; the new `participant_id` is the
  # group label followed by the first number found in the raw id.
  rename(transcript_id = participant_id) %>%
  mutate(
    participant_id = paste0(group, parse_number(transcript_id)),
    log_value = log(value)
  )

# Rows per participant: counted per measure first, then collapsed so that
# measures sharing the same count yield a single row.
num_pairs_by_kid <- by_participant_data %>%
  count(group, participant_id, measure) %>%
  distinct(group, participant_id, n) %>%
  data.frame()
Utterances from 36 kids overlap with the Bergey/Morris norms, for a total of 521 utterances.
# Average the log-transformed values within each participant, separately for
# each group and measure, so every participant contributes one value per
# measure to the group summary below.
mean_typicality_by_participant <- by_participant_data %>%
  group_by(group, measure, participant_id) %>%
  summarize(log_value = mean(log_value))

# Summarise the participant means per group and measure. The plot below reads
# the `mean`, `ci_lower` and `ci_upper` columns from this result.
mean_typicality <- mean_typicality_by_participant %>%
  group_by(group, measure) %>%
  multi_boot_standard(col = "log_value", na.rm = TRUE)

# One panel per measure, each with its own y-axis range.
# `scales` is spelled out in full: the previous `scale =` only worked through
# R's partial argument matching.
ggplot(mean_typicality, aes(x = group, y = mean)) +
  facet_wrap(. ~ measure, scales = "free_y") +
  ylab("mean typicality") +
  geom_point() +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = .9)
  )
Model
# Utterance-level mixed model for the `turker_judgment` measure:
# group as a fixed effect, random intercept per participant.
df <- filter(by_participant_data, measure == "turker_judgment")
summary(
  lmer(log_value ~ group + (1 | participant_id), data = df)
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log_value ~ group + (1 | participant_id)
## Data: df
##
## REML criterion at convergence: 613
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.67783 -0.58752 -0.03114 0.80423 2.09671
##
## Random effects:
## Groups Name Variance Std.Dev.
## participant_id (Intercept) 0.009298 0.09643
## Residual 0.180840 0.42525
## Number of obs: 521, groups: participant_id, 36
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1.26847 0.03926 32.306
## groupTD -0.07664 0.05342 -1.435
##
## Correlation of Fixed Effects:
## (Intr)
## groupTD -0.735
# Same model specification, fitted to the `wiki_similarity` measure.
df <- filter(by_participant_data, measure == "wiki_similarity")
summary(
  lmer(log_value ~ group + (1 | participant_id), data = df)
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log_value ~ group + (1 | participant_id)
## Data: df
##
## REML criterion at convergence: 101.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7701 -0.5369 -0.2130 0.3031 3.2633
##
## Random effects:
## Groups Name Variance Std.Dev.
## participant_id (Intercept) 0.02005 0.1416
## Residual 0.06328 0.2516
## Number of obs: 521, groups: participant_id, 36
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.79091 0.03917 -20.193
## groupTD -0.04626 0.05524 -0.837
##
## Correlation of Fixed Effects:
## (Intr)
## groupTD -0.709
Next step: calculate wiki similarity for all noun-adj pairs, and do this for the VI group? Is there a better embedding similarity measure?
All noun-adj pairs, based on wiki cosine similarity; pairs are identified as a noun and an adjective occurring in the same utterance; part of speech is identified from the SUBTLEX-US dominant POS.
# Input: utterances with wiki-based similarity scores.
UTTS_WITH_WIKI_SIM <- here(
  "exploratory_analyses/01_adj_typicality/data/utts_with_wiki_sim.csv"
)
utts_with_wiki_sim <- read_csv(UTTS_WITH_WIKI_SIM)

# Mothers' utterances only. The raw id is kept as `transcript_id`; the new
# `participant_id` is the group label plus the first number in the raw id.
by_participant_data <- utts_with_wiki_sim %>%
  filter(speaker == "mother") %>%
  # distinct(group, participant_id, noun, adj, cos_dist, scale_cos_dist) %>% # get rid of repetitions in the same utterance?
  rename(transcript_id = participant_id) %>%
  mutate(
    participant_id = paste0(group, parse_number(transcript_id)),
    # `age` is the second run of digits in the transcript id
    # (presumably age in days -- TODO confirm against the id scheme).
    age = map_dbl(
      str_extract_all(transcript_id, "[0-9]+"),
      ~ as.numeric(.[[2]])
    ),
    # Three equal-width age bins.
    age_bin = cut(age, breaks = 3)
    # alternative binning: case_when(age < 730 ~ "young",
    #                                TRUE ~ "old")
  )

# Number of rows per participant within each age bin.
num_pairs_by_kid <- by_participant_data %>%
  count(group, age_bin, participant_id) %>%
  distinct(group, participant_id, n, age_bin) %>%
  data.frame()
# Per-participant mean of `cos_dist`, then a per-group summary of those means.
mean_typicality_by_participant <- by_participant_data %>%
  group_by(group, participant_id) %>%
  summarize(cos_dist = mean(cos_dist))

mean_typicality <- mean_typicality_by_participant %>%
  group_by(group) %>%
  multi_boot_standard(col = "cos_dist", na.rm = TRUE)

# Group means with their interval (`ci_lower` to `ci_upper`).
ggplot(mean_typicality, aes(x = group, y = mean)) +
  geom_point() +
  # geom_bar(stat = "identity") +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = .9)
  ) +
  ylab("mean typicality")

# Utterance-level model: group effect with random intercepts for participant
# and noun.
summary(
  lmer(
    cos_dist ~ group + (1 | participant_id) + (1 | noun),
    data = by_participant_data
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cos_dist ~ group + (1 | participant_id) + (1 | noun)
## Data: by_participant_data
##
## REML criterion at convergence: -14462
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.0413 -0.4012 0.0106 0.3484 6.2880
##
## Random effects:
## Groups Name Variance Std.Dev.
## noun (Intercept) 4.544e-03 0.067408
## participant_id (Intercept) 4.562e-05 0.006755
## Residual 3.692e-03 0.060759
## Number of obs: 5639, groups: noun, 675; participant_id, 40
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.245306 0.003558 68.939
## groupTD -0.004542 0.002842 -1.598
##
## Correlation of Fixed Effects:
## (Intr)
## groupTD -0.394
by age:
# Per-participant mean of `cos_dist` within each age bin, then a summary per
# group x age bin.
mean_typicality_by_participant <- by_participant_data %>%
  group_by(group, participant_id, age_bin) %>%
  summarize(cos_dist = mean(cos_dist))

mean_typicality <- mean_typicality_by_participant %>%
  group_by(group, age_bin) %>%
  multi_boot_standard(col = "cos_dist", na.rm = TRUE)

# View 1: age bin on the x-axis, one coloured line per group.
ggplot(
  mean_typicality,
  aes(x = age_bin, y = mean, color = group, group = group)
) +
  geom_point() +
  geom_line() +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ylab("mean typicality")

# View 2: group on the x-axis, one panel per age bin.
ggplot(mean_typicality, aes(x = group, y = mean)) +
  facet_wrap(~age_bin) +
  geom_point() +
  # geom_bar(stat = "identity") +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = .9)
  ) +
  ylab("mean typicality")

# Utterance-level model with continuous `age` added as a fixed effect and a
# random intercept per participant.
summary(
  lmer(cos_dist ~ group + age + (1 | participant_id), data = by_participant_data)
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cos_dist ~ group + age + (1 | participant_id)
## Data: by_participant_data
##
## REML criterion at convergence: -9112.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7121 -0.7351 -0.1914 0.7705 3.9084
##
## Random effects:
## Groups Name Variance Std.Dev.
## participant_id (Intercept) 0.0005709 0.02389
## Residual 0.0113965 0.10675
## Number of obs: 5639, groups: participant_id, 40
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 2.965e-01 7.374e-03 40.212
## groupTD -9.935e-03 8.226e-03 -1.208
## age -5.684e-06 6.546e-06 -0.868
##
## Correlation of Fixed Effects:
## (Intr) gropTD
## groupTD -0.529
## age -0.651 0.020
# Same summary as for `cos_dist`, but computed from `scale_cos_dist`.
# The output column is still named `cos_dist`.
mean_typicality_by_participant_scaled <- by_participant_data %>%
  group_by(group, participant_id) %>%
  summarize(cos_dist = mean(scale_cos_dist))

mean_typicality <- mean_typicality_by_participant_scaled %>%
  group_by(group) %>%
  multi_boot_standard(col = "cos_dist", na.rm = TRUE)

# Group means with their interval (`ci_lower` to `ci_upper`).
ggplot(mean_typicality, aes(x = group, y = mean)) +
  geom_point() +
  # geom_bar(stat = "identity") +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = .9)
  ) +
  ylab("mean typicality")

# Utterance-level model: group effect with random intercepts for participant,
# noun and adjective.
summary(
  lmer(
    scale_cos_dist ~ group + (1 | participant_id) + (1 | noun) + (1 | adj),
    data = by_participant_data
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: scale_cos_dist ~ group + (1 | participant_id) + (1 | noun) +
## (1 | adj)
## Data: by_participant_data
##
## REML criterion at convergence: 11823.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.1494 -0.3519 0.0284 0.3166 6.9727
##
## Random effects:
## Groups Name Variance Std.Dev.
## noun (Intercept) 0.55719 0.74645
## adj (Intercept) 1.30008 1.14021
## participant_id (Intercept) 0.00143 0.03782
## Residual 0.34371 0.58626
## Number of obs: 5639, groups: noun, 675; adj, 227; participant_id, 40
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.85262 0.08744 9.751
## groupTD -0.02964 0.02179 -1.360
##
## Correlation of Fixed Effects:
## (Intr)
## groupTD -0.130
by age:
# Per-participant mean of `scale_cos_dist` within each age bin, then a summary
# per group x age bin. The output column is still named `cos_dist`.
mean_typicality_by_participant_scaled <- by_participant_data %>%
  group_by(group, participant_id, age_bin) %>%
  summarize(cos_dist = mean(scale_cos_dist))

mean_typicality <- mean_typicality_by_participant_scaled %>%
  group_by(group, age_bin) %>%
  multi_boot_standard(col = "cos_dist", na.rm = TRUE)

# Age bin on the x-axis, one coloured line per group.
ggplot(
  mean_typicality,
  aes(x = age_bin, y = mean, color = group, group = group)
) +
  geom_point() +
  geom_line() +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ylab("mean typicality")

# Utterance-level model with continuous `age` added; random intercepts for
# participant, noun and adjective.
summary(
  lmer(
    scale_cos_dist ~ group + age + (1 | participant_id) + (1 | noun) + (1 | adj),
    data = by_participant_data
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: scale_cos_dist ~ group + age + (1 | participant_id) + (1 | noun) +
## (1 | adj)
## Data: by_participant_data
##
## REML criterion at convergence: 11842.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.1484 -0.3523 0.0291 0.3167 6.9728
##
## Random effects:
## Groups Name Variance Std.Dev.
## noun (Intercept) 0.557161 0.74643
## adj (Intercept) 1.300078 1.14021
## participant_id (Intercept) 0.001433 0.03786
## Residual 0.343776 0.58632
## Number of obs: 5639, groups: noun, 675; adj, 227; participant_id, 40
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 8.514e-01 9.364e-02 9.093
## groupTD -2.959e-02 2.184e-02 -1.355
## age 1.491e-06 4.226e-05 0.035
##
## Correlation of Fixed Effects:
## (Intr) gropTD
## groupTD -0.141
## age -0.358 0.057