SUMMARY: Older, more educated, L1, and female speakers produce associates that are more similar to the cue, as measured by a word2vec similarity metric trained on the NYTimes. We also see that, controlling for word2vec similarity, L2 speakers produce more hypernyms and fewer synonyms than L1 speakers.
# Pairwise similarity scores between cues and associates
# (generated by wordtovec_similarity_proxies.Rmd).
sims = read_csv("similarities.csv") %>%
  # Drop the first column by position.
  # NOTE(review): presumably a row-index column saved with the CSV -- confirm.
  select(-1) %>%
  # Discard rows that pair a word with itself.
  filter(word1 != word2) %>%
  mutate(
    word1 = as.factor(word1),
    word2 = as.factor(word2)
  )
# Word-association trial data.
d.clean = read_csv("../../data/dclean.csv")

# Attach the similarity score of each cue-associate pair to its trial,
# then keep only trials for which a similarity score exists.
all = d.clean %>%
  left_join(sims, by = c("cue" = "word1", "associate" = "word2")) %>%
  mutate(
    similarity = as.numeric(similarity),
    cue = as.factor(cue),
    associate = as.factor(associate),
    education = as.factor(education)
  ) %>%
  # Drop the first column by position.
  # NOTE(review): presumably a row-index column from the CSV -- confirm.
  select(-1) %>%
  # NOTE(review): the first condition compares `cue` to the literal string
  # "associate", not to the `associate` column -- confirm this is intended.
  filter(cue != "associate", !is.na(similarity))
# Histogram of similarity, filled by native-language group.
all %>%
  ggplot(aes(x = similarity, fill = native.lang)) +
  geom_histogram() +
  theme_bw()
# Per native-language group: mean and SD of similarity, group size, and an
# interval of mean +/- 1.96 standard errors (sd / sqrt(n)).
language.group = all %>%
  group_by(native.lang) %>%
  summarise(
    mean.sim = mean(similarity, na.rm = TRUE),
    sd.sim = sd(similarity, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    lower = mean.sim - 1.96 * sd.sim / sqrt(n),
    upper = mean.sim + 1.96 * sd.sim / sqrt(n)
  )

# Bar per group at the mean, with the interval drawn as a vertical line.
language.group %>%
  ggplot(aes(x = native.lang, y = mean.sim, fill = native.lang)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(aes(ymin = lower, ymax = upper)) +
  theme_bw()
L1 speakers have higher overall similarity than L2 speakers.
# Linear fit of similarity on age, using every trial in `all`.
all %>%
  ggplot(aes(x = age, y = similarity)) +
  geom_smooth(method = "lm") +
  theme_bw()

# Same relationship with geom_smooth()'s default smoother, restricted to
# trials where age is below 75.
all %>%
  filter(age < 75) %>%
  ggplot(aes(x = age, y = similarity)) +
  geom_smooth() +
  theme_bw()
Regression
# Linear regression of similarity on age, Lg10WF and education, excluding
# rows whose education value is "0"; coefficients are printed as a table.
# NOTE(review): `education` is converted to a factor when `all` is built, so
# as.numeric(education) yields the integer level codes rather than the
# original values -- confirm that the codes are the intended predictor.
all %>%
  filter(education != "0") %>%
  lm(similarity ~ age + Lg10WF + as.numeric(education), data = .) %>%
  tidy() %>%
  kable()
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.1638289 | 0.0015792 | 103.742883 | 0 |
| age | 0.0001602 | 0.0000144 | 11.103658 | 0 |
| Lg10WF | 0.0107773 | 0.0003278 | 32.873492 | 0 |
| as.numeric(education) | 0.0022985 | 0.0002765 | 8.314373 | 0 |
Older speakers produce associates that are more similar to the cue than younger speakers do; this holds when controlling for education. Does this generalize across corpora?
# Histogram of similarity, filled by education level.
all %>%
  ggplot(aes(x = similarity, fill = education)) +
  geom_histogram() +
  theme_bw()
# Per education level: mean and SD of similarity, group size, and an
# interval of mean +/- 1.96 standard errors (sd / sqrt(n)).
education.group = all %>%
  group_by(education) %>%
  summarise(
    mean.sim = mean(similarity, na.rm = TRUE),
    sd.sim = sd(similarity, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    lower = mean.sim - 1.96 * sd.sim / sqrt(n),
    upper = mean.sim + 1.96 * sd.sim / sqrt(n)
  )

# Bar per education level at the mean, with the interval as a vertical line.
education.group %>%
  ggplot(aes(x = education, y = mean.sim, fill = education)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(aes(ymin = lower, ymax = upper)) +
  theme_bw()
As education level increases, similarity increases (but age and education are confounded).
# Keep only trials whose gender is coded "Fe" or "Ma".
all.gender = all %>%
  mutate(gender = as.factor(gender)) %>%
  filter(gender %in% c("Fe", "Ma"))

# Histogram of similarity, filled by gender.
all.gender %>%
  ggplot(aes(x = similarity, fill = gender)) +
  geom_histogram() +
  theme_bw()
# Per gender: mean and SD of similarity, group size, and an interval of
# mean +/- 1.96 standard errors (sd / sqrt(n)).
gender.group = all.gender %>%
  group_by(gender) %>%
  summarise(
    mean.sim = mean(similarity, na.rm = TRUE),
    sd.sim = sd(similarity, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    lower = mean.sim - 1.96 * sd.sim / sqrt(n),
    upper = mean.sim + 1.96 * sd.sim / sqrt(n)
  )

# Bar per gender at the mean, with the interval drawn as a vertical line.
gender.group %>%
  ggplot(aes(x = gender, y = mean.sim, fill = gender)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(aes(ymin = lower, ymax = upper)) +
  theme_bw()
Female speakers may have slightly higher similarity.
Merge in wordnet data
# WordNet-annotated trial data, joined with the cue-associate similarity
# scores; only rows that have a similarity score are kept.
wn = read_csv("wn_dclean.csv") %>%
  # Drop the first column by position.
  # NOTE(review): presumably a row-index column saved with the CSV -- confirm.
  select(-1) %>%
  left_join(sims, by = c("cue" = "word1", "associate" = "word2")) %>%
  mutate(
    similarity = as.numeric(similarity),
    cue = as.factor(cue),
    associate = as.factor(associate)
  ) %>%
  # NOTE(review): the first condition compares `cue` to the literal string
  # "associate", not to the `associate` column -- confirm this is intended.
  filter(cue != "associate", !is.na(similarity))
# Mean similarity for every native-language x relation-type cell, with an
# interval of mean +/- 1.96 standard errors, drawn as dodged bars.
wn %>%
  group_by(native.lang, relation_type) %>%
  summarise(
    mean.sim = mean(similarity, na.rm = TRUE),
    sd.sim = sd(similarity, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    lower = mean.sim - 1.96 * sd.sim / sqrt(n),
    upper = mean.sim + 1.96 * sd.sim / sqrt(n)
  ) %>%
  ggplot(aes(
    x = relation_type,
    y = mean.sim,
    fill = native.lang,
    group = native.lang
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  # Dodge the interval lines by the same width so they sit on their bars.
  geom_linerange(
    aes(ymin = lower, ymax = upper),
    position = position_dodge(width = 0.9)
  ) +
  theme_bw()
L2 synonyms are slightly more similar to the cue than L1 synonyms. Also, it is interesting that part relations (holonyms/meronyms) are more similar on the word2vec measure than synonyms.
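Regression predicting synonyms and hypernyms from native language, controlling for word2vec similarity, number of synsets, and spoken frequency of the word. Note: the `glm()` calls below fit Gaussian (linear probability) models, not logistic regressions with a binomial family.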
# Add 0/1 indicator columns marking hypernym and synonym relations.
wn = wn %>%
  mutate(
    hypernym = as.numeric(relation_type == "hypernym"),
    synonym = as.numeric(relation_type == "synonym")
  )
# Regress the 0/1 synonym indicator on native language, number of synsets,
# Lg10WF and similarity; coefficients are printed as a table.
# The stray duplicated `+` before `similarity` has been removed (it was a
# no-op unary plus), and the family glm() uses by default is now spelled out.
# NOTE(review): the surrounding text describes this as a logistic regression;
# that would require family = binomial and would change the reported table.
glm(
  synonym ~ native.lang + num_syns + Lg10WF + similarity,
  family = gaussian,
  data = wn
) %>%
  tidy() %>%
  kable(caption = "synonyms")
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.2537433 | 0.0024857 | 102.08208 | 0.0000000 |
| native.langL2 | -0.0289710 | 0.0022909 | -12.64612 | 0.0000000 |
| num_syns | 0.0004440 | 0.0001188 | 3.73618 | 0.0001869 |
| Lg10WF | 0.0142690 | 0.0009817 | 14.53426 | 0.0000000 |
| similarity | 0.0682907 | 0.0041894 | 16.30078 | 0.0000000 |
# Regress the 0/1 hypernym indicator on native language, number of synsets,
# Lg10WF and similarity; coefficients are printed as a table.
# The family is written out explicitly; gaussian is what glm() uses when no
# family is given, so the fit is unchanged.
# NOTE(review): the surrounding text describes this as a logistic regression;
# that would require family = binomial and would change the reported table.
glm(
  hypernym ~ native.lang + num_syns + Lg10WF + similarity,
  family = gaussian,
  data = wn
) %>%
  tidy() %>%
  kable(caption = "hypernyms")
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.5951605 | 0.0026122 | 227.83481 | 0 |
| native.langL2 | 0.0242372 | 0.0024075 | 10.06718 | 0 |
| num_syns | -0.0013559 | 0.0001249 | -10.85701 | 0 |
| Lg10WF | -0.0665780 | 0.0010317 | -64.52993 | 0 |
| similarity | -0.0615787 | 0.0044027 | -13.98649 | 0 |
With controls, we still see that L2 speakers produce more hypernyms and fewer synonyms.