SUMMARY: Older, more educated, L1, and female speakers produce associates that are more similar to the cue, as measured by a word2vec similarity metric trained on the New York Times. We also see that, controlling for word2vec similarity, L2 speakers produce more hypernyms and fewer synonyms than L1 speakers.

Similarity on full dataset

#Read in similarity measures between all cues and associates

# Pairwise cue/associate similarities; the CSV is produced by
# wordtovec_similarity_proxies.Rmd.
sims <- read_csv("similarities.csv") %>%
  # Drop the first column by position (presumably a saved row index --
  # TODO confirm against the CSV).
  select(-1) %>%
  # Discard pairs in which both words are the same.
  filter(word1 != word2) %>%
  mutate(
    word1 = as.factor(word1),
    word2 = as.factor(word2)
  )
# Read in word-association trial data

# Trial-level word-association data.
d.clean <- read_csv("../../data/dclean.csv")

# Attach the similarity score to each cue/associate trial and keep only the
# trials for which a score exists.
all <- d.clean %>%
  left_join(sims, by = c("cue" = "word1", "associate" = "word2")) %>%
  mutate(
    similarity = as.numeric(similarity),
    cue = as.factor(cue),
    associate = as.factor(associate),
    education = as.factor(education)
  ) %>%
  # Drop the first column by position (presumably a saved row index --
  # TODO confirm against the CSV).
  select(-1) %>%
  # Remove rows whose cue is the literal string "associate" and rows with no
  # similarity value after the join.
  filter(cue != "associate", !is.na(similarity))

by native language

# Distribution of cue/associate similarity, filled by native-language group.
all %>%
  ggplot(aes(x = similarity, fill = native.lang)) +
  geom_histogram() +
  theme_bw()

# Per-group mean similarity with a normal-approximation 95% interval
# (mean +/- 1.96 * sd / sqrt(n)).
language.group <- all %>%
  group_by(native.lang) %>%
  summarise(
    mean.sim = mean(similarity, na.rm = TRUE),
    sd.sim = sd(similarity, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    lower = mean.sim - (1.96 * sd.sim / sqrt(n)),
    upper = mean.sim + (1.96 * sd.sim / sqrt(n))
  )

# Group means as bars, with the interval drawn as a vertical line on each bar.
language.group %>%
  ggplot(aes(x = native.lang, y = mean.sim, fill = native.lang)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(aes(ymin = lower, ymax = upper)) +
  theme_bw()

L1 speakers have higher overall similarity than L2 speakers.

age

# Linear fit of similarity on age, over all trials.
all %>%
  ggplot(aes(x = age, y = similarity)) +
  geom_smooth(method = "lm") +
  theme_bw()

# Default (non-linear) smoother, restricted to participants younger than 75.
all %>%
  filter(age < 75) %>%
  ggplot(aes(x = age, y = similarity)) +
  geom_smooth() +
  theme_bw()

Regression

# Linear model of similarity on age, Lg10WF and education, fitted to trials
# whose education value is not "0".
# NOTE(review): education is a factor here, so as.numeric() yields the integer
# level codes rather than the level labels -- confirm that the level order
# matches the intended education ordering.
lm(
  similarity ~ age + Lg10WF + as.numeric(education),
  data = filter(all, education != "0")
) %>%
  tidy() %>%
  kable()
term estimate std.error statistic p.value
(Intercept) 0.1638289 0.0015792 103.742883 0
age 0.0001602 0.0000144 11.103658 0
Lg10WF 0.0107773 0.0003278 32.873492 0
as.numeric(education) 0.0022985 0.0002765 8.314373 0

Older speakers produce associates that are more similar to the cue than younger speakers do, and this holds when controlling for education. Does this generalize across corpora?

education

# Distribution of cue/associate similarity, filled by education level.
all %>%
  ggplot(aes(x = similarity, fill = education)) +
  geom_histogram() +
  theme_bw()

# Per-level mean similarity with a normal-approximation 95% interval
# (mean +/- 1.96 * sd / sqrt(n)).
education.group <- all %>%
  group_by(education) %>%
  summarise(
    mean.sim = mean(similarity, na.rm = TRUE),
    sd.sim = sd(similarity, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    lower = mean.sim - (1.96 * sd.sim / sqrt(n)),
    upper = mean.sim + (1.96 * sd.sim / sqrt(n))
  )

# Level means as bars, with the interval drawn as a vertical line on each bar.
education.group %>%
  ggplot(aes(x = education, y = mean.sim, fill = education)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(aes(ymin = lower, ymax = upper)) +
  theme_bw()

As education level increases, similarity increases (but age and education are confounded).

gender

# Keep only trials whose gender is coded "Fe" or "Ma"; rows with any other
# value, or with a missing gender, are dropped.
all.gender <- all %>%
  mutate(gender = as.factor(gender)) %>%
  filter(gender %in% c("Fe", "Ma"))

# Distribution of cue/associate similarity, filled by gender.
all.gender %>%
  ggplot(aes(x = similarity, fill = gender)) +
  geom_histogram() +
  theme_bw()

# Per-gender mean similarity with a normal-approximation 95% interval
# (mean +/- 1.96 * sd / sqrt(n)).
gender.group <- all.gender %>%
  group_by(gender) %>%
  summarise(
    mean.sim = mean(similarity, na.rm = TRUE),
    sd.sim = sd(similarity, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    lower = mean.sim - (1.96 * sd.sim / sqrt(n)),
    upper = mean.sim + (1.96 * sd.sim / sqrt(n))
  )

# Gender means as bars, with the interval drawn as a vertical line on each bar.
gender.group %>%
  ggplot(aes(x = gender, y = mean.sim, fill = gender)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(aes(ymin = lower, ymax = upper)) +
  theme_bw()

Female speakers may produce associates with slightly higher similarity.

Wordnet analyses

Merge in wordnet data

# WordNet-annotated trials, with the similarity score attached to each
# cue/associate pair.
wn <- read_csv("wn_dclean.csv") %>%
  # Drop the first column by position (presumably a saved row index --
  # TODO confirm against the CSV).
  select(-1) %>%
  left_join(sims, by = c("cue" = "word1", "associate" = "word2")) %>%
  mutate(
    similarity = as.numeric(similarity),
    cue = as.factor(cue),
    associate = as.factor(associate)
  ) %>%
  # Remove rows whose cue is the literal string "associate" and rows with no
  # similarity value after the join.
  filter(cue != "associate", !is.na(similarity))

by word relation

# Mean similarity for each native-language group within each relation type,
# with a normal-approximation 95% interval (mean +/- 1.96 * sd / sqrt(n)),
# drawn as dodged bars.
wn %>%
  group_by(native.lang, relation_type) %>%
  summarise(
    mean.sim = mean(similarity, na.rm = TRUE),
    sd.sim = sd(similarity, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    lower = mean.sim - (1.96 * sd.sim / sqrt(n)),
    upper = mean.sim + (1.96 * sd.sim / sqrt(n))
  ) %>%
  ggplot(aes(
    x = relation_type,
    y = mean.sim,
    fill = native.lang,
    group = native.lang
  )) +
  geom_bar(stat = "identity", position = "dodge") +
  # Dodge the interval lines by 0.9 so that they sit on top of the dodged bars.
  geom_linerange(
    aes(ymin = lower, ymax = upper),
    position = position_dodge(width = 0.9)
  ) +
  theme_bw()

L2 synonyms are slightly more similar to the cue than L1 synonyms. Also, it is interesting that part relations (holonyms/meronyms) are more similar on the word2vec measure than synonyms are.

Logistic regression predicting synonyms and hypernyms from native language, controlling for word2vec similarity, number of synsets, and spoken frequency of the word.

# Add 0/1 indicator columns marking whether each trial's relation type is
# "hypernym" or "synonym" (NA where relation_type is missing).
wn <- wn %>%
  mutate(
    hypernym = as.numeric(relation_type == "hypernym"),
    synonym = as.numeric(relation_type == "synonym")
  )

# Logistic regression: is the produced associate a synonym of the cue,
# predicted from native language, num_syns, Lg10WF and word2vec similarity.
# family = binomial is required for a logistic fit on the 0/1 outcome; glm()
# defaults to the gaussian family, which fits a linear probability model.
# NOTE: the coefficient table pasted below this chunk came from the
# gaussian-family fit and must be re-rendered after this change.
glm(synonym ~ native.lang + num_syns + Lg10WF + similarity,
    family = binomial, data = wn) %>%
  tidy() %>%
  kable(caption = "synonyms")
synonyms
term estimate std.error statistic p.value
(Intercept) 0.2537433 0.0024857 102.08208 0.0000000
native.langL2 -0.0289710 0.0022909 -12.64612 0.0000000
num_syns 0.0004440 0.0001188 3.73618 0.0001869
Lg10WF 0.0142690 0.0009817 14.53426 0.0000000
similarity 0.0682907 0.0041894 16.30078 0.0000000
# Logistic regression: is the produced associate a hypernym of the cue,
# predicted from native language, num_syns, Lg10WF and word2vec similarity.
# family = binomial is required for a logistic fit on the 0/1 outcome; glm()
# defaults to the gaussian family, which fits a linear probability model.
# NOTE: the coefficient table pasted below this chunk came from the
# gaussian-family fit and must be re-rendered after this change.
glm(hypernym ~ native.lang + num_syns + Lg10WF + similarity,
    family = binomial, data = wn) %>%
  tidy() %>%
  kable(caption = "hypernyms")
hypernyms
term estimate std.error statistic p.value
(Intercept) 0.5951605 0.0026122 227.83481 0
native.langL2 0.0242372 0.0024075 10.06718 0
num_syns -0.0013559 0.0001249 -10.85701 0
Lg10WF -0.0665780 0.0010317 -64.52993 0
similarity -0.0615787 0.0044027 -13.98649 0

With controls, we still see that L2 speakers produce more hypernyms and fewer synonyms.