Word2Vec similarity analyses

SUMMARY: Older, more educated, L1, and female speakers produce associates that are more similar to the cue, using a word2vec similarity metric trained on the NYTimes. We also see that, controlling for word2vec similarity, L2 speakers produce more hypernyms and fewer synonyms than L1 speakers.

Similarity on full dataset

#Read in similarity measures between all cues and associates

# Pairwise similarity scores; the CSV is produced by
# wordtovec_similarity_proxies.Rmd.
sims <- read_csv("similarities.csv") %>%
  # Drop the first column (presumably a saved row index -- verify).
  select(-1) %>%
  # Discard rows that pair a word with itself.
  filter(word1 != word2) %>%
  mutate(
    word1 = as.factor(word1),
    word2 = as.factor(word2)
  )

# Read in word-association trial data

# Trial-level data, one row per cue/associate response.
d.clean <- read_csv("../../data/dclean.csv")

# Attach a similarity score to every trial by matching (cue, associate)
# against (word1, word2) in `sims`, then keep only scored trials.
all <- d.clean %>%
  left_join(sims, by = c("cue" = "word1", "associate" = "word2")) %>%
  mutate(
    similarity = as.numeric(similarity),
    cue = as.factor(cue),
    associate = as.factor(associate),
    education = as.factor(education)
  ) %>%
  # Drop the first column of the joined table.
  select(-1) %>%
  # NOTE(review): `cue` is compared with the literal string "associate",
  # not with the `associate` column -- confirm this is intended.
  # Trials without a similarity score are removed as well.
  filter(cue != "associate", !is.na(similarity))

by native language

# Histogram of similarity scores, filled by native-language group.
ggplot(data = all, mapping = aes(x = similarity, fill = native.lang)) +
  geom_histogram() +
  theme_bw()

# Mean similarity per native-language group, with bounds at
# mean +/- 1.96 * sd / sqrt(n).
language.group <- all %>%
  group_by(native.lang) %>%
  summarise(
    mean.sim = mean(similarity, na.rm = TRUE),
    sd.sim = sd(similarity, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    lower = mean.sim - 1.96 * sd.sim / sqrt(n),
    upper = mean.sim + 1.96 * sd.sim / sqrt(n)
  )

# Bar per native-language group at the mean similarity, with the
# lower/upper bounds drawn as a vertical line range.
ggplot(
  data = language.group,
  mapping = aes(x = native.lang, y = mean.sim, fill = native.lang)
) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(mapping = aes(ymin = lower, ymax = upper)) +
  theme_bw()

L1 speakers have higher overall similarity than L2 speakers.

age

# Linear trend of similarity against age over all trials.
ggplot(data = all, mapping = aes(x = age, y = similarity)) +
  geom_smooth(method = "lm") +
  theme_bw()

# Same relationship using geom_smooth()'s default smoother, restricted
# to trials with age below 75.
all %>%
  filter(age < 75) %>%
  ggplot(mapping = aes(x = age, y = similarity)) +
  geom_smooth() +
  theme_bw()

Regression

# Linear model of similarity on age, Lg10WF and education, fitted to
# trials whose education is not "0".
# NOTE(review): `education` is a factor here, so as.numeric() returns
# its integer level codes rather than the level labels -- confirm the
# codes are the intended numeric scale.
lm(
  similarity ~ age + Lg10WF + as.numeric(education),
  data = filter(all, education != "0")
) %>%
  tidy() %>%
  kable()

term	estimate	std.error	statistic
(Intercept)	0.1638289	0.0015792	103.742883
age	0.0001602	0.0000144	11.103658
Lg10WF	0.0107773	0.0003278	32.873492
as.numeric(education)	0.0022985	0.0002765	8.314373

Older speakers produce associates that are more similar to the cue than younger speakers do, and this holds when controlling for education. Does this generalize across corpora?

education

# Histogram of similarity scores, filled by education level.
ggplot(data = all, mapping = aes(x = similarity, fill = education)) +
  geom_histogram() +
  theme_bw()

# Mean similarity per education level, with bounds at
# mean +/- 1.96 * sd / sqrt(n).
education.group <- all %>%
  group_by(education) %>%
  summarise(
    mean.sim = mean(similarity, na.rm = TRUE),
    sd.sim = sd(similarity, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    lower = mean.sim - 1.96 * sd.sim / sqrt(n),
    upper = mean.sim + 1.96 * sd.sim / sqrt(n)
  )

# Bar per education level at the mean similarity, with the lower/upper
# bounds drawn as a vertical line range.
ggplot(
  data = education.group,
  mapping = aes(x = education, y = mean.sim, fill = education)
) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(mapping = aes(ymin = lower, ymax = upper)) +
  theme_bw()

As education increases, similarity increases (but age and education are confounded).

gender

# Keep only trials whose gender is coded "Fe" or "Ma".
all.gender <- all %>%
  mutate(gender = as.factor(gender)) %>%
  filter(gender %in% c("Fe", "Ma"))

# Histogram of similarity scores, filled by gender.
ggplot(data = all.gender, mapping = aes(x = similarity, fill = gender)) +
  geom_histogram() +
  theme_bw()

# Mean similarity per gender, with bounds at
# mean +/- 1.96 * sd / sqrt(n).
gender.group <- all.gender %>%
  group_by(gender) %>%
  summarise(
    mean.sim = mean(similarity, na.rm = TRUE),
    sd.sim = sd(similarity, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    lower = mean.sim - 1.96 * sd.sim / sqrt(n),
    upper = mean.sim + 1.96 * sd.sim / sqrt(n)
  )

# Bar per gender at the mean similarity, with the lower/upper bounds
# drawn as a vertical line range.
ggplot(
  data = gender.group,
  mapping = aes(x = gender, y = mean.sim, fill = gender)
) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(mapping = aes(ymin = lower, ymax = upper)) +
  theme_bw()

Female speakers may produce associates with slightly higher similarity.

Wordnet analyses

Merge in wordnet data

# Load the WordNet-annotated trials and attach similarity scores by
# matching (cue, associate) against (word1, word2) in `sims`.
wn <- read_csv("wn_dclean.csv") %>%
  # Drop the first column of the CSV.
  select(-1) %>%
  left_join(sims, by = c("cue" = "word1", "associate" = "word2")) %>%
  mutate(
    similarity = as.numeric(similarity),
    cue = as.factor(cue),
    associate = as.factor(associate)
  ) %>%
  # NOTE(review): `cue` is compared with the literal string "associate",
  # not with the `associate` column -- confirm this is intended.
  # Trials without a similarity score are removed as well.
  filter(cue != "associate", !is.na(similarity))

by word relation

# Mean similarity for each native-language x relation-type cell, with
# bounds at mean +/- 1.96 * sd / sqrt(n), drawn as dodged bars.
wn %>%
  group_by(native.lang, relation_type) %>%
  summarise(
    mean.sim = mean(similarity, na.rm = TRUE),
    sd.sim = sd(similarity, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    lower = mean.sim - 1.96 * sd.sim / sqrt(n),
    upper = mean.sim + 1.96 * sd.sim / sqrt(n)
  ) %>%
  ggplot(
    mapping = aes(
      x = relation_type,
      y = mean.sim,
      fill = native.lang,
      group = native.lang
    )
  ) +
  geom_bar(stat = "identity", position = "dodge") +
  # Dodge the line ranges by 0.9 so they sit on top of the dodged bars.
  geom_linerange(
    mapping = aes(ymin = lower, ymax = upper),
    position = position_dodge(width = 0.9)
  ) +
  theme_bw()

L2 synonyms are slightly more similar to the cue than L1 synonyms. Also, it is interesting that part relations (holonyms/meronyms) are more similar on the word2vec measure than synonyms.

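Regressions predicting synonym and hypernym responses from native language, controlling for word2vec similarity, number of synsets, and spoken frequency of the word. Note: the `glm()` calls below do not set `family`, so they fit R's default Gaussian (linear-probability) model rather than a logistic model; the coefficients are therefore on the probability scale, not the log-odds scale.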

# Add 0/1 indicator columns marking hypernym and synonym responses.
wn <- wn %>%
  mutate(
    hypernym = as.numeric(relation_type == "hypernym"),
    synonym = as.numeric(relation_type == "synonym")
  )

# Model the synonym indicator from native language, number of synsets,
# Lg10WF and word2vec similarity, and print the coefficient table.
# The stray duplicated `+` before `similarity` has been removed; it was
# parsed as a unary plus, so the fitted terms are unchanged.
# NOTE(review): no `family` is supplied, so glm() uses its default
# gaussian family; pass `family = binomial` if a logistic regression is
# intended.
glm(synonym ~ native.lang + num_syns + Lg10WF + similarity, data = wn) %>%
  tidy() %>%
  kable(caption = "synonyms")

synonyms
term	estimate	std.error	statistic	p.value
(Intercept)	0.2537433	0.0024857	102.08208	0.0000000
native.langL2	-0.0289710	0.0022909	-12.64612	0.0000000
num_syns	0.0004440	0.0001188	3.73618	0.0001869
Lg10WF	0.0142690	0.0009817	14.53426	0.0000000
similarity	0.0682907	0.0041894	16.30078	0.0000000

# Model the hypernym indicator from native language, number of synsets,
# Lg10WF and word2vec similarity, and print the coefficient table.
# NOTE(review): no `family` is supplied, so glm() uses its default
# gaussian family; pass `family = binomial` if a logistic regression is
# intended.
glm(
  hypernym ~ native.lang + num_syns + Lg10WF + similarity,
  data = wn
) %>%
  tidy() %>%
  kable(caption = "hypernyms")

hypernyms
term	estimate	std.error	statistic
(Intercept)	0.5951605	0.0026122	227.83481
native.langL2	0.0242372	0.0024075	10.06718
num_syns	-0.0013559	0.0001249	-10.85701
Lg10WF	-0.0665780	0.0010317	-64.52993
similarity	-0.0615787	0.0044027	-13.98649

With controls, we still see that L2 speakers produce more hypernyms and fewer synonyms.