```r
library(tidyverse)  # dplyr, tidyr, stringr, ggplot2
library(knitr)      # kable
library(broom)      # tidy
library(langcog)    # multi_boot_standard (bootstrapped CIs)

d.clean = read.csv("../data/dclean.csv")

wn = read.csv("data/all_words.csv") %>%
  select(cue, word, relation_type, num_syns)

wn.clean = wn %>%
  mutate(word = str_replace_all(word, "_", " "),
         word = tolower(word))
```
```r
wn %>%
  filter(cue == "dog") %>%
  head() %>%
  kable(caption = "Just as a sanity check, here's what the data look like for dog")
```
| cue | word | relation_type | num_syns |
|---|---|---|---|
| dog | canid | hypernym | 8 |
| dog | carnivore | hypernym | 8 |
| dog | lad | hypernym | 8 |
| dog | pursue | hypernym | 8 |
| dog | being | hypernym | 8 |
| dog | move | hypernym | 8 |
96.44% of SWOW cues are in WordNet.
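Just as a sketch of how a coverage figure like this can be computed, assuming `d.clean` holds the full SWOW responses and `wn` contains every SWOW cue that matched in WordNet (both as defined above):

```r
# Proportion of distinct SWOW cues that also appear as WordNet cues
mean(unique(d.clean$cue) %in% unique(wn$cue))
```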
Merge SWOW and WN datasets.
```r
d = d.clean %>%
  select(userID, native.lang, cue, associate, Lg10WF) %>%
  inner_join(wn.clean, by = c("cue", "associate" = "word"))
```
I did the analyses on two samples: (1) a balanced set with the exact same set of words for both language groups (n = 7,804 words), and (2) all the words available in the sample (n = 10,000), with more words available for L1 than for L2.
Merge in cue information and get the proportion of each relation type by group and by cue.
```r
distinct.cues = wn %>%
  select(cue, num_syns) %>%
  distinct()

freqs = d %>%
  select(cue, Lg10WF) %>%
  distinct()

d.prop = d %>%
  group_by(native.lang, cue, relation_type) %>%
  summarise(n = n()) %>%
  mutate(prop = n / sum(n)) %>%
  select(-n) %>%
  spread(relation_type, prop) %>%
  mutate_all(~replace(., is.na(.), 0)) %>%  # missing relation types become 0
  left_join(distinct.cues, by = "cue") %>%
  left_join(freqs, by = "cue")
```
```r
d.prop %>%
  arrange(cue) %>%
  tail() %>%
  kable(caption = "This is what the data look like:")
```
| native.lang | cue | holonym | hypernym | hyponym | meronym | synonym | num_syns | Lg10WF |
|---|---|---|---|---|---|---|---|---|
| L1 | zone | 0.0000000 | 0.9130435 | 0.0434783 | 0.0000000 | 0.0434783 | 6 | 3.0116 |
| L2 | zone | 0.0000000 | 1.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 6 | 3.0116 |
| L1 | zoo | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 1.0000000 | 1 | 2.8432 |
| L1 | zoom | 0.0000000 | 0.8181818 | 0.0000000 | 0.0000000 | 0.1818182 | 5 | 2.2601 |
| L1 | zucchini | 0.0823529 | 0.7529412 | 0.0000000 | 0.0823529 | 0.0823529 | 2 | 1.6990 |
| L2 | zucchini | 0.0000000 | 1.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 2 | 1.6990 |
```r
l2words = d.prop %>%
  ungroup() %>%
  filter(native.lang == "L2") %>%
  pull(cue) %>%
  unique()

l1words = d.prop %>%
  ungroup() %>%
  filter(native.lang == "L1") %>%
  pull(cue) %>%
  unique()

common_words = intersect(l2words, l1words)

d.prop.common = d.prop %>%
  filter(cue %in% common_words)
```
There are 7,804 words for each group.
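As a quick sanity check (a sketch using the objects defined above), both the number of shared cues and the per-group word counts should come out to 7,804:

```r
length(common_words)

d.prop.common %>%
  group_by(native.lang) %>%
  summarise(n_words = n_distinct(cue))
```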
```r
d.prop.common %>%
  gather("relation", "prop", 3:7) %>%
  ggplot(aes(x = prop, fill = native.lang)) +
  facet_wrap(~relation, scales = "free") +
  geom_density(alpha = .3) +
  theme_bw()
```
Error bars are bootstrapped 95% CIs.
```r
ms = d.prop.common %>%
  gather("measure", "prop", 3:7) %>%
  group_by(native.lang, measure) %>%
  multi_boot_standard(column = "prop")

ggplot(ms, aes(x = measure, y = mean, group = native.lang,
               fill = native.lang)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower,
                     ymax = ci_upper),
                 position = position_dodge(width = .9)) +
  ylab("prop") +
  xlab("semantic relation") +
  theme_bw()
```
Predicting the proportion of each relation type, controlling for the number of synsets and the spoken frequency of the word, with a separate model for each relation type. There's a reliable effect of language group for hypernyms, hyponyms, and synonyms.
```r
lm(hypernym ~ native.lang + num_syns + Lg10WF,
   data = d.prop.common) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "hypernym")
```
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.6456406 | 0.0107840 | 59.870491 | 0.0000000 |
| native.langL2 | 0.0200710 | 0.0061566 | 3.260094 | 0.0011163 |
| num_syns | -0.0023204 | 0.0006057 | -3.831233 | 0.0001280 |
| Lg10WF | -0.0812274 | 0.0044369 | -18.307303 | 0.0000000 |
```r
lm(hyponym ~ native.lang + num_syns + Lg10WF,
   data = d.prop.common) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "hyponym")
```
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.0621570 | 0.0079946 | 7.774827 | 0.0000000 |
| native.langL2 | -0.0104062 | 0.0045642 | -2.279985 | 0.0226229 |
| num_syns | 0.0040788 | 0.0004490 | 9.084080 | 0.0000000 |
| Lg10WF | 0.0414817 | 0.0032893 | 12.611224 | 0.0000000 |
```r
lm(synonym ~ native.lang + num_syns + Lg10WF,
   data = d.prop.common) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "synonym")
```
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.2522185 | 0.0101796 | 24.7767421 | 0.0000000 |
| native.langL2 | -0.0114671 | 0.0058116 | -1.9731387 | 0.0484985 |
| num_syns | 0.0003212 | 0.0005717 | 0.5618444 | 0.5742307 |
| Lg10WF | 0.0276110 | 0.0041883 | 6.5924982 | 0.0000000 |
```r
d.prop %>%
  gather("relation", "prop", 3:7) %>%
  ggplot(aes(x = prop, fill = native.lang)) +
  facet_wrap(~relation, scales = "free") +
  geom_density(alpha = .3) +
  theme_bw()
```
Error bars are bootstrapped 95% CIs.
```r
ms = d.prop %>%
  gather("measure", "prop", 3:7) %>%
  group_by(native.lang, measure) %>%
  multi_boot_standard(column = "prop")

ggplot(ms, aes(x = measure, y = mean, group = native.lang,
               fill = native.lang)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower,
                     ymax = ci_upper),
                 position = position_dodge(width = .9)) +
  ylab("prop") +
  xlab("semantic relation") +
  theme_bw()
```
Predicting the proportion of each relation type, controlling for the number of synsets and the spoken frequency of the word, with a separate model for each relation type. There's a reliable effect of language group for hypernyms and synonyms, but not for hyponyms.
```r
lm(hypernym ~ native.lang + num_syns + Lg10WF,
   data = d.prop) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "hypernym")
```
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.6239006 | 0.0102481 | 60.879764 | 0.0000000 |
| native.langL2 | 0.0308380 | 0.0059803 | 5.156624 | 0.0000003 |
| num_syns | -0.0020450 | 0.0006003 | -3.406598 | 0.0006594 |
| Lg10WF | -0.0775709 | 0.0042740 | -18.149668 | 0.0000000 |
```r
lm(hyponym ~ native.lang + num_syns + Lg10WF,
   data = d.prop) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "hyponym")
```
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.0538834 | 0.0074551 | 7.2277351 | 0.0000000 |
| native.langL2 | -0.0007715 | 0.0043504 | -0.1773363 | 0.8592465 |
| num_syns | 0.0043687 | 0.0004367 | 10.0036941 | 0.0000000 |
| Lg10WF | 0.0401660 | 0.0031091 | 12.9186788 | 0.0000000 |
```r
lm(synonym ~ native.lang + num_syns + Lg10WF,
   data = d.prop) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "synonym")
```
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.2881453 | 0.0098725 | 29.1867423 | 0.0000000 |
| native.langL2 | -0.0360246 | 0.0057611 | -6.2530934 | 0.0000000 |
| num_syns | -0.0003665 | 0.0005783 | -0.6336929 | 0.5262903 |
| Lg10WF | 0.0249184 | 0.0041173 | 6.0521115 | 0.0000000 |
```r
d.prop.common = d.prop.common %>%
  mutate(rel.hyper = hypernym / hyponym) %>%
  # the ratio is non-finite whenever the hyponym proportion is 0
  filter(is.finite(rel.hyper))
```
```r
d.prop.common %>%
  ggplot(aes(x = rel.hyper, fill = native.lang)) +
  geom_density(alpha = .3) +
  theme_bw()
```
Note that we lose a lot of words here: the ratio is undefined for any cue whose hyponym proportion is zero.
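To quantify how many words drop out, here's a sketch (using `d.prop` and `common_words` from above) counting the cue-by-group rows with a non-finite ratio:

```r
# Rows lost to an undefined hyper/hypo ratio (hyponym proportion of 0)
d.prop %>%
  filter(cue %in% common_words) %>%
  group_by(native.lang) %>%
  summarise(n_dropped = sum(!is.finite(hypernym / hyponym)))
```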
Error bars are bootstrapped 95% CIs.
```r
ms = d.prop.common %>%
  group_by(native.lang) %>%
  multi_boot_standard(column = "rel.hyper", na.rm = TRUE)

ggplot(ms, aes(x = native.lang, y = mean,
               fill = native.lang)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower,
                     ymax = ci_upper),
                 position = position_dodge(width = .9)) +
  ylab("hyper/hypo") +
  xlab("Language Group") +
  theme_bw() +
  theme(legend.position = "none")
```
I think this difference is just an artifact of the L1 distribution being more right-skewed than the L2 distribution.
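One way to check this would be to compare medians, which are robust to right skew in a way that means are not (a sketch on the objects above):

```r
# If the group difference shrinks at the median, skew is the likely culprit
d.prop.common %>%
  group_by(native.lang) %>%
  summarise(mean = mean(rel.hyper),
            median = median(rel.hyper))
```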