```r
library(tidyverse)  # dplyr, tidyr, stringr, ggplot2
library(knitr)      # kable
library(broom)      # tidy
library(langcog)    # multi_boot_standard (bootstrapped CIs)

d.clean = read.csv("../data/dclean.csv")

wn = read.csv("data/all_words.csv") %>%
  select(cue, word, relation_type, num_syns)

wn.clean = wn %>%
  mutate(word = str_replace_all(word, "_", " "),
         word = tolower(word))
```
```r
wn %>%
  filter(cue == "dog") %>%
  head() %>%
  kable(caption = "Just as a sanity check, here's what the data look like for dog")
```
| cue | word | relation_type | num_syns |
|---|---|---|---|
| dog | canid | hypernym | 8 |
| dog | carnivore | hypernym | 8 |
| dog | lad | hypernym | 8 |
| dog | pursue | hypernym | 8 |
| dog | being | hypernym | 8 |
| dog | move | hypernym | 8 |
96.44% of SWOW cues are in WordNet.
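Just as a sketch of how a coverage figure like this can be computed, assuming `d.clean` holds the full SWOW responses and `wn` contains every SWOW cue that matched in WordNet (both as defined above):

```r
# Proportion of distinct SWOW cues that also appear as WordNet cues
mean(unique(d.clean$cue) %in% unique(wn$cue))
```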
Merge SWOW and WN datasets.
```r
d = d.clean %>%
  select(userID, native.lang, cue, associate, Lg10WF) %>%
  inner_join(wn.clean, by = c("cue", "associate" = "word"))
```
I did the analyses on two samples: (1) a balanced set with the exact same set of words for both language groups (n = 7,804 words), and (2) all the words available in the sample (n = 10,000), with more words available for L1 than for L2.
Merge in cue information and get the proportion of each relation type by group and by cue.
```r
distinct.cues = wn %>%
  select(cue, num_syns) %>%
  distinct()

freqs = d %>%
  select(cue, Lg10WF) %>%
  distinct()

d.prop = d %>%
  group_by(native.lang, cue, relation_type) %>%
  summarise(n = n()) %>%
  mutate(prop = n / sum(n)) %>%
  select(-n) %>%
  spread(relation_type, prop) %>%
  mutate_all(~replace(., is.na(.), 0)) %>%  # missing relation types become 0
  left_join(distinct.cues, by = "cue") %>%
  left_join(freqs, by = "cue")
```
```r
d.prop %>%
  arrange(cue) %>%
  tail() %>%
  kable(caption = "This is what the data look like:")
```
| native.lang | cue | holonym | hypernym | hyponym | meronym | synonym | num_syns | Lg10WF |
|---|---|---|---|---|---|---|---|---|
| L1 | zone | 0.0000000 | 0.9130435 | 0.0434783 | 0.0000000 | 0.0434783 | 6 | 3.0116 |
| L2 | zone | 0.0000000 | 1.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 6 | 3.0116 |
| L1 | zoo | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 1.0000000 | 1 | 2.8432 |
| L1 | zoom | 0.0000000 | 0.8181818 | 0.0000000 | 0.0000000 | 0.1818182 | 5 | 2.2601 |
| L1 | zucchini | 0.0823529 | 0.7529412 | 0.0000000 | 0.0823529 | 0.0823529 | 2 | 1.6990 |
| L2 | zucchini | 0.0000000 | 1.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 2 | 1.6990 |
```r
l2words = d.prop %>%
  ungroup() %>%
  filter(native.lang == "L2") %>%
  pull(cue) %>%
  unique()

l1words = d.prop %>%
  ungroup() %>%
  filter(native.lang == "L1") %>%
  pull(cue) %>%
  unique()

common_words = intersect(l2words, l1words)

d.prop.common = d.prop %>%
  filter(cue %in% common_words)
```
There are 7,804 words for each group.
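As a quick sanity check (a sketch using the objects defined above), both the number of shared cues and the per-group word counts should come out to 7,804:

```r
length(common_words)

d.prop.common %>%
  group_by(native.lang) %>%
  summarise(n_words = n_distinct(cue))
```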
```r
d.prop.common %>%
  gather("relation", "prop", 3:7) %>%
  ggplot(aes(x = prop, fill = native.lang)) +
  facet_wrap(~relation, scales = "free") +
  geom_density(alpha = .3) +
  theme_bw()
```
Error bars are bootstrapped 95% CIs.
```r
ms = d.prop.common %>%
  gather("measure", "prop", 3:7) %>%
  group_by(native.lang, measure) %>%
  multi_boot_standard(column = "prop")

ggplot(ms, aes(x = measure, y = mean, group = native.lang,
               fill = native.lang)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower,
                     ymax = ci_upper),
                 position = position_dodge(width = .9)) +
  ylab("prop") +
  xlab("semantic relation") +
  theme_bw()
```
Predicting the proportion of each relation type, controlling for the number of synsets and the spoken frequency of the word, with a separate model for each relation type. There's a reliable effect of language group for hypernyms, hyponyms, and synonyms.
```r
lm(hypernym ~ native.lang + num_syns + Lg10WF,
   data = d.prop.common) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "hypernym")
```
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.6456406 | 0.0107840 | 59.870491 | 0.0000000 |
| native.langL2 | 0.0200710 | 0.0061566 | 3.260094 | 0.0011163 |
| num_syns | -0.0023204 | 0.0006057 | -3.831233 | 0.0001280 |
| Lg10WF | -0.0812274 | 0.0044369 | -18.307303 | 0.0000000 |
```r
lm(hyponym ~ native.lang + num_syns + Lg10WF,
   data = d.prop.common) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "hyponym")
```
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.0621570 | 0.0079946 | 7.774827 | 0.0000000 |
| native.langL2 | -0.0104062 | 0.0045642 | -2.279985 | 0.0226229 |
| num_syns | 0.0040788 | 0.0004490 | 9.084080 | 0.0000000 |
| Lg10WF | 0.0414817 | 0.0032893 | 12.611224 | 0.0000000 |
```r
lm(synonym ~ native.lang + num_syns + Lg10WF,
   data = d.prop.common) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "synonym")
```
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.2522185 | 0.0101796 | 24.7767421 | 0.0000000 |
| native.langL2 | -0.0114671 | 0.0058116 | -1.9731387 | 0.0484985 |
| num_syns | 0.0003212 | 0.0005717 | 0.5618444 | 0.5742307 |
| Lg10WF | 0.0276110 | 0.0041883 | 6.5924982 | 0.0000000 |
```r
d.prop %>%
  gather("relation", "prop", 3:7) %>%
  ggplot(aes(x = prop, fill = native.lang)) +
  facet_wrap(~relation, scales = "free") +
  geom_density(alpha = .3) +
  theme_bw()
```
Error bars are bootstrapped 95% CIs.
```r
ms = d.prop %>%
  gather("measure", "prop", 3:7) %>%
  group_by(native.lang, measure) %>%
  multi_boot_standard(column = "prop")

ggplot(ms, aes(x = measure, y = mean, group = native.lang,
               fill = native.lang)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower,
                     ymax = ci_upper),
                 position = position_dodge(width = .9)) +
  ylab("prop") +
  xlab("semantic relation") +
  theme_bw()
```
Predicting the proportion of each relation type, controlling for the number of synsets and the spoken frequency of the word, with a separate model for each relation type. There's a reliable effect of language group for hypernyms and synonyms, but not for hyponyms.
```r
lm(hypernym ~ native.lang + num_syns + Lg10WF,
   data = d.prop) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "hypernym")
```
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.6239006 | 0.0102481 | 60.879764 | 0.0000000 |
| native.langL2 | 0.0308380 | 0.0059803 | 5.156624 | 0.0000003 |
| num_syns | -0.0020450 | 0.0006003 | -3.406598 | 0.0006594 |
| Lg10WF | -0.0775709 | 0.0042740 | -18.149668 | 0.0000000 |
```r
lm(hyponym ~ native.lang + num_syns + Lg10WF,
   data = d.prop) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "hyponym")
```
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.0538834 | 0.0074551 | 7.2277351 | 0.0000000 |
| native.langL2 | -0.0007715 | 0.0043504 | -0.1773363 | 0.8592465 |
| num_syns | 0.0043687 | 0.0004367 | 10.0036941 | 0.0000000 |
| Lg10WF | 0.0401660 | 0.0031091 | 12.9186788 | 0.0000000 |
```r
lm(synonym ~ native.lang + num_syns + Lg10WF,
   data = d.prop) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "synonym")
```
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.2881453 | 0.0098725 | 29.1867423 | 0.0000000 |
| native.langL2 | -0.0360246 | 0.0057611 | -6.2530934 | 0.0000000 |
| num_syns | -0.0003665 | 0.0005783 | -0.6336929 | 0.5262903 |
| Lg10WF | 0.0249184 | 0.0041173 | 6.0521115 | 0.0000000 |
```r
d.prop.common = d.prop.common %>%
  mutate(rel.hyper = hypernym / hyponym) %>%
  # the ratio is non-finite whenever the hyponym proportion is 0
  filter(is.finite(rel.hyper))
```
```r
d.prop.common %>%
  ggplot(aes(x = rel.hyper, fill = native.lang)) +
  geom_density(alpha = .3) +
  theme_bw()
```
Note that we lose a lot of words here: the ratio is undefined for any cue whose hyponym proportion is zero.
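To quantify how many words drop out, here's a sketch (using `d.prop` and `common_words` from above) counting the cue-by-group rows with a non-finite ratio:

```r
# Rows lost to an undefined hyper/hypo ratio (hyponym proportion of 0)
d.prop %>%
  filter(cue %in% common_words) %>%
  group_by(native.lang) %>%
  summarise(n_dropped = sum(!is.finite(hypernym / hyponym)))
```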
Error bars are bootstrapped 95% CIs.
```r
ms = d.prop.common %>%
  group_by(native.lang) %>%
  multi_boot_standard(column = "rel.hyper", na.rm = TRUE)

ggplot(ms, aes(x = native.lang, y = mean,
               fill = native.lang)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower,
                     ymax = ci_upper),
                 position = position_dodge(width = .9)) +
  ylab("hyper/hypo") +
  xlab("Language Group") +
  theme_bw() +
  theme(legend.position = "none")
```
I think this difference is just an artifact of the L1 distribution being more right-skewed than the L2 distribution.
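One way to check this would be to compare medians, which are robust to right skew in a way that means are not (a sketch on the objects above):

```r
# If the group difference shrinks at the median, skew is the likely culprit
d.prop.common %>%
  group_by(native.lang) %>%
  summarise(mean = mean(rel.hyper),
            median = median(rel.hyper))
```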