Read in data and munge

d.clean = read.csv("../data/dclean.csv")

wn = read.csv("data/all_words.csv") %>%
     select(cue, word, relation_type, num_syns)

wn.clean = wn %>%
  mutate(word = str_replace_all(word, "_", " "),
         word = tolower(word))
wn %>%
  filter(cue == "dog") %>%
  head() %>%
  kable(caption = "Just as a sanity check, here's what the data look like for dog")
Just as a sanity check, here’s what the data look like for dog

| cue | word      | relation_type | num_syns |
|-----|-----------|---------------|----------|
| dog | canid     | hypernym      | 8        |
| dog | carnivore | hypernym      | 8        |
| dog | lad       | hypernym      | 8        |
| dog | pursue    | hypernym      | 8        |
| dog | being     | hypernym      | 8        |
| dog | move      | hypernym      | 8        |

96.44% of SWOW cues are in WordNet.
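A minimal sketch of how that coverage figure could be computed (an assumption about the calculation, not code from the original analysis), using the cue columns read in above:

# Assumed coverage check: share of distinct SWOW cues that appear as a WordNet cue
swow_cues = unique(d.clean$cue)
wn_cues = unique(wn$cue)
round(100 * mean(swow_cues %in% wn_cues), 2)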

Merge SWOW and WN datasets.

d = d.clean %>%
  select(userID, native.lang, cue, associate, Lg10WF) %>%
  inner_join(wn.clean, by=c("cue", "associate"="word")) 

Relation type by language group

I did the analyses on two samples: (1) a balanced set with the exact same set of words for both language groups (n = 7804 words), and (2) all the words available in the sample (n = 10,000), where L1 contributes more words than L2.
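As a quick check of the per-group cue counts behind those numbers, a sketch using the merged data frame d defined above:

# Distinct cues contributed by each language group after the WordNet merge
d %>%
  distinct(native.lang, cue) %>%
  count(native.lang)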

Merge in cue information and get relation-type proportions by language group and cue.

distinct.cues = wn %>%
  select(cue, num_syns) %>%
  distinct()

freqs = d %>%
  select(cue, Lg10WF) %>%
  distinct()

d.prop = d %>%
  group_by(native.lang, cue, relation_type) %>%
  summarise(n = n()) %>%
  mutate(prop = n / sum(n)) %>%
  select(-n) %>%
  spread(relation_type, prop) %>%
  mutate_all(~ replace(., is.na(.), 0)) %>%
  left_join(distinct.cues) %>%
  left_join(freqs)
d.prop %>%
  arrange(cue) %>%
  tail() %>%
  kable(caption = "This is what the data look like:")
This is what the data look like:

| native.lang | cue      | holonym   | hypernym  | hyponym   | meronym   | synonym   | num_syns | Lg10WF |
|-------------|----------|-----------|-----------|-----------|-----------|-----------|----------|--------|
| L1          | zone     | 0.0000000 | 0.9130435 | 0.0434783 | 0.0000000 | 0.0434783 | 6        | 3.0116 |
| L2          | zone     | 0.0000000 | 1.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 6        | 3.0116 |
| L1          | zoo      | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 1.0000000 | 1        | 2.8432 |
| L1          | zoom     | 0.0000000 | 0.8181818 | 0.0000000 | 0.0000000 | 0.1818182 | 5        | 2.2601 |
| L1          | zucchini | 0.0823529 | 0.7529412 | 0.0000000 | 0.0823529 | 0.0823529 | 2        | 1.6990 |
| L2          | zucchini | 0.0000000 | 1.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 2        | 1.6990 |

Balanced word sets

l2words = d.prop %>% 
  ungroup() %>%
  filter(native.lang == "L2") %>%
  select(cue) 

l2words = unique(l2words$cue)

l1words = d.prop %>% 
  ungroup() %>%
  filter(native.lang == "L1") %>%
  select(cue) 

l1words = unique(l1words$cue)

common_words = intersect(l2words, l1words)

d.prop.common = d.prop %>%
  filter(cue %in% common_words)

There are 7804 words for each group.
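A quick sanity check on that count, using the objects defined just above:

# Balanced set size, overall and per language group
length(common_words)

d.prop.common %>%
  group_by(native.lang) %>%
  summarise(n_cues = n_distinct(cue))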

Proportion distributions

d.prop.common %>%
  gather("relation", "prop", 3:7) %>%
  ggplot(aes(x = prop, fill = native.lang)) +
  facet_wrap(~relation, scales = "free") +
  geom_density(alpha = .3) +
  theme_bw()

Mean proportion by relation type

Error bars are 95% CIs
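multi_boot_standard comes from the langcog package and returns the mean plus bootstrapped ci_lower/ci_upper columns used in the plot below. As a rough sketch of what it is doing, assuming its defaults (a nonparametric bootstrap of the mean with percentile CIs; this is not the package's actual implementation):

# Hypothetical stand-in for multi_boot_standard: bootstrap the group mean and
# take the 2.5% / 97.5% percentiles as the CI (assumed defaults)
boot_mean_ci = function(x, n_boot = 1000) {
  boots = replicate(n_boot, mean(sample(x, replace = TRUE)))
  tibble(ci_lower = quantile(boots, .025),
         mean = mean(x),
         ci_upper = quantile(boots, .975))
}

# e.g., d.prop.common %>%
#   gather("measure", "prop", 3:7) %>%
#   group_by(native.lang, measure) %>%
#   do(boot_mean_ci(.$prop))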

ms = d.prop.common %>%
  gather("measure", "prop", 3:7) %>%
  group_by(native.lang, measure) %>%
  multi_boot_standard(column = "prop")

ggplot(ms, aes(x=measure, y = mean, group = native.lang,
             fill = native.lang)) +
   geom_bar(position = "dodge", stat= "identity") +
   geom_linerange(aes(ymin = ci_lower, 
                     ymax = ci_upper), 
                 position=position_dodge(width = .9)) +
   ylab("prop") +
   xlab("semantic relation") +
   theme_bw()

Regressions

Predicting the proportion of each relation type, controlling for number of synsets and spoken frequency of the word. A separate model for each relation type. There’s a reliable effect of language group for hypernyms, hyponyms, and synonyms.

lm(hypernym ~ native.lang + num_syns + Lg10WF, 
   data = d.prop.common) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "hypernym")

hypernym

| term          |   estimate | std.error |  statistic |   p.value |
|---------------|------------|-----------|------------|-----------|
| (Intercept)   |  0.6456406 | 0.0107840 |  59.870491 | 0.0000000 |
| native.langL2 |  0.0200710 | 0.0061566 |   3.260094 | 0.0011163 |
| num_syns      | -0.0023204 | 0.0006057 |  -3.831233 | 0.0001280 |
| Lg10WF        | -0.0812274 | 0.0044369 | -18.307303 | 0.0000000 |
lm(hyponym ~ native.lang + num_syns + Lg10WF, 
   data = d.prop.common) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "hyponym")

hyponym

| term          |   estimate | std.error | statistic |   p.value |
|---------------|------------|-----------|-----------|-----------|
| (Intercept)   |  0.0621570 | 0.0079946 |  7.774827 | 0.0000000 |
| native.langL2 | -0.0104062 | 0.0045642 | -2.279985 | 0.0226229 |
| num_syns      |  0.0040788 | 0.0004490 |  9.084080 | 0.0000000 |
| Lg10WF        |  0.0414817 | 0.0032893 | 12.611224 | 0.0000000 |
lm(synonym ~ native.lang + num_syns + Lg10WF, 
   data = d.prop.common) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "synonym")

synonym

| term          |   estimate | std.error |  statistic |   p.value |
|---------------|------------|-----------|------------|-----------|
| (Intercept)   |  0.2522185 | 0.0101796 | 24.7767421 | 0.0000000 |
| native.langL2 | -0.0114671 | 0.0058116 | -1.9731387 | 0.0484985 |
| num_syns      |  0.0003212 | 0.0005717 |  0.5618444 | 0.5742307 |
| Lg10WF        |  0.0276110 | 0.0041883 |  6.5924982 | 0.0000000 |

All words

Proportion distributions

d.prop %>%
  gather("relation", "prop", 3:7) %>%
  ggplot(aes(x = prop, fill = native.lang)) +
  facet_wrap(~relation, scales = "free") +
  geom_density(alpha = .3) +
  theme_bw()

Mean proportion by relation type

Error bars are 95% CIs

ms = d.prop %>%
  gather("measure", "prop", 3:7) %>%
  group_by(native.lang, measure) %>%
  multi_boot_standard(column = "prop")

ggplot(ms, aes(x=measure, y = mean, group = native.lang,
             fill = native.lang)) +
   geom_bar(position = "dodge", stat= "identity") +
   geom_linerange(aes(ymin = ci_lower, 
                     ymax = ci_upper), 
                 position=position_dodge(width = .9)) +
  ylab("prop") +
  xlab("semantic relation") +
  theme_bw()

Regressions

Predicting the proportion of each relation type, controlling for number of synsets and spoken frequency of the word. A separate model for each relation type. There’s a reliable effect of language group for hypernyms and synonyms.

lm(hypernym ~ native.lang + num_syns + Lg10WF, 
   data = d.prop) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "hypernym")

hypernym

| term          |   estimate | std.error |  statistic |   p.value |
|---------------|------------|-----------|------------|-----------|
| (Intercept)   |  0.6239006 | 0.0102481 |  60.879764 | 0.0000000 |
| native.langL2 |  0.0308380 | 0.0059803 |   5.156624 | 0.0000003 |
| num_syns      | -0.0020450 | 0.0006003 |  -3.406598 | 0.0006594 |
| Lg10WF        | -0.0775709 | 0.0042740 | -18.149668 | 0.0000000 |
lm(hyponym ~ native.lang + num_syns + Lg10WF, 
   data = d.prop) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "hyponym")

hyponym

| term          |   estimate | std.error |  statistic |   p.value |
|---------------|------------|-----------|------------|-----------|
| (Intercept)   |  0.0538834 | 0.0074551 |  7.2277351 | 0.0000000 |
| native.langL2 | -0.0007715 | 0.0043504 | -0.1773363 | 0.8592465 |
| num_syns      |  0.0043687 | 0.0004367 | 10.0036941 | 0.0000000 |
| Lg10WF        |  0.0401660 | 0.0031091 | 12.9186788 | 0.0000000 |
lm(synonym ~ native.lang + num_syns + Lg10WF, 
   data = d.prop) %>%
  summary() %>%
  tidy() %>%
  kable(caption = "synonym")

synonym

| term          |   estimate | std.error |  statistic |   p.value |
|---------------|------------|-----------|------------|-----------|
| (Intercept)   |  0.2881453 | 0.0098725 | 29.1867423 | 0.0000000 |
| native.langL2 | -0.0360246 | 0.0057611 | -6.2530934 | 0.0000000 |
| num_syns      | -0.0003665 | 0.0005783 | -0.6336929 | 0.5262903 |
| Lg10WF        |  0.0249184 | 0.0041173 |  6.0521115 | 0.0000000 |

Hyper:Hypo by language group

Balanced word sets

d.prop.common = d.prop.common %>%
  mutate(rel.hyper = hypernym/hyponym)%>%
  filter(is.finite(rel.hyper))

d.prop.common %>%
  ggplot(aes(x = rel.hyper, fill = native.lang)) +
  geom_density(alpha = .3) +
  theme_bw()

Note we’re losing a lot of words here: cues with a hyponym proportion of 0 produce non-finite ratios and are dropped by the is.finite() filter.
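A sketch to quantify that loss, recomputing the ratio from d.prop (since d.prop.common has already been filtered above):

# Cues dropped per group by the is.finite() filter (hyponym == 0 gives Inf or NaN)
d.prop %>%
  filter(cue %in% common_words) %>%
  mutate(rel.hyper = hypernym / hyponym) %>%
  group_by(native.lang) %>%
  summarise(kept = sum(is.finite(rel.hyper)),
            dropped = sum(!is.finite(rel.hyper)))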

Mean hyper:hypo ratio

Error bars are 95% CIs

ms = d.prop.common %>%
  group_by(native.lang) %>%
  multi_boot_standard(column = "rel.hyper", na.rm = T)

ggplot(ms, aes(x=native.lang, y = mean,
             fill = native.lang)) +
   geom_bar(position = "dodge", stat= "identity") +
   geom_linerange(aes(ymin = ci_lower, 
                     ymax = ci_upper), 
                 position=position_dodge(width = .9)) +
  ylab("hyper/hypo") +
  xlab("Language Group") +
  theme_bw() +
  theme(legend.position = "none")

I think this difference is just an artifact of the L1 distribution being more right-skewed than the L2 distribution.
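A sketch of one way to check that, comparing the mean, median, and a rough moment-based skewness of the ratio within each group:

# If the L1/L2 difference in means is driven by skew, the medians should look more similar
d.prop.common %>%
  group_by(native.lang) %>%
  summarise(mean_ratio = mean(rel.hyper),
            median_ratio = median(rel.hyper),
            skewness = mean((rel.hyper - mean(rel.hyper))^3) / sd(rel.hyper)^3)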