Examine associates that are only produced by L2 speakers

Here I’m looking at the associates that are ONLY produced by L2 speakers for a cue, but I could in principle do a more fine-grained analysis.

# Read in wordnet data and get common cues only.
d = read_csv("data/wn_dclean.csv")

# get common cues
unique.cues = d %>%
  distinct(native.lang, cue)   # one row per cue per speaker group

L1_cues = unique.cues %>%
  filter(native.lang == "L1")

L2_cues = unique.cues %>%
  filter(native.lang == "L2")

common_cues = intersect(L1_cues$cue, L2_cues$cue)

d.common = d %>%
  filter(cue %in% common_cues)
# Get L2 associates not produced by L1 speakers for a particular cue.

L1_cue_associates = d.common %>% # common cue
  select(cue, associate, native.lang) %>%
  filter(native.lang == "L1") %>%
  distinct() %>%
  mutate(L1.present = 1)

L2_error_cues_associates = d.common %>%
  filter(native.lang == "L2") %>%
  select(cue, associate, Lg10WF, relation_type, num_syns) %>%
  left_join(L1_cue_associates %>% select(-native.lang),
            by = c("cue", "associate")) %>%
  filter(is.na(L1.present)) %>%
  select(-L1.present)
# Get L2 cues that elicit those errors
L2_error_cues = L2_error_cues_associates %>%
  select(cue) %>%
  distinct() 

34% of all cues elicited L2-only associates.
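
For reference, the 34% figure can be recomputed directly from the objects above (a quick sketch; no new data needed):

# proportion of common cues with at least one L2-only associate
nrow(L2_error_cues) / length(common_cues)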

Are cues that elicit errors different?

concreteness

conc <- read_csv("../data/brysbaert_corpus.csv") %>%
  select(Word, Conc.M)

L2_correct_cues = common_cues[!(common_cues %in% L2_error_cues$cue)] 

conc.by.group = data.frame(cue = L2_correct_cues, group = "L2_correct") %>%
  bind_rows(data.frame(L2_error_cues, group = "L2_error")) %>% # only L2 cues 
  left_join(conc, by=c("cue" = "Word")) %>%
  group_by(group) %>%
  multi_boot_standard(column = "Conc.M", na.rm = T)

ggplot(conc.by.group, aes(y = mean, x = group)) + 
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(legend.position= "none") +
  xlab("set of cues") +
  ylab("mean concreteness")

synsets

syns.by.group = data.frame(cue = L2_correct_cues, group = "L2_correct") %>%
  bind_rows(data.frame(L2_error_cues, group = "L2_error")) %>% # only L2 cues 
  left_join(d %>% select(cue, num_syns) %>% distinct()) %>%
  group_by(group) %>%
  multi_boot_standard(column = "num_syns", na.rm = T)

ggplot(syns.by.group, aes(y = mean, x = group)) + 
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(legend.position= "none") +
  xlab("set of cues") +
  ylab("mean number of synsets")

depth

depths = read_csv("data/all_depths.csv") %>%
  select(-X1) %>%
  setNames(c("cue", "mean.depth"))

depth.by.group = data.frame(cue = L2_correct_cues, group = "L2_correct") %>%
  bind_rows(data.frame(L2_error_cues, group = "L2_error")) %>%
  left_join(depths %>% select(cue, mean.depth) %>% distinct()) %>%
  group_by(group) %>%
  multi_boot_standard(column = "mean.depth", na.rm = T)

ggplot(depth.by.group, aes(y = mean, x = group)) + 
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(legend.position= "none") +
  xlab("set of cues") +
  ylab("mean cue depth")

How correlated are these different measures of cues?

syns_conc_depth = data.frame(cue = L2_correct_cues, group = "L2_correct") %>%
  bind_rows(data.frame(L2_error_cues, group = "L2_error"))  %>%
  left_join(d %>% select(cue, num_syns) %>% distinct()) %>%
  left_join(conc, by=c("cue" = "Word")) %>%
  left_join(depths %>% select(cue, mean.depth) %>% distinct()) 

syns_conc_depth %>%
  select(-cue, -group) %>%
  correlate() %>%
  shave() %>%
  fashion() %>%
  kable()
|rowname    | num_syns| Conc.M| mean.depth|
|:----------|--------:|------:|----------:|
|num_syns   |         |       |           |
|Conc.M     |     -.02|       |           |
|mean.depth |     -.26|    .52|           |

Concreteness and hierarchy depth are relatively highly correlated (r = .52): Words farther down in the hierarchy tend to be more concrete.

Depth and number of senses are also (negatively) correlated (r = -.26): Words farther down in the hierarchy tend to have fewer senses.

Number of senses and concreteness are uncorrelated.
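
If we want p-values for these pairwise correlations, cor.test can be run on the same data frame (a quick sketch; cor.test drops incomplete pairs):

# significance tests for the pairwise correlations reported above
with(syns_conc_depth, cor.test(mean.depth, Conc.M))
with(syns_conc_depth, cor.test(mean.depth, num_syns))
with(syns_conc_depth, cor.test(num_syns, Conc.M))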

What kinds of hierarchical relations do these L2-only associates have with their cues?

Distribution of relation types for associates only produced by L2 speakers:

relation.prop = L2_error_cues_associates  %>%
  group_by(relation_type) %>%
  summarise (n = n()) %>%
  mutate(prop = n / sum(n)) 

ggplot(relation.prop, aes(x = relation_type, y = prop, fill = relation_type)) + 
  geom_bar(stat = "identity") +
  theme(legend.position= "none")

Relative to the overall distribution of relation types, associates produced only by L2 speakers are more likely to be hyponyms than synonyms.
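
The "overall distribution" isn't computed in the chunk above; here is a minimal sketch of the comparison (it assumes relation_type is coded for every response in d.common):

# overall distribution of relation types across all responses, for comparison
overall.relation.prop = d.common %>%
  group_by(relation_type) %>%
  summarise(n = n()) %>%
  mutate(prop = n / sum(n))

overall.relation.prop %>%
  left_join(relation.prop, by = "relation_type",
            suffix = c(".all", ".L2_only"))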

Is there an interaction between hypernymy/synonymy and concreteness?

To look at this, I did a median split on concreteness. Note that we don't have the concreteness measure for all cues.

all_prop_relations = read_csv("data/prop_relations.csv") %>%
  select(-X1) %>%
  left_join(conc, by=c("cue" = "Word"))

MEDIAN.CONC = all_prop_relations %>%
  select(cue, Conc.M) %>%
  distinct() %>%
  summarize(median = median(Conc.M, na.rm = T)) %>%
  as.numeric()

all_prop_relations %<>% 
  mutate(conc.bin = ifelse(Conc.M > MEDIAN.CONC, "concrete", "abstract"))

ms = all_prop_relations %>%
  filter(!is.na(conc.bin)) %>%
  gather("measure", "prop", 3:7) %>%
  group_by(native.lang, measure, conc.bin) %>%
  multi_boot_standard(column = "prop")

ggplot(ms, aes(x=measure, y = mean, group = conc.bin,
             fill = conc.bin)) +
   geom_bar(position = "dodge", stat= "identity") +
   facet_grid(~native.lang) +
   geom_linerange(aes(ymin = ci_lower, 
                     ymax = ci_upper), 
                 position=position_dodge(width = .9)) +
   ylab("prop") +
   xlab("semantic relation") +
   theme_bw()

Yes - there is an interaction. For both L1 and L2, synonyms tend to be more abstract, and hypernyms tend to be more concrete. This is consistent with the finding above that concrete cues are farther down in the tree and thus have more hypernyms.
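
The interaction is read off the plot rather than tested; a rough sketch of a test is below. Note that the relation labels "hypernyms" and "synonyms" are assumptions (check them against the actual column names in prop_relations.csv), and that this simple lm ignores the by-cue dependence in the proportions.

# rough test of the relation-type x concreteness interaction (sketch)
interaction_data = all_prop_relations %>%
  filter(!is.na(conc.bin)) %>%
  gather("measure", "prop", 3:7) %>%
  filter(measure %in% c("hypernyms", "synonyms"))  # assumed labels

summary(lm(prop ~ measure * conc.bin, data = interaction_data))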

How much of the variability in the L1-L2 differences does abstractness account for?

tscores = read_csv("data/t_scores_full.csv")
cues.ts =  tscores %>%
  select(cue, t, t_abs, Lg10WF, Conc.M) %>%
  distinct() %>%
  filter(!is.na(Lg10WF) & !is.na(Conc.M))

summary(lm(scale(t_abs) ~ scale(Conc.M) + scale(Lg10WF), data = cues.ts))
## 
## Call:
## lm(formula = scale(t_abs) ~ scale(Conc.M) + scale(Lg10WF), data = cues.ts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4714 -0.7856 -0.1860  0.5607  8.4792 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.270e-17  1.080e-02   0.000        1    
## scale(Conc.M) -5.883e-02  1.082e-02  -5.435 5.62e-08 ***
## scale(Lg10WF) -4.776e-02  1.082e-02  -4.413 1.03e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9974 on 8523 degrees of freedom
## Multiple R-squared:  0.005394,   Adjusted R-squared:  0.005161 
## F-statistic: 23.11 on 2 and 8523 DF,  p-value: 9.755e-11

Answer: not that much (R^2 = .005). But note that this analysis is on t-scores.