Examine associates that are only produced by L2 speakers

Here I’m looking at the associates that are ONLY produced by L2 speakers for a cue, but I could in principle do a more fine-grained analysis.

What kind of hierarchical relations do these words have with the cues?

# Load the WordNet-annotated association data, then keep only the cues that
# occur for both speaker groups (native.lang "L1" and "L2").

d <- read_csv("data/wn_dclean.csv")

# Drop fully duplicated rows within each (native.lang, cue) group. The result
# stays grouped by native.lang and cue, so later users must ungroup() it.
unique.cues <- distinct(group_by(d, native.lang, cue))

# Per-group subsets of the de-duplicated rows.
L1words <- filter(unique.cues, native.lang == "L1")
L2words <- filter(unique.cues, native.lang == "L2")

# Cues that appear in both the L1 and the L2 subsets.
common_words <- intersect(L1words$cue, L2words$cue)

# Full data restricted to the shared cues.
d.common <- filter(d, cue %in% common_words)
# Collect the L2 responses whose (cue, associate) pair never occurs among the
# L1 responses to the same cue.

# One row per distinct (cue, associate) pair produced by L1 speakers;
# L1.present is a constant marker column.
L1.cue.associates <- d.common %>%
  filter(native.lang == "L1") %>%
  select(cue, associate, native.lang) %>%
  distinct() %>%
  mutate(L1.present = 1)

# anti_join keeps only the L2 rows that have no matching L1 (cue, associate)
# pair and adds no columns from the right-hand table.
L2.only <- d.common %>%
  filter(native.lang == "L2") %>%
  select(cue, associate, Lg10WF, relation_type, num_syns) %>%
  anti_join(L1.cue.associates, by = c("cue", "associate"))

Distribution of relation types for associates only produced by L2 speakers:

# Share of each relation_type among the L2-only rows.
relation.prop <- L2.only %>%
  count(relation_type) %>%
  mutate(prop = n / sum(n))

# Bar chart of those proportions. The fill legend is hidden because it would
# only repeat the x-axis labels.
ggplot(relation.prop) +
  geom_bar(
    aes(x = relation_type, y = prop, fill = relation_type),
    stat = "identity"
  ) +
  theme(legend.position = "none")

Relative to the overall distribution of relation types, associates produced only by L2 speakers are more likely to be hyponyms than synonyms.

Are these words produced from concrete or abstract cues?

# Compare mean cue concreteness (column Conc.M of brysbaert_corpus.csv)
# between the rows of all L2 cues and the rows behind L2-only associates.

# NOTE(review): this path starts with "../" while the other read_csv() calls
# in this file use "data/..." -- confirm it resolves from the same working
# directory.
conc <- read_csv("../data/brysbaert_corpus.csv") %>%
  select(Word, Conc.M)

# NOTE(review): L2words is derived from the full data set, whereas L2.only is
# derived from d.common (cues shared with L1) -- confirm that "all" is the
# intended baseline group.
conc.by.group <- L2words %>% # all L2 cues
  ungroup() %>% # L2words is still grouped by native.lang and cue
  select(cue) %>%
  mutate(group = "all") %>%
  bind_rows(L2.only %>% select(cue) %>% mutate(group = "only.L2")) %>% # only L2 cues
  left_join(conc, by = c("cue" = "Word")) %>%
  group_by(group) %>%
  # Spell out TRUE: T is an ordinary variable that can be reassigned.
  multi_boot_standard(column = "Conc.M", na.rm = TRUE)

# Mean concreteness per cue set, with the ci_lower/ci_upper interval returned
# by multi_boot_standard().
ggplot(conc.by.group, aes(y = mean, x = group)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(legend.position = "none") +
  xlab("set of cues") +
  ylab("mean concreteness")

Cues that produce unique associates for L2 speakers tend to be less concrete than cues overall.

How many synsets do these words have?

# Compare the mean number of synsets (num_syns) per cue between the rows of
# all L2 cues and the rows behind L2-only associates.

# NOTE(review): the original comment called the first group "all L2 (common
# L1) cues", but L2words is built from the full data set rather than from
# d.common -- confirm which baseline is intended.
syns.by.group <- L2words %>% # all L2 cues
  ungroup() %>% # L2words is still grouped by native.lang and cue
  select(cue) %>%
  mutate(group = "all") %>%
  bind_rows(L2.only %>% select(cue) %>% mutate(group = "only.L2")) %>% # only L2 cues
  # Explicit key: "cue" is the only shared column, so this matches the former
  # implicit natural join while silencing the "Joining, by" message.
  left_join(d %>% select(cue, num_syns) %>% distinct(), by = "cue") %>%
  group_by(group) %>%
  # Spell out TRUE: T is an ordinary variable that can be reassigned.
  multi_boot_standard(column = "num_syns", na.rm = TRUE)

# Mean synset count per cue set, with the ci_lower/ci_upper interval returned
# by multi_boot_standard().
ggplot(syns.by.group, aes(y = mean, x = group)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(legend.position = "none") +
  xlab("set of cues") +
  ylab("mean number of synsets")

Cues that produce unique associates for L2 speakers tend to have more synsets.

Is there an interaction between hypernymy/synonymy with concreteness?

To look at this, I did a median split on concreteness. Note that we don’t have the concreteness measure for all cues.

# Median split on concreteness, then the bootstrapped mean proportion of each
# semantic relation per speaker group and concreteness bin.

# Relation proportions with the concreteness rating attached; the X1 column
# read from the CSV is dropped.
all_prop_relations <- read_csv("data/prop_relations.csv") %>%
  select(-X1) %>%
  left_join(conc, by = c("cue" = "Word"))

# Median concreteness over distinct (cue, Conc.M) pairs, reduced to a bare
# number. TRUE is spelled out because T is a reassignable variable.
MEDIAN.CONC <- all_prop_relations %>%
  select(cue, Conc.M) %>%
  distinct() %>%
  summarize(median = median(Conc.M, na.rm = TRUE)) %>%
  as.numeric()

# Cues strictly above the median are "concrete"; cues at or below it are
# "abstract"; cues without a rating get NA.
all_prop_relations <- all_prop_relations %>%
  mutate(conc.bin = ifelse(Conc.M > MEDIAN.CONC, "concrete", "abstract"))

ms <- all_prop_relations %>%
  filter(!is.na(conc.bin)) %>%
  # Columns 3 to 7 are selected by position -- presumably the five
  # relation-proportion columns; verify if the CSV layout changes.
  gather("measure", "prop", 3:7) %>%
  group_by(native.lang, measure, conc.bin) %>%
  multi_boot_standard(column = "prop")

# Dodged bars per relation and concreteness bin, faceted by speaker group,
# with the ci_lower/ci_upper interval drawn at the same dodge positions.
ggplot(ms, aes(x = measure, y = mean, group = conc.bin, fill = conc.bin)) +
  geom_bar(position = "dodge", stat = "identity") +
  facet_grid(~native.lang) +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = .9)
  ) +
  ylab("prop") +
  xlab("semantic relation") +
  theme_bw()

Yes - there is an interaction. For both L1 and L2, synonyms tend to be more abstract, and hypernyms tend to be more concrete. This is consistent with the finding below that concrete cues are farther down in the tree and thus have more hypernyms.

How correlated are concreteness and depth in the hierarchy, and is the correlation positive?

# Mean hierarchy depth per cue joined with the concreteness ratings; rows
# missing either value are dropped.
depths <- read_csv("data/all_depths.csv") %>%
  select(-X1) %>%
  setNames(c("cue", "mean.depth")) %>%
  left_join(conc, by = c("cue" = "Word")) %>%
  filter(!is.na(Conc.M), !is.na(mean.depth))

# Histogram of mean depth across the retained words.
depths %>%
  ggplot(aes(x = mean.depth)) +
  geom_histogram() +
  ggtitle("distribution of depths across words")

There are a lot of words with depth = 1.

# Scatter plot of mean depth against concreteness with a linear fit.
depths %>%
  ggplot(aes(x = Conc.M, y = mean.depth)) +
  geom_point() +
  geom_smooth(method = "lm")

# Correlation test between depth and concreteness, printed as a one-row table
# without the parameter column.
with(depths, cor.test(mean.depth, Conc.M)) %>%
  tidy() %>%
  select(-parameter) %>%
  kable()
estimate statistic p.value conf.low conf.high method alternative
0.546815 61.66922 0 0.5320988 0.5612009 Pearson’s product-moment correlation two.sided

Concreteness and hierarchy depth are relatively highly correlated (r = .55): Words farther down in the hierarchy tend to be more concrete.

How much of the variability in the L1–L2 differences does abstractness account for?

# Regress the absolute t score of each cue on concreteness and log word
# frequency.
tscores <- read_csv("data/t_scores_full.csv")

# Distinct rows of the modelling columns, keeping only rows that have both
# predictors.
cues.ts <- tscores %>%
  select(cue, t, t_abs, Lg10WF, Conc.M) %>%
  distinct() %>%
  filter(!is.na(Lg10WF), !is.na(Conc.M))

lm(t_abs ~ Conc.M + Lg10WF, data = cues.ts) %>%
  summary()
## 
## Call:
## lm(formula = t_abs ~ Conc.M + Lg10WF, data = cues.ts)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.037458 -0.019999 -0.004734  0.014275  0.215862 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0416727  0.0012614  33.036  < 2e-16 ***
## Conc.M      -0.0013854  0.0002549  -5.435 5.62e-08 ***
## Lg10WF      -0.0014881  0.0003372  -4.413 1.03e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02539 on 8523 degrees of freedom
## Multiple R-squared:  0.005394,   Adjusted R-squared:  0.005161 
## F-statistic: 23.11 on 2 and 8523 DF,  p-value: 9.755e-11

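Answer: not that much. Both predictors are significant, but together they explain only about 0.5% of the variance (R² = .005).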