Here I’m looking at the associates that are ONLY produced by L2 speakers for a cue, but I could in principle do a more fine-grained analysis.
# Load the wordnet data; the steps below restrict it to cues that appear for
# both speaker groups (native.lang == "L1" and native.lang == "L2").
d <- read_csv("data/wn_dclean.csv")

# Distinct rows of the data, grouped by speaker group and cue.
unique.cues <- distinct(group_by(d, native.lang, cue))

# Split by speaker group.
L1_cues <- filter(unique.cues, native.lang == "L1")
L2_cues <- filter(unique.cues, native.lang == "L2")

# Cues present in both groups, and the data restricted to those cues.
common_cues <- intersect(L1_cues$cue, L2_cues$cue)
d.common <- filter(d, cue %in% common_cues)

# Every distinct (cue, associate) pair produced by L1 speakers on the common
# cues, flagged with L1.present = 1.
L1_cue_associates <- d.common %>%
  filter(native.lang == "L1") %>%
  select(cue, associate, native.lang) %>%
  distinct()
L1_cue_associates$L1.present <- 1

# L2 responses whose (cue, associate) pair never occurs among the L1 pairs.
# anti_join keeps the left-hand rows (and only the left-hand columns) that
# have no match on the join keys.
L2_responses <- d.common %>%
  filter(native.lang == "L2") %>%
  select(cue, associate, Lg10WF, relation_type, num_syns)
L2_error_cues_associates <- anti_join(L2_responses,
                                      L1_cue_associates,
                                      by = c("cue", "associate"))

# Cues that elicited at least one such L2-only associate (one row per cue).
L2_error_cues <- distinct(select(L2_error_cues_associates, cue))
34% of all cues elicited L2-only associates.
# Concreteness norms: keep only the word and its Conc.M rating column.
# NOTE(review): this input is read from "../data/" while the other inputs in
# this analysis are read from "data/" -- confirm the path is intentional.
conc <- read_csv("../data/brysbaert_corpus.csv") %>%
  select(Word, Conc.M)

# Common cues that are NOT in L2_error_cues, i.e. cues for which no L2-only
# associate was found.
L2_correct_cues <- common_cues[!(common_cues %in% L2_error_cues$cue)]

# Label each cue as "L2_correct" or "L2_error", attach its concreteness
# rating, and summarise Conc.M per group with multi_boot_standard (the plot
# below uses its mean, ci_lower and ci_upper columns). Cues without a
# concreteness rating are dropped via na.rm.
conc.by.group <- data.frame(cue = L2_correct_cues, group = "L2_correct") %>%
  bind_rows(data.frame(L2_error_cues, group = "L2_error")) %>% # only L2 cues
  left_join(conc, by = c("cue" = "Word")) %>%
  group_by(group) %>%
  multi_boot_standard(column = "Conc.M", na.rm = TRUE)

# Group means with their confidence intervals.
ggplot(conc.by.group, aes(y = mean, x = group)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(legend.position = "none") +
  xlab("set of cues") +
  ylab("mean concreteness")
# Number of synsets per cue, compared between cues with and without L2-only
# associates. Each cue is labelled "L2_correct" or "L2_error", joined to its
# num_syns value from the full data, and summarised per group with
# multi_boot_standard (mean, ci_lower, ci_upper are used in the plot).
# The join key is spelled out so the match cannot silently change if the
# inputs ever gain another shared column name.
syns.by.group <- data.frame(cue = L2_correct_cues, group = "L2_correct") %>%
  bind_rows(data.frame(L2_error_cues, group = "L2_error")) %>% # only L2 cues
  left_join(d %>% select(cue, num_syns) %>% distinct(), by = "cue") %>%
  group_by(group) %>%
  multi_boot_standard(column = "num_syns", na.rm = TRUE)

# Group means with their confidence intervals.
ggplot(syns.by.group, aes(y = mean, x = group)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(legend.position = "none") +
  xlab("set of cues") +
  ylab("mean number of synsets")
# Per-cue depth values. The X1 column is dropped and the two remaining
# columns are renamed to cue and mean.depth.
# NOTE(review): "X1" is the name older readr versions give an unnamed column;
# readr >= 2.0 names it "...1" instead -- confirm against the installed
# version.
depths <- read_csv("data/all_depths.csv") %>%
  select(-X1) %>%
  setNames(c("cue", "mean.depth"))

# Label each cue as "L2_correct" or "L2_error", attach its mean.depth, and
# summarise per group with multi_boot_standard (mean, ci_lower, ci_upper are
# used in the plot). The join key is explicit so it cannot silently change.
depth.by.group <- data.frame(cue = L2_correct_cues, group = "L2_correct") %>%
  bind_rows(data.frame(L2_error_cues, group = "L2_error")) %>%
  left_join(depths %>% select(cue, mean.depth) %>% distinct(), by = "cue") %>%
  group_by(group) %>%
  multi_boot_standard(column = "mean.depth", na.rm = TRUE)

# Group means with their confidence intervals.
ggplot(depth.by.group, aes(y = mean, x = group)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(legend.position = "none") +
  xlab("set of cues") +
  ylab("mean cue depth")
Distribution of relation types for associates only produced by L2 speakers:
# Tally the L2-only associates by relation type and convert each tally into
# its share of all L2-only associates.
relation.counts <- count(L2_error_cues_associates, relation_type)
relation.prop <- mutate(relation.counts, prop = n / sum(n))

# One bar per relation type; the fill duplicates the x axis, so the legend is
# hidden.
ggplot(relation.prop,
       aes(x = relation_type, y = prop, fill = relation_type)) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none")
Relative to the overall distribution of relation types, associates produced only by L2 speakers are more likely to be hyponyms than synonyms.
To look at this, I did a median split on concreteness. Note that we don’t have the concreteness measure for all cues.
# Proportions of each relation type, with the concreteness rating (Conc.M)
# of each cue attached from `conc`.
all_prop_relations <- read_csv("data/prop_relations.csv") %>%
  select(-X1) %>%
  left_join(conc, by = c("cue" = "Word"))

# Median concreteness across distinct (cue, Conc.M) pairs, so a cue that
# occupies several rows is not counted more than once. Cues without a rating
# are ignored.
cue.conc <- all_prop_relations %>%
  select(cue, Conc.M) %>%
  distinct()
MEDIAN.CONC <- median(cue.conc$Conc.M, na.rm = TRUE)

# Median split: strictly above the median is "concrete", otherwise
# "abstract"; cues with no rating get NA and are filtered out below.
all_prop_relations <- all_prop_relations %>%
  mutate(conc.bin = ifelse(Conc.M > MEDIAN.CONC, "concrete", "abstract"))

# Reshape columns 3 to 7 into long form (measure, prop) and summarise prop
# per speaker group, measure and concreteness bin with multi_boot_standard.
# NOTE(review): 3:7 selects columns by position -- presumably the five
# relation-type proportion columns; confirm against prop_relations.csv.
ms <- all_prop_relations %>%
  filter(!is.na(conc.bin)) %>%
  gather("measure", "prop", 3:7) %>%
  group_by(native.lang, measure, conc.bin) %>%
  multi_boot_standard(column = "prop")

# Dodged bars per measure and concreteness bin, one panel per speaker group,
# with confidence intervals drawn as line ranges aligned to the bars.
ggplot(ms, aes(x = measure, y = mean, group = conc.bin,
               fill = conc.bin)) +
  geom_bar(position = "dodge", stat = "identity") +
  facet_grid(~native.lang) +
  geom_linerange(aes(ymin = ci_lower,
                     ymax = ci_upper),
                 position = position_dodge(width = .9)) +
  ylab("prop") +
  xlab("semantic relation") +
  theme_bw()
Yes - there is an interaction. For both L1 and L2, synonyms tend to be more abstract, and hypernyms tend to be more concrete. This is consistent with the finding below that concrete cues are farther down in the tree and thus have more hypernyms.
# Per-cue t-scores together with frequency (Lg10WF) and concreteness
# (Conc.M).
tscores <- read_csv("data/t_scores_full.csv")

# Distinct rows of the selected columns, keeping only those where both
# predictors are available.
cues.ts <- tscores %>%
  select(cue, t, t_abs, Lg10WF, Conc.M) %>%
  distinct() %>%
  filter(!is.na(Lg10WF), !is.na(Conc.M))

# Regress the standardised absolute t-score on standardised concreteness and
# frequency, then print the model summary.
ts.model <- lm(scale(t_abs) ~ scale(Conc.M) + scale(Lg10WF), data = cues.ts)
summary(ts.model)
##
## Call:
## lm(formula = scale(t_abs) ~ scale(Conc.M) + scale(Lg10WF), data = cues.ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4714 -0.7856 -0.1860 0.5607 8.4792
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.270e-17 1.080e-02 0.000 1
## scale(Conc.M) -5.883e-02 1.082e-02 -5.435 5.62e-08 ***
## scale(Lg10WF) -4.776e-02 1.082e-02 -4.413 1.03e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9974 on 8523 degrees of freedom
## Multiple R-squared: 0.005394, Adjusted R-squared: 0.005161
## F-statistic: 23.11 on 2 and 8523 DF, p-value: 9.755e-11
Answer: not that much. R^2 = .005. But note that this is on t-scores.