Here I’m looking at the associates that are ONLY produced by L2 speakers for a cue, but I could in principle do a more fine-grained analysis.
# Read in the wordnet data and restrict it to cues seen in both speaker groups.
d <- read_csv("data/wn_dclean.csv")

# De-duplicate rows within each (native.lang, cue) group.
# NOTE(review): distinct() is called without column arguments, so only rows
# duplicated across *all* columns are dropped; the result stays grouped and can
# hold several rows per cue. Code further down calls ungroup() on L2words --
# confirm that row-level (not one-row-per-cue) data is what is wanted here.
unique.cues <- d %>%
  group_by(native.lang, cue) %>%
  distinct()

# Split the de-duplicated rows by speaker group.
L1words <- filter(unique.cues, native.lang == "L1")
L2words <- filter(unique.cues, native.lang == "L2")

# Cues that occur for both L1 and L2 speakers, and the rows of d for those cues.
common_words <- intersect(L1words$cue, L2words$cue)
d.common <- filter(d, cue %in% common_words)
# Find L2 associates that no L1 speaker produced for the same cue.

# Distinct (cue, associate) pairs given by L1 speakers, flagged with
# L1.present = 1 so they can be recognised after the join below.
L1.cue.associates <- d.common %>%
  filter(native.lang == "L1") %>%
  select(cue, associate, native.lang) %>%
  distinct() %>%
  mutate(L1.present = 1)

# Start from every L2 row, attach the L1 flag by (cue, associate), and keep
# only the rows where the flag is missing, i.e. the pair never occurs for L1.
# The helper flag column is dropped again at the end.
L2.only <- d.common %>%
  filter(native.lang == "L2") %>%
  select(cue, associate, Lg10WF, relation_type, num_syns) %>%
  left_join(
    select(L1.cue.associates, -native.lang),
    by = c("cue", "associate")
  ) %>%
  filter(is.na(L1.present)) %>%
  select(-L1.present)
Distribution of relation types for associates only produced by L2 speakers:
# Share of each relation type among the L2-only rows:
# n = number of rows per relation_type, prop = n divided by the total row count.
relation.prop <- L2.only %>%
  count(relation_type) %>%
  mutate(prop = n / sum(n))

# Bar chart of those proportions, one bar per relation type; the legend is
# hidden because the fill colour duplicates the x axis.
ggplot(
  relation.prop,
  aes(x = relation_type, y = prop, fill = relation_type)
) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none")
Relative to the overall distribution of relation types, associates produced only by L2 speakers are more likely to be hyponyms than synonyms.
# Concreteness table: keep only the word and its Conc.M rating.
# NOTE(review): this path starts with "../" while the other read_csv() calls in
# this file use "data/..." -- confirm the working-directory assumption.
conc <- read_csv("../data/brysbaert_corpus.csv") %>%
  select(Word, Conc.M)

# Stack the two cue sets ("all" = every L2words row, "only.L2" = cues of the
# L2-only associates), attach concreteness by cue, and summarise Conc.M per set.
# multi_boot_standard() is defined outside this chunk; its mean / ci_lower /
# ci_upper columns are what the plot below reads.
conc.by.group <- bind_rows(
  L2words %>% ungroup() %>% select(cue) %>% mutate(group = "all"),
  L2.only %>% select(cue) %>% mutate(group = "only.L2")
) %>%
  left_join(conc, by = c("cue" = "Word")) %>%
  group_by(group) %>%
  multi_boot_standard(column = "Conc.M", na.rm = TRUE)

# Point estimate with interval for each cue set.
ggplot(conc.by.group, aes(x = group, y = mean)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  theme(legend.position = "none") +
  xlab("set of cues") +
  ylab("mean concreteness")
Cues that produce unique associates for L2 speakers tend to be less concrete than cues overall.
# Mean number of synsets (num_syns) per cue set: every L2words row ("all")
# versus the cues of the L2-only associates ("only.L2").
# NOTE(review): L2words is not restricted to common_words, whereas L2.only is
# derived from d.common -- confirm that the "all" set is meant to include cues
# that never occur for L1 speakers.
syns.by.group <- L2words %>% # all L2 cues
  ungroup() %>%
  select(cue) %>%
  mutate(group = "all") %>%
  bind_rows(L2.only %>% select(cue) %>% mutate(group = "only.L2")) %>% # only L2 cues
  # Join key spelled out: "cue" is the only column the two tables share, so
  # this matches the former implicit natural join but no longer depends on it
  # (and no longer prints the "Joining, by" message).
  left_join(d %>% select(cue, num_syns) %>% distinct(), by = "cue") %>%
  group_by(group) %>%
  # multi_boot_standard() is defined outside this chunk; its mean / ci_lower /
  # ci_upper columns are what the plot below reads.
  multi_boot_standard(column = "num_syns", na.rm = TRUE)

# Point estimate with interval for each cue set.
ggplot(syns.by.group, aes(y = mean, x = group)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  theme(legend.position = "none") +
  xlab("set of cues") +
  ylab("mean number of synsets")
Cues that produce unique associates for L2 speakers tend to have more synsets.
To look at this, I did a median split on concreteness. Note that we don’t have the concreteness measure for all cues.
# Relation proportions per cue, with the Conc.M rating attached by cue.
# X1 is dropped first (presumably a row-index column written with the CSV --
# confirm).
all_prop_relations <- read_csv("data/prop_relations.csv") %>%
  select(-X1) %>%
  left_join(conc, by = c("cue" = "Word"))

# Median Conc.M over distinct (cue, Conc.M) pairs, ignoring missing ratings.
MEDIAN.CONC <- all_prop_relations %>%
  select(cue, Conc.M) %>%
  distinct() %>%
  with(median(Conc.M, na.rm = TRUE))

# Median split: strictly above the median is "concrete", otherwise "abstract";
# rows without a rating get NA and are filtered out below.
all_prop_relations <- all_prop_relations %>%
  mutate(conc.bin = ifelse(Conc.M > MEDIAN.CONC, "concrete", "abstract"))

# Reshape columns 3 to 7 into long (measure, prop) form and summarise prop per
# speaker group x measure x concreteness bin.
# NOTE(review): 3:7 selects columns by position -- verify it still covers the
# intended proportion columns if the CSV layout changes.
ms <- all_prop_relations %>%
  filter(!is.na(conc.bin)) %>%
  gather("measure", "prop", 3:7) %>%
  group_by(native.lang, measure, conc.bin) %>%
  multi_boot_standard(column = "prop")

# Dodged bars with interval lines, one facet per speaker group.
ggplot(
  ms,
  aes(x = measure, y = mean, group = conc.bin, fill = conc.bin)
) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_grid(~native.lang) +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = .9)
  ) +
  ylab("prop") +
  xlab("semantic relation") +
  theme_bw()
Yes - there is an interaction. For both L1 and L2, synonyms tend to be more abstract, and hypernyms tend to be more concrete. This is consistent with the finding below that concrete cues are farther down in the tree and thus have more hypernyms.
# Load the t-score table.
tscores <- read_csv("data/t_scores_full.csv")

# Distinct (cue, t, t_abs, Lg10WF, Conc.M) rows, keeping only those where both
# predictors are present.
cues.ts <- tscores %>%
  select(cue, t, t_abs, Lg10WF, Conc.M) %>%
  distinct() %>%
  filter(!is.na(Lg10WF), !is.na(Conc.M))

# Linear model of t_abs on Conc.M and Lg10WF; print the model summary.
lm(t_abs ~ Conc.M + Lg10WF, data = cues.ts) %>%
  summary()
##
## Call:
## lm(formula = t_abs ~ Conc.M + Lg10WF, data = cues.ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.037458 -0.019999 -0.004734 0.014275 0.215862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0416727 0.0012614 33.036 < 2e-16 ***
## Conc.M -0.0013854 0.0002549 -5.435 5.62e-08 ***
## Lg10WF -0.0014881 0.0003372 -4.413 1.03e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02539 on 8523 degrees of freedom
## Multiple R-squared: 0.005394, Adjusted R-squared: 0.005161
## F-statistic: 23.11 on 2 and 8523 DF, p-value: 9.755e-11
Answer: not that much.