Do the properties of the cue influence the associates differently for speakers of different L2 groups? In particular, we look at the sentiment and concreteness of the cue.
SUMMARY: For family split, no differences. For genus split, trends for sentiment and concreteness.
Split language groups
Based on language family and genus
# Lookup from the dataset's language code (renamed to ISO2) to ISO3.
# NOTE(review): presumably a two-letter -> three-letter ISO mapping; confirm
# against data/ISO_2_3.csv.
two_to_three <- read.csv("data/ISO_2_3.csv") %>%
  rename(ISO2 = langName_ISO)

# Genus and family for each WALS language code.
lang_codes_genus <- read.csv("data/lang_codes_with_genus.csv") %>%
  select(wals.code, name, genus, family)

# Based on ISO3, merge in language info from WALS.
# Some info is missing from the WALS csv even though it is on the website
# (see lang_codes_supp.csv), hence the second file.
codes <- bind_rows(read.csv("data/lang_codes.csv"),
                   read.csv("data/lang_codes_supp.csv")) %>%
  select(ascii_name, id, iso_codes, latitude, longitude, macroarea) %>%
  # In cases where there are two ISO codes, take the first. An empty
  # iso_codes string yields NA here.
  mutate(iso_codes = as.factor(vapply(strsplit(as.character(iso_codes), ","),
                                      function(x) x[1],
                                      character(1)))) %>%
  rename(ISO3 = iso_codes,
         wals.code = id) %>%
  # Drop entries without an ISO3 code: dplyr joins match NA to NA by default,
  # so a retained NA row would attach an arbitrary language's family/genus to
  # every data row whose ISO3 is missing.
  filter(!is.na(ISO3)) %>%
  # Keep one WALS entry per ISO3 code so the later join cannot fan out.
  group_by(ISO3) %>%
  slice(1) %>%
  ungroup() %>%
  left_join(lang_codes_genus, by = "wals.code")
# Attach ISO3 codes and the WALS language info to every response, then keep
# only rows whose native.lang is not "L1".
d.clean.fam <- d.clean %>%
  rename(ISO2 = langName_ISO) %>%
  left_join(two_to_three) %>%
  left_join(codes, by = "ISO3") %>%
  filter(native.lang != "L1")
Distribution of languages
# Rows per language (ISO3), most frequent first; only languages with more
# than 150 rows are drawn.
d.clean.fam %>%
  count(ISO3) %>%
  mutate(ISO3 = fct_reorder(ISO3, n, .desc = TRUE)) %>%
  filter(n > 150) %>%
  ggplot(aes(x = ISO3, y = n, fill = ISO3)) +
  geom_bar(stat = "identity") +
  ggtitle("N Languages") +
  theme_bw() +
  theme(legend.position = "None",
        axis.text.x = element_text(angle = 90, hjust = 1))

# Rows per WALS macroarea.
d.clean.fam %>%
  ggplot(aes(x = factor(macroarea), y = ..count..)) +
  geom_bar(stat = "count") +
  ggtitle("N macroareas") +
  xlab("macroarea") +
  theme_bw()

# Rows per genus, most frequent first.
d.clean.fam %>%
  count(genus) %>%
  mutate(genus = fct_reorder(genus, n, .desc = TRUE)) %>%
  ggplot(aes(x = genus, y = n)) +
  geom_bar(stat = "identity") +
  ggtitle("N genera") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Rows per language family, most frequent first.
d.clean.fam %>%
  count(family) %>%
  mutate(family = fct_reorder(family, n, .desc = TRUE)) %>%
  ggplot(aes(x = family, y = n)) +
  geom_bar(stat = "identity") +
  ggtitle("N family") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

There is no obvious way to split the languages here. Try Indo-European vs. non-Indo-European? Also try Germanic vs. Romance, excluding all other languages.
# Two binary groupings used below:
#   family.split - "IE" for Indo-European, "other" for every other family
#   genus.split  - "germanic" / "romance"; NA for any other genus
# A missing family or genus stays NA in the corresponding split.
d.clean.fam <- d.clean.fam %>%
  mutate(
    family.split = as.factor(
      ifelse(family == "Indo-European", "IE", "other")
    ),
    genus.split = as.factor(
      ifelse(genus == "Germanic", "germanic",
             ifelse(genus == "Romance", "romance", NA))
    )
  )
t-score between family
Indo-European vs. Non-Indo-European
t-scores
Calculate t-scores (see t_L1_L2.Rmd for details about t-score calculation)
# Restrict to bigrams produced by BOTH groups, so every t-score compares two
# observed relative frequencies. intersect() already de-duplicates, so no
# unique() pass over the full data frames is needed.
IE.bigrams <- filter(d.clean.fam, family.split == "IE")
other.bigrams <- filter(d.clean.fam, family.split == "other")
shared.bigrams <- intersect(IE.bigrams$bigram,
                            other.bigrams$bigram)

# Rows with an NA family.split are dropped here: otherwise they pass the
# %in% filter, form their own group in the counts below, and surface as a
# spurious `<NA>` column after spread().
d.common <- d.clean.fam %>%
  ungroup() %>%
  filter(!is.na(family.split),
         bigram %in% shared.bigrams)

# C(vw) in each group (bigram counts)
bigram.counts <- d.common %>%
  group_by(family.split, bigram, associate, cue) %>%
  summarize(n_bigram = n()) %>%
  ungroup()

# C(v) in each group (associate counts)
associate.counts <- d.common %>%
  group_by(family.split, associate) %>%
  summarize(n_associate = n()) %>%
  ungroup()

# C(vw)/C(v) in each group (relative frequency of the bigram)
bigram.counts.rf <- bigram.counts %>%
  left_join(associate.counts, by = c("family.split", "associate")) %>%
  mutate(rf = n_bigram / n_associate)

# One row per cue-associate pair with the two groups' relative frequencies
# side by side; t > 0 means the pair is relatively more frequent in IE.
t.scores <- bigram.counts.rf %>%
  as_tibble() %>%
  select(family.split, rf, cue, associate) %>%
  spread(family.split, rf) %>%
  filter(!is.na(IE), !is.na(other)) %>%
  mutate(t = (IE - other) / sqrt(IE + other))
t-scores
# Histogram of the signed t-scores across cue-associate pairs.
t.scores %>%
  ggplot(aes(x = t)) +
  geom_histogram() +
  ggtitle("distribution of t-scores") +
  theme_bw()

absolute t-scores
# Histogram of the magnitude of the t-scores (sign discarded).
t.scores %>%
  ggplot(aes(x = abs(t))) +
  geom_histogram() +
  ggtitle("distribution of t-scores") +
  theme_bw()

Predicting divergence with sentiment
Join t.scores and cues characteristics
# One row of cue-level norms per cue (first row seen for each cue).
# `cue` is selected explicitly and the grouping dropped, instead of relying
# on select() to re-add the grouping column implicitly.
cues.chars <- d.clean %>%
  group_by(cue) %>%
  slice(1) %>%
  ungroup() %>%
  select(cue, Lg10WF, quant.sent, Conc.M, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum)

# Per-cue mean signed t and mean absolute t.
# summarize() evaluates its arguments in order and later expressions see
# earlier results, so t_abs MUST be computed before `t` is overwritten;
# otherwise mean(abs(t)) silently becomes abs(mean(t)).
# NOTE(review): re-knit to refresh the absolute-t tables and figures.
t.scores.full <- t.scores %>%
  group_by(cue) %>%
  summarize(t_abs = mean(abs(t)),
            t = mean(t)) %>%
  select(cue, t, t_abs) %>%
  left_join(cues.chars, by = "cue")

# Analysis table for the sentiment models: cues with both Lg10WF and
# V.Mean.Sum available.
cues.ts <- t.scores.full %>%
  select(cue, t, t_abs, Lg10WF, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum) %>%
  distinct() %>%
  filter(!is.na(Lg10WF) & !is.na(V.Mean.Sum))
t
Look at correlation between norms
# Pairwise correlations between signed t and the cue norms.
# Columns are dropped by name: cues.ts has only 7 columns, so the former
# positional `-8` pointed past the end of the table.
correlate(cues.ts %>% select(-cue, -t_abs)) %>%
  shave() %>%
  fashion() %>%
  kable()
| t |
|
|
|
|
|
| Lg10WF |
-.07 |
|
|
|
|
| V.Mean.Sum |
-.03 |
.15 |
|
|
|
| A.Mean.Sum |
-.01 |
.05 |
-.16 |
|
|
| D.Mean.Sum |
-.03 |
.13 |
.72 |
-.16 |
|
Valence
# Signed t regressed on cue valence, with Lg10WF as covariate.
lm(t ~ V.Mean.Sum + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0183671 |
0.0051882 |
3.540208 |
0.0004026 |
| V.Mean.Sum |
-0.0011793 |
0.0007809 |
-1.510246 |
0.1310298 |
| Lg10WF |
-0.0076865 |
0.0015144 |
-5.075500 |
0.0000004 |
# Linear fit of signed t against cue valence.
cues.ts %>%
  ggplot(aes(x = V.Mean.Sum, y = t)) +
  geom_smooth(method = "lm") +
  theme_bw()

Arousal
# Signed t regressed on cue arousal, with Lg10WF as covariate.
lm(t ~ A.Mean.Sum + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0151588 |
0.0058605 |
2.5866241 |
0.0097139 |
| A.Mean.Sum |
-0.0005239 |
0.0010996 |
-0.4764394 |
0.6337776 |
| Lg10WF |
-0.0079993 |
0.0014989 |
-5.3369393 |
0.0000001 |
# Linear fit of signed t against cue arousal.
cues.ts %>%
  ggplot(aes(x = A.Mean.Sum, y = t)) +
  geom_smooth(method = "lm") +
  theme_bw()

Dominance
# Signed t regressed on cue dominance, with Lg10WF as covariate.
lm(t ~ D.Mean.Sum + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0212512 |
0.0064713 |
3.283915 |
0.0010292 |
| D.Mean.Sum |
-0.0016883 |
0.0010761 |
-1.568933 |
0.1167128 |
| Lg10WF |
-0.0077179 |
0.0015103 |
-5.110039 |
0.0000003 |
# Linear fit of signed t against cue dominance.
cues.ts %>%
  ggplot(aes(x = D.Mean.Sum, y = t)) +
  geom_smooth(method = "lm") +
  theme_bw()

absolute-t
Look at correlation between norms
# Pairwise correlations between absolute t and the cue norms.
# Columns are dropped by name: cues.ts has only 7 columns, so the former
# positional `-8` pointed past the end of the table.
correlate(cues.ts %>% select(-cue, -t)) %>%
  shave() %>%
  fashion() %>%
  kable()
| t_abs |
|
|
|
|
|
| Lg10WF |
.04 |
|
|
|
|
| V.Mean.Sum |
.00 |
.15 |
|
|
|
| A.Mean.Sum |
-.01 |
.05 |
-.16 |
|
|
| D.Mean.Sum |
.00 |
.13 |
.72 |
-.16 |
|
Valence
# Absolute t regressed on cue valence, with Lg10WF as covariate.
lm(t_abs ~ V.Mean.Sum + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0511886 |
0.0036532 |
14.0119146 |
0.0000000 |
| V.Mean.Sum |
-0.0002869 |
0.0005499 |
-0.5217506 |
0.6018620 |
| Lg10WF |
0.0037558 |
0.0010664 |
3.5220418 |
0.0004312 |
# Linear fit of absolute t against cue valence.
cues.ts %>%
  ggplot(aes(x = V.Mean.Sum, y = t_abs)) +
  geom_smooth(method = "lm") +
  theme_bw()

Arousal
# Absolute t regressed on cue arousal, with Lg10WF as covariate.
lm(t_abs ~ A.Mean.Sum + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0520861 |
0.0041259 |
12.6241783 |
0.0000000 |
| A.Mean.Sum |
-0.0005411 |
0.0007741 |
-0.6989587 |
0.4846032 |
| Lg10WF |
0.0037065 |
0.0010552 |
3.5124955 |
0.0004470 |
# Linear fit of absolute t against cue arousal.
cues.ts %>%
  ggplot(aes(x = A.Mean.Sum, y = t_abs)) +
  geom_smooth(method = "lm") +
  theme_bw()

Dominance
# Absolute t regressed on cue dominance, with Lg10WF as covariate.
lm(t_abs ~ D.Mean.Sum + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0508040 |
0.0045569 |
11.1488520 |
0.0000000 |
| D.Mean.Sum |
-0.0001875 |
0.0007577 |
-0.2474894 |
0.8045374 |
| Lg10WF |
0.0037065 |
0.0010635 |
3.4851058 |
0.0004952 |
# Linear fit of absolute t against cue dominance.
cues.ts %>%
  ggplot(aes(x = D.Mean.Sum, y = t_abs)) +
  geom_smooth(method = "lm") +
  theme_bw()

Predicting divergence with concreteness
t
# Rebuild the per-cue analysis table for concreteness: cues with both
# Lg10WF and Conc.M available. This overwrites the sentiment version of
# cues.ts.
cues.ts <- t.scores.full %>%
  select(cue, t, t_abs, Lg10WF, Conc.M) %>%
  distinct() %>%
  filter(!is.na(Lg10WF), !is.na(Conc.M))

# Signed t regressed on cue concreteness, with Lg10WF as covariate.
lm(t ~ Conc.M + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0020643 |
0.0044096 |
0.4681337 |
0.6397027 |
| Conc.M |
0.0010431 |
0.0008850 |
1.1786794 |
0.2385635 |
| Lg10WF |
-0.0049230 |
0.0011807 |
-4.1696399 |
0.0000309 |
# Linear fit of signed t against cue concreteness.
cues.ts %>%
  ggplot(aes(x = Conc.M, y = t)) +
  # geom_point() +   (scatter layer left commented out)
  geom_smooth(method = "lm") +
  theme_bw()

absolute-t
# Absolute t regressed on cue concreteness, with Lg10WF as covariate.
lm(t_abs ~ Conc.M + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0473497 |
0.0031062 |
15.2436649 |
0.0000000 |
| Conc.M |
0.0001023 |
0.0006234 |
0.1640293 |
0.8697125 |
| Lg10WF |
0.0045919 |
0.0008317 |
5.5211576 |
0.0000000 |
# Linear fit of absolute t against cue concreteness.
cues.ts %>%
  ggplot(aes(x = Conc.M, y = t_abs)) +
  # geom_point() +   (scatter layer left commented out)
  geom_smooth(method = "lm") +
  theme_bw()

t-score between genus
Romance vs. Germanic
get t-scores
Calculate t-scores
# Restrict to bigrams produced by BOTH genera, so every t-score compares two
# observed relative frequencies. intersect() already de-duplicates, so no
# unique() pass over the full data frames is needed.
g.bigrams <- filter(d.clean.fam, genus.split == "germanic")
r.bigrams <- filter(d.clean.fam, genus.split == "romance")
shared.bigrams <- intersect(g.bigrams$bigram,
                            r.bigrams$bigram)

# genus.split is NA for every language that is neither Germanic nor Romance.
# Those rows are dropped here: otherwise they pass the %in% filter, form
# their own group in the counts below, and surface as a spurious `<NA>`
# column after spread().
d.common <- d.clean.fam %>%
  ungroup() %>%
  filter(!is.na(genus.split),
         bigram %in% shared.bigrams)

# C(vw) in each genus (bigram counts)
bigram.counts <- d.common %>%
  group_by(genus.split, bigram, associate, cue) %>%
  summarize(n_bigram = n()) %>%
  ungroup()

# C(v) in each genus (associate counts)
associate.counts <- d.common %>%
  group_by(genus.split, associate) %>%
  summarize(n_associate = n()) %>%
  ungroup()

# C(vw)/C(v) in each genus (relative frequency of the bigram)
bigram.counts.rf <- bigram.counts %>%
  left_join(associate.counts, by = c("genus.split", "associate")) %>%
  mutate(rf = n_bigram / n_associate)

# One row per cue-associate pair with the two genera's relative frequencies
# side by side; t > 0 means the pair is relatively more frequent in Romance.
t.scores <- bigram.counts.rf %>%
  as_tibble() %>%
  select(genus.split, rf, cue, associate) %>%
  spread(genus.split, rf) %>%
  filter(!is.na(romance), !is.na(germanic)) %>%
  mutate(t = (romance - germanic) / sqrt(romance + germanic))
t-scores
# Histogram of the signed t-scores across cue-associate pairs.
t.scores %>%
  ggplot(aes(x = t)) +
  geom_histogram() +
  ggtitle("distribution of t-scores") +
  theme_bw()

absolute t-scores
# Histogram of the magnitude of the t-scores (sign discarded).
t.scores %>%
  ggplot(aes(x = abs(t))) +
  geom_histogram() +
  ggtitle("distribution of t-scores") +
  theme_bw()

Predicting divergence with sentiment
Join t.scores and cues characteristics
# One row of cue-level norms per cue (first row seen for each cue).
# `cue` is selected explicitly and the grouping dropped, instead of relying
# on select() to re-add the grouping column implicitly.
cues.chars <- d.clean %>%
  group_by(cue) %>%
  slice(1) %>%
  ungroup() %>%
  select(cue, Lg10WF, quant.sent, Conc.M, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum)

# Per-cue mean signed t and mean absolute t.
# summarize() evaluates its arguments in order and later expressions see
# earlier results, so t_abs MUST be computed before `t` is overwritten;
# otherwise mean(abs(t)) silently becomes abs(mean(t)).
# NOTE(review): re-knit to refresh the absolute-t tables and figures.
t.scores.full <- t.scores %>%
  group_by(cue) %>%
  summarize(t_abs = mean(abs(t)),
            t = mean(t)) %>%
  select(cue, t, t_abs) %>%
  left_join(cues.chars, by = "cue")

# Analysis table for the sentiment models: cues with both Lg10WF and
# V.Mean.Sum available.
cues.ts <- t.scores.full %>%
  select(cue, t, t_abs, Lg10WF, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum) %>%
  distinct() %>%
  filter(!is.na(Lg10WF) & !is.na(V.Mean.Sum))
t
Look at correlation between norms
# Pairwise correlations between signed t and the cue norms.
# Columns are dropped by name: cues.ts has only 7 columns, so the former
# positional `-8` pointed past the end of the table.
correlate(cues.ts %>% select(-cue, -t_abs)) %>%
  shave() %>%
  fashion() %>%
  kable()
| t |
|
|
|
|
|
| Lg10WF |
-.04 |
|
|
|
|
| V.Mean.Sum |
-.03 |
.14 |
|
|
|
| A.Mean.Sum |
.01 |
.06 |
-.16 |
|
|
| D.Mean.Sum |
-.05 |
.13 |
.70 |
-.17 |
|
Valence
# Signed t regressed on cue valence, with Lg10WF as covariate.
lm(t ~ V.Mean.Sum + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0201714 |
0.0056623 |
3.562389 |
0.0003705 |
| V.Mean.Sum |
-0.0018404 |
0.0008380 |
-2.196143 |
0.0281225 |
| Lg10WF |
-0.0049662 |
0.0017057 |
-2.911465 |
0.0036116 |
# Linear fit of signed t against cue valence.
cues.ts %>%
  ggplot(aes(x = V.Mean.Sum, y = t)) +
  geom_smooth(method = "lm") +
  theme_bw()

Arousal
# Signed t regressed on cue arousal, with Lg10WF as covariate.
lm(t ~ A.Mean.Sum + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0058071 |
0.0063339 |
0.9168387 |
0.3592666 |
| A.Mean.Sum |
0.0014923 |
0.0011827 |
1.2617867 |
0.2070781 |
| Lg10WF |
-0.0055987 |
0.0016933 |
-3.3064487 |
0.0009508 |
# Linear fit of signed t against cue arousal.
cues.ts %>%
  ggplot(aes(x = A.Mean.Sum, y = t)) +
  geom_smooth(method = "lm") +
  theme_bw()

Dominance
# Signed t regressed on cue dominance, with Lg10WF as covariate.
lm(t ~ D.Mean.Sum + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0307448 |
0.0070154 |
4.382500 |
0.0000119 |
| D.Mean.Sum |
-0.0039020 |
0.0011597 |
-3.364612 |
0.0007717 |
| Lg10WF |
-0.0047529 |
0.0017025 |
-2.791677 |
0.0052613 |
# Linear fit of signed t against cue dominance.
cues.ts %>%
  ggplot(aes(x = D.Mean.Sum, y = t)) +
  geom_smooth(method = "lm") +
  theme_bw()

absolute-t
Look at correlation between norms
# Pairwise correlations between absolute t and the cue norms.
# Columns are dropped by name: cues.ts has only 7 columns, so the former
# positional `-8` pointed past the end of the table.
correlate(cues.ts %>% select(-cue, -t)) %>%
  shave() %>%
  fashion() %>%
  kable()
| t_abs |
|
|
|
|
|
| Lg10WF |
.06 |
|
|
|
|
| V.Mean.Sum |
.02 |
.14 |
|
|
|
| A.Mean.Sum |
-.00 |
.06 |
-.16 |
|
|
| D.Mean.Sum |
.01 |
.13 |
.70 |
-.17 |
|
Valence
# Absolute t regressed on cue valence, with Lg10WF as covariate.
lm(t_abs ~ V.Mean.Sum + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0373218 |
0.0042710 |
8.7383413 |
0.0000000 |
| V.Mean.Sum |
0.0005547 |
0.0006321 |
0.8774691 |
0.3802695 |
| Lg10WF |
0.0057769 |
0.0012866 |
4.4899713 |
0.0000073 |
# Linear fit of absolute t against cue valence.
cues.ts %>%
  ggplot(aes(x = V.Mean.Sum, y = t_abs)) +
  geom_smooth(method = "lm") +
  theme_bw()

Arousal
# Absolute t regressed on cue arousal, with Lg10WF as covariate.
lm(t_abs ~ A.Mean.Sum + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0416464 |
0.0047764 |
8.719167 |
0.0000000 |
| A.Mean.Sum |
-0.0004486 |
0.0008919 |
-0.503014 |
0.6149742 |
| Lg10WF |
0.0059674 |
0.0012769 |
4.673337 |
0.0000030 |
# Linear fit of absolute t against cue arousal.
cues.ts %>%
  ggplot(aes(x = A.Mean.Sum, y = t_abs)) +
  geom_smooth(method = "lm") +
  theme_bw()

Dominance
# Absolute t regressed on cue dominance, with Lg10WF as covariate.
lm(t_abs ~ D.Mean.Sum + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0380248 |
0.0052950 |
7.1813200 |
0.0000000 |
| D.Mean.Sum |
0.0003748 |
0.0008753 |
0.4281628 |
0.6685490 |
| Lg10WF |
0.0058608 |
0.0012850 |
4.5609192 |
0.0000052 |
# Linear fit of absolute t against cue dominance.
cues.ts %>%
  ggplot(aes(x = D.Mean.Sum, y = t_abs)) +
  geom_smooth(method = "lm") +
  theme_bw()

Predicting divergence with concreteness
t
# Rebuild the per-cue analysis table for concreteness: cues with both
# Lg10WF and Conc.M available. This overwrites the sentiment version of
# cues.ts.
cues.ts <- t.scores.full %>%
  select(cue, t, t_abs, Lg10WF, Conc.M) %>%
  distinct() %>%
  filter(!is.na(Lg10WF), !is.na(Conc.M))

# Signed t regressed on cue concreteness, with Lg10WF as covariate.
lm(t ~ Conc.M + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0103589 |
0.0047134 |
2.197773 |
0.0280004 |
| Conc.M |
-0.0013624 |
0.0009452 |
-1.441501 |
0.1494914 |
| Lg10WF |
-0.0024012 |
0.0012984 |
-1.849445 |
0.0644388 |
# Linear fit of signed t against cue concreteness.
cues.ts %>%
  ggplot(aes(x = Conc.M, y = t)) +
  # geom_point() +   (scatter layer left commented out)
  geom_smooth(method = "lm") +
  theme_bw()

absolute-t
# Absolute t regressed on cue concreteness, with Lg10WF as covariate.
lm(t_abs ~ Conc.M + Lg10WF, data = cues.ts) %>%
  tidy() %>%
  kable()
| (Intercept) |
0.0319708 |
0.0035505 |
9.004478 |
0.0e+00 |
| Conc.M |
0.0032607 |
0.0007120 |
4.579734 |
4.7e-06 |
| Lg10WF |
0.0043928 |
0.0009780 |
4.491419 |
7.2e-06 |
# Linear fit of absolute t against cue concreteness.
cues.ts %>%
  ggplot(aes(x = Conc.M, y = t_abs)) +
  # geom_point() +   (scatter layer left commented out)
  geom_smooth(method = "lm") +
  theme_bw()
