Do the properties of the cue influence the associates differently for speakers from different L2 groups? In particular, the sentiment and concreteness of the cue.

SUMMARY: For the family split, there are no differences. For the genus split, there are trends for sentiment and concreteness.

Read in data

d.clean = read.csv("../data/dclean.csv")

Split language groups

Based on language family and genus

two_to_three = read.csv("data/ISO_2_3.csv")  %>%
  rename(ISO2 = langName_ISO)

lang_codes_genus = read.csv("data/lang_codes_with_genus.csv") %>%
  select(wals.code, name, genus, family) 

# based on ISO3, merge in language info from WALS
codes = bind_rows(read.csv("data/lang_codes.csv"),
                  read.csv("data/lang_codes_supp.csv")) %>% # some info is missing from wals csv, even though on website (see lang_codes_supp.csv)
  select(ascii_name, id, iso_codes, latitude, longitude, macroarea) %>%
  mutate(iso_codes = as.factor(unlist(lapply(strsplit(as.character(iso_codes),
                                      ","),function(x) x[1])))) %>%# in cases where there are two ISO codes, takes first
  rename(ISO3 = iso_codes,
         wals.code = id) %>%
  group_by(ISO3) %>%
  slice(1) %>%
  left_join(lang_codes_genus)

# merge language info into the data and remove L1 speakers
d.clean.fam = d.clean %>%
  rename(ISO2 = langName_ISO) %>%
  left_join(two_to_three)  %>%
  left_join(codes, by = "ISO3") %>%
  filter(native.lang != "L1") 

Distribution of languages

d.clean.fam %>%
  count(ISO3) %>%
  mutate(ISO3 = fct_reorder(ISO3, n, .desc = TRUE)) %>%
  filter(n > 150) %>%
  ggplot(aes(x = ISO3, y = n, fill = ISO3)) + 
  geom_bar(stat = 'identity') +
  theme_bw() +
  ggtitle("N Languages") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none") 

d.clean.fam %>%
  ggplot(aes(x = factor(macroarea))) +
  geom_bar() +
  xlab("macroarea") +
  ggtitle("N responses per macroarea") +
  theme_bw()

d.clean.fam %>%
  count(genus) %>%
  mutate(genus = fct_reorder(genus, n, .desc = TRUE)) %>%
  ggplot(aes(x = genus, y = n)) + 
  geom_bar(stat = 'identity') +
  theme_bw() +
  ggtitle("N genera") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) 

d.clean.fam %>%
  count(family) %>%
  mutate(family = fct_reorder(family, n, .desc = TRUE)) %>%
  ggplot(aes(x = family, y = n)) + 
  geom_bar(stat = 'identity') +
  ggtitle("N family") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) 

There is no obvious way to split the languages here. Try Indo-European vs. non-Indo-European? Also Germanic vs. Romance, excluding all other languages.

d.clean.fam = d.clean.fam %>%
  mutate(family.split = ifelse(family == "Indo-European", "IE", "other"),
         genus.split = ifelse(genus == "Germanic", "germanic",
                              ifelse(genus == "Romance", "romance", NA)),
          family.split = as.factor(family.split),
          genus.split = as.factor(genus.split))
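
As a quick sanity check on the sizes of the resulting groups, a minimal sketch using the split columns created above:

# tabulate responses per family split and genus split
d.clean.fam %>%
  count(family.split, genus.split)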

t-scores between family groups

Indo-European vs. Non-Indo-European

t-scores

Calculate t-scores (see t_L1_L2.Rmd for details about t-score calculation)
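
In brief, for each cue–associate bigram the relative frequency within a group is rf = C(cue, associate) / C(associate), and the divergence score computed below is t = (rf_IE - rf_other) / sqrt(rf_IE + rf_other); larger absolute values indicate that the two groups produce the bigram at more different rates.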

# restrict to bigrams that occur in both language groups
IE.bigrams = unique(filter(d.clean.fam, family.split == "IE"))
other.bigrams = unique(filter(d.clean.fam, family.split == "other"))
shared.bigrams = intersect(IE.bigrams$bigram, 
                           other.bigrams$bigram)

d.common = d.clean.fam %>%
  ungroup() %>%
  filter(bigram %in% shared.bigrams)

# get C(vw) in each language group (bigram counts)
bigram.counts = d.common %>%
  group_by(family.split, bigram, associate, cue) %>%
  summarize(n_bigram = n())  

# get C(v) in each language group (associate counts)
associate.counts = d.common %>%
  group_by(family.split, associate) %>%
  summarize(n_associate = n())

# get C(vw)/C(v) in each language group
bigram.counts.rf = bigram.counts %>%
  left_join(associate.counts, by = c("family.split", "associate")) %>%
  mutate(rf = n_bigram/n_associate)
  
t.scores <- bigram.counts.rf %>%
  ungroup() %>%
  as_tibble() %>%
  select(family.split, rf, cue, associate) %>%
  spread(family.split, rf) %>%
  filter(!is.na(IE), !is.na(other)) %>%
  mutate(t = (IE - other)/sqrt(IE + other))

t-scores

ggplot(t.scores, aes(x = t)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

absolute t-scores

ggplot(t.scores, aes(x = abs(t))) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

Predicting divergence with sentiment

Join t.scores and cue characteristics

cues.chars = d.clean %>%
  group_by(cue) %>%
  slice(1) %>%
  select(Lg10WF, quant.sent, Conc.M, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum)

t.scores.full = t.scores %>%
  group_by(cue) %>%
  summarize(t = mean(t),
            t_abs = mean(abs(t))) %>%
  left_join(cues.chars) 
cues.ts =  t.scores.full %>%
  select(cue, t, t_abs, Lg10WF,V.Mean.Sum,A.Mean.Sum, D.Mean.Sum) %>%
  distinct() %>%
  filter(!is.na(Lg10WF) & !is.na(V.Mean.Sum))

t

Look at correlation between norms

correlate(cues.ts %>% select(-cue, -t_abs)) %>%
  shave() %>%
  fashion() %>%
  kable()

| rowname    | t    | Lg10WF | V.Mean.Sum | A.Mean.Sum | D.Mean.Sum |
|------------|------|--------|------------|------------|------------|
| t          |      |        |            |            |            |
| Lg10WF     | -.07 |        |            |            |            |
| V.Mean.Sum | -.03 | .15    |            |            |            |
| A.Mean.Sum | -.01 | .05    | -.16       |            |            |
| D.Mean.Sum | -.03 | .13    | .72        | -.16       |            |

Valence
kable(tidy(lm(t~ V.Mean.Sum  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic | p.value   |
|-------------|------------|-----------|-----------|-----------|
| (Intercept) | 0.0183671  | 0.0051882 | 3.540208  | 0.0004026 |
| V.Mean.Sum  | -0.0011793 | 0.0007809 | -1.510246 | 0.1310298 |
| Lg10WF      | -0.0076865 | 0.0015144 | -5.075500 | 0.0000004 |

ggplot(cues.ts, aes(x = V.Mean.Sum, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Arousal
kable(tidy(lm(t ~ A.Mean.Sum+  Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic  | p.value   |
|-------------|------------|-----------|------------|-----------|
| (Intercept) | 0.0151588  | 0.0058605 | 2.5866241  | 0.0097139 |
| A.Mean.Sum  | -0.0005239 | 0.0010996 | -0.4764394 | 0.6337776 |
| Lg10WF      | -0.0079993 | 0.0014989 | -5.3369393 | 0.0000001 |

ggplot(cues.ts, aes(x = A.Mean.Sum, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Dominance
kable(tidy(lm(t ~ D.Mean.Sum+  Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic | p.value   |
|-------------|------------|-----------|-----------|-----------|
| (Intercept) | 0.0212512  | 0.0064713 | 3.283915  | 0.0010292 |
| D.Mean.Sum  | -0.0016883 | 0.0010761 | -1.568933 | 0.1167128 |
| Lg10WF      | -0.0077179 | 0.0015103 | -5.110039 | 0.0000003 |

ggplot(cues.ts, aes(x = D.Mean.Sum, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

absolute-t

Look at correlation between norms

correlate(cues.ts %>% select(-cue, -t)) %>%
  shave() %>%
  fashion() %>%
  kable()

| rowname    | t_abs | Lg10WF | V.Mean.Sum | A.Mean.Sum | D.Mean.Sum |
|------------|-------|--------|------------|------------|------------|
| t_abs      |       |        |            |            |            |
| Lg10WF     | .04   |        |            |            |            |
| V.Mean.Sum | .00   | .15    |            |            |            |
| A.Mean.Sum | -.01  | .05    | -.16       |            |            |
| D.Mean.Sum | .00   | .13    | .72        | -.16       |            |

Valence
kable(tidy(lm(t_abs~ V.Mean.Sum  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic  | p.value   |
|-------------|------------|-----------|------------|-----------|
| (Intercept) | 0.0511886  | 0.0036532 | 14.0119146 | 0.0000000 |
| V.Mean.Sum  | -0.0002869 | 0.0005499 | -0.5217506 | 0.6018620 |
| Lg10WF      | 0.0037558  | 0.0010664 | 3.5220418  | 0.0004312 |

ggplot(cues.ts, aes(x = V.Mean.Sum, y = t_abs)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Arousal
kable(tidy(lm(t_abs~ A.Mean.Sum  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic  | p.value   |
|-------------|------------|-----------|------------|-----------|
| (Intercept) | 0.0520861  | 0.0041259 | 12.6241783 | 0.0000000 |
| A.Mean.Sum  | -0.0005411 | 0.0007741 | -0.6989587 | 0.4846032 |
| Lg10WF      | 0.0037065  | 0.0010552 | 3.5124955  | 0.0004470 |

ggplot(cues.ts, aes(x = A.Mean.Sum, y = t_abs)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Dominance
kable(tidy(lm(t_abs ~ D.Mean.Sum+  Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic  | p.value   |
|-------------|------------|-----------|------------|-----------|
| (Intercept) | 0.0508040  | 0.0045569 | 11.1488520 | 0.0000000 |
| D.Mean.Sum  | -0.0001875 | 0.0007577 | -0.2474894 | 0.8045374 |
| Lg10WF      | 0.0037065  | 0.0010635 | 3.4851058  | 0.0004952 |

ggplot(cues.ts, aes(x = D.Mean.Sum, y = t_abs)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Predicting divergence with concreteness

t

cues.ts =  t.scores.full %>%
  select(cue, t, t_abs, Lg10WF, Conc.M) %>%
  distinct() %>%
  filter(!is.na(Lg10WF) & !is.na(Conc.M))

kable(tidy(lm(t~Conc.M  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic  | p.value   |
|-------------|------------|-----------|------------|-----------|
| (Intercept) | 0.0020643  | 0.0044096 | 0.4681337  | 0.6397027 |
| Conc.M      | 0.0010431  | 0.0008850 | 1.1786794  | 0.2385635 |
| Lg10WF      | -0.0049230 | 0.0011807 | -4.1696399 | 0.0000309 |

ggplot(cues.ts, aes(x = Conc.M, y = t)) +
  #geom_point() +
  geom_smooth(method  = "lm") +
  theme_bw()

absolute-t

kable(tidy(lm(t_abs~ Conc.M  + Lg10WF, data = cues.ts)))

| term        | estimate  | std.error | statistic  | p.value   |
|-------------|-----------|-----------|------------|-----------|
| (Intercept) | 0.0473497 | 0.0031062 | 15.2436649 | 0.0000000 |
| Conc.M      | 0.0001023 | 0.0006234 | 0.1640293  | 0.8697125 |
| Lg10WF      | 0.0045919 | 0.0008317 | 5.5211576  | 0.0000000 |

ggplot(cues.ts, aes(x = Conc.M, y = t_abs)) +
  #geom_point() +
  geom_smooth(method  = "lm") +
  theme_bw()

t-scores between genus groups

Romance vs. Germanic

get t-scores

Calculate t-scores
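
This mirrors the calculation above, now contrasting the two genus groups: t = (rf_romance - rf_germanic) / sqrt(rf_romance + rf_germanic).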

# restrict to bigrams that occur in both genus groups
g.bigrams = unique(filter(d.clean.fam, genus.split == "germanic"))
r.bigrams = unique(filter(d.clean.fam, genus.split == "romance"))
shared.bigrams = intersect(g.bigrams$bigram, 
                           r.bigrams$bigram)

d.common = d.clean.fam %>%
  ungroup() %>%
  filter(bigram %in% shared.bigrams)

# get C(vw) in each genus group (bigram counts)
bigram.counts = d.common %>%
  group_by(genus.split, bigram, associate, cue) %>%
  summarize(n_bigram = n())  

# get C(v) in each genus group (associate counts)
associate.counts = d.common %>%
  group_by(genus.split, associate) %>%
  summarize(n_associate = n())

# get C(vw)/C(v) in each genus group
bigram.counts.rf = bigram.counts %>%
  left_join(associate.counts, by = c("genus.split", "associate")) %>%
  mutate(rf = n_bigram/n_associate)
  
t.scores <- bigram.counts.rf %>%
  ungroup() %>%
  as_tibble() %>%
  select(genus.split, rf, cue, associate) %>%
  spread(genus.split, rf) %>%
  filter(!is.na(romance), !is.na(germanic)) %>%
  mutate(t = (romance - germanic)/sqrt(romance + germanic))

t-scores

ggplot(t.scores, aes(x = t)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

absolute t-scores

ggplot(t.scores, aes(x = abs(t))) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

Predicting divergence with sentiment

Join t.scores and cue characteristics

cues.chars = d.clean %>%
  group_by(cue) %>%
  slice(1) %>%
  select(Lg10WF, quant.sent, Conc.M, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum)

t.scores.full = t.scores %>%
  group_by(cue) %>%
  summarize(t = mean(t),
            t_abs = mean(abs(t))) %>%
  left_join(cues.chars) 
cues.ts =  t.scores.full %>%
  select(cue, t, t_abs, Lg10WF,V.Mean.Sum,A.Mean.Sum, D.Mean.Sum) %>%
  distinct() %>%
  filter(!is.na(Lg10WF) & !is.na(V.Mean.Sum))

t

Look at correlation between norms

correlate(cues.ts %>% select(-cue, -t_abs)) %>%
  shave() %>%
  fashion() %>%
  kable()

| rowname    | t    | Lg10WF | V.Mean.Sum | A.Mean.Sum | D.Mean.Sum |
|------------|------|--------|------------|------------|------------|
| t          |      |        |            |            |            |
| Lg10WF     | -.04 |        |            |            |            |
| V.Mean.Sum | -.03 | .14    |            |            |            |
| A.Mean.Sum | .01  | .06    | -.16       |            |            |
| D.Mean.Sum | -.05 | .13    | .70        | -.17       |            |

Valence
kable(tidy(lm(t~ V.Mean.Sum  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic | p.value   |
|-------------|------------|-----------|-----------|-----------|
| (Intercept) | 0.0201714  | 0.0056623 | 3.562389  | 0.0003705 |
| V.Mean.Sum  | -0.0018404 | 0.0008380 | -2.196143 | 0.0281225 |
| Lg10WF      | -0.0049662 | 0.0017057 | -2.911465 | 0.0036116 |

ggplot(cues.ts, aes(x = V.Mean.Sum, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Arousal
kable(tidy(lm(t ~ A.Mean.Sum+  Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic  | p.value   |
|-------------|------------|-----------|------------|-----------|
| (Intercept) | 0.0058071  | 0.0063339 | 0.9168387  | 0.3592666 |
| A.Mean.Sum  | 0.0014923  | 0.0011827 | 1.2617867  | 0.2070781 |
| Lg10WF      | -0.0055987 | 0.0016933 | -3.3064487 | 0.0009508 |

ggplot(cues.ts, aes(x = A.Mean.Sum, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Dominance
kable(tidy(lm(t ~ D.Mean.Sum+  Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic | p.value   |
|-------------|------------|-----------|-----------|-----------|
| (Intercept) | 0.0307448  | 0.0070154 | 4.382500  | 0.0000119 |
| D.Mean.Sum  | -0.0039020 | 0.0011597 | -3.364612 | 0.0007717 |
| Lg10WF      | -0.0047529 | 0.0017025 | -2.791677 | 0.0052613 |

ggplot(cues.ts, aes(x = D.Mean.Sum, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

absolute-t

Look at correlation between norms

correlate(cues.ts %>% select(-cue, -t)) %>%
  shave() %>%
  fashion() %>%
  kable()

| rowname    | t_abs | Lg10WF | V.Mean.Sum | A.Mean.Sum | D.Mean.Sum |
|------------|-------|--------|------------|------------|------------|
| t_abs      |       |        |            |            |            |
| Lg10WF     | .06   |        |            |            |            |
| V.Mean.Sum | .02   | .14    |            |            |            |
| A.Mean.Sum | -.00  | .06    | -.16       |            |            |
| D.Mean.Sum | .01   | .13    | .70        | -.17       |            |

Valence
kable(tidy(lm(t_abs~ V.Mean.Sum  + Lg10WF, data = cues.ts)))

| term        | estimate  | std.error | statistic | p.value   |
|-------------|-----------|-----------|-----------|-----------|
| (Intercept) | 0.0373218 | 0.0042710 | 8.7383413 | 0.0000000 |
| V.Mean.Sum  | 0.0005547 | 0.0006321 | 0.8774691 | 0.3802695 |
| Lg10WF      | 0.0057769 | 0.0012866 | 4.4899713 | 0.0000073 |

ggplot(cues.ts, aes(x = V.Mean.Sum, y = t_abs)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Arousal
kable(tidy(lm(t_abs~ A.Mean.Sum  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic | p.value   |
|-------------|------------|-----------|-----------|-----------|
| (Intercept) | 0.0416464  | 0.0047764 | 8.719167  | 0.0000000 |
| A.Mean.Sum  | -0.0004486 | 0.0008919 | -0.503014 | 0.6149742 |
| Lg10WF      | 0.0059674  | 0.0012769 | 4.673337  | 0.0000030 |

ggplot(cues.ts, aes(x = A.Mean.Sum, y = t_abs)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Dominance
kable(tidy(lm(t_abs ~ D.Mean.Sum+  Lg10WF, data = cues.ts)))

| term        | estimate  | std.error | statistic | p.value   |
|-------------|-----------|-----------|-----------|-----------|
| (Intercept) | 0.0380248 | 0.0052950 | 7.1813200 | 0.0000000 |
| D.Mean.Sum  | 0.0003748 | 0.0008753 | 0.4281628 | 0.6685490 |
| Lg10WF      | 0.0058608 | 0.0012850 | 4.5609192 | 0.0000052 |

ggplot(cues.ts, aes(x = D.Mean.Sum, y = t_abs)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Predicting divergence with concreteness

t

cues.ts =  t.scores.full %>%
  select(cue, t, t_abs, Lg10WF, Conc.M) %>%
  distinct() %>%
  filter(!is.na(Lg10WF) & !is.na(Conc.M))

kable(tidy(lm(t~Conc.M  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic | p.value   |
|-------------|------------|-----------|-----------|-----------|
| (Intercept) | 0.0103589  | 0.0047134 | 2.197773  | 0.0280004 |
| Conc.M      | -0.0013624 | 0.0009452 | -1.441501 | 0.1494914 |
| Lg10WF      | -0.0024012 | 0.0012984 | -1.849445 | 0.0644388 |

ggplot(cues.ts, aes(x = Conc.M, y = t)) +
  #geom_point() +
  geom_smooth(method  = "lm") +
  theme_bw()

absolute-t

kable(tidy(lm(t_abs~ Conc.M  + Lg10WF, data = cues.ts)))

| term        | estimate  | std.error | statistic | p.value |
|-------------|-----------|-----------|-----------|---------|
| (Intercept) | 0.0319708 | 0.0035505 | 9.004478  | 0.0e+00 |
| Conc.M      | 0.0032607 | 0.0007120 | 4.579734  | 4.7e-06 |
| Lg10WF      | 0.0043928 | 0.0009780 | 4.491419  | 7.2e-06 |

ggplot(cues.ts, aes(x = Conc.M, y = t_abs)) +
  #geom_point() +
  geom_smooth(method  = "lm") +
  theme_bw()