Do the properties of the cue influence the associates differently for speakers from different L2 groups? In particular, the sentiment and concreteness of the cue.

SUMMARY: For the family split, there are no differences. For the genus split, there are trends for sentiment and concreteness.

Read in data

d.clean = read.csv("../data/dclean.csv")

Split language groups

Based on language family and genus

two_to_three = read.csv("data/ISO_2_3.csv")  %>%
  rename(ISO2 = langName_ISO)

lang_codes_genus = read.csv("data/lang_codes_with_genus.csv") %>%
  select(wals.code, name, genus, family) 

# based on ISO3, merge in language info from WALS
codes = bind_rows(read.csv("data/lang_codes.csv"),
                  read.csv("data/lang_codes_supp.csv")) %>% # some info is missing from wals csv, even though on website (see lang_codes_supp.csv)
  select(ascii_name, id, iso_codes, latitude, longitude, macroarea) %>%
  mutate(iso_codes = as.factor(unlist(lapply(strsplit(as.character(iso_codes),
                                      ","),function(x) x[1])))) %>%# in cases where there are two ISO codes, takes first
  rename(ISO3 = iso_codes,
         wals.code = id) %>%
  group_by(ISO3) %>%
  slice(1) %>%
  left_join(lang_codes_genus)

# merge language info into the data and remove L1 speakers
d.clean.fam = d.clean %>%
  rename(ISO2 = langName_ISO) %>%
  left_join(two_to_three)  %>%
  left_join(codes, by = "ISO3") %>%
  filter(native.lang != "L1") 

Distribution of languages

d.clean.fam %>%
  count(ISO3) %>%
  mutate(ISO3 = fct_reorder(ISO3, n, .desc = TRUE)) %>%
  filter(n > 150) %>%
  ggplot(aes(x = ISO3, y = n, fill = ISO3)) + 
  geom_bar(stat = 'identity') +
  theme_bw() +
  ggtitle("N Languages") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none") 

d.clean.fam %>%
  ggplot(aes(x = factor(macroarea))) +
  geom_bar() +
  xlab("macroarea") +
  ggtitle("N responses per macroarea") +
  theme_bw()

d.clean.fam %>%
  count(genus) %>%
  mutate(genus = fct_reorder(genus, n, .desc = TRUE)) %>%
  ggplot(aes(x = genus, y = n)) + 
  geom_bar(stat = 'identity') +
  theme_bw() +
  ggtitle("N genera") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) 

d.clean.fam %>%
  count(family) %>%
  mutate(family = fct_reorder(family, n, .desc = TRUE)) %>%
  ggplot(aes(x = family, y = n)) + 
  geom_bar(stat = 'identity') +
  ggtitle("N family") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) 

There is no obvious way to split the languages here. Try Indo-European vs. non-Indo-European? Also Germanic vs. Romance, excluding all other languages.

d.clean.fam = d.clean.fam %>%
  mutate(family.split = ifelse(family == "Indo-European", "IE", "other"),
         genus.split = ifelse(genus == "Germanic", "germanic",
                              ifelse(genus == "Romance", "romance", NA)),
          family.split = as.factor(family.split),
          genus.split = as.factor(genus.split))
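
As a quick sanity check on the sizes of the resulting groups, a minimal sketch using the split columns created above:

# tabulate responses per family split and genus split
d.clean.fam %>%
  count(family.split, genus.split)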

t-scores between family groups

Indo-European vs. Non-Indo-European

t-scores

Calculate t-scores (see t_L1_L2.Rmd for details about t-score calculation)
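
In brief, for each cue–associate bigram the relative frequency within a group is rf = C(cue, associate) / C(associate), and the divergence score computed below is t = (rf_IE - rf_other) / sqrt(rf_IE + rf_other); larger absolute values indicate that the two groups produce the bigram at more different rates.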

# restrict to bigrams that occur in both language groups
IE.bigrams = unique(filter(d.clean.fam, family.split == "IE"))
other.bigrams = unique(filter(d.clean.fam, family.split == "other"))
shared.bigrams = intersect(IE.bigrams$bigram, 
                           other.bigrams$bigram)

d.common = d.clean.fam %>%
  ungroup() %>%
  filter(bigram %in% shared.bigrams)

# get C(vw) in each language group (bigram counts)
bigram.counts = d.common %>%
  group_by(family.split, bigram, associate, cue) %>%
  summarize(n_bigram = n())  

# get C(v) in each language group (associate counts)
associate.counts = d.common %>%
  group_by(family.split, associate) %>%
  summarize(n_associate = n())

# get C(vw)/C(v) in each language group
bigram.counts.rf = bigram.counts %>%
  left_join(associate.counts, by = c("family.split", "associate")) %>%
  mutate(rf = n_bigram/n_associate)
  
t.scores <- bigram.counts.rf %>%
  ungroup() %>%
  as_tibble() %>%
  select(family.split, rf, cue, associate) %>%
  spread(family.split, rf) %>%
  filter(!is.na(IE), !is.na(other)) %>%
  mutate(t = (IE - other)/sqrt(IE + other))

t-scores

ggplot(t.scores, aes(x = t)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

absolute t-scores

ggplot(t.scores, aes(x = abs(t))) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

Predicting divergence with sentiment

Join t.scores and cue characteristics

cues.chars = d.clean %>%
  group_by(cue) %>%
  slice(1) %>%
  select(Lg10WF, quant.sent, Conc.M, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum)

t.scores.full = t.scores %>%
  group_by(cue) %>%
  summarize(t = mean(t),
            t_abs = mean(abs(t))) %>%
  left_join(cues.chars) 
cues.ts =  t.scores.full %>%
  select(cue, t, t_abs, Lg10WF,V.Mean.Sum,A.Mean.Sum, D.Mean.Sum) %>%
  distinct() %>%
  filter(!is.na(Lg10WF) & !is.na(V.Mean.Sum))

t

Look at correlation between norms

correlate(cues.ts %>% select(-cue, -t_abs)) %>%
  shave() %>%
  fashion() %>%
  kable()

| rowname    | t    | Lg10WF | V.Mean.Sum | A.Mean.Sum | D.Mean.Sum |
|------------|------|--------|------------|------------|------------|
| t          |      |        |            |            |            |
| Lg10WF     | -.07 |        |            |            |            |
| V.Mean.Sum | -.03 | .15    |            |            |            |
| A.Mean.Sum | -.01 | .05    | -.16       |            |            |
| D.Mean.Sum | -.03 | .13    | .72        | -.16       |            |

Valence
kable(tidy(lm(t~ V.Mean.Sum  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic | p.value   |
|-------------|------------|-----------|-----------|-----------|
| (Intercept) | 0.0183671  | 0.0051882 | 3.540208  | 0.0004026 |
| V.Mean.Sum  | -0.0011793 | 0.0007809 | -1.510246 | 0.1310298 |
| Lg10WF      | -0.0076865 | 0.0015144 | -5.075500 | 0.0000004 |

ggplot(cues.ts, aes(x = V.Mean.Sum, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Arousal
kable(tidy(lm(t ~ A.Mean.Sum+  Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic  | p.value   |
|-------------|------------|-----------|------------|-----------|
| (Intercept) | 0.0151588  | 0.0058605 | 2.5866241  | 0.0097139 |
| A.Mean.Sum  | -0.0005239 | 0.0010996 | -0.4764394 | 0.6337776 |
| Lg10WF      | -0.0079993 | 0.0014989 | -5.3369393 | 0.0000001 |

ggplot(cues.ts, aes(x = A.Mean.Sum, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Dominance
kable(tidy(lm(t ~ D.Mean.Sum+  Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic | p.value   |
|-------------|------------|-----------|-----------|-----------|
| (Intercept) | 0.0212512  | 0.0064713 | 3.283915  | 0.0010292 |
| D.Mean.Sum  | -0.0016883 | 0.0010761 | -1.568933 | 0.1167128 |
| Lg10WF      | -0.0077179 | 0.0015103 | -5.110039 | 0.0000003 |

ggplot(cues.ts, aes(x = D.Mean.Sum, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

absolute-t

Look at correlation between norms

correlate(cues.ts %>% select(-cue, -t)) %>%
  shave() %>%
  fashion() %>%
  kable()

| rowname    | t_abs | Lg10WF | V.Mean.Sum | A.Mean.Sum | D.Mean.Sum |
|------------|-------|--------|------------|------------|------------|
| t_abs      |       |        |            |            |            |
| Lg10WF     | .04   |        |            |            |            |
| V.Mean.Sum | .00   | .15    |            |            |            |
| A.Mean.Sum | -.01  | .05    | -.16       |            |            |
| D.Mean.Sum | .00   | .13    | .72        | -.16       |            |

Valence
kable(tidy(lm(t_abs~ V.Mean.Sum  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic  | p.value   |
|-------------|------------|-----------|------------|-----------|
| (Intercept) | 0.0511886  | 0.0036532 | 14.0119146 | 0.0000000 |
| V.Mean.Sum  | -0.0002869 | 0.0005499 | -0.5217506 | 0.6018620 |
| Lg10WF      | 0.0037558  | 0.0010664 | 3.5220418  | 0.0004312 |

ggplot(cues.ts, aes(x = V.Mean.Sum, y = t_abs)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Arousal
kable(tidy(lm(t_abs~ A.Mean.Sum  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic  | p.value   |
|-------------|------------|-----------|------------|-----------|
| (Intercept) | 0.0520861  | 0.0041259 | 12.6241783 | 0.0000000 |
| A.Mean.Sum  | -0.0005411 | 0.0007741 | -0.6989587 | 0.4846032 |
| Lg10WF      | 0.0037065  | 0.0010552 | 3.5124955  | 0.0004470 |

ggplot(cues.ts, aes(x = A.Mean.Sum, y = t_abs)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Dominance
kable(tidy(lm(t_abs ~ D.Mean.Sum+  Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic  | p.value   |
|-------------|------------|-----------|------------|-----------|
| (Intercept) | 0.0508040  | 0.0045569 | 11.1488520 | 0.0000000 |
| D.Mean.Sum  | -0.0001875 | 0.0007577 | -0.2474894 | 0.8045374 |
| Lg10WF      | 0.0037065  | 0.0010635 | 3.4851058  | 0.0004952 |

ggplot(cues.ts, aes(x = D.Mean.Sum, y = t_abs)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Predicting divergence with concreteness

t

cues.ts =  t.scores.full %>%
  select(cue, t, t_abs, Lg10WF, Conc.M) %>%
  distinct() %>%
  filter(!is.na(Lg10WF) & !is.na(Conc.M))

kable(tidy(lm(t~Conc.M  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic  | p.value   |
|-------------|------------|-----------|------------|-----------|
| (Intercept) | 0.0020643  | 0.0044096 | 0.4681337  | 0.6397027 |
| Conc.M      | 0.0010431  | 0.0008850 | 1.1786794  | 0.2385635 |
| Lg10WF      | -0.0049230 | 0.0011807 | -4.1696399 | 0.0000309 |

ggplot(cues.ts, aes(x = Conc.M, y = t)) +
  #geom_point() +
  geom_smooth(method  = "lm") +
  theme_bw()

absolute-t

kable(tidy(lm(t_abs~ Conc.M  + Lg10WF, data = cues.ts)))

| term        | estimate  | std.error | statistic  | p.value   |
|-------------|-----------|-----------|------------|-----------|
| (Intercept) | 0.0473497 | 0.0031062 | 15.2436649 | 0.0000000 |
| Conc.M      | 0.0001023 | 0.0006234 | 0.1640293  | 0.8697125 |
| Lg10WF      | 0.0045919 | 0.0008317 | 5.5211576  | 0.0000000 |

ggplot(cues.ts, aes(x = Conc.M, y = t_abs)) +
  #geom_point() +
  geom_smooth(method  = "lm") +
  theme_bw()

t-scores between genus groups

Romance vs. Germanic

get t-scores

Calculate t-scores
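
This mirrors the calculation above, now contrasting the two genus groups: t = (rf_romance - rf_germanic) / sqrt(rf_romance + rf_germanic).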

# restrict to bigrams that occur in both genus groups
g.bigrams = unique(filter(d.clean.fam, genus.split == "germanic"))
r.bigrams = unique(filter(d.clean.fam, genus.split == "romance"))
shared.bigrams = intersect(g.bigrams$bigram, 
                           r.bigrams$bigram)

d.common = d.clean.fam %>%
  ungroup() %>%
  filter(bigram %in% shared.bigrams)

# get C(vw) in each genus group (bigram counts)
bigram.counts = d.common %>%
  group_by(genus.split, bigram, associate, cue) %>%
  summarize(n_bigram = n())  

# get C(v) in each genus group (associate counts)
associate.counts = d.common %>%
  group_by(genus.split, associate) %>%
  summarize(n_associate = n())

# get C(vw)/C(v) in each genus group
bigram.counts.rf = bigram.counts %>%
  left_join(associate.counts, by = c("genus.split", "associate")) %>%
  mutate(rf = n_bigram/n_associate)
  
t.scores <- bigram.counts.rf %>%
  ungroup() %>%
  as_tibble() %>%
  select(genus.split, rf, cue, associate) %>%
  spread(genus.split, rf) %>%
  filter(!is.na(romance), !is.na(germanic)) %>%
  mutate(t = (romance - germanic)/sqrt(romance + germanic))

t-scores

ggplot(t.scores, aes(x = t)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

absolute t-scores

ggplot(t.scores, aes(x = abs(t))) +
  geom_histogram() +
  theme_bw() +
  ggtitle("distribution of t-scores")

Predicting divergence with sentiment

Join t.scores and cue characteristics

cues.chars = d.clean %>%
  group_by(cue) %>%
  slice(1) %>%
  select(Lg10WF, quant.sent, Conc.M, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum)

t.scores.full = t.scores %>%
  group_by(cue) %>%
  summarize(t = mean(t),
            t_abs = mean(abs(t))) %>%
  left_join(cues.chars) 
cues.ts =  t.scores.full %>%
  select(cue, t, t_abs, Lg10WF,V.Mean.Sum,A.Mean.Sum, D.Mean.Sum) %>%
  distinct() %>%
  filter(!is.na(Lg10WF) & !is.na(V.Mean.Sum))

t

Look at correlation between norms

correlate(cues.ts %>% select(-cue, -t_abs)) %>%
  shave() %>%
  fashion() %>%
  kable()

| rowname    | t    | Lg10WF | V.Mean.Sum | A.Mean.Sum | D.Mean.Sum |
|------------|------|--------|------------|------------|------------|
| t          |      |        |            |            |            |
| Lg10WF     | -.04 |        |            |            |            |
| V.Mean.Sum | -.03 | .14    |            |            |            |
| A.Mean.Sum | .01  | .06    | -.16       |            |            |
| D.Mean.Sum | -.05 | .13    | .70        | -.17       |            |

Valence
kable(tidy(lm(t~ V.Mean.Sum  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic | p.value   |
|-------------|------------|-----------|-----------|-----------|
| (Intercept) | 0.0201714  | 0.0056623 | 3.562389  | 0.0003705 |
| V.Mean.Sum  | -0.0018404 | 0.0008380 | -2.196143 | 0.0281225 |
| Lg10WF      | -0.0049662 | 0.0017057 | -2.911465 | 0.0036116 |

ggplot(cues.ts, aes(x = V.Mean.Sum, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Arousal
kable(tidy(lm(t ~ A.Mean.Sum+  Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic  | p.value   |
|-------------|------------|-----------|------------|-----------|
| (Intercept) | 0.0058071  | 0.0063339 | 0.9168387  | 0.3592666 |
| A.Mean.Sum  | 0.0014923  | 0.0011827 | 1.2617867  | 0.2070781 |
| Lg10WF      | -0.0055987 | 0.0016933 | -3.3064487 | 0.0009508 |

ggplot(cues.ts, aes(x = A.Mean.Sum, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Dominance
kable(tidy(lm(t ~ D.Mean.Sum+  Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic | p.value   |
|-------------|------------|-----------|-----------|-----------|
| (Intercept) | 0.0307448  | 0.0070154 | 4.382500  | 0.0000119 |
| D.Mean.Sum  | -0.0039020 | 0.0011597 | -3.364612 | 0.0007717 |
| Lg10WF      | -0.0047529 | 0.0017025 | -2.791677 | 0.0052613 |

ggplot(cues.ts, aes(x = D.Mean.Sum, y = t)) +
  geom_smooth(method  = "lm") +
  theme_bw()

absolute-t

Look at correlation between norms

correlate(cues.ts %>% select(-cue, -t)) %>%
  shave() %>%
  fashion() %>%
  kable()

| rowname    | t_abs | Lg10WF | V.Mean.Sum | A.Mean.Sum | D.Mean.Sum |
|------------|-------|--------|------------|------------|------------|
| t_abs      |       |        |            |            |            |
| Lg10WF     | .06   |        |            |            |            |
| V.Mean.Sum | .02   | .14    |            |            |            |
| A.Mean.Sum | -.00  | .06    | -.16       |            |            |
| D.Mean.Sum | .01   | .13    | .70        | -.17       |            |

Valence
kable(tidy(lm(t_abs~ V.Mean.Sum  + Lg10WF, data = cues.ts)))

| term        | estimate  | std.error | statistic | p.value   |
|-------------|-----------|-----------|-----------|-----------|
| (Intercept) | 0.0373218 | 0.0042710 | 8.7383413 | 0.0000000 |
| V.Mean.Sum  | 0.0005547 | 0.0006321 | 0.8774691 | 0.3802695 |
| Lg10WF      | 0.0057769 | 0.0012866 | 4.4899713 | 0.0000073 |

ggplot(cues.ts, aes(x = V.Mean.Sum, y = t_abs)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Arousal
kable(tidy(lm(t_abs~ A.Mean.Sum  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic | p.value   |
|-------------|------------|-----------|-----------|-----------|
| (Intercept) | 0.0416464  | 0.0047764 | 8.719167  | 0.0000000 |
| A.Mean.Sum  | -0.0004486 | 0.0008919 | -0.503014 | 0.6149742 |
| Lg10WF      | 0.0059674  | 0.0012769 | 4.673337  | 0.0000030 |

ggplot(cues.ts, aes(x = A.Mean.Sum, y = t_abs)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Dominance
kable(tidy(lm(t_abs ~ D.Mean.Sum+  Lg10WF, data = cues.ts)))

| term        | estimate  | std.error | statistic | p.value   |
|-------------|-----------|-----------|-----------|-----------|
| (Intercept) | 0.0380248 | 0.0052950 | 7.1813200 | 0.0000000 |
| D.Mean.Sum  | 0.0003748 | 0.0008753 | 0.4281628 | 0.6685490 |
| Lg10WF      | 0.0058608 | 0.0012850 | 4.5609192 | 0.0000052 |

ggplot(cues.ts, aes(x = D.Mean.Sum, y = t_abs)) +
  geom_smooth(method  = "lm") +
  theme_bw()

Predicting divergence with concreteness

t

cues.ts =  t.scores.full %>%
  select(cue, t, t_abs, Lg10WF, Conc.M) %>%
  distinct() %>%
  filter(!is.na(Lg10WF) & !is.na(Conc.M))

kable(tidy(lm(t~Conc.M  + Lg10WF, data = cues.ts)))

| term        | estimate   | std.error | statistic | p.value   |
|-------------|------------|-----------|-----------|-----------|
| (Intercept) | 0.0103589  | 0.0047134 | 2.197773  | 0.0280004 |
| Conc.M      | -0.0013624 | 0.0009452 | -1.441501 | 0.1494914 |
| Lg10WF      | -0.0024012 | 0.0012984 | -1.849445 | 0.0644388 |

ggplot(cues.ts, aes(x = Conc.M, y = t)) +
  #geom_point() +
  geom_smooth(method  = "lm") +
  theme_bw()

absolute-t

kable(tidy(lm(t_abs~ Conc.M  + Lg10WF, data = cues.ts)))

| term        | estimate  | std.error | statistic | p.value |
|-------------|-----------|-----------|-----------|---------|
| (Intercept) | 0.0319708 | 0.0035505 | 9.004478  | 0.0e+00 |
| Conc.M      | 0.0032607 | 0.0007120 | 4.579734  | 4.7e-06 |
| Lg10WF      | 0.0043928 | 0.0009780 | 4.491419  | 7.2e-06 |

ggplot(cues.ts, aes(x = Conc.M, y = t_abs)) +
  #geom_point() +
  geom_smooth(method  = "lm") +
  theme_bw()