Divergent words = the words with the largest residuals.
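For concreteness, here is a minimal sketch of that selection step; `pair_residuals` and its `resid` column are hypothetical placeholders for the per-word residuals computed upstream for a single language pair, and whether "largest" means largest magnitude or the most negative ("low") residuals is decided in that earlier processing step.

# Hypothetical sketch: pair_residuals has one row per word for a single
# language pair, with `resid` holding that word's residual from the upstream model.
top_divergent <- pair_residuals %>%
  arrange(desc(abs(resid))) %>%  # largest-magnitude residuals first
  slice(1:100)                   # keep the 100 most divergent words for this pair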

LANGS <- c("ARA", "BEN", "BUL", "CHI", "DUT", "ENG", "FAS", "FRE", "GER", "GRE", "GUJ", "HIN", "IBO", "IND", "ITA", "JPN", "KAN", "KOR", "MAL", "MAR", "NEP", "PAN", "POL", "POR", "RUM", "RUS", "SPA", "TAM", "TEL", "TGL", "THA", "TUR", "URD", "VIE", "YOR")

PATH_PREFIX_READ <- "../../../data/processed/residuals/low_words_by_lang/"

FS <- 10
# Read in words that are most divergent for each language
all_words <- map_df(LANGS, function(x) {
  read_feather(paste0(PATH_PREFIX_READ, x, "_divergent_words_low.feather")) %>%
    mutate(main_lang = x)
}) %>%
  as.data.table() %>%
  mutate(word = total.word)

# For each lang, all the most divergent words
all_words_c <- all_words %>%
  mutate_if(is.character, as.factor) %>%
  select(main_lang, word)
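
A quick sanity check on the shape of `all_words_c` (not part of the original analysis): with roughly 100 divergent words per language pair and 34 pairs per language, each `main_lang` should have on the order of 34 * 100 = 3400 rows.

# Rough sanity check (assumes ~100 divergent words per language pair):
all_words_c %>%
  count(main_lang) %>%
  arrange(n)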

### Top 4 most frequent divergent words across language pairs for each language

N_PAIRS <- 34  # each language is paired with the 34 other languages
all_words_c %>%
  group_by(main_lang, word) %>%
  summarize(num_lang_pairs = n()) %>%
  mutate(log_num_lang_pairs = log(num_lang_pairs)) %>%
  arrange(main_lang, -log_num_lang_pairs) %>%
  mutate(rank = 1:n()) %>%
  slice(1:4) %>%
  mutate(prop_pairs = round(num_lang_pairs/N_PAIRS,2)) %>%
  select(main_lang, rank, word, num_lang_pairs, prop_pairs) %>%
  kable("html", caption = "prop_pairs = proportion of language pairs (out of 34) for which this word was in the top 100 most divergent words.") %>%
  kable_styling(font_size = FS, full_width = FALSE, bootstrap_options = "condensed")
prop_pairs = proportion of language pairs (out of 34) for which this word was in the top 100 most divergent words.
main_lang rank word num_lang_pairs prop_pairs
ARA 1 provided 16 0.47
ARA 2 aim 13 0.38
ARA 3 improved 12 0.35
ARA 4 paper 12 0.35
BEN 1 th 16 0.47
BEN 2 discussed 12 0.35
BEN 3 educational 12 0.35
BEN 4 price 12 0.35
BUL 1 channels 20 0.59
BUL 2 media 14 0.41
BUL 3 natural 14 0.41
BUL 4 paper 14 0.41
CHI 1 types 17 0.50
CHI 2 modern 13 0.38
CHI 3 america 11 0.32
CHI 4 oil 11 0.32
DUT 1 violence 22 0.65
DUT 2 media 20 0.59
DUT 3 modern 19 0.56
DUT 4 shopping 19 0.56
ENG 1 law 21 0.62
ENG 2 shop 20 0.59
ENG 3 aim 17 0.50
ENG 4 heart 17 0.50
FAS 1 century 13 0.38
FAS 2 modern 12 0.35
FAS 3 careful 11 0.32
FAS 4 paper 11 0.32
FRE 1 mentioned 16 0.47
FRE 2 developing 13 0.38
FRE 3 popular 13 0.38
FRE 4 searching 13 0.38
GER 1 shop 24 0.71
GER 2 recent 20 0.59
GER 3 site 18 0.53
GER 4 science 17 0.50
GRE 1 week 15 0.44
GRE 2 material 14 0.41
GRE 3 satisfied 14 0.41
GRE 4 facilities 13 0.38
GUJ 1 huge 14 0.41
GUJ 2 industry 14 0.41
GUJ 3 poor 14 0.41
GUJ 4 background 13 0.38
HIN 1 complex 20 0.59
HIN 2 lived 18 0.53
HIN 3 provided 18 0.53
HIN 4 etc 16 0.47
IBO 1 heart 20 0.59
IBO 2 imagination 20 0.59
IBO 3 open 17 0.50
IBO 4 development 15 0.44
IND 1 expect 18 0.53
IND 2 express 15 0.44
IND 3 called 14 0.41
IND 4 economy 14 0.41
ITA 1 generations 20 0.59
ITA 2 generation 19 0.56
ITA 3 last 18 0.53
ITA 4 town 17 0.50
JPN 1 months 25 0.74
JPN 2 starting 20 0.59
JPN 3 competition 17 0.50
JPN 4 law 16 0.47
KAN 1 machine 29 0.85
KAN 2 include 22 0.65
KAN 3 told 21 0.62
KAN 4 character 20 0.59
KOR 1 called 22 0.65
KOR 2 leave 20 0.59
KOR 3 educational 19 0.56
KOR 4 increasing 19 0.56
MAL 1 developed 23 0.68
MAL 2 train 23 0.68
MAL 3 decide 22 0.65
MAL 4 pressure 22 0.65
MAR 1 month 23 0.68
MAR 2 serious 22 0.65
MAR 3 changed 21 0.62
MAR 4 chance 20 0.59
NEP 1 clothes 21 0.62
NEP 2 shopping 21 0.62
NEP 3 th 21 0.62
NEP 4 local 20 0.59
PAN 1 experienced 25 0.74
PAN 2 against 24 0.71
PAN 3 tell 24 0.71
PAN 4 computers 22 0.65
POL 1 energy 29 0.85
POL 2 again 24 0.71
POL 3 conditions 24 0.71
POL 4 late 24 0.71
POR 1 various 28 0.82
POR 2 modern 25 0.74
POR 3 number 25 0.74
POR 4 present 25 0.74
RUM 1 against 27 0.79
RUM 2 older 26 0.76
RUM 3 sure 25 0.74
RUM 4 again 24 0.71
RUS 1 provided 28 0.82
RUS 2 strongly 26 0.76
RUS 3 technological 26 0.76
RUS 4 educational 25 0.74
SPA 1 developed 28 0.82
SPA 2 data 27 0.79
SPA 3 strongly 27 0.79
SPA 4 five 24 0.71
TAM 1 lived 28 0.82
TAM 2 playing 28 0.82
TAM 3 allowed 27 0.79
TAM 4 countries 27 0.79
TEL 1 against 29 0.85
TEL 2 computers 28 0.82
TEL 3 modern 27 0.79
TEL 4 newspaper 27 0.79
TGL 1 above 32 0.94
TGL 2 accept 29 0.85
TGL 3 completely 29 0.85
TGL 4 creative 29 0.85
THA 1 back 30 0.88
THA 2 energy 30 0.88
THA 3 called 29 0.85
THA 4 loose 28 0.82
TUR 1 developed 32 0.94
TUR 2 phone 32 0.94
TUR 3 increasing 31 0.91
TUR 4 modern 31 0.91
URD 1 believe 31 0.91
URD 2 point 31 0.91
URD 3 doctor 30 0.88
URD 4 fact 30 0.88
VIE 1 media 34 1.00
VIE 2 development 33 0.97
VIE 3 factors 33 0.97
VIE 4 abroad 32 0.94
YOR 1 book 34 1.00
YOR 2 company 34 1.00
YOR 3 few 34 1.00
YOR 4 teach 34 1.00

### Top 50 most frequent divergent words overall

all_words_c %>%
  count(word) %>%
  rename(num_lang_pairs = n) %>%
  arrange(-num_lang_pairs) %>%
  mutate(num_lang_pairs = num_lang_pairs/2) %>% # each unordered language pair can be counted twice (once from each language's list), so halve
  slice(1:50) %>%
  kable("html")%>%
  kable_styling(font_size = FS, full_width = FALSE, bootstrap_options = "condensed")
word num_lang_pairs
modern 181
developed 178
media 176
various 175
technological 169
number 168
increasing 149
provided 137
types 137
paper 130
water 129
etc 128
th 127
development 125
went 125
population 124
strongly 121
available 119
called 119
educational 119
facilities 118
generation 117
shopping 117
variety 113
century 112
advanced 111
against 111
science 111
speed 110
industry 109
middle 109
recent 109
natural 108
transportation 108
air 106
invention 106
phone 106
energy 105
increased 105
changed 104
states 103
especially 102
generally 101
kid 101
resources 100
above 98
back 98
clothes 97
disagree 97
involve 97

### Top 50 most consistent words across languages

all_words_c %>%
  group_by(main_lang, word) %>%
  slice(1) %>%
  ungroup() %>%
  count(word) %>%
  arrange(-n) %>%
  slice(1:50) %>%
  kable("html")%>%
  kable_styling(font_size = FS, full_width = FALSE, bootstrap_options = "condensed")

### Top 100 words with the highest spread across languages

(i.e., the divergence of these words holds across many of the languages)
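
To get a feel for the `spread` statistic used below, `sum(n) / sd(n)`, here is a toy example with made-up counts: a word that is divergent for many languages with similar counts scores much higher than a word whose counts are concentrated in one language.

# Toy illustration of spread = sum(n) / sd(n), with made-up counts:
even_counts   <- c(10, 11, 9, 10, 10)  # divergent for several languages, evenly
uneven_counts <- c(46, 1, 1, 1, 1)     # same total, concentrated in one language
sum(even_counts) / sd(even_counts)     # ~71: high total, low variability
sum(uneven_counts) / sd(uneven_counts) # ~2.5: same total, high variability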

all_words_c %>%
  count(word, main_lang) %>%
  group_by(word) %>%
  summarize(spread = sum(n)/sd(n)) %>%
  filter(spread != Inf) %>%
  arrange(-spread) %>%
  slice(1:100) %>%
  kable("html") %>%
  kable_styling(font_size = FS, full_width = FALSE, bootstrap_options = "condensed")
word spread
various 57.08764
technological 50.85251
century 50.25028
modern 48.88262
number 48.08088
available 47.19974
etc 45.37378
developed 44.74997
went 44.47537
population 43.97827
facilities 43.57134
provided 43.42395
global 43.28749
paper 43.26554
natural 43.00072
media 42.79854
invention 42.61014
careful 42.46397
th 42.06093
increasing 41.71450
science 41.70933
developing 41.28724
types 41.26356
electronic 41.25984
points 40.16072
educational 39.98879
oil 39.70930
current 39.68739
economic 39.56935
generation 39.50515
air 39.34714
water 38.79554
speed 38.64012
variety 38.05685
states 37.77566
advanced 37.57940
increased 37.36877
didn 37.20337
hundred 36.98098
progress 36.88902
forced 36.31112
told 36.23788
development 35.88392
parts 35.77889
huge 35.66149
industry 35.61746
called 35.46701
changed 35.45856
market 35.43514
growth 35.38078
environment 34.75355
recent 34.73775
distance 34.60821
week 34.60095
importance 34.51603
top 34.50431
ask 34.31121
creating 34.16928
strongly 34.08225
era 33.96096
disagree 33.94405
communication 33.77184
values 33.61746
constant 33.48938
cities 33.32262
back 33.32100
saw 33.28898
transportation 33.28435
terms 33.21367
agree 33.17768
related 33.10299
involve 33.01384
conditions 32.99955
list 32.96452
opinion 32.89653
hold 32.84815
newspaper 32.84124
schedule 32.81750
web 32.79649
decades 32.78417
economy 32.67931
above 32.58160
particular 32.54542
mass 32.45874
tell 32.39962
newspapers 32.29353
mother 32.15918
complex 31.90274
popular 31.88721
everywhere 31.75426
earlier 31.72613
especially 31.45977
cars 31.45842
america 31.44051
clothes 31.39277
condition 31.35496
interaction 31.34651
recently 31.30959
asked 31.27162
road 31.24491

good_word_df <- all_words_c %>%
  count(word, main_lang) %>%
  group_by(word) %>%
  summarize(spread = sum(n)/sd(n)) %>%
  filter(spread != Inf) %>%
  arrange(-spread) %>%
  slice(1:500)

words_freq_by_lang <- all_words_c %>%
  inner_join(good_word_df) %>% # get only the words we care about
  count(main_lang, word)  %>% # count the frequency of these words in each language
  mutate(log_n = log(n)) %>% #take the log 
  select(-n)

data_for_plotly <- words_freq_by_lang %>%
  spread(main_lang, log_n) %>%
  data.frame()

data_for_plotly[is.na(data_for_plotly)] <- 0
rownames(data_for_plotly) <- data_for_plotly$word
data_for_plotly$word <- NULL

heatmaply::heatmaply(scale(data_for_plotly), 
                     margins = c(54,90,0,0), 
                     fontsize_row = 5, 
                     column_text_angle = 90)

Heat map of words by language. Color indicates the scaled log number of language pairs for which a word was one of the top 100 most divergent words for a given language. Yellow indicates that a word is unusual in that language relative to the other languages.
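
As a side note on how the color scale arises: `scale()` standardizes each language column of the word-by-language matrix, so the plotted values are z-scores of the log pair counts within each language. The check below (reusing the `data_for_plotly` built above) just makes that explicit.

# scale() works column-wise: each language column is centered and divided by its
# standard deviation, so the heat map cells are within-language z-scores.
scaled_mat <- scale(data_for_plotly)
round(colMeans(scaled_mat), 10)  # approximately 0 for every language column
apply(scaled_mat, 2, sd)         # approximately 1 for every language column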

good_word_df2 <- all_words_c %>%
  group_by(main_lang, word) %>%
  slice(1) %>%
  ungroup() %>%
  count(word) %>%
  arrange(-n) %>%
  slice(1:500) 

words_freq_by_lang <- all_words_c %>%
  inner_join(good_word_df2 %>% select(word)) %>% # keep only the words we care about
  count(main_lang, word) %>%  # count the frequency of these words in each language
  mutate(log_n = log(n)) %>%  # take the log
  select(-n)

data_for_plotly <- words_freq_by_lang %>%
  spread(main_lang, log_n) %>%
  data.frame()

data_for_plotly[is.na(data_for_plotly)] <- 0
rownames(data_for_plotly) <- data_for_plotly$word
data_for_plotly$word <- NULL

heatmaply::heatmaply(scale(data_for_plotly), 
                     margins = c(54,90,0,0), 
                     fontsize_row = 5, 
                     column_text_angle = 90,
                     showticklabels = c(TRUE, FALSE))