Divergent words = the words with the largest residuals.
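For concreteness, here is a minimal sketch of that selection step; `pair_residuals` and its `resid` column are hypothetical placeholders for the per-word residuals computed upstream for a single language pair, and whether "largest" means largest magnitude or the most negative ("low") residuals is decided in that earlier processing step.

# Hypothetical sketch: pair_residuals has one row per word for a single
# language pair, with `resid` holding that word's residual from the upstream model.
top_divergent <- pair_residuals %>%
  arrange(desc(abs(resid))) %>%  # largest-magnitude residuals first
  slice(1:100)                   # keep the 100 most divergent words for this pair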

LANGS <- c("ARA", "BEN", "BUL", "CHI", "DUT", "ENG", "FAS", "FRE", "GER", "GRE", "GUJ", "HIN", "IBO", "IND", "ITA", "JPN", "KAN", "KOR", "MAL", "MAR", "NEP", "PAN", "POL", "POR", "RUM", "RUS", "SPA", "TAM", "TEL", "TGL", "THA", "TUR", "URD", "VIE", "YOR")

PATH_PREFIX_READ <- "../../../data/processed/residuals/low_words_by_lang/"

FS <- 10
# Read in words that are most divergent for each language
all_words <- map_df(LANGS, function(x) {
  read_feather(paste0(PATH_PREFIX_READ, x, "_divergent_words_low.feather")) %>%
    mutate(main_lang = x)
}) %>%
  as.data.table() %>%
  mutate(word = total.word)

# For each lang, all the most divergent words
all_words_c <- all_words %>%
  mutate_if(is.character, as.factor) %>%
  select(main_lang, word)
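
A quick sanity check on the shape of `all_words_c` (not part of the original analysis): with roughly 100 divergent words per language pair and 34 pairs per language, each `main_lang` should have on the order of 34 * 100 = 3400 rows.

# Rough sanity check (assumes ~100 divergent words per language pair):
all_words_c %>%
  count(main_lang) %>%
  arrange(n)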

### Top 4 most frequent divergent words across language pairs for each language

N_PAIRS <- 34  # each language is paired with the 34 other languages
all_words_c %>%
  group_by(main_lang, word) %>%
  summarize(num_lang_pairs = n()) %>%
  mutate(log_num_lang_pairs = log(num_lang_pairs)) %>%
  arrange(main_lang, -log_num_lang_pairs) %>%
  mutate(rank = 1:n()) %>%
  slice(1:4) %>%
  mutate(prop_pairs = round(num_lang_pairs/N_PAIRS,2)) %>%
  select(main_lang, rank, word, num_lang_pairs, prop_pairs) %>%
  kable("html", caption = "prop_pairs = proportion of language pairs (out of 34) for which this word was in the top 100 most divergent words.") %>%
  kable_styling(font_size = FS, full_width = FALSE, bootstrap_options = "condensed")
prop_pairs = proportion of language pairs (out of 34) for which this word was in the top 100 most divergent words.
main_lang rank word num_lang_pairs prop_pairs
ARA 1 provided 16 0.47
ARA 2 aim 13 0.38
ARA 3 improved 12 0.35
ARA 4 paper 12 0.35
BEN 1 th 16 0.47
BEN 2 discussed 12 0.35
BEN 3 educational 12 0.35
BEN 4 price 12 0.35
BUL 1 channels 20 0.59
BUL 2 media 14 0.41
BUL 3 natural 14 0.41
BUL 4 paper 14 0.41
CHI 1 types 17 0.50
CHI 2 modern 13 0.38
CHI 3 america 11 0.32
CHI 4 oil 11 0.32
DUT 1 violence 22 0.65
DUT 2 media 20 0.59
DUT 3 modern 19 0.56
DUT 4 shopping 19 0.56
ENG 1 law 21 0.62
ENG 2 shop 20 0.59
ENG 3 aim 17 0.50
ENG 4 heart 17 0.50
FAS 1 century 13 0.38
FAS 2 modern 12 0.35
FAS 3 careful 11 0.32
FAS 4 paper 11 0.32
FRE 1 mentioned 16 0.47
FRE 2 developing 13 0.38
FRE 3 popular 13 0.38
FRE 4 searching 13 0.38
GER 1 shop 24 0.71
GER 2 recent 20 0.59
GER 3 site 18 0.53
GER 4 science 17 0.50
GRE 1 week 15 0.44
GRE 2 material 14 0.41
GRE 3 satisfied 14 0.41
GRE 4 facilities 13 0.38
GUJ 1 huge 14 0.41
GUJ 2 industry 14 0.41
GUJ 3 poor 14 0.41
GUJ 4 background 13 0.38
HIN 1 complex 20 0.59
HIN 2 lived 18 0.53
HIN 3 provided 18 0.53
HIN 4 etc 16 0.47
IBO 1 heart 20 0.59
IBO 2 imagination 20 0.59
IBO 3 open 17 0.50
IBO 4 development 15 0.44
IND 1 expect 18 0.53
IND 2 express 15 0.44
IND 3 called 14 0.41
IND 4 economy 14 0.41
ITA 1 generations 20 0.59
ITA 2 generation 19 0.56
ITA 3 last 18 0.53
ITA 4 town 17 0.50
JPN 1 months 25 0.74
JPN 2 starting 20 0.59
JPN 3 competition 17 0.50
JPN 4 law 16 0.47
KAN 1 machine 29 0.85
KAN 2 include 22 0.65
KAN 3 told 21 0.62
KAN 4 character 20 0.59
KOR 1 called 22 0.65
KOR 2 leave 20 0.59
KOR 3 educational 19 0.56
KOR 4 increasing 19 0.56
MAL 1 developed 23 0.68
MAL 2 train 23 0.68
MAL 3 decide 22 0.65
MAL 4 pressure 22 0.65
MAR 1 month 23 0.68
MAR 2 serious 22 0.65
MAR 3 changed 21 0.62
MAR 4 chance 20 0.59
NEP 1 clothes 21 0.62
NEP 2 shopping 21 0.62
NEP 3 th 21 0.62
NEP 4 local 20 0.59
PAN 1 experienced 25 0.74
PAN 2 against 24 0.71
PAN 3 tell 24 0.71
PAN 4 computers 22 0.65
POL 1 energy 29 0.85
POL 2 again 24 0.71
POL 3 conditions 24 0.71
POL 4 late 24 0.71
POR 1 various 28 0.82
POR 2 modern 25 0.74
POR 3 number 25 0.74
POR 4 present 25 0.74
RUM 1 against 27 0.79
RUM 2 older 26 0.76
RUM 3 sure 25 0.74
RUM 4 again 24 0.71
RUS 1 provided 28 0.82
RUS 2 strongly 26 0.76
RUS 3 technological 26 0.76
RUS 4 educational 25 0.74
SPA 1 developed 28 0.82
SPA 2 data 27 0.79
SPA 3 strongly 27 0.79
SPA 4 five 24 0.71
TAM 1 lived 28 0.82
TAM 2 playing 28 0.82
TAM 3 allowed 27 0.79
TAM 4 countries 27 0.79
TEL 1 against 29 0.85
TEL 2 computers 28 0.82
TEL 3 modern 27 0.79
TEL 4 newspaper 27 0.79
TGL 1 above 32 0.94
TGL 2 accept 29 0.85
TGL 3 completely 29 0.85
TGL 4 creative 29 0.85
THA 1 back 30 0.88
THA 2 energy 30 0.88
THA 3 called 29 0.85
THA 4 loose 28 0.82
TUR 1 developed 32 0.94
TUR 2 phone 32 0.94
TUR 3 increasing 31 0.91
TUR 4 modern 31 0.91
URD 1 believe 31 0.91
URD 2 point 31 0.91
URD 3 doctor 30 0.88
URD 4 fact 30 0.88
VIE 1 media 34 1.00
VIE 2 development 33 0.97
VIE 3 factors 33 0.97
VIE 4 abroad 32 0.94
YOR 1 book 34 1.00
YOR 2 company 34 1.00
YOR 3 few 34 1.00
YOR 4 teach 34 1.00

### Top 50 most frequent divergent words overall

all_words_c %>%
  count(word) %>%
  rename(num_lang_pairs = n) %>%
  arrange(-num_lang_pairs) %>%
  mutate(num_lang_pairs = num_lang_pairs/2) %>% # each unordered language pair can be counted twice (once from each language's list), so halve
  slice(1:50) %>%
  kable("html")%>%
  kable_styling(font_size = FS, full_width = FALSE, bootstrap_options = "condensed")
word num_lang_pairs
modern 181
developed 178
media 176
various 175
technological 169
number 168
increasing 149
provided 137
types 137
paper 130
water 129
etc 128
th 127
development 125
went 125
population 124
strongly 121
available 119
called 119
educational 119
facilities 118
generation 117
shopping 117
variety 113
century 112
advanced 111
against 111
science 111
speed 110
industry 109
middle 109
recent 109
natural 108
transportation 108
air 106
invention 106
phone 106
energy 105
increased 105
changed 104
states 103
especially 102
generally 101
kid 101
resources 100
above 98
back 98
clothes 97
disagree 97
involve 97

### Top 50 most consistent words across languages

all_words_c %>%
  group_by(main_lang, word) %>%
  slice(1) %>%
  ungroup() %>%
  count(word) %>%
  arrange(-n) %>%
  slice(1:50) %>%
  kable("html")%>%
  kable_styling(font_size = FS, full_width = FALSE, bootstrap_options = "condensed")

### Top 100 words with the highest spread across languages

(i.e., the divergence of these words holds across many of the languages)
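
To get a feel for the `spread` statistic used below, `sum(n) / sd(n)`, here is a toy example with made-up counts: a word that is divergent for many languages with similar counts scores much higher than a word whose counts are concentrated in one language.

# Toy illustration of spread = sum(n) / sd(n), with made-up counts:
even_counts   <- c(10, 11, 9, 10, 10)  # divergent for several languages, evenly
uneven_counts <- c(46, 1, 1, 1, 1)     # same total, concentrated in one language
sum(even_counts) / sd(even_counts)     # ~71: high total, low variability
sum(uneven_counts) / sd(uneven_counts) # ~2.5: same total, high variability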

all_words_c %>%
  count(word, main_lang) %>%
  group_by(word) %>%
  summarize(spread = sum(n)/sd(n)) %>%
  filter(spread != Inf) %>%
  arrange(-spread) %>%
  slice(1:100) %>%
  kable("html") %>%
  kable_styling(font_size = FS, full_width = FALSE, bootstrap_options = "condensed")
word spread
various 57.08764
technological 50.85251
century 50.25028
modern 48.88262
number 48.08088
available 47.19974
etc 45.37378
developed 44.74997
went 44.47537
population 43.97827
facilities 43.57134
provided 43.42395
global 43.28749
paper 43.26554
natural 43.00072
media 42.79854
invention 42.61014
careful 42.46397
th 42.06093
increasing 41.71450
science 41.70933
developing 41.28724
types 41.26356
electronic 41.25984
points 40.16072
educational 39.98879
oil 39.70930
current 39.68739
economic 39.56935
generation 39.50515
air 39.34714
water 38.79554
speed 38.64012
variety 38.05685
states 37.77566
advanced 37.57940
increased 37.36877
didn 37.20337
hundred 36.98098
progress 36.88902
forced 36.31112
told 36.23788
development 35.88392
parts 35.77889
huge 35.66149
industry 35.61746
called 35.46701
changed 35.45856
market 35.43514
growth 35.38078
environment 34.75355
recent 34.73775
distance 34.60821
week 34.60095
importance 34.51603
top 34.50431
ask 34.31121
creating 34.16928
strongly 34.08225
era 33.96096
disagree 33.94405
communication 33.77184
values 33.61746
constant 33.48938
cities 33.32262
back 33.32100
saw 33.28898
transportation 33.28435
terms 33.21367
agree 33.17768
related 33.10299
involve 33.01384
conditions 32.99955
list 32.96452
opinion 32.89653
hold 32.84815
newspaper 32.84124
schedule 32.81750
web 32.79649
decades 32.78417
economy 32.67931
above 32.58160
particular 32.54542
mass 32.45874
tell 32.39962
newspapers 32.29353
mother 32.15918
complex 31.90274
popular 31.88721
everywhere 31.75426
earlier 31.72613
especially 31.45977
cars 31.45842
america 31.44051
clothes 31.39277
condition 31.35496
interaction 31.34651
recently 31.30959
asked 31.27162
road 31.24491

good_word_df <- all_words_c %>%
  count(word, main_lang) %>%
  group_by(word) %>%
  summarize(spread = sum(n)/sd(n)) %>%
  filter(spread != Inf) %>%
  arrange(-spread) %>%
  slice(1:500)

words_freq_by_lang <- all_words_c %>%
  inner_join(good_word_df) %>% # get only the words we care about
  count(main_lang, word)  %>% # count the frequency of these words in each language
  mutate(log_n = log(n)) %>% #take the log 
  select(-n)

data_for_plotly <- words_freq_by_lang %>%
  spread(main_lang, log_n) %>%
  data.frame()

data_for_plotly[is.na(data_for_plotly)] <- 0
rownames(data_for_plotly) <- data_for_plotly$word
data_for_plotly$word <- NULL

heatmaply::heatmaply(scale(data_for_plotly), 
                     margins = c(54,90,0,0), 
                     fontsize_row = 5, 
                     column_text_angle = 90)

Heat map of words by language. Color indicates the scaled log number of language pairs for which a word was one of the top 100 most divergent words for a given language. Yellow indicates that a word is unusual in that language relative to the other languages.
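
As a side note on how the color scale arises: `scale()` standardizes each language column of the word-by-language matrix, so the plotted values are z-scores of the log pair counts within each language. The check below (reusing the `data_for_plotly` built above) just makes that explicit.

# scale() works column-wise: each language column is centered and divided by its
# standard deviation, so the heat map cells are within-language z-scores.
scaled_mat <- scale(data_for_plotly)
round(colMeans(scaled_mat), 10)  # approximately 0 for every language column
apply(scaled_mat, 2, sd)         # approximately 1 for every language column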

good_word_df2 <- all_words_c %>%
  group_by(main_lang, word) %>%
  slice(1) %>%
  ungroup() %>%
  count(word) %>%
  arrange(-n) %>%
  slice(1:500) 

words_freq_by_lang <- all_words_c %>%
  inner_join(good_word_df2 %>% select(word)) %>% # keep only the words we care about
  count(main_lang, word) %>%  # count the frequency of these words in each language
  mutate(log_n = log(n)) %>%  # take the log
  select(-n)

data_for_plotly <- words_freq_by_lang %>%
  spread(main_lang, log_n) %>%
  data.frame()

data_for_plotly[is.na(data_for_plotly)] <- 0
rownames(data_for_plotly) <- data_for_plotly$word
data_for_plotly$word <- NULL

heatmaply::heatmaply(scale(data_for_plotly), 
                     margins = c(54,90,0,0), 
                     fontsize_row = 5, 
                     column_text_angle = 90,
                     showticklabels = c(TRUE, FALSE))