Divergent words = words with biggest residuals
# Three-letter codes of the languages whose divergent-word files are read below.
LANGS <- c(
  "ARA", "BEN", "BUL", "CHI", "DUT", "ENG", "FAS",
  "FRE", "GER", "GRE", "GUJ", "HIN", "IBO", "IND",
  "ITA", "JPN", "KAN", "KOR", "MAL", "MAR", "NEP",
  "PAN", "POL", "POR", "RUM", "RUS", "SPA", "TAM",
  "TEL", "TGL", "THA", "TUR", "URD", "VIE", "YOR"
)

# Directory (relative to this document) holding one feather file per language.
PATH_PREFIX_READ <- "../../../data/processed/residuals/low_words_by_lang/"

# Font size passed to kable_styling() for every table in this document.
FS <- 10
# Load each language's feather file of most-divergent words, tag every row
# with the language the file belongs to, and stack the results into one table.
all_words <- LANGS %>%
  map_df(function(lang) {
    feather_path <- paste0(
      PATH_PREFIX_READ, lang, "_divergent_words_low.feather"
    )
    lang_words <- read_feather(feather_path)
    # map_df() row-binds the per-language pieces itself.
    mutate(lang_words, main_lang = lang)
  }) %>%
  as.data.table() %>%
  # Expose the `total.word` column under the shorter name used downstream.
  mutate(word = total.word)
# Slimmed-down copy of the divergent-word table: one row per occurrence,
# keeping only the language and the word, with character columns as factors.
all_words_c <- all_words %>%
  select(main_lang, word) %>%
  mutate_if(is.character, as.factor)
# Denominator for `prop_pairs`: the number of language pairs each language
# takes part in.
# NOTE(review): presumably length(LANGS) - 1; kept as a literal because the
# pairs come from the precomputed feather files, not from LANGS -- confirm.
N_PAIRS <- 34

# Per-language table of the four words that appear in the divergent-word
# lists of the most language pairs.
all_words_c %>%
  group_by(main_lang, word) %>%
  summarize(num_lang_pairs = n()) %>%
  # summarize() peels off only the innermost grouping (`word`), so the
  # ranking and slicing below still run within each `main_lang`.
  arrange(main_lang, desc(num_lang_pairs)) %>%
  mutate(rank = row_number()) %>%
  slice(1:4) %>%
  mutate(prop_pairs = round(num_lang_pairs / N_PAIRS, 2)) %>%
  select(main_lang, rank, word, num_lang_pairs, prop_pairs) %>%
  ungroup() %>%
  kable(
    "html",
    # Built from N_PAIRS so the caption cannot drift from the denominator.
    caption = paste0(
      "prop_pairs = proportion of language pairs (out of ", N_PAIRS,
      ") for which this word was in the top 100 most divergent words."
    )
  ) %>%
  kable_styling(
    font_size = FS,
    full_width = FALSE,
    bootstrap_options = "condensed"
  )
| main_lang | rank | word | num_lang_pairs | prop_pairs |
|---|---|---|---|---|
| ARA | 1 | provided | 16 | 0.47 |
| ARA | 2 | aim | 13 | 0.38 |
| ARA | 3 | improved | 12 | 0.35 |
| ARA | 4 | paper | 12 | 0.35 |
| BEN | 1 | th | 16 | 0.47 |
| BEN | 2 | discussed | 12 | 0.35 |
| BEN | 3 | educational | 12 | 0.35 |
| BEN | 4 | price | 12 | 0.35 |
| BUL | 1 | channels | 20 | 0.59 |
| BUL | 2 | media | 14 | 0.41 |
| BUL | 3 | natural | 14 | 0.41 |
| BUL | 4 | paper | 14 | 0.41 |
| CHI | 1 | types | 17 | 0.50 |
| CHI | 2 | modern | 13 | 0.38 |
| CHI | 3 | america | 11 | 0.32 |
| CHI | 4 | oil | 11 | 0.32 |
| DUT | 1 | violence | 22 | 0.65 |
| DUT | 2 | media | 20 | 0.59 |
| DUT | 3 | modern | 19 | 0.56 |
| DUT | 4 | shopping | 19 | 0.56 |
| ENG | 1 | law | 21 | 0.62 |
| ENG | 2 | shop | 20 | 0.59 |
| ENG | 3 | aim | 17 | 0.50 |
| ENG | 4 | heart | 17 | 0.50 |
| FAS | 1 | century | 13 | 0.38 |
| FAS | 2 | modern | 12 | 0.35 |
| FAS | 3 | careful | 11 | 0.32 |
| FAS | 4 | paper | 11 | 0.32 |
| FRE | 1 | mentioned | 16 | 0.47 |
| FRE | 2 | developing | 13 | 0.38 |
| FRE | 3 | popular | 13 | 0.38 |
| FRE | 4 | searching | 13 | 0.38 |
| GER | 1 | shop | 24 | 0.71 |
| GER | 2 | recent | 20 | 0.59 |
| GER | 3 | site | 18 | 0.53 |
| GER | 4 | science | 17 | 0.50 |
| GRE | 1 | week | 15 | 0.44 |
| GRE | 2 | material | 14 | 0.41 |
| GRE | 3 | satisfied | 14 | 0.41 |
| GRE | 4 | facilities | 13 | 0.38 |
| GUJ | 1 | huge | 14 | 0.41 |
| GUJ | 2 | industry | 14 | 0.41 |
| GUJ | 3 | poor | 14 | 0.41 |
| GUJ | 4 | background | 13 | 0.38 |
| HIN | 1 | complex | 20 | 0.59 |
| HIN | 2 | lived | 18 | 0.53 |
| HIN | 3 | provided | 18 | 0.53 |
| HIN | 4 | etc | 16 | 0.47 |
| IBO | 1 | heart | 20 | 0.59 |
| IBO | 2 | imagination | 20 | 0.59 |
| IBO | 3 | open | 17 | 0.50 |
| IBO | 4 | development | 15 | 0.44 |
| IND | 1 | expect | 18 | 0.53 |
| IND | 2 | express | 15 | 0.44 |
| IND | 3 | called | 14 | 0.41 |
| IND | 4 | economy | 14 | 0.41 |
| ITA | 1 | generations | 20 | 0.59 |
| ITA | 2 | generation | 19 | 0.56 |
| ITA | 3 | last | 18 | 0.53 |
| ITA | 4 | town | 17 | 0.50 |
| JPN | 1 | months | 25 | 0.74 |
| JPN | 2 | starting | 20 | 0.59 |
| JPN | 3 | competition | 17 | 0.50 |
| JPN | 4 | law | 16 | 0.47 |
| KAN | 1 | machine | 29 | 0.85 |
| KAN | 2 | include | 22 | 0.65 |
| KAN | 3 | told | 21 | 0.62 |
| KAN | 4 | character | 20 | 0.59 |
| KOR | 1 | called | 22 | 0.65 |
| KOR | 2 | leave | 20 | 0.59 |
| KOR | 3 | educational | 19 | 0.56 |
| KOR | 4 | increasing | 19 | 0.56 |
| MAL | 1 | developed | 23 | 0.68 |
| MAL | 2 | train | 23 | 0.68 |
| MAL | 3 | decide | 22 | 0.65 |
| MAL | 4 | pressure | 22 | 0.65 |
| MAR | 1 | month | 23 | 0.68 |
| MAR | 2 | serious | 22 | 0.65 |
| MAR | 3 | changed | 21 | 0.62 |
| MAR | 4 | chance | 20 | 0.59 |
| NEP | 1 | clothes | 21 | 0.62 |
| NEP | 2 | shopping | 21 | 0.62 |
| NEP | 3 | th | 21 | 0.62 |
| NEP | 4 | local | 20 | 0.59 |
| PAN | 1 | experienced | 25 | 0.74 |
| PAN | 2 | against | 24 | 0.71 |
| PAN | 3 | tell | 24 | 0.71 |
| PAN | 4 | computers | 22 | 0.65 |
| POL | 1 | energy | 29 | 0.85 |
| POL | 2 | again | 24 | 0.71 |
| POL | 3 | conditions | 24 | 0.71 |
| POL | 4 | late | 24 | 0.71 |
| POR | 1 | various | 28 | 0.82 |
| POR | 2 | modern | 25 | 0.74 |
| POR | 3 | number | 25 | 0.74 |
| POR | 4 | present | 25 | 0.74 |
| RUM | 1 | against | 27 | 0.79 |
| RUM | 2 | older | 26 | 0.76 |
| RUM | 3 | sure | 25 | 0.74 |
| RUM | 4 | again | 24 | 0.71 |
| RUS | 1 | provided | 28 | 0.82 |
| RUS | 2 | strongly | 26 | 0.76 |
| RUS | 3 | technological | 26 | 0.76 |
| RUS | 4 | educational | 25 | 0.74 |
| SPA | 1 | developed | 28 | 0.82 |
| SPA | 2 | data | 27 | 0.79 |
| SPA | 3 | strongly | 27 | 0.79 |
| SPA | 4 | five | 24 | 0.71 |
| TAM | 1 | lived | 28 | 0.82 |
| TAM | 2 | playing | 28 | 0.82 |
| TAM | 3 | allowed | 27 | 0.79 |
| TAM | 4 | countries | 27 | 0.79 |
| TEL | 1 | against | 29 | 0.85 |
| TEL | 2 | computers | 28 | 0.82 |
| TEL | 3 | modern | 27 | 0.79 |
| TEL | 4 | newspaper | 27 | 0.79 |
| TGL | 1 | above | 32 | 0.94 |
| TGL | 2 | accept | 29 | 0.85 |
| TGL | 3 | completely | 29 | 0.85 |
| TGL | 4 | creative | 29 | 0.85 |
| THA | 1 | back | 30 | 0.88 |
| THA | 2 | energy | 30 | 0.88 |
| THA | 3 | called | 29 | 0.85 |
| THA | 4 | loose | 28 | 0.82 |
| TUR | 1 | developed | 32 | 0.94 |
| TUR | 2 | phone | 32 | 0.94 |
| TUR | 3 | increasing | 31 | 0.91 |
| TUR | 4 | modern | 31 | 0.91 |
| URD | 1 | believe | 31 | 0.91 |
| URD | 2 | point | 31 | 0.91 |
| URD | 3 | doctor | 30 | 0.88 |
| URD | 4 | fact | 30 | 0.88 |
| VIE | 1 | media | 34 | 1.00 |
| VIE | 2 | development | 33 | 0.97 |
| VIE | 3 | factors | 33 | 0.97 |
| VIE | 4 | abroad | 32 | 0.94 |
| YOR | 1 | book | 34 | 1.00 |
| YOR | 2 | company | 34 | 1.00 |
| YOR | 3 | few | 34 | 1.00 |
| YOR | 4 | teach | 34 | 1.00 |
# Overall ranking: the 50 words that occur most often in the pooled
# divergent-word lists, regardless of language.
all_words_c %>%
  count(word) %>%
  rename(num_lang_pairs = n) %>%
  # Raw counts are halved before display.
  # NOTE(review): presumably because each language pair contributes rows for
  # both of its member languages -- confirm against the preprocessing step.
  mutate(num_lang_pairs = num_lang_pairs / 2) %>%
  arrange(desc(num_lang_pairs)) %>%
  slice(seq_len(50)) %>%
  kable("html") %>%
  kable_styling(
    font_size = FS,
    full_width = FALSE,
    bootstrap_options = "condensed"
  )
| word | num_lang_pairs |
|---|---|
| modern | 181 |
| developed | 178 |
| media | 176 |
| various | 175 |
| technological | 169 |
| number | 168 |
| increasing | 149 |
| provided | 137 |
| types | 137 |
| paper | 130 |
| water | 129 |
| etc | 128 |
| th | 127 |
| development | 125 |
| went | 125 |
| population | 124 |
| strongly | 121 |
| available | 119 |
| called | 119 |
| educational | 119 |
| facilities | 118 |
| generation | 117 |
| shopping | 117 |
| variety | 113 |
| century | 112 |
| advanced | 111 |
| against | 111 |
| science | 111 |
| speed | 110 |
| industry | 109 |
| middle | 109 |
| recent | 109 |
| natural | 108 |
| transportation | 108 |
| air | 106 |
| invention | 106 |
| phone | 106 |
| energy | 105 |
| increased | 105 |
| changed | 104 |
| states | 103 |
| especially | 102 |
| generally | 101 |
| kid | 101 |
| resources | 100 |
| above | 98 |
| back | 98 |
| clothes | 97 |
| disagree | 97 |
| involve | 97 |
### Top 50 most consistent words across languages
# For every word, count in how many distinct languages it shows up at least
# once, then list the 50 words present in the most languages.
all_words_c %>%
  # Collapse repeated (language, word) occurrences to a single row each.
  distinct(main_lang, word) %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice(seq_len(50)) %>%
  kable("html") %>%
  kable_styling(
    font_size = FS,
    full_width = FALSE,
    bootstrap_options = "condensed"
  )
(i.e., the divergence of this word holds across many languages)
# Score each word by `spread`: its total number of occurrences divided by the
# standard deviation of its per-language counts. High values mean the word is
# frequent and its counts are similar from language to language.
all_words_c %>%
  count(word, main_lang) %>%
  group_by(word) %>%
  summarize(spread = sum(n) / sd(n)) %>%
  # Removes infinite scores (sd of 0); rows whose score is NA (sd undefined
  # for a word seen in a single language) are dropped by filter() as well.
  filter(spread != Inf) %>%
  arrange(desc(spread)) %>%
  slice(seq_len(100)) %>%
  kable("html") %>%
  kable_styling(
    font_size = FS,
    full_width = FALSE,
    bootstrap_options = "condensed"
  )
| word | spread |
|---|---|
| various | 57.08764 |
| technological | 50.85251 |
| century | 50.25028 |
| modern | 48.88262 |
| number | 48.08088 |
| available | 47.19974 |
| etc | 45.37378 |
| developed | 44.74997 |
| went | 44.47537 |
| population | 43.97827 |
| facilities | 43.57134 |
| provided | 43.42395 |
| global | 43.28749 |
| paper | 43.26554 |
| natural | 43.00072 |
| media | 42.79854 |
| invention | 42.61014 |
| careful | 42.46397 |
| th | 42.06093 |
| increasing | 41.71450 |
| science | 41.70933 |
| developing | 41.28724 |
| types | 41.26356 |
| electronic | 41.25984 |
| points | 40.16072 |
| educational | 39.98879 |
| oil | 39.70930 |
| current | 39.68739 |
| economic | 39.56935 |
| generation | 39.50515 |
| air | 39.34714 |
| water | 38.79554 |
| speed | 38.64012 |
| variety | 38.05685 |
| states | 37.77566 |
| advanced | 37.57940 |
| increased | 37.36877 |
| didn | 37.20337 |
| hundred | 36.98098 |
| progress | 36.88902 |
| forced | 36.31112 |
| told | 36.23788 |
| development | 35.88392 |
| parts | 35.77889 |
| huge | 35.66149 |
| industry | 35.61746 |
| called | 35.46701 |
| changed | 35.45856 |
| market | 35.43514 |
| growth | 35.38078 |
| environment | 34.75355 |
| recent | 34.73775 |
| distance | 34.60821 |
| week | 34.60095 |
| importance | 34.51603 |
| top | 34.50431 |
| ask | 34.31121 |
| creating | 34.16928 |
| strongly | 34.08225 |
| era | 33.96096 |
| disagree | 33.94405 |
| communication | 33.77184 |
| values | 33.61746 |
| constant | 33.48938 |
| cities | 33.32262 |
| back | 33.32100 |
| saw | 33.28898 |
| transportation | 33.28435 |
| terms | 33.21367 |
| agree | 33.17768 |
| related | 33.10299 |
| involve | 33.01384 |
| conditions | 32.99955 |
| list | 32.96452 |
| opinion | 32.89653 |
| hold | 32.84815 |
| newspaper | 32.84124 |
| schedule | 32.81750 |
| web | 32.79649 |
| decades | 32.78417 |
| economy | 32.67931 |
| above | 32.58160 |
| particular | 32.54542 |
| mass | 32.45874 |
| tell | 32.39962 |
| newspapers | 32.29353 |
| mother | 32.15918 |
| complex | 31.90274 |
| popular | 31.88721 |
| everywhere | 31.75426 |
| earlier | 31.72613 |
| especially | 31.45977 |
| cars | 31.45842 |
| america | 31.44051 |
| clothes | 31.39277 |
| condition | 31.35496 |
| interaction | 31.34651 |
| recently | 31.30959 |
| asked | 31.27162 |
| road | 31.24491 |
# The 500 words with the highest `spread` score (total occurrences divided by
# the standard deviation of the per-language counts).
good_word_df <- all_words_c %>%
  count(word, main_lang) %>%
  group_by(word) %>%
  summarize(spread = sum(n) / sd(n)) %>%
  # Drops infinite scores; NA scores are removed by filter() as well.
  filter(spread != Inf) %>%
  arrange(-spread) %>%
  slice(1:500)

# Log frequency of each selected word within each language.
words_freq_by_lang <- all_words_c %>%
  # Filtering join with an explicit key: keeps only the selected words
  # without pulling in `spread` or relying on an implicit natural join.
  semi_join(good_word_df, by = "word") %>%
  count(main_lang, word) %>%
  mutate(log_n = log(n)) %>%
  select(-n)

# Reshape to a word-by-language matrix (rows = words, columns = languages).
data_for_plotly <- words_freq_by_lang %>%
  spread(main_lang, log_n) %>%
  data.frame()

# Words absent from a language get 0.
# NOTE(review): log(1) is also 0, so a word counted exactly once in a language
# is indistinguishable from an absent word after this fill -- confirm intended.
data_for_plotly[is.na(data_for_plotly)] <- 0
rownames(data_for_plotly) <- data_for_plotly$word
data_for_plotly$word <- NULL

# scale() centers and standardizes every language column before plotting.
heatmaply::heatmaply(
  scale(data_for_plotly),
  margins = c(54, 90, 0, 0),
  fontsize_row = 5,
  column_text_angle = 90
)
Heat map of words by language. Color indicates the log-transformed, column-standardized number of language pairs for which a word was one of the top 100 most divergent words for a particular language. Yellow indicates that a word is unusual in that language relative to other languages.
# The 500 words that occur in the largest number of distinct languages.
good_word_df2 <- all_words_c %>%
  group_by(main_lang, word) %>%
  slice(1) %>%
  ungroup() %>%
  count(word) %>%
  arrange(-n) %>%
  slice(1:500)

# Log frequency of each selected word within each language.
words_freq_by_lang <- all_words_c %>%
  # Filtering join with an explicit key. Unlike inner_join(), it does not
  # carry good_word_df2's `n` column into the data, so the count below is
  # always called `n` instead of depending on count()'s version-specific
  # `nn` fallback name.
  semi_join(good_word_df2, by = "word") %>%
  count(main_lang, word) %>%
  mutate(log_n = log(n)) %>%
  select(-n)

# Reshape to a word-by-language matrix (rows = words, columns = languages).
data_for_plotly <- words_freq_by_lang %>%
  spread(main_lang, log_n) %>%
  data.frame()

# Words absent from a language get 0.
# NOTE(review): log(1) is also 0, so a word counted exactly once in a language
# is indistinguishable from an absent word after this fill -- confirm intended.
data_for_plotly[is.na(data_for_plotly)] <- 0
rownames(data_for_plotly) <- data_for_plotly$word
data_for_plotly$word <- NULL

# scale() centers and standardizes every language column before plotting.
heatmaply::heatmaply(
  scale(data_for_plotly),
  margins = c(54, 90, 0, 0),
  fontsize_row = 5,
  column_text_angle = 90,
  showticklabels = c(TRUE, FALSE)
)