GOAL: Identify example languages where (1) the cluster containing the Swadesh words is closer to different abstract clusters across languages, and (2) a given abstract cluster is close to different concrete clusters across languages.
# Swadesh words to look up in the current word sample.
SWADESH_WORDS <- c(
  "ash", "cloud", "day", "dust", "earth", "fire", "lake", "moon",
  "mountain", "night", "river", "salt", "sand", "sea", "sky", "smoke",
  "star", "stone", "sun", "water", "wind", "year"
)

# Word-level table; the code below uses its `word` and `concreteness_tile`
# columns.
all_words <- read_csv("data/target_translations_xling_words.csv")

# Keep the Swadesh words only, reduce to one row per (word, tile) pair,
# and order by tile.
swadesh_tiles <- all_words %>%
  filter(word %in% SWADESH_WORDS) %>%
  distinct(word, concreteness_tile) %>%
  arrange(concreteness_tile)

kable(swadesh_tiles)
| word | concreteness_tile |
|---|---|
| year | 7 |
| day | 8 |
| wind | 8 |
| dust | 9 |
| night | 9 |
| earth | 10 |
| fire | 10 |
| lake | 10 |
| moon | 10 |
| mountain | 10 |
| salt | 10 |
| sea | 10 |
| smoke | 10 |
| star | 10 |
| stone | 10 |
| sun | 10 |

16/22 Swadesh words are present in the current sample of words.
# Pairwise decile distances per language. Column names are passed explicitly,
# so readr treats the first line of the file as data rather than as a header.
INPATH <- "data/decile_pairwise_lang_distances.csv"
dists <- INPATH %>%
  read_csv(col_names = c(
    "lang",
    "concreteness_decile1",
    "concreteness_decile2",
    "mean_cos_dist"
  ))
# Disabled filter that would drop zh, yo, ja and ig:
# filter(!(lang %in% c("zh", "yo", "ja", "ig")))
# Histogram of mean cosine distance pooled over every row of `dists`.
dists %>%
  ggplot(aes(x = mean_cos_dist)) +
  geom_histogram() +
  theme_classic()
# Density of mean cosine distance with one filled curve per language.
# The fill is semi-transparent so that overlapping languages stay visible
# instead of being hidden behind whichever curve is drawn last.
ggplot(dists, aes(x = mean_cos_dist, fill = lang)) +
  # facet_wrap(~lang) +
  geom_density(alpha = .5) +
  theme_classic()
Outliers: zh, yo, ja, ig
Mean across languages of mean decile distances (across words).
# Average the per-language distances for every (decile1, decile2) pair.
mean_dist <- dists %>%
  group_by(concreteness_decile1, concreteness_decile2) %>%
  summarize(mean_cos_dist = mean(mean_cos_dist)) %>%
  ungroup()
# The first decile becomes a factor so it can serve as a discrete
# grouping/colour variable in the plot below.
mean_dist$concreteness_decile1 <- as.factor(mean_dist$concreteness_decile1)

# One line per first decile, traced across the second decile.
mean_dist %>%
  ggplot(aes(
    x = concreteness_decile2,
    y = mean_cos_dist,
    group = concreteness_decile1,
    color = concreteness_decile1
  )) +
  geom_line() +
  geom_point() +
  theme_classic()
# Distance densities in a grid of panels: rows are the first decile,
# columns the second.
ggplot(dists, aes(x = mean_cos_dist)) +
  geom_density() +
  facet_grid(concreteness_decile1 ~ concreteness_decile2) +
  theme_classic()
# Overlaid distance densities for every pair whose first decile is 3,
# labelled "3-<second decile>".
ggplot(
  dists %>%
    filter(concreteness_decile1 == 3) %>%
    mutate(conc_pair = paste(concreteness_decile1,
                             concreteness_decile2,
                             sep = "-")),
  aes(x = mean_cos_dist, fill = conc_pair)
) +
  geom_density(alpha = .5) +
  theme_classic()
Unexpected: the distances for concreteness deciles 2-6 when paired with lower deciles.
# For pairs drawn from the three lowest and three highest deciles that lie
# more than two deciles apart, show the two smallest and two largest
# distances recorded for each pair.
dists %>%
  filter(
    concreteness_decile1 %in% c(1, 2, 3, 8, 9, 10),
    concreteness_decile2 %in% c(1, 2, 3, 8, 9, 10),
    abs(concreteness_decile1 - concreteness_decile2) > 2
  ) %>%
  mutate(conc_pair = paste(concreteness_decile1,
                           concreteness_decile2,
                           sep = "-")) %>%
  select(-c(concreteness_decile1, concreteness_decile2)) %>%
  group_by(conc_pair) %>%
  # Sorting by pair first keeps each group's rows contiguous and ordered by
  # distance, so within-group row numbers index from smallest to largest.
  arrange(conc_pair, mean_cos_dist) %>%
  filter(row_number() <= 2 | row_number() >= n() - 1) %>%
  DT::datatable()
# Browsable table of each distinct (word, tile) pair, lowest tile first.
all_words %>%
  distinct(word, concreteness_tile) %>%
  arrange(concreteness_tile) %>%
  DT::datatable()
Note that these are the translated words that are IN THE MODEL.
# Translations available in the models, restricted to words that occur in
# `all_words` and annotated with each word's concreteness tile.
COMMON_WORDS_OUTPATH <- "data/common_brysbaert_words_across_models.csv"
words_in_models <- read_csv(COMMON_WORDS_OUTPATH,
                            col_names = c("word", "translation", "lang")) %>%
  filter(word %in% all_words$word) %>%
  # Join on `word` explicitly rather than relying on the implicit natural
  # join, so the key is visible and no "Joining, by" message is emitted.
  left_join(all_words %>% select(word, concreteness_tile) %>% distinct(),
            by = "word")
# Dodged bars: number of words per language, one bar per concreteness tile.
# The legend is hidden.
ggplot(
  count(words_in_models, lang, concreteness_tile),
  aes(
    x = lang,
    y = n,
    group = concreteness_tile,
    fill = as.factor(concreteness_tile)
  )
) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_classic() +
  theme(legend.position = "none")
# Mean number of words per concreteness tile, for each language.
mean_ns <- words_in_models %>%
  count(lang, concreteness_tile) %>%
  group_by(lang) %>%
  summarize(mean_n = mean(n))

# Mean decile-pair distance, for each language.
mean_dists <- dists %>%
  group_by(lang) %>%
  summarize(mean_dist = mean(mean_cos_dist))

# Plot mean distance against mean word count with a linear fit, labelling
# each point by language. The join key is given explicitly instead of being
# inferred from shared column names. Because this is a full join, a language
# present in only one of the two tables is kept, with NA in the other column.
full_join(mean_ns, mean_dists, by = "lang") %>%
  ggplot(aes(y = mean_dist, x = mean_n)) +
  geom_text(aes(label = lang)) +
  ggtitle("mean decile distance as a function\nof mean words in decile") +
  geom_smooth(method = "lm") +
  theme_classic()
Maybe?
Number of Wikipedia articles by language (source: https://meta.wikimedia.org/wiki/List_of_Wikipedias, September 2018):
Issues: