GOAL: Identify some example languages where (1) the cluster containing the Swadesh words lies closer to different abstract clusters in different languages, and (2) a given abstract cluster lies close to different concrete clusters in different languages.

Get Swadesh word concreteness deciles

# Swadesh words whose concreteness deciles are looked up in this analysis.
SWADESH_WORDS <- c(
  "ash", "cloud", "day", "dust", "earth", "fire", "lake", "moon",
  "mountain", "night", "river", "salt", "sand", "sea", "sky", "smoke",
  "star", "stone", "sun", "water", "wind", "year"
)

# Load the target words; the columns used below are `word` and
# `concreteness_tile`.
all_words <- read_csv("data/target_translations_xling_words.csv")

# One row per (Swadesh word, concreteness tile) combination found in the
# sample, ordered from the lowest tile to the highest.
swadesh_rows <- filter(all_words, word %in% SWADESH_WORDS)
swadesh_tiles <- swadesh_rows %>%
  distinct(word, concreteness_tile) %>%
  arrange(concreteness_tile)

kable(swadesh_tiles)
word concreteness_tile
year 7
day 8
wind 8
dust 9
night 9
earth 10
fire 10
lake 10
moon 10
mountain 10
salt 10
sea 10
smoke 10
star 10
stone 10
sun 10
16 of the 22 Swadesh words are present in the current sample of words (missing: ash, cloud, river, sand, sky, water).

Get distance between concreteness decile pairs, by language

# Path to the per-language distances between pairs of concreteness deciles.
INPATH <- "data/decile_pairwise_lang_distances.csv"

# Column names are supplied explicitly through `col_names`.
dists <- read_csv(
  file = INPATH,
  col_names = c(
    "lang",
    "concreteness_decile1",
    "concreteness_decile2",
    "mean_cos_dist"
  )
)
  # Disabled step that would drop four languages from the analysis:
  #filter(!(lang %in% c("zh", "yo", "ja", "ig")))

# Histogram of the mean cosine distances, pooling all languages and decile
# pairs.
dists %>%
  ggplot(aes(x = mean_cos_dist)) +
  geom_histogram() +
  theme_classic()

# Density of the mean cosine distances, one filled curve per language.
# The fills are semi-transparent so that overlapping languages stay visible
# instead of being hidden behind whichever curve is drawn last.
ggplot(dists, aes(x = mean_cos_dist, fill = lang)) +
  # facet_wrap(~lang) +
  geom_density(alpha = .5) +
  theme_classic()

Outliers: zh, yo, ja, ig

Sanity check

Mean across languages of mean decile distances (across words).

# Average `mean_cos_dist` over all rows sharing the same
# (concreteness_decile1, concreteness_decile2) combination, then convert
# the first decile to a factor.
by_decile_pair <- group_by(dists, concreteness_decile1, concreteness_decile2)

mean_dist <- by_decile_pair %>%
  summarize(mean_cos_dist = mean(mean_cos_dist)) %>%
  ungroup() %>%
  mutate(concreteness_decile1 = as.factor(concreteness_decile1))

# One line (with points) per first decile, tracing its mean distance to each
# second decile.
mean_dist %>%
  ggplot(
    aes(
      x = concreteness_decile2,
      y = mean_cos_dist,
      group = concreteness_decile1,
      color = concreteness_decile1
    )
  ) +
  geom_line() +
  geom_point() +
  theme_classic()

# Grid of distance densities: one panel per decile combination, with
# concreteness_decile1 on the rows and concreteness_decile2 on the columns.
dists %>%
  ggplot(aes(x = mean_cos_dist)) +
  geom_density() +
  facet_grid(concreteness_decile1 ~ concreteness_decile2) +
  theme_classic()

# Distance densities for every pair whose first decile is 3, with one
# semi-transparent fill per pair label (e.g. "3-1").
decile3_pairs <- dists %>%
  filter(concreteness_decile1 == 3) %>%
  mutate(
    conc_pair = paste(concreteness_decile1, concreteness_decile2, sep = "-")
  )

ggplot(decile3_pairs, aes(x = mean_cos_dist, fill = conc_pair)) +
  geom_density(alpha = .5) +
  theme_classic()

Unexpected: the pattern for concreteness deciles 2-6 when paired with lesser deciles.

Specific language concreteness decile pairs

# For pairs made of one low decile (1-3) and one high decile (8-10), show the
# two rows with the smallest and the two rows with the largest mean distance
# within each pair label.
extreme_deciles <- c(1, 2, 3, 8, 9, 10)

dists %>%
  filter(
    concreteness_decile1 %in% extreme_deciles,
    concreteness_decile2 %in% extreme_deciles,
    # a gap above 2 leaves only low-high combinations
    abs(concreteness_decile1 - concreteness_decile2) > 2
  ) %>%
  mutate(
    conc_pair = paste(concreteness_decile1, concreteness_decile2, sep = "-")
  ) %>%
  select(-concreteness_decile1, -concreteness_decile2) %>%
  arrange(conc_pair, mean_cos_dist) %>%
  group_by(conc_pair) %>%
  # first two and last two rows of each group
  filter(row_number() <= 2 | row_number() > n() - 2) %>%
  DT::datatable()

Words by concreteness-tile

# Browsable table of every distinct (word, concreteness tile) combination,
# lowest tile first.
all_words %>%
  distinct(word, concreteness_tile) %>%
  arrange(concreteness_tile) %>%
  DT::datatable()

Are the differences in means due to different numbers of words across languages?

Note that these are the translated words that are IN THE MODEL.

# Path to the file of words (with translation and language) found in the
# models.
COMMON_WORDS_OUTPATH <- "data/common_brysbaert_words_across_models.csv"

# Keep only the words that also occur in `all_words`, then attach each
# word's concreteness tile.
# The join key is named explicitly so the join cannot silently pick up
# another shared column and does not emit a "Joining, by" message.
words_in_models <- read_csv(COMMON_WORDS_OUTPATH, 
                            col_names = c("word", "translation", "lang"))  %>%
  filter(word %in% all_words$word) %>%
  left_join(all_words %>% select(word, concreteness_tile) %>% distinct(),
            by = "word")

# Dodged bars: number of in-model words per language, one bar per
# concreteness tile (legend hidden).
tile_counts <- count(words_in_models, lang, concreteness_tile)

ggplot(
  tile_counts,
  aes(
    x = lang,
    y = n,
    group = concreteness_tile,
    fill = as.factor(concreteness_tile)
  )
) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_classic() +
  theme(legend.position = "none")

# Per language: the mean, across concreteness tiles, of the number of
# in-model words.
words_per_lang_tile <- count(words_in_models, lang, concreteness_tile)
mean_ns <- summarize(
  group_by(words_per_lang_tile, lang),
  mean_n = mean(n)
)

# Per language: the mean of `mean_cos_dist` over all decile pairs.
mean_dists <- summarize(
  group_by(dists, lang),
  mean_dist = mean(mean_cos_dist)
)

# Plot each language's mean decile distance against its mean number of words
# per decile, labelling points by language and overlaying a linear fit.
# The join key is named explicitly instead of relying on an implicit natural
# join over whatever columns the two tables happen to share.
full_join(mean_ns, mean_dists, by = "lang") %>%
  ggplot(aes(y = mean_dist, x = mean_n)) +
  geom_text(aes(label = lang)) +
  ggtitle("mean decile distance as a function\nof mean words in decile") +
  geom_smooth(method = "lm") +
  theme_classic()

Maybe?

Number of Wikipedia articles by language (source: https://meta.wikimedia.org/wiki/List_of_Wikipedias, September 2018):

Issues: