Read in doctag indices, docvecs, and metadata

metadata <- read_csv("../all_data/merged_metadata.csv") 
metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id)) 

essay_word_counts <- read_feather("../all_data/essay_word_counts.feather") 
wc_clean = left_join(essay_word_counts, metadata_clean)

How correlated are the pairwise word frequencies across languages?

word_counts_by_lang <- wc_clean %>%
  group_by(L1_code, word) %>%
  summarize(count = sum(count)) %>%
  ungroup()
total_words <- wc_clean %>%
  group_by(L1_code) %>% 
  summarize(total = sum(count))

word_counts_by_lang <- word_counts_by_lang %>%
                          left_join(total_words) %>%
                          mutate(rf = count/total)

word_counts_by_lang_wide <- word_counts_by_lang %>%
  mutate(rf_log = log(rf)) %>%
  select(L1_code, rf_log, word)  %>%
  spread(L1_code, rf_log)
 
corrs = correlate(word_counts_by_lang_wide[,-1]) %>%
  as.data.frame()
rownames(corrs) = corrs$rowname

ggdendro::ggdendrogram(hclust(dist(corrs))) +
  ggtitle("Pairwise correlations of relative log word frequency")

Are frequency correlations themselves correlated with language distances?

ASJP distance

“These distances are computed on the basis of standardized short wordlists transcribed in a reduced set of symbols using a normalized Levenstein distance”

lang_dists <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/data/supplementary_data/cultural_sim_measures/lang/asjp_dists.csv") %>%
  select(lang1, lang2, asjp_dist) %>%
  mutate(lang1 = toupper(lang1),
         lang2 = toupper(lang2))

long_corrs <- corrs %>%
  gather("lang2", "r", -1) %>%
  rename(lang1 = rowname) %>%
  mutate(lang1 = fct_recode(lang1, ARB = "ARA", DEU = "GER", FRA ="FRE")) %>% # chinese, and telu, and arabic
  mutate(lang2 = fct_recode(lang2, ARB = "ARA", DEU="GER", FRA ="FRE")) %>% # chinese, and telu, and arabic
  left_join(lang_dists) %>%
  #left_join(lang_dists, by= c("lang1"= "lang2", "lang2" = "lang1")) %>%
  distinct()

ggplot(long_corrs, aes(x = asjp_dist, y = r)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()

WALS euclidean distance

“Uses the WALS typological database to compute dis- tances between languages using their feature values.”

wals <- read_csv("/Users/mollylewis/Documents/research/Projects/conceptviz/analyses/R_scripts/wals_for_ETS.csv") %>%
  select(lang1, lang2, wals_euclidean_dist) %>%
    mutate(lang1 = toupper(lang1),
           lang2 = toupper(lang2))

long_corrs_wals <- corrs %>%
  gather("lang2", "r", -1) %>%
  rename(lang1 = rowname) %>%
  left_join(wals) %>%
  distinct()

ggplot(long_corrs_wals, aes(x = wals_euclidean_dist, y = r)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()

Zipf’s law by language

freq_by_rank <- word_counts_by_lang %>% 
  group_by(L1_code) %>% 
  arrange(-count) %>%
  mutate(rank = row_number(), 
         `term frequency` = count/total)

rank_subset <- freq_by_rank %>% 
     filter(rank < 2000,
         rank > 50) 

mod = lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)

freq_by_rank %>% 
   filter(rank < 2000,
         rank > 50) %>%
  ggplot(aes(rank, `term frequency`, color = L1_code)) +
     
    geom_line(size = .8, alpha = 0.8) + 
 geom_abline(intercept = mod$coefficients[1][[1]] , slope = mod$coefficients[2][[1]], color = "black", linetype = 2) +
  scale_x_log10() +
  scale_y_log10() +
  ylab("log normalized term frequency") +
  xlab("log rank")+
  ggtitle("Zipf's law") +
  theme_minimal()

as.list(sample_n(as.data.frame(unique(word_counts_by_lang$word)), 50))
## $`unique(word_counts_by_lang$word)`
##  [1] neigbours         health-related    groundsand       
##  [4] treasuring        adulthood         exhaustion       
##  [7] expositions       physical          specilaizing     
## [10] otherhand         rngkj             specificly       
## [13] non-philosophical decides           scool            
## [16] weight-loss       giuge             advertismente    
## [19] substances        foot-ball         neibourghoods    
## [22] whicj             speciallities     concieved        
## [25] threr             romancing         bundeskanzler    
## [28] conditon          including         traway           
## [31] navigate          ppirt             regardless       
## [34] inindia           manuplate         wideranged       
## [37] puzzeld           econmic           sovles           
## [40] evertything       fines             confidant        
## [43] money-consuming   blockling         tehy             
## [46] soalr             conseme           sin              
## [49] sophosticated     snakes           
## 61382 Levels: _ __ ... zurich

a lot of misspellings - could that be the source of the semantic similarity?

Get binomial-split corpus as in Piantadosi (2014)

Plot residuals

MISC

By gender

By score

By age