# Input file locations, each built with here().
PAGEVIEWS_PATH <- here(
  "analyses/10_wikipedia_stats/data/processed/page_views_by_language.csv"
)
PRODUCITIVY_PATH <- here(
  "analyses/10_wikipedia_stats/data/raw/wiki_productivity.csv"
)
NARTICLES_PATH <- here(
  "analyses/10_wikipedia_stats/data/raw/num_articles.tsv"
)
LANG_NAMES <- here(
  "analyses/04_predicting_semantic_sim/data/lang_distance_metrics/linguistic/data/iso_to_wals_for_ling_dists.csv"
)

# Codes matched against the `wiki` column when the article table is filtered.
WIKI_LANGS <- c(
  "ar", "bn", "bg", "zh", "nl", "en", "fr", "de", "el", "gu",
  "hi", "ig", "id", "it", "ja", "kn", "ko", "ml", "mr", "ne",
  "pa", "pl", "pt", "ro", "ru", "es", "tl", "ta", "te", "th",
  "tr", "ur", "vi", "yo", "fa"
)


# Article table: clean the header names, keep the columns of interest, strip
# the thousands separators from `speakers` before converting it to numeric,
# and keep only the wikis listed in WIKI_LANGS.
narticles <- NARTICLES_PATH %>%
  read_tsv() %>%
  clean_names() %>%
  select(
    language, wiki, articles, speakers, admins, active_users,
    percent_of_admins, depth, speakers_article
  ) %>%
  mutate(speakers = as.numeric(str_remove_all(speakers, ","))) %>%
  filter(wiki %in% WIKI_LANGS)

# Productivity table: only the 2nd and 3rd columns of the file are kept.
# NOTE(review): positional selection; presumably `wiki` plus the productivity
# score, since the result is joined on `wiki` below. Confirm against the CSV.
productivity <- PRODUCITIVY_PATH %>%
  read_csv() %>%
  select(2, 3)

page_views <- read_csv(PAGEVIEWS_PATH)

# Combine the three sources on `wiki`, drop rows without an `iso` value, remove
# the columns not used as measures, and add the log-ratio of article count to
# L1 speaker count.
all_measures <- narticles %>%
  full_join(productivity, by = "wiki") %>%
  full_join(page_views, by = "wiki") %>%
  filter(!is.na(iso)) %>%
  select(-iso, -speakers_article, -percent_of_admins) %>%
  mutate(prop_articles = log(articles) / log(speakers_l1))
#' Correlation test between two columns of a data frame
#'
#' Runs `cor.test()` with its default settings on the columns of `df` named
#' by `m1` and `m2`.
#'
#' @param m1 Name of the first column (a single string).
#' @param m2 Name of the second column (a single string).
#' @param df Data frame or tibble containing both columns.
#' @return A one-row tibble with columns `measure1`, `measure2`,
#'   `simple_cor` (the correlation estimate) and `simple_p` (its p-value).
get_simple_correlation <- function(m1, m2, df) {
  # `[[` extracts each column as a plain vector for data frames and tibbles
  # alike, so no unlist() step (and no reassignable `F`) is needed.
  cor_model <- cor.test(df[[m1]], df[[m2]])

  tibble(
    measure1 = m1,
    measure2 = m2,
    simple_cor = cor_model$estimate,
    simple_p = cor_model$p.value
  )
}

# Every ordered pair of distinct measures.
# Built with base expand.grid() because purrr::cross_df() is deprecated;
# stringsAsFactors = FALSE keeps the names as character, so the former
# mutate_all(as.character) step is no longer required.
# NOTE(review): the measures are picked by column position (3:13) in
# all_measures; confirm this range still matches the intended columns
# whenever the upstream selects/joins change.
measures <- expand.grid(measure1 = names(all_measures)[3:13],
                        measure2 = names(all_measures)[3:13],
                        stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  filter(measure1 != measure2)

# get cors
# One correlation test per pair; estimates are rounded to two decimals and
# those with p < .05 are marked with a trailing "*" in `cor_text`.
all_corrs_table <- map2_df(measures$measure1, measures$measure2,
                           get_simple_correlation, all_measures) %>%
  mutate(sig = simple_p < .05,
         simple_cor = round(simple_cor, 2),
         cor_text = case_when(sig ~ paste0(simple_cor, "*"),
                              TRUE ~ as.character(simple_cor)))
# Reshape the pairwise correlations to wide form (one row per measure1, one
# column per measure2), then put rows and columns in the same fixed order
# and render the result.
table <- all_corrs_table %>%
  select(measure1, measure2, cor_text) %>%
  spread(key = measure2, value = cor_text) %>%
  mutate(
    measure1 = fct_relevel(
      measure1,
      "speakers", "speakers_l1", "articles", "active_users",
      "total_requests", "admins", "depth", "prop_articles",
      "productivity", "prop_requests"
    )
  ) %>%
  arrange(measure1) %>%
  select(
    "measure1",
    "speakers", "speakers_l1", "articles", "active_users",
    "total_requests", "admins", "depth", "prop_articles",
    "productivity", "prop_requests"
  )

kable(table)
measure1 speakers speakers_l1 articles active_users total_requests admins depth prop_articles productivity prop_requests
speakers NA 0.98* 0.36* 0.36* 0.33 0.33 0.35* 0 -0.5 0.14
speakers_l1 0.98* NA 0.46* 0.48* 0.46* 0.45* 0.45* 0 -0.48 0.19
articles 0.36* 0.46* NA 0.9* 0.87* 0.89* 0.69* 0.61* -0.03 0.7*
active_users 0.36* 0.48* 0.9* NA 0.99* 0.99* 0.77* 0.3 -0.08 0.44*
total_requests 0.33 0.46* 0.87* 0.99* NA 0.98* 0.76* 0.26 -0.09 0.4*
admins 0.33 0.45* 0.89* 0.99* 0.98* NA 0.76* 0.3 -0.01 0.43*
depth 0.35* 0.45* 0.69* 0.77* 0.76* 0.76* NA 0.22 -0.24 0.3
prop_articles 0 0 0.61* 0.3 0.26 0.3 0.22 NA 0.62* 0.92*
productivity -0.5 -0.48 -0.03 -0.08 -0.09 -0.01 -0.24 0.62* NA 0.17
prop_requests 0.14 0.19 0.7* 0.44* 0.4* 0.43* 0.3 0.92* 0.17 NA

Data sources:

  1. Wikipedia speakers per article table, 2015
  2. Wikipedia productivity table, 2005
  3. Pageview data, page views for 24 hour period in 2016 (August 1)
  4. L1 speaker data, Amano et al, 2014, derived from Ethnologue, 2009

Variables: