# Input file locations, built with here().
# Processed page-view table (read with read_csv below).
PAGEVIEWS_PATH <- here("analyses/10_wikipedia_stats/data/processed/page_views_by_language.csv")
# Raw productivity table (read with read_csv below).
# NOTE(review): the constant name is misspelled ("PRODUCITIVY"); it is kept
# as-is because it is referenced by name where the file is read.
PRODUCITIVY_PATH <- here("analyses/10_wikipedia_stats/data/raw/wiki_productivity.csv")
# Raw article-count table, tab-separated (read with read_tsv below).
NARTICLES_PATH <- here("analyses/10_wikipedia_stats/data/raw/num_articles.tsv")
# NOTE(review): LANG_NAMES is not used in the visible part of this script --
# confirm whether it is needed further down.
LANG_NAMES <- here('analyses/04_predicting_semantic_sim/data/lang_distance_metrics/linguistic/data/iso_to_wals_for_ling_dists.csv')
# Codes matched against the `wiki` column when filtering the article-count
# table; rows whose `wiki` value is not listed here are dropped.
WIKI_LANGS <- c("ar", "bn", "bg", "zh", "nl", "en", "fr", "de", "el", "gu", "hi", "ig", "id", "it", "ja", "kn", "ko",
"ml", "mr", "ne", "pa", "pl", "pt", "ro", "ru", "es", "tl", "ta", "te", "th", "tr", "ur", "vi", "yo",
"fa")
# Article-count table: normalise the column names, keep the columns of
# interest, turn the comma-grouped `speakers` strings into numbers, and keep
# only the rows whose `wiki` code is listed in WIKI_LANGS.
narticles <- NARTICLES_PATH %>%
  read_tsv() %>%
  clean_names() %>%
  select(
    language, wiki, articles, speakers, admins, active_users,
    percent_of_admins, depth, speakers_article
  ) %>%
  mutate(
    # Strip every "," before the numeric conversion.
    speakers = speakers %>%
      str_remove_all(",") %>%
      as.numeric()
  ) %>%
  filter(is.element(wiki, WIKI_LANGS))
# Productivity table: only the 2nd and 3rd columns of the CSV are kept.
# NOTE(review): the selection is positional; the later join is done on
# `wiki`, so one of these two columns has to be named `wiki` -- confirm
# against the file if its layout ever changes.
productivity <- PRODUCITIVY_PATH %>%
  read_csv() %>%
  select(2:3)

# Page-view table, used as read.
page_views <- PAGEVIEWS_PATH %>%
  read_csv()
# Combine the three tables on `wiki`, keeping every row from each side.
# `iso` is not one of the columns selected into `narticles`, so it is
# supplied by one of the joined tables; rows without it are discarded and the
# helper columns are then removed.
all_measures <- full_join(narticles, productivity, by = "wiki") %>%
  full_join(page_views, by = "wiki") %>%
  filter(!is.na(iso)) %>%
  select(-iso, -speakers_article, -percent_of_admins) %>%
  # Ratio of log article count to log `speakers_l1`.
  mutate(prop_articles = log(articles) / log(speakers_l1))
#' Correlation test between two columns of a data frame.
#'
#' Runs `cor.test()` with its default settings on the two named columns and
#' returns the estimate and p-value as a one-row table.
#'
#' @param m1 Name of the first column (single string).
#' @param m2 Name of the second column (single string).
#' @param df Data frame or tibble that contains both columns.
#' @return A one-row tibble with columns `measure1`, `measure2`,
#'   `simple_cor` (correlation estimate) and `simple_p` (p-value).
get_simple_correlation <- function(m1, m2, df) {
  # Fail early with a readable message instead of a subsetting error.
  missing_cols <- setdiff(c(m1, m2), names(df))
  if (length(missing_cols) > 0) {
    stop("Column(s) not found in `df`: ",
         paste(missing_cols, collapse = ", "),
         call. = FALSE)
  }

  # `[[` returns the column as a plain vector for both tibbles and base
  # data frames, so no unlist() round trip is needed.
  cor_model <- cor.test(df[[m1]], df[[m2]])

  tibble(measure1 = m1,
         measure2 = m2,
         # cor.test() names its estimate ("cor"); drop the name so the column
         # is a plain numeric vector.
         simple_cor = unname(cor_model$estimate),
         simple_p = cor_model$p.value)
}
# Every ordered pair of distinct measure names (self-pairs are removed).
# NOTE(review): the measures are picked by position (columns 3 to 13 of
# all_measures), so this depends on the column order produced upstream.
measures <- local({
  measure_names <- names(all_measures)[3:13]
  all_pairs <- cross_df(data.frame(measure1 = measure_names,
                                   measure2 = measure_names))
  all_pairs <- mutate_all(all_pairs, as.character)
  filter(all_pairs, measure1 != measure2)
})
# Correlate every measure pair, then build the display label: the estimate
# rounded to two decimals, with a trailing "*" when p < .05.
all_corrs_table <- map2_df(.x = measures$measure1,
                           .y = measures$measure2,
                           .f = get_simple_correlation,
                           df = all_measures) %>%
  mutate(sig = simple_p < .05,
         simple_cor = round(simple_cor, 2)) %>%
  mutate(
    # A missing `sig` is treated as not significant (no star).
    cor_text = if_else(!is.na(sig) & sig,
                       paste0(simple_cor, "*"),
                       as.character(simple_cor))
  )
# Pivot the labelled correlations into a wide layout (one row per measure1,
# one column per measure2), fix the column and row order, and print it.
# NOTE(review): the name `table` masks base::table() from here on.
table <- all_corrs_table %>%
  select(measure1, measure2, cor_text) %>%
  spread(key = measure2, value = cor_text) %>%
  # Column order of the printed table.
  select("measure1", "speakers", "speakers_l1", "articles",
         "active_users", "total_requests", "admins", "depth",
         "prop_articles", "productivity", "prop_requests") %>%
  # Same order for the rows, applied through the factor levels.
  mutate(
    measure1 = fct_relevel(measure1, "speakers", "speakers_l1", "articles",
                           "active_users", "total_requests", "admins",
                           "depth", "prop_articles", "productivity",
                           "prop_requests")
  ) %>%
  arrange(measure1)

kable(table)
measure1 | speakers | speakers_l1 | articles | active_users | total_requests | admins | depth | prop_articles | productivity | prop_requests |
---|---|---|---|---|---|---|---|---|---|---|
speakers | NA | 0.98* | 0.36* | 0.36* | 0.33 | 0.33 | 0.35* | 0 | -0.5 | 0.14 |
speakers_l1 | 0.98* | NA | 0.46* | 0.48* | 0.46* | 0.45* | 0.45* | 0 | -0.48 | 0.19 |
articles | 0.36* | 0.46* | NA | 0.9* | 0.87* | 0.89* | 0.69* | 0.61* | -0.03 | 0.7* |
active_users | 0.36* | 0.48* | 0.9* | NA | 0.99* | 0.99* | 0.77* | 0.3 | -0.08 | 0.44* |
total_requests | 0.33 | 0.46* | 0.87* | 0.99* | NA | 0.98* | 0.76* | 0.26 | -0.09 | 0.4* |
admins | 0.33 | 0.45* | 0.89* | 0.99* | 0.98* | NA | 0.76* | 0.3 | -0.01 | 0.43* |
depth | 0.35* | 0.45* | 0.69* | 0.77* | 0.76* | 0.76* | NA | 0.22 | -0.24 | 0.3 |
prop_articles | 0 | 0 | 0.61* | 0.3 | 0.26 | 0.3 | 0.22 | NA | 0.62* | 0.92* |
productivity | -0.5 | -0.48 | -0.03 | -0.08 | -0.09 | -0.01 | -0.24 | 0.62* | NA | 0.17 |
prop_requests | 0.14 | 0.19 | 0.7* | 0.44* | 0.4* | 0.43* | 0.3 | 0.92* | 0.17 | NA |
Data sources:
Variables: