Wikipedia stats

# Input file locations, each built with here().
PAGEVIEWS_PATH <- here(
  "analyses/10_wikipedia_stats/data/processed/page_views_by_language.csv"
)
# NOTE: the spelling of this name is kept as-is because the loading code
# further down refers to it under exactly this name.
PRODUCITIVY_PATH <- here(
  "analyses/10_wikipedia_stats/data/raw/wiki_productivity.csv"
)
NARTICLES_PATH <- here(
  "analyses/10_wikipedia_stats/data/raw/num_articles.tsv"
)
LANG_NAMES <- here(
  "analyses/04_predicting_semantic_sim/data/lang_distance_metrics/linguistic/data/iso_to_wals_for_ling_dists.csv"
)

# Wiki codes of the language editions kept in the analysis; used below to
# filter the article statistics.
WIKI_LANGS <- c(
  "ar", "bn", "bg", "zh", "nl", "en", "fr", "de", "el", "gu", "hi", "ig",
  "id", "it", "ja", "kn", "ko", "ml", "mr", "ne", "pa", "pl", "pt", "ro",
  "ru", "es", "tl", "ta", "te", "th", "tr", "ur", "vi", "yo", "fa"
)


# Per-wiki article statistics: normalise the column names, keep the columns
# of interest, turn the comma-formatted speaker counts into numbers, and
# restrict to the language editions listed in WIKI_LANGS.
narticles <- NARTICLES_PATH %>%
  read_tsv() %>%
  clean_names() %>%
  select(
    language, wiki, articles, speakers, admins, active_users,
    percent_of_admins, depth, speakers_article
  ) %>%
  mutate(
    speakers = speakers %>%
      str_remove_all(",") %>%
      as.numeric()
  ) %>%
  filter(wiki %in% WIKI_LANGS)

# Productivity table: only the 2nd and 3rd columns are kept (by position).
# The join below is done on `wiki`, so one of them must be the wiki code;
# presumably the other is the productivity score -- confirm against the file.
productivity <- PRODUCITIVY_PATH %>%
  read_csv() %>%
  select(2:3)

# Page-view data, read as-is.
page_views <- PAGEVIEWS_PATH %>%
  read_csv()

# Combine the three sources into one table keyed on the wiki code.
# Rows without an `iso` value are dropped (the column is supplied by one of
# the joined tables -- presumably the page-view data; confirm), then the
# helper columns are removed and the article ratio is added.
all_measures <- list(narticles, productivity, page_views) %>%
  reduce(full_join, by = "wiki") %>%
  filter(!is.na(iso)) %>%
  select(-iso, -speakers_article, -percent_of_admins) %>%
  mutate(prop_articles = log(articles) / log(speakers_l1))

# Correlation test between two columns of a data frame.
#
# Arguments:
#   m1, m2  column names (single strings) of the two measures to correlate
#   df      data frame / tibble containing both columns
#
# Returns a one-row tibble with the two measure names, the correlation
# estimate from cor.test() (default method) and its p-value.
get_simple_correlation <- function(m1, m2, df) {
  # `[[` pulls each column out as a plain vector, for data frames and
  # tibbles alike, so no unlist() round-trip is needed.
  cor_model <- cor.test(df[[m1]], df[[m2]])

  tibble(
    measure1 = m1,
    measure2 = m2,
    simple_cor = cor_model$estimate,
    simple_p = cor_model$p.value
  )
}

# Names of the measure columns, taken by position (columns 3 to 13 of
# all_measures). If all_measures has fewer than 13 columns, the out-of-range
# positions come back as NA names; drop them explicitly rather than relying
# on the `!=` filter below to discard NA rows as a side effect.
measure_names <- names(all_measures)[3:13]
measure_names <- measure_names[!is.na(measure_names)]

# Every ordered pair of distinct measures. expand.grid() varies its first
# argument fastest, and stringsAsFactors = FALSE keeps the names as character,
# so no later conversion is needed.
measures <- expand.grid(measure1 = measure_names,
                        measure2 = measure_names,
                        KEEP.OUT.ATTRS = FALSE,
                        stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  filter(measure1 != measure2)

# Correlate every pair of measures, then build the display text: the
# correlation rounded to two decimals, with a trailing "*" when p < .05.
all_corrs_table <- map2_dfr(
  .x = measures$measure1,
  .y = measures$measure2,
  .f = get_simple_correlation,
  df = all_measures
) %>%
  mutate(
    simple_cor = round(simple_cor, 2),
    sig = simple_p < 0.05,
    cor_text = case_when(
      sig ~ paste0(simple_cor, "*"),
      TRUE ~ as.character(simple_cor)
    )
  )

# Wide correlation matrix for display: one row per measure1, one column per
# measure2, each cell holding the formatted correlation text (NA where a pair
# was not computed, e.g. a measure with itself). Rows and columns are put in
# the same hand-picked order.
# NOTE: this object shares its name with base::table(); the name is kept so
# that anything referring to it keeps working.
table <- all_corrs_table %>%
  select(measure1, measure2, cor_text) %>%
  pivot_wider(names_from = measure2, values_from = cor_text) %>%
  mutate(
    measure1 = fct_relevel(
      measure1,
      "speakers", "speakers_l1", "articles", "active_users",
      "total_requests", "admins", "depth", "prop_articles",
      "productivity", "prop_requests"
    )
  ) %>%
  select(
    measure1,
    speakers, speakers_l1, articles, active_users,
    total_requests, admins, depth, prop_articles,
    productivity, prop_requests
  ) %>%
  arrange(measure1)

kable(table)

measure1	speakers	speakers_l1	articles	active_users	total_requests	admins	depth	prop_articles	productivity	prop_requests
speakers	NA	0.98*	0.36*	0.36*	0.33	0.33	0.35*	0	-0.5	0.14
speakers_l1	0.98*	NA	0.46*	0.48*	0.46*	0.45*	0.45*	0	-0.48	0.19
articles	0.36*	0.46*	NA	0.9*	0.87*	0.89*	0.69*	0.61*	-0.03	0.7*
active_users	0.36*	0.48*	0.9*	NA	0.99*	0.99*	0.77*	0.3	-0.08	0.44*
total_requests	0.33	0.46*	0.87*	0.99*	NA	0.98*	0.76*	0.26	-0.09	0.4*
admins	0.33	0.45*	0.89*	0.99*	0.98*	NA	0.76*	0.3	-0.01	0.43*
depth	0.35*	0.45*	0.69*	0.77*	0.76*	0.76*	NA	0.22	-0.24	0.3
prop_articles	0	0	0.61*	0.3	0.26	0.3	0.22	NA	0.62*	0.92*
productivity	-0.5	-0.48	-0.03	-0.08	-0.09	-0.01	-0.24	0.62*	NA	0.17
prop_requests	0.14	0.19	0.7*	0.44*	0.4*	0.43*	0.3	0.92*	0.17	NA

Data sources:

1. Wikipedia speakers per article table, 2015
2. Wikipedia productivity table, 2005
3. Pageview data, page views for a 24-hour period (August 1, 2016)
4. L1 speaker data, Amano et al., 2014, derived from Ethnologue, 2009

Variables:

speakers - number of L1 and L2 speakers (2), from Ethnologue
speakers_l1 - total number of L1 speakers (4)
articles - number of articles (1)
active_users - registered users who have performed an action in the last 30 days (1)
total_requests - number of desktop page views by language for a single day (3)
admins - number of administrators (1)
depth - measure of collaborative quality, indexing frequency that articles are updated (1)
prop_articles - log number of articles / log speakers l1 (1, 4)
productivity - number of articles in a language compared to the number of potential authors. To be a potential author, at least two requirements must be fulfilled: command of language, and access to internet (2)
prop_requests - log total requests / log speakers l1 (3,4)

Wikipedia stats

Molly Lewis

2020-04-28