require(xml2, quietly = TRUE)
require(tidyverse, quietly = TRUE)
require(lubridate, quietly = TRUE)
require(tidytext, quietly = TRUE)
require(stringr, quietly = TRUE)

setwd("C:/Users/Em/Documents/2019/appropedia/data")
pages <- read_xml("pages_clean.xml")
pages_list <- as_list(pages)
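# Optional sanity check (a sketch, not part of the original pipeline): inspect the
# structure of one child of the root node to confirm it contains the fields
# extracted below. Depending on how pages_clean.xml was prepared, the first child
# may be a <siteinfo> element rather than a <page>, which is one reason the
# extraction below falls back to NA for missing fields.
str(pages_list[[1]][[1]], max.level = 3)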

# Flatten each <page> element into a character vector of the fields we need.
# Missing elements (e.g. pages without a parentid or contributor username) become NA.
# sapply() returns a character matrix with one *column* per page, which is why a
# transpose follows.
pages_tibble <- as_tibble(
    sapply(pages_list[[1]], function(x){
        unlist(c(
            if(is.list(x$title)) x$title else NA, 
            if(is.list(x$id)) x$id else NA, 
            if(is.list(x$revision$id)) x$revision$id else NA, 
            if(is.list(x$revision$parentid)) x$revision$parentid else NA, 
            if(is.list(x$revision$timestamp)) x$revision$timestamp else NA,
            if(is.list(x$revision$contributor$username)) x$revision$contributor$username else NA, 
            if(is.list(x$revision$text) && length(x$revision$text) > 0) as.character(x$revision$text) else NA,
            if(is.list(x$revision$minor)) TRUE else FALSE  # <minor/> is present only on minor edits
        ))}), .name_repair = "minimal")


# Transpose so that each row is a page and each column a field
# https://stackoverflow.com/questions/42790219/how-do-i-transpose-a-tibble-in-r
pages_trans <- as_tibble(t(pages_tibble), .name_repair = "minimal")
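# An alternative sketch using purrr, which builds one row per page directly and
# avoids the transpose step. It assumes the same export structure as above;
# pages_trans_alt is just an illustrative name and is not used in the rest of
# this analysis.
pages_trans_alt <- map_dfr(pages_list[[1]], function(x) {
  tibble(
    title       = if (is.list(x$title)) x$title[[1]] else NA_character_,
    id          = if (is.list(x$id)) x$id[[1]] else NA_character_,
    revision_id = if (is.list(x$revision$id)) x$revision$id[[1]] else NA_character_,
    parent_id   = if (is.list(x$revision$parentid)) x$revision$parentid[[1]] else NA_character_,
    timestamp   = if (is.list(x$revision$timestamp)) x$revision$timestamp[[1]] else NA_character_,
    username    = if (is.list(x$revision$contributor$username)) x$revision$contributor$username[[1]] else NA_character_,
    text        = if (is.list(x$revision$text) && length(x$revision$text) > 0) as.character(x$revision$text[[1]]) else NA_character_,
    is_minor    = is.list(x$revision$minor)  # kept as a logical here
  )
})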

tibble_names <- c("title", 
           "id", 
           "revision_id", 
           "parent_id", 
           "timestamp", 
           "username", 
           "text",
           "is_minor"
           )
colnames(pages_trans) <- tibble_names

# Add page length in characters and an approximate word count (runs of word characters)
pages_trans <- pages_trans %>% 
  mutate(length = nchar(text), 
         word_count = str_count(text, '\\w+'))
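# Illustration of the word-count heuristic on a made-up string (sketch only):
# '\\w+' counts runs of word characters, so wiki markup such as "[[Category:Water]]"
# also contributes to the count.
str_count("Rainwater harvesting [[Category:Water]]", '\\w+')  # 4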

write.csv(pages_trans, "pages.csv")
all_pages <- read.csv(file = "pages.csv", header = TRUE, stringsAsFactors = FALSE)
all_pages <- as_tibble(all_pages)

all_pages$timestamp <- as_date(all_pages$timestamp)
# https://stackoverflow.com/questions/33221425/how-do-i-group-my-date-variable-into-month-year-in-r
by_period <- all_pages %>% 
  group_by(period = floor_date(timestamp, "14 days")) %>% 
  summarize(n = n())
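# For comparison (sketch only): the Stack Overflow answer above groups by calendar
# month; that version would look like this instead of the 14-day bins. by_month is
# just an illustrative name and is not used below.
by_month <- all_pages %>% 
  group_by(month = floor_date(timestamp, "month")) %>% 
  summarize(n = n())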

by_period %>%
  ggplot(aes(x=period, y=n, color=n)) + 
  geom_point() + 
  ggtitle("Number of 'last' edits over time") + 
  xlab("Date") + 
  ylab("Edits") + 
  scale_color_gradient(low = "#2ca25f", high= "#2ca25f") + theme_minimal()

User pages and namespaces

user_pages <- all_pages %>% filter(grepl("User:", title))
# there are 3700 user pages.

title_types <- all_pages %>% 
  select(title) %>% 
  filter(grepl("(.*?):.*", title)) %>% 
  mutate(short_title = sub("(.*?):.*", "\\1", title)) %>% 
  select(short_title) %>% 
  unique()

title_types
# Surprise: there are 346 different namespaces... that is one for every 20 articles!
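# How the prefix regex behaves on a few illustrative (made-up) titles: any title
# containing a colon is captured, not only true namespaces, which connects with the
# note after the percentage calculation below.
sub("(.*?):.*", "\\1", c("User:Example", "Category:Water", "How to: build a stove"))
# "User"  "Category"  "How to"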

all_pages %>% 
  select(title) %>% 
  mutate(has_prefix = grepl(":", title)) %>% 
  group_by(has_prefix) %>% 
  summarize(count = n()) %>% 
  mutate(per = paste0(round(100*count/sum(count), 2), '%'))
# Share of pages whose title has a prefix: around 24% of all content on Appropedia.
# Note: not all of these prefixes are actual namespaces; I will review this more thoroughly.

# Edits per username. Only the last edit on each page appears in this dump,
# so these are counts of "last" edits, not total contributions.
named_users <- all_pages %>% 
  # filter(!(grepl("(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", username))) %>% 
  # This filter would exclude anonymous (IP-address) users; I am leaving them in for now.
  group_by(username) %>% 
  summarize(n = n()) %>% 
  arrange(n)

named_users %>% 
  group_by(n) %>% 
  summarize(count=n()) %>% 
  ggplot(aes(x=n, y=count, color=count)) +
    geom_point() + 
    geom_line() +
    ggtitle("Number of 'last' edits by individual users of Appropedia") + 
    xlab("Edits") + 
    ylab("Total users") + 
    scale_color_gradient(low = "#2ca25f", high= "#2ca25f") +
    theme_minimal()

# Keep in mind that this counts only the last edit on each page, not every edit.


named_users %>% 
  group_by(n) %>% 
  summarize(count=n()) %>% 
  ggplot(aes(x=n, y=count, color=count)) +
    geom_point() + 
    geom_line() +
    ggtitle("Number of 'last' edits by individual users of Appropedia (Log10 scale)") + 
    xlab("Edits") + 
    ylab("Total users") +
    scale_y_continuous(trans = 'log10') + 
    scale_color_gradient(low = "#2ca25f", high= "#2ca25f") +
    theme_minimal()

# The y-axis is log10-transformed to make the distribution easier to visualize.

Length of pages by word count

lengths <- all_pages %>% 
  arrange(word_count) %>% 
  select(id, title, word_count) %>% 
  mutate(title = str_trunc(title, 50, "right")) %>% 
  drop_na()
lengths %>% summary()
##        id           title             word_count     
##  Min.   : 1284   Length:17455       Min.   :    0.0  
##  1st Qu.:18936   Class :character   1st Qu.:   10.0  
##  Median :33845   Mode  :character   Median :  158.0  
##  Mean   :36534                      Mean   :  627.8  
##  3rd Qu.:52088                      3rd Qu.:  556.0  
##  Max.   :80871                      Max.   :82283.0
# A view of how page length is distributed.

# Pages over 10k words
lengths_over10k <- lengths %>% 
  filter(word_count > 10000) %>% 
  arrange(-word_count)
lengths_over10k
# Longest articles
all_pages %>% 
  filter(str_detect(title, "User:", negate = TRUE), str_detect(title, "Appropedia:", negate = TRUE)) %>% 
  select(title, word_count) %>% 
  mutate(title = str_trunc(title, 50, "right")) %>% 
  arrange(-word_count) %>% 
  head(50)

N-grams

all_pages_bigrams <- all_pages %>%
  unnest_tokens(bigram, title, token = "ngrams", n = 2)

all_pages_bigrams %>% 
  select(id, bigram) %>% 
  group_by(bigram) %>% 
  summarize(n = n()) %>% 
  arrange(desc(n)) %>% 
  head(50)
# Most common two-word combinations.
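# What unnest_tokens() produces for a single title (made-up example): overlapping,
# lower-cased two-word windows.
tibble(id = 1, title = "Rainwater harvesting in arid climates") %>%
  unnest_tokens(bigram, title, token = "ngrams", n = 2)
# bigrams: "rainwater harvesting", "harvesting in", "in arid", "arid climates"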

all_pages %>%
  unnest_tokens(ngram, title, token = "ngrams", n = 3) %>% 
  select(id, ngram) %>% 
  group_by(ngram) %>% 
  summarize(n = n()) %>% 
  arrange(desc(n)) %>% 
  head(50)
# Most common three-word combinations.
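# Titles with fewer than three words may yield NA tokens from the n-gram tokenizer,
# so a variant that drops them first could look like this (sketch, assuming NAs do
# appear in the output above):
all_pages %>%
  unnest_tokens(ngram, title, token = "ngrams", n = 3) %>% 
  filter(!is.na(ngram)) %>% 
  count(ngram, sort = TRUE) %>% 
  head(50)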