require(xml2, quietly = TRUE)
require(tidyverse, quietly = TRUE)
require(lubridate, quietly = TRUE)
require(tidytext, quietly = TRUE)
require(stringr, quietly = TRUE)
setwd("C:/Users/Em/Documents/2019/appropedia/data")
pages <- read_xml("pages_clean.xml")
pages_list <- as_list(pages)
# Flatten each <page> node into a fixed-length character vector
# (one column per page for now; transposed below).
pages_tibble <- as_tibble(
  sapply(pages_list[[1]], function(x) {
    unlist(c(
      if (is.list(x$title)) x$title else NA,
      if (is.list(x$id)) x$id else NA,
      if (is.list(x$revision$id)) x$revision$id else NA,
      if (is.list(x$revision$parentid)) x$revision$parentid else NA,
      if (is.list(x$revision$timestamp)) x$revision$timestamp else NA,
      if (is.list(x$revision$contributor$username)) x$revision$contributor$username else NA,
      if (is.list(x$revision$text) && length(x$revision$text) > 0) as.character(x$revision$text) else NA,
      if (is.list(x$revision$minor)) TRUE else FALSE
    ))
  }), .name_repair = "minimal")
# https://stackoverflow.com/questions/42790219/how-do-i-transpose-a-tibble-in-r
pages_trans <- as_tibble(t(pages_tibble), .name_repair = "minimal")
tibble_names <- c("title",
"id",
"revision_id",
"parent_id",
"timestamp",
"username",
"text",
"is_minor"
)
colnames(pages_trans) <- tibble_names
pages_trans <- pages_trans %>%
  mutate(length = nchar(text),
         word_count = str_count(text, '\\w+'))
write.csv(pages_trans, "pages.csv", row.names = FALSE)
all_pages <- read.csv(file = "pages.csv", header = TRUE, stringsAsFactors = FALSE)
all_pages <- as_tibble(all_pages)
all_pages$timestamp <- as_date(all_pages$timestamp)
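# A sketch of an alternative: readr::read_csv() keeps column types across the
# roundtrip and parses ISO 8601 timestamps automatically (as_date() would
# still be needed if plain dates rather than datetimes are wanted):
# all_pages <- read_csv("pages.csv") %>% mutate(timestamp = as_date(timestamp))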
# https://stackoverflow.com/questions/33221425/how-do-i-group-my-date-variable-into-month-year-in-r
by_period <- all_pages %>%
  group_by(period = floor_date(timestamp, "14 days")) %>%  # 14-day bins, not calendar months
  summarize(n = n())
by_period %>%
  ggplot(aes(x = period, y = n, color = n)) +
  geom_point() +
  ggtitle("Number of 'last' edits over time") +
  xlab("Date") +
  ylab("Edits") +
  scale_color_gradient(low = "#2ca25f", high = "#2ca25f") +
  theme_minimal()

User pages and namespaces
user_pages <- all_pages %>% filter(grepl("User:", title))
# There are about 3,700 user pages.
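# Quick check of that figure:
nrow(user_pages)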
title_types <- all_pages %>%
  select(title) %>%
  filter(grepl("(.*?):.*", title)) %>%  # titles with a namespace-style prefix
  mutate(short_title = sub("(.*?):.*", "\\1", title)) %>%
  select(short_title) %>%
  distinct()
title_types
# Surprise: there are 346 distinct prefixes... that's roughly one for every 20 articles!
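# A sketch backing that estimate: tally pages per prefix. "Namespace" here
# just means whatever precedes the first colon, so titles that merely
# contain a colon are over-counted.
all_pages %>%
  filter(grepl("(.*?):.*", title)) %>%
  mutate(short_title = sub("(.*?):.*", "\\1", title)) %>%
  count(short_title, sort = TRUE) %>%
  head(10)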
all_pages %>%
  select(title) %>%
  mutate(has_prefix = grepl(":", title)) %>%
  group_by(has_prefix) %>%
  summarize(count = n()) %>%
  mutate(per = paste0(round(100 * count / sum(count), 2), '%'))
# Percentage of prefixed pages: around 24% of all content on Appropedia.
# Note: I've found that not all of these prefixes are true namespaces; I will
# review more thoroughly.
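# A sketch of a stricter check: keep only prefixes from a hand-picked
# whitelist of common MediaWiki namespaces. The whitelist is an assumption,
# not Appropedia's actual namespace configuration.
known_ns <- c("User", "User talk", "Talk", "Category", "Category talk",
              "Template", "File", "Help", "Portal", "Appropedia")
all_pages %>%
  mutate(prefix = sub("(.*?):.*", "\\1", title)) %>%
  filter(prefix %in% known_ns) %>%
  count(prefix, sort = TRUE)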
named_users <- all_pages %>%
  # filter(!(grepl("(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", username))) %>%
  # I was filtering out anonymous (IP-address) users, but I'll leave them in for now.
  group_by(username) %>%
  summarize(n = n()) %>%
  arrange(n)
named_users %>%
  group_by(n) %>%
  summarize(count = n()) %>%
  ggplot(aes(x = count, y = n, color = count)) +
  geom_point() +
  geom_line() +
  ggtitle("Number of 'last' edits by individual users of Appropedia") +
  xlab("Total users") +
  ylab("Edits") +
  scale_color_gradient(low = "#2ca25f", high = "#2ca25f") +
  theme_minimal()

# Keep in mind that this counts only the last edit on each page, not every edit.
named_users %>%
  group_by(n) %>%
  summarize(count = n()) %>%
  ggplot(aes(x = count, y = n, color = count)) +
  geom_point() +
  geom_line() +
  ggtitle("Number of 'last' edits by individual users of Appropedia (Log10 scale)") +
  xlab("Total users") +
  ylab("Edits") +
  scale_y_continuous(trans = 'log10') +
  scale_color_gradient(low = "#2ca25f", high = "#2ca25f") +
  theme_minimal()

# Transformed to a log scale so it is easier to visualize.
Length of pages by word count
lengths <- all_pages %>%
  arrange(word_count) %>%
  select(id, title, word_count) %>%
  mutate(title = str_trunc(title, 50, "right")) %>%
  drop_na()
lengths %>% summary()
##        id            title            word_count
##  Min.   : 1284   Length:17455       Min.   :    0.0
##  1st Qu.:18936   Class :character   1st Qu.:   10.0
##  Median :33845   Mode  :character   Median :  158.0
##  Mean   :36534                      Mean   :  627.8
##  3rd Qu.:52088                      3rd Qu.:  556.0
##  Max.   :80871                      Max.   :82283.0
# A view of how page length is distributed.
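# A sketch of the distribution itself: a histogram on a log10 x-axis, since
# word counts span five orders of magnitude (zero-word pages are dropped
# before taking logs).
lengths %>%
  filter(word_count > 0) %>%
  ggplot(aes(x = word_count)) +
  geom_histogram(bins = 50, fill = "#2ca25f") +
  scale_x_log10() +
  xlab("Word count") +
  ylab("Pages") +
  theme_minimal()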
# Pages over 10k words
lengths_over10k <- lengths %>%
  filter(word_count > 10000) %>%
  arrange(-word_count)
lengths_over10k
# Longest articles
all_pages %>%
  filter(str_detect(title, "User:", negate = TRUE),
         str_detect(title, "Appropedia:", negate = TRUE)) %>%
  select(title, word_count) %>%
  mutate(title = str_trunc(title, 50, "right")) %>%
  arrange(-word_count) %>%
  head(50)
N-grams
all_pages_bigrams <- all_pages %>%
  unnest_tokens(bigram, title, token = "ngrams", n = 2)
all_pages_bigrams %>%
  select(id, bigram) %>%
  group_by(bigram) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  head(50)
# Most common two-word combinations.
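# Many of the top bigrams are glue words. A common tidytext refinement
# (a sketch, not what was run above) is to split each bigram and drop
# pairs where either half is a stop word:
all_pages_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE) %>%
  head(50)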
all_pages %>%
  unnest_tokens(ngram, title, token = "ngrams", n = 3) %>%
  select(id, ngram) %>%
  group_by(ngram) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  head(50)
# Most common three-word combinations.