library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rvest)
library(magrittr)
library(stringr)
library(CCA)
## Loading required package: fda
## Loading required package: splines
## Loading required package: Matrix
##
## Attaching package: 'fda'
##
## The following object is masked from 'package:graphics':
##
## matplot
##
## Loading required package: fields
## Loading required package: spam
## Loading required package: grid
## Spam version 1.0-1 (2014-09-09) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
##
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
##
## Loading required package: maps
library(ggplot2)
data_html <- html("https://theintercept.co/condolences/")
data_html%>%
html_nodes("#main .cash")%>%
html_text()%>%
gsub("[\\$,]", "", .)%>%
as.numeric() -> payouts
data_html%>%
html_nodes("#main .light")%>%
html_children()%>%
lapply(.%>%use_series("text")%>%html_text())%>%
unlist() -> descrips
data_html%>%
html_nodes("#main .light .province")%>%
html_text() -> province
dat_df <- data_frame(pay = payouts,
descrip = descrips,
province = province)
dat_df %<>%
mutate(province = gsub("Province|Village", "", province) %>% gsub("\\s+$", "", .))
dat_df %>%
group_by(province)%>%
summarise(n = n(),
max = max(pay),
total = sum(pay))->totals
totals %>%
ggplot(aes(max))+
geom_bar()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

totals %>%
ggplot(aes(log2(n), log2(total/n)))+
geom_point()

word_bag <- dat_df %>%
use_series("descrip")%>%
tolower()%>%
gsub("\\s+$|^\\s+", "", .)%>%
gsub("\\s+"," ", .)%>%
str_split(pattern = "\\s+")%>%
unlist()%>%
gsub("&", "ampsand", .)%>%
gsub("#", "hastag", .)%>%
gsub("\\W", "", .)%>%
unique()%>%
extract(.!="")
word_mat <- matrix(0, nrow = nrow(dat_df), ncol = length(word_bag))
colnames(word_mat) <- word_bag
for(i in 1:nrow(dat_df)){
word_vec <- dat_df$descrip[i] %>%
tolower() %>%
gsub("\\s+$|^\\s+", "", .)%>%
gsub("\\s+"," ", .)%>%
str_split(pattern = " ")%>%
unlist()%>%
gsub("&", "ampsand", .)%>%
gsub("#", "hastag", .)%>%
gsub("\\W", "", .)%>%
unique()%>%
extract(.!="")
word_mat[i, word_vec] <- 1
}
pca <- princomp(word_mat)
first_three <- pca$loadings[,1:3]
first_three %<>%
data.frame()%>%
mutate(word = rownames(.))
first_three %>%
ggplot(aes(Comp.1, Comp.2))+
geom_text(aes(label = word))

totals <- colSums(word_mat)
Y <- cbind(dat_df$pay)
X <- word_mat[,totals>1]
mod <- lm(log1p(Y) ~ X)
coefs <- mod %>% coef %>% extract(!is.na(.))%>%sort()
data_frame(value = coefs,
word = names(coefs)%>%gsub("^X", "", .))%>%
mutate(word = factor(word)%>%reorder(value, mean))->effects
effects %>%
filter(value > quantile(value, 0.975) | value < quantile(value, 0.025))%>%
ggplot(aes(word, value))+
geom_point()
