library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
  library(rvest)
  library(magrittr)
  library(stringr)
  library(CCA)
## Loading required package: fda
## Loading required package: splines
## Loading required package: Matrix
## 
## Attaching package: 'fda'
## 
## The following object is masked from 'package:graphics':
## 
##     matplot
## 
## Loading required package: fields
## Loading required package: spam
## Loading required package: grid
## Spam version 1.0-1 (2014-09-09) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## 
## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve
## 
## Loading required package: maps
  library(ggplot2)
  data_html <- html("https://theintercept.co/condolences/")
  data_html%>%
    html_nodes("#main .cash")%>%
    html_text()%>%
    gsub("[\\$,]", "", .)%>%
    as.numeric() -> payouts

  data_html%>%
    html_nodes("#main .light")%>%
    html_children()%>%
    lapply(.%>%use_series("text")%>%html_text())%>%
    unlist() -> descrips


  data_html%>%
    html_nodes("#main .light .province")%>%
    html_text() -> province
  dat_df <- data_frame(pay = payouts,
                       descrip = descrips,
                       province = province)  

  dat_df %<>%
    mutate(province = gsub("Province|Village", "", province) %>% gsub("\\s+$", "", .))
  dat_df %>%
    group_by(province)%>%
    summarise(n = n(),
              max = max(pay),
              total = sum(pay))->totals

  totals %>%
  ggplot(aes(max))+
    geom_bar()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-4

  totals %>%
    ggplot(aes(log2(n), log2(total/n)))+
      geom_point()

plot of chunk unnamed-chunk-4

  word_bag <- dat_df %>%
                use_series("descrip")%>%
                tolower()%>%
                gsub("\\s+$|^\\s+", "", .)%>%
                gsub("\\s+"," ", .)%>%
                str_split(pattern = "\\s+")%>%
                unlist()%>%
                gsub("&", "ampsand", .)%>%
                gsub("#", "hastag", .)%>%        
                gsub("\\W", "", .)%>%
                unique()%>%
                extract(.!="")

  word_mat <- matrix(0, nrow = nrow(dat_df), ncol = length(word_bag))
  colnames(word_mat) <- word_bag
  for(i in 1:nrow(dat_df)){
    word_vec <- dat_df$descrip[i] %>% 
                tolower() %>% 
                gsub("\\s+$|^\\s+", "", .)%>%
                gsub("\\s+"," ", .)%>%
                str_split(pattern = " ")%>%
                unlist()%>%
                gsub("&", "ampsand", .)%>%
                gsub("#", "hastag", .)%>%      
                gsub("\\W", "", .)%>%
                unique()%>%
                extract(.!="")
    word_mat[i, word_vec] <- 1
  }
  pca <- princomp(word_mat)
  first_three <- pca$loadings[,1:3]
  first_three %<>% 
    data.frame()%>%
    mutate(word = rownames(.))

  first_three %>%
    ggplot(aes(Comp.1, Comp.2))+
    geom_text(aes(label = word))

plot of chunk unnamed-chunk-7

totals <- colSums(word_mat)
Y <- cbind(dat_df$pay)
X <- word_mat[,totals>1]

mod <- lm(log1p(Y) ~ X)

coefs <- mod %>% coef %>% extract(!is.na(.))%>%sort()

data_frame(value = coefs,
           word = names(coefs)%>%gsub("^X", "", .))%>%
        mutate(word = factor(word)%>%reorder(value, mean))->effects


effects %>%
    filter(value > quantile(value, 0.975) | value < quantile(value, 0.025))%>%
    ggplot(aes(word, value))+
      geom_point()

plot of chunk unnamed-chunk-8