# Jim_Comey

library(RColorBrewer)
library(scales)
# Remove every "<word characters>:<whitespace>" run from the format values
# (presumably a label prefix such as "Format: " -- confirm against the data).
df$format <- gsub("(\\w+:\\s+)", "", df$format)

# Horizontal bar chart of the number of reviews per book format. Formats with
# more than 2000 reviews are filled with the highlight colour.
df %>%
  group_by(format) %>%
  summarise(n = n()) %>%
  mutate(ToHighlight = ifelse(n > 2000, "yes", "no")) %>%
  ggplot(aes(format, n, fill = ToHighlight)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = comma) +
  theme_classic() +
  # guide = "none" hides the fill legend. The previous `guide = F` depended on
  # the reassignable `F` alias and on the logical guide form that ggplot2 has
  # deprecated in favour of "none".
  scale_fill_manual(
    values = c("yes" = "#F0E442", "no" = "#0072B2"),
    guide = "none"
  ) +
  labs(
    title = "Which Book Format had the most Reviews?",
    subtitle = "Number of Reviews per Book Type for James Comey's \'A Higher Loyalty\'",
    caption = "Source: Amazon.com",
    x = "Format",
    y = "Number of Reviews"
  ) +
  theme(
    plot.title = element_text(family = "", face = "bold", colour = "black", size = 18),
    plot.subtitle = element_text(family = "", face = "italic", colour = "black", size = 10),
    plot.caption = element_text(family = "", colour = "black", size = 10),
    axis.title.x = element_text(family = "", colour = "black", size = 12),
    axis.title.y = element_text(family = "", colour = "black", size = 12),
    axis.line = element_line(size = 1, colour = "dark grey")
  )

# Line chart of the number of reviews per calendar month (dates are floored
# to the first day of their month before counting).
df %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(n = n()) %>%
  ggplot(aes(month, n)) +
  geom_line(colour = "#F0E442", size = 2) +
  scale_y_continuous(labels = comma) +
  theme_classic() +
  # Single place where the labels are set. The earlier xlab(), ylab() and
  # ggtitle() calls were dropped: this labs() call came later in the chain and
  # replaced every label they set, so they only contradicted the final plot.
  labs(
    title = "Review drop off?",
    subtitle = "Number of Reviews (aggregated by month)",
    caption = "Source: Amazon.com",
    x = "",
    y = "Number of Reviews"
  ) +
  theme(
    plot.title = element_text(family = "", face = "bold", colour = "black", size = 18),
    plot.subtitle = element_text(family = "", face = "italic", colour = "black", size = 10),
    plot.caption = element_text(family = "", colour = "black", size = 10),
    axis.title.x = element_text(family = "", colour = "black", size = 12),
    axis.title.y = element_text(family = "", colour = "black", size = 12),
    axis.line = element_line(size = 1, colour = "dark grey")
  )

reviews <- df

# Tokenise every comment into two-word sequences, one row per bigram.
bigrams <- reviews %>%
  unnest_tokens(bigram, comments, token = "ngrams", n = 2)

# Split each bigram into its two words so they can be filtered separately.
bigrams_sep <- bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep only bigrams where both words are present and neither is a stop word.
# The is.na() filter is required: `NA %in% x` is FALSE, so a missing word
# would pass the stop-word filters and unite() below would then paste it into
# the literal string "NA NA", which ends up in the tf-idf table.
bigrams_filtered <- bigrams_sep %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

# Retained as a safety net; with the is.na() filter above it should no longer
# remove any rows.
bigram_counts <- na.omit(bigram_counts)

# Re-join the two words into a single "word1 word2" column.
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")


# tf-idf of every bigram, using the star rating as the document identifier.
bigram_td_idf <- bigrams_united %>%
  count(stars, bigram) %>%
  bind_tf_idf(bigram, stars, n)

# Highest tf-idf first.
bigram_td_idf <- arrange(bigram_td_idf, desc(tf_idf))

# Interactive table with per-column filter boxes above the header.
DT::datatable(
  bigram_td_idf,
  colnames = c("Rating", "Bigram", "N", "tf", "idf", "tf_idf"),
  filter = "top"
)

## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

AFINN <- get_sentiments("afinn")

# Count the sentiment-bearing words that directly follow "not".
# NOTE(review): this relies on the AFINN lexicon exposing its numeric column
# as `score` -- confirm against the installed tidytext/textdata version.
not_words <- bigrams_sep %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()

# Plot the 20 words with the largest absolute contribution
# (sentiment score times number of occurrences), ordered by contribution.
not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  # Reuse the computed column instead of recomputing n * score in the mapping.
  ggplot(aes(word2, contribution, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

negation_words <- c("not", "no", "never", "without")

# The 15 most frequent (negation word, sentiment word) pairs.
negated_words <- bigrams_sep %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE) %>%
  head(15) %>%
  ungroup()

# One panel per negation word. The x label is built from `negation_words` so
# it always lists exactly the words that were filtered on (the previous
# hard-coded label was garbled and omitted "without").
ggplot(negated_words, aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab(paste(
    "Words preceded by",
    paste(sprintf("\"%s\"", negation_words), collapse = ", ")
  )) +
  ylab("Sentiment score * number of occurrences") +
  coord_flip() +
  facet_wrap(~word1)

# df <- df[na_in_review != T,]
# df <- df[df$Rating <= 2,]

# Bigram word cloud over all review comments.
#
# Build and clean a tm corpus. When this script was last rendered, every
# tm_map() step up to and including PlainTextDocument emitted
# "Warning in tm_map.SimpleCorpus(...): transformation drops documents".
mycorpus <- Corpus(VectorSource(df$comments))
mycorpus <- tm_map(mycorpus, content_transformer(tolower))
mycorpus <- tm_map(mycorpus, removeNumbers)
mycorpus <- tm_map(mycorpus, removeWords, stopwords("english"))
mycorpus <- tm_map(mycorpus, removePunctuation)
mycorpus <- tm_map(mycorpus, stripWhitespace)
mycorpus <- tm_map(mycorpus, PlainTextDocument)
mycorpus <- tm_map(mycorpus, removeWords, c("NA", "NANA", "NA NA"))

# Only bigrams occurring at least this many times are drawn.
minfreq_bigram <- 45
token_delim <- " \\t\\r\\n,!?,;\"()"

# Tokenise into exactly two-word n-grams and tabulate their frequencies.
bitoken <- NGramTokenizer(
  mycorpus,
  Weka_control(min = 2, max = 2, delimiters = token_delim)
)
two_word <- data.frame(table(bitoken))
sort_two <- two_word[order(two_word$Freq, decreasing = TRUE), ]

# "Accent" has at most 8 colours. Requesting 9 only produced a warning and
# fell back to the 8-colour palette, so asking for 8 gives the same colours
# without the warning.
wordcloud(
  sort_two$bitoken,
  sort_two$Freq,
  random.order = FALSE,
  scale = c(2, 0.35),
  min.freq = minfreq_bigram,
  colors = colorRampPalette(brewer.pal(8, "Accent"))(32)
)

# Bigram word cloud restricted to reviews rated two stars or fewer.
#
# which() drops rows whose comparison is NA. A bare logical index would turn
# every NA in `stars` into an all-NA row, and those rows would then feed
# missing comments into the corpus.
df <- df[which(df$stars <= 2), ]

# Build and clean a tm corpus. When this script was last rendered, every
# tm_map() step up to and including PlainTextDocument emitted
# "Warning in tm_map.SimpleCorpus(...): transformation drops documents".
mycorpus <- Corpus(VectorSource(df$comments))
mycorpus <- tm_map(mycorpus, content_transformer(tolower))
mycorpus <- tm_map(mycorpus, removeNumbers)
mycorpus <- tm_map(mycorpus, removeWords, stopwords("english"))
mycorpus <- tm_map(mycorpus, removePunctuation)
mycorpus <- tm_map(mycorpus, stripWhitespace)
mycorpus <- tm_map(mycorpus, PlainTextDocument)
mycorpus <- tm_map(mycorpus, removeWords, c("NA", "NANA", "NA NA"))

# Only bigrams occurring at least this many times are drawn.
minfreq_bigram <- 3
token_delim <- " \\t\\r\\n,!?,;\"()"

# Tokenise into exactly two-word n-grams and tabulate their frequencies.
bitoken <- NGramTokenizer(
  mycorpus,
  Weka_control(min = 2, max = 2, delimiters = token_delim)
)
two_word <- data.frame(table(bitoken))
sort_two <- two_word[order(two_word$Freq, decreasing = TRUE), ]

# "Accent" has at most 8 colours. Requesting 9 only produced a warning and
# fell back to the 8-colour palette, so asking for 8 gives the same colours
# without the warning.
wordcloud(
  sort_two$bitoken,
  sort_two$Freq,
  random.order = FALSE,
  scale = c(2, 0.35),
  min.freq = minfreq_bigram,
  colors = colorRampPalette(brewer.pal(8, "Accent"))(32)
)

# Jim_Comey
#
# Chris Shockley
#
# September 15, 2018