Capstone: Wrapping up sentiment analysis

Review: Sentiment Analysis

How do people feel about movies?

library(tidyverse)

## ── Attaching packages ─────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.2.1     ✔ forcats 0.3.0

## ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# Display titles of the four films; positions line up with `movies_urls`.
movies <- c(
  "Venom",
  "Antman",
  "Avengers",
  "BlackPanther"
)

# Base review URL for each film. A page number is appended directly after
# "page=", followed by the `url_after` query suffix.
movies_urls <- c(
  "https://www.rottentomatoes.com/m/venom_2018/reviews/?page=",
  "https://www.rottentomatoes.com/m/ant_man_and_the_wasp/reviews/?page=",
  "https://www.rottentomatoes.com/m/avengers_infinity_war/reviews/?page=",
  "https://www.rottentomatoes.com/m/black_panther_2018/reviews/?page="
)

# Query-string suffix placed after the page number.
url_after <- "&type=user&sort="

Web Crawling & Creating Text Dataset

library(rvest)
# Collect the text of every user review for one movie.
#
# Args:
#   x: Base review URL ending in "page=". The page number and the global
#      `url_after` suffix are appended to it to form each request.
#
# Returns:
#   A character vector with the text of every ".user_review" node found
#   inside ".review_table", in page order; NULL when nothing was collected.
#
# Pages 1..500 are requested in turn. Crawling stops at the first page that
# contains no reviews or that cannot be fetched.
review_extractor <- function(x){
  max_pages <- 500
  # One slot per page: filling a preallocated list avoids re-copying the
  # whole result on every iteration, as growing a vector with c() does.
  page_reviews <- vector("list", max_pages)
  for(page in seq_len(max_pages)){
    url <- paste0(x, page, url_after)
    # A failed request must not throw away the pages already collected:
    # report it as a warning and stop crawling instead of aborting.
    htxt <- tryCatch(
      read_html(url),
      error = function(e){
        warning("Stopped crawling ", x, " at page ", page, ": ",
                conditionMessage(e), call. = FALSE)
        NULL
      }
    )
    if(is.null(htxt)){break}
    review_table <- html_nodes(htxt, ".review_table")
    content <- html_nodes(review_table, ".user_review")
    reviews <- html_text(content)
    if(length(reviews) == 0){break}
    page_reviews[[page]] <- reviews
    print(page)  # progress indicator
  }
  # unlist() skips the unused NULL slots and returns NULL when no page
  # yielded anything, which is what an empty crawl returned before.
  unlist(page_reviews, use.names = FALSE)
}

library(future.apply)
# Crawl the movies in parallel background R sessions. `multisession` is used
# because the `multiprocess` strategy is deprecated in the future package.
plan(multisession, workers = 7)
# future_lapply() always returns a list with one element per URL.
# future_sapply() would simplify the result to a matrix whenever every movie
# returned the same number of reviews, breaking the per-movie lookup below.
all_reviews <- future_lapply(movies_urls, review_extractor)

# One row per review: movie title, review text, and a sequential review id.
marvel_reviews <- map2(movies, all_reviews,
                       function(movie, reviews) tibble(title = movie, text = reviews)) %>% 
  bind_rows() %>% 
  rowid_to_column("id")

# Number of reviews collected per movie, largest first.
marvel_reviews %>% count(title, sort = TRUE)

save(marvel_reviews, file="marvel_reviews.RData")

Text Pre-processing

library(future.apply)

## Loading required package: future

## 
## Attaching package: 'future.apply'

## The following object is masked from 'package:future':
## 
##     future_lapply

# Restore the crawled `marvel_reviews` table.
# NOTE(review): this is a machine-specific absolute path, whereas the table is
# saved to "marvel_reviews.RData" in the working directory — confirm both
# refer to the same file.
load(file = "~/Dropbox/2018_Class_Teaching/KMOOC/KMOOC/marvel_reviews.RData")

# Strip markup residue from raw review text and lower-case it.
#
# Args:
#   doc: Character vector of review texts (processed element-wise).
#
# Returns:
#   A character vector of the same length with, in order:
#     1. the HTML entities &amp; &lt; &gt; removed,
#     2. URLs removed (anything starting with http/https or "www."),
#     3. every run of non-ASCII characters removed,
#     4. all letters converted to lower case.
remove.func <- function(doc) {
  doc <- str_remove_all(doc, "&amp;|&lt;|&gt;")
  # The dot after "www" is escaped so that only a literal "www." starts a URL;
  # unescaped, it matched any character and deleted ordinary tokens that
  # merely begin with "www".
  doc <- str_remove_all(doc, "https?[^[:space:]]+|www\\.[^[:space:]]+")
  doc <- str_remove_all(doc, "[^[:ascii:]]+")
  tolower(doc)
}

# Clean every review. remove.func() is built from vectorised string functions,
# so it is applied to the whole column at once: a per-element future_sapply()
# adds dispatch overhead, names each result with its full input text, and
# returns a list instead of a character vector when there are no rows.
marvel_reviews <- marvel_reviews %>% 
  mutate(text = remove.func(text))

# Split each review into sentences at . ! ? or : (one output row per sentence).
marvel_review_sents <- marvel_reviews %>% 
  tidytext::unnest_tokens(sentence, text, token = "regex", pattern = "[.!?:]")

library(textclean)
# Normalise text by running it through a fixed sequence of textclean
# replacement helpers.
#
# Args:
#   x: Text to normalise.
#
# Returns:
#   The text after every replacement below has been applied, in this order.
#   Times, money amounts, names and dates are replaced by a single space;
#   numbers are removed; whitespace is normalised last.
replace.func <- function(x){
  x %>%
    replace_rating() %>%
    replace_ordinal() %>%
    replace_symbol(pound = FALSE, at = FALSE) %>%
    replace_time(replacement = " ") %>%
    replace_kern() %>%
    replace_internet_slang() %>%
    replace_contraction() %>%
    replace_emoji() %>%
    replace_emoticon() %>%
    replace_word_elongation() %>%
    replace_money(replacement = " ") %>%
    replace_number(remove = TRUE) %>%
    replace_names(replacement = " ") %>%
    replace_html(symbol = FALSE) %>%
    replace_date(replacement = " ") %>%
    replace_non_ascii() %>%
    replace_white()
}

# Normalise every sentence in parallel and keep the result as `text`.
# future_vapply() guarantees a plain character vector (exactly one string per
# sentence) and USE.NAMES = FALSE stops each element from being named with
# its full input sentence, which future_sapply() does by default.
marvel_review_sents <- marvel_review_sents %>% 
  mutate(text = future_vapply(sentence, replace.func,
                              FUN.VALUE = character(1), USE.NAMES = FALSE)) %>% 
  dplyr::select(-sentence)
save(marvel_review_sents, file="marvel_review_sents.RData")

Analyzing the sentiment score of each sentence within each review

library(qdap)
# Score the polarity of every sentence, grouping the sentences by review id.
marvel_sentiment <- with(
  marvel_review_sents,
  polarity(text, grouping.var = id)
)
save(marvel_sentiment, file = "marvel_sentiment.RData")

# Reload the stored polarity result.
# NOTE(review): absolute, machine-specific path — confirm it is the same file
# as the "marvel_sentiment.RData" written just above.
load(file = "~/Dropbox/2018_Class_Teaching/KMOOC/KMOOC/marvel_sentiment.RData")

class(marvel_sentiment)

## [1] "polarity" "list"

# Sentence-level scores.
head(marvel_sentiment[["all"]])

##   id wc   polarity         pos.words      neg.words
## 1  1 10 -1.1384200                 -   critics, bad
## 2  1 12  0.8660254 super, hero, good              -
## 3  1 23 -0.6255432      like, enough          choke
## 4  1 20 -0.2236068              good bogus, critics
## 5  2  2  0.7071068             loved              -
## 6  2  2  0.7071068             great              -
##                                                                                                              text.var
## 1                                                                  the so called critics really are bad at their jobs
## 2                                                          this is the first super hero movie in awhile that was good
## 3        i guess they did not like it because disney did not pay them or there was not enough sjw content to choke on
## 4 this is a good movie watch it at the theaters and don 1 / 2 1 / 2 1 / 2 ( tm ) t listen to the bogus critics scores
## 5                                                                                                            loved it
## 6                                                                                                         great movie

# Review-level summary: one row per review id.
head(marvel_sentiment[["group"]])

##   id total.sentences total.words ave.polarity sd.polarity
## 1  1               4          65   -0.2803861   0.8510462
## 2  2               2           4    0.7071068   0.0000000
## 3  3               1          11    0.3015113          NA
## 4  4               1           2    0.0000000          NA
## 5  5               1           8    0.7071068          NA
## 6  6               1          40    0.0000000          NA
##   stan.mean.polarity
## 1         -0.3294606
## 2                Inf
## 3                 NA
## 4                 NA
## 5                 NA
## 6                 NA

# Attach the movie title to each review's polarity summary.
# The tables are matched on the review `id`, not on row position: if any
# review produced no sentence, the summary has fewer rows than there are
# reviews and a positional match would attach the wrong titles.
marvel_sentiment_group <- as_tibble(marvel_sentiment$group) %>% 
  # NOTE(review): the type polarity() gives the grouping column is not visible
  # here; going through character handles factor, character and integer ids
  # and yields the integer type of marvel_reviews$id.
  mutate(id = as.integer(as.character(id))) %>% 
  rowid_to_column("rowid") %>% 
  left_join(marvel_reviews %>% 
              dplyr::select(id, title) %>% 
              distinct(), by = "id")

# Mean review-level polarity per movie.
marvel_sentiment_group %>% 
  group_by(title) %>% 
  summarise(ave.polarity = mean(ave.polarity, na.rm = TRUE))

## # A tibble: 4 x 2
##   title        ave.polarity
##   <chr>               <dbl>
## 1 Antman              0.275
## 2 Avengers            0.225
## 3 BlackPanther        0.149
## 4 Venom               0.170

The tidytext approach to sentiment analysis

library(tidytext)
# Split the cleaned sentences into words and keep only the words found in the
# Bing lexicon, each labelled with its sentiment.
# The input column is `text`: the `sentence` column was dropped when the
# sentences were normalised, so tokenising `sentence` fails on this table.
# The join has no `by`, so it matches on the shared `word` column.
marvel_sentiment_tidy <- marvel_review_sents %>% 
  unnest_tokens(word, text) %>% 
  inner_join(get_sentiments("bing"))

## Joining, by = "word"

# Share of each sentiment label among the sentiment words of every movie:
# per-(title, sentiment) counts joined to the per-title totals.
left_join(
  count(marvel_sentiment_tidy, title, sentiment),
  summarise(group_by(marvel_sentiment_tidy, title), total = n())
) %>%
  mutate(freq = n / total)

## Joining, by = "title"

## # A tibble: 8 x 5
##   title        sentiment     n total  freq
##   <chr>        <chr>     <int> <int> <dbl>
## 1 Antman       negative   1911  6441 0.297
## 2 Antman       positive   4530  6441 0.703
## 3 Avengers     negative   1600  5048 0.317
## 4 Avengers     positive   3448  5048 0.683
## 5 BlackPanther negative   2512  7370 0.341
## 6 BlackPanther positive   4858  7370 0.659
## 7 Venom        negative   3011  6821 0.441
## 8 Venom        positive   3810  6821 0.559

Visualization

library(wordcloud)

## Loading required package: RColorBrewer

# Sentiment words in reviews about Venom
# Word cloud of the positive words, sized by how often each one occurs.
marvel_sentiment_tidy %>% 
  filter(title == "Venom",
         sentiment == "positive") %>% 
  count(sentiment, word, sort = TRUE) %>% 
  # `words` is spelled out in full: `word =` only worked through partial
  # argument matching.
  with(wordcloud(words = word,
                 freq = n,
                 rot.per = 0.1,
                 min.freq = 10,
                 max.words = 200,
                 scale = c(2, 0.3),
                 random.order = FALSE, 
                 colors = brewer.pal(8, "Dark2")))

# Overall distribution of sentiments toward the movie
library(ggplot2)
library(ggridges)

## 
## Attaching package: 'ggridges'

## The following object is masked from 'package:ggplot2':
## 
##     scale_discrete_manual

# Density ridge of the review-level average polarity for Venom, shaded from
# blue (low x) to red (high x).
ggplot(
  data = filter(marvel_sentiment_group, title == "Venom"),
  mapping = aes(x = ave.polarity, y = title, fill = ..x..)
) +
  geom_density_ridges_gradient(show.legend = FALSE) +
  scale_fill_gradient(low = "blue", high = "red") +
  labs(x = "Negative (Blue) <--> Positive (Red)", y = "") +
  theme_bw()

## Picking joint bandwidth of 0.0631

# Persist both result tables to the working directory.
save(list = "marvel_sentiment_tidy", file = "marvel_sentiment_tidy.RData")
save(list = "marvel_sentiment_group", file = "marvel_sentiment_group.RData")

Capstone: Wrapping up sentiment analysis

Shin Lee

12/7/2018

Review: Sentiment Analysis

How do people feel about movies?