How do people feel about movies?
library(tidyverse)
## ── Attaching packages ─────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.2.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Query-string tail appended after the page number of every review URL.
url_after <- "&type=user&sort="

# Short display titles, in the same order as `movies_urls` below.
movies <- c(
  "Venom",
  "Antman",
  "Avengers",
  "BlackPanther"
)

# Paginated user-review URL prefixes; the page number is appended to each.
movies_urls <- c(
  "https://www.rottentomatoes.com/m/venom_2018/reviews/?page=",
  "https://www.rottentomatoes.com/m/ant_man_and_the_wasp/reviews/?page=",
  "https://www.rottentomatoes.com/m/avengers_infinity_war/reviews/?page=",
  "https://www.rottentomatoes.com/m/black_panther_2018/reviews/?page="
)
- Web crawling and creating the text dataset
library(rvest)
# Collect the text of every user review reachable from one paginated URL.
#
# x          URL prefix ending in "page="; the page number and the global
#            `url_after` suffix are appended to it.
# max_pages  Upper bound on the number of pages requested (default 500, the
#            previously hard-coded limit).
#
# Returns a character vector of review texts, or NULL when no page yielded
# any review. Prints each page number as it is scraped.
review_extractor <- function(x, max_pages = 500) {
  # One slot per page: filling a preallocated list avoids re-copying the
  # whole result on every iteration.
  pages <- vector("list", max_pages)
  for (page in seq_len(max_pages)) {
    url <- paste0(x, page, url_after)
    # A failed request ends the crawl but keeps the pages gathered so far.
    htxt <- tryCatch(
      read_html(url),
      error = function(e) {
        warning("Stopping at page ", page, " of ", x, ": ",
                conditionMessage(e), call. = FALSE)
        NULL
      }
    )
    if (is.null(htxt)) {
      break
    }
    review_table <- html_nodes(htxt, ".review_table")
    content <- html_nodes(review_table, ".user_review")
    reviews <- html_text(content)
    # A page without reviews ends the crawl.
    if (length(reviews) == 0) {
      break
    }
    pages[[page]] <- reviews
    print(page)
  }
  # Unfilled slots are NULL and vanish here; an empty crawl yields NULL.
  unlist(pages, use.names = FALSE)
}
library(future.apply)
# Scrape the movies in parallel. `multisession` replaces the deprecated
# `multiprocess` plan, and there is no point in starting more workers than
# there are URLs or available cores.
plan(multisession, workers = min(length(movies_urls), availableCores()))

# future_lapply() always returns a list, so `all_reviews[[i]]` is the review
# vector of movie i even if every movie happens to have the same number of
# reviews (future_sapply() would simplify that case to a matrix).
all_reviews <- future_lapply(movies_urls, review_extractor)

# One row per review, tagged with its movie title and a running review id.
marvel_reviews <- map2_dfr(
  movies,
  all_reviews,
  ~ tibble(title = .x, text = .y)
) %>%
  rowid_to_column("id")

# Number of reviews collected per movie.
marvel_reviews %>% count(title, sort = TRUE)

save(marvel_reviews, file = "marvel_reviews.RData")
library(future.apply)
## Loading required package: future
##
## Attaching package: 'future.apply'
## The following object is masked from 'package:future':
##
## future_lapply
# Restore `marvel_reviews` from a previously saved copy instead of re-scraping.
# NOTE(review): this absolute path differs from the working-directory path
# used by the save() call above — confirm both refer to the same file.
load("~/Dropbox/2018_Class_Teaching/KMOOC/KMOOC/marvel_reviews.RData")
# Strip markup characters, links and non-ASCII characters from review text
# and lower-case the result.
#
# doc  Character vector of raw review texts.
#
# Returns a character vector of the same length as `doc`.
remove.func <- function(doc) {
  # NOTE(review): the pattern below removes the bare characters & < >; it may
  # originally have targeted the HTML entities for them — confirm.
  doc <- str_remove_all(doc, "&|<|>")
  # The dot after "www" is escaped so that only real "www." links are removed;
  # unescaped it matched any character and deleted words such as "awwwesome".
  doc <- str_remove_all(doc, "https?[^[:space:]]+|(www\\.)[^[:space:]]+")
  doc <- str_remove_all(doc, "[^[:ascii:]]+")
  tolower(doc)
}
# Clean every review. remove.func() is built from vectorised string functions,
# so it is applied to the whole column at once; wrapping it in future_sapply()
# only added per-element overhead and attached each review text as its own
# element name.
marvel_reviews <- marvel_reviews %>%
  mutate(text = remove.func(text))

# Split each review into sentence-like pieces at . ! ? and :
marvel_review_sents <- marvel_reviews %>%
  tidytext::unnest_tokens(sentence, text, token = "regex", pattern = "[.!?:]")
library(textclean)
# Normalise text by running it through a fixed sequence of textclean
# replacements. The steps run in exactly the order listed, each one receiving
# the output of the previous one.
#
# x  Character vector of text to normalise.
#
# Returns the text after the final whitespace clean-up step.
# NOTE(review): replace_symbol() runs before replace_money() and
# replace_html(); if it already rewrites `$` or `&`, those later steps may
# have less left to match — confirm the order is intended.
replace.func <- function(x) {
  x %>%
    replace_rating() %>%
    replace_ordinal() %>%
    replace_symbol(pound = FALSE, at = FALSE) %>%
    replace_time(replacement = " ") %>%
    replace_kern() %>%
    replace_internet_slang() %>%
    replace_contraction() %>%
    replace_emoji() %>%
    replace_emoticon() %>%
    replace_word_elongation() %>%
    replace_money(replacement = " ") %>%
    replace_number(remove = TRUE) %>%
    replace_names(replacement = " ") %>%
    replace_html(symbol = FALSE) %>%
    replace_date(replacement = " ") %>%
    replace_non_ascii() %>%
    replace_white()
}
# Normalise every sentence in parallel and keep only the cleaned text.
# future_vapply() guarantees a plain character vector (also for zero rows),
# and USE.NAMES = FALSE stops each sentence being stored a second time as the
# name of its own cleaned version.
marvel_review_sents <- marvel_review_sents %>%
  mutate(
    text = future_vapply(sentence, replace.func, character(1),
                         USE.NAMES = FALSE)
  ) %>%
  dplyr::select(-sentence)

save(marvel_review_sents, file = "marvel_review_sents.RData")
- Analyzing the sentiment score of each sentence within each review
library(qdap)
# Score the polarity of every sentence with qdap::polarity(), grouping the
# sentences by the review `id` they came from.
# NOTE(review): polarity() appears to name the grouping column after the bare
# `id` symbol passed here — confirm before rewriting this without with().
marvel_sentiment <- marvel_review_sents %>%
with(polarity(text, grouping.var = id))
# Cache the result in the working directory.
save(marvel_sentiment, file="marvel_sentiment.RData")
# Restore `marvel_sentiment` from a previously saved copy.
# NOTE(review): this absolute path differs from the working-directory path
# used by the save() call above — confirm both refer to the same file.
load("~/Dropbox/2018_Class_Teaching/KMOOC/KMOOC/marvel_sentiment.RData")
# The result is a "polarity" object built on a list.
class(marvel_sentiment)
## [1] "polarity" "list"
# `$all` holds one row per sentence: word count, polarity score and the
# positive / negative words that were matched.
head(marvel_sentiment$all)
## id wc polarity pos.words neg.words
## 1 1 10 -1.1384200 - critics, bad
## 2 1 12 0.8660254 super, hero, good -
## 3 1 23 -0.6255432 like, enough choke
## 4 1 20 -0.2236068 good bogus, critics
## 5 2 2 0.7071068 loved -
## 6 2 2 0.7071068 great -
## text.var
## 1 the so called critics really are bad at their jobs
## 2 this is the first super hero movie in awhile that was good
## 3 i guess they did not like it because disney did not pay them or there was not enough sjw content to choke on
## 4 this is a good movie watch it at the theaters and don 1 / 2 1 / 2 1 / 2 ( tm ) t listen to the bogus critics scores
## 5 loved it
## 6 great movie
# `$group` holds one row per review id: sentence and word totals plus the
# average and standard deviation of the sentence polarities.
head(marvel_sentiment$group)
## id total.sentences total.words ave.polarity sd.polarity
## 1 1 4 65 -0.2803861 0.8510462
## 2 2 2 4 0.7071068 0.0000000
## 3 3 1 11 0.3015113 NA
## 4 4 1 2 0.0000000 NA
## 5 5 1 8 0.7071068 NA
## 6 6 1 40 0.0000000 NA
## stan.mean.polarity
## 1 -0.3294606
## 2 Inf
## 3 NA
## 4 NA
## 5 NA
## 6 NA
# Attach the movie title to every per-review polarity summary.
# The join uses the review `id` itself. Matching on row position silently
# pairs summaries with the wrong title as soon as one review has no sentence
# left after cleaning and therefore no row in `marvel_sentiment$group`.
marvel_sentiment_group <- as_tibble(marvel_sentiment$group) %>%
  # The grouping column may come back as factor or character; convert it to
  # the integer type of `marvel_reviews$id` so the keys are comparable.
  mutate(id = as.integer(as.character(id))) %>%
  rowid_to_column("rowid") %>%
  left_join(distinct(marvel_reviews, id, title), by = "id")

# Mean review-level polarity per movie.
marvel_sentiment_group %>%
  group_by(title) %>%
  summarise(ave.polarity = mean(ave.polarity, na.rm = TRUE))
## # A tibble: 4 x 2
## title ave.polarity
## <chr> <dbl>
## 1 Antman 0.275
## 2 Avengers 0.225
## 3 BlackPanther 0.149
## 4 Venom 0.170
- Sentiment analysis the tidytext way
library(tidytext)
# Split the cleaned sentences into words and keep only the words found in the
# Bing sentiment lexicon. The sentences live in the `text` column: the
# `sentence` column was dropped when the cleaned text was created, so
# tokenising `sentence` fails on the data built by this script.
marvel_sentiment_tidy <- marvel_review_sents %>%
  unnest_tokens(word, text) %>%
  # Explicit key instead of relying on the implicit natural join.
  inner_join(get_sentiments("bing"), by = "word")
## Joining, by = "word"
# Share of positive and negative lexicon words per movie: the per-title,
# per-sentiment counts are joined to the per-title totals and divided.
left_join(
  count(marvel_sentiment_tidy, title, sentiment),
  summarise(group_by(marvel_sentiment_tidy, title), total = n())
) %>%
  mutate(freq = n / total)
## Joining, by = "title"
## # A tibble: 8 x 5
## title sentiment n total freq
## <chr> <chr> <int> <int> <dbl>
## 1 Antman negative 1911 6441 0.297
## 2 Antman positive 4530 6441 0.703
## 3 Avengers negative 1600 5048 0.317
## 4 Avengers positive 3448 5048 0.683
## 5 BlackPanther negative 2512 7370 0.341
## 6 BlackPanther positive 4858 7370 0.659
## 7 Venom negative 3011 6821 0.441
## 8 Venom positive 3810 6821 0.559
library(wordcloud)
## Loading required package: RColorBrewer
# Sentiment words in reviews about Venom
# Word cloud of the positive lexicon words used in Venom reviews, sized by
# how often each word occurs.
marvel_sentiment_tidy %>%
  filter(title == "Venom",
         sentiment == "positive") %>%
  count(sentiment, word, sort = TRUE) %>%
  # `words` is spelled out in full; `word =` only worked through partial
  # argument matching.
  with(wordcloud(words = word,
                 freq = n,
                 rot.per = 0.1,
                 min.freq = 10,
                 max.words = 200,
                 scale = c(2, 0.3),
                 random.order = FALSE,
                 colors = brewer.pal(8, "Dark2")))

# Overall distribution of sentiments toward the movie
library(ggplot2)
library(ggridges)
##
## Attaching package: 'ggridges'
## The following object is masked from 'package:ggplot2':
##
## scale_discrete_manual
# Density ridge of the review-level average polarity for Venom, shaded from
# blue (negative) to red (positive) along the x axis.
ggplot(
  filter(marvel_sentiment_group, title == "Venom"),
  aes(x = ave.polarity, y = title, fill = ..x..)
) +
  geom_density_ridges_gradient(show.legend = FALSE) +
  scale_fill_gradient(low = "blue", high = "red") +
  labs(x = "Negative (Blue) <--> Positive (Red)", y = "") +
  theme_bw()
## Picking joint bandwidth of 0.0631

# Persist both sentiment tables in the working directory for later sessions.
save(list = "marvel_sentiment_tidy", file = "marvel_sentiment_tidy.RData")
save(list = "marvel_sentiment_group", file = "marvel_sentiment_group.RData")