Web-Scraping and Analyzing IMDb’s Top 2500 Domestic-Grossing Movies
Using the tools and packages discussed in this course (SelectorGadget, Inspect Item, View Source, rvest, xml2, etc.), scrape a website of your choosing that includes significant amount of text and some numerical data. Place the data into a data.frame, and code the variables properly (e.g., factor, numeric, character, etc.)
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:Hmisc':
##
## html
library(xml2)
imdb=NULL
for (i in 1:50){
a=paste0("https://www.imdb.com/search/title?sort=boxoffice_gross_us&title_type=feature&page=",i)
link=read_html(a)
rank=link %>% html_nodes(".text-primary") %>% html_text()
rank=gsub("\\.","",rank)
title=link %>% html_nodes(".lister-item-header a") %>% html_text()
year=link %>% html_nodes(".text-muted.unbold") %>% html_text()
year=gsub("\\(","",year)
year=gsub("\\)","",year)
year=gsub("I", "", year)
year=gsub("X", "", year)
year=gsub(" ", "", year)
rating=link %>% html_nodes(".certificate") %>% html_text()
duration=link %>% html_nodes(".text-muted .runtime") %>% html_text()
duration=gsub(" min","",duration)
genre=link %>% html_nodes(".genre") %>% html_text()
genre=gsub("\\n","",genre)
genre=trimws(genre,"r")
metascore=link %>% html_nodes(".metascore") %>% html_text()
synopsis=link %>% html_nodes(".ratings-bar+ .text-muted") %>% html_text()
synopsis=gsub("\n ","",synopsis)
director=link %>% html_nodes(".text-muted+ p a:nth-child(1)") %>% html_text()
directors=link %>% html_nodes("p a:nth-child(2) , .lister-item-content a:nth-child(1)") %>% html_text()
actors=link %>% html_nodes(".lister-item-content .ghost~ a") %>% html_text()
gross=link %>% html_nodes(".ghost~ .text-muted+ span") %>% html_text()
gross=gsub("\\$","",gross)
gross=gsub("M","",gross)
imdb_nextpage = data.frame(cbind(rank,title,year,duration,genre,synopsis,director,gross), stringsAsFactors = FALSE) #metascore, directors, actors, and rating have been removed
imdb = rbind(imdb,imdb_nextpage)}
imdb$rank=as.factor(imdb$rank)
imdb$year=as.factor(imdb$year)
imdb$genre=as.factor(imdb$genre)
imdb$director=as.factor(imdb$director)
imdb$duration=as.numeric(imdb$duration)
imdb$gross=as.numeric(imdb$gross)
str(imdb)
## 'data.frame': 2500 obs. of 8 variables:
## $ rank : Factor w/ 2500 levels "1","1,000","1,001",..: 1 1112 1724 1835 1946 2057 2168 2279 2390 1002 ...
## $ title : chr "Star Wars: The Force Awakens" "Avatar" "Black Panther" "Avengers: Infinity War" ...
## $ year : Factor w/ 74 levels "1937","1939",..: 70 64 73 73 52 70 67 72 73 63 ...
## $ duration: num 136 162 134 149 194 124 143 152 118 152 ...
## $ genre : Factor w/ 262 levels "Action, Adventure",..: 7 7 12 5 227 12 12 7 100 30 ...
## $ synopsis: chr "Three decades after the Empire's defeat, a new threat arises in the militant First Order. Stormtrooper defector"| __truncated__ "A paraplegic marine dispatched to the moon Pandora on a unique mission becomes torn between following his order"| __truncated__ "T'Challa, heir to the hidden but advanced kingdom of Wakanda, must step forward to lead his people into a new f"| __truncated__ "The Avengers and their allies must be willing to sacrifice all in an attempt to defeat the powerful Thanos befo"| __truncated__ ...
## $ director: Factor w/ 999 levels "Aaron Blaise",..: 368 375 824 51 375 168 520 758 94 157 ...
## $ gross : num 937 761 700 679 659 ...
2. Text mine this data using other tools and packages of your choosing (tm, dplyr, etc.). Be sure to clean the data by removing stopwords, spaces, etc.
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
## The following object is masked from 'package:httr':
##
## content
library(wordcloud)
## Loading required package: RColorBrewer
library(tidytext)
library(tidyverse)
## -- Attaching packages ------------------------------------ tidyverse 1.2.1 --
## v tibble 1.4.2 v purrr 0.2.5
## v tidyr 0.8.1 v dplyr 0.7.6
## v readr 1.1.1 v stringr 1.3.1
## v tibble 1.4.2 v forcats 0.3.0
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x NLP::annotate() masks ggplot2::annotate()
## x plotly::config() masks httr::config()
## x NLP::content() masks httr::content()
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x purrr::flatten() masks jsonlite::flatten()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x rvest::html() masks Hmisc::html()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
## x dplyr::src() masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
review_source=VectorSource(imdb$synopsis)
corpus=Corpus(review_source)
corpus=tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus=tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
## drops documents
corpus=tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation
## drops documents
corpus=tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
dtm=DocumentTermMatrix(corpus)
dtm2=as.matrix(dtm)
frequency=colSums(dtm2)
frequency=sort(frequency, decreasing=TRUE)
words<-names(frequency)
wordcloud(words[1:100], frequency[1:100], scale = c(2.5, .2))

frequency=sort(frequency, decreasing=TRUE)
frequency[1:10]
## new young must man life two family world woman one
## 265 231 222 199 196 196 177 170 162 152
barplot(frequency[1:10], main="Top 10 Synopsis Words: TM", cex.names=0.7)

#Looking at common synopsis words we see general descriptors of characters (man, woman), change/stakes ("new", "must", "life") and mentions of "family." You make a lot of commonly used trailer cliches like "In a world where one man ________, he meets a special woman, and together, they must find a new way to _______."
mytext=imdb$synopsis
df=as.list(mytext)
mydataframe=data_frame(text=df)
newdf=mydataframe%>%
unnest_tokens(word, text)%>%
anti_join(stop_words)
## Joining, by = "word"
newdf%>%
count(word, sort=TRUE)%>%
filter(n>=100)%>%
mutate(word=reorder(word,n))%>%
ggplot(aes(word,n))+
geom_col()+
xlab(NULL)+
coord_flip()+
ggtitle('Top 10 Synopsis Words: Tidytext')+
theme(plot.title = element_text(hjust = 0.5))

#When applying the Tidytext stopwords we're left with relatable, broad topics (life, family, love, war), common settings (home, school) and talking about a "world." Interestingly "man" is apparently a Tidytext stopword while "woman" is not; meanwhile, the tm stopwords don't contain either word.
Web-scraping the genres
review_source2=VectorSource(imdb$genre)
corpus2=Corpus(review_source2)
corpus2=tm_map(corpus2, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus2, content_transformer(tolower)):
## transformation drops documents
corpus2=tm_map(corpus2, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus2, removePunctuation): transformation
## drops documents
corpus2=tm_map(corpus2, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus2, stripWhitespace): transformation
## drops documents
corpus2=tm_map(corpus2, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus2, removeWords, stopwords("english")):
## transformation drops documents
dtm3=DocumentTermMatrix(corpus2)
dtm4=as.matrix(dtm3)
frequency2=colSums(dtm4)
frequency2=sort(frequency2, decreasing=TRUE)
words2<-names(frequency2)
wordcloud(words2[1:10], frequency2[1:10], scale = c(2.5, .2))

frequency2=sort(frequency2, decreasing=TRUE)
frequency2[1:10]
## comedy drama action adventure crime romance thriller
## 1077 1033 766 739 420 398 369
## family fantasy scifi
## 265 263 230
barplot(frequency2[1:10], main="Most Common Genres in Movies", cex.names=0.7, las=2)

#Comedy and drama are the most common movie genres, followed by action and adventure.
Build some plots using ggplot2 to categorize the data (e.g.., top 10 word frequency counts, histogram for numeric data, etc.) Discuss what your analysis indicates. Upload your final project to RPubs, and provide the link as your submission.
library(ggplot2)
library(dplyr)
ggplot(subset(imdb, year %in% c("1999":"2018"))) +
geom_boxplot(mapping = aes(x = year, y = gross))+
theme(axis.text.x = element_text(angle = 90, hjust = 0))+
ggtitle('Movie Domestic Grosses by Year: 1999-2018')+
theme(plot.title = element_text(hjust = 0.5))

#Have movies overall been generating more revenue (i.e., continue to increasing grosses from consumers) over the past 20 years? Not really for movies overall, as the boxes (i.e., medians and interquartile ranges) remain in range of each other. The outliers on the other hand are a different story, as those are have been hitting higher extremes in recent years due to mega-blockbusters like the Star Wars and Marvel movies.
ggplot(subset(imdb, year %in% c("1999":"2018"))) +
geom_boxplot(mapping = aes(x = year, y = duration))+
theme(axis.text.x = element_text(angle = 90, hjust = 0))+
ggtitle('Movie Durations by Year: 1999-2018')+
theme(plot.title = element_text(hjust = 0.5))

#Movie durations are a similar story: they've been comparable over the years.
ggplot(subset(imdb, year %in% c("1999":"2018"))) +
geom_point(mapping = aes(x = duration, y = gross))+
geom_smooth(mapping = aes(x = duration, y = gross))+
ggtitle('Movie Durations and Domestic Gross: 1999-2018')+
theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#Movies longer than 120 minutes end up making more money, with the sweet spot being around 150 minutes. Part of this could be what the audiences want, but another part could be movies that expect to make more could be given the latitude to have longer lengths, since there comes a point where a longer length means fewer showings at a theater on a given day.
Sentiment Analysis
nrc=get_sentiments("nrc")
newdf%>%
inner_join(nrc)%>%
count(sentiment, sort=TRUE)
## Joining, by = "word"
## # A tibble: 10 x 2
## sentiment n
## <chr> <int>
## 1 positive 4241
## 2 negative 3953
## 3 fear 3049
## 4 trust 2838
## 5 anger 2074
## 6 anticipation 1996
## 7 sadness 1762
## 8 joy 1721
## 9 disgust 1315
## 10 surprise 1117
mysent=newdf%>%
inner_join(get_sentiments("bing"))%>%
count(sentiment)%>%
spread(sentiment, n, fill=0)%>%
mutate(sentiment=positive-negative)
## Joining, by = "word"
mysent
## # A tibble: 1 x 3
## negative positive sentiment
## <dbl> <dbl> <dbl>
## 1 3393 1526 -1867
#When looking at the movie synopses, NRC and Bing give completely different sentiment analysis stories: they're slightly positive according to NRC, but firmly negative according to Bing. Interestingly I personally would've though that movie descriptions were more negative (siding with Bing), but the NRC emotions (fear, trust, anger, anticipation, sadness, etc.) seem like common emotions for descriptions since they point to (potential) conflict; the descriptions should suggest that some change is about to happen in characters' lives to get you to want to watch the movie.