library(jsonlite)


library(jsonlite)
library(httr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
## 
##     subplot
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:httr':
## 
##     config
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Web-Scraping and Analyzing IMDb’s Top 2500 Domestic-Grossing Movies

Using the tools and packages discussed in this course (SelectorGadget, Inspect Item, View Source, rvest, xml2, etc.), scrape a website of your choosing that includes significant amount of text and some numerical data. Place the data into a data.frame, and code the variables properly (e.g., factor, numeric, character, etc.)

library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:Hmisc':
## 
##     html
library(xml2)

imdb=NULL

for (i in 1:50){
a=paste0("https://www.imdb.com/search/title?sort=boxoffice_gross_us&title_type=feature&page=",i)
link=read_html(a)
rank=link %>% html_nodes(".text-primary") %>% html_text()
rank=gsub("\\.","",rank)
title=link %>% html_nodes(".lister-item-header a") %>% html_text()
year=link %>% html_nodes(".text-muted.unbold") %>% html_text()
year=gsub("\\(","",year)
year=gsub("\\)","",year)
year=gsub("I", "", year)
year=gsub("X", "", year)
year=gsub(" ", "", year)
rating=link %>% html_nodes(".certificate") %>% html_text()
duration=link %>% html_nodes(".text-muted .runtime") %>% html_text()
duration=gsub(" min","",duration)
genre=link %>% html_nodes(".genre") %>% html_text()
genre=gsub("\\n","",genre)
genre=trimws(genre,"r")
metascore=link %>% html_nodes(".metascore") %>% html_text()
synopsis=link %>% html_nodes(".ratings-bar+ .text-muted") %>% html_text()
synopsis=gsub("\n    ","",synopsis)
director=link %>% html_nodes(".text-muted+ p a:nth-child(1)") %>% html_text()
directors=link %>% html_nodes("p a:nth-child(2) , .lister-item-content a:nth-child(1)") %>% html_text()
actors=link %>% html_nodes(".lister-item-content .ghost~ a") %>% html_text()
gross=link %>% html_nodes(".ghost~ .text-muted+ span") %>% html_text()
gross=gsub("\\$","",gross)
gross=gsub("M","",gross)

imdb_nextpage = data.frame(cbind(rank,title,year,duration,genre,synopsis,director,gross), stringsAsFactors = FALSE) #metascore, directors, actors, and rating have been removed
imdb = rbind(imdb,imdb_nextpage)}

imdb$rank=as.factor(imdb$rank)
imdb$year=as.factor(imdb$year)
imdb$genre=as.factor(imdb$genre)
imdb$director=as.factor(imdb$director)
imdb$duration=as.numeric(imdb$duration)
imdb$gross=as.numeric(imdb$gross)

str(imdb)
## 'data.frame':    2500 obs. of  8 variables:
##  $ rank    : Factor w/ 2500 levels "1","1,000","1,001",..: 1 1112 1724 1835 1946 2057 2168 2279 2390 1002 ...
##  $ title   : chr  "Star Wars: The Force Awakens" "Avatar" "Black Panther" "Avengers: Infinity War" ...
##  $ year    : Factor w/ 74 levels "1937","1939",..: 70 64 73 73 52 70 67 72 73 63 ...
##  $ duration: num  136 162 134 149 194 124 143 152 118 152 ...
##  $ genre   : Factor w/ 262 levels "Action, Adventure",..: 7 7 12 5 227 12 12 7 100 30 ...
##  $ synopsis: chr  "Three decades after the Empire's defeat, a new threat arises in the militant First Order. Stormtrooper defector"| __truncated__ "A paraplegic marine dispatched to the moon Pandora on a unique mission becomes torn between following his order"| __truncated__ "T'Challa, heir to the hidden but advanced kingdom of Wakanda, must step forward to lead his people into a new f"| __truncated__ "The Avengers and their allies must be willing to sacrifice all in an attempt to defeat the powerful Thanos befo"| __truncated__ ...
##  $ director: Factor w/ 999 levels "Aaron Blaise",..: 368 375 824 51 375 168 520 758 94 157 ...
##  $ gross   : num  937 761 700 679 659 ...

2. Text mine this data using other tools and packages of your choosing (tm, dplyr, etc.). Be sure to clean the data by removing stopwords, spaces, etc.

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## The following object is masked from 'package:httr':
## 
##     content
library(wordcloud)
## Loading required package: RColorBrewer
library(tidytext)
library(tidyverse)
## -- Attaching packages ------------------------------------ tidyverse 1.2.1 --
## v tibble  1.4.2     v purrr   0.2.5
## v tidyr   0.8.1     v dplyr   0.7.6
## v readr   1.1.1     v stringr 1.3.1
## v tibble  1.4.2     v forcats 0.3.0
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x NLP::annotate()         masks ggplot2::annotate()
## x plotly::config()        masks httr::config()
## x NLP::content()          masks httr::content()
## x dplyr::filter()         masks plotly::filter(), stats::filter()
## x purrr::flatten()        masks jsonlite::flatten()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x rvest::html()           masks Hmisc::html()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()
## x dplyr::src()            masks Hmisc::src()
## x dplyr::summarize()      masks Hmisc::summarize()
review_source=VectorSource(imdb$synopsis)
corpus=Corpus(review_source)
corpus=tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus=tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
## drops documents
corpus=tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation
## drops documents
corpus=tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
dtm=DocumentTermMatrix(corpus)
dtm2=as.matrix(dtm)

frequency=colSums(dtm2)
frequency=sort(frequency, decreasing=TRUE)
words<-names(frequency)
wordcloud(words[1:100], frequency[1:100], scale = c(2.5, .2))

frequency=sort(frequency, decreasing=TRUE)
frequency[1:10]
##    new  young   must    man   life    two family  world  woman    one 
##    265    231    222    199    196    196    177    170    162    152
barplot(frequency[1:10], main="Top 10 Synopsis Words: TM", cex.names=0.7)

#Looking at common synopsis words we see general descriptors of  characters (man, woman), change/stakes ("new", "must", "life") and mentions of "family." You make a lot of commonly used trailer cliches like "In a world where one man ________, he meets a special woman, and together, they must find a new way to _______."



mytext=imdb$synopsis
df=as.list(mytext)
mydataframe=data_frame(text=df)
newdf=mydataframe%>%
unnest_tokens(word, text)%>%
  anti_join(stop_words)
## Joining, by = "word"
newdf%>%
  count(word, sort=TRUE)%>%
  filter(n>=100)%>%
  mutate(word=reorder(word,n))%>%
  ggplot(aes(word,n))+
  geom_col()+
  xlab(NULL)+
  coord_flip()+
  ggtitle('Top 10 Synopsis Words: Tidytext')+
  theme(plot.title = element_text(hjust = 0.5))

#When applying the Tidytext stopwords we're left with relatable, broad topics (life, family, love, war), common settings (home, school) and talking about a "world." Interestingly "man" is apparently a Tidytext stopword while "woman" is not; meanwhile, the tm stopwords don't contain either word.

Web-scraping the genres

review_source2=VectorSource(imdb$genre)
corpus2=Corpus(review_source2)
corpus2=tm_map(corpus2, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus2, content_transformer(tolower)):
## transformation drops documents
corpus2=tm_map(corpus2, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus2, removePunctuation): transformation
## drops documents
corpus2=tm_map(corpus2, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus2, stripWhitespace): transformation
## drops documents
corpus2=tm_map(corpus2, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus2, removeWords, stopwords("english")):
## transformation drops documents
dtm3=DocumentTermMatrix(corpus2)
dtm4=as.matrix(dtm3)

frequency2=colSums(dtm4)
frequency2=sort(frequency2, decreasing=TRUE)
words2<-names(frequency2)
wordcloud(words2[1:10], frequency2[1:10], scale = c(2.5, .2))

frequency2=sort(frequency2, decreasing=TRUE)
frequency2[1:10]
##    comedy     drama    action adventure     crime   romance  thriller 
##      1077      1033       766       739       420       398       369 
##    family   fantasy     scifi 
##       265       263       230
barplot(frequency2[1:10], main="Most Common Genres in Movies", cex.names=0.7, las=2)

#Comedy and drama are the most common movie genres, followed by action and adventure.

Sentiment Analysis

nrc=get_sentiments("nrc")

newdf%>%
  inner_join(nrc)%>%
  count(sentiment, sort=TRUE)
## Joining, by = "word"
## # A tibble: 10 x 2
##    sentiment        n
##    <chr>        <int>
##  1 positive      4241
##  2 negative      3953
##  3 fear          3049
##  4 trust         2838
##  5 anger         2074
##  6 anticipation  1996
##  7 sadness       1762
##  8 joy           1721
##  9 disgust       1315
## 10 surprise      1117
mysent=newdf%>%
  inner_join(get_sentiments("bing"))%>%
  count(sentiment)%>%
  spread(sentiment, n, fill=0)%>%
  mutate(sentiment=positive-negative)
## Joining, by = "word"
mysent
## # A tibble: 1 x 3
##   negative positive sentiment
##      <dbl>    <dbl>     <dbl>
## 1     3393     1526     -1867
#When looking at the movie synopses, NRC and Bing give completely different sentiment analysis stories: they're slightly positive according to NRC, but firmly negative according to Bing. Interestingly I personally would've though that movie descriptions were more negative (siding with Bing), but the NRC emotions (fear, trust, anger, anticipation, sadness, etc.) seem like common emotions for descriptions since they point to (potential) conflict; the descriptions should suggest that some change is about to happen in characters' lives to get you to want to watch the movie.