library(jsonlite)


library(jsonlite)
library(httr)
library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:Hmisc':
## 
##     subplot

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:httr':
## 
##     config

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

Web-Scraping and Analyzing IMDb’s Top 2500 Domestic-Grossing Movies

Using the tools and packages discussed in this course (SelectorGadget, Inspect Item, View Source, rvest, xml2, etc.), scrape a website of your choosing that includes significant amount of text and some numerical data. Place the data into a data.frame, and code the variables properly (e.g., factor, numeric, character, etc.)

library(rvest)

## Loading required package: xml2

## 
## Attaching package: 'rvest'

## The following object is masked from 'package:Hmisc':
## 
##     html

library(xml2)

imdb=NULL

for (i in 1:50){
a=paste0("https://www.imdb.com/search/title?sort=boxoffice_gross_us&title_type=feature&page=",i)
link=read_html(a)
rank=link %>% html_nodes(".text-primary") %>% html_text()
rank=gsub("\\.","",rank)
title=link %>% html_nodes(".lister-item-header a") %>% html_text()
year=link %>% html_nodes(".text-muted.unbold") %>% html_text()
year=gsub("\\(","",year)
year=gsub("\\)","",year)
year=gsub("I", "", year)
year=gsub("X", "", year)
year=gsub(" ", "", year)
rating=link %>% html_nodes(".certificate") %>% html_text()
duration=link %>% html_nodes(".text-muted .runtime") %>% html_text()
duration=gsub(" min","",duration)
genre=link %>% html_nodes(".genre") %>% html_text()
genre=gsub("\\n","",genre)
genre=trimws(genre,"r")
metascore=link %>% html_nodes(".metascore") %>% html_text()
synopsis=link %>% html_nodes(".ratings-bar+ .text-muted") %>% html_text()
synopsis=gsub("\n    ","",synopsis)
director=link %>% html_nodes(".text-muted+ p a:nth-child(1)") %>% html_text()
directors=link %>% html_nodes("p a:nth-child(2) , .lister-item-content a:nth-child(1)") %>% html_text()
actors=link %>% html_nodes(".lister-item-content .ghost~ a") %>% html_text()
gross=link %>% html_nodes(".ghost~ .text-muted+ span") %>% html_text()
gross=gsub("\\$","",gross)
gross=gsub("M","",gross)

imdb_nextpage = data.frame(cbind(rank,title,year,duration,genre,synopsis,director,gross), stringsAsFactors = FALSE) #metascore, directors, actors, and rating have been removed
imdb = rbind(imdb,imdb_nextpage)}

imdb$rank=as.factor(imdb$rank)
imdb$year=as.factor(imdb$year)
imdb$genre=as.factor(imdb$genre)
imdb$director=as.factor(imdb$director)
imdb$duration=as.numeric(imdb$duration)
imdb$gross=as.numeric(imdb$gross)

str(imdb)

## 'data.frame':    2500 obs. of  8 variables:
##  $ rank    : Factor w/ 2500 levels "1","1,000","1,001",..: 1 1112 1724 1835 1946 2057 2168 2279 2390 1002 ...
##  $ title   : chr  "Star Wars: The Force Awakens" "Avatar" "Black Panther" "Avengers: Infinity War" ...
##  $ year    : Factor w/ 74 levels "1937","1939",..: 70 64 73 73 52 70 67 72 73 63 ...
##  $ duration: num  136 162 134 149 194 124 143 152 118 152 ...
##  $ genre   : Factor w/ 262 levels "Action, Adventure",..: 7 7 12 5 227 12 12 7 100 30 ...
##  $ synopsis: chr  "Three decades after the Empire's defeat, a new threat arises in the militant First Order. Stormtrooper defector"| __truncated__ "A paraplegic marine dispatched to the moon Pandora on a unique mission becomes torn between following his order"| __truncated__ "T'Challa, heir to the hidden but advanced kingdom of Wakanda, must step forward to lead his people into a new f"| __truncated__ "The Avengers and their allies must be willing to sacrifice all in an attempt to defeat the powerful Thanos befo"| __truncated__ ...
##  $ director: Factor w/ 999 levels "Aaron Blaise",..: 368 375 824 51 375 168 520 758 94 157 ...
##  $ gross   : num  937 761 700 679 659 ...

2. Text mine this data using other tools and packages of your choosing (tm, dplyr, etc.). Be sure to clean the data by removing stopwords, spaces, etc.

library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

## The following object is masked from 'package:httr':
## 
##     content

library(wordcloud)

## Loading required package: RColorBrewer

library(tidytext)
library(tidyverse)

## -- Attaching packages ------------------------------------ tidyverse 1.2.1 --

## v tibble  1.4.2     v purrr   0.2.5
## v tidyr   0.8.1     v dplyr   0.7.6
## v readr   1.1.1     v stringr 1.3.1
## v tibble  1.4.2     v forcats 0.3.0

## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x NLP::annotate()         masks ggplot2::annotate()
## x plotly::config()        masks httr::config()
## x NLP::content()          masks httr::content()
## x dplyr::filter()         masks plotly::filter(), stats::filter()
## x purrr::flatten()        masks jsonlite::flatten()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x rvest::html()           masks Hmisc::html()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()
## x dplyr::src()            masks Hmisc::src()
## x dplyr::summarize()      masks Hmisc::summarize()

review_source=VectorSource(imdb$synopsis)
corpus=Corpus(review_source)
corpus=tm_map(corpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents

corpus=tm_map(corpus, removePunctuation)

## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
## drops documents

corpus=tm_map(corpus, stripWhitespace)

## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation
## drops documents

corpus=tm_map(corpus, removeWords, stopwords("english"))

## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents

dtm=DocumentTermMatrix(corpus)
dtm2=as.matrix(dtm)

frequency=colSums(dtm2)
frequency=sort(frequency, decreasing=TRUE)
words<-names(frequency)
wordcloud(words[1:100], frequency[1:100], scale = c(2.5, .2))

frequency=sort(frequency, decreasing=TRUE)
frequency[1:10]

##    new  young   must    man   life    two family  world  woman    one 
##    265    231    222    199    196    196    177    170    162    152

barplot(frequency[1:10], main="Top 10 Synopsis Words: TM", cex.names=0.7)

#Looking at common synopsis words we see general descriptors of  characters (man, woman), change/stakes ("new", "must", "life") and mentions of "family." You make a lot of commonly used trailer cliches like "In a world where one man ________, he meets a special woman, and together, they must find a new way to _______."



mytext=imdb$synopsis
df=as.list(mytext)
mydataframe=data_frame(text=df)
newdf=mydataframe%>%
unnest_tokens(word, text)%>%
  anti_join(stop_words)

## Joining, by = "word"

newdf%>%
  count(word, sort=TRUE)%>%
  filter(n>=100)%>%
  mutate(word=reorder(word,n))%>%
  ggplot(aes(word,n))+
  geom_col()+
  xlab(NULL)+
  coord_flip()+
  ggtitle('Top 10 Synopsis Words: Tidytext')+
  theme(plot.title = element_text(hjust = 0.5))

#When applying the Tidytext stopwords we're left with relatable, broad topics (life, family, love, war), common settings (home, school) and talking about a "world." Interestingly "man" is apparently a Tidytext stopword while "woman" is not; meanwhile, the tm stopwords don't contain either word.

Web-scraping the genres

review_source2=VectorSource(imdb$genre)
corpus2=Corpus(review_source2)
corpus2=tm_map(corpus2, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(corpus2, content_transformer(tolower)):
## transformation drops documents

corpus2=tm_map(corpus2, removePunctuation)

## Warning in tm_map.SimpleCorpus(corpus2, removePunctuation): transformation
## drops documents

corpus2=tm_map(corpus2, stripWhitespace)

## Warning in tm_map.SimpleCorpus(corpus2, stripWhitespace): transformation
## drops documents

corpus2=tm_map(corpus2, removeWords, stopwords("english"))

## Warning in tm_map.SimpleCorpus(corpus2, removeWords, stopwords("english")):
## transformation drops documents

dtm3=DocumentTermMatrix(corpus2)
dtm4=as.matrix(dtm3)

frequency2=colSums(dtm4)
frequency2=sort(frequency2, decreasing=TRUE)
words2<-names(frequency2)
wordcloud(words2[1:10], frequency2[1:10], scale = c(2.5, .2))

frequency2=sort(frequency2, decreasing=TRUE)
frequency2[1:10]

##    comedy     drama    action adventure     crime   romance  thriller 
##      1077      1033       766       739       420       398       369 
##    family   fantasy     scifi 
##       265       263       230

barplot(frequency2[1:10], main="Most Common Genres in Movies", cex.names=0.7, las=2)

#Comedy and drama are the most common movie genres, followed by action and adventure.

Build some plots using ggplot2 to categorize the data (e.g.., top 10 word frequency counts, histogram for numeric data, etc.) Discuss what your analysis indicates. Upload your final project to RPubs, and provide the link as your submission.

library(ggplot2)
library(dplyr)

ggplot(subset(imdb, year %in% c("1999":"2018"))) +
  geom_boxplot(mapping = aes(x = year, y = gross))+
  theme(axis.text.x = element_text(angle = 90, hjust = 0))+
  ggtitle('Movie Domestic Grosses by Year: 1999-2018')+
  theme(plot.title = element_text(hjust = 0.5))

#Have movies overall been generating more revenue (i.e.,  continue to increasing grosses from consumers) over the past 20 years? Not really for movies overall, as the boxes (i.e., medians and interquartile ranges) remain in range of each other. The outliers on the other hand are a different story, as those are have been hitting higher extremes in recent years due to mega-blockbusters like the Star Wars and Marvel movies.


ggplot(subset(imdb, year %in% c("1999":"2018"))) +
  geom_boxplot(mapping = aes(x = year, y = duration))+
  theme(axis.text.x = element_text(angle = 90, hjust = 0))+
  ggtitle('Movie Durations by Year: 1999-2018')+
  theme(plot.title = element_text(hjust = 0.5))

#Movie durations are a similar story: they've been comparable over the years. 

ggplot(subset(imdb, year %in% c("1999":"2018"))) +
  geom_point(mapping = aes(x = duration, y = gross))+
  geom_smooth(mapping = aes(x = duration, y = gross))+
  ggtitle('Movie Durations and Domestic Gross: 1999-2018')+
  theme(plot.title = element_text(hjust = 0.5))

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#Movies longer than 120 minutes end up making more money, with the sweet spot being around 150 minutes. Part of this could be what the audiences want, but another part could be movies that expect to make more could be given the latitude to have longer lengths, since there comes a point where a longer length means fewer showings at a theater on a given day.

Sentiment Analysis

nrc=get_sentiments("nrc")

newdf%>%
  inner_join(nrc)%>%
  count(sentiment, sort=TRUE)

## Joining, by = "word"

## # A tibble: 10 x 2
##    sentiment        n
##    <chr>        <int>
##  1 positive      4241
##  2 negative      3953
##  3 fear          3049
##  4 trust         2838
##  5 anger         2074
##  6 anticipation  1996
##  7 sadness       1762
##  8 joy           1721
##  9 disgust       1315
## 10 surprise      1117

mysent=newdf%>%
  inner_join(get_sentiments("bing"))%>%
  count(sentiment)%>%
  spread(sentiment, n, fill=0)%>%
  mutate(sentiment=positive-negative)

## Joining, by = "word"

mysent

## # A tibble: 1 x 3
##   negative positive sentiment
##      <dbl>    <dbl>     <dbl>
## 1     3393     1526     -1867

#When looking at the movie synopses, NRC and Bing give completely different sentiment analysis stories: they're slightly positive according to NRC, but firmly negative according to Bing. Interestingly I personally would've though that movie descriptions were more negative (siding with Bing), but the NRC emotions (fear, trust, anger, anticipation, sadness, etc.) seem like common emotions for descriptions since they point to (potential) conflict; the descriptions should suggest that some change is about to happen in characters' lives to get you to want to watch the movie.

Week 6 - Final

Corey Chu

September 16, 2018

Web-Scraping and Analyzing IMDb’s Top 2500 Domestic-Grossing Movies

2. Text mine this data using other tools and packages of your choosing (tm, dplyr, etc.). Be sure to clean the data by removing stopwords, spaces, etc.

Web-scraping the genres

Build some plots using ggplot2 to categorize the data (e.g.., top 10 word frequency counts, histogram for numeric data, etc.) Discuss what your analysis indicates. Upload your final project to RPubs, and provide the link as your submission.

Sentiment Analysis