IMDB Top 100 Best Movies in 2016

library('rvest')

## Warning: package 'rvest' was built under R version 3.3.2

## Loading required package: xml2

## Warning: package 'xml2' was built under R version 3.3.2

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.3.2

# Search-results page requested with count=100, release year 2016, feature titles
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
imdb_page <- url %>% read_html()

# Rank labels: every node carrying the 'text-primary' class
rank_data_html <- imdb_page %>% html_nodes(".text-primary")
rank_data <- rank_data_html %>% html_text()
str(rank_data)

##  chr [1:100] "1." "2." "3." "4." "5." "6." "7." "8." ...

# The labels are text such as "1."; turn them into numbers
rank_data <- as.numeric(rank_data)
str(rank_data)

##  num [1:100] 1 2 3 4 5 6 7 8 9 10 ...

# Select the <a> elements found under each '.lister-item-header'
title_data_html <- imdb_page %>% html_nodes(".lister-item-header a")

# Keep only the text of those nodes
title_data <- title_data_html %>% html_text()

# Preview the first few titles
title_data %>% head()

## [1] "Rogue One"            "Sing"                 "Moana"               
## [4] "Split"                "Passengers"           "The Belko Experiment"

# Select the '.text-muted' element that directly follows each '.ratings-bar'
description_data_html <- imdb_page %>% html_nodes(".ratings-bar+ .text-muted")

# Keep only the text of those nodes
description_data <- description_data_html %>% html_text()

# Preview the first few descriptions
description_data %>% head()

## [1] "\nThe Rebel Alliance makes a risky move to steal the plans for the Death Star, setting up the epic saga to follow."                                                                                                                               
## [2] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists' find that their lives will never be the same."                 
## [3] "\nIn Ancient Polynesia, when a terrible curse incurred by the Demigod Maui reaches an impetuous Chieftain's daughter's island, she answers the Ocean's call to seek out the Demigod to set things right."                                         
## [4] "\nThree girls are kidnapped by a man with a diagnosed 23 distinct personalities, they must try to escape before the apparent emergence of a frightful new 24th."                                                                                  
## [5] "\nA spacecraft traveling to a distant colony planet and transporting thousands of people has a malfunction in its sleep chambers. As a result, two passengers are awakened 90 years early."                                                       
## [6] "\nIn a twisted social experiment, 80 Americans are locked in their high-rise corporate office in Bogota, Colombia and ordered by an unknown voice coming from the company's intercom system to participate in a deadly game of kill or be killed."

# Select the '.runtime' elements nested under '.text-muted'
runtime_data_html <- imdb_page %>% html_nodes(".text-muted .runtime")

# Keep only the text of those nodes
runtime_data <- runtime_data_html %>% html_text()

# Preview the raw runtime strings
runtime_data %>% head()

## [1] "133 min" "108 min" "107 min" "117 min" "116 min" "88 min"

# Drop the " min" suffix, then read what is left as a number
runtime_data <- as.numeric(gsub(" min", "", runtime_data))

# Select every node carrying the 'genre' class
genre_data_html <- imdb_page %>% html_nodes(".genre")

# Keep only the text of those nodes
genre_data <- genre_data_html %>% html_text()

# Preview the raw genre strings
genre_data %>% head()

## [1] "\nAction, Adventure, Sci-Fi            "   
## [2] "\nAnimation, Comedy, Family            "   
## [3] "\nAnimation, Adventure, Comedy            "
## [4] "\nHorror, Thriller            "            
## [5] "\nAdventure, Drama, Romance            "   
## [6] "\nAction, Horror, Thriller            "

# Data-Preprocessing, applied in this order:
#   "\n"  - remove the newline
#   " "   - remove every space
#   ",.*" - cut everything from the first comma on, leaving the first genre
for (pattern in c("\n", " ", ",.*")) {
  genre_data <- gsub(pattern, "", genre_data)
}

# Store the remaining genre label as a factor
genre_data <- genre_data %>% as.factor()

# Preview the cleaned genre data
genre_data %>% head()

## [1] Action    Animation Animation Horror    Adventure Action   
## 10 Levels: Action Adventure Animation Biography Comedy Crime ... Thriller

# Select the <strong> element inside each '.ratings-imdb-rating'
rating_data_html <- imdb_page %>% html_nodes(".ratings-imdb-rating strong")

# Keep only the text of those nodes
rating_data <- rating_data_html %>% html_text()

# Preview the raw rating strings
rating_data %>% head()

## [1] "8.0" "7.2" "7.7" "7.4" "7.0" "6.6"

# Data-Preprocessing: read the rating strings as numbers
rating_data <- rating_data %>% as.numeric()

# Preview the numeric ratings
rating_data %>% head()

## [1] 8.0 7.2 7.7 7.4 7.0 6.6

# Select the second-child <span> inside each '.sort-num_votes-visible'
votes_data_html <- imdb_page %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")

# Keep only the text of those nodes
votes_data <- votes_data_html %>% html_text()

# Preview the raw vote counts
votes_data %>% head()

## [1] "277,945" "41,491"  "92,525"  "83,758"  "150,410" "1,738"

# Data-Preprocessing: strip the thousands separators, then read as numbers
votes_data <- as.numeric(gsub(",", "", votes_data))

# Preview the numeric vote counts
votes_data %>% head()

## [1] 277945  41491  92525  83758 150410   1738

# Select the first-child <a> inside the <p> that directly follows '.text-muted'
directors_data_html <- imdb_page %>% html_nodes(".text-muted+ p a:nth-child(1)")

# Keep only the text of those nodes
directors_data <- directors_data_html %>% html_text()

# Preview the first few directors
directors_data %>% head()

## [1] "Gareth Edwards"       "Christophe Lourdelet" "Ron Clements"        
## [4] "M. Night Shyamalan"   "Morten Tyldum"        "Greg McLean"

# Select the <a> that directly follows '.ghost' within '.lister-item-content'
actors_data_html <- imdb_page %>% html_nodes(".lister-item-content .ghost+ a")

# Keep only the text of those nodes
actors_data <- actors_data_html %>% html_text()

# Preview the first few actors
actors_data %>% head()

## [1] "Felicity Jones"      "Matthew McConaughey" "Auli'i Cravalho"    
## [4] "James McAvoy"        "Jennifer Lawrence"   "John Gallagher Jr."

# Data-Preprocessing: store the actor names as a factor
actors_data <- actors_data %>% as.factor()



# Select every node carrying the 'metascore' class and keep its text

metascore_data_html <- imdb_page %>% html_nodes(".metascore")
metascore_data <- metascore_data_html %>% html_text()

# Summarise the scores after reading them as numbers
metascore_data %>% as.numeric() %>% summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.00   48.00   64.00   61.52   76.00   99.00

# Give every movie a Metascore slot, with NA where the page shows no score.
#
# The page-wide '.metascore' selection above only returns the scores that
# exist, so its values do not line up with the other per-movie vectors.
# The gaps used to be patched by splicing the string "NA" in at hard-coded
# ranks using x[i:length(x)]. Once i is larger than length(x) that range
# counts backwards, which added a NULL entry plus a duplicated score, and the
# splice also turned the data into a list of strings.
#
# Querying each movie's own container avoids positional bookkeeping:
# html_node() returns exactly one result per input node (a missing node when
# nothing matches) and html_text() reports a missing node as NA.
# NOTE(review): assumes each movie is wrapped in one '.lister-item-content'
# element (the class the actors selector also relies on) - confirm on the page.
movie_nodes <- html_nodes(imdb_page, '.lister-item-content')
metascore_data <- html_text(html_node(movie_nodes, '.metascore'))

# Scores are numbers; movies without a score stay NA
metascore_data <- as.numeric(metascore_data)

# The data frame built later needs exactly one value per ranked movie
stopifnot(length(metascore_data) == length(rank_data))

# NOTE: the rendered listing that follows in this document was captured from
# the earlier list-based version and will differ once this is re-run.
summary(metascore_data)

##        Length Class  Mode     
##   [1,] 1      -none- character
##   [2,] 1      -none- character
##   [3,] 1      -none- character
##   [4,] 1      -none- character
##   [5,] 1      -none- character
##   [6,] 1      -none- character
##   [7,] 1      -none- character
##   [8,] 1      -none- character
##   [9,] 1      -none- character
##  [10,] 1      -none- character
##  [11,] 1      -none- character
##  [12,] 1      -none- character
##  [13,] 1      -none- character
##  [14,] 1      -none- character
##  [15,] 1      -none- character
##  [16,] 1      -none- character
##  [17,] 1      -none- character
##  [18,] 1      -none- character
##  [19,] 1      -none- character
##  [20,] 1      -none- character
##  [21,] 1      -none- character
##  [22,] 1      -none- character
##  [23,] 1      -none- character
##  [24,] 1      -none- character
##  [25,] 1      -none- character
##  [26,] 1      -none- character
##  [27,] 1      -none- character
##  [28,] 1      -none- character
##  [29,] 1      -none- character
##  [30,] 1      -none- character
##  [31,] 1      -none- character
##  [32,] 1      -none- character
##  [33,] 1      -none- character
##  [34,] 1      -none- character
##  [35,] 1      -none- character
##  [36,] 1      -none- character
##  [37,] 1      -none- character
##  [38,] 1      -none- character
##  [39,] 1      -none- character
##  [40,] 1      -none- character
##  [41,] 1      -none- character
##  [42,] 1      -none- character
##  [43,] 1      -none- character
##  [44,] 1      -none- character
##  [45,] 1      -none- character
##  [46,] 1      -none- character
##  [47,] 1      -none- character
##  [48,] 1      -none- character
##  [49,] 1      -none- character
##  [50,] 1      -none- character
##  [51,] 1      -none- character
##  [52,] 1      -none- character
##  [53,] 1      -none- character
##  [54,] 1      -none- character
##  [55,] 1      -none- character
##  [56,] 1      -none- character
##  [57,] 1      -none- character
##  [58,] 1      -none- character
##  [59,] 1      -none- character
##  [60,] 1      -none- character
##  [61,] 1      -none- character
##  [62,] 1      -none- character
##  [63,] 1      -none- character
##  [64,] 1      -none- character
##  [65,] 1      -none- character
##  [66,] 1      -none- character
##  [67,] 1      -none- character
##  [68,] 1      -none- character
##  [69,] 1      -none- character
##  [70,] 1      -none- character
##  [71,] 1      -none- character
##  [72,] 1      -none- character
##  [73,] 1      -none- character
##  [74,] 1      -none- character
##  [75,] 1      -none- character
##  [76,] 1      -none- character
##  [77,] 1      -none- character
##  [78,] 1      -none- character
##  [79,] 1      -none- character
##  [80,] 1      -none- character
##  [81,] 1      -none- character
##  [82,] 1      -none- character
##  [83,] 1      -none- character
##  [84,] 1      -none- character
##  [85,] 1      -none- character
##  [86,] 1      -none- character
##  [87,] 1      -none- character
##  [88,] 1      -none- character
##  [89,] 1      -none- character
##  [90,] 1      -none- character
##  [91,] 1      -none- character
##  [92,] 1      -none- character
##  [93,] 1      -none- character
##  [94,] 1      -none- character
##  [95,] 1      -none- character
##  [96,] 1      -none- character
##  [97,] 1      -none- character
##  [98,] 1      -none- character
##  [99,] 1      -none- character
## [100,] 0      -none- NULL     
## [101,] 1      -none- character

# Select the <span> that directly follows a '.text-muted' sibling of '.ghost'
gross_data_html <- imdb_page %>% html_nodes(".ghost~ .text-muted+ span")

# Keep only the text of those nodes
gross_data <- gross_data_html %>% html_text()

# Preview the raw gross revenue strings
gross_data %>% head()

## [1] "$530.75M" "$269.36M" "$248.04M" "$136.86M" "$99.47M"  "$7.58M"

# Clean the gross revenue and give every movie a slot (NA where not reported).
#
# Two problems in the earlier approach are addressed here:
#  * substring(x, 2, 6) kept only five characters, so "$530.75" became
#    "530.7" and lost its last digit.
#  * The page-wide selection returns only the values that exist, and the gaps
#    were patched by splicing the string "NA" in at hard-coded ranks using
#    x[i:length(x)]; once i is larger than length(x) that range counts
#    backwards, adding a NULL entry and a duplicated value, and the data
#    became a list of strings.
#
# Querying each movie's own container keeps the values aligned:
# html_node() returns exactly one result per input node (a missing node when
# nothing matches) and html_text() reports a missing node as NA.
# NOTE(review): assumes each movie is wrapped in one '.lister-item-content'
# element (the class the actors selector also relies on) - confirm on the page.
movie_nodes <- html_nodes(imdb_page, '.lister-item-content')
gross_data <- html_text(html_node(movie_nodes, '.ghost~ .text-muted+ span'))

# Remove the "$" prefix and "M" suffix without truncating any digits,
# then read the value (in millions) as a number
gross_data <- gsub("[$M]", "", gross_data)
gross_data <- as.numeric(gross_data)

# The data frame built later needs exactly one value per ranked movie
stopifnot(length(gross_data) == length(rank_data))

# Number of movies that actually report a gross figure
sum(!is.na(gross_data))



# Combine the scraped vectors into one data frame, one row per movie.
#
# Metascore and gross earnings are numeric measures, so they are stored as
# numbers instead of text / factor levels. unlist() flattens the values when
# they arrive as a list, and as.numeric() reads the text "NA" as a missing
# value.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = as.numeric(unlist(metascore_data)),
  Votes = votes_data,
  Gross_Earning_in_Mil = as.numeric(unlist(gross_data)),
  Director = directors_data
)

library(ggplot2)
# Base scatter plot shared by both figures: votes against rating, with point
# colour mapped to genre and point size mapped to runtime
rating_votes_plot <- ggplot(movies_df, aes(x = Rating, y = Votes, label = Title)) +
  geom_point(aes(col = Genre, size = Runtime))

# Figure 1: points only
rating_votes_plot

# Figure 2: the same points with each movie's title drawn as text
rating_votes_plot +
  geom_text(size = 4)

library(plotly)

## Warning: package 'plotly' was built under R version 3.3.2

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Interactive scatter of votes against rating, coloured by genre; the text
# attached to each marker is built from the movie title
plot_ly(
  data = movies_df,
  x = ~Rating,
  y = ~Votes,
  color = ~Genre,
  type = 'scatter',
  mode = 'markers',
  text = ~paste('Title: ', Title)
)

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

IMDB Top 100 Best Movies in 2016

JHONG TING HUANG

2017年3月27日