library('rvest')
## Warning: package 'rvest' was built under R version 3.3.2
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
# IMDb title search: feature films released in 2016, 100 results on the page
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
# Download and parse the page once; all later selectors query this document
imdb_page <- read_html(x = url)
# Scrape the ranking of each movie from the '.text-primary' nodes
rank_data_html <- imdb_page %>% html_nodes(".text-primary")
rank_data <- rank_data_html %>% html_text()
# Inspect the raw text before converting it
str(rank_data)
## chr [1:100] "1." "2." "3." "4." "5." "6." "7." "8." ...
# Data-Preprocessing: the rank text ("1.", "2.", ...) is converted to numbers
rank_data <- as.numeric(rank_data)
str(rank_data)
## num [1:100] 1 2 3 4 5 6 7 8 9 10 ...
# Scrape the titles: the links found inside each '.lister-item-header'
title_data_html <- html_nodes(x = imdb_page, css = ".lister-item-header a")
# Extract the link text as a character vector of titles
title_data <- html_text(x = title_data_html)
# Preview the first few titles
head(title_data)
## [1] "Rogue One" "Sing" "Moana"
## [4] "Split" "Passengers" "The Belko Experiment"
# Scrape the description of each movie: the '.text-muted' element that
# directly follows the ratings bar
description_data_html <- html_nodes(imdb_page, '.ratings-bar+ .text-muted')
# Converting the description data to text
description_data <- html_text(description_data_html)
# Let's have a look at the raw description data
head(description_data)
## [1] "\nThe Rebel Alliance makes a risky move to steal the plans for the Death Star, setting up the epic saga to follow."
## [2] "\nIn a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing competition becomes grander than he anticipates even as its finalists' find that their lives will never be the same."
## [3] "\nIn Ancient Polynesia, when a terrible curse incurred by the Demigod Maui reaches an impetuous Chieftain's daughter's island, she answers the Ocean's call to seek out the Demigod to set things right."
## [4] "\nThree girls are kidnapped by a man with a diagnosed 23 distinct personalities, they must try to escape before the apparent emergence of a frightful new 24th."
## [5] "\nA spacecraft traveling to a distant colony planet and transporting thousands of people has a malfunction in its sleep chambers. As a result, two passengers are awakened 90 years early."
## [6] "\nIn a twisted social experiment, 80 Americans are locked in their high-rise corporate office in Bogota, Colombia and ordered by an unknown voice coming from the company's intercom system to participate in a deadly game of kill or be killed."
# Data-Preprocessing: removing the "\n" that the raw text carries in front of
# every description (visible in the preview above)
description_data <- gsub("\n", "", description_data)
# Scrape the runtime of each movie from the '.runtime' nodes nested under
# '.text-muted'
runtime_data_html <- imdb_page %>% html_nodes(".text-muted .runtime")
runtime_data <- runtime_data_html %>% html_text()
# Preview the raw runtime text
head(runtime_data)
## [1] "133 min" "108 min" "107 min" "117 min" "116 min" "88 min"
# Data-Preprocessing: drop the " min" unit, then convert what is left to numbers
runtime_data <- as.numeric(gsub(" min", "", runtime_data, fixed = TRUE))
# Scrape the genre text of each movie from the '.genre' nodes
genre_data_html <- html_nodes(x = imdb_page, css = ".genre")
genre_data <- html_text(x = genre_data_html)
# Preview the raw genre text
head(genre_data)
## [1] "\nAction, Adventure, Sci-Fi "
## [2] "\nAnimation, Comedy, Family "
## [3] "\nAnimation, Adventure, Comedy "
## [4] "\nHorror, Thriller "
## [5] "\nAdventure, Drama, Romance "
## [6] "\nAction, Horror, Thriller "
# Data-Preprocessing: remove every newline and every space in one pass
genre_data <- gsub("[\n ]", "", genre_data)
# Keep only the first genre: cut everything from the first comma onwards
genre_data <- gsub(",.*", "", genre_data)
# Store the remaining single genre per movie as a factor
genre_data <- as.factor(genre_data)
# Preview the cleaned genre data
head(genre_data)
## [1] Action Animation Animation Horror Adventure Action
## 10 Levels: Action Adventure Animation Biography Comedy Crime ... Thriller
# Scrape the IMDb rating of each movie: the <strong> element inside
# '.ratings-imdb-rating'
rating_data_html <- imdb_page %>% html_nodes(".ratings-imdb-rating strong")
rating_data <- rating_data_html %>% html_text()
# Preview the raw rating text
head(rating_data)
## [1] "8.0" "7.2" "7.7" "7.4" "7.0" "6.6"
# Data-Preprocessing: the rating text is converted to numbers
rating_data <- as.numeric(rating_data)
# Preview the numeric ratings
head(rating_data)
## [1] 8.0 7.2 7.7 7.4 7.0 6.6
# Scrape the vote count of each movie: the second child span of
# '.sort-num_votes-visible'
votes_data_html <- html_nodes(
  x = imdb_page,
  css = ".sort-num_votes-visible span:nth-child(2)"
)
votes_data <- html_text(x = votes_data_html)
# Preview the raw vote counts
head(votes_data)
## [1] "277,945" "41,491" "92,525" "83,758" "150,410" "1,738"
# Data-Preprocessing: remove the comma separators, then convert to numbers
votes_data <- as.numeric(gsub(",", "", votes_data, fixed = TRUE))
# Preview the numeric vote counts
head(votes_data)
## [1] 277945 41491 92525 83758 150410 1738
# Scrape the directors: inside the <p> that directly follows a '.text-muted'
# element, take the link that is the first child of its parent
directors_data_html <- imdb_page %>% html_nodes(".text-muted+ p a:nth-child(1)")
# Extract the link text as a character vector of director names
directors_data <- directors_data_html %>% html_text()
# Preview the first few directors
head(directors_data)
## [1] "Gareth Edwards" "Christophe Lourdelet" "Ron Clements"
## [4] "M. Night Shyamalan" "Morten Tyldum" "Greg McLean"
# Scrape the actors: within '.lister-item-content', the link that directly
# follows a '.ghost' element
actors_data_html <- html_nodes(
  x = imdb_page,
  css = ".lister-item-content .ghost+ a"
)
# Extract the link text as a character vector of actor names
actors_data <- html_text(x = actors_data_html)
# Preview the first few actors
head(actors_data)
## [1] "Felicity Jones" "Matthew McConaughey" "Auli'i Cravalho"
## [4] "James McAvoy" "Jennifer Lawrence" "John Gallagher Jr."
# Data-Preprocessing: store the actor names as a factor
actors_data <- as.factor(actors_data)
# Scrape the Metascore of each movie.
#
# Some movies have no Metascore, so selecting '.metascore' across the whole
# page returns fewer values than there are movies and the values no longer line
# up with the other columns. The earlier workaround spliced "NA" strings in at
# hand-picked positions; once a position exceeded the current length, the
# range i:length(x) counted downwards and the result became a 101-element list
# holding a NULL and a duplicated score.
#
# Instead, select one container per movie and ask each container for its
# '.metascore' node. html_node() returns exactly one result per input node,
# and a container without a score yields a missing node that html_text()
# reports as NA, so every score stays in its own movie's row.
# NOTE(review): assumes each movie's score sits inside its
# '.lister-item-content' container - confirm against the page markup.
movie_items_html <- html_nodes(imdb_page, '.lister-item-content')
metascore_data_html <- html_node(movie_items_html, '.metascore')
# Converting the metascore data to text (NA where a movie has no score)
metascore_data <- html_text(metascore_data_html)
# Data-Preprocessing: converting metascores to numerical; missing scores stay NA
metascore_data <- as.numeric(metascore_data)
# Let's have a look at the metascore data, including how many are missing
summary(metascore_data)
# Scrape the gross revenue of each movie.
#
# Not every movie lists a gross figure, so a page-wide selection returns fewer
# values than there are movies. Rather than splicing placeholders in at
# hand-picked positions (which goes out of step with the page and, once a
# position exceeds the current length, appends a reversed slice), query each
# movie container separately: html_node() returns one result per container and
# a missing node where no figure exists, which html_text() reports as NA.
# NOTE(review): assumes the gross figure sits inside each movie's
# '.lister-item-content' container - confirm against the page markup.
gross_items_html <- html_nodes(imdb_page, '.lister-item-content')
gross_data_html <- html_node(gross_items_html, '.ghost~ .text-muted+ span')
# Converting the gross revenue data to text (NA where a movie has no figure)
gross_data <- html_text(gross_data_html)
# Let's have a look at the gross revenue data
head(gross_data)
## [1] "$530.75M" "$269.36M" "$248.04M" "$136.86M" "$99.47M" "$7.58M"
# Data-Preprocessing: strip the "$" prefix and "M" suffix and convert to
# numbers (in millions). Removing the characters by pattern keeps every digit;
# a fixed-width substring(x, 2, 6) would clip "530.75" to "530.7".
gross_data <- as.numeric(gsub("[$M]", "", gross_data))
# One entry per movie container; movies without a gross figure are NA
length(gross_data)
# Combine the scraped vectors into one data frame with one row per movie.
# All inputs must have the same length; data.frame() stops with an error if
# they do not.
movies_df <- data.frame(
  Rank = rank_data,
  Title = title_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  # unlist() flattens list input and leaves a plain vector unchanged;
  # as.numeric() then stores the scores as numbers, turning score text and
  # "NA" placeholders into numeric values / NA
  Metascore = as.numeric(unlist(metascore_data)),
  Votes = votes_data,
  # Gross earnings are a quantity (in millions), so keep them numeric rather
  # than as a factor, which could not be summed, averaged or plotted on a
  # continuous axis
  Gross_Earning_in_Mil = as.numeric(unlist(gross_data)),
  Director = directors_data
)
library(ggplot2)
# Scatter plot of votes against rating; points are coloured by genre and
# sized by runtime. Title is mapped to the label aesthetic for the text layer.
rating_votes_plot <- ggplot(
  data = movies_df,
  mapping = aes(x = Rating, y = Votes, label = Title)
) +
  geom_point(mapping = aes(colour = Genre, size = Runtime))
rating_votes_plot

# Same scatter plot with a text layer drawing each movie's title
rating_votes_plot +
  geom_text(size = 4)

library(plotly)
## Warning: package 'plotly' was built under R version 3.3.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive marker scatter of votes against rating, coloured by genre;
# each marker carries its movie title as text
plot_ly(
  data = movies_df,
  type = "scatter",
  mode = "markers",
  x = ~Rating,
  y = ~Votes,
  color = ~Genre,
  text = ~paste('Title: ', Title)
)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors