This is an R Markdown document containing the R code used to extract certain information about the IMDB Top 250 movies. We extract all the required information and write it to a dataset (.csv file) so that it can be used for various types of analysis.
This article shows how to extract data from the IMDB website using the “rvest” package in R. We also use SelectorGadget, an open-source tool for generating CSS selectors. In addition, we need the XML package.
First, we load all the required libraries.
#Read the required libraries
library("rvest")
## Loading required package: xml2
library("XML")
##
## Attaching package: 'XML'
## The following object is masked from 'package:rvest':
##
## xml
library("gtools")
library ("ggplot2")
library(data.table)
Find all movies that were released between 1996 and 1998 (both years inclusive).
#Read the IMDB files
read_IMDB <- read.csv("IMDB top 250.csv")
#Movies between year 1996 and 1998 (both years included)
# Convert via as.character() first: if `year` was read as a factor,
# as.numeric() alone would return the internal level codes, not the years.
release_year <- as.numeric(as.character(read_IMDB$year))
# %in% never yields NA, so rows whose year cannot be parsed are simply excluded.
movies <- read_IMDB[release_year %in% 1996:1998, , drop = FALSE]
Read each movie's IMDB page from its URL and scrape the following information:
Director, stars, Taglines, Genres, (partial) storyline, Box office budget and box office gross.
#Reading IMDB pages
# as.character() keeps the loop working when the link column was read as a factor.
urls <- as.character(read_IMDB$movie.link)

# Collapse a scraped character vector into exactly one string so that every
# movie contributes exactly one row; an empty result becomes NA.
as_single_string <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(NA_character_)
  }
  paste(x, collapse = " ")
}

# Text of the idx-th node of a node set with newlines removed, or NA when the
# node set has fewer than idx nodes.
node_text_at <- function(nodes, idx) {
  if (length(nodes) < idx) {
    return(NA_character_)
  }
  gsub("\n", "", html_text(nodes[idx]))
}

# Part of a single string that precedes the first occurrence of `marker`
# (NA for NA or empty input).
text_before <- function(x, marker) {
  if (is.na(x)) {
    return(NA_character_)
  }
  strsplit(x, marker, fixed = TRUE)[[1]][1]
}

# Among the text blocks that start with `label`, take the last one, strip the
# "<label>:" prefix and anything from the first "(" onwards. Returns NA when
# no block starts with `label`.
labelled_value <- function(texts, label) {
  texts <- trimws(gsub("\n", "", texts))
  hits <- which(startsWith(texts, label))
  if (length(hits) == 0) {
    return(NA_character_)
  }
  value <- gsub(paste0(label, ":"), "", texts[hits[length(hits)]], fixed = TRUE)
  trimws(text_before(trimws(value), "("))
}

#Retrieving movie summary from all web pages
# One slot per URL; filling a preallocated list avoids re-copying the whole
# data frame on every iteration, which rbind() inside the loop would do.
movie_rows <- vector("list", length(urls))

#Loop through all the URL to extract information
for (i in seq_along(urls)) {
  page <- read_html(urls[[i]])

  #Web Scraping for getting movie name
  movie_name <- as_single_string(trimws(html_text(html_nodes(page, ".title_wrapper h1"))))
  movie_name <- text_before(movie_name, "(") # Keep only the text before the first '('

  #Web Scraping for getting movie summary
  movie_summary <- html_text(html_nodes(page, ".summary_text"))
  movie_summary <- as_single_string(trimws(gsub("\n", "", movie_summary)))

  #Web Scraping for getting movie's story line
  story_line <- html_text(html_nodes(page, "#titleStoryLine p"))
  story_line <- as_single_string(gsub("\n", "", story_line))

  #Web Scraping for getting movie's tag line
  # NOTE(review): relies on the tagline being the 2nd matched node - confirm
  # against the current page layout.
  txt_blocks <- html_nodes(page, ".canwrap .inline, .txt-block")
  tag_line <- trimws(gsub("Taglines:", "", node_text_at(txt_blocks, 2), fixed = TRUE))
  tag_line <- text_before(tag_line, "»") # Remove '»' and what follows

  #Web Scraping for getting genres
  # NOTE(review): relies on the genres being the 3rd ".canwrap" node - confirm.
  genre_nodes <- html_nodes(page, ".canwrap")
  genres <- trimws(gsub("Genres:", "", node_text_at(genre_nodes, 3), fixed = TRUE))

  #Web Scraping for getting movie budget
  # Budget and gross are NA for pages that do not list them, instead of
  # silently reusing the value scraped for the previous movie.
  budget <- labelled_value(html_text(html_nodes(page, ".subheading+ .txt-block")), "Budget")

  #Web Scraping for getting movie gross income
  gross_budget <- labelled_value(html_text(html_nodes(page, ".txt-block")), "Gross")

  #Web Scraping for getting movie's director and stars
  # NOTE(review): relies on director being the 1st and stars the 3rd matched
  # node - confirm.
  cast <- html_nodes(page, ".credit_summary_item , #titleDetails .inline , .subheading+ .txt-block")
  director <- trimws(gsub("Director:", "", node_text_at(cast, 1), fixed = TRUE))
  stars <- gsub("Stars:", "", node_text_at(cast, 3), fixed = TRUE)
  stars <- trimws(gsub("See full cast & crew", "", stars, fixed = TRUE))
  stars <- text_before(stars, "»") # Remove '»' and what follows

  #Collect this movie's fields as a single one-row data frame
  movie_rows[[i]] <- data.frame(
    Movie_Name = movie_name,
    Director = director,
    Stars = stars,
    Tagline = tag_line,
    Genres = genres,
    Summary = movie_summary,
    Storyline = story_line,
    Budget = budget,
    Gross = gross_budget,
    stringsAsFactors = FALSE
  )
}

#Bind all the per-movie rows into a single dataframe movie_info
# (NULL when there were no URLs to process)
movie_info <- do.call(rbind, movie_rows)
Write the extracted information into a dataset (.csv file) using the dataframe created above
# Save the scraped data frame to disk without the row-name column.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
write.csv(movie_info, "Movie Info.csv", row.names = FALSE)
Compare information about Movie-Count and Genre
#Table movie-count vs genres
#Read the dataset that was created above
read_movie_info <- read.csv("Movie Info.csv")

#Split every movie's genre string on the "|" delimiter
# as.character() is needed because strsplit() rejects factor input.
genres <- strsplit(as.character(read_movie_info$Genres), "|", fixed = TRUE)

# Only the first genre listed for each movie is counted. It is trimmed so that
# e.g. "Drama" and "Drama " are not tallied as two different genres, and a
# movie with no genre text yields NA (ignored by table()) instead of an error.
primary_genre <- vapply(
  genres,
  function(g) if (length(g) == 0) NA_character_ else trimws(g[[1]]),
  character(1)
)
movie_genre_count <- table(primary_genre)

#Form a dataframe to show the movie-count vs genres info
movie_genre_count <- data.frame(movie_genre_count)
colnames(movie_genre_count) <- c("Genres", "Movie-Count")

#Plot graph between movie-count vs genres
# Columns are referenced by bare name inside aes(); ggplot2 discourages df$col there.
ggplot(movie_genre_count, aes(x = Genres, y = `Movie-Count`)) +
  geom_bar(stat = "identity") +
  xlab("Genres") +
  ylab("Movie-Count")
#The above graph reveals that movies of genre "Drama" feature the most in the IMDB Top-250 list and genre "Sci-Fi" appears the least.

movie <- read_movie_info
#Remove $ and , so the gross can be treated as a number
# as.numeric() is used instead of as.integer() because amounts above
# 2,147,483,647 overflow to NA as integers.
movie$Gross <- as.numeric(gsub("[$,]", "", as.character(movie$Gross)))

#Highest Gross Income
# na.rm = TRUE: one movie without a gross value would otherwise make the result NA.
max_gross <- max(movie$Gross, na.rm = TRUE)
highest_gross <- subset(movie, Gross == max_gross)
#Above analysis shows that the movie "Star Wars: The Force Awakens" has the highest gross income

#Lowest Gross Income
min_gross <- min(movie$Gross, na.rm = TRUE)
lowest_gross <- subset(movie, Gross == min_gross)
#Above analysis shows that the movie "All About Eve" has the lowest gross income