Motivation for this Analysis

  1. To understand the approach and nuances of scraping the data from a website and preparing the data for further analysis
  2. Create Hypothesis and identify patterns & insights in data
  3. Present the whole scope of work in an easily understandable and readable format using markdown.

Problem Context and Formulation

  1. Find all movies that were released between 1996 and 1998 (both years inclusive).

  2. Read each movie's IMDb page from its URL and scrape the following information:

    Director, stars, Taglines, Genres, (partial) storyline, Box office budget and box office gross.

  3. Make a dataframe out of these variables as columns with movie name being the first variable.

  4. Make a table movie-count versus Genres

library("rvest")
## Loading required package: xml2
library("XML")
## 
## Attaching package: 'XML'
## The following object is masked from 'package:rvest':
## 
##     xml
library( "stringr" )
library("plyr")
library("data.table")

Read the file that contains the URLs of the top 250 movies on IMDb. This file was created by understanding and running the code provided as part of the assignment. The input file is also included in the submission.

# Load the Top 250 listing; later steps read its `year` and `movie.link` columns.
movies <- read.csv(file = "IMDB Top 250.csv")

Filter the movies that are released in the years between 1996 and 1998 (inclusive)

# Keep only the rows whose release year lies in 1996..1998; which() turns the
# logical mask into row positions, so rows with a missing year are dropped.
filtered_movies <- movies[which(1996 <= movies$year & 1998 >= movies$year), ]
# Plain character URLs, plus the same URLs wrapped in a list for lapply().
urls <- as.character(filtered_movies[["movie.link"]])
urlist <- as.list(urls)

A Function to fetch the required variables from each movie and clean up the data to extract the required information for each field.

# Scrape one IMDb title page and return its details as a character vector.
#
# Args:
#   url: Address of the movie's IMDb page; handed to read_html().
#
# Returns:
#   A character vector of exactly eight elements, always in this order:
#   title, director(s), stars, tagline, genre(s), storyline, budget, gross.
#   Title, tagline and storyline are NA when the page has no matching node.
#   Director, stars and genre are comma-separated (empty string when absent).
#   Budget and gross are "$" followed by the digits found ("$" alone when
#   nothing was found).
#
# Every field is reduced to a single string before the vector is assembled.
# Otherwise a field with no matching node is a zero-length vector that
# disappears inside c(), which shifts every later field one position to the
# left (e.g. genres landing in the tagline slot).
#
# NOTE(review): budget, gross and tagline are located by position
# (:nth-child), so a page with a different block order will yield the digits
# or text of some other block - confirm against the current page layout.
fetchMovieDetails <- function(url) {
  page <- read_html(url)

  # Text of every node matched by a CSS selector (zero-length if none match).
  node_text <- function(css) html_text(html_nodes(page, css))

  # Collapse a scraped field to exactly one string; NA when nothing matched.
  single_value <- function(x, collapse = " ") {
    if (length(x) == 0) NA_character_ else paste(x, collapse = collapse)
  }

  # Remove parenthesised notes such as "(1997)" or "(estimated)".
  strip_parens <- function(x) gsub("\\s*\\([^\\)]+\\)", "", x)

  # "$" followed by the digits of the first matched block ("$" if none).
  dollar_amount <- function(x) {
    raw <- if (length(x) > 0) x[[1]] else ""
    paste0("$", gsub("\\D", "", strip_parens(raw)))
  }

  # Drop the year in brackets, then non-breaking spaces.
  # NOTE(review): the pattern also strips U+00E8 from titles - confirm that
  # this is intended.
  title <- single_value(gsub("\u00E8|\u00A0", "", strip_parens(node_text("h1"))))

  director <- node_text(".summary_text+ .credit_summary_item .itemprop")

  stars <- node_text(
    ".credit_summary_item~ .credit_summary_item+ .credit_summary_item .itemprop"
  )

  # Strip the "See more" link text, newlines and the "Taglines:" label
  # (all with gsub), then trim the surrounding whitespace.
  tagline_raw <- node_text("#titleStoryLine .txt-block:nth-child(8)")
  tagline <- single_value(
    trimws(gsub("Taglines:", "", gsub("\n", "", gsub("See more", "", tagline_raw))))
  )

  genre <- node_text(".txt-block~ .canwrap a")

  # Remove newlines and trim; several paragraphs are joined into one string.
  storyline <- single_value(trimws(gsub("\n", "", node_text("#titleStoryLine p"))))

  budget <- dollar_amount(node_text("#titleDetails .txt-block:nth-child(11)"))

  # Gross is read from the 13th detail block when it exists, otherwise from
  # the 12th.
  gross_primary <- node_text("#titleDetails .txt-block:nth-child(13)")
  gross <- dollar_amount(
    if (length(gross_primary) > 0) {
      gross_primary
    } else {
      node_text("#titleDetails .txt-block:nth-child(12)")
    }
  )

  c(
    title, toString(director), toString(stars), tagline,
    toString(genre), storyline, budget, gross
  )
}

Fetch the movie details for each URL filtered using the User Function.

# One scraped detail vector per filtered movie URL, in the same order as urlist.
movieDetails <- lapply(X = urlist, FUN = fetchMovieDetails)

Store the results of all the movies into a data frame

# Pad every per-movie vector to the length of the longest one so they can sit
# side by side: one column per movie, one row per scraped field.
movieFrame <- data.frame(lapply(movieDetails, `length<-`, max(lengths(movieDetails))))
colnames(movieFrame) <- seq_along(movieFrame)
rownames(movieFrame) <- c(
  "Movie Title", "Director", "Star Cast", "Tag Line",
  "Genre", "Story Line", "Budget", "Gross"
)
# Transpose so that each movie becomes a row and each field a named column.
movieDetailsForAll <- as.data.frame(t(movieFrame))
write.csv(movieDetailsForAll, "IMDB 96 to 98 Movie Details.csv", row.names = FALSE)

Generate the summary on the Genres against the total number of movies

# Split each movie's comma-separated genre string (column 5 of
# movieDetailsForAll) into individual, whitespace-trimmed genre labels.
summaryofmovies<- trimws(unlist(strsplit(as.character(movieDetailsForAll[,5]), ',')))
# Tally how often each genre label occurs.
# NOTE(review): count() presumably resolves to plyr::count - confirm.
movieSummaryOnGenre<-count(summaryofmovies)
# Convert the tally with data.table's setDT(), asking it to keep the row names
# as a column.
# NOTE(review): setDT() works by reference, so movieSummaryOnGenre itself
# presumably gains that extra leading column as well; later code that selects
# columns 2:3 of it appears to rely on this - confirm before reordering.
movieSummaryGenreBased <- setDT(movieSummaryOnGenre,keep.rownames = TRUE)
colnames(movieSummaryGenreBased)<- c("S.No","Type of Genre","Number of movies")
# Print the genre table.
movieSummaryGenreBased
##     S.No Type of Genre Number of movies
##  1:    1        Action                1
##  2:    2     Adventure                1
##  3:    3     Animation                1
##  4:    4        Comedy                4
##  5:    5         Crime                5
##  6:    6         Drama                9
##  7:    7        Family                1
##  8:    8       Fantasy                1
##  9:    9       Mystery                1
## 10:   10        Sci-Fi                1
## 11:   11      Thriller                2
## 12:   12           War                2

Create a summary by Genre with Budget, Gross, ROI details

# Average budget (in thousands) of the movies that belong to one genre.
#
# Reads the global data frame movieDetailsForAll, using its Genre column
# (comma-separated genre labels) and its Budget column ("$" plus digits).
#
# Args:
#   genre: A single genre label, e.g. "Drama".
#
# Returns:
#   A character vector of length two: the genre label and the mean budget
#   divided by 1000 (coerced to character by c()). The mean is NaN when no
#   movie of that genre has a usable budget.
fetchbudgetdetails <- function(genre) {
  # Compare against whole genre labels rather than grep()'s regex substring
  # match, so that e.g. "Music" does not also select "Musical" rows.
  genre_tokens <- strsplit(as.character(movieDetailsForAll$Genre), ",", fixed = TRUE)
  has_genre <- vapply(
    genre_tokens,
    function(tokens) genre %in% trimws(tokens),
    logical(1)
  )
  budgetrows <- movieDetailsForAll[has_genre, ]

  # A budget with no digits (a bare "$") becomes NA; skip those movies
  # instead of letting one missing figure turn the whole genre mean into NA.
  val <- as.numeric(gsub("\\D", "", budgetrows$Budget))
  c(genre, mean(val, na.rm = TRUE) / 1000)
}

# Average budget for every distinct genre label found in the data.
value <- as.list(unique(summaryofmovies))
summarygenre <- lapply(X = value, FUN = fetchbudgetdetails)

# Lay the per-genre results out as columns (padded to a common length), label
# the two rows, then transpose so that each genre ends up as one row.
genreFrame <- data.frame(lapply(
  summarygenre,
  function(entry) `length<-`(entry, max(lengths(summarygenre)))
))
colnames(genreFrame) <- seq_along(genreFrame)
rownames(genreFrame) <- c("Genre", "Average Budget in '000")
budgetDetailsForAllGenres <- as.data.frame(t(genreFrame))

# Average box-office gross (in thousands) of the movies in one genre.
#
# Reads the global data frame movieDetailsForAll, using its Genre column
# (comma-separated genre labels) and its Gross column ("$" plus digits).
#
# Args:
#   genre: A single genre label, e.g. "Drama".
#
# Returns:
#   A character vector of length two: the genre label and the mean gross
#   divided by 1000 (coerced to character by c()). The mean is NaN when no
#   movie of that genre has a usable gross figure.
fetchgrossdetails <- function(genre) {
  # Compare against whole genre labels rather than grep()'s regex substring
  # match, so that e.g. "Music" does not also select "Musical" rows.
  genre_tokens <- strsplit(as.character(movieDetailsForAll$Genre), ",", fixed = TRUE)
  has_genre <- vapply(
    genre_tokens,
    function(tokens) genre %in% trimws(tokens),
    logical(1)
  )
  grossrows <- movieDetailsForAll[has_genre, ]

  # A gross with no digits (a bare "$") becomes NA; skip those movies
  # instead of letting one missing figure turn the whole genre mean into NA.
  gross <- as.numeric(gsub("\\D", "", grossrows$Gross))
  c(genre, mean(gross, na.rm = TRUE) / 1000)
}

# Average gross for every distinct genre label found in the data.
value <- as.list(unique(summaryofmovies))
summarygenre <- lapply(X = value, FUN = fetchgrossdetails)

# Lay the per-genre results out as columns (padded to a common length), label
# the two rows, then transpose so that each genre ends up as one row.
genreFrame <- data.frame(lapply(
  summarygenre,
  function(entry) `length<-`(entry, max(lengths(summarygenre)))
))
colnames(genreFrame) <- seq_along(genreFrame)
rownames(genreFrame) <- c("Genre", "Average Gross in '000")
GrossDetailsForAllGenres <- as.data.frame(t(genreFrame))

# Join the per-genre budget and gross averages on the genre label.
Finaldata <- merge(x = budgetDetailsForAllGenres, y = GrossDetailsForAllGenres, by = "Genre")

# Take the genre label and movie count (columns 2 and 3 of the genre tally)
# and rename them so the label column can serve as the join key.
final <- as.data.frame(movieSummaryOnGenre)
checkthis <- final[, c(2, 3)]
names(checkthis) <- c("Genre", "Number of movies released")

genrebudgetgross <- merge(x = checkthis, y = Finaldata, by = "Genre")

# The averages arrive as text (or factors); convert them to real numbers.
genrebudgetgross[["Average Budget in '000"]] <-
  as.numeric(as.character(genrebudgetgross[["Average Budget in '000"]]))
genrebudgetgross[["Average Gross in '000"]] <-
  as.numeric(as.character(genrebudgetgross[["Average Gross in '000"]]))

# Return on investment: (average gross - average budget) / average budget.
genrebudgetgross$roi <- with(
  genrebudgetgross,
  (`Average Gross in '000` - `Average Budget in '000`) / `Average Budget in '000`
)
genrebudgetgross
##        Genre Number of movies released Average Budget in '000
## 1     Action                         1              70000.000
## 2  Adventure                         1                144.446
## 3  Animation                         1                144.446
## 4     Comedy                         4              21618.871
## 5      Crime                         5              11195.096
## 6      Drama                         9              23686.667
## 7     Family                         1                180.000
## 8    Fantasy                         1                144.446
## 9    Mystery                         1              35000.000
## 10    Sci-Fi                         1              60000.000
## 11  Thriller                         2              21000.000
## 12       War                         2              45000.000
##    Average Gross in '000        roi
## 1             216119.491  2.0874213
## 2               2298.191 14.9103817
## 3               2298.191 14.9103817
## 4              51072.862  1.3624204
## 5              23657.886  1.1132365
## 6              72476.365  2.0597959
## 7                925.402  4.1411222
## 8               2298.191 14.9103817
## 9              64604.977  0.8458565
## 10            125603.360  1.0933893
## 11             45243.675  1.1544607
## 12            136858.869  2.0413082

Hypothesis

  1. Action and Sci-Fi movies typically have the highest average budget and average gross among all genres
# Genres sorted from the highest to the lowest average budget.
genrebudgetgross[order(-genrebudgetgross[["Average Budget in '000"]]), ]
##        Genre Number of movies released Average Budget in '000
## 1     Action                         1              70000.000
## 10    Sci-Fi                         1              60000.000
## 12       War                         2              45000.000
## 9    Mystery                         1              35000.000
## 6      Drama                         9              23686.667
## 4     Comedy                         4              21618.871
## 11  Thriller                         2              21000.000
## 5      Crime                         5              11195.096
## 7     Family                         1                180.000
## 2  Adventure                         1                144.446
## 3  Animation                         1                144.446
## 8    Fantasy                         1                144.446
##    Average Gross in '000        roi
## 1             216119.491  2.0874213
## 10            125603.360  1.0933893
## 12            136858.869  2.0413082
## 9              64604.977  0.8458565
## 6              72476.365  2.0597959
## 4              51072.862  1.3624204
## 11             45243.675  1.1544607
## 5              23657.886  1.1132365
## 7                925.402  4.1411222
## 2               2298.191 14.9103817
## 3               2298.191 14.9103817
## 8               2298.191 14.9103817
  2. Typically more dramas are made in a year than movies of other genres, hence there is a higher chance of dramas appearing in the Top 250
# Genres sorted from the most to the fewest movies released.
genrebudgetgross[order(-genrebudgetgross[["Number of movies released"]]), ]
##        Genre Number of movies released Average Budget in '000
## 6      Drama                         9              23686.667
## 5      Crime                         5              11195.096
## 4     Comedy                         4              21618.871
## 11  Thriller                         2              21000.000
## 12       War                         2              45000.000
## 1     Action                         1              70000.000
## 2  Adventure                         1                144.446
## 3  Animation                         1                144.446
## 7     Family                         1                180.000
## 8    Fantasy                         1                144.446
## 9    Mystery                         1              35000.000
## 10    Sci-Fi                         1              60000.000
##    Average Gross in '000        roi
## 6              72476.365  2.0597959
## 5              23657.886  1.1132365
## 4              51072.862  1.3624204
## 11             45243.675  1.1544607
## 12            136858.869  2.0413082
## 1             216119.491  2.0874213
## 2               2298.191 14.9103817
## 3               2298.191 14.9103817
## 7                925.402  4.1411222
## 8               2298.191 14.9103817
## 9              64604.977  0.8458565
## 10            125603.360  1.0933893