Analyzing TMDB Dataset v2.0

Dataset source: https://www.kaggle.com/tmdb/tmdb-movie-metadata/data

Importing Libraries

Let’s begin by installing the libraries. The code below checks which of the required libraries are already installed and installs only the missing ones.

# Packages this analysis depends on.
packages <- c(
  "dplyr", "ggplot2", "readr", "lubridate", "stringr", "tidyr",
  "sqldf", "rpart", "rpart.plot", "tm", "wordcloud"
)

# Install only the packages that are not available yet. The installed-package
# index is queried once, and the download goes over HTTPS rather than HTTP.
missing_packages <- setdiff(packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages, repos = "https://cloud.r-project.org")
}

# Attach every package, hiding the start-up messages.
for (p in packages) {
  suppressMessages(library(p, character.only = TRUE))
}

R Markdown

# Load the raw TMDB movie metadata. stringsAsFactors is set explicitly so text
# columns are plain character vectors on every R version (before R 4.0 the
# default converted them to factors).
tmdb_5000_movies <- read.csv("tmdb_5000_movies.csv", stringsAsFactors = FALSE)

# Working copy that the cleaning steps modify; the raw import stays untouched.
moviedata <- tmdb_5000_movies

Extracting the genre from the movie data. Only the first genre of each movie is used, because the later genre slots contain a large number of NA values.

Similarly extracting singular values from Production, Keywords, Languages etc.

We check the prevalence of NAs in the following cells and therefore extract a single value for each of these fields.

Data Cleaning

# The genres, keywords and production_companies columns hold JSON-like text
# (presumably the same [{"name": "..."}] layout that is visible for
# production_countries in the output below -- confirm against the raw CSV).
# Each of them is reduced to the name of its first entry.

# Returns the first "name" value found in every element of `json_text`, or NA
# when an element has none (e.g. an empty list "[]").
# The whole name is kept, so multi-word names are no longer cut at the first
# space. Backslash escapes inside a name are matched but not decoded.
extract_first_name <- function(json_text) {
  name_pattern <- '"name": "((?:[^"\\\\]|\\\\.)*)"'
  # Column 2 of the match matrix is the first capture group (the name itself).
  str_match(as.character(json_text), name_pattern)[, 2]
}

moviedata$genres <- extract_first_name(moviedata$genres)
moviedata$keywords <- extract_first_name(moviedata$keywords)

# Diagnostic: split the raw production_companies text on ":" and count how
# often a piece is missing. Piece 2 carries the first company name (see the
# original head() output); piece 4 presumably carries the second one.
# extra/fill only silence the per-row warnings, the pieces are unchanged.
Production <- moviedata$production_companies
Production <- as.data.frame(Production)
test <- separate(Production, col = Production,
                 into = c("1", "2", "3", "4", "5", "6"), sep = ":",
                 extra = "drop", fill = "right")
table(is.na(test[,4]))

## 
## FALSE  TRUE 
##  3386  1417

table(is.na(test[,2]))

## 
## FALSE  TRUE 
##  4452   351

# Piece 4 is missing far more often than piece 2, so only the first company
# is kept. It is extracted by name instead of reusing the ":" split, which
# left quotes and a trailing `, "id"` in the value.
moviedata$production_companies <- extract_first_name(moviedata$production_companies)

head(moviedata, n=1)

##      budget genres                    homepage    id      keywords
## 1 237000000 Action http://www.avatarmovie.com/ 19995 culture clash
##   original_language original_title
## 1                en         Avatar
##                                                                                                                                                                          overview
## 1 In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.
##   popularity    production_companies
## 1   150.4376 Ingenious Film Partners
##                                                                                         production_countries
## 1 [{"iso_3166_1": "US", "name": "United States of America"}, {"iso_3166_1": "GB", "name": "United Kingdom"}]
##   release_date    revenue runtime
## 1   2009-12-10 2787965087     162
##                                                                         spoken_languages
## 1 [{"iso_639_1": "en", "name": "English"}, {"iso_639_1": "es", "name": "Espa\\u00f1ol"}]
##     status                     tagline  title vote_average vote_count
## 1 Released Enter the World of Pandora. Avatar          7.2      11800

Quantitative Information

Let’s now separate the textual and the quantitative data in the movies file.

## Separating the quantitative data.
## Plain column subsetting replaces the SQL query: the same columns in the same
## order, without copying the data frame through SQLite.
quantitative_columns <- c(
  "budget", "genres", "keywords", "original_language", "original_title",
  "popularity", "production_companies", "production_countries",
  "release_date", "revenue", "runtime", "spoken_languages", "status",
  "vote_average", "vote_count"
)
Quantitivedata <- moviedata[, quantitative_columns]

Textual Information

## Separating the textual data.
## Plain column subsetting replaces the SQL query: the same columns in the same
## order, without copying the data frame through SQLite.
textual_columns <- c("genres", "keywords", "original_title", "overview", "tagline")
Textualdata <- moviedata[, textual_columns]

Exploratory analysis of movie ratings

## Exploratory analysis of movie ratings: density of vote_average.
## The column is named bare inside aes() so ggplot2 looks it up in `moviedata`
## and labels the axis with the column name.
ggplot(moviedata, aes(x = vote_average)) +
  geom_density(color = "darkblue", fill = "cornflowerblue")

Exploratory analysis of Budget

## Exploratory analysis of budget: density of budget.
## The column is named bare inside aes() so ggplot2 looks it up in `moviedata`
## and labels the axis with the column name.
ggplot(moviedata, aes(x = budget)) +
  geom_density(color = "darkblue", fill = "cornflowerblue")

Exploratory analysis of Revenue

## Exploratory analysis of revenue: density of revenue.
## The column is named bare inside aes() so ggplot2 looks it up in `moviedata`
## and labels the axis with the column name.
ggplot(moviedata, aes(x = revenue)) +
  geom_density(color = "darkblue", fill = "cornflowerblue")

Exploratory analysis of Runtime

## Exploratory analysis of runtime: density of runtime.
## The column is named bare inside aes() so ggplot2 looks it up in `moviedata`
## and labels the axis with the column name.
ggplot(moviedata, aes(x = runtime)) +
  geom_density(color = "darkblue", fill = "cornflowerblue")

Predicting movie HIT using a decision tree

# Profit of each movie: revenue minus budget.
moviedata$profit <- moviedata$revenue - moviedata$budget

# Profit flag: 1 when the movie earned more than it cost, 0 otherwise.
moviedata$flag <- ifelse(moviedata$profit > 0, 1, 0)

# Ratio of earnings to budget.
# A budget of 0 would give Inf (or NaN for 0 / 0), and Inf >= 2 would label
# the movie a hit. Such rows get NA here and are dropped before modelling.
# NOTE(review): a budget of 0 presumably means "not recorded" -- confirm, and
# check whether a revenue of 0 should be treated the same way.
moviedata$ratio <- ifelse(
  moviedata$budget > 0,
  moviedata$revenue / moviedata$budget,
  NA_real_
)

# Hit label: 1 when the movie earned at least twice its budget, 0 otherwise.
# (a movie is generally considered a hit if it earns more than twice its cost
# of production)
moviedata$hit <- ifelse(moviedata$ratio >= 2, 1, 0)

# Variables used by the model: six predictors plus the `hit` label.
model_columns <- c(
  "budget", "genres", "original_language", "popularity", "runtime",
  "vote_average", "hit"
)

# Drop rows with a missing value in any model variable. Completeness is
# checked on these columns only, so an NA in a column the model never sees
# (e.g. keywords) no longer removes the movie.
moviedata2 <- moviedata[complete.cases(moviedata[, model_columns]), ]

# Data set restricted to the model variables.
moviedata3 <- moviedata2[, model_columns]

# Classification tree predicting `hit` from all other columns.
tree <- rpart(hit ~ ., data = moviedata3, method = "class")

# Plot the tree to see which variables drive the classification.
rpart.plot(tree)

Here we see that popularity and budget play an important role in classifying a movie as a hit. Popularity can be seen as a measure of the marketing effort made by the production team. Production houses can use this model to structure a movie’s budget and to decide on the marketing budget associated with it.

Textual Analysis

# Text-mining helper: cleans a text vector, counts how often each word occurs
# and draws a word cloud of the frequent words.
#
# Arguments:
#   txt       Vector of text (anything as.character() accepts). NA entries are
#             dropped.
#   min_freq  Minimum number of occurrences a word needs to be drawn.
#             Defaults to 500.
#
# Returns, invisibly, the named vector of word frequencies sorted in
# decreasing order (empty when there was no text to analyse).
analyzetext <- function(txt, min_freq = 500) {

  # Work on the text in memory. No scratch file is written to the working
  # directory, and missing values do not end up as the word "NA".
  txt <- as.character(txt)
  txt <- txt[!is.na(txt)]
  if (length(txt) == 0) {
    warning("analyzetext(): no text to analyse", call. = FALSE)
    return(invisible(numeric(0)))
  }

  # Turn the text into a corpus.
  docs <- Corpus(VectorSource(txt))

  # Replace punctuation with a space so joined words are split, not glued.
  replacePunctuation <- content_transformer(function(x) {
    gsub("[[:punct:]]", " ", x)
  })
  docs <- tm_map(docs, replacePunctuation)

  # Remove numbers.
  docs <- tm_map(docs, removeNumbers)

  # Lower-case the text. tolower is wrapped in content_transformer(), like
  # the punctuation step, so the result stays a corpus.
  docs <- tm_map(docs, content_transformer(tolower))

  # Remove English stop words (after lower-casing, so they actually match).
  docs <- tm_map(docs, removeWords, stopwords("english"))

  # Collapse the whitespace left behind by the previous steps.
  docs <- tm_map(docs, stripWhitespace)

  # Document-term matrix: one row per document, one column per word.
  dtm <- DocumentTermMatrix(docs)

  # Frequency of every word over all documents, most frequent first.
  # (Some words may be irrelevant; the analysis should focus on relevant ones.)
  freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)

  # Nothing frequent enough to draw: say so instead of calling wordcloud()
  # with no words.
  if (!any(freq >= min_freq)) {
    message("analyzetext(): no word occurs at least ", min_freq, " times")
    return(invisible(freq))
  }

  wordcloud(names(freq), freq, min.freq = min_freq)

  invisible(freq)
}

# Run analyzetext() on each text column in turn to find its most frequent
# words and draw a word cloud for it.
# The seed is set once, before the first call (presumably to make the
# word-cloud layout reproducible).
set.seed(2020)
for (text_column in c("genres", "keywords", "overview", "tagline")) {
  analyzetext(Textualdata[[text_column]])
}