Dataset source: https://www.kaggle.com/tmdb/tmdb-movie-metadata/data
Let’s begin by installing the libraries. The code below checks which of the required libraries are already installed and installs only those that are missing.
# Packages this analysis depends on.
packages <- c(
  "dplyr", "ggplot2", "readr", "lubridate", "stringr", "tidyr",
  "sqldf", "rpart", "rpart.plot", "tm", "wordcloud"
)

# Work out which packages are absent from the local library and install
# only those.
missing_packages <- setdiff(packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages, repos = 'http://cran.us.r-project.org')
}

# Attach every package; start-up messages are suppressed.
for (p in packages) {
  suppressMessages(library(p, character.only = TRUE))
}

# Read the movie metadata from the working directory and keep the raw
# import untouched by working on a copy.
tmdb_5000_movies <- read.csv("tmdb_5000_movies.csv")
moviedata <- tmdb_5000_movies
Extracting the genre from the movie data, using only the first genre since the later positions have a large number of NA values.
Similarly, extracting single values from Production, Keywords, Languages, etc.
We check the prevalence of NAs in the following cells and hence choose single-value extractions for these fields.
# The genres, keywords and production_companies columns are assumed to hold
# JSON-like arrays of objects carrying a "name" field, in the same layout as
# the production_countries value printed below -- TODO confirm for all rows.
#
# first_json_name() returns, for every element of `json`, the value of the
# first "name" field, or NA when the element contains no such field (for
# example an empty array).
#
# Splitting the raw string on punctuation (the default of tidyr::separate)
# truncates multi-word names to their first word, and splitting on ":" leaves
# JSON debris such as `"Ingenious Film Partners", "id"` in the result, so the
# name is captured with a regular expression instead. The pattern tolerates
# backslash-escaped characters inside the quoted name.
first_json_name <- function(json) {
  name_pattern <- '"name":\\s*"((?:[^"\\\\]|\\\\.)*)"'
  # Column 2 of the str_match() result is the first capture group.
  str_match(as.character(json), name_pattern)[, 2]
}

# Keep only the first genre and the first keyword of every movie.
moviedata$genres <- first_json_name(moviedata$genres)
moviedata$keywords <- first_json_name(moviedata$keywords)

# Exploratory check of how quickly NAs accumulate when the production
# companies string is split on ":" -- this motivates keeping a single value.
Production <- moviedata$production_companies
Production <- as.data.frame(Production)
test <- separate(
  Production,
  col = Production,
  into = c("1", "2", "3", "4", "5", "6"),
  sep = ":"
)
table(is.na(test[, 4]))
##
## FALSE TRUE
## 3386 1417
table(is.na(test[, 2]))
##
## FALSE TRUE
## 4452 351

# Keep only the first production company of every movie.
moviedata$production_companies <- first_json_name(moviedata$production_companies)
head(moviedata, n = 1)
## budget genres homepage id keywords
## 1 237000000 Action http://www.avatarmovie.com/ 19995 culture clash
## original_language original_title
## 1 en Avatar
## overview
## 1 In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.
## popularity production_companies
## 1 150.4376 Ingenious Film Partners
## production_countries
## 1 [{"iso_3166_1": "US", "name": "United States of America"}, {"iso_3166_1": "GB", "name": "United Kingdom"}]
## release_date revenue runtime
## 1 2009-12-10 2787965087 162
## spoken_languages
## 1 [{"iso_639_1": "en", "name": "English"}, {"iso_639_1": "es", "name": "Espa\\u00f1ol"}]
## status tagline title vote_average vote_count
## 1 Released Enter the World of Pandora. Avatar 7.2 11800
Let’s now separate textual and quantitative data from the Movie’s file.
## Columns that make up the quantitative view of the movie data
quantitative_columns <- c(
  "budget", "genres", "keywords", "original_language", "original_title",
  "popularity", "production_companies", "production_countries",
  "release_date", "revenue", "runtime", "spoken_languages", "status",
  "vote_average", "vote_count"
)
Quantitivedata <- sqldf(
  paste0("select ", paste(quantitative_columns, collapse = ", "), " from moviedata")
)

## Columns that make up the textual view of the movie data
textual_columns <- c("genres", "keywords", "original_title", "overview", "tagline")
Textualdata <- sqldf(
  paste0("select ", paste(textual_columns, collapse = ", "), " from moviedata")
)
## Exploratory density plots of the main numeric columns.
## Columns are referenced by bare name inside aes(): ggplot2 looks them up in
## the data argument, whereas `moviedata$col` bypasses that lookup, is flagged
## as discouraged by ggplot2, and produces axis labels such as
## "moviedata$budget".

## Exploratory analysis of movie ratings
ggplot(moviedata, aes(x = vote_average)) +
  geom_density(color = "darkblue", fill = "cornflowerblue")

## Exploratory analysis of budget
ggplot(moviedata, aes(x = budget)) +
  geom_density(color = "darkblue", fill = "cornflowerblue")

## Exploratory analysis of revenue
ggplot(moviedata, aes(x = revenue)) +
  geom_density(color = "darkblue", fill = "cornflowerblue")

## Exploratory analysis of runtime
ggplot(moviedata, aes(x = runtime)) +
  geom_density(color = "darkblue", fill = "cornflowerblue")
# Profit of each movie.
moviedata$profit <- moviedata$revenue - moviedata$budget

# Flag is 1 for movies that made a profit and 0 otherwise (flops).
moviedata$flag <- ifelse(moviedata$profit > 0, 1, 0)

# Ratio of earnings to cost.
# The ratio is undefined when the budget is zero: dividing by zero would give
# Inf for any positive revenue, and such rows would then be labelled as hits
# below. They are set to NA instead so that na.omit() drops them.
# NOTE(review): a revenue of 0 presumably also means "unknown" in this
# dataset; those rows are still kept and labelled as non-hits -- confirm.
moviedata$ratio <- ifelse(
  moviedata$budget > 0,
  moviedata$revenue / moviedata$budget,
  NA_real_
)

# A movie is generally considered a hit if it earns at least twice its cost
# of production.
moviedata$hit <- ifelse(moviedata$ratio >= 2, 1, 0)

# Remove rows with NAs for better modeling.
moviedata2 <- na.omit(moviedata)

# Keep only the variables used by the model.
model_columns <- c(
  "budget", "genres", "original_language", "popularity",
  "runtime", "vote_average", "hit"
)
moviedata3 <- moviedata2[, model_columns]

# Fit a classification tree predicting `hit` from all other kept variables.
tree <- rpart(hit ~ ., data = moviedata3, method = "class")

# Plot the tree to understand the key factors and the classification.
rpart.plot(tree)
Here we see that Popularity and Budget play an important role in classifying a movie as a HIT. Popularity can be seen as a measure of the marketing efforts made by the production team. This model can be used by production houses to structure the budget of their movies and to decide the marketing budget associated with them.
# Automates the text-mining steps for one text column and draws a word cloud
# of its most frequent words.
#
# Arguments:
#   txt      - vector of text values; coerced with as.character().
#   min_freq - words occurring fewer times than this are left out of the
#              cloud (passed to wordcloud() as min.freq). Defaults to 500.
#
# Returns whatever wordcloud() returns; the function is called for the plot
# it draws.
analyzetext <- function(txt, min_freq = 500) {
  # Round-trip the text through a file so that it is re-read line by line.
  # A temporary file is used (and removed on exit) instead of a file named
  # "txt" in the working directory, which was left behind after every call
  # and would overwrite any existing file of that name.
  scratch_file <- tempfile(fileext = ".txt")
  on.exit(unlink(scratch_file), add = TRUE)
  writeLines(as.character(txt), scratch_file)
  txt <- readLines(scratch_file)

  # Convert the extracted text into a corpus.
  docs <- Corpus(VectorSource(txt))

  # Replace punctuation with spaces so that joined words are split apart.
  replacePunctuation <- content_transformer(function(x) {
    gsub("[[:punct:]]", " ", x)
  })
  docs <- tm_map(docs, replacePunctuation)

  # Remove numbers.
  docs <- tm_map(docs, removeNumbers)

  # Convert to lower case. tolower is not a tm transformation, so it is
  # wrapped in content_transformer() like the punctuation step above.
  docs <- tm_map(docs, content_transformer(tolower))

  # Remove English stop words, then collapse the leftover whitespace.
  docs <- tm_map(docs, removeWords, stopwords("english"))
  docs <- tm_map(docs, stripWhitespace)

  # Count how often each word occurs across all documents, most frequent
  # first. Some frequent words may be irrelevant to the study.
  dtm <- DocumentTermMatrix(docs)
  freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)

  wordcloud(names(freq), freq, min.freq = min_freq)
}
# Use the function above to find the most frequent words in each textual
# column and draw a word cloud for it. The random seed is set once before
# the first cloud is drawn.
set.seed(2020)
for (text_column in c("genres", "keywords", "overview", "tagline")) {
  analyzetext(Textualdata[[text_column]])
}