To identify key factors and features that make a movie successful and eventually help in gaining a high IMDB Score.
Approach:
Data:
library(ggplot2)
library(ggrepel)
library(ggthemes)
library(scales)
library(dplyr)
library(VIM)
library(data.table)
library(formattable)
library(plotly)
library(corrplot)
library(GGally)
library(caret)
library(car)
library(DataExplorer)
library(flextable)
library(knitr)
library(stringr)
library(formattable)
library(corrplot)
library(broom)
library(rpart)
library(rpart.plot)
library(randomForest)
library(caret)
library(e1071)
library(gbm)
rating <- readr::read_csv("D:/Course/Projects/Pending/IMDB rating/movie_metadata.csv")
dim(rating)
str(rating)
sum(duplicated(rating))
rating <- rating[!duplicated(rating), ]
summary(rating)
colSums(is.na(rating))
##Removing Special Characters in the movie_title column
rating$movie_title <- gsub("Â", "", as.character(factor(rating$movie_title)))
str_trim(rating$movie_title, side = "right")
#Splitting the genres
genre.split<-rating%>%
select(genres,imdb_score)%>%
mutate(Action=ifelse(grepl("Action",genres),1,0),
Adventure=ifelse(grepl("Adventure",genres),1,0),
Animation=ifelse(grepl("Animation",genres),1,0),
Biography=ifelse(grepl("Biography",genres),1,0),
Comedy=ifelse(grepl("Comedy",genres),1,0),
Crime =ifelse(grepl("Crime",genres),1,0),
Documentary=ifelse(grepl("Documentary",genres),1,0),
Drama=ifelse(grepl("Drama",genres),1,0),
Family=ifelse(grepl("Family",genres),1,0),
Fantasy=ifelse(grepl("Fantasy",genres),1,0),
`Film-Noir`=ifelse(grepl("Film-Noir",genres),1,0),
History =ifelse(grepl("History",genres),1,0),
Horror=ifelse(grepl("Horror",genres),1,0),
Musical=ifelse(grepl("Musical",genres),1,0),
Mystery=ifelse(grepl("Mystery",genres),1,0),
News=ifelse(grepl("News",genres),1,0),
Romance=ifelse(grepl("Romance",genres),1,0),
`Sci-Fi`=ifelse(grepl("Sci-Fi",genres),1,0),
Short=ifelse(grepl("Short",genres),1,0),
Sport=ifelse(grepl("Sport",genres),1,0),
War=ifelse(grepl("War",genres),1,0),
Western=ifelse(grepl("Western",genres),1,0))
#Genre wise movie Score
genre.split%>%
tidyr::gather(Genre_Type,Binary,Action:Western)%>%
filter(Binary==1)%>%
select(-c(Binary,genres))%>%
group_by(Genre_Type)%>%
summarise(Mean_Score=mean(imdb_score))%>%
arrange(Mean_Score)%>%
ggplot(aes(x=Genre_Type,y=Mean_Score,fill=Genre_Type))+
geom_bar(stat="identity", color="black")+
coord_flip()
#Removing Genres
rating <- rating%>%select(-genres)
missing.values <- aggr(rating, sortVars = T, prop = T, sortCombs = T, cex.lab = 1.5, cex.axis = .6, cex.numbers = 5, combined = F, gap = -.2)
rating <- rating[!is.na(rating$gross), ]
rating <- rating[!is.na(rating$budget), ]
rating <- rating[!is.na(rating$actor_2_name), ]
rating <- rating[!is.na(rating$color), ]
rating <- rating[!is.na(rating$actor_3_name), ]
rating <- rating[!is.na(rating$plot_keywords), ]
rating <- rating[!is.na(rating$language), ]
rating <- rating[!is.na(rating$content_rating), ]
summary(rating)
colSums(is.na(rating))
dim(rating)
rating$aspect_ratio[is.na(rating$aspect_ratio)] <- median(rating$aspect_ratio, na.rm = TRUE)
mean(rating$imdb_score[rating$aspect_ratio == 1.85])
## [1] 6.364512
mean(rating$imdb_score[rating$aspect_ratio == 2.35])
## [1] 6.504406
mean(rating$imdb_score[rating$aspect_ratio != 1.85 & rating$aspect_ratio != 2.35])
## [1] 6.859322
summary(rating)
## color director_name num_critic_for_reviews duration
## Length:3775 Length:3775 Min. : 2.0 Min. : 37.0
## Class :character Class :character 1st Qu.: 75.0 1st Qu.: 95.0
## Mode :character Mode :character Median :137.0 Median :106.0
## Mean :165.6 Mean :110.1
## 3rd Qu.:223.0 3rd Qu.:120.0
## Max. :813.0 Max. :330.0
##
## director_facebook_likes actor_3_facebook_likes actor_2_name
## Min. : 0.0 Min. : 0.0 Length:3775
## 1st Qu.: 10.0 1st Qu.: 191.0 Class :character
## Median : 60.0 Median : 433.0 Mode :character
## Mean : 799.2 Mean : 758.4
## 3rd Qu.: 233.0 3rd Qu.: 690.0
## Max. :23000.0 Max. :23000.0
##
## actor_1_facebook_likes gross actor_1_name
## Min. : 0 Min. : 162 Length:3775
## 1st Qu.: 739 1st Qu.: 7748090 Class :character
## Median : 1000 Median : 29106737 Mode :character
## Mean : 7691 Mean : 51950057
## 3rd Qu.: 13000 3rd Qu.: 66478202
## Max. :640000 Max. :760505847
##
## movie_title num_voted_users cast_total_facebook_likes
## Length:3775 Min. : 48 Min. : 0
## Class :character 1st Qu.: 18670 1st Qu.: 1885
## Mode :character Median : 52894 Median : 3970
## Mean : 104358 Mean : 11413
## 3rd Qu.: 126900 3rd Qu.: 16140
## Max. :1689764 Max. :656730
##
## actor_3_name facenumber_in_poster plot_keywords movie_imdb_link
## Length:3775 Min. : 0.000 Length:3775 Length:3775
## Class :character 1st Qu.: 0.000 Class :character Class :character
## Mode :character Median : 1.000 Mode :character Mode :character
## Mean : 1.376
## 3rd Qu.: 2.000
## Max. :43.000
## NA's :5
## num_user_for_reviews language country content_rating
## Min. : 2.0 Length:3775 Length:3775 Length:3775
## 1st Qu.: 106.0 Class :character Class :character Class :character
## Median : 207.0 Mode :character Mode :character Mode :character
## Mean : 332.3
## 3rd Qu.: 395.0
## Max. :5060.0
##
## budget title_year actor_2_facebook_likes imdb_score
## Min. :2.180e+02 Min. :1927 Min. : 0 Min. :1.600
## 1st Qu.:1.000e+07 1st Qu.:1999 1st Qu.: 377 1st Qu.:5.900
## Median :2.500e+07 Median :2005 Median : 680 Median :6.600
## Mean :4.576e+07 Mean :2003 Mean : 1992 Mean :6.463
## 3rd Qu.:5.000e+07 3rd Qu.:2010 3rd Qu.: 973 3rd Qu.:7.200
## Max. :1.222e+10 Max. :2016 Max. :137000 Max. :9.300
##
## aspect_ratio movie_facebook_likes
## Min. : 1.180 Min. : 0
## 1st Qu.: 1.850 1st Qu.: 0
## Median : 2.350 Median : 217
## Mean : 2.114 Mean : 9247
## 3rd Qu.: 2.350 3rd Qu.: 11000
## Max. :16.000 Max. :349000
##
#Removing Aspect Ratio
rating <- subset(rating, select = -c(aspect_ratio))
# replace NA with column average for facenumber_in_poster
rating$facenumber_in_poster[is.na(rating$facenumber_in_poster)] <- round(mean(rating$facenumber_in_poster, na.rm = TRUE))
# convert 0s into NAs for other predictors
rating[,c(5,6,8,13,24,26)][rating[,c(5,6,8,13,24,26)] == 0] <- NA
# impute missing value with column mean
rating$num_critic_for_reviews[is.na(rating$num_critic_for_reviews)] <- round(mean(rating$num_critic_for_reviews, na.rm = TRUE))
rating$duration[is.na(rating$duration)] <- round(mean(rating$duration, na.rm = TRUE))
rating$director_facebook_likes[is.na(rating$director_facebook_likes)] <- round(mean(rating$director_facebook_likes, na.rm = TRUE))
rating$actor_3_facebook_likes[is.na(rating$actor_3_facebook_likes)] <- round(mean(rating$actor_3_facebook_likes, na.rm = TRUE))
rating$actor_1_facebook_likes[is.na(rating$actor_1_facebook_likes)] <- round(mean(rating$actor_1_facebook_likes, na.rm = TRUE))
rating$cast_total_facebook_likes[is.na(rating$cast_total_facebook_likes)] <- round(mean(rating$cast_total_facebook_likes, na.rm = TRUE))
rating$actor_2_facebook_likes[is.na(rating$actor_2_facebook_likes)] <- round(mean(rating$actor_2_facebook_likes, na.rm = TRUE))
rating$movie_facebook_likes[is.na(rating$movie_facebook_likes)] <- round(mean(rating$movie_facebook_likes, na.rm = TRUE))
#Content Ratings
Movie_Ratings<-rating%>%
select(content_rating)%>%
group_by(content_rating)%>%
summarise(Count=n())%>%
select(content_rating,Count)
Movie_Ratings.df<-as.data.frame(Movie_Ratings)
#Remove Blank Observations
rating <- rating[!(rating$content_rating %in% ""),]
#Categorization of the content_ratings variable
rating$content_rating[rating$content_rating == 'M'] <- 'PG'
rating$content_rating[rating$content_rating == 'GP'] <- 'PG'
rating$content_rating[rating$content_rating == 'X'] <- 'NC-17'
rating$content_rating[rating$content_rating == 'Approved'] <- 'R'
rating$content_rating[rating$content_rating == 'Not Rated'] <- 'R'
rating$content_rating[rating$content_rating == 'Passed'] <- 'R'
rating$content_rating[rating$content_rating == 'Unrated'] <- 'R'
rating$content_rating <- factor(rating$content_rating)
table(rating$content_rating)
##
## G NC-17 PG PG-13 R
## 89 16 574 1306 1790
#Profit Column
rating <- rating %>%
mutate(profit = gross - budget,
return_on_investment_perc = (profit/budget)*100)
#Removing Color and Language Columns
rating <- subset(rating, select = -c(color))
rating <- subset(rating, select = -c(language))
#Cleaning the Country column into 3 categories
levels(rating$country) <- c(levels(rating$country), "Others")
rating$country[(rating$country != 'USA')&(rating$country != 'UK')] <- 'Others'
rating$country <- factor(rating$country)
Analysis of IMDB Score distribution for all movies in the dataset
##Distribution of IMDB Score Variable
ggplot(rating, aes(x=imdb_score)) +
geom_density(fill="red",alpha = 0.6)+coord_cartesian(xlim = c(0, 10))+
geom_vline(xintercept = mean(rating$imdb_score), color="blue")
summary(rating$imdb_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.600 5.900 6.600 6.463 7.200 9.300
sd(rating$imdb_score)
## [1] 1.053923
Top 20 Profitable Movies based on Profit Value
profit.movie <-rating%>%
select(movie_title,profit)%>%
filter(!is.na(profit))%>%
arrange(desc(profit))%>%
top_n(20)
## Selecting by profit
p1 <- ggplot(profit.movie, aes(x=reorder(movie_title,profit/1000000), profit/1000000,fill=factor(movie_title))) +
geom_bar(stat = "identity") +
ggtitle("Top Profitable Movies")+coord_flip()+xlab("Movie Name")+ylab("Profit in Million $")+theme_bw()
p1
Relationship between Profit and Budget
rating %>%
# filter(title_year %in% c(2000:2016)) %>%
arrange(desc(profit)) %>%
top_n(20, profit) %>%
ggplot(aes(x=budget/1000000, y=profit/1000000)) +
geom_point(size=3) +
geom_smooth(size=2) +
geom_text_repel(aes(label=movie_title)) +
labs(x = "Budget in Million $", y = "Profit in Million $", title = "Top 20 Profitable Movies") +
theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Profitable Movies Based on Return of Investment
rating %>%
mutate(profit = gross - budget,
return_on_investment_perc = (profit/budget)*100) %>%
arrange(desc(profit)) %>%
top_n(20, profit) %>%
ggplot(aes(x=budget/1000000, y = return_on_investment_perc)) +
geom_point(size = 3) +
geom_smooth(size = 2) +
geom_text_repel(aes(label = movie_title), size = 3) +
xlab("Budget in Million $") +
ylab("Percentage Return on Investment")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Number of Voters and IMDB Score
p<-ggplot(rating, aes(x=imdb_score, y=num_voted_users, group=content_rating))+
geom_point(aes(color=content_rating),size=0.7)+
scale_color_brewer(palette="Dark2")+geom_smooth(aes(color=content_rating),se = FALSE, method = lm)+
xlab("IMDB Score")+ylab("Number of Voters")+labs(color = "Rating\n")
ggplotly(p)
Commercial Success v/s Critical Acclaim
rating %>%
top_n(20, profit) %>%
ggplot(aes(x = imdb_score, y = gross/10^6, size = profit/10^6, color = content_rating)) +
geom_point() +
geom_hline(aes(yintercept = 550)) +
geom_vline(aes(xintercept = 7.75)) +
geom_text_repel(aes(label = movie_title), size = 4) +
xlab("IMDB Score") +
ylab("Gross Money Earned(in million dollars)") +
ggtitle("Commercial Success Vs Critical Acclaim") +
annotate("text", x = 8.5, y = 700, label = "High IMDB Score & High Gross",size=5) +
theme(plot.title = element_text(hjust = 0.5))
Yearly Trends for IMDB Score (from 1926 to 2016)
#Time Series for IMDB Score
imdb.ts<-rating%>%
select(title_year,imdb_score,country)%>%
group_by(title_year)%>%
summarise(IMDB_Rating=mean(imdb_score))
plot.ts1<-ggplot(data=imdb.ts,aes(x=title_year,y=IMDB_Rating))+geom_point(size=3)+geom_line(size=1)+
geom_smooth(col="red")+xlab("Year of Release")+ylab("IMDB Rating")
ggplotly(plot.ts1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Yearly Trends for Return on Investments (from 1926 to 2016)
#Time Series for Return on Investment
roi.ts<-rating%>%
select(title_year,return_on_investment_perc)%>%
group_by(title_year)%>%
summarise(ROI=mean(return_on_investment_perc))
plot.ts2<-ggplot(data=roi.ts,aes(x=title_year,y=ROI))+geom_point(size=3)+geom_line(size=1)+
geom_smooth(col="green")+xlab("Year of Release")+ylab("Return on Investment")
ggplotly(plot.ts2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Directors with best IMDB Score
director.imdb<-rating%>%
select(director_name,imdb_score)%>%
group_by(director_name)%>%
summarise(Average_IMDB_Rating=mean(imdb_score))%>%
arrange(desc(Average_IMDB_Rating))%>%
top_n(20)
## Selecting by Average_IMDB_Rating
director.df<-as.data.frame(director.imdb)
names(director.df)[names(director.df) == "director_name"] <- "Director"
names(director.df)[names(director.df) == "Average_IMDB_Rating"] <- "Average_IMDB_Rating"
director.table <- formattable(director.df,list(Average_IMDB_Rating=color_bar("lightgreen")))
director.table
| Director | Average_IMDB_Rating |
|---|---|
| Akira Kurosawa | 8.700000 |
| Charles Chaplin | 8.600000 |
| Tony Kaye | 8.600000 |
| Alfred Hitchcock | 8.500000 |
| Damien Chazelle | 8.500000 |
| Majid Majidi | 8.500000 |
| Ron Fricke | 8.500000 |
| Sergio Leone | 8.433333 |
| Christopher Nolan | 8.425000 |
| Asghar Farhadi | 8.400000 |
| Marius A. Markevicius | 8.400000 |
| Richard Marquand | 8.400000 |
| Billy Wilder | 8.300000 |
| Fritz Lang | 8.300000 |
| Lee Unkrich | 8.300000 |
| Lenny Abrahamson | 8.300000 |
| Pete Docter | 8.233333 |
| Hayao Miyazaki | 8.225000 |
| Elia Kazan | 8.200000 |
| George Roy Hill | 8.200000 |
| Joshua Oppenheimer | 8.200000 |
| Juan José Campanella | 8.200000 |
| Quentin Tarantino | 8.200000 |
Reviews and IMDB Score for different countries
imdb.user<-ggplot(data=rating,aes(x=imdb_score,y=num_user_for_reviews,colour=factor(country)))+
geom_point(aes(colour= factor(country)),size=0.7)+
geom_smooth(se = FALSE, method = "lm")+xlab("IMDB Score")+ylab("Number of User Reviews")+
ylim(0,1500)+labs(color = "Country\n")
ggplotly(imdb.user)
Movie Facebook Likes and Actor Facebook Likes
ggplot(data=rating,aes(x=actor_1_facebook_likes,y=movie_facebook_likes))+
geom_point()+
geom_smooth(se = TRUE, method = "lm")+xlim(0,50000)+ylim(0,200000)
Additional Data Cleaning
The final cleaned data set contains 3806 observations and 14 variables
#Unique Director and Actor Names
sum(uniqueN(rating$director_name))
## [1] 1691
sum(uniqueN(rating[, c("actor_1_name", "actor_2_name", "actor_3_name")]))
## [1] 3682
#Dropping unnecessary columns
rating<-rating%>%
select(-c(actor_1_name,actor_2_name,actor_3_name,director_name,
plot_keywords,movie_imdb_link,movie_title,profit,return_on_investment_perc))
#Visualizing Correlation Plots
ggcorr(rating, label = TRUE, label_round = 3, label_size = 3, size = 2, hjust = .85) +
ggtitle("Correlation between continuous variables") +
theme(plot.title = element_text(hjust = 0.5))
#Adding new columns and deleting unnecessary columns
rating<-rating%>%
mutate(other_actor_facebook_likes=actor_2_facebook_likes + actor_3_facebook_likes,
critic_total_ratio=num_critic_for_reviews/num_user_for_reviews)%>%
select (-c(cast_total_facebook_likes, actor_2_facebook_likes, actor_3_facebook_likes,
num_critic_for_reviews, num_user_for_reviews))
#Creating Score Categories
rating <- rating %>% mutate(Rating_Category = cut(imdb_score, c(0, 4, 7, 9, 10),
labels = c("LOW", "MEDIUM", "HIGH", "EXCELLENT")))
movie.final<-rating%>%select(-imdb_score)
##Splitting Data
training.samples <- movie.final$Rating_Category%>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- movie.final[training.samples, ]
test.data <- movie.final[-training.samples, ]
##Multinomial Logistic Regression
# Fit the model
model.multi <- nnet::multinom(Rating_Category ~., data = train.data)
tidy(model.multi)
formattable(tidy(model.multi))
# Summarize the model
summary(model.multi)
# Make predictions
predicted.classes <- model.multi %>% predict(test.data)
head(predicted.classes)
# Model accuracy
mean(predicted.classes == test.data$Rating_Category)
## [1] 0.7625995
rpart.fit <- rpart(Rating_Category~., data = train.data, method = 'class')
plotcp(rpart.fit)
rpart.fit.2<-prune.rpart(rpart.fit,cp=0.01)
rpart.plot(rpart.fit.2, extra = 104)
#Prediction
predict_unseen <-predict(rpart.fit.2, test.data, type = 'class')
table_mat <- table(test.data$Rating_Category, predict_unseen)
table_mat
## predict_unseen
## LOW MEDIUM HIGH EXCELLENT
## LOW 0 18 1 0
## MEDIUM 0 475 32 0
## HIGH 0 122 106 0
## EXCELLENT 0 0 0 0
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
print(paste('Accuracy for test', accuracy_Test))
## [1] "Accuracy for test 0.770557029177719"
#3hyper parameter Tuning
accuracy_tune <- function(fit) {
predict_unseen <- predict(fit, test.data, type = 'class')
table_mat <- table(test.data$Rating_Category, predict_unseen)
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test
}
control <- rpart.control(minsplit = 20,
minbucket = round(20 / 3),
maxdepth = 20,
cp = 0.01)
tune_fit <- rpart(Rating_Category~., data = train.data, method = 'class', control = control)
accuracy_tune(tune_fit)
## [1] 0.770557
trControl <- trainControl(method = "cv",number = 10,search = "grid")
rf_default <- train(Rating_Category~.,data = train.data,method = "rf",metric = "Accuracy",
trControl = trControl)
print(rf_default)
## Random Forest
##
## 3021 samples
## 13 predictor
## 4 classes: 'LOW', 'MEDIUM', 'HIGH', 'EXCELLENT'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2718, 2720, 2718, 2719, 2719, 2720, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8050393 0.5164266
## 9 0.8162900 0.5616152
## 17 0.8166200 0.5664938
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 17.
#Best mtry
tuneGrid <- expand.grid(.mtry = c(1: 10))
rf_mtry <- train(Rating_Category~.,
data = train.data,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
ntree = 300)
print(rf_mtry)
## Random Forest
##
## 3021 samples
## 13 predictor
## 4 classes: 'LOW', 'MEDIUM', 'HIGH', 'EXCELLENT'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2719, 2719, 2719, 2719, 2720, 2718, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.7368473 0.2662390
## 2 0.7997438 0.5026510
## 3 0.8096667 0.5364832
## 4 0.8159636 0.5564788
## 5 0.8162936 0.5569768
## 6 0.8166236 0.5598994
## 7 0.8119912 0.5494664
## 8 0.8162892 0.5596867
## 9 0.8149680 0.5582593
## 10 0.8159570 0.5598258
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 6.
best_mtry <- rf_mtry$bestTune$mtry
best_mtry
## [1] 6
max(rf_mtry$results$Accuracy)
## [1] 0.8166236
#Best max nodes
store_maxnode <- list()
tuneGrid <- expand.grid(.mtry = best_mtry)
for (maxnodes in c(5: 30)) {
set.seed(1234)
rf_maxnode <- train(Rating_Category~.,
data = train.data,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
maxnodes = maxnodes,
ntree = 300)
current_iteration <- toString(maxnodes)
store_maxnode[[current_iteration]] <- rf_maxnode
}
results_mtry <- resamples(store_maxnode)
summary(results_mtry) #Best max node=27
##
## Call:
## summary.resamples(object = results_mtry)
##
## Models: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.7326733 0.7365405 0.7433775 0.7438064 0.7483498 0.7641196 0
## 6 0.7342193 0.7353179 0.7471040 0.7474412 0.7549505 0.7674419 0
## 7 0.7359736 0.7398859 0.7479208 0.7500957 0.7599010 0.7707641 0
## 8 0.7317881 0.7429416 0.7533113 0.7514158 0.7557756 0.7707641 0
## 9 0.7350993 0.7443978 0.7458746 0.7510869 0.7557756 0.7740864 0
## 10 0.7375415 0.7458746 0.7533113 0.7557150 0.7628587 0.7807309 0
## 11 0.7375415 0.7485520 0.7566225 0.7573706 0.7673267 0.7774086 0
## 12 0.7375415 0.7500000 0.7603272 0.7610098 0.7714521 0.7840532 0
## 13 0.7417219 0.7485451 0.7636384 0.7623354 0.7747525 0.7840532 0
## 14 0.7408638 0.7574257 0.7698675 0.7659734 0.7747525 0.7840532 0
## 15 0.7408638 0.7549669 0.7652886 0.7653133 0.7772277 0.7840532 0
## 16 0.7475083 0.7591060 0.7623762 0.7682913 0.7788779 0.7906977 0
## 17 0.7375415 0.7549669 0.7702391 0.7669679 0.7755776 0.7906977 0
## 18 0.7408638 0.7533113 0.7623762 0.7669701 0.7780528 0.8006645 0
## 19 0.7408638 0.7663104 0.7768562 0.7745805 0.7813531 0.8039867 0
## 20 0.7475083 0.7650864 0.7801620 0.7768941 0.7854785 0.8039867 0
## 21 0.7375415 0.7682119 0.7821782 0.7778787 0.7869012 0.8073090 0
## 22 0.7408638 0.7715232 0.7854785 0.7805244 0.7893929 0.8039867 0
## 23 0.7375415 0.7675699 0.7768616 0.7752373 0.7838284 0.8006645 0
## 24 0.7508306 0.7756623 0.7805281 0.7811867 0.7909033 0.7986799 0
## 25 0.7441860 0.7733619 0.7834732 0.7818500 0.7968515 0.8039867 0
## 26 0.7408638 0.7723510 0.7805281 0.7801977 0.7937294 0.8039867 0
## 27 0.7375415 0.7762196 0.7818176 0.7825123 0.7968515 0.8039867 0
## 28 0.7475083 0.7808040 0.7927053 0.7871404 0.7978548 0.8073090 0
## 29 0.7408638 0.7795227 0.7884182 0.7848280 0.7968515 0.8106312 0
## 30 0.7641196 0.7791538 0.7897383 0.7878092 0.7993328 0.8073090 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.2902885 0.3094097 0.3246922 0.3270674 0.3464941 0.3771930 0
## 6 0.2949279 0.3109350 0.3337576 0.3433273 0.3734932 0.4072469 0
## 7 0.3066758 0.3195106 0.3360310 0.3449642 0.3703080 0.4063625 0
## 8 0.2742538 0.3276565 0.3415242 0.3420717 0.3646453 0.3907598 0
## 9 0.2949279 0.3193024 0.3404092 0.3446821 0.3630778 0.4168329 0
## 10 0.3203281 0.3263811 0.3495178 0.3593383 0.3756036 0.4375743 0
## 11 0.3159484 0.3384965 0.3609896 0.3647679 0.3909922 0.4308574 0
## 12 0.3115120 0.3358827 0.3740912 0.3757984 0.4138574 0.4443340 0
## 13 0.3080313 0.3422544 0.3902141 0.3827717 0.4206959 0.4513153 0
## 14 0.3224242 0.3744609 0.3944853 0.3906135 0.4203103 0.4513153 0
## 15 0.3224242 0.3446335 0.3902141 0.3872891 0.4264099 0.4478467 0
## 16 0.3397980 0.3593532 0.3779270 0.3944126 0.4330334 0.4579832 0
## 17 0.3115120 0.3482567 0.3934808 0.3917833 0.4263535 0.4648360 0
## 18 0.3224242 0.3422999 0.3888893 0.3931040 0.4255791 0.4854408 0
## 19 0.3267764 0.3918404 0.4182153 0.4157033 0.4365348 0.4923970 0
## 20 0.3454136 0.3819166 0.4336330 0.4235405 0.4545903 0.4988147 0
## 21 0.3203281 0.3899930 0.4428832 0.4276531 0.4591735 0.5088618 0
## 22 0.3267764 0.4044806 0.4565160 0.4356928 0.4674273 0.4956262 0
## 23 0.3159484 0.3920420 0.4296852 0.4205611 0.4441746 0.4887039 0
## 24 0.3629000 0.4129710 0.4414135 0.4372588 0.4608757 0.4914988 0
## 25 0.3375350 0.4105427 0.4462496 0.4397441 0.4876867 0.4988147 0
## 26 0.3267764 0.4023255 0.4414371 0.4345278 0.4783135 0.4988147 0
## 27 0.3203281 0.4214502 0.4471395 0.4422005 0.4835742 0.5019631 0
## 28 0.3482250 0.4330393 0.4744162 0.4546420 0.4878938 0.5119374 0
## 29 0.3267764 0.4321952 0.4567319 0.4482898 0.4866125 0.5127237 0
## 30 0.3852195 0.4260754 0.4661120 0.4549787 0.4920789 0.5088618 0
#Best ntrees
store_maxtrees <- list()
for (ntree in c(250, 300, 350, 400, 450, 500, 550, 600, 800, 1000, 2000)) {
rf_maxtrees <- train(Rating_Category~.,
data = train.data,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
maxnodes = 27,
ntree = ntree)
key <- toString(ntree)
store_maxtrees[[key]] <- rf_maxtrees
}
results_tree <- resamples(store_maxtrees)
summary(results_tree)
##
## Call:
## summary.resamples(object = results_tree)
##
## Models: 250, 300, 350, 400, 450, 500, 550, 600, 800, 1000, 2000
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 250 0.7483444 0.7725368 0.7814570 0.7811872 0.7902515 0.8118812 0
## 300 0.7500000 0.7682724 0.7867790 0.7822206 0.7968515 0.8106312 0
## 350 0.7467105 0.7723510 0.7943500 0.7868574 0.8018163 0.8172757 0
## 400 0.7450331 0.7673841 0.7906977 0.7838460 0.8006633 0.8052805 0
## 450 0.7475083 0.7717117 0.7847659 0.7828368 0.7985050 0.8085809 0
## 500 0.7417219 0.7632450 0.7831126 0.7792130 0.7912541 0.8079470 0
## 550 0.7508306 0.7760271 0.7847682 0.7841641 0.7965116 0.8085809 0
## 600 0.7350993 0.7605403 0.7834787 0.7818505 0.7983460 0.8283828 0
## 800 0.7656766 0.7731788 0.7798013 0.7795528 0.7845894 0.7980132 0
## 1000 0.7590759 0.7726095 0.7814546 0.7828544 0.7890876 0.8118812 0
## 2000 0.7582781 0.7796226 0.7847682 0.7851601 0.7943823 0.8052805 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 250 0.3511782 0.4066596 0.4469752 0.4373576 0.4682432 0.5137258 0
## 300 0.3548171 0.4075116 0.4525421 0.4410169 0.4747443 0.5127237 0
## 350 0.3483478 0.4030857 0.4741190 0.4527197 0.5022117 0.5327934 0
## 400 0.3407609 0.4028015 0.4491605 0.4461570 0.4946138 0.5220565 0
## 450 0.3482250 0.4072057 0.4413826 0.4418224 0.4844537 0.5187315 0
## 500 0.3125547 0.3901331 0.4412926 0.4316090 0.4769522 0.5164665 0
## 550 0.3505840 0.4264074 0.4487449 0.4472825 0.4751401 0.5187315 0
## 600 0.3084893 0.3750395 0.4490703 0.4392698 0.4825766 0.5787616 0
## 800 0.3904802 0.4191042 0.4285196 0.4319628 0.4463418 0.4770340 0
## 1000 0.3800034 0.4123936 0.4369386 0.4414454 0.4568691 0.5313034 0
## 2000 0.3628692 0.4380191 0.4568534 0.4489446 0.4780857 0.5051212 0
#Best Model
fit_rf <- train(Rating_Category~.,
train.data,
method = "rf",
metric = "Accuracy",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
ntree = 600,
maxnodes = 27)
prediction.rf <-predict(fit_rf, test.data)
confusionMatrix(prediction.rf, test.data$Rating_Category)
## Confusion Matrix and Statistics
##
## Reference
## Prediction LOW MEDIUM HIGH EXCELLENT
## LOW 0 0 0 0
## MEDIUM 19 489 130 0
## HIGH 0 18 98 0
## EXCELLENT 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7785
## 95% CI : (0.7472, 0.8077)
## No Information Rate : 0.6724
## P-Value [Acc > NIR] : 8.979e-11
##
## Kappa : 0.424
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: LOW Class: MEDIUM Class: HIGH Class: EXCELLENT
## Sensitivity 0.0000 0.9645 0.4298 NA
## Specificity 1.0000 0.3968 0.9658 1
## Pos Pred Value NaN 0.7665 0.8448 NA
## Neg Pred Value 0.9748 0.8448 0.7962 NA
## Prevalence 0.0252 0.6724 0.3024 0
## Detection Rate 0.0000 0.6485 0.1300 0
## Detection Prevalence 0.0000 0.8462 0.1538 0
## Balanced Accuracy 0.5000 0.6806 0.6978 NA
varImp(fit_rf)
## rf variable importance
##
## variables are sorted by maximum importance across the classes
## LOW MEDIUM HIGH EXCELLENT
## num_voted_users 3.943 70.168 100.000 10.074
## budget 4.295 48.811 22.192 5.646
## duration 10.743 36.920 39.690 7.625
## director_facebook_likes 5.197 17.807 38.867 6.307
## movie_facebook_likes 9.346 19.780 30.316 5.646
## countryUSA 1.439 0.000 26.133 4.055
## gross 1.996 25.256 2.230 4.055
## title_year 2.321 21.341 15.885 5.646
## actor_1_facebook_likes 6.559 4.977 20.399 5.646
## content_ratingPG-13 4.244 20.365 12.220 4.055
## critic_total_ratio 11.685 17.945 16.119 9.125
## other_actor_facebook_likes 4.951 12.777 9.504 4.055
## facenumber_in_poster 4.907 6.498 11.404 4.055
## content_ratingR 5.491 10.983 10.274 5.646
## countryUK 5.052 10.839 10.061 4.055
## content_ratingPG 5.646 5.956 3.455 4.055
## content_ratingNC-17 4.055 5.646 4.180 4.055
rf <- randomForest(Rating_Category ~ . , data = train.data, mtry = 4)
# Get importance
importance <- importance(rf)
varImportance <- data.frame(Variables = row.names(importance),
Importance = round(importance[ ,'MeanDecreaseGini'],2))
# Create a rank variable based on importance
rankImportance <- varImportance %>%
mutate(Rank = paste0('#',dense_rank(desc(Importance))))
# Use ggplot2 to visualize the relative importance of variables
ggplot(rankImportance, aes(x = reorder(Variables, Importance),
y = Importance, fill = Importance)) +
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'red') +
labs(x = 'Variables') +
coord_flip() +
theme_few()
tc<-trainControl(method = "repeatedcv", number = 10)
gbm.model = train(Rating_Category ~., data=train.data, method="gbm", trControl=tc)
plot(gbm.model)
pred.gbm = predict(gbm.model, test.data)
result = data.frame(test.data$Rating_Category, pred.gbm)
cm = confusionMatrix(test.data$Rating_Category, as.factor(pred.gbm))
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction LOW MEDIUM HIGH EXCELLENT
## LOW 2 17 0 0
## MEDIUM 2 473 32 0
## HIGH 0 92 136 0
## EXCELLENT 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8103
## 95% CI : (0.7805, 0.8377)
## No Information Rate : 0.7719
## P-Value [Acc > NIR] : 0.005907
##
## Kappa : 0.5413
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: LOW Class: MEDIUM Class: HIGH Class: EXCELLENT
## Sensitivity 0.500000 0.8127 0.8095 NA
## Specificity 0.977333 0.8023 0.8430 1
## Pos Pred Value 0.105263 0.9329 0.5965 NA
## Neg Pred Value 0.997279 0.5587 0.9392 NA
## Prevalence 0.005305 0.7719 0.2228 0
## Detection Rate 0.002653 0.6273 0.1804 0
## Detection Prevalence 0.025199 0.6724 0.3024 0
## Balanced Accuracy 0.738667 0.8075 0.8263 NA
The Model Comparison Table is shown below:
models <- data.frame("Model" = c("Logistic Regression","Classification Tree", "Random Forest", "Gradient Boosting"),
"Accuracy" = c('73.15%', '74.07%','77.11%','79.34%'))
models
I have not considered any analysis for the genre of the movie. This could be a potentially important variable to measure success(not for IMDB Score). Also, text mining can be done on the description variable to obtain importance of certain words in the trailer/reviews that might contribute to the movie’s overall viewership