This report provides Box Office movies prediction using regression algorithm. The dataset used in this report for modelling from The Movie Database.
The dataset is hosted in Kabble and can be downloaded here.
The report is structured as follows:
1. Data Extraction
2. Exploratory Data Analysis
3. Data Preparation
4. Modeling
5. Evaluation
6. Recommendation
Import necessary libraries
rm(list = ls())
library(ggplot2)
Import train dataset and see its structure
# read data
boxtrain_df <- read.csv("data/train.csv")
# structure of dataframe
str(boxtrain_df)
## 'data.frame': 2400 obs. of 23 variables:
## $ id : int 272 2804 2549 260 532 1370 972 300 1732 551 ...
## $ belongs_to_collection: chr "" "" "[{'id': 10456, 'name': 'Dirty Harry Collection', 'poster_path': '/bS1ioomHh2emKAZ6vWn17UZnhDx.jpg', 'backdrop_p"| __truncated__ "" ...
## $ budget : int 72500000 0 0 24554 0 8000000 13000000 9000000 3900000 13000000 ...
## $ genres : chr "[{'id': 35, 'name': 'Comedy'}]" "[{'id': 10751, 'name': 'Family'}, {'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}]" "[{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 53, 'name': 'Thriller'}]" "[{'id': 18, 'name': 'Drama'}]" ...
## $ homepage : chr "http://www.sonypictures.com/homevideo/talladeganights/" "" "" "" ...
## $ imdb_id : chr "tt0415306" "tt0305396" "tt0070355" "tt0169302" ...
## $ original_language : chr "en" "en" "en" "en" ...
## $ original_title : chr "Talladega Nights: The Ballad of Ricky Bobby" "The Crocodile Hunter: Collision Course" "Magnum Force" "Theeviravaathi: The Terrorist" ...
## $ overview : chr "Lifelong friends and national idols Ricky Bobby and Cal Naughton Jr. have earned their NASCAR stripes with thei"| __truncated__ "Aussie adventurer Steve Irwin aka The Crocodile Hunter has avoided the death-roll and nabbed another feisty cro"| __truncated__ "\"Dirty\" Harry Calahan is a San Francisco Police Inspector on the trail of a group of rogue cops who have take"| __truncated__ "A young female terrorist goes on a suicide assassination mission, but her resolve to complete it is put to the test." ...
## $ popularity : num 6.937 3.149 10.226 0.155 6.317 ...
## $ poster_path : chr "/hi8whfL7t6cL2LITLJjzJ7UWuZA.jpg" "/kciZV7jywirkEqIW2pDMguFfzwb.jpg" "/3gqV4jpKNFqxUWug3BRD6yUzSL1.jpg" "/hUbH2hhcBmcothEIpqyuBHXE2FB.jpg" ...
## $ production_companies : chr "[{'name': 'Columbia Pictures', 'id': 5}, {'name': 'Apatow Productions', 'id': 10105}]" "[{'name': 'Cheyenne Enterprises', 'id': 890}, {'name': 'Discovery Channel Pictures', 'id': 7492}, {'name': 'Met"| __truncated__ "[{'name': 'Warner Bros.', 'id': 6194}]" "" ...
## $ production_countries : chr "[{'iso_3166_1': 'US', 'name': 'United States of America'}]" "[{'iso_3166_1': 'AU', 'name': 'Australia'}, {'iso_3166_1': 'US', 'name': 'United States of America'}]" "[{'iso_3166_1': 'US', 'name': 'United States of America'}]" "" ...
## $ release_date : chr "8/4/06" "7/26/02" "12/23/73" "9/12/98" ...
## $ runtime : num 116 90 124 95 117 122 112 94 199 81 ...
## $ spoken_languages : chr "[{'iso_639_1': 'en', 'name': 'English'}]" "[{'iso_639_1': 'en', 'name': 'English'}]" "[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'es', 'name': 'Español'}]" "[{'iso_639_1': 'hi', 'name': 'हिनà¥\215दà¥\200'}]" ...
## $ status : chr "Released" "Released" "Released" "Released" ...
## $ tagline : chr "The story of a man who could only count to #1" "His First Big-Screen Adventure!" "A man's got to know his limitations." "Her death will not be ordinary." ...
## $ title : chr "Talladega Nights: The Ballad of Ricky Bobby" "The Crocodile Hunter: Collision Course" "Magnum Force" "The Terrorist" ...
## $ Keywords : chr "[{'id': 5922, 'name': 'north carolina'}, {'id': 6157, 'name': 'prayer'}, {'id': 9848, 'name': 'family dinner'},"| __truncated__ "[{'id': 3185, 'name': 'crocodile'}]" "[{'id': 1679, 'name': 'arbitrary law'}, {'id': 3877, 'name': 'covered investigation'}, {'id': 5136, 'name': 'un"| __truncated__ "" ...
## $ cast : chr "[{'cast_id': 8, 'character': 'Ricky Bobby', 'credit_id': '52fe4521c3a36847f80be1a5', 'gender': 2, 'id': 23659, "| __truncated__ "[{'cast_id': 1, 'character': 'Himself', 'credit_id': '52fe47019251416c7508acf3', 'gender': 0, 'id': 559670, 'na"| __truncated__ "[{'cast_id': 1, 'character': 'Insp. Harry Calahan', 'credit_id': '52fe439b9251416c75017009', 'gender': 2, 'id':"| __truncated__ "[{'cast_id': 2, 'character': 'Malli', 'credit_id': '52fe475bc3a36847f8131393', 'gender': 0, 'id': 33191, 'name'"| __truncated__ ...
## $ crew : chr "[{'credit_id': '52fe4521c3a36847f80be20f', 'department': 'Camera', 'gender': 2, 'id': 11409, 'job': 'Director o"| __truncated__ "[{'credit_id': '52fe47019251416c7508ad2d', 'department': 'Directing', 'gender': 2, 'id': 944713, 'job': 'Direct"| __truncated__ "[{'credit_id': '52fe439b9251416c7501701b', 'department': 'Directing', 'gender': 2, 'id': 18635, 'job': 'Directo"| __truncated__ "[{'credit_id': '52fe475bc3a36847f813138f', 'department': 'Directing', 'gender': 2, 'id': 37236, 'job': 'Directo"| __truncated__ ...
## $ revenue : int 162966177 33082548 39768000 140021 226286 49000000 318000141 26055 4900000 97837138 ...
The dataset has 2,400 obeservations and 23 variables.
The target variables is revenue and the remaining variables are features.
Extract statistical summary of each variable.
# data dimension
d <- dim(boxtrain_df)
m <- d[1] # m: number of rows
n <- d[2] # n: number of columns
# statictical summary
summary(boxtrain_df)
## id belongs_to_collection budget genres
## Min. : 2.0 Length:2400 Min. :0.00e+00 Length:2400
## 1st Qu.: 742.5 Class :character 1st Qu.:0.00e+00 Class :character
## Median :1475.0 Mode :character Median :8.00e+06 Mode :character
## Mean :1486.6 Mean :2.24e+07
## 3rd Qu.:2220.2 3rd Qu.:2.80e+07
## Max. :3000.0 Max. :3.80e+08
##
## homepage imdb_id original_language original_title
## Length:2400 Length:2400 Length:2400 Length:2400
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## overview popularity poster_path production_companies
## Length:2400 Min. : 0.000 Length:2400 Length:2400
## Class :character 1st Qu.: 4.071 Class :character Class :character
## Mode :character Median : 7.411 Mode :character Mode :character
## Mean : 8.403
## 3rd Qu.: 10.937
## Max. :294.337
##
## production_countries release_date runtime spoken_languages
## Length:2400 Length:2400 Min. : 0.0 Length:2400
## Class :character Class :character 1st Qu.: 94.0 Class :character
## Mode :character Mode :character Median :104.0 Mode :character
## Mean :107.7
## 3rd Qu.:118.0
## Max. :338.0
## NA's :2
## status tagline title Keywords
## Length:2400 Length:2400 Length:2400 Length:2400
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## cast crew revenue
## Length:2400 Length:2400 Min. :1.000e+00
## Class :character Class :character 1st Qu.:2.402e+06
## Mode :character Mode :character Median :1.705e+07
## Mean :6.765e+07
## 3rd Qu.:7.174e+07
## Max. :1.520e+09
##
We can see minimum, median, mean, and maximum values of each numeric variables.
It is interesting that some datas have the minimum values:
revenue is 1.00e+00.
budget is 0.00e+00
runtime is 0.0
These could be an incorrect data.
We notice that the maximum values of revenue is statistically not far from median and third quantile.
Plot distribution of budget using histogram
ggplot(data = boxtrain_df, aes(x = budget)) +
geom_histogram(color="red",
aes(fill=..count..)) +
theme(axis.text.x = element_text(angle = 0)) +
scale_x_continuous(labels = scales::label_number_si()) +
scale_y_continuous(labels = scales::label_number_si(),
limits = c(0, 100))
Based on histogram above, we can see that most budget are in between 50M and 200M.
Plot distribution of revenue using histogram
ggplot(data = boxtrain_df, aes(x = revenue)) +
geom_histogram(color="red",
aes(fill=..count..)) +
theme(axis.text.x = element_text(angle = 0)) +
scale_x_continuous(labels = scales::label_number_si()) +
scale_y_continuous(labels = scales::label_number_si(),
limits = c(0, 75))
Based on histogram above, we can see that most revenue are in between 200M and 1B
Plot distribution of popularity using histogram
ggplot(data = boxtrain_df, aes(x = popularity)) +
geom_histogram(color="red",
aes(fill=..count..)) +
theme(axis.text.x = element_text(angle = 0)) +
scale_x_continuous(labels = scales::label_number_si()) +
scale_y_continuous(labels = scales::label_number_si(),
limits = c(0, 20))
Based on histogram above, we can see that most popularity are in between 25 and 60.
Plot distribution of runtime using histogram
ggplot(data = boxtrain_df, aes(x = runtime)) +
geom_histogram(color="red",
aes(fill=..count..)) +
theme(axis.text.x = element_text(angle = 0)) +
scale_x_continuous(labels = scales::label_number_si())
Based on histogram above, we can see that most runtime are in between 75 and 150 minutes
Plot revenue based on budget.
# linear model with geom_smooth()
ggplot(data = boxtrain_df, aes(x = budget, y = revenue)) +
geom_point(pch = 7, color = "blue", size = 2) +
geom_smooth(method = "lm", color = "red", linetype = 2) +
labs(title = "Budget vs. Revenue",
x = "Budget",
y = "Revenue") +
theme(axis.text.x = element_text(angle = 0)) +
scale_x_continuous(labels = scales::label_number_si()) +
scale_y_continuous(labels = scales::label_number_si())
Based on revenue by budget, we can see the following:
1. In general, the higher budget, the higher the revenue.
2. It is interesting that few movies with budget more than 100M generate revenue more than 400M.
Budget of 100M could be the acceptable investment.
ggplot(data = boxtrain_df, aes(x = runtime, y = revenue)) +
geom_point(pch = 7, color = "blue", size = 2) +
geom_smooth(method = "lm", color = "red", linetype = 2) +
labs(title = "Runtime vs. Revenue",
x = "Runtime",
y = "Revenue") +
theme(axis.text.x = element_text(angle = 0)) +
scale_x_continuous(labels = scales::label_number_si()) +
scale_y_continuous(labels = scales::label_number_si())
Based on revenue by runtime, we can see the following:
1. In general, the higher runtime, the higher the revenue.
2. It is interesting that few movies with runtime less than 75 minutes generates revenue. 3. It is also interesting to see that movies with runtime more than 150 minutes generates more revenue. Runtime between 75 and 150 minutes could be the acceptable investment.
ggplot(data = boxtrain_df, aes(x = popularity, y = revenue)) +
geom_point(pch = 7, color = "blue", size = 2) +
geom_smooth(method = "lm", color = "red", linetype = 2) +
labs(title = "Popularity vs. Revenue",
x = "Popularity",
y = "Revenue") +
theme(axis.text.x = element_text(angle = 0)) +
scale_x_continuous(labels = scales::label_number_si()) +
scale_y_continuous(labels = scales::label_number_si())
Based on popularity by runtime, we can see the following:
1. In general, the higher number of popularity, the higher the revenue.
2. It is interesting that popularity number mostly range between 0 to 50.
3. It is also interesting to see that most number of popularity are 30. Ths could be outliers.
boxtrain_data_num <- boxtrain_df[ , c("budget", "popularity",
"runtime", "revenue")]
r <- cor(boxtrain_data_num)
r
## budget popularity runtime revenue
## budget 1.0000000 0.3737708 NA 0.7476347
## popularity 0.3737708 1.0000000 NA 0.4803557
## runtime NA NA 1 NA
## revenue 0.7476347 0.4803557 NA 1.0000000
library(corrgram)
corrgram(boxtrain_data_num[ , ],
order = TRUE,
upper.panel = panel.pie)
Several variables are highly correlated.
For example: budget and revenue.
For target variable (revenue), the variables with high correlation in order are budget, popularity, and runtime.
Remove rows with incorrect values in budget, runtime and revenue.
boxtrain_data_num <- boxtrain_df[ , c("budget", "popularity",
"runtime", "revenue")]
boxtrain_budget_idx <- which(boxtrain_data_num$budget < 500000 |
boxtrain_data_num$runtime < 30 |
boxtrain_data_num$revenue < 500000 )
boxtrain_data_num2 <- boxtrain_data_num[-boxtrain_budget_idx, ]
summary(boxtrain_data_num2)
## budget popularity runtime revenue
## Min. : 500000 Min. : 0.01157 Min. : 65.0 Min. :5.000e+05
## 1st Qu.: 7000000 1st Qu.: 5.95662 1st Qu.: 97.0 1st Qu.:1.108e+07
## Median : 20000000 Median : 8.80468 Median :107.0 Median :3.844e+07
## Mean : 33726662 Mean : 10.16323 Mean :111.7 Mean :9.749e+07
## 3rd Qu.: 42000000 3rd Qu.: 12.00750 3rd Qu.:122.0 3rd Qu.:1.129e+08
## Max. :380000000 Max. :294.33704 Max. :338.0 Max. :1.520e+09
Minimum value of budget now is 500,000
Minimum vauue of runtime now is 65 minutes
Minimum value of revenue now is 5.00e+05
Randomly divided the dataset into training and testing with ratio = 70:30.
For reproducible result, it is necessary to set the seed.
d <- dim(boxtrain_data_num2)
m <- d[1] # m: number of rows
n <- d[2] # n: number of columns
box_idx <- 0.7 * m
box_idx
## [1] 1100.4
train_data <- boxtrain_data_num2[ 1 : box_idx , ]
test_data <- boxtrain_data_num2[ (box_idx+1) : m , ]
set.seed(2021)
train_idx <- sample(m, 0.7 * m)
train_idx[1:3]
## [1] 903 166 1454
train_data <- boxtrain_data_num2[ train_idx , ]
test_data <- boxtrain_data_num2[ -train_idx , ]
dim(train_data)
## [1] 1100 4
dim(test_data)
## [1] 472 4
Remove rows with incorrect values in budget, runtime and revenue.
boxtrain_data_num <- boxtrain_df[ , c("budget", "popularity",
"runtime", "revenue")]
boxtrain_budget_idx <- which(boxtrain_data_num$budget < 500000 |
boxtrain_data_num$runtime < 30 |
boxtrain_data_num$revenue < 500000 )
boxtrain_data_num2 <- boxtrain_data_num[-boxtrain_budget_idx, ]
summary(boxtrain_data_num2)
## budget popularity runtime revenue
## Min. : 500000 Min. : 0.01157 Min. : 65.0 Min. :5.000e+05
## 1st Qu.: 7000000 1st Qu.: 5.95662 1st Qu.: 97.0 1st Qu.:1.108e+07
## Median : 20000000 Median : 8.80468 Median :107.0 Median :3.844e+07
## Mean : 33726662 Mean : 10.16323 Mean :111.7 Mean :9.749e+07
## 3rd Qu.: 42000000 3rd Qu.: 12.00750 3rd Qu.:122.0 3rd Qu.:1.129e+08
## Max. :380000000 Max. :294.33704 Max. :338.0 Max. :1.520e+09
Minimum value of budget now is 500,000
Minimum vauue of runtime now is 65 minutes
Minimum value of revenue now is 5.00e+05
Randomly divided the dataset into training and testing with ratio = 70:30.
For reproducible result, it is necessary to set the seed.
d <- dim(boxtrain_data_num2)
m <- d[1] # m: number of rows
n <- d[2] # n: number of columns
box_idx <- 0.7 * m
box_idx
## [1] 1100.4
train_data <- boxtrain_data_num2[ 1 : box_idx , ]
test_data <- boxtrain_data_num2[ (box_idx+1) : m , ]
set.seed(2021)
train_idx <- sample(m, 0.7 * m)
train_idx[1:3]
## [1] 903 166 1454
train_data <- boxtrain_data_num2[ train_idx , ]
test_data <- boxtrain_data_num2[ -train_idx , ]
dim(train_data)
## [1] 1100 4
dim(test_data)
## [1] 472 4
The train data has 1,100 rows, and test data has 472 rows.
Create Regression Model
mymodel_train <- lm(formula = revenue ~ . +
I(popularity^2) +
popularity:budget,
data = train_data)
summary(mymodel_train)
##
## Call:
## lm(formula = revenue ~ . + I(popularity^2) + popularity:budget,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -503937210 -43127851 -9396412 21444806 944804924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.912e+07 1.599e+07 -3.072 0.00218 **
## budget 2.174e+00 1.071e-01 20.297 < 2e-16 ***
## popularity 5.088e+06 5.409e+05 9.406 < 2e-16 ***
## runtime 1.900e+05 1.369e+05 1.388 0.16539
## I(popularity^2) -1.749e+04 2.708e+03 -6.460 1.58e-10 ***
## budget:popularity 8.819e-03 3.997e-03 2.207 0.02755 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 101300000 on 1094 degrees of freedom
## Multiple R-squared: 0.5914, Adjusted R-squared: 0.5896
## F-statistic: 316.7 on 5 and 1094 DF, p-value: < 2.2e-16
We can see that budget and popularity hsa significant influence on revenue.
This means that budget is important to predict revenue.
Predict Price
## actual revenue
actual <- test_data$revenue
## prediction revenue
pred.mymodel <- predict(mymodel_train, test_data)
## plot actual vs predicted
revenue_df <- data.frame(actual, pred.mymodel)
ggplot(data = revenue_df, aes(x = actual, y = pred.mymodel)) +
geom_point() +
scale_x_continuous(labels = scales::label_number_si(),
limits = c(0, 250000000)) +
scale_y_continuous(labels = scales::label_number_si(),
limits = c(0, 250000000)) +
labs(title = "Actual vs. Predicted ",
x = "Actual",
y = "Predicted")
Most points are located near diagonal. It means the predicted values are close to actual values. However, there are still some points that relativaly far from diagonal.
Measure RMSE and R
## create function ->
performance <- function(prediction, actual, method){
e <- prediction - actual # error
se <- e^2 # squared error
sse <- sum(se) # sum of squared error
mse <- mean(se) # mean squared error
rmse <- sqrt(mse) # root mean squared error
r <- cor(prediction, actual) # correlation coefficient
result <- paste("*** Method : ", method,
"\nRMSE = ", round(rmse,3),
"\nR = ", round(r,3))
cat(result)
}
performance(pred.mymodel, actual, "My Model")
## *** Method : My Model
## RMSE = 99695063.48
## R = 0.79