Description

This report provides Box Office movies prediction using regression algorithm. The dataset used in this report for modelling from The Movie Database.

The dataset is hosted in Kabble and can be downloaded here.

The report is structured as follows:
1. Data Extraction
2. Exploratory Data Analysis
3. Data Preparation
4. Modeling
5. Evaluation
6. Recommendation

1. Data Extraction

Import necessary libraries

rm(list = ls())
library(ggplot2)

Import train dataset and see its structure

# read data
boxtrain_df <- read.csv("data/train.csv")

# structure of dataframe
str(boxtrain_df)
## 'data.frame':    2400 obs. of  23 variables:
##  $ id                   : int  272 2804 2549 260 532 1370 972 300 1732 551 ...
##  $ belongs_to_collection: chr  "" "" "[{'id': 10456, 'name': 'Dirty Harry Collection', 'poster_path': '/bS1ioomHh2emKAZ6vWn17UZnhDx.jpg', 'backdrop_p"| __truncated__ "" ...
##  $ budget               : int  72500000 0 0 24554 0 8000000 13000000 9000000 3900000 13000000 ...
##  $ genres               : chr  "[{'id': 35, 'name': 'Comedy'}]" "[{'id': 10751, 'name': 'Family'}, {'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}]" "[{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 53, 'name': 'Thriller'}]" "[{'id': 18, 'name': 'Drama'}]" ...
##  $ homepage             : chr  "http://www.sonypictures.com/homevideo/talladeganights/" "" "" "" ...
##  $ imdb_id              : chr  "tt0415306" "tt0305396" "tt0070355" "tt0169302" ...
##  $ original_language    : chr  "en" "en" "en" "en" ...
##  $ original_title       : chr  "Talladega Nights: The Ballad of Ricky Bobby" "The Crocodile Hunter: Collision Course" "Magnum Force" "Theeviravaathi: The Terrorist" ...
##  $ overview             : chr  "Lifelong friends and national idols Ricky Bobby and Cal Naughton Jr. have earned their NASCAR stripes with thei"| __truncated__ "Aussie adventurer Steve Irwin aka The Crocodile Hunter has avoided the death-roll and nabbed another feisty cro"| __truncated__ "\"Dirty\" Harry Calahan is a San Francisco Police Inspector on the trail of a group of rogue cops who have take"| __truncated__ "A young female terrorist goes on a suicide assassination mission, but her resolve to complete it is put to the test." ...
##  $ popularity           : num  6.937 3.149 10.226 0.155 6.317 ...
##  $ poster_path          : chr  "/hi8whfL7t6cL2LITLJjzJ7UWuZA.jpg" "/kciZV7jywirkEqIW2pDMguFfzwb.jpg" "/3gqV4jpKNFqxUWug3BRD6yUzSL1.jpg" "/hUbH2hhcBmcothEIpqyuBHXE2FB.jpg" ...
##  $ production_companies : chr  "[{'name': 'Columbia Pictures', 'id': 5}, {'name': 'Apatow Productions', 'id': 10105}]" "[{'name': 'Cheyenne Enterprises', 'id': 890}, {'name': 'Discovery Channel Pictures', 'id': 7492}, {'name': 'Met"| __truncated__ "[{'name': 'Warner Bros.', 'id': 6194}]" "" ...
##  $ production_countries : chr  "[{'iso_3166_1': 'US', 'name': 'United States of America'}]" "[{'iso_3166_1': 'AU', 'name': 'Australia'}, {'iso_3166_1': 'US', 'name': 'United States of America'}]" "[{'iso_3166_1': 'US', 'name': 'United States of America'}]" "" ...
##  $ release_date         : chr  "8/4/06" "7/26/02" "12/23/73" "9/12/98" ...
##  $ runtime              : num  116 90 124 95 117 122 112 94 199 81 ...
##  $ spoken_languages     : chr  "[{'iso_639_1': 'en', 'name': 'English'}]" "[{'iso_639_1': 'en', 'name': 'English'}]" "[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'es', 'name': 'Español'}]" "[{'iso_639_1': 'hi', 'name': 'हिनà¥\215दà¥\200'}]" ...
##  $ status               : chr  "Released" "Released" "Released" "Released" ...
##  $ tagline              : chr  "The story of a man who could only count to #1" "His First Big-Screen Adventure!" "A man's got to know his limitations." "Her death will not be ordinary." ...
##  $ title                : chr  "Talladega Nights: The Ballad of Ricky Bobby" "The Crocodile Hunter: Collision Course" "Magnum Force" "The Terrorist" ...
##  $ Keywords             : chr  "[{'id': 5922, 'name': 'north carolina'}, {'id': 6157, 'name': 'prayer'}, {'id': 9848, 'name': 'family dinner'},"| __truncated__ "[{'id': 3185, 'name': 'crocodile'}]" "[{'id': 1679, 'name': 'arbitrary law'}, {'id': 3877, 'name': 'covered investigation'}, {'id': 5136, 'name': 'un"| __truncated__ "" ...
##  $ cast                 : chr  "[{'cast_id': 8, 'character': 'Ricky Bobby', 'credit_id': '52fe4521c3a36847f80be1a5', 'gender': 2, 'id': 23659, "| __truncated__ "[{'cast_id': 1, 'character': 'Himself', 'credit_id': '52fe47019251416c7508acf3', 'gender': 0, 'id': 559670, 'na"| __truncated__ "[{'cast_id': 1, 'character': 'Insp. Harry Calahan', 'credit_id': '52fe439b9251416c75017009', 'gender': 2, 'id':"| __truncated__ "[{'cast_id': 2, 'character': 'Malli', 'credit_id': '52fe475bc3a36847f8131393', 'gender': 0, 'id': 33191, 'name'"| __truncated__ ...
##  $ crew                 : chr  "[{'credit_id': '52fe4521c3a36847f80be20f', 'department': 'Camera', 'gender': 2, 'id': 11409, 'job': 'Director o"| __truncated__ "[{'credit_id': '52fe47019251416c7508ad2d', 'department': 'Directing', 'gender': 2, 'id': 944713, 'job': 'Direct"| __truncated__ "[{'credit_id': '52fe439b9251416c7501701b', 'department': 'Directing', 'gender': 2, 'id': 18635, 'job': 'Directo"| __truncated__ "[{'credit_id': '52fe475bc3a36847f813138f', 'department': 'Directing', 'gender': 2, 'id': 37236, 'job': 'Directo"| __truncated__ ...
##  $ revenue              : int  162966177 33082548 39768000 140021 226286 49000000 318000141 26055 4900000 97837138 ...

The dataset has 2,400 obeservations and 23 variables.
The target variables is revenue and the remaining variables are features.

Extract statistical summary of each variable.

# data dimension
d <- dim(boxtrain_df)
m <- d[1] # m: number of rows
n <- d[2] # n: number of columns

# statictical summary
summary(boxtrain_df)
##        id         belongs_to_collection     budget            genres         
##  Min.   :   2.0   Length:2400           Min.   :0.00e+00   Length:2400       
##  1st Qu.: 742.5   Class :character      1st Qu.:0.00e+00   Class :character  
##  Median :1475.0   Mode  :character      Median :8.00e+06   Mode  :character  
##  Mean   :1486.6                         Mean   :2.24e+07                     
##  3rd Qu.:2220.2                         3rd Qu.:2.80e+07                     
##  Max.   :3000.0                         Max.   :3.80e+08                     
##                                                                              
##    homepage           imdb_id          original_language  original_title    
##  Length:2400        Length:2400        Length:2400        Length:2400       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    overview           popularity      poster_path        production_companies
##  Length:2400        Min.   :  0.000   Length:2400        Length:2400         
##  Class :character   1st Qu.:  4.071   Class :character   Class :character    
##  Mode  :character   Median :  7.411   Mode  :character   Mode  :character    
##                     Mean   :  8.403                                          
##                     3rd Qu.: 10.937                                          
##                     Max.   :294.337                                          
##                                                                              
##  production_countries release_date          runtime      spoken_languages  
##  Length:2400          Length:2400        Min.   :  0.0   Length:2400       
##  Class :character     Class :character   1st Qu.: 94.0   Class :character  
##  Mode  :character     Mode  :character   Median :104.0   Mode  :character  
##                                          Mean   :107.7                     
##                                          3rd Qu.:118.0                     
##                                          Max.   :338.0                     
##                                          NA's   :2                         
##     status            tagline             title             Keywords        
##  Length:2400        Length:2400        Length:2400        Length:2400       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      cast               crew              revenue         
##  Length:2400        Length:2400        Min.   :1.000e+00  
##  Class :character   Class :character   1st Qu.:2.402e+06  
##  Mode  :character   Mode  :character   Median :1.705e+07  
##                                        Mean   :6.765e+07  
##                                        3rd Qu.:7.174e+07  
##                                        Max.   :1.520e+09  
## 

We can see minimum, median, mean, and maximum values of each numeric variables.
It is interesting that some datas have the minimum values:
revenue is 1.00e+00.
budget is 0.00e+00
runtime is 0.0
These could be an incorrect data.

We notice that the maximum values of revenue is statistically not far from median and third quantile.

2. Exploratory Data Analysis

2.1. Univariate Analysis

Plot distribution of budget using histogram

ggplot(data = boxtrain_df, aes(x = budget)) +
  geom_histogram(color="red",
                 aes(fill=..count..)) +
  theme(axis.text.x = element_text(angle = 0)) +
  scale_x_continuous(labels = scales::label_number_si()) +
  scale_y_continuous(labels = scales::label_number_si(),
                     limits = c(0, 100))

Based on histogram above, we can see that most budget are in between 50M and 200M.

Plot distribution of revenue using histogram

ggplot(data = boxtrain_df, aes(x = revenue)) +
  geom_histogram(color="red",
                 aes(fill=..count..)) +
  theme(axis.text.x = element_text(angle = 0)) +
  scale_x_continuous(labels = scales::label_number_si()) +
  scale_y_continuous(labels = scales::label_number_si(),
                     limits = c(0, 75))

Based on histogram above, we can see that most revenue are in between 200M and 1B

Plot distribution of popularity using histogram

ggplot(data = boxtrain_df, aes(x = popularity)) +
  geom_histogram(color="red",
                 aes(fill=..count..)) +
  theme(axis.text.x = element_text(angle = 0)) +
  scale_x_continuous(labels = scales::label_number_si()) +
  scale_y_continuous(labels = scales::label_number_si(), 
                     limits = c(0, 20))

Based on histogram above, we can see that most popularity are in between 25 and 60.

Plot distribution of runtime using histogram

ggplot(data = boxtrain_df, aes(x = runtime)) +
  geom_histogram(color="red",
                 aes(fill=..count..)) +
  theme(axis.text.x = element_text(angle = 0)) +
  scale_x_continuous(labels = scales::label_number_si())

Based on histogram above, we can see that most runtime are in between 75 and 150 minutes

2.2. Bivariate Analysis

Plot revenue based on budget.

# linear model with geom_smooth()

ggplot(data = boxtrain_df, aes(x = budget, y = revenue)) + 
  geom_point(pch = 7, color = "blue", size = 2) + 
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  labs(title = "Budget vs. Revenue", 
       x = "Budget",
       y = "Revenue") +
  theme(axis.text.x = element_text(angle = 0)) + 
  scale_x_continuous(labels = scales::label_number_si()) +
  scale_y_continuous(labels = scales::label_number_si())

Based on revenue by budget, we can see the following:
1. In general, the higher budget, the higher the revenue.
2. It is interesting that few movies with budget more than 100M generate revenue more than 400M.
Budget of 100M could be the acceptable investment.

ggplot(data = boxtrain_df, aes(x = runtime, y = revenue)) + 
  geom_point(pch = 7, color = "blue", size = 2) + 
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  labs(title = "Runtime vs. Revenue", 
       x = "Runtime",
       y = "Revenue") +
  theme(axis.text.x = element_text(angle = 0)) + 
  scale_x_continuous(labels = scales::label_number_si()) +
  scale_y_continuous(labels = scales::label_number_si())

Based on revenue by runtime, we can see the following:
1. In general, the higher runtime, the higher the revenue.
2. It is interesting that few movies with runtime less than 75 minutes generates revenue. 3. It is also interesting to see that movies with runtime more than 150 minutes generates more revenue. Runtime between 75 and 150 minutes could be the acceptable investment.

ggplot(data = boxtrain_df, aes(x = popularity, y = revenue)) + 
  geom_point(pch = 7, color = "blue", size = 2) + 
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  labs(title = "Popularity vs. Revenue", 
       x = "Popularity",
       y = "Revenue") +
  theme(axis.text.x = element_text(angle = 0)) + 
  scale_x_continuous(labels = scales::label_number_si()) +
  scale_y_continuous(labels = scales::label_number_si())

Based on popularity by runtime, we can see the following:
1. In general, the higher number of popularity, the higher the revenue.
2. It is interesting that popularity number mostly range between 0 to 50.
3. It is also interesting to see that most number of popularity are 30. Ths could be outliers.

2.3. Multivariate Analysis

boxtrain_data_num <- boxtrain_df[ , c("budget", "popularity", 
                                      "runtime", "revenue")]
r <- cor(boxtrain_data_num)
r
##               budget popularity runtime   revenue
## budget     1.0000000  0.3737708      NA 0.7476347
## popularity 0.3737708  1.0000000      NA 0.4803557
## runtime           NA         NA       1        NA
## revenue    0.7476347  0.4803557      NA 1.0000000
library(corrgram)
corrgram(boxtrain_data_num[ , ],
         order = TRUE,
         upper.panel = panel.pie)

Several variables are highly correlated.
For example: budget and revenue.

For target variable (revenue), the variables with high correlation in order are budget, popularity, and runtime.

Insight from EDA: (Exploratory Data Analysis)

  1. Incorrect values on price (price == 0 )
  2. Extreme outliers on price variables
  3. There are co-linear variables
  4. Based on Pearson’s correlation coefficient (r),
    the variables with highest correlation with target (revenue) is budget.
  5. In general, the higher number of budget, the higher the revenue. Exception for revenuye == 0.
  6. There are some other variables such as gender, cast, crew, language, etc. However, they are non-numeric.

3. Data Preparation

3.1. Data cleaning

Remove rows with incorrect values in budget, runtime and revenue.

boxtrain_data_num <- boxtrain_df[ , c("budget", "popularity", 
                                       "runtime", "revenue")]

boxtrain_budget_idx <- which(boxtrain_data_num$budget < 500000 |
                               boxtrain_data_num$runtime < 30 |
                               boxtrain_data_num$revenue < 500000 ) 
boxtrain_data_num2 <- boxtrain_data_num[-boxtrain_budget_idx, ]
summary(boxtrain_data_num2)
##      budget            popularity           runtime         revenue         
##  Min.   :   500000   Min.   :  0.01157   Min.   : 65.0   Min.   :5.000e+05  
##  1st Qu.:  7000000   1st Qu.:  5.95662   1st Qu.: 97.0   1st Qu.:1.108e+07  
##  Median : 20000000   Median :  8.80468   Median :107.0   Median :3.844e+07  
##  Mean   : 33726662   Mean   : 10.16323   Mean   :111.7   Mean   :9.749e+07  
##  3rd Qu.: 42000000   3rd Qu.: 12.00750   3rd Qu.:122.0   3rd Qu.:1.129e+08  
##  Max.   :380000000   Max.   :294.33704   Max.   :338.0   Max.   :1.520e+09

Minimum value of budget now is 500,000
Minimum vauue of runtime now is 65 minutes
Minimum value of revenue now is 5.00e+05

3.2. Training and Testing Division

Randomly divided the dataset into training and testing with ratio = 70:30.
For reproducible result, it is necessary to set the seed.

d <- dim(boxtrain_data_num2)
m <- d[1] # m: number of rows
n <- d[2] # n: number of columns

box_idx <- 0.7 * m
box_idx
## [1] 1100.4
train_data <- boxtrain_data_num2[ 1 : box_idx , ]
test_data <- boxtrain_data_num2[ (box_idx+1) : m , ]

set.seed(2021)
train_idx <- sample(m, 0.7 * m)
train_idx[1:3]
## [1]  903  166 1454
train_data <- boxtrain_data_num2[ train_idx , ]
test_data <- boxtrain_data_num2[ -train_idx , ]

dim(train_data)
## [1] 1100    4
dim(test_data)
## [1] 472   4

3. Data Preparation

3.1. Data cleaning

Remove rows with incorrect values in budget, runtime and revenue.

boxtrain_data_num <- boxtrain_df[ , c("budget", "popularity", 
                                       "runtime", "revenue")]

boxtrain_budget_idx <- which(boxtrain_data_num$budget < 500000 |
                               boxtrain_data_num$runtime < 30 |
                               boxtrain_data_num$revenue < 500000 ) 
boxtrain_data_num2 <- boxtrain_data_num[-boxtrain_budget_idx, ]
summary(boxtrain_data_num2)
##      budget            popularity           runtime         revenue         
##  Min.   :   500000   Min.   :  0.01157   Min.   : 65.0   Min.   :5.000e+05  
##  1st Qu.:  7000000   1st Qu.:  5.95662   1st Qu.: 97.0   1st Qu.:1.108e+07  
##  Median : 20000000   Median :  8.80468   Median :107.0   Median :3.844e+07  
##  Mean   : 33726662   Mean   : 10.16323   Mean   :111.7   Mean   :9.749e+07  
##  3rd Qu.: 42000000   3rd Qu.: 12.00750   3rd Qu.:122.0   3rd Qu.:1.129e+08  
##  Max.   :380000000   Max.   :294.33704   Max.   :338.0   Max.   :1.520e+09

Minimum value of budget now is 500,000
Minimum vauue of runtime now is 65 minutes
Minimum value of revenue now is 5.00e+05

3.2. Training and Testing Division

Randomly divided the dataset into training and testing with ratio = 70:30.
For reproducible result, it is necessary to set the seed.

d <- dim(boxtrain_data_num2)
m <- d[1] # m: number of rows
n <- d[2] # n: number of columns

box_idx <- 0.7 * m
box_idx
## [1] 1100.4
train_data <- boxtrain_data_num2[ 1 : box_idx , ]
test_data <- boxtrain_data_num2[ (box_idx+1) : m , ]

set.seed(2021)
train_idx <- sample(m, 0.7 * m)
train_idx[1:3]
## [1]  903  166 1454
train_data <- boxtrain_data_num2[ train_idx , ]
test_data <- boxtrain_data_num2[ -train_idx , ]

dim(train_data)
## [1] 1100    4
dim(test_data)
## [1] 472   4

The train data has 1,100 rows, and test data has 472 rows.

4. Modeling

Create Regression Model

mymodel_train <- lm(formula = revenue ~ . +
                I(popularity^2) +
                popularity:budget,
              data = train_data)
summary(mymodel_train)
## 
## Call:
## lm(formula = revenue ~ . + I(popularity^2) + popularity:budget, 
##     data = train_data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -503937210  -43127851   -9396412   21444806  944804924 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -4.912e+07  1.599e+07  -3.072  0.00218 ** 
## budget             2.174e+00  1.071e-01  20.297  < 2e-16 ***
## popularity         5.088e+06  5.409e+05   9.406  < 2e-16 ***
## runtime            1.900e+05  1.369e+05   1.388  0.16539    
## I(popularity^2)   -1.749e+04  2.708e+03  -6.460 1.58e-10 ***
## budget:popularity  8.819e-03  3.997e-03   2.207  0.02755 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 101300000 on 1094 degrees of freedom
## Multiple R-squared:  0.5914, Adjusted R-squared:  0.5896 
## F-statistic: 316.7 on 5 and 1094 DF,  p-value: < 2.2e-16

We can see that budget and popularity hsa significant influence on revenue.
This means that budget is important to predict revenue.

5. Evaluation

Predict Price

## actual revenue
actual <- test_data$revenue

## prediction revenue
pred.mymodel <- predict(mymodel_train, test_data)
## plot actual vs predicted
revenue_df <- data.frame(actual, pred.mymodel)

ggplot(data = revenue_df, aes(x = actual, y = pred.mymodel)) + 
  geom_point() +
  scale_x_continuous(labels = scales::label_number_si(),
                     limits = c(0, 250000000)) +
  scale_y_continuous(labels = scales::label_number_si(),
                     limits = c(0, 250000000)) +
  labs(title = "Actual vs. Predicted ", 
       x = "Actual",
       y = "Predicted") 

Most points are located near diagonal. It means the predicted values are close to actual values. However, there are still some points that relativaly far from diagonal.

Measure RMSE and R

## create function -> 
performance <- function(prediction, actual, method){
  e <- prediction - actual # error
  se <- e^2 # squared error
  sse <- sum(se) # sum of squared error
  mse <- mean(se) # mean squared error
  rmse <- sqrt(mse) # root mean squared error
  r <- cor(prediction, actual) # correlation coefficient
  
  result <- paste("*** Method : ", method,
                  "\nRMSE = ", round(rmse,3),
                  "\nR = ", round(r,3))
  cat(result)
}

performance(pred.mymodel, actual, "My Model")
## *** Method :  My Model 
## RMSE =  99695063.48 
## R =  0.79