Analyzing Film Revenue

Data Exploration

At this point, I’ve already removed columns that were otherwise redundant or unnecessary to the study, these columns included information like number of reviews which wasn’t needed if the average review score of each movie was already provided. Other factors I opted to ignore in addition to number of reviews were film summaries and alternate titles that were more of a fit for studies regarding machine learning and recommender systems than analysis.

data <- read_csv("movies.csv", show_col_types = F)
data <- na.omit(data)


# Rather than include each individual year from 1980 - 2020, I opted to change the years variable into decades of release for better analysis.
# Additionally, because 2020 only has 7 entries, I opted to omit it from the data set.
data <- data[data$year < 2020,]

data$year[data$year >= 2010 & data$year < 2020] <- "2010s"
data$year[data$year >= 2000 & data$year < 2010] <- "2000s"
data$year[data$year >= 1990 & data$year < 2000] <- "1990s"
data$year[data$year >= 1980 & data$year < 1990] <- "1980s"

# Combine the two values for simplification.
data$country[data$country == "West Germany"] <- "Germany"

# Convert countries with < 30 objects into a new variable, "Other" for simplification.
data$country[!(data$country %in% c("Australia", "Canada", "France", "Germany", "United Kingdom", "United States"))] <- "Other"

# Convert genres with < 30 objects into a new variable, "Other" for simplification.
data$genre[(data$genre %in% c("Family", "Mystery", "Romance", "Sci-Fi", "Thriller", "Western"))] <- "Other"

# Check to make sure countries are > 30 objects in data set and that conversion worked properly.
table(data$country)
## 
##      Australia         Canada         France        Germany          Other 
##             42            109            105             85            266 
## United Kingdom  United States 
##            492           4314
# Remove films without any budget information and focus only on MPAA ratings instead of TV or other entries on the list.
data <- data %>%
  filter(data$budget != 0) %>%
  filter(data$rating %in% c("G", "PG", "PG-13", "R"))

# Data Exploration and Visualization:
barplot(table(data$rating) / 5344, main = "Rating Distribution", xlab = "Movie Rating", col = "#1C4E80")

table(data$rating)
## 
##     G    PG PG-13     R 
##   111   904  1728  2594
barplot(table(data$year) / 5344, xlab = "Decade of Release", col = "#1C4E80")

table(data$year)
## 
## 1980s 1990s 2000s 2010s 
##   877  1347  1620  1493
barplot(table(data$genre) / 5344,xlab = "Film Genre", col = "#1C4E80")

table(data$genre)
## 
##    Action Adventure Animation Biography    Comedy     Crime     Drama   Fantasy 
##      1405       324       276       308      1480       388       831        39 
##    Horror     Other 
##       245        41
par(mar=c(5, 4, 4, 2))
barplot(table(data$country) / 5344, xlab = "Film Country", col = "#1C4E80", cex.names = .7)

table(data$country)
## 
##      Australia         Canada         France        Germany          Other 
##             39            104             99             85            245 
## United Kingdom  United States 
##            481           4284
# Remove variables that aren't of interest and convert character variables into factors.
data <- data[-c(1, 5, 8, 9, 10, 14)]
data[sapply(data, is.character)] <- lapply(data[sapply(data, is.character)], as.factor)
# Plot regression model of all variables.
model <- lm(data = data, gross ~ budget + rating + genre + year + score + runtime + country)
summary(model)
## 
## Call:
## lm(formula = gross ~ budget + rating + genre + year + score + 
##     runtime + country, data = data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -530205020  -47689956   -8709881   31288873 2056579630 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -2.655e+08  2.695e+07  -9.849  < 2e-16 ***
## budget                 3.138e+00  5.351e-02  58.633  < 2e-16 ***
## ratingPG              -9.310e+05  1.311e+07  -0.071  0.94339    
## ratingPG-13           -3.925e+06  1.380e+07  -0.284  0.77610    
## ratingR               -2.289e+07  1.370e+07  -1.671  0.09475 .  
## genreAdventure         2.947e+06  7.777e+06   0.379  0.70472    
## genreAnimation         3.051e+07  1.011e+07   3.018  0.00256 ** 
## genreBiography        -3.590e+07  8.106e+06  -4.429 9.67e-06 ***
## genreComedy            7.720e+06  4.824e+06   1.600  0.10961    
## genreCrime            -8.489e+06  7.232e+06  -1.174  0.24055    
## genreDrama            -1.080e+07  5.657e+06  -1.909  0.05625 .  
## genreFantasy           1.808e+07  1.957e+07   0.924  0.35558    
## genreHorror            5.496e+07  8.636e+06   6.364 2.13e-10 ***
## genreOther             4.217e+07  1.908e+07   2.210  0.02712 *  
## year1990s             -1.535e+07  5.341e+06  -2.875  0.00406 ** 
## year2000s             -1.709e+07  5.384e+06  -3.174  0.00151 ** 
## year2010s              1.500e+07  5.561e+06   2.698  0.00700 ** 
## score                  3.826e+07  1.985e+06  19.274  < 2e-16 ***
## runtime                1.094e+05  1.202e+05   0.910  0.36287    
## countryCanada          1.857e+07  2.259e+07   0.822  0.41104    
## countryFrance         -1.782e+07  2.275e+07  -0.783  0.43353    
## countryGermany        -9.980e+06  2.333e+07  -0.428  0.66882    
## countryOther           3.880e+06  2.073e+07   0.187  0.85154    
## countryUnited Kingdom  4.575e+06  2.001e+07   0.229  0.81915    
## countryUnited States   1.737e+07  1.935e+07   0.898  0.36923    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 119900000 on 5312 degrees of freedom
## Multiple R-squared:  0.5956, Adjusted R-squared:  0.5938 
## F-statistic:   326 on 24 and 5312 DF,  p-value: < 2.2e-16
plot(model)

# Use a Box-Cox transformation for the sake of both the regression model normality as well as to improve the relationship between budget and revenue/gross.
bc <- boxcox(data = data, gross ~ budget + rating + genre + year + score + runtime + country)
boxcox(data = data, gross ~ budget + rating + genre + year + score + runtime + country)

lambda <- bc$x[which.max(bc$y)]
lambda
## [1] 0.2222222
spreadLevelPlot(model)
## Warning in spreadLevelPlot.lm(model): 
## 838 negative fitted values removed

## 
## Suggested power transformation:  0.2686499
# Both lambda and the suggested power transformation of spreadLevelPlot can be rounded to about 0.25

# Transform both variables to improve their linear relationship.
x <- log(data$budget)
y <- (data$gross^0.25)
hist(y)

# Scatter plot for linear relationships
# Potentially linear relationship but debatable at best.
plot(data$budget, data$gross, xlab = "Film Budget", ylab = "Film Revenue")

# Potentially linear relationship.
plot(data$budget, y, xlab = "Film Budget", ylab = "Transformed Film Revenue")

# Potentially linear relationship.
plot(x, y, xlab = "Natural Log of Film Budget", ylab = "Transformed Film Revenue")

# No linear relationship, even when adjusted using natural log of runtime.
plot(data$runtime, y, xlab = "Film Runtime", ylab = "Transformed Film Revenue")

# Weak correlation
plot(data$year, y, xlab = "Decade of Film Release Date", ylab = "Transformed Film Revenue")

# Seemingly little to no correlation.
plot(data$rating, y, xlab = "Film MPAA Rating", ylab = "Transformed Film Revenue")

# Arguably linear relationship but nothing clear and can be argued both ways.
plot(data$score, y, xlab = "Film Score", ylab = "Transformed Film Revenue")

# No correlation
plot(data$genre, y, xlab = "Film Genre", ylab = "Transformed Film Revenue")

# No correlation
plot(data$country, y, xlab = "Country of Origin", ylab = "Transformed Film Revenue")

# Transform the original gross column variable.
data$gross <- y

# Using ANOVA and a linear regression model, all categorical values have a relatively small correlation..
model <- aov(data = data, gross ~ rating)
summary(model)
##               Df  Sum Sq Mean Sq F value Pr(>F)    
## rating         3  625698  208566   199.6 <2e-16 ***
## Residuals   5333 5571627    1045                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model <- lm(data = data, gross ~ rating)
summary(model)
## 
## Call:
## lm(formula = gross ~ rating, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -89.295 -22.960  -3.463  20.415 137.509 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   99.880      3.068  32.556  < 2e-16 ***
## ratingPG     -10.239      3.251  -3.150  0.00164 ** 
## ratingPG-13   -6.392      3.165  -2.020  0.04347 *  
## ratingR      -28.801      3.133  -9.193  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.32 on 5333 degrees of freedom
## Multiple R-squared:  0.101,  Adjusted R-squared:  0.1005 
## F-statistic: 199.6 on 3 and 5333 DF,  p-value: < 2.2e-16
model <- aov(data = data, gross ~ genre)
summary(model)
##               Df  Sum Sq Mean Sq F value Pr(>F)    
## genre          9  792715   88079   86.81 <2e-16 ***
## Residuals   5327 5404609    1015                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model <- lm(data = data, gross ~ genre)
summary(model)
## 
## Call:
## lm(formula = gross ~ genre, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -101.606  -22.779   -2.258   20.511  144.302 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     94.8720     0.8498 111.644  < 2e-16 ***
## genreAdventure  -7.9755     1.9630  -4.063 4.92e-05 ***
## genreAnimation  19.9365     2.0972   9.506  < 2e-16 ***
## genreBiography -19.0173     2.0040  -9.489  < 2e-16 ***
## genreComedy    -20.2655     1.1864 -17.081  < 2e-16 ***
## genreCrime     -25.8743     1.8267 -14.164  < 2e-16 ***
## genreDrama     -22.5601     1.3939 -16.185  < 2e-16 ***
## genreFantasy   -25.7157     5.1708  -4.973 6.79e-07 ***
## genreHorror    -19.7419     2.2053  -8.952  < 2e-16 ***
## genreOther     -17.7581     5.0466  -3.519 0.000437 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.85 on 5327 degrees of freedom
## Multiple R-squared:  0.1279, Adjusted R-squared:  0.1264 
## F-statistic: 86.81 on 9 and 5327 DF,  p-value: < 2.2e-16
# Little to no correlation, remove from model.
# model <- aov(data = data, gross ~ country)
# summary(model)
# 
# model <- lm(data = data, gross ~ country)
# summary(model)



model <- aov(data = data, gross ~ year)
summary(model)
##               Df  Sum Sq Mean Sq F value Pr(>F)    
## year           3  747042  249014   243.7 <2e-16 ***
## Residuals   5333 5450282    1022                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model <- lm(data = data, gross ~ year)
summary(model)
## 
## Call:
## lm(formula = gross ~ year, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -86.18 -21.27  -2.82  18.59 144.50 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   64.010      1.080  59.296  < 2e-16 ***
## year1990s      8.099      1.387   5.839 5.57e-09 ***
## year2000s     23.556      1.340  17.576  < 2e-16 ***
## year2010s     31.716      1.360  23.319  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.97 on 5333 degrees of freedom
## Multiple R-squared:  0.1205, Adjusted R-squared:   0.12 
## F-statistic: 243.7 on 3 and 5333 DF,  p-value: < 2.2e-16
# Using correlation test, all continuous values significant.
cor.test(data$gross, log(data$budget), method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data$gross and log(data$budget)
## t = 65.05, df = 5335, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6498397 0.6797740
## sample estimates:
##       cor 
## 0.6650739
cor.test(data$gross, data$runtime, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data$gross and data$runtime
## t = 21.402, df = 5335, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2562975 0.3057171
## sample estimates:
##       cor 
## 0.2811937
cor.test(data$gross, data$score, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data$gross and data$score
## t = 17.408, df = 5335, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2062896 0.2570670
## sample estimates:
##       cor 
## 0.2318362
model <- lm(gross ~ log(budget) + rating + genre + year + score + runtime + country, data = data)
summary(model)
## 
## Call:
## lm(formula = gross ~ log(budget) + rating + genre + year + score + 
##     runtime + country, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -95.833 -14.623  -1.206  13.845 127.071 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -231.56708    6.84126 -33.849  < 2e-16 ***
## log(budget)             14.40953    0.32739  44.013  < 2e-16 ***
## ratingPG                -1.32415    2.40536  -0.550 0.582001    
## ratingPG-13             -2.97914    2.53137  -1.177 0.239292    
## ratingR                -11.79336    2.51793  -4.684 2.89e-06 ***
## genreAdventure          -5.21255    1.42557  -3.656 0.000258 ***
## genreAnimation           4.69372    1.84785   2.540 0.011110 *  
## genreBiography         -19.17150    1.45444 -13.181  < 2e-16 ***
## genreComedy             -5.34449    0.87251  -6.125 9.69e-10 ***
## genreCrime             -11.71011    1.31506  -8.905  < 2e-16 ***
## genreDrama             -12.59764    1.02164 -12.331  < 2e-16 ***
## genreFantasy             0.94573    3.59049   0.263 0.792254    
## genreHorror             11.97662    1.59835   7.493 7.83e-14 ***
## genreOther              -7.41962    3.49629  -2.122 0.033872 *  
## year1990s               -0.62151    0.99129  -0.627 0.530707    
## year2000s                8.64336    1.00133   8.632  < 2e-16 ***
## year2010s               14.63787    1.01674  14.397  < 2e-16 ***
## score                    9.53386    0.36739  25.950  < 2e-16 ***
## runtime                  0.11101    0.02239   4.959 7.32e-07 ***
## countryCanada            1.05023    4.14539   0.253 0.800009    
## countryFrance           -9.17798    4.17329  -2.199 0.027905 *  
## countryGermany          -1.06873    4.28189  -0.250 0.802912    
## countryOther            -3.27725    3.80721  -0.861 0.389386    
## countryUnited Kingdom   -2.14282    3.67105  -0.584 0.559443    
## countryUnited States     5.42071    3.54910   1.527 0.126733    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.99 on 5312 degrees of freedom
## Multiple R-squared:  0.5855, Adjusted R-squared:  0.5836 
## F-statistic: 312.6 on 24 and 5312 DF,  p-value: < 2.2e-16
plot(model)

model <- lm(gross ~ log(budget) + rating + year + score, data = data)
summary(model)
## 
## Call:
## lm(formula = gross ~ log(budget) + rating + year + score, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -98.93 -16.01  -1.13  14.59 154.93 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -233.3247     5.6144 -41.558  < 2e-16 ***
## log(budget)   15.9924     0.2850  56.114  < 2e-16 ***
## ratingPG      -5.2214     2.3387  -2.233  0.02561 *  
## ratingPG-13   -7.1439     2.2764  -3.138  0.00171 ** 
## ratingR      -15.2686     2.2611  -6.753 1.61e-11 ***
## year1990s     -2.4337     1.0340  -2.354  0.01863 *  
## year2000s      6.3498     1.0304   6.162 7.69e-10 ***
## year2010s     13.1841     1.0473  12.589  < 2e-16 ***
## score          8.1670     0.3325  24.565  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.14 on 5328 degrees of freedom
## Multiple R-squared:  0.5396, Adjusted R-squared:  0.5389 
## F-statistic: 780.6 on 8 and 5328 DF,  p-value: < 2.2e-16
plot(model)

# No change to the model
# model <- stepAIC(model, direction = "backward")
# summary(model)
# plot(model)

# Check for multicollinearity
vif_values <- vif(model)
barplot(vif_values[, 3], main = "VIF Values", horiz = TRUE, col = "steelblue", xlim = c(0,3))
#add vertical line at 2.5
abline(v = 2.5, lwd = 3, lty = 2)

# Normality
qqPlot(model, main="QQ Plot")

## [1] 2079 3419
sresid <- studres(model)
hist(sresid, freq = F,
   main = "Distribution of Studentized Residuals")
xfit <- seq(min(sresid), max(sresid), length = 40)
yfit <- dnorm(xfit)
lines(xfit, yfit) 

# Constant error variance
ncvTest(model)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 14.83126, Df = 1, p = 0.00011757
spreadLevelPlot(model)
## Warning in spreadLevelPlot.lm(model): 
## 22 negative fitted values removed

## 
## Suggested power transformation:  0.9964055
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 109.78, df = 8, p-value < 2.2e-16
# Already checked the multi collinearity assumption earlier, this is just a simpler version of that check.
# Kept the original check as it looked better from a visual standpoint with regards to presentation.
# vif(model)
# sqrt(vif(model)) > 2

# Linearity
crPlots(model)

ceresPlots(model)
## Warning in ceresPlots.default(model): Factors skipped in drawing CERES plots.

# Independence of Error terms
# Value of 1.70 is relatively acceptable and isn't a serious concern regarding autocorrelation..
durbinWatsonTest(model)$dw
## [1] 1.65762
wt <- 1 / lm(abs(model$residuals) ~ model$fitted.values)$fitted.values^2
#perform weighted least squares regression
wls_model <- lm(gross ~ log(budget) + rating + year + score, data = data, weights=wt)
#view summary of model
summary(wls_model)
## 
## Call:
## lm(formula = gross ~ log(budget) + rating + year + score, data = data, 
##     weights = wt)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4737 -0.8809 -0.0607  0.8001  8.4533 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -234.4140     5.6204 -41.707  < 2e-16 ***
## log(budget)   16.0515     0.2855  56.218  < 2e-16 ***
## ratingPG      -5.2083     2.3347  -2.231   0.0257 *  
## ratingPG-13   -7.1345     2.2725  -3.140   0.0017 ** 
## ratingR      -15.2569     2.2574  -6.759 1.54e-11 ***
## year1990s     -2.4522     1.0354  -2.368   0.0179 *  
## year2000s      6.3185     1.0314   6.126 9.64e-10 ***
## year2010s     13.1637     1.0480  12.560  < 2e-16 ***
## score          8.1833     0.3326  24.606  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.273 on 5328 degrees of freedom
## Multiple R-squared:  0.5406, Adjusted R-squared:  0.5399 
## F-statistic: 783.7 on 8 and 5328 DF,  p-value: < 2.2e-16
plot(wls_model)