At this point, I’ve already removed columns that were otherwise redundant or unnecessary to the study, these columns included information like number of reviews which wasn’t needed if the average review score of each movie was already provided. Other factors I opted to ignore in addition to number of reviews were film summaries and alternate titles that were more of a fit for studies regarding machine learning and recommender systems than analysis.
data <- read_csv("movies.csv", show_col_types = F)
data <- na.omit(data)
# Rather than include each individual year from 1980 - 2020, I opted to change the years variable into decades of release for better analysis.
# Additionally, because 2020 only has 7 entries, I opted to omit it from the data set.
data <- data[data$year < 2020,]
data$year[data$year >= 2010 & data$year < 2020] <- "2010s"
data$year[data$year >= 2000 & data$year < 2010] <- "2000s"
data$year[data$year >= 1990 & data$year < 2000] <- "1990s"
data$year[data$year >= 1980 & data$year < 1990] <- "1980s"
# Combine the two values for simplification.
data$country[data$country == "West Germany"] <- "Germany"
# Convert countries with < 30 objects into a new variable, "Other" for simplification.
data$country[!(data$country %in% c("Australia", "Canada", "France", "Germany", "United Kingdom", "United States"))] <- "Other"
# Convert genres with < 30 objects into a new variable, "Other" for simplification.
data$genre[(data$genre %in% c("Family", "Mystery", "Romance", "Sci-Fi", "Thriller", "Western"))] <- "Other"
# Check to make sure countries are > 30 objects in data set and that conversion worked properly.
table(data$country)
##
## Australia Canada France Germany Other
## 42 109 105 85 266
## United Kingdom United States
## 492 4314
# Remove films without any budget information and focus only on MPAA ratings instead of TV or other entries on the list.
data <- data %>%
filter(data$budget != 0) %>%
filter(data$rating %in% c("G", "PG", "PG-13", "R"))
# Data Exploration and Visualization:
barplot(table(data$rating) / 5344, main = "Rating Distribution", xlab = "Movie Rating", col = "#1C4E80")
table(data$rating)
##
## G PG PG-13 R
## 111 904 1728 2594
barplot(table(data$year) / 5344, xlab = "Decade of Release", col = "#1C4E80")
table(data$year)
##
## 1980s 1990s 2000s 2010s
## 877 1347 1620 1493
barplot(table(data$genre) / 5344,xlab = "Film Genre", col = "#1C4E80")
table(data$genre)
##
## Action Adventure Animation Biography Comedy Crime Drama Fantasy
## 1405 324 276 308 1480 388 831 39
## Horror Other
## 245 41
par(mar=c(5, 4, 4, 2))
barplot(table(data$country) / 5344, xlab = "Film Country", col = "#1C4E80", cex.names = .7)
table(data$country)
##
## Australia Canada France Germany Other
## 39 104 99 85 245
## United Kingdom United States
## 481 4284
# Remove variables that aren't of interest and convert character variables into factors.
data <- data[-c(1, 5, 8, 9, 10, 14)]
data[sapply(data, is.character)] <- lapply(data[sapply(data, is.character)], as.factor)
# Plot regression model of all variables.
model <- lm(data = data, gross ~ budget + rating + genre + year + score + runtime + country)
summary(model)
##
## Call:
## lm(formula = gross ~ budget + rating + genre + year + score +
## runtime + country, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -530205020 -47689956 -8709881 31288873 2056579630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.655e+08 2.695e+07 -9.849 < 2e-16 ***
## budget 3.138e+00 5.351e-02 58.633 < 2e-16 ***
## ratingPG -9.310e+05 1.311e+07 -0.071 0.94339
## ratingPG-13 -3.925e+06 1.380e+07 -0.284 0.77610
## ratingR -2.289e+07 1.370e+07 -1.671 0.09475 .
## genreAdventure 2.947e+06 7.777e+06 0.379 0.70472
## genreAnimation 3.051e+07 1.011e+07 3.018 0.00256 **
## genreBiography -3.590e+07 8.106e+06 -4.429 9.67e-06 ***
## genreComedy 7.720e+06 4.824e+06 1.600 0.10961
## genreCrime -8.489e+06 7.232e+06 -1.174 0.24055
## genreDrama -1.080e+07 5.657e+06 -1.909 0.05625 .
## genreFantasy 1.808e+07 1.957e+07 0.924 0.35558
## genreHorror 5.496e+07 8.636e+06 6.364 2.13e-10 ***
## genreOther 4.217e+07 1.908e+07 2.210 0.02712 *
## year1990s -1.535e+07 5.341e+06 -2.875 0.00406 **
## year2000s -1.709e+07 5.384e+06 -3.174 0.00151 **
## year2010s 1.500e+07 5.561e+06 2.698 0.00700 **
## score 3.826e+07 1.985e+06 19.274 < 2e-16 ***
## runtime 1.094e+05 1.202e+05 0.910 0.36287
## countryCanada 1.857e+07 2.259e+07 0.822 0.41104
## countryFrance -1.782e+07 2.275e+07 -0.783 0.43353
## countryGermany -9.980e+06 2.333e+07 -0.428 0.66882
## countryOther 3.880e+06 2.073e+07 0.187 0.85154
## countryUnited Kingdom 4.575e+06 2.001e+07 0.229 0.81915
## countryUnited States 1.737e+07 1.935e+07 0.898 0.36923
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 119900000 on 5312 degrees of freedom
## Multiple R-squared: 0.5956, Adjusted R-squared: 0.5938
## F-statistic: 326 on 24 and 5312 DF, p-value: < 2.2e-16
plot(model)
# Use a Box-Cox transformation for the sake of both the regression model normality as well as to improve the relationship between budget and revenue/gross.
bc <- boxcox(data = data, gross ~ budget + rating + genre + year + score + runtime + country)
boxcox(data = data, gross ~ budget + rating + genre + year + score + runtime + country)
lambda <- bc$x[which.max(bc$y)]
lambda
## [1] 0.2222222
spreadLevelPlot(model)
## Warning in spreadLevelPlot.lm(model):
## 838 negative fitted values removed
##
## Suggested power transformation: 0.2686499
# Both lambda and the suggested power transformation of spreadLevelPlot can be rounded to about 0.25
# Transform both variables to improve their linear relationship.
x <- log(data$budget)
y <- (data$gross^0.25)
hist(y)
# Scatter plot for linear relationships
# Potentially linear relationship but debatable at best.
plot(data$budget, data$gross, xlab = "Film Budget", ylab = "Film Revenue")
# Potentially linear relationship.
plot(data$budget, y, xlab = "Film Budget", ylab = "Transformed Film Revenue")
# Potentially linear relationship.
plot(x, y, xlab = "Natural Log of Film Budget", ylab = "Transformed Film Revenue")
# No linear relationship, even when adjusted using natural log of runtime.
plot(data$runtime, y, xlab = "Film Runtime", ylab = "Transformed Film Revenue")
# Weak correlation
plot(data$year, y, xlab = "Decade of Film Release Date", ylab = "Transformed Film Revenue")
# Seemingly little to no correlation.
plot(data$rating, y, xlab = "Film MPAA Rating", ylab = "Transformed Film Revenue")
# Arguably linear relationship but nothing clear and can be argued both ways.
plot(data$score, y, xlab = "Film Score", ylab = "Transformed Film Revenue")
# No correlation
plot(data$genre, y, xlab = "Film Genre", ylab = "Transformed Film Revenue")
# No correlation
plot(data$country, y, xlab = "Country of Origin", ylab = "Transformed Film Revenue")
# Transform the original gross column variable.
data$gross <- y
# Using ANOVA and a linear regression model, all categorical values have a relatively small correlation..
model <- aov(data = data, gross ~ rating)
summary(model)
## Df Sum Sq Mean Sq F value Pr(>F)
## rating 3 625698 208566 199.6 <2e-16 ***
## Residuals 5333 5571627 1045
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model <- lm(data = data, gross ~ rating)
summary(model)
##
## Call:
## lm(formula = gross ~ rating, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89.295 -22.960 -3.463 20.415 137.509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.880 3.068 32.556 < 2e-16 ***
## ratingPG -10.239 3.251 -3.150 0.00164 **
## ratingPG-13 -6.392 3.165 -2.020 0.04347 *
## ratingR -28.801 3.133 -9.193 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.32 on 5333 degrees of freedom
## Multiple R-squared: 0.101, Adjusted R-squared: 0.1005
## F-statistic: 199.6 on 3 and 5333 DF, p-value: < 2.2e-16
model <- aov(data = data, gross ~ genre)
summary(model)
## Df Sum Sq Mean Sq F value Pr(>F)
## genre 9 792715 88079 86.81 <2e-16 ***
## Residuals 5327 5404609 1015
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model <- lm(data = data, gross ~ genre)
summary(model)
##
## Call:
## lm(formula = gross ~ genre, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101.606 -22.779 -2.258 20.511 144.302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94.8720 0.8498 111.644 < 2e-16 ***
## genreAdventure -7.9755 1.9630 -4.063 4.92e-05 ***
## genreAnimation 19.9365 2.0972 9.506 < 2e-16 ***
## genreBiography -19.0173 2.0040 -9.489 < 2e-16 ***
## genreComedy -20.2655 1.1864 -17.081 < 2e-16 ***
## genreCrime -25.8743 1.8267 -14.164 < 2e-16 ***
## genreDrama -22.5601 1.3939 -16.185 < 2e-16 ***
## genreFantasy -25.7157 5.1708 -4.973 6.79e-07 ***
## genreHorror -19.7419 2.2053 -8.952 < 2e-16 ***
## genreOther -17.7581 5.0466 -3.519 0.000437 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.85 on 5327 degrees of freedom
## Multiple R-squared: 0.1279, Adjusted R-squared: 0.1264
## F-statistic: 86.81 on 9 and 5327 DF, p-value: < 2.2e-16
# Little to no correlation, remove from model.
# model <- aov(data = data, gross ~ country)
# summary(model)
#
# model <- lm(data = data, gross ~ country)
# summary(model)
model <- aov(data = data, gross ~ year)
summary(model)
## Df Sum Sq Mean Sq F value Pr(>F)
## year 3 747042 249014 243.7 <2e-16 ***
## Residuals 5333 5450282 1022
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model <- lm(data = data, gross ~ year)
summary(model)
##
## Call:
## lm(formula = gross ~ year, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86.18 -21.27 -2.82 18.59 144.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.010 1.080 59.296 < 2e-16 ***
## year1990s 8.099 1.387 5.839 5.57e-09 ***
## year2000s 23.556 1.340 17.576 < 2e-16 ***
## year2010s 31.716 1.360 23.319 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.97 on 5333 degrees of freedom
## Multiple R-squared: 0.1205, Adjusted R-squared: 0.12
## F-statistic: 243.7 on 3 and 5333 DF, p-value: < 2.2e-16
# Using correlation test, all continuous values significant.
cor.test(data$gross, log(data$budget), method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data$gross and log(data$budget)
## t = 65.05, df = 5335, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6498397 0.6797740
## sample estimates:
## cor
## 0.6650739
cor.test(data$gross, data$runtime, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data$gross and data$runtime
## t = 21.402, df = 5335, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2562975 0.3057171
## sample estimates:
## cor
## 0.2811937
cor.test(data$gross, data$score, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data$gross and data$score
## t = 17.408, df = 5335, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2062896 0.2570670
## sample estimates:
## cor
## 0.2318362
model <- lm(gross ~ log(budget) + rating + genre + year + score + runtime + country, data = data)
summary(model)
##
## Call:
## lm(formula = gross ~ log(budget) + rating + genre + year + score +
## runtime + country, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -95.833 -14.623 -1.206 13.845 127.071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -231.56708 6.84126 -33.849 < 2e-16 ***
## log(budget) 14.40953 0.32739 44.013 < 2e-16 ***
## ratingPG -1.32415 2.40536 -0.550 0.582001
## ratingPG-13 -2.97914 2.53137 -1.177 0.239292
## ratingR -11.79336 2.51793 -4.684 2.89e-06 ***
## genreAdventure -5.21255 1.42557 -3.656 0.000258 ***
## genreAnimation 4.69372 1.84785 2.540 0.011110 *
## genreBiography -19.17150 1.45444 -13.181 < 2e-16 ***
## genreComedy -5.34449 0.87251 -6.125 9.69e-10 ***
## genreCrime -11.71011 1.31506 -8.905 < 2e-16 ***
## genreDrama -12.59764 1.02164 -12.331 < 2e-16 ***
## genreFantasy 0.94573 3.59049 0.263 0.792254
## genreHorror 11.97662 1.59835 7.493 7.83e-14 ***
## genreOther -7.41962 3.49629 -2.122 0.033872 *
## year1990s -0.62151 0.99129 -0.627 0.530707
## year2000s 8.64336 1.00133 8.632 < 2e-16 ***
## year2010s 14.63787 1.01674 14.397 < 2e-16 ***
## score 9.53386 0.36739 25.950 < 2e-16 ***
## runtime 0.11101 0.02239 4.959 7.32e-07 ***
## countryCanada 1.05023 4.14539 0.253 0.800009
## countryFrance -9.17798 4.17329 -2.199 0.027905 *
## countryGermany -1.06873 4.28189 -0.250 0.802912
## countryOther -3.27725 3.80721 -0.861 0.389386
## countryUnited Kingdom -2.14282 3.67105 -0.584 0.559443
## countryUnited States 5.42071 3.54910 1.527 0.126733
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.99 on 5312 degrees of freedom
## Multiple R-squared: 0.5855, Adjusted R-squared: 0.5836
## F-statistic: 312.6 on 24 and 5312 DF, p-value: < 2.2e-16
plot(model)
model <- lm(gross ~ log(budget) + rating + year + score, data = data)
summary(model)
##
## Call:
## lm(formula = gross ~ log(budget) + rating + year + score, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.93 -16.01 -1.13 14.59 154.93
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -233.3247 5.6144 -41.558 < 2e-16 ***
## log(budget) 15.9924 0.2850 56.114 < 2e-16 ***
## ratingPG -5.2214 2.3387 -2.233 0.02561 *
## ratingPG-13 -7.1439 2.2764 -3.138 0.00171 **
## ratingR -15.2686 2.2611 -6.753 1.61e-11 ***
## year1990s -2.4337 1.0340 -2.354 0.01863 *
## year2000s 6.3498 1.0304 6.162 7.69e-10 ***
## year2010s 13.1841 1.0473 12.589 < 2e-16 ***
## score 8.1670 0.3325 24.565 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.14 on 5328 degrees of freedom
## Multiple R-squared: 0.5396, Adjusted R-squared: 0.5389
## F-statistic: 780.6 on 8 and 5328 DF, p-value: < 2.2e-16
plot(model)
# No change to the model
# model <- stepAIC(model, direction = "backward")
# summary(model)
# plot(model)
# Check for multicollinearity
vif_values <- vif(model)
barplot(vif_values[, 3], main = "VIF Values", horiz = TRUE, col = "steelblue", xlim = c(0,3))
#add vertical line at 2.5
abline(v = 2.5, lwd = 3, lty = 2)
# Normality
qqPlot(model, main="QQ Plot")
## [1] 2079 3419
sresid <- studres(model)
hist(sresid, freq = F,
main = "Distribution of Studentized Residuals")
xfit <- seq(min(sresid), max(sresid), length = 40)
yfit <- dnorm(xfit)
lines(xfit, yfit)
# Constant error variance
ncvTest(model)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 14.83126, Df = 1, p = 0.00011757
spreadLevelPlot(model)
## Warning in spreadLevelPlot.lm(model):
## 22 negative fitted values removed
##
## Suggested power transformation: 0.9964055
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 109.78, df = 8, p-value < 2.2e-16
# Already checked the multi collinearity assumption earlier, this is just a simpler version of that check.
# Kept the original check as it looked better from a visual standpoint with regards to presentation.
# vif(model)
# sqrt(vif(model)) > 2
# Linearity
crPlots(model)
ceresPlots(model)
## Warning in ceresPlots.default(model): Factors skipped in drawing CERES plots.
# Independence of Error terms
# Value of 1.70 is relatively acceptable and isn't a serious concern regarding autocorrelation..
durbinWatsonTest(model)$dw
## [1] 1.65762
wt <- 1 / lm(abs(model$residuals) ~ model$fitted.values)$fitted.values^2
#perform weighted least squares regression
wls_model <- lm(gross ~ log(budget) + rating + year + score, data = data, weights=wt)
#view summary of model
summary(wls_model)
##
## Call:
## lm(formula = gross ~ log(budget) + rating + year + score, data = data,
## weights = wt)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -5.4737 -0.8809 -0.0607 0.8001 8.4533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -234.4140 5.6204 -41.707 < 2e-16 ***
## log(budget) 16.0515 0.2855 56.218 < 2e-16 ***
## ratingPG -5.2083 2.3347 -2.231 0.0257 *
## ratingPG-13 -7.1345 2.2725 -3.140 0.0017 **
## ratingR -15.2569 2.2574 -6.759 1.54e-11 ***
## year1990s -2.4522 1.0354 -2.368 0.0179 *
## year2000s 6.3185 1.0314 6.126 9.64e-10 ***
## year2010s 13.1637 1.0480 12.560 < 2e-16 ***
## score 8.1833 0.3326 24.606 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.273 on 5328 degrees of freedom
## Multiple R-squared: 0.5406, Adjusted R-squared: 0.5399
## F-statistic: 783.7 on 8 and 5328 DF, p-value: < 2.2e-16
plot(wls_model)