# Prof. Lianxi Zhou
# Dustin Pinch (6754550)
# Rob Mantini (6282024)
# Jeffery J. Unrau (7307572)
# Working directory: the CSV paths below are relative, so the script expects
# to run from the project folder.
# NOTE(review): this absolute path is machine-specific; prefer opening the
# project folder (e.g. an RStudio project) instead of calling setwd().
getwd()
setwd("C:/Users/dwp13/Desktop/School/MKTG 3P98/Project 3 Team 8")
library(readr)

# Read the two original data sets.
Movie_Financials <- read_csv("Movie_Financials.csv")
Movie_Aud <- read_csv("Movie Dataset_General Audience.csv")

# Merge the two data sets together for easier analysis.
# Rename the first column of the audience data so that it carries the join
# key name `original_title` (presumably the title column of the financials
# table - confirm against the CSV header).
names(Movie_Aud)[1] <- "original_title"
# `by` is case-sensitive: the previous `By =` was silently absorbed by
# merge()'s `...`, so the join fell back to every column name the two tables
# have in common instead of the title alone.
Movie_Data <- merge(Movie_Aud, Movie_Financials, by = "original_title")
str(Movie_Data)
# Count the missing values in every column of Movie_Data before imputation.
library(dplyr)
na_counts_before <- Movie_Data %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
print(na_counts_before)
#replacing the categorical null values with mode.
# Replace missing values in a categorical vector with its mode.
#
# x: a vector (typically one data-frame column).
# Returns `x` with every NA replaced by the most frequent non-missing value
# when `x` is character or factor. Any other type, a vector without NAs, or a
# vector with no non-missing values is returned unchanged.
replace_na_mode <- function(x) {
  if (!(is.character(x) || is.factor(x))) {
    return(x)
  }
  # table() drops NA by default, so only observed values are counted.
  counts <- table(x)
  if (sum(counts) == 0L || !anyNA(x)) {
    # Nothing to impute with (empty / all NA) or nothing to impute.
    return(x)
  }
  # `decreasing` must be spelled exactly: a misspelt argument is absorbed by
  # `...`, the table is then sorted ascending, and the *least* common value
  # would be used instead of the mode.
  mode_value <- names(sort(counts, decreasing = TRUE))[1]
  replace(x, is.na(x), mode_value)
}
# Impute every character column with replace_na_mode(), then re-count the
# missing values per column.
Movie_Data <- mutate(Movie_Data, across(where(is.character), replace_na_mode))
null_count_post <- Movie_Data %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
print(null_count_post)
# Rank the movies by revenue (highest first) once, then keep the first 25 and
# the first 300 rows of that ranking.
revenue_rank <- order(-Movie_Data$`revenue (Millions)`)
movies_by_revenue <- Movie_Data[revenue_rank, ]
top_25_movies <- movies_by_revenue[1:25, ]
top_300_movies <- movies_by_revenue[1:300, ]

# Mean run time of each group
average_runtime25 <- mean(top_25_movies$runtime)
average_runtime300 <- mean(top_300_movies$runtime)

# Show both averages
print(average_runtime25)
## [1] 106.32
print(average_runtime300)
## [1] 106.0467
library(ggplot2)
# Scatter plot of revenue against run time with a fitted linear trend line.
# `movies` is the data frame to plot and `plot_title` the chart title.
runtime_revenue_plot <- function(movies, plot_title) {
  ggplot(movies, aes(x = runtime, y = `revenue (Millions)`)) +
    geom_point() +
    geom_smooth(method = "lm", col = "blue", se = FALSE) +
    labs(
      title = plot_title,
      x = "Runtime (minutes)",
      y = "Gross Revenue ($Millions)"
    ) +
    theme_minimal()
}

# All movies
runtime_revenue_plot(Movie_Data, "Runtime vs Gross Revenue (All)")
## `geom_smooth()` using formula = 'y ~ x'
# Top 25 grossing movies
runtime_revenue_plot(top_25_movies, "Runtime vs Gross Revenue Top 25 Grossing Movies")
## `geom_smooth()` using formula = 'y ~ x'
# Top 300 grossing movies
runtime_revenue_plot(top_300_movies, "Runtime vs Gross Revenue Top 300 Grossing Movies")
## `geom_smooth()` using formula = 'y ~ x'
# Does run time affect revenue? Top 300 & 25
# Pearson correlation test between run time and revenue, run separately on
# the top 300 and the top 25 grossing movies.
correlation_test_Run_vs_Rev300 <- cor.test(top_300_movies$runtime, top_300_movies$`revenue (Millions)`)
correlation_test_Run_vs_Rev25 <- cor.test(top_25_movies$runtime, top_25_movies$`revenue (Millions)`)
# Print the top-300 test result
print(correlation_test_Run_vs_Rev300)
##
## Pearson's product-moment correlation
##
## data: top_300_movies$runtime and top_300_movies$`revenue (Millions)`
## t = 0.28158, df = 298, p-value = 0.7785
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09711063 0.12931157
## sample estimates:
## cor
## 0.01630956
# Print the top-25 test result
print(correlation_test_Run_vs_Rev25)
##
## Pearson's product-moment correlation
##
## data: top_25_movies$runtime and top_25_movies$`revenue (Millions)`
## t = 1.5145, df = 23, p-value = 0.1435
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1066932 0.6222268
## sample estimates:
## cor
## 0.3011328
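# Neither correlation is statistically significant (p = 0.78 for the top 300, p = 0.14 for the top 25): no evidence of a linear relationship between revenue and run time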
# Does run time affect IMDB ratings?
# Pearson correlation test between run time and IMDB rating on the top 300
# grossing movies.
correlation_test_Run_vs_IMDB <- cor.test(top_300_movies$runtime, top_300_movies$imdb_rating)
# Print the test result
print(correlation_test_Run_vs_IMDB)
##
## Pearson's product-moment correlation
##
## data: top_300_movies$runtime and top_300_movies$imdb_rating
## t = -0.21301, df = 298, p-value = 0.8315
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1254043 0.1010434
## sample estimates:
## cor
## -0.01233864
# No statistically significant correlation between run time and IMDB rating (r = -0.01, p = 0.83)
# Does run time affect critic score?
# Pearson correlation test between run time and critics score on the top 300
# grossing movies.
correlation_test_Run_vs_critic<- cor.test(top_300_movies$runtime, top_300_movies$critics_score)
# Print the test result
print(correlation_test_Run_vs_critic)
##
## Pearson's product-moment correlation
##
## data: top_300_movies$runtime and top_300_movies$critics_score
## t = 1.6556, df = 298, p-value = 0.09885
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01796527 0.20647838
## sample estimates:
## cor
## 0.09546979
# No statistically significant correlation between run time and critics score at the 5% level (r = 0.10, p = 0.099)
# Does run time affect audience score?
# Pearson correlation test between run time and audience score on the top 300
# grossing movies.
correlation_test_Run_vs_aud <- cor.test(top_300_movies$runtime, top_300_movies$audience_score)
# Print the test result
print(correlation_test_Run_vs_aud)
##
## Pearson's product-moment correlation
##
## data: top_300_movies$runtime and top_300_movies$audience_score
## t = 1.2982, df = 298, p-value = 0.1952
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03857547 0.18664873
## sample estimates:
## cor
## 0.07499297
# No statistically significant correlation between run time and audience score (r = 0.07, p = 0.20)
# Dividing the data into one table per genre.
library(dplyr)

# Rows of Movie_Data whose `genre` matches the regular expression
# `genre_pattern`.
movies_in_genre <- function(genre_pattern) {
  Movie_Data %>% filter(grepl(genre_pattern, genre))
}

Action_Adventure <- movies_in_genre("Action & Adventure")
Animation <- movies_in_genre("Animation")
ArtHouse_International <- movies_in_genre("Art House & International")
Comedy <- movies_in_genre("Comedy")
Documentary <- movies_in_genre("Documentary")
Drama <- movies_in_genre("Drama")
Horror <- movies_in_genre("Horror")
Musical_PerformingArts <- movies_in_genre("Musical & Performing Arts")
Mystery_Suspense <- movies_in_genre("Mystery & Suspense")
Other <- movies_in_genre("Other")
Science_FictionFantasy <- movies_in_genre("Science Fiction & Fantasy")
# Any of these tables can be inspected interactively, e.g.
# View(Action_Adventure)
# Running a simple regression of revenue on run time for each genre.
# Action & Adventure
# Bare column names are resolved through `data`, so the fit is the same as
# with `df$col` terms, but the run-time coefficient is labelled `runtime` and
# the model can be used with predict(newdata = ...).
RegActionAdventure <- lm(`revenue (Millions)` ~ runtime, data = Action_Adventure)
summary(RegActionAdventure)
##
## Call:
## lm(formula = Action_Adventure$`revenue (Millions)` ~ Action_Adventure$runtime,
## data = Action_Adventure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -194.47 -76.17 -49.43 32.23 671.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -227.890 120.139 -1.897 0.06243 .
## Action_Adventure$runtime 3.439 1.143 3.009 0.00377 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 152.2 on 63 degrees of freedom
## Multiple R-squared: 0.1257, Adjusted R-squared: 0.1118
## F-statistic: 9.054 on 1 and 63 DF, p-value: 0.003765
# The model predicts revenue from run time, so run time goes on the x-axis and
# revenue on the y-axis; with the axes swapped abline() draws the fitted line
# in the wrong coordinate system.
plot(Action_Adventure$runtime, Action_Adventure$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue",
     main = "Run Time Impact on Revenue for Action & Adventure")
abline(RegActionAdventure, col = "Red")
# Fitted equation: revenue = -227.890 + 3.439 * run time (minutes).
# R-squared = 0.1257: run time explains about 12.6% of the variance in revenue.
# Animation: regress revenue on run time. Columns are named bare and resolved
# through `data` (same fit as the `df$col` form, coefficient labelled `runtime`).
RegAnimation <- lm(`revenue (Millions)` ~ runtime, data = Animation)
summary(RegAnimation)
##
## Call:
## lm(formula = Animation$`revenue (Millions)` ~ Animation$runtime,
## data = Animation)
##
## Residuals:
## Min 1Q Median 3Q Max
## -284.63 -124.12 24.36 100.93 281.73
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1309.965 485.428 -2.699 0.0307 *
## Animation$runtime 17.731 5.517 3.214 0.0148 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 192 on 7 degrees of freedom
## Multiple R-squared: 0.5961, Adjusted R-squared: 0.5384
## F-statistic: 10.33 on 1 and 7 DF, p-value: 0.01478
# Predictor (run time) on x, response (revenue) on y, so that abline() draws
# the fitted line in the same coordinates as the points.
plot(Animation$runtime, Animation$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue",
     main = "Run Time Impact on Revenue for Animation")
abline(RegAnimation, col = "Red")
# Fitted equation: revenue = -1309.965 + 17.731 * run time (minutes).
# R-squared = 0.5961: run time explains about 60% of the variance in revenue
# (only 9 movies, so treat with caution).
# Art House & International: regress revenue on run time. Columns are named
# bare and resolved through `data` (same fit as the `df$col` form, coefficient
# labelled `runtime`).
RegArtHouse_International <- lm(`revenue (Millions)` ~ runtime, data = ArtHouse_International)
summary(RegArtHouse_International)
##
## Call:
## lm(formula = ArtHouse_International$`revenue (Millions)` ~ ArtHouse_International$runtime,
## data = ArtHouse_International)
##
## Residuals:
## Min 1Q Median 3Q Max
## -120.304 -35.954 0.942 42.550 123.999
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -524.031 166.558 -3.146 0.00843 **
## ArtHouse_International$runtime 6.371 1.618 3.939 0.00197 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78.48 on 12 degrees of freedom
## Multiple R-squared: 0.5638, Adjusted R-squared: 0.5275
## F-statistic: 15.51 on 1 and 12 DF, p-value: 0.001968
# Predictor (run time) on x, response (revenue) on y, so that abline() draws
# the fitted line in the same coordinates as the points.
plot(ArtHouse_International$runtime, ArtHouse_International$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue",
     main = "Run Time Impact on Revenue for Art House & International")
abline(RegArtHouse_International, col = "Red")
# Fitted equation: revenue = -524.031 + 6.371 * run time (minutes).
# R-squared = 0.5638: run time explains about 56.4% of the variance in revenue
# for this genre.
# Comedy: regress revenue on run time. Columns are named bare and resolved
# through `data` (same fit as the `df$col` form, coefficient labelled `runtime`).
Reg_Comedy <- lm(`revenue (Millions)` ~ runtime, data = Comedy)
summary(Reg_Comedy)
##
## Call:
## lm(formula = Comedy$`revenue (Millions)` ~ Comedy$runtime, data = Comedy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -190.79 -109.13 -79.09 20.99 1246.84
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.514 191.676 -0.034 0.973
## Comedy$runtime 1.484 1.965 0.755 0.452
##
## Residual standard error: 213.2 on 87 degrees of freedom
## Multiple R-squared: 0.006514, Adjusted R-squared: -0.004905
## F-statistic: 0.5705 on 1 and 87 DF, p-value: 0.4521
# Predictor (run time) on x, response (revenue) on y, so that abline() draws
# the fitted line in the same coordinates as the points.
plot(Comedy$runtime, Comedy$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue",
     main = "Run Time Impact on Revenue for Comedy")
abline(Reg_Comedy, col = "Red")
# Fitted equation: revenue = -6.514 + 1.484 * run time (minutes).
# R-squared = 0.006514: run time explains about 0.65% of the variance in
# revenue for comedy, and the slope is not significant (p = 0.452).
# Documentary: regress revenue on run time. Columns are named bare and
# resolved through `data` (same fit as the `df$col` form, coefficient labelled
# `runtime`).
Reg_Documentary <- lm(`revenue (Millions)` ~ runtime, data = Documentary)
summary(Reg_Documentary)
##
## Call:
## lm(formula = Documentary$`revenue (Millions)` ~ Documentary$runtime,
## data = Documentary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149.41 -119.44 -83.04 -3.94 1021.95
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 163.0328 99.6051 1.637 0.108
## Documentary$runtime -0.3210 0.9773 -0.328 0.744
##
## Residual standard error: 227 on 53 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.002031, Adjusted R-squared: -0.0168
## F-statistic: 0.1079 on 1 and 53 DF, p-value: 0.7439
# Predictor (run time) on x, response (revenue) on y, so that abline() draws
# the fitted line in the same coordinates as the points.
plot(Documentary$runtime, Documentary$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue",
     main = "Run Time Impact on Revenue for Documentary")
abline(Reg_Documentary, col = "Red")
# Fitted equation (from the recorded summary): revenue = 163.0328 - 0.3210 * run time (minutes).
# R-squared = 0.002031: run time explains about 0.20% of the variance in
# revenue, and the slope is not significant (p = 0.744).
# Drama: regress revenue on run time. Columns are named bare and resolved
# through `data` (same fit as the `df$col` form, coefficient labelled `runtime`).
Reg_Drama <- lm(`revenue (Millions)` ~ runtime, data = Drama)
summary(Reg_Drama)
##
## Call:
## lm(formula = Drama$`revenue (Millions)` ~ Drama$runtime, data = Drama)
##
## Residuals:
## Min 1Q Median 3Q Max
## -138.41 -117.08 -75.88 13.12 1935.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 153.5468 77.5266 1.981 0.0485 *
## Drama$runtime -0.1529 0.6920 -0.221 0.8252
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 215.1 on 310 degrees of freedom
## Multiple R-squared: 0.0001576, Adjusted R-squared: -0.003068
## F-statistic: 0.04885 on 1 and 310 DF, p-value: 0.8252
# Predictor (run time) on x, response (revenue) on y, so that abline() draws
# the fitted line in the same coordinates as the points.
plot(Drama$runtime, Drama$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue",
     main = "Run Time Impact on Revenue for Drama")
abline(Reg_Drama, col = "Red")
# Fitted equation: revenue = 153.5468 - 0.1529 * run time (minutes).
# R-squared = 0.0001576: run time explains about 0.016% of the variance in
# revenue, and the slope is not significant (p = 0.825).
# Horror: regress revenue on run time. Columns are named bare and resolved
# through `data` (same fit as the `df$col` form, coefficient labelled `runtime`).
Reg_Horror <- lm(`revenue (Millions)` ~ runtime, data = Horror)
# Summarise the Horror model itself. This call previously summarised
# RegArtHouse_International, so the recorded output that follows is the
# Art House & International fit and needs to be regenerated.
summary(Reg_Horror)
##
## Call:
## lm(formula = ArtHouse_International$`revenue (Millions)` ~ ArtHouse_International$runtime,
## data = ArtHouse_International)
##
## Residuals:
## Min 1Q Median 3Q Max
## -120.304 -35.954 0.942 42.550 123.999
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -524.031 166.558 -3.146 0.00843 **
## ArtHouse_International$runtime 6.371 1.618 3.939 0.00197 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78.48 on 12 degrees of freedom
## Multiple R-squared: 0.5638, Adjusted R-squared: 0.5275
## F-statistic: 15.51 on 1 and 12 DF, p-value: 0.001968
# Predictor (run time) on x, response (revenue) on y, so that abline() draws
# the fitted line in the same coordinates as the points.
plot(Horror$runtime, Horror$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue",
     main = "Run Time Impact on Revenue for Horror")
abline(Reg_Horror, col = "Red")
# NOTE(review): the equation previously quoted here
# (revenue = -524.031 + 6.371*run time, 56.38%) is the Art House &
# International fit, not Horror. Read the Horror intercept, slope and
# R-squared from summary(Reg_Horror) before reporting them.
# Musical & Performing Arts: regress revenue on run time. Columns are named
# bare and resolved through `data` (same fit as the `df$col` form, coefficient
# labelled `runtime`).
RegMusical_PerformArt <- lm(`revenue (Millions)` ~ runtime, data = Musical_PerformingArts)
summary(RegMusical_PerformArt)
##
## Call:
## lm(formula = Musical_PerformingArts$`revenue (Millions)` ~ Musical_PerformingArts$runtime,
## data = Musical_PerformingArts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -190.40 -103.73 -39.67 76.99 352.10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 545.340 364.640 1.496 0.166
## Musical_PerformingArts$runtime -3.586 3.180 -1.128 0.286
##
## Residual standard error: 159.1 on 10 degrees of freedom
## Multiple R-squared: 0.1128, Adjusted R-squared: 0.02411
## F-statistic: 1.272 on 1 and 10 DF, p-value: 0.2858
# Predictor (run time) on x, response (revenue) on y, so that abline() draws
# the fitted line in the same coordinates as the points.
plot(Musical_PerformingArts$runtime, Musical_PerformingArts$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue",
     main = "Run Time Impact on Revenue for Musical & Performing Arts")
abline(RegMusical_PerformArt, col = "Red")
# Fitted equation: revenue = 545.340 - 3.586 * run time (minutes).
# R-squared = 0.1128: run time explains about 11.3% of the variance in
# revenue, but the slope is not significant (p = 0.286).
# Mystery & Suspense: regress revenue on run time. Columns are named bare and
# resolved through `data` (same fit as the `df$col` form, coefficient labelled
# `runtime`).
RegMystery_Suspense <- lm(`revenue (Millions)` ~ runtime, data = Mystery_Suspense)
summary(RegMystery_Suspense)
##
## Call:
## lm(formula = Mystery_Suspense$`revenue (Millions)` ~ Mystery_Suspense$runtime,
## data = Mystery_Suspense)
##
## Residuals:
## Min 1Q Median 3Q Max
## -133.86 -103.09 -73.26 8.20 840.79
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 89.2316 153.3081 0.582 0.563
## Mystery_Suspense$runtime 0.3808 1.3744 0.277 0.783
##
## Residual standard error: 190.6 on 58 degrees of freedom
## Multiple R-squared: 0.001322, Adjusted R-squared: -0.0159
## F-statistic: 0.07677 on 1 and 58 DF, p-value: 0.7827
# Predictor (run time) on x, response (revenue) on y, so that abline() draws
# the fitted line in the same coordinates as the points.
plot(Mystery_Suspense$runtime, Mystery_Suspense$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue",
     main = "Run Time Impact on Revenue for Mystery & Suspense")
abline(RegMystery_Suspense, col = "Red")
# Fitted equation: revenue = 89.2316 + 0.3808 * run time (minutes).
# R-squared = 0.001322: run time explains about 0.13% of the variance in
# revenue, and the slope is not significant (p = 0.783).
# Other: regress revenue on run time. Columns are named bare and resolved
# through `data` (same fit as the `df$col` form, coefficient labelled `runtime`).
Reg_Other <- lm(`revenue (Millions)` ~ runtime, data = Other)
summary(Reg_Other)
##
## Call:
## lm(formula = Other$`revenue (Millions)` ~ Other$runtime, data = Other)
##
## Residuals:
## Min 1Q Median 3Q Max
## -158.55 -130.96 -116.29 -54.89 599.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.1628 353.9078 0.269 0.792
## Other$runtime 0.5701 3.1743 0.180 0.860
##
## Residual standard error: 262.1 on 14 degrees of freedom
## Multiple R-squared: 0.002298, Adjusted R-squared: -0.06897
## F-statistic: 0.03225 on 1 and 14 DF, p-value: 0.86
# Predictor (run time) on x, response (revenue) on y, so that abline() draws
# the fitted line in the same coordinates as the points.
plot(Other$runtime, Other$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue",
     main = "Run Time Impact on Revenue for Other")
abline(Reg_Other, col = "Red")
# Fitted equation: revenue = 95.1628 + 0.5701 * run time (minutes).
# R-squared = 0.002298: run time explains about 0.23% of the variance in
# revenue, and the slope is not significant (p = 0.860).
# Science Fiction & Fantasy: regress revenue on run time. Columns are named
# bare and resolved through `data` (same fit as the `df$col` form, coefficient
# labelled `runtime`).
RegScienceFic_Fantasy <- lm(`revenue (Millions)` ~ runtime, data = Science_FictionFantasy)
summary(RegScienceFic_Fantasy)
##
## Call:
## lm(formula = Science_FictionFantasy$`revenue (Millions)` ~ Science_FictionFantasy$runtime,
## data = Science_FictionFantasy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83.71 -39.24 -24.36 36.10 122.07
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -101.089 112.621 -0.898 0.399
## Science_FictionFantasy$runtime 1.666 1.088 1.532 0.169
##
## Residual standard error: 74.27 on 7 degrees of freedom
## Multiple R-squared: 0.251, Adjusted R-squared: 0.1441
## F-statistic: 2.346 on 1 and 7 DF, p-value: 0.1694
# Predictor (run time) on x, response (revenue) on y, so that abline() draws
# the fitted line in the same coordinates as the points.
plot(Science_FictionFantasy$runtime, Science_FictionFantasy$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue",
     main = "Run Time Impact on Revenue for Science Fiction & Fantasy")
abline(RegScienceFic_Fantasy, col = "Red")
# Fitted equation: revenue = -101.089 + 1.666 * run time (minutes).
# R-squared = 0.251: run time explains about 25.1% of the variance in revenue,
# but the slope is not significant (p = 0.169, only 9 movies).
# Do Facebook likes affect Revenue?
# Pearson correlation test between Facebook likes and revenue on the top 300
# grossing movies.
correlation_test_FB_vs_Rev<- cor.test(top_300_movies$Facebook_Likes, top_300_movies$`revenue (Millions)`)
# Print the test result
print(correlation_test_FB_vs_Rev)
##
## Pearson's product-moment correlation
##
## data: top_300_movies$Facebook_Likes and top_300_movies$`revenue (Millions)`
## t = 13.672, df = 298, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5460041 0.6858766
## sample estimates:
## cor
## 0.6208573
# Significant Result
# Strong positive correlation (r = 0.62, p < 2.2e-16) between higher Facebook likes and higher revenue among the top 300 grossing movies
# Scatter plot of Facebook likes against revenue for the top 300 grossing
# movies, with a fitted linear trend line.
fb_likes_plot <- ggplot(
  data = top_300_movies,
  mapping = aes(x = Facebook_Likes, y = `revenue (Millions)`)
) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue", se = FALSE) +
  labs(
    x = "Facebook Likes",
    y = "Gross Revenue ($Millions)",
    title = "Facebook Likes vs Gross Revenue Top 300 Grossing Movies"
  ) +
  theme_minimal()
fb_likes_plot
## `geom_smooth()` using formula = 'y ~ x'
# Predicted revenue for the average movie
# Linear regression model to predict revenue from budget, run time, ratings
# and Facebook likes.
# The rest of this script addresses the money columns as `revenue (Millions)`
# and `budget (Millions)`, so the syntactic names used in this formula (and in
# the prediction data frames) are not columns of Movie_Data. Fit on a copy
# whose column names have been passed through make.names(), which turns those
# headers into `revenue..Millions.` and `budget..Millions.`.
model_data <- Movie_Data
names(model_data) <- make.names(names(model_data), unique = TRUE)
model <- lm(revenue..Millions. ~ budget..Millions. + runtime + imdb_rating + critics_score + audience_score + Facebook_Likes, data = model_data)
summary(model)
##
## Call:
## lm(formula = revenue..Millions. ~ budget..Millions. + runtime +
## imdb_rating + critics_score + audience_score + Facebook_Likes,
## data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -402.93 -53.18 -8.03 28.78 1517.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.620e+01 3.144e+01 -1.788 0.0743 .
## budget..Millions. 1.800e+00 1.345e-01 13.382 <2e-16 ***
## runtime -2.454e-01 2.706e-01 -0.907 0.3647
## imdb_rating 2.922e-01 1.995e+00 0.146 0.8836
## critics_score 7.969e-01 3.590e-01 2.219 0.0268 *
## audience_score 7.061e-01 4.289e-01 1.646 0.1002
## Facebook_Likes 9.669e-04 6.430e-05 15.036 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 133.8 on 657 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6039, Adjusted R-squared: 0.6003
## F-statistic: 167 on 6 and 657 DF, p-value: < 2.2e-16
# Predicting box office sales for a new movie using the average budget.
# `average_budget` was referenced here without being defined anywhere in this
# script; compute it as the mean budget across all movies.
# NOTE(review): confirm this is the value that produced the recorded
# prediction below.
average_budget <- mean(Movie_Data$`budget (Millions)`, na.rm = TRUE)
# One-row data frame whose column names match the predictors of `model`.
new_movie <- data.frame(
  budget..Millions. = average_budget, # using the calculated average budget
  runtime = mean(Movie_Data$runtime, na.rm = TRUE), # using average runtime
  imdb_rating = mean(Movie_Data$imdb_rating, na.rm = TRUE), # using average IMDB rating
  critics_score = 55, # given critics score
  audience_score = mean(Movie_Data$audience_score, na.rm = TRUE), # using average audience score
  Facebook_Likes = 1250 # given Facebook likes
)
predicted_revenue <- predict(model, newdata = new_movie)
print(predicted_revenue)
## 1
## 76.99852
# The average movie is predicted to generate about $77.0M in revenue
# Advertising-campaign scenario: the same movie with 100 times the Facebook
# likes (1,250 -> 125,000, i.e. a 9,900% increase).
new_movie_campaign <- transform(new_movie, Facebook_Likes = Facebook_Likes * 100)
# Predicted revenue under the campaign scenario
predicted_revenue_campaign <- predict(model, newdata = new_movie_campaign)
print(predicted_revenue_campaign)
## 1
## 196.6487
# Revenue uplift attributed to the campaign
revenue_increase <- predicted_revenue_campaign - predicted_revenue
print(revenue_increase)
## 1
## 119.6502
# With the new campaign the movie is predicted to generate $196.6M in revenue, an increase of $119.7M
# Is there a relationship between budget and revenue?
# Simple regression of revenue on budget. Columns are named bare and resolved
# through `data` (same fit as the `df$col` form; the slope is then labelled
# with the bare budget column name).
Budget_reg <- lm(`revenue (Millions)` ~ `budget (Millions)`, data = Movie_Data)
summary(Budget_reg)
##
## Call:
## lm(formula = Movie_Data$`revenue (Millions)` ~ Movie_Data$`budget (Millions)`,
## data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -402.96 -58.06 -26.12 13.71 1449.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.0530 8.3086 1.932 0.0538 .
## Movie_Data$`budget (Millions)` 3.0115 0.1364 22.086 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 160.6 on 663 degrees of freedom
## Multiple R-squared: 0.4239, Adjusted R-squared: 0.423
## F-statistic: 487.8 on 1 and 663 DF, p-value: < 2.2e-16
# Predictor (budget) on x, response (revenue) on y, so that abline() draws the
# fitted line in the same coordinates as the points.
plot(Movie_Data$`budget (Millions)`, Movie_Data$`revenue (Millions)`,
     xlab = "Budget", ylab = "Revenue", main = "Budget Impact on Revenue")
abline(Budget_reg, col = "Red")
# Budget has a positive, highly significant relationship with revenue
# (p < 2e-16): revenue = 16.05 + 3.01 * budget, both in $ millions.
# About 42% of the variance in revenue is accounted for by budget
# (R-squared = 0.4239, adjusted 0.423).
# NOTE(review): the earlier note "budget = 20.97 + 0.1408(revenue)" does not
# come from this model. To find the budget associated with a target revenue,
# fit the reverse regression (budget on revenue) and quote its coefficients.
# Grouping genres into broader categories.
# The grouping is stored in its own column: overwriting `genre` would collapse
# the detailed genres that the ANOVA / Tukey comparisons later in the script
# rely on. The genre labels used for the per-genre tables earlier in the
# script ("Action & Adventure", "Mystery & Suspense", ...) are listed next to
# the original short labels so that they do not all fall through to "Other".
Movie_Data$genre_group <- case_when(
  Movie_Data$genre %in% c("Drama", "Comedy", "Romance", "Musical",
                          "Musical & Performing Arts") ~ "Drama/Comedy",
  Movie_Data$genre %in% c("Action", "Adventure", "Thriller", "Crime",
                          "Action & Adventure", "Mystery & Suspense") ~ "Action/Thriller",
  Movie_Data$genre %in% c("Horror", "Sci-Fi", "Fantasy",
                          "Science Fiction & Fantasy") ~ "Horror/Sci-Fi",
  TRUE ~ "Other"
)
# Visualizing the average revenue by genre group. The revenue column is
# addressed by the same back-ticked name as in the rest of the script.
ggplot(Movie_Data, aes(x = genre_group, y = `revenue (Millions)`, fill = genre_group)) +
  geom_bar(stat = "summary", fun = "mean") +
  theme_minimal() +
  labs(title = "Average Revenue by Genre", x = "Genre",
       y = "Average Revenue (Millions)", fill = "Genre")
#2
# Does genre have an impact on revenue? One-way ANOVA followed by Tukey HSD
# pairwise comparisons.
library(dplyr)
Movie_Data$genre <- as.factor(Movie_Data$genre)
Movie_Data$`revenue (Millions)` <- as.numeric(Movie_Data$`revenue (Millions)`)
# Columns are named bare and resolved through `data`; the Tukey table is then
# listed under `$genre`.
Genre_ANOVA <- aov(`revenue (Millions)` ~ genre, data = Movie_Data)
# Report the omnibus F-test as well: it is the test that answers whether genre
# has an overall effect, before looking at individual pairs.
summary(Genre_ANOVA)
TukeyHSD(Genre_ANOVA)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Movie_Data$`revenue (Millions)` ~ Movie_Data$genre)
##
## $`Movie_Data$genre`
## diff lwr
## Animation-Action & Adventure 107.4337607 -136.51089
## Art House & International-Action & Adventure -2.3791758 -204.46983
## Comedy-Action & Adventure 8.1271564 -103.78108
## Documentary-Action & Adventure 1.1631456 -123.89021
## Drama-Action & Adventure 7.5061218 -86.01077
## Horror-Action & Adventure 59.6289298 -106.77904
## Musical & Performing Arts-Action & Adventure 8.2890385 -207.21302
## Mystery & Suspense-Action & Adventure 2.0412051 -120.75249
## Other-Action & Adventure 28.5034135 -162.91302
## Science Fiction & Fantasy-Action & Adventure -61.9151282 -305.85978
## Art House & International-Animation -109.8129365 -402.85638
## Comedy-Animation -99.3066042 -339.21760
## Documentary-Animation -106.2706151 -352.58784
## Drama-Animation -99.9276389 -331.83116
## Horror-Animation -47.8048309 -317.48138
## Musical & Performing Arts-Animation -99.1447222 -401.59302
## Mystery & Suspense-Animation -105.3925556 -350.57029
## Other-Animation -78.9303472 -364.71712
## Science Fiction & Fantasy-Animation -169.3488889 -492.67972
## Comedy-Art House & International 10.5063323 -186.69641
## Documentary-Art House & International 3.5423214 -201.40600
## Drama-Art House & International 9.8852976 -177.49368
## Horror-Art House & International 62.0081056 -170.49368
## Musical & Performing Arts-Art House & International 10.6682143 -259.15878
## Mystery & Suspense-Art House & International 4.4203810 -199.15703
## Other-Art House & International 30.8825893 -220.12681
## Science Fiction & Fantasy-Art House & International -59.5359524 -352.57940
## Documentary-Comedy -6.9640108 -123.95391
## Drama-Comedy -0.6210346 -83.04495
## Horror-Comedy 51.5017733 -108.93483
## Musical & Performing Arts-Comedy 0.1618820 -210.76330
## Mystery & Suspense-Comedy -6.0859513 -120.65726
## Other-Comedy 20.3762570 -165.87235
## Science Fiction & Fantasy-Comedy -70.0422846 -309.95328
## Drama-Documentary 6.3429762 -93.19893
## Horror-Documentary 58.4657842 -111.40120
## Musical & Performing Arts-Documentary 7.1258929 -211.05825
## Mystery & Suspense-Documentary 0.8780595 -126.56398
## Other-Documentary 27.3402679 -167.09078
## Science Fiction & Fantasy-Documentary -63.0782738 -309.39550
## Horror-Drama 52.1228080 -96.07254
## Musical & Performing Arts-Drama 0.7829167 -200.98772
## Mystery & Suspense-Drama -5.4649167 -102.15277
## Other-Drama 20.9972917 -154.81653
## Science Fiction & Fantasy-Drama -69.4212500 -301.32477
## Musical & Performing Arts-Horror -51.3398913 -295.58892
## Mystery & Suspense-Horror -57.5877246 -225.79814
## Other-Horror -31.1255163 -254.41166
## Science Fiction & Fantasy-Horror -121.5440580 -391.22061
## Mystery & Suspense-Musical & Performing Arts -6.2478333 -223.14475
## Other-Musical & Performing Arts 20.2143750 -241.71353
## Science Fiction & Fantasy-Musical & Performing Arts -70.2041667 -372.65246
## Other-Mystery & Suspense 26.4622083 -166.52323
## Science Fiction & Fantasy-Mystery & Suspense -63.9563333 -309.13407
## Science Fiction & Fantasy-Other -90.4185417 -376.20532
## upr p adj
## Animation-Action & Adventure 351.37841 0.9425798
## Art House & International-Action & Adventure 199.71147 1.0000000
## Comedy-Action & Adventure 120.03539 1.0000000
## Documentary-Action & Adventure 126.21651 1.0000000
## Drama-Action & Adventure 101.02301 1.0000000
## Horror-Action & Adventure 226.03689 0.9865860
## Musical & Performing Arts-Action & Adventure 223.79110 1.0000000
## Mystery & Suspense-Action & Adventure 124.83490 1.0000000
## Other-Action & Adventure 219.91984 0.9999941
## Science Fiction & Fantasy-Action & Adventure 182.02952 0.9992053
## Art House & International-Animation 183.23051 0.9812707
## Comedy-Animation 140.60439 0.9620224
## Documentary-Animation 140.04661 0.9498151
## Drama-Animation 131.97588 0.9502261
## Horror-Animation 221.87172 0.9999694
## Musical & Performing Arts-Animation 203.30357 0.9932696
## Mystery & Suspense-Animation 139.78518 0.9510165
## Other-Animation 206.85643 0.9983553
## Science Fiction & Fantasy-Animation 153.98194 0.8395632
## Comedy-Art House & International 207.70908 1.0000000
## Documentary-Art House & International 208.49064 1.0000000
## Drama-Art House & International 197.26427 1.0000000
## Horror-Art House & International 294.50989 0.9987798
## Musical & Performing Arts-Art House & International 280.49521 1.0000000
## Mystery & Suspense-Art House & International 207.99779 1.0000000
## Other-Art House & International 281.89199 0.9999991
## Science Fiction & Fantasy-Art House & International 233.50750 0.9998917
## Documentary-Comedy 110.02588 1.0000000
## Drama-Comedy 81.80288 1.0000000
## Horror-Comedy 211.93838 0.9943041
## Musical & Performing Arts-Comedy 211.08706 1.0000000
## Mystery & Suspense-Comedy 108.48536 1.0000000
## Other-Comedy 206.62486 0.9999997
## Science Fiction & Fantasy-Comedy 169.86871 0.9973765
## Drama-Documentary 105.88489 1.0000000
## Horror-Documentary 228.33277 0.9901411
## Musical & Performing Arts-Documentary 225.31004 1.0000000
## Mystery & Suspense-Documentary 128.32010 1.0000000
## Other-Documentary 221.77131 0.9999966
## Science Fiction & Fantasy-Documentary 183.23895 0.9991410
## Horror-Drama 200.31815 0.9883566
## Musical & Performing Arts-Drama 202.55355 1.0000000
## Mystery & Suspense-Drama 91.22294 1.0000000
## Other-Drama 196.81111 0.9999993
## Science Fiction & Fantasy-Drama 162.48227 0.9967714
## Musical & Performing Arts-Horror 192.90914 0.9998522
## Mystery & Suspense-Horror 110.62269 0.9905398
## Other-Horror 192.16062 0.9999969
## Science Fiction & Fantasy-Horror 148.13249 0.9333949
## Mystery & Suspense-Musical & Performing Arts 210.64908 1.0000000
## Other-Musical & Performing Arts 282.14228 1.0000000
## Science Fiction & Fantasy-Musical & Performing Arts 232.24413 0.9996386
## Other-Mystery & Suspense 219.44765 0.9999973
## Science Fiction & Fantasy-Mystery & Suspense 181.22141 0.9989919
## Science Fiction & Fantasy-Other 195.36823 0.9949308
# Based on the adjusted p-values (all above 0.8), no pair of genres differs significantly in mean revenue, so we
# cannot confidently say that one genre will generate more revenue than another.
# Plotting the average revenue per genre.
library(dplyr)
library(ggplot2)
# Make sure genre is a factor and revenue is numeric before aggregating.
Movie_Data[["genre"]] <- as.factor(Movie_Data[["genre"]])
Movie_Data[["revenue (Millions)"]] <- as.numeric(Movie_Data[["revenue (Millions)"]])
# One row per genre with its mean revenue (missing revenues ignored).
RevenueByGenre <- summarise(
  group_by(Movie_Data, genre),
  Average_Revenue = mean(`revenue (Millions)`, na.rm = TRUE)
)
ggplot(data = RevenueByGenre, mapping = aes(x = genre, y = Average_Revenue)) +
  geom_col() +
  labs(
    title = "Average Revenue by Genre",
    x = "Genre",
    y = "Average Revenue (Millions)"
  ) +
  theme_minimal()
# Based on the graph, Animation and Horror have the highest average revenue; average revenue is relatively flat across the other genres.
# A horror or animation movie might therefore be the best approach for revenue success, although the Tukey comparisons show these differences are not statistically significant.
# Evaluating whether genre and MPAA rating (and their interaction) have an
# impact on revenue.
library(dplyr)
library(ggplot2)
Movie_Data$genre <- as.factor(Movie_Data$genre)
Movie_Data$mpaa_rating <- as.factor(Movie_Data$mpaa_rating)
Movie_Data$`revenue (Millions)` <- as.numeric(Movie_Data$`revenue (Millions)`)
# Number of movies in every genre x rating cell. `.groups = "drop"` returns an
# ungrouped table and avoids the "grouped output by 'genre'" message.
group_counts <- Movie_Data %>%
  group_by(genre, mpaa_rating) %>%
  summarize(n = n(), .groups = "drop")
# Keep only the genre x rating cells that contain at least two movies.
filtered_data <- Movie_Data %>%
  group_by(genre, mpaa_rating) %>%
  filter(n() >= 2) %>%
  ungroup()
# Two-way ANOVA with interaction on the filtered data.
anova_result <- aov(`revenue (Millions)` ~ genre * mpaa_rating, data = filtered_data)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## genre 10 217521 21752 0.484 0.9008
## mpaa_rating 5 449730 89946 2.001 0.0767 .
## genre:mpaa_rating 20 1043885 52194 1.161 0.2824
## Residuals 621 27911107 44945
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Shapiro-Wilk test on the residuals of the two-way ANOVA (normality check).
# The recorded result (W = 0.70159, p-value < 2.2e-16) rejects normality.
shapiro.test(residuals(anova_result))
##
## Shapiro-Wilk normality test
##
## data: residuals(anova_result)
## W = 0.70159, p-value < 2.2e-16
# Bartlett test of equal revenue variance across the genre x rating cells.
# The recorded result (K-squared = 191.62, p-value < 2.2e-16) rejects
# homogeneity of variances.
bartlett_test_result <- bartlett.test(`revenue (Millions)` ~ interaction(genre, mpaa_rating), data = filtered_data)
print(bartlett_test_result)
##
## Bartlett test of homogeneity of variances
##
## data: revenue (Millions) by interaction(genre, mpaa_rating)
## Bartlett's K-squared = 191.62, df = 35, p-value < 2.2e-16