Movie Market Analysis

Prof. Lianxi Zhou

Dustin Pinch (6754550)

Rob Mantini (6282024)

Jeffery J. Unrau (7307572)

Cleaning Data

# Switch to the original author's project folder only when it exists on this
# machine; an unconditional setwd() to an absolute path stops the script with
# "cannot change working directory" everywhere else. When the folder is
# absent, the script runs from the current directory, which must then hold
# the two CSV files.
project_dir <- "C:/Users/dwp13/Desktop/School/MKTG 3P98/Project 3 Team 8"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
getwd()

library(readr) 

# Read the two original data sets.
Movie_Financials <- read_csv("Movie_Financials.csv")
Movie_Aud <- read_csv("Movie Dataset_General Audience.csv")


# Merge the two data sets together for easier analysis. The first column of
# the audience table is renamed so it can serve as the join key.
names(Movie_Aud)[1] <- "original_title"

# merge() only recognises the lower-case `by` argument. The previous `By =`
# was swallowed by `...`, so the join silently fell back to matching on every
# column name the two tables share instead of on the title alone.
Movie_Data <- merge(Movie_Aud, Movie_Financials, by = "original_title")
str(Movie_Data)

#checking for null values in the Movie_Data set. 
library(dplyr) 

# One-row table holding the number of missing values in every column,
# taken before any imputation.
na_counts_before <- Movie_Data %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
print(na_counts_before)


# Replace the missing values of a categorical vector with its mode.
#
# x: any vector; only character vectors and factors are imputed.
# Returns x with every NA replaced by the most frequent non-missing value
# (ties go to the first value in table() order). Vectors of any other type,
# and vectors with no non-missing value, are returned unchanged.
replace_na_mode <- function(x) {
  if (!(is.character(x) || is.factor(x))) {
    return(x)
  }
  counts <- table(x) # table() leaves NA out of the tally by default
  if (length(counts) == 0L || sum(counts) == 0L) {
    return(x) # nothing observed, so there is no mode to impute
  }
  # which.max() selects the highest count directly. The earlier
  # sort(..., decreaseing = TRUE) misspelled the argument, so the table was
  # sorted ascending and the LEAST frequent value was used as the "mode".
  mode_value <- names(counts)[which.max(counts)]
  replace(x, is.na(x), mode_value)
}
# Impute every character column with replace_na_mode(), then recount the
# missing values per column to confirm the effect.
Movie_Data <- mutate(Movie_Data, across(where(is.character), replace_na_mode))
null_count_post <- summarise(Movie_Data, across(everything(), ~ sum(is.na(.x))))

print(null_count_post)

Run time

# Rank every movie by revenue (highest first) once, then take the first 25
# and the first 300 rows of that ranking.
revenue_rank <- order(-Movie_Data$`revenue (Millions)`)
top_25_movies <- Movie_Data[revenue_rank[1:25], ]
top_300_movies <- Movie_Data[revenue_rank[1:300], ]

# Mean run time within each of the two subsets.
average_runtime25 <- mean(top_25_movies[["runtime"]])
average_runtime300 <- mean(top_300_movies[["runtime"]])

# View the average run time
print(average_runtime25)
## [1] 106.32
print(average_runtime300)
## [1] 106.0467
library(ggplot2)
# Scatter plot of run time against revenue with a straight-line (lm) trend.
#   movies:     data frame with `runtime` and `revenue (Millions)` columns
#   plot_title: text shown as the plot title
runtime_revenue_plot <- function(movies, plot_title) {
  ggplot(movies, aes(x = runtime, y = `revenue (Millions)`)) +
    geom_point() +
    geom_smooth(method = "lm", col = "blue", se = FALSE) +
    labs(title = plot_title,
         x = "Runtime (minutes)",
         y = "Gross Revenue ($Millions)") +
    theme_minimal()
}

# Run time vs revenue, all movies
runtime_revenue_plot(Movie_Data, "Runtime vs Gross Revenue (All)")
## `geom_smooth()` using formula = 'y ~ x'

# Run time vs revenue, top 25
runtime_revenue_plot(top_25_movies,
                     "Runtime vs Gross Revenue Top 25 Grossing Movies")
## `geom_smooth()` using formula = 'y ~ x'

# Run time vs revenue, top 300
runtime_revenue_plot(top_300_movies,
                     "Runtime vs Gross Revenue Top 300 Grossing Movies")
## `geom_smooth()` using formula = 'y ~ x'

# Does run time affect revenue? Top 300 & 25
# Pearson Test
# cor.test() is run once per subset (top 300 and top 25 by revenue); each
# result object holds the correlation estimate, its confidence interval and
# the p-value that are printed below.
correlation_test_Run_vs_Rev300 <- cor.test(top_300_movies$runtime, top_300_movies$`revenue (Millions)`)
correlation_test_Run_vs_Rev25 <- cor.test(top_25_movies$runtime, top_25_movies$`revenue (Millions)`)

# Print the test result
print(correlation_test_Run_vs_Rev300)
## 
##  Pearson's product-moment correlation
## 
## data:  top_300_movies$runtime and top_300_movies$`revenue (Millions)`
## t = 0.28158, df = 298, p-value = 0.7785
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09711063  0.12931157
## sample estimates:
##        cor 
## 0.01630956
print(correlation_test_Run_vs_Rev25)
## 
##  Pearson's product-moment correlation
## 
## data:  top_25_movies$runtime and top_25_movies$`revenue (Millions)`
## t = 1.5145, df = 23, p-value = 0.1435
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1066932  0.6222268
## sample estimates:
##       cor 
## 0.3011328
# No statistically significant correlation between revenue and run time (p = 0.78 for the top 300, p = 0.14 for the top 25)

IMDB Ratings

# Does run time affect IMDB ratings?
# Correlation test between run time and IMDB rating, restricted to the
# top 300 movies by revenue.
correlation_test_Run_vs_IMDB <- cor.test(top_300_movies$runtime, top_300_movies$imdb_rating)

# Print the test result
print(correlation_test_Run_vs_IMDB)
## 
##  Pearson's product-moment correlation
## 
## data:  top_300_movies$runtime and top_300_movies$imdb_rating
## t = -0.21301, df = 298, p-value = 0.8315
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1254043  0.1010434
## sample estimates:
##         cor 
## -0.01233864
# No statistically significant correlation between run time and IMDB rating (p = 0.83)

Critic Score

# Does run time affect critic score?
# Correlation test between run time and critics score, restricted to the
# top 300 movies by revenue.
correlation_test_Run_vs_critic<- cor.test(top_300_movies$runtime, top_300_movies$critics_score)

# Print the test result
print(correlation_test_Run_vs_critic)
## 
##  Pearson's product-moment correlation
## 
## data:  top_300_movies$runtime and top_300_movies$critics_score
## t = 1.6556, df = 298, p-value = 0.09885
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01796527  0.20647838
## sample estimates:
##        cor 
## 0.09546979
# No statistically significant correlation at the 5% level (r = 0.095, p = 0.099)

Audience Score

# Does run time affect audience score?
# Correlation test between run time and audience score, restricted to the
# top 300 movies by revenue.
correlation_test_Run_vs_aud <- cor.test(top_300_movies$runtime, top_300_movies$audience_score)

# Print the test result
print(correlation_test_Run_vs_aud)
## 
##  Pearson's product-moment correlation
## 
## data:  top_300_movies$runtime and top_300_movies$audience_score
## t = 1.2982, df = 298, p-value = 0.1952
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03857547  0.18664873
## sample estimates:
##        cor 
## 0.07499297
# No statistically significant correlation (r = 0.075, p = 0.20)

How run time affects revenue based on genre

# Build one table per genre: each keeps the rows of Movie_Data whose `genre`
# value matches the genre label. Any table can be inspected interactively
# with View(<table name>).
library(dplyr)
Action_Adventure <- filter(Movie_Data, grepl("Action & Adventure", genre))

Animation <- filter(Movie_Data, grepl("Animation", genre))

ArtHouse_International <- filter(Movie_Data, grepl("Art House & International", genre))

Comedy <- filter(Movie_Data, grepl("Comedy", genre))

Documentary <- filter(Movie_Data, grepl("Documentary", genre))

Drama <- filter(Movie_Data, grepl("Drama", genre))

Horror <- filter(Movie_Data, grepl("Horror", genre))

Musical_PerformingArts <- filter(Movie_Data, grepl("Musical & Performing Arts", genre))

Mystery_Suspense <- filter(Movie_Data, grepl("Mystery & Suspense", genre))

Other <- filter(Movie_Data, grepl("Other", genre))

Science_FictionFantasy <- filter(Movie_Data, grepl("Science Fiction & Fantasy", genre))

#running regression for each genre

#Action & Adventure
# Simple linear regression of revenue on run time for the Action & Adventure
# table, followed by its coefficient and fit summary.
# NOTE(review): the formula names the columns as Action_Adventure$..., so the
# coefficient is labelled "Action_Adventure$runtime" in the output.
RegActionAdventure <-lm(Action_Adventure$`revenue (Millions)` ~ Action_Adventure$runtime, data = Action_Adventure)
summary(RegActionAdventure)
## 
## Call:
## lm(formula = Action_Adventure$`revenue (Millions)` ~ Action_Adventure$runtime, 
##     data = Action_Adventure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -194.47  -76.17  -49.43   32.23  671.76 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)   
## (Intercept)              -227.890    120.139  -1.897  0.06243 . 
## Action_Adventure$runtime    3.439      1.143   3.009  0.00377 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 152.2 on 63 degrees of freedom
## Multiple R-squared:  0.1257, Adjusted R-squared:  0.1118 
## F-statistic: 9.054 on 1 and 63 DF,  p-value: 0.003765
# Run time (the predictor) goes on the x-axis and revenue (the response) on
# the y-axis so the scatter matches the fitted model revenue ~ runtime.
# abline() draws intercept + slope * x, which is only the regression line
# when x is the predictor; with the axes swapped the line was meaningless.
plot(Action_Adventure$runtime, Action_Adventure$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue", main = "Run Time Impact on Revenue for Action & Adventure")
abline(RegActionAdventure, col="Red")

#equation: revenue = -227.890 + 3.439*run time (minutes). run time explains 12.57% of the variance in revenue (R-squared = 0.1257, p = 0.004).

#Animation
# Simple linear regression of revenue on run time for the Animation table,
# followed by its coefficient and fit summary.
RegAnimation <-lm(Animation$`revenue (Millions)` ~ Animation$runtime, data = Animation)
summary(RegAnimation)
## 
## Call:
## lm(formula = Animation$`revenue (Millions)` ~ Animation$runtime, 
##     data = Animation)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -284.63 -124.12   24.36  100.93  281.73 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       -1309.965    485.428  -2.699   0.0307 *
## Animation$runtime    17.731      5.517   3.214   0.0148 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 192 on 7 degrees of freedom
## Multiple R-squared:  0.5961, Adjusted R-squared:  0.5384 
## F-statistic: 10.33 on 1 and 7 DF,  p-value: 0.01478
# Predictor (run time) on x and response (revenue) on y, matching the model
# revenue ~ runtime, so that abline() draws the actual fitted line.
plot(Animation$runtime, Animation$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue", main = "Run Time Impact on Revenue for Animation")
abline(RegAnimation, col="Red")

#equation: revenue = -1309.965 + 17.731*run time (minutes). run time explains 59.61% of the variance in revenue (R-squared = 0.5961, p = 0.015; only 9 movies).

#Art House & International
# Simple linear regression of revenue on run time for the Art House &
# International table, followed by its coefficient and fit summary.
  RegArtHouse_International <-lm(ArtHouse_International$`revenue (Millions)` ~ ArtHouse_International$runtime, data = ArtHouse_International)
summary(RegArtHouse_International)
## 
## Call:
## lm(formula = ArtHouse_International$`revenue (Millions)` ~ ArtHouse_International$runtime, 
##     data = ArtHouse_International)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -120.304  -35.954    0.942   42.550  123.999 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                    -524.031    166.558  -3.146  0.00843 **
## ArtHouse_International$runtime    6.371      1.618   3.939  0.00197 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.48 on 12 degrees of freedom
## Multiple R-squared:  0.5638, Adjusted R-squared:  0.5275 
## F-statistic: 15.51 on 1 and 12 DF,  p-value: 0.001968
# Predictor (run time) on x and response (revenue) on y, matching the model
# revenue ~ runtime, so that abline() draws the actual fitted line.
plot(ArtHouse_International$runtime, ArtHouse_International$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue", main = "Run Time Impact on Revenue for Art House & International")
abline(RegArtHouse_International, col="Red")

#equation: revenue = -524.031 + 6.371*run time (minutes). run time explains 56.38% of the variance in revenue for this genre (R-squared = 0.5638, p = 0.002).

#Comedy
# Simple linear regression of revenue on run time for the Comedy table,
# followed by its coefficient and fit summary.
  Reg_Comedy <-lm(Comedy$`revenue (Millions)` ~ Comedy$runtime, data = Comedy)
summary(Reg_Comedy)
## 
## Call:
## lm(formula = Comedy$`revenue (Millions)` ~ Comedy$runtime, data = Comedy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -190.79 -109.13  -79.09   20.99 1246.84 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)      -6.514    191.676  -0.034    0.973
## Comedy$runtime    1.484      1.965   0.755    0.452
## 
## Residual standard error: 213.2 on 87 degrees of freedom
## Multiple R-squared:  0.006514,   Adjusted R-squared:  -0.004905 
## F-statistic: 0.5705 on 1 and 87 DF,  p-value: 0.4521
# Predictor (run time) on x and response (revenue) on y, matching the model
# revenue ~ runtime, so that abline() draws the actual fitted line.
plot(Comedy$runtime, Comedy$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue", main = "Run Time Impact on Revenue for Comedy")
abline(Reg_Comedy, col="Red")

#equation: revenue = -6.514 + 1.484*run time (minutes). run time explains only 0.6514% of the variance in revenue for comedy (R-squared = 0.006514; not significant, p = 0.452).

#Documentary
# Simple linear regression of revenue on run time for the Documentary table,
# followed by its coefficient and fit summary.
  Reg_Documentary <-lm(Documentary$`revenue (Millions)` ~ Documentary$runtime, data = Documentary)
summary(Reg_Documentary)
## 
## Call:
## lm(formula = Documentary$`revenue (Millions)` ~ Documentary$runtime, 
##     data = Documentary)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -149.41 -119.44  -83.04   -3.94 1021.95 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)         163.0328    99.6051   1.637    0.108
## Documentary$runtime  -0.3210     0.9773  -0.328    0.744
## 
## Residual standard error: 227 on 53 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.002031,   Adjusted R-squared:  -0.0168 
## F-statistic: 0.1079 on 1 and 53 DF,  p-value: 0.7439
# Predictor (run time) on x and response (revenue) on y, matching the model
# revenue ~ runtime, so that abline() draws the actual fitted line.
plot(Documentary$runtime, Documentary$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue", main = "Run Time Impact on Revenue for Documentary")
abline(Reg_Documentary, col="Red")

#equation: revenue = 163.0328 - 0.3210*run time (minutes). run time explains only 0.2031% of the variance in revenue (R-squared = 0.002031; not significant, p = 0.744).

#Drama
# Simple linear regression of revenue on run time for the Drama table,
# followed by its coefficient and fit summary.
  Reg_Drama <-lm(Drama$`revenue (Millions)` ~ Drama$runtime, data = Drama)
summary(Reg_Drama)
## 
## Call:
## lm(formula = Drama$`revenue (Millions)` ~ Drama$runtime, data = Drama)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -138.41 -117.08  -75.88   13.12 1935.89 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   153.5468    77.5266   1.981   0.0485 *
## Drama$runtime  -0.1529     0.6920  -0.221   0.8252  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 215.1 on 310 degrees of freedom
## Multiple R-squared:  0.0001576,  Adjusted R-squared:  -0.003068 
## F-statistic: 0.04885 on 1 and 310 DF,  p-value: 0.8252
# Predictor (run time) on x and response (revenue) on y, matching the model
# revenue ~ runtime, so that abline() draws the actual fitted line.
plot(Drama$runtime, Drama$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue", main = "Run Time Impact on Revenue for Drama")
abline(Reg_Drama, col="Red")

#equation: revenue = 153.5468 - 0.1529*run time (minutes). run time explains only 0.01576% of the variance in revenue (R-squared = 0.0001576; not significant, p = 0.825).

#Horror
# Simple linear regression of revenue on run time for the Horror table,
# followed by the summary of that same model. The summary call previously
# pointed at RegArtHouse_International, so the coefficients shown for Horror
# were really the Art House & International fit.
# NOTE: the recorded output below predates this fix and must be regenerated.
Reg_Horror <- lm(Horror$`revenue (Millions)` ~ Horror$runtime, data = Horror)
summary(Reg_Horror)
## 
## Call:
## lm(formula = ArtHouse_International$`revenue (Millions)` ~ ArtHouse_International$runtime, 
##     data = ArtHouse_International)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -120.304  -35.954    0.942   42.550  123.999 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                    -524.031    166.558  -3.146  0.00843 **
## ArtHouse_International$runtime    6.371      1.618   3.939  0.00197 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.48 on 12 degrees of freedom
## Multiple R-squared:  0.5638, Adjusted R-squared:  0.5275 
## F-statistic: 15.51 on 1 and 12 DF,  p-value: 0.001968
# Predictor (run time) on x and response (revenue) on y, matching the model
# revenue ~ runtime, so that abline() draws the actual fitted line.
plot(Horror$runtime, Horror$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue", main = "Run Time Impact on Revenue for Horror")
abline(Reg_Horror, col="Red")

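#equation: revenue = -524.031 + 6.371*run time, R-squared = 56.38%. NOTE: these figures are the Art House & International results printed above, not the Horror fit; run summary(Reg_Horror) to obtain the Horror equation and R-squared.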

#Musical and Performing Arts
# Simple linear regression of revenue on run time for the Musical &
# Performing Arts table, followed by its coefficient and fit summary.
  RegMusical_PerformArt <-lm(Musical_PerformingArts$`revenue (Millions)` ~ Musical_PerformingArts$runtime, data = Musical_PerformingArts)
summary(RegMusical_PerformArt)
## 
## Call:
## lm(formula = Musical_PerformingArts$`revenue (Millions)` ~ Musical_PerformingArts$runtime, 
##     data = Musical_PerformingArts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -190.40 -103.73  -39.67   76.99  352.10 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                     545.340    364.640   1.496    0.166
## Musical_PerformingArts$runtime   -3.586      3.180  -1.128    0.286
## 
## Residual standard error: 159.1 on 10 degrees of freedom
## Multiple R-squared:  0.1128, Adjusted R-squared:  0.02411 
## F-statistic: 1.272 on 1 and 10 DF,  p-value: 0.2858
# Predictor (run time) on x and response (revenue) on y, matching the model
# revenue ~ runtime, so that abline() draws the actual fitted line.
plot(Musical_PerformingArts$runtime, Musical_PerformingArts$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue", main = "Run Time Impact on Revenue for Musical & Performing Arts")
abline(RegMusical_PerformArt, col="Red")

#equation: revenue = 545.340 - 3.586*run time (minutes). run time explains 11.28% of the variance in revenue (R-squared = 0.1128; not significant, p = 0.286).

#Mystery & Suspense
# Simple linear regression of revenue on run time for the Mystery & Suspense
# table, followed by its coefficient and fit summary.
  RegMystery_Suspense <-lm(Mystery_Suspense$`revenue (Millions)` ~ Mystery_Suspense$runtime, data = Mystery_Suspense)
summary(RegMystery_Suspense)
## 
## Call:
## lm(formula = Mystery_Suspense$`revenue (Millions)` ~ Mystery_Suspense$runtime, 
##     data = Mystery_Suspense)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -133.86 -103.09  -73.26    8.20  840.79 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)               89.2316   153.3081   0.582    0.563
## Mystery_Suspense$runtime   0.3808     1.3744   0.277    0.783
## 
## Residual standard error: 190.6 on 58 degrees of freedom
## Multiple R-squared:  0.001322,   Adjusted R-squared:  -0.0159 
## F-statistic: 0.07677 on 1 and 58 DF,  p-value: 0.7827
# Predictor (run time) on x and response (revenue) on y, matching the model
# revenue ~ runtime, so that abline() draws the actual fitted line.
plot(Mystery_Suspense$runtime, Mystery_Suspense$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue", main = "Run Time Impact on Revenue for Mystery & Suspense")
abline(RegMystery_Suspense, col="Red")

#equation: revenue = 89.23 + 0.3808*run time (minutes). run time explains only 0.1322% of the variance in revenue (R-squared = 0.001322; not significant, p = 0.783).

#Other
# Simple linear regression of revenue on run time for the Other table,
# followed by its coefficient and fit summary.
  Reg_Other <-lm(Other$`revenue (Millions)` ~ Other$runtime, data = Other)
summary(Reg_Other)
## 
## Call:
## lm(formula = Other$`revenue (Millions)` ~ Other$runtime, data = Other)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -158.55 -130.96 -116.29  -54.89  599.53 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)    95.1628   353.9078   0.269    0.792
## Other$runtime   0.5701     3.1743   0.180    0.860
## 
## Residual standard error: 262.1 on 14 degrees of freedom
## Multiple R-squared:  0.002298,   Adjusted R-squared:  -0.06897 
## F-statistic: 0.03225 on 1 and 14 DF,  p-value: 0.86
# Predictor (run time) on x and response (revenue) on y, matching the model
# revenue ~ runtime, so that abline() draws the actual fitted line.
plot(Other$runtime, Other$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue", main = "Run Time Impact on Revenue for Other")
abline(Reg_Other, col="Red")

#equation: revenue = 95.16 + 0.5701*run time (minutes). run time explains only 0.2298% of the variance in revenue (R-squared = 0.002298; not significant, p = 0.86).

#Science Fiction & Fantasy
# Simple linear regression of revenue on run time for the Science Fiction &
# Fantasy table, followed by its coefficient and fit summary.
  RegScienceFic_Fantasy <-lm(Science_FictionFantasy$`revenue (Millions)` ~ Science_FictionFantasy$runtime, data = Science_FictionFantasy)
summary(RegScienceFic_Fantasy)
## 
## Call:
## lm(formula = Science_FictionFantasy$`revenue (Millions)` ~ Science_FictionFantasy$runtime, 
##     data = Science_FictionFantasy)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -83.71 -39.24 -24.36  36.10 122.07 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                    -101.089    112.621  -0.898    0.399
## Science_FictionFantasy$runtime    1.666      1.088   1.532    0.169
## 
## Residual standard error: 74.27 on 7 degrees of freedom
## Multiple R-squared:  0.251,  Adjusted R-squared:  0.1441 
## F-statistic: 2.346 on 1 and 7 DF,  p-value: 0.1694
# Predictor (run time) on x and response (revenue) on y, matching the model
# revenue ~ runtime, so that abline() draws the actual fitted line.
plot(Science_FictionFantasy$runtime, Science_FictionFantasy$`revenue (Millions)`,
     xlab = "Run Time", ylab = "Revenue", main = "Run Time Impact on Revenue for Science Fiction & Fantasy")
abline(RegScienceFic_Fantasy, col="Red")

#equation: revenue = -101.089 + 1.666*run time (minutes). run time explains 25.1% of the variance in revenue (R-squared = 0.251; not significant, p = 0.169; only 9 movies).

Facebook Likes

# Do Facebook likes affect Revenue?
# Correlation test between Facebook likes and revenue, restricted to the
# top 300 movies by revenue; the result is printed straight away.
correlation_test_FB_vs_Rev<- cor.test(top_300_movies$Facebook_Likes, top_300_movies$`revenue (Millions)`)
print(correlation_test_FB_vs_Rev)
## 
##  Pearson's product-moment correlation
## 
## data:  top_300_movies$Facebook_Likes and top_300_movies$`revenue (Millions)`
## t = 13.672, df = 298, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5460041 0.6858766
## sample estimates:
##       cor 
## 0.6208573
# Significant Result

Strong positive correlation between higher Facebook likes and higher revenue

# Scatter plot of Facebook likes against revenue for the top 300 movies,
# overlaid with a straight-line (lm) trend and no confidence band.
ggplot(data = top_300_movies,
       mapping = aes(x = Facebook_Likes, y = `revenue (Millions)`)) +
  geom_point() +
  geom_smooth(method = "lm", col = "blue", se = FALSE) +
  theme_minimal() +
  labs(
    title = "Facebook Likes vs Gross Revenue Top 300 Grossing Movies",
    x = "Facebook Likes",
    y = "Gross Revenue ($Millions)"
  )
## `geom_smooth()` using formula = 'y ~ x'

Predictive Analysis

Predicted revenue for the average movie

# Linear regression model to predict revenue from budget, run time, the three
# rating scores and Facebook likes.
# The formula uses syntactic column names (revenue..Millions.,
# budget..Millions.), but read_csv() keeps the raw headers, which the rest of
# the script addresses as `revenue (Millions)` and `budget (Millions)`.
# make.names() turns those raw headers into exactly the syntactic names used
# here and leaves already-syntactic names unchanged, so the model is fitted
# on a renamed copy and Movie_Data itself is not modified.
model_data <- Movie_Data
names(model_data) <- make.names(names(model_data), unique = TRUE)
model <- lm(revenue..Millions. ~ budget..Millions. + runtime + imdb_rating + critics_score + audience_score + Facebook_Likes, data = model_data)
summary(model)
## 
## Call:
## lm(formula = revenue..Millions. ~ budget..Millions. + runtime + 
##     imdb_rating + critics_score + audience_score + Facebook_Likes, 
##     data = Movie_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -402.93  -53.18   -8.03   28.78 1517.66 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -5.620e+01  3.144e+01  -1.788   0.0743 .  
## budget..Millions.  1.800e+00  1.345e-01  13.382   <2e-16 ***
## runtime           -2.454e-01  2.706e-01  -0.907   0.3647    
## imdb_rating        2.922e-01  1.995e+00   0.146   0.8836    
## critics_score      7.969e-01  3.590e-01   2.219   0.0268 *  
## audience_score     7.061e-01  4.289e-01   1.646   0.1002    
## Facebook_Likes     9.669e-04  6.430e-05  15.036   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.8 on 657 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6039, Adjusted R-squared:  0.6003 
## F-statistic:   167 on 6 and 657 DF,  p-value: < 2.2e-16
# Predicting box office sales for a new movie using the average budget.
# `average_budget` is not computed anywhere in the visible script, so it
# falls back to the mean budget across all movies when it is not already
# defined; an existing value is left untouched.
if (!exists("average_budget")) {
  average_budget <- mean(Movie_Data$`budget (Millions)`, na.rm = TRUE)
}

# One-row data frame describing the hypothetical movie; the column names
# must match the predictor names in the model formula.
new_movie <- data.frame(
  budget..Millions. = average_budget, # using the calculated average budget
  runtime = mean(Movie_Data$runtime, na.rm = TRUE), # using average runtime
  imdb_rating = mean(Movie_Data$imdb_rating, na.rm = TRUE), # using average IMDB rating
  critics_score = 55, # given critics score
  audience_score = mean(Movie_Data$audience_score, na.rm = TRUE), # using average audience score
  Facebook_Likes = 1250 # given Facebook likes
)

predicted_revenue <- predict(model, newdata = new_movie)
print(predicted_revenue)
##        1 
## 76.99852

The average movie is predicted to generate about $77.0M in revenue

Predicting revenue with new advertising campaign

# Adjusting for the advertising campaign: copy the baseline movie and change
# only its Facebook likes, so every other predictor stays fixed.
new_movie_campaign <- new_movie 
new_movie_campaign$Facebook_Likes <- new_movie$Facebook_Likes * 100 # scale Facebook likes 100x (1,250 -> 125,000), i.e. to 10,000% of the original (a 9,900% increase)

# Predicting the revenue after the advertising campaign 
predicted_revenue_campaign <- predict(model, newdata = new_movie_campaign) 
print(predicted_revenue_campaign) 
##        1 
## 196.6487
# Calculating the increase in revenue due to the campaign: the difference
# between the campaign prediction and the baseline prediction.
revenue_increase <- predicted_revenue_campaign - predicted_revenue 
print(revenue_increase)
##        1 
## 119.6502

With the new campaign the movie is predicted to generate $196.6M in revenue, an increase of about $119.7M

Budget

#Is there a relationship between budget and revenue? 
# Simple linear regression of revenue (response) on budget (predictor) over
# all movies, followed by its coefficient and fit summary.
Budget_reg <-lm(Movie_Data$`revenue (Millions)`~Movie_Data$`budget (Millions)`, data = Movie_Data)
summary(Budget_reg)
## 
## Call:
## lm(formula = Movie_Data$`revenue (Millions)` ~ Movie_Data$`budget (Millions)`, 
##     data = Movie_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -402.96  -58.06  -26.12   13.71 1449.83 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     16.0530     8.3086   1.932   0.0538 .  
## Movie_Data$`budget (Millions)`   3.0115     0.1364  22.086   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 160.6 on 663 degrees of freedom
## Multiple R-squared:  0.4239, Adjusted R-squared:  0.423 
## F-statistic: 487.8 on 1 and 663 DF,  p-value: < 2.2e-16
# Budget (the predictor) goes on the x-axis and revenue (the response) on the
# y-axis so the scatter matches the fitted model revenue ~ budget. abline()
# draws intercept + slope * x, which is only the regression line when x is
# the predictor; with the axes swapped the line was meaningless.
plot(Movie_Data$`budget (Millions)`, Movie_Data$`revenue (Millions)`,
     xlab = "Budget", ylab = "Revenue", main = "Budget Impact on Revenue")
abline(Budget_reg, col="Red")

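#budget has a positive relationship with revenue: as budget increases, revenue increases.
#...about 42.4% of the variance in revenue is explained by budget (R-squared = 0.4239, adjusted 0.423).
#...fitted equation from Budget_reg: revenue = 16.053 + 3.0115*budget (both in $ millions).
#... budget = 20.97 + 0.1408(revenue) to find what budget would give a specific revenue. These coefficients do not come from Budget_reg; they appear to come from a separate regression of budget on revenue that is not shown here.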

Genre

# Grouping genres into broader categories.
# The result is stored in a NEW column, genre_group. Overwriting
# Movie_Data$genre would destroy the original genre labels that the ANOVA
# and Tukey comparisons further down rely on.
# The label sets include the genre names actually present in the data
# ("Action & Adventure", "Mystery & Suspense", ...) alongside the shorter
# names used originally; without them almost every movie fell into "Other".
Movie_Data$genre_group <- case_when(
  Movie_Data$genre %in% c("Drama", "Comedy", "Romance", "Musical",
                          "Musical & Performing Arts") ~ "Drama/Comedy",
  Movie_Data$genre %in% c("Action", "Adventure", "Thriller", "Crime",
                          "Action & Adventure",
                          "Mystery & Suspense") ~ "Action/Thriller",
  Movie_Data$genre %in% c("Horror", "Sci-Fi", "Fantasy",
                          "Science Fiction & Fantasy") ~ "Horror/Sci-Fi",
  TRUE ~ "Other"
)

# Visualizing the average revenue by genre group. The revenue column is
# referenced by its real name, `revenue (Millions)`, as elsewhere in the
# script.
ggplot(Movie_Data, aes(x = genre_group, y = `revenue (Millions)`, fill = genre_group)) +
  geom_bar(stat = "summary", fun = "mean") +
  theme_minimal() +
  labs(title = "Average Revenue by Genre", x = "Genre", y = "Average Revenue (Millions)", fill = "Genre")

#2
#Does genre have an impact on revenue? Use ANOVA test to evaluate.
library(dplyr)

# genre is converted to a factor and revenue to numeric before fitting.
Movie_Data$genre<- as.factor(Movie_Data$genre)
Movie_Data$`revenue (Millions)`<- as.numeric(Movie_Data$`revenue (Millions)`)
# One-way ANOVA of revenue across genres.
Genre_ANOVA <-aov(Movie_Data$`revenue (Millions)`~ Movie_Data$genre)

# Tukey honest-significant-difference comparison of every pair of genres;
# the table lists the mean difference, its 95% family-wise confidence bounds
# and the adjusted p-value for each pair.
TukeyHSD(Genre_ANOVA)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Movie_Data$`revenue (Millions)` ~ Movie_Data$genre)
## 
## $`Movie_Data$genre`
##                                                             diff        lwr
## Animation-Action & Adventure                         107.4337607 -136.51089
## Art House & International-Action & Adventure          -2.3791758 -204.46983
## Comedy-Action & Adventure                              8.1271564 -103.78108
## Documentary-Action & Adventure                         1.1631456 -123.89021
## Drama-Action & Adventure                               7.5061218  -86.01077
## Horror-Action & Adventure                             59.6289298 -106.77904
## Musical & Performing Arts-Action & Adventure           8.2890385 -207.21302
## Mystery & Suspense-Action & Adventure                  2.0412051 -120.75249
## Other-Action & Adventure                              28.5034135 -162.91302
## Science Fiction & Fantasy-Action & Adventure         -61.9151282 -305.85978
## Art House & International-Animation                 -109.8129365 -402.85638
## Comedy-Animation                                     -99.3066042 -339.21760
## Documentary-Animation                               -106.2706151 -352.58784
## Drama-Animation                                      -99.9276389 -331.83116
## Horror-Animation                                     -47.8048309 -317.48138
## Musical & Performing Arts-Animation                  -99.1447222 -401.59302
## Mystery & Suspense-Animation                        -105.3925556 -350.57029
## Other-Animation                                      -78.9303472 -364.71712
## Science Fiction & Fantasy-Animation                 -169.3488889 -492.67972
## Comedy-Art House & International                      10.5063323 -186.69641
## Documentary-Art House & International                  3.5423214 -201.40600
## Drama-Art House & International                        9.8852976 -177.49368
## Horror-Art House & International                      62.0081056 -170.49368
## Musical & Performing Arts-Art House & International   10.6682143 -259.15878
## Mystery & Suspense-Art House & International           4.4203810 -199.15703
## Other-Art House & International                       30.8825893 -220.12681
## Science Fiction & Fantasy-Art House & International  -59.5359524 -352.57940
## Documentary-Comedy                                    -6.9640108 -123.95391
## Drama-Comedy                                          -0.6210346  -83.04495
## Horror-Comedy                                         51.5017733 -108.93483
## Musical & Performing Arts-Comedy                       0.1618820 -210.76330
## Mystery & Suspense-Comedy                             -6.0859513 -120.65726
## Other-Comedy                                          20.3762570 -165.87235
## Science Fiction & Fantasy-Comedy                     -70.0422846 -309.95328
## Drama-Documentary                                      6.3429762  -93.19893
## Horror-Documentary                                    58.4657842 -111.40120
## Musical & Performing Arts-Documentary                  7.1258929 -211.05825
## Mystery & Suspense-Documentary                         0.8780595 -126.56398
## Other-Documentary                                     27.3402679 -167.09078
## Science Fiction & Fantasy-Documentary                -63.0782738 -309.39550
## Horror-Drama                                          52.1228080  -96.07254
## Musical & Performing Arts-Drama                        0.7829167 -200.98772
## Mystery & Suspense-Drama                              -5.4649167 -102.15277
## Other-Drama                                           20.9972917 -154.81653
## Science Fiction & Fantasy-Drama                      -69.4212500 -301.32477
## Musical & Performing Arts-Horror                     -51.3398913 -295.58892
## Mystery & Suspense-Horror                            -57.5877246 -225.79814
## Other-Horror                                         -31.1255163 -254.41166
## Science Fiction & Fantasy-Horror                    -121.5440580 -391.22061
## Mystery & Suspense-Musical & Performing Arts          -6.2478333 -223.14475
## Other-Musical & Performing Arts                       20.2143750 -241.71353
## Science Fiction & Fantasy-Musical & Performing Arts  -70.2041667 -372.65246
## Other-Mystery & Suspense                              26.4622083 -166.52323
## Science Fiction & Fantasy-Mystery & Suspense         -63.9563333 -309.13407
## Science Fiction & Fantasy-Other                      -90.4185417 -376.20532
##                                                           upr     p adj
## Animation-Action & Adventure                        351.37841 0.9425798
## Art House & International-Action & Adventure        199.71147 1.0000000
## Comedy-Action & Adventure                           120.03539 1.0000000
## Documentary-Action & Adventure                      126.21651 1.0000000
## Drama-Action & Adventure                            101.02301 1.0000000
## Horror-Action & Adventure                           226.03689 0.9865860
## Musical & Performing Arts-Action & Adventure        223.79110 1.0000000
## Mystery & Suspense-Action & Adventure               124.83490 1.0000000
## Other-Action & Adventure                            219.91984 0.9999941
## Science Fiction & Fantasy-Action & Adventure        182.02952 0.9992053
## Art House & International-Animation                 183.23051 0.9812707
## Comedy-Animation                                    140.60439 0.9620224
## Documentary-Animation                               140.04661 0.9498151
## Drama-Animation                                     131.97588 0.9502261
## Horror-Animation                                    221.87172 0.9999694
## Musical & Performing Arts-Animation                 203.30357 0.9932696
## Mystery & Suspense-Animation                        139.78518 0.9510165
## Other-Animation                                     206.85643 0.9983553
## Science Fiction & Fantasy-Animation                 153.98194 0.8395632
## Comedy-Art House & International                    207.70908 1.0000000
## Documentary-Art House & International               208.49064 1.0000000
## Drama-Art House & International                     197.26427 1.0000000
## Horror-Art House & International                    294.50989 0.9987798
## Musical & Performing Arts-Art House & International 280.49521 1.0000000
## Mystery & Suspense-Art House & International        207.99779 1.0000000
## Other-Art House & International                     281.89199 0.9999991
## Science Fiction & Fantasy-Art House & International 233.50750 0.9998917
## Documentary-Comedy                                  110.02588 1.0000000
## Drama-Comedy                                         81.80288 1.0000000
## Horror-Comedy                                       211.93838 0.9943041
## Musical & Performing Arts-Comedy                    211.08706 1.0000000
## Mystery & Suspense-Comedy                           108.48536 1.0000000
## Other-Comedy                                        206.62486 0.9999997
## Science Fiction & Fantasy-Comedy                    169.86871 0.9973765
## Drama-Documentary                                   105.88489 1.0000000
## Horror-Documentary                                  228.33277 0.9901411
## Musical & Performing Arts-Documentary               225.31004 1.0000000
## Mystery & Suspense-Documentary                      128.32010 1.0000000
## Other-Documentary                                   221.77131 0.9999966
## Science Fiction & Fantasy-Documentary               183.23895 0.9991410
## Horror-Drama                                        200.31815 0.9883566
## Musical & Performing Arts-Drama                     202.55355 1.0000000
## Mystery & Suspense-Drama                             91.22294 1.0000000
## Other-Drama                                         196.81111 0.9999993
## Science Fiction & Fantasy-Drama                     162.48227 0.9967714
## Musical & Performing Arts-Horror                    192.90914 0.9998522
## Mystery & Suspense-Horror                           110.62269 0.9905398
## Other-Horror                                        192.16062 0.9999969
## Science Fiction & Fantasy-Horror                    148.13249 0.9333949
## Mystery & Suspense-Musical & Performing Arts        210.64908 1.0000000
## Other-Musical & Performing Arts                     282.14228 1.0000000
## Science Fiction & Fantasy-Musical & Performing Arts 232.24413 0.9996386
## Other-Mystery & Suspense                            219.44765 0.9999973
## Science Fiction & Fantasy-Mystery & Suspense        181.22141 0.9989919
## Science Fiction & Fantasy-Other                     195.36823 0.9949308
#Based on the p-values, there is no significant difference in revenue between genres. We cannot confidently
#..say that one genre will generate more revenue than another.
#plotting average revenue per genre
library(dplyr)

# Ensure genre is categorical and revenue is numeric before aggregating.
Movie_Data[["genre"]] <- as.factor(Movie_Data[["genre"]])
Movie_Data[["revenue (Millions)"]] <-
  as.numeric(Movie_Data[["revenue (Millions)"]])

# Mean revenue for each genre; rows with a missing revenue are ignored.
RevenueByGenre <- summarise(
  group_by(Movie_Data, genre),
  Average_Revenue = mean(`revenue (Millions)`, na.rm = TRUE)
)

library(ggplot2)
# Bar chart with one bar per genre; bar height is that genre's mean revenue.
ggplot(data = RevenueByGenre, mapping = aes(x = genre, y = Average_Revenue)) +
  geom_col() +
  labs(
    title = "Average Revenue by Genre",
    x = "Genre",
    y = "Average Revenue (Millions)"
  ) +
  theme_minimal()

#Based on the graph, animation and horror have the highest average revenue; average revenue is relatively flat across the other genres.
#..A horror or animation movie might be the best approach for a new movie aiming for revenue success.

Genre and Rating

#evaluating whether genre and rating have an impact on revenue.
library(dplyr)
library(ggplot2)

# Treat genre and MPAA rating as categorical factors and revenue as numeric.
Movie_Data[["genre"]] <- as.factor(Movie_Data[["genre"]])
Movie_Data[["mpaa_rating"]] <- as.factor(Movie_Data[["mpaa_rating"]])
Movie_Data[["revenue (Millions)"]] <-
  as.numeric(Movie_Data[["revenue (Millions)"]])

# Number of movies in every genre / rating combination.
group_counts <- summarise(
  group_by(Movie_Data, genre, mpaa_rating),
  n = n()
)
## `summarise()` has grouped output by 'genre'. You can override using the
## `.groups` argument.
# Keep only the genre / rating cells that contain at least two movies;
# a cell with a single movie has no within-cell spread to estimate.
filtered_data <- ungroup(
  filter(
    group_by(Movie_Data, genre, mpaa_rating),
    n() >= 2
  )
)

# Two-way ANOVA of revenue on genre, MPAA rating and their interaction.
anova_result <- aov(
  `revenue (Millions)` ~ genre * mpaa_rating,
  data = filtered_data
)
summary(anova_result)
##                    Df   Sum Sq Mean Sq F value Pr(>F)  
## genre              10   217521   21752   0.484 0.9008  
## mpaa_rating         5   449730   89946   2.001 0.0767 .
## genre:mpaa_rating  20  1043885   52194   1.161 0.2824  
## Residuals         621 27911107   44945                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Shapiro-Wilk test on the ANOVA residuals to check the normality assumption;
# a small p-value indicates the residuals are not normally distributed.
shapiro.test(residuals(anova_result))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(anova_result)
## W = 0.70159, p-value < 2.2e-16
# Bartlett's test of equal revenue variance across the genre x rating cells;
# each cell is one level of interaction(genre, mpaa_rating).
# A small p-value indicates the cell variances differ.
bartlett_test_result <- bartlett.test(`revenue (Millions)` ~ interaction(genre, mpaa_rating), data = filtered_data)
print(bartlett_test_result)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  revenue (Millions) by interaction(genre, mpaa_rating)
## Bartlett's K-squared = 191.62, df = 35, p-value < 2.2e-16