Set Working Directory

getwd()
## [1] "/Users/markyuhasz/FallWinter 2024 Classes/MKTG3P98-Business analytics and intell/Assignment 3"
setwd("/Users/markyuhasz/FallWinter 2024 Classes/MKTG3P98-Business analytics and intell/Assignment 3")

Activate Libraries that are Going to be Used

library(readxl)
library(ggplot2)
library(readr) # To Read CSV Files
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(performance)
library(patchwork)
library(ggplotify)
library(reshape2)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths

Import the Seperate Data Sets

Movie_General<-read.csv("Movie Dataset_General Audience.csv")
Movie_Financial<-read.csv("Movie Dataset_Financials.csv")

Viewing the Data Sets

View(Movie_General)
View(Movie_Financial)

Merge the Data Sets

Movie_Total<-merge(Movie_Financial,Movie_General,by= "original_title")

Find Any Duplicates in Original Title Name

Movie_Total_Clean<-clean_names(Movie_Total)
Movie_Total_Clean%>% get_dupes(original_title)
##                                   original_title dupe_count budget_millions
## 1                             Fantastic Four\xca          4            17.0
## 2                             Fantastic Four\xca          4            17.0
## 3                             Fantastic Four\xca          4            13.0
## 4                             Fantastic Four\xca          4            13.0
## 5                        Godzilla Resurgence\xca          4             8.9
## 6                        Godzilla Resurgence\xca          4             8.9
## 7                        Godzilla Resurgence\xca          4             1.9
## 8                        Godzilla Resurgence\xca          4             1.9
## 9                                   Hercules\xca          4            20.0
## 10                                  Hercules\xca          4            20.0
## 11                                  Hercules\xca          4             6.0
## 12                                  Hercules\xca          4             6.0
## 13                                       Pan\xca          4             4.0
## 14                                       Pan\xca          4             4.0
## 15                                       Pan\xca          4             3.0
## 16                                       Pan\xca          4             3.0
## 17                  The Fast and the Furious\xca          4            35.0
## 18                  The Fast and the Furious\xca          4            35.0
## 19                  The Fast and the Furious\xca          4            20.0
## 20                  The Fast and the Furious\xca          4            20.0
## 21                      The Legend of Tarzan\xca          4            35.0
## 22                      The Legend of Tarzan\xca          4            35.0
## 23                      The Legend of Tarzan\xca          4             6.0
## 24                      The Legend of Tarzan\xca          4             6.0
## 25 The Twilight Saga: Breaking Dawn - Part 2\xca          4             1.0
## 26 The Twilight Saga: Breaking Dawn - Part 2\xca          4             1.0
## 27 The Twilight Saga: Breaking Dawn - Part 2\xca          4             0.5
## 28 The Twilight Saga: Breaking Dawn - Part 2\xca          4             0.5
##    revenue_millions language country         type              genre runtime
## 1            136.62  English     USA Feature Film Mystery & Suspense     106
## 2            136.62  English     USA Feature Film              Drama     101
## 3             45.30  English     USA Feature Film Mystery & Suspense     106
## 4             45.30  English     USA Feature Film              Drama     101
## 5              4.16 Japanese   Japan Feature Film             Comedy      93
## 6              4.16 Japanese   Japan Feature Film              Drama      98
## 7            157.11 Japanese   Japan Feature Film             Comedy      93
## 8            157.11 Japanese   Japan Feature Film              Drama      98
## 9             34.08  English     USA Feature Film              Drama     105
## 10            34.08  English     USA  Documentary        Documentary     107
## 11             1.27  English     USA Feature Film              Drama     105
## 12             1.27  English     USA  Documentary        Documentary     107
## 13             1.60  English     USA Feature Film              Drama     130
## 14             1.60  English     USA Feature Film              Other      90
## 15             0.70  English     USA Feature Film              Drama     130
## 16             0.70  English     USA Feature Film              Other      90
## 17            77.48  English     USA Feature Film              Drama     104
## 18            77.48  English     USA Feature Film              Drama      95
## 19            18.22  English     USA Feature Film              Drama     104
## 20            18.22  English     USA Feature Film              Drama      95
## 21            82.35  English     USA  Documentary        Documentary      92
## 22            82.35  English     USA Feature Film              Drama      94
## 23            11.12  English     USA  Documentary        Documentary      92
## 24            11.12  English     USA Feature Film              Drama      94
## 25             3.39  English     USA  Documentary        Documentary      98
## 26             3.39  English     USA Feature Film             Comedy      95
## 27          1025.47  English     USA  Documentary        Documentary      98
## 28          1025.47  English     USA Feature Film             Comedy      95
##    mpaa_rating imdb_rating imdb_num_votes  critics_rating critics_score
## 1            R         6.3          25054          Rotten            43
## 2            R         2.1           9904           Fresh            35
## 3            R         6.3          25054          Rotten            43
## 4            R         2.1           9904           Fresh            35
## 5           PG         4.2         182983 Certified Fresh            34
## 6        PG-13         6.3          50340           Fresh            60
## 7           PG         4.2         182983 Certified Fresh            34
## 8        PG-13         6.3          50340           Fresh            60
## 9            R         3.0           9216          Rotten            50
## 10     Unrated         3.8          10522           Fresh            21
## 11           R         3.0           9216          Rotten            50
## 12     Unrated         3.8          10522           Fresh            21
## 13           R         2.0           9216          Rotten            34
## 14          PG         3.6           1010           Fresh            33
## 15           R         2.0           9216          Rotten            34
## 16          PG         3.6           1010           Fresh            33
## 17           R         7.1         128361           Fresh            68
## 18           R         7.2          35635 Certified Fresh            93
## 19           R         7.1         128361           Fresh            68
## 20           R         7.2          35635 Certified Fresh            93
## 21     Unrated         6.8           1942 Certified Fresh            66
## 22           R         3.4          57933          Rotten            25
## 23     Unrated         6.8           1942 Certified Fresh            66
## 24           R         3.4          57933          Rotten            25
## 25     Unrated         3.1           1010           Fresh            45
## 26          PG         7.5            880           Fresh            90
## 27     Unrated         3.1           1010           Fresh            45
## 28          PG         7.5            880           Fresh            90
##    audience_rating audience_score best_pic_nom facebook_likes
## 1          Spilled             49           no           1261
## 2          Upright             26           no          51261
## 3          Spilled             49           no           1261
## 4          Upright             26           no          51261
## 5          Upright             33          yes           5699
## 6          Spilled             67           no            699
## 7          Upright             33          yes           5699
## 8          Spilled             67           no            699
## 9          Upright             40           no          16235
## 10         Upright             20           no            235
## 11         Upright             40           no          16235
## 12         Upright             20           no            235
## 13         Upright             24           no            393
## 14         Upright             24           no            393
## 15         Upright             24           no            393
## 16         Upright             24           no            393
## 17         Upright             78           no         125327
## 18         Upright             80           no          45327
## 19         Upright             78           no         125327
## 20         Upright             80           no          45327
## 21         Upright             68           no         121175
## 22         Spilled             39           no          11175
## 23         Upright             68           no         121175
## 24         Spilled             39           no          11175
## 25         Upright             46           no           1177
## 26         Upright             89           no         359177
## 27         Upright             46           no           1177
## 28         Upright             89           no         359177

Delete all Duplicate Movie Titles

Movie_Total_Unique <- Movie_Total_Clean %>% distinct(original_title, .keep_all = TRUE)
print(head(Movie_Total_Unique, 10))
##                 original_title budget_millions revenue_millions language
## 1  10,000 B.C.\xca                        12.6            18.66         
## 2           102 Dalmatians\xca            45.0            60.22  English
## 3         2 Fast 2 Furious\xca            16.0            31.56  English
## 4                     2012\xca             3.5             0.75  English
## 5   300: Rise of an Empire\xca            16.0            19.68  English
## 6                 47 Ronin\xca            50.0           240.36  English
## 7           50 First Dates\xca            50.0            36.35  English
## 8         A Beautiful Mind\xca            26.0            43.32  English
## 9        A Christmas Carol\xca            65.0           235.67  English
## 10  A Good Day to Die Hard\xca             8.5             5.77  English
##    country         type              genre runtime mpaa_rating imdb_rating
## 1          Feature Film              Drama     134           R         6.8
## 2      USA Feature Film              Drama     108          PG         4.9
## 3      USA Feature Film Mystery & Suspense      97       PG-13         6.3
## 4      USA Feature Film             Comedy      98       PG-13         6.3
## 5      USA Feature Film Action & Adventure     111       PG-13         6.0
## 6      USA Feature Film              Drama     106          PG         7.8
## 7      USA Feature Film             Comedy      87           R         5.4
## 8      USA Feature Film Action & Adventure      83           G         7.6
## 9      USA Feature Film              Drama     100           R         7.0
## 10     USA Feature Film Action & Adventure      86           R         4.1
##    imdb_num_votes  critics_rating critics_score audience_rating audience_score
## 1            9025           Fresh            60         Upright             76
## 2            5136          Rotten             5         Spilled             13
## 3           54771          Rotten            40         Spilled             49
## 4            8646 Certified Fresh            44         Spilled             54
## 5          103789          Rotten            51         Spilled             51
## 6           12450           Fresh            75         Spilled             85
## 7            6811          Rotten            35         Spilled             31
## 8           78862          Rotten            50         Upright             81
## 9            8320          Rotten            70         Spilled             74
## 10            739           Fresh            53         Upright             42
##    best_pic_nom facebook_likes
## 1            no          23343
## 2            no          84182
## 3            no          35296
## 4            no            445
## 5            no          21583
## 6            no          20965
## 7            no          12952
## 8            no          52827
## 9            no          48878
## 10           no           5481

Catagorize and Group Based on Genre

Movie_Total_Unique <- Movie_Total_Unique %>% #Call Dataframe and Create New Column with New Groupings
  mutate(Genre1 = case_when(genre == "Science Fiction & Fantasy" ~ "SciFi",
                            genre == "Mystery & Suspense" ~ "Mystery",
                            genre == "Drama" ~ "Drama",
                            genre == "Documentary" ~ "Documentary",
                            genre == "Comedy" ~ "Comedy",
                            genre == "Art House & International" ~ "Arts",
                            genre == "Other" ~ "Other",
                            genre == "Action & Adventure" ~ "Action",
                            genre == "Animation" ~ "Animation",
                            genre == "Horror" ~ "Horror",
                            genre == "Musical & Performing Arts" ~ "Arts",
                            TRUE ~ "Check"))

Mutate Dataset for Predictive Modeling

MT <- Movie_Total_Unique %>%
  mutate(
    budget_log = log(budget_millions),
    revenue_log = log(revenue_millions)
  )

Transform IMDB rating and correct outliers to be out of 100 so same scale is used

#Changes the scale of the imdb rating to a scale of 100
MT$imdb_rating <- MT$imdb_rating * 10

MT$genre <- as.factor(MT$genre)
# replace outliers of IMDB score that are greater than 100
MT$imdb_rating[MT$imdb_rating > 100]
## [1] 550 440
# which rows are responsible for the over 100
outlier <- MT[MT$imdb_rating>100,]
print(outlier)
##                 original_title budget_millions revenue_millions language
## 447 The Amazing Spider-Man\xca              30            91.71  English
## 638  X-Men: The Last Stand\xca               6            35.40  English
##     country         type       genre runtime mpaa_rating imdb_rating
## 447     USA  Documentary Documentary      82     Unrated         550
## 638  Canada Feature Film Documentary     112           R         440
##     imdb_num_votes critics_rating critics_score audience_rating audience_score
## 447           3459          Fresh            55         Upright             56
## 638           4908         Rotten            40         Spilled             47
##     best_pic_nom facebook_likes      Genre1 budget_log revenue_log
## 447           no           2489 Documentary   3.401197    4.518631
## 638           no          21714 Documentary   1.791759    3.566712
# change the outliers numbers to the actual numbers based on the website
MT$imdb_rating[c(447, 638)] <- c(69, 66)

Rename the revenue column to a more manageable name

Movie_Total <- clean_names(Movie_Total)
colnames(Movie_Total)
##  [1] "original_title"   "budget_millions"  "revenue_millions" "language"        
##  [5] "country"          "type"             "genre"            "runtime"         
##  [9] "mpaa_rating"      "imdb_rating"      "imdb_num_votes"   "critics_rating"  
## [13] "critics_score"    "audience_rating"  "audience_score"   "best_pic_nom"    
## [17] "facebook_likes"
Movie_Total <- Movie_Total %>%
  rename(revenue_millions = revenue_millions)

Renaming the column revenue_millions for better readability and consistency.

We want to answer how are certain factors driving overall revenue (critics score, audience, runtime, imdb and genre)
Movie_Data <- Movie_Total %>%
  select(revenue_millions, critics_score, audience_score, runtime, imdb_rating, genre)

Extracting only the columns relevant to the analysis to streamline further operations.

Remove NA values to prevent errors during analysis

Movie_Data <- na.omit(Movie_Data)

Analyze average factors by genre category

Factors_by_Genre <- Movie_Data %>%
  group_by(genre) %>%
  summarise(
    Avg_Critics_Score = mean(critics_score, na.rm = TRUE),
    Avg_Audience_Score = mean(audience_score, na.rm = TRUE),
    Avg_Runtime = mean(runtime, na.rm = TRUE),
    Avg_IMDb_Rating = mean(imdb_rating, na.rm = TRUE),
    Avg_Revenue = mean(revenue_millions, na.rm = TRUE)
  )

Visualize average revenue by genre

ggplot(Factors_by_Genre, aes(x = genre, y = Avg_Revenue, fill = genre)) +
  geom_bar(stat = "identity") +
  labs(title = "Average Revenue by Genre", x = "Genre", y = "Average Revenue (Millions)") +
  theme_minimal()

### Interpretation: #### Animation leads as the highest revenue-generating genre, followed by Science Fiction & Fantasy and Mystery & Suspense, showcasing their strong audience appeal. Conversely, Art House & International and Documentaries have lower revenues, reflecting their niche markets. Most other genres, like Comedy and Drama, exhibit stable mid-range performance, suggesting balanced but less exceptional financial returns.

Creating a scatter plot to explore the relationship between critics’ scores and revenue for each genre.

ggplot(Movie_Data, aes(x = critics_score, y = revenue_millions, color = genre)) +
  geom_point(alpha = 0.7) +
  labs(title = "Critics Score vs Revenue by Genre", x = "Critics Score", y = "Revenue (Millions)") +
  theme_minimal()

Identify correlations between factors and revenue for each genre

Top_Factors_By_Genre <- Movie_Data %>%
  group_by(genre) %>%
  summarise(
    Cor_Critics = cor(revenue_millions, critics_score, use = "complete.obs"),
    Cor_Audience = cor(revenue_millions, audience_score, use = "complete.obs"),
    Cor_Runtime = cor(revenue_millions, runtime, use = "complete.obs"),
    Cor_IMDb = cor(revenue_millions, imdb_rating, use = "complete.obs")
  ) %>%
  pivot_longer(cols = starts_with("Cor"), names_to = "Factor", values_to = "Correlation") %>%
  arrange(genre, desc(Correlation))

Calculating and reshaping correlations to understand how each factor affects revenue by genre.

Plot top two driving factors for each genre

ggplot(Top_Factors_By_Genre, aes(x = genre, y = Correlation, fill = Factor)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Top Factors Driving Revenue by Genre", x = "Genre", y = "Correlation") +
  theme_minimal()

#### To maximize box office revenue: Prioritize audience engagement through targeted marketing and viewer-centric storytelling. Tailor runtimes by genre, investing in longer narratives for Animation and Sci-Fi, while keeping Horror and Musicals short. Leverage IMDb Ratings as a quality benchmark to ensure broad public appeal.

#Question 1: Do Some Genres Generate More Revenue Than Others ## Create Table

Avg_Revenue <- Movie_Total_Unique %>%
  group_by(Genre1) %>%
  summarise(mean_revenue = mean(revenue_millions, na.rm = TRUE))
print(Avg_Revenue)
## # A tibble: 10 × 2
##    Genre1      mean_revenue
##    <chr>              <dbl>
##  1 Action             129. 
##  2 Animation          237. 
##  3 Arts               132. 
##  4 Comedy             128. 
##  5 Documentary        120. 
##  6 Drama              140. 
##  7 Horror             189. 
##  8 Mystery            133. 
##  9 Other              180. 
## 10 SciFi               67.2

Graph

ggplot(Avg_Revenue, aes(x = Genre1, y = mean_revenue)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Average Revenue by Genre", x = "Genre", y = "Average Revenue (Millions)")

### Therefore, Animation films tend to generate the most average revenue.

Question 2: Do Critics have a Particular Preference for Certain Genres?

Create Table

Avg_Critic_Score <- Movie_Total_Unique %>%
  group_by(Genre1) %>%
  summarise(mean_critics_score = mean(critics_score, na.rm = TRUE))

Graph

ggplot(Avg_Critic_Score, aes(x = Genre1, y = mean_critics_score)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Average Critics Score by Genre", x = "Genre", y = "Average Critics Score")

### Therefore, Critics tend to have a preference for Documentary Films

Question 3: Is Critic Score more Important than Viewer Score?

Correlation Analysis

cor_critics <- cor(Movie_Total_Unique$critics_score, Movie_Total_Unique$revenue_millions, use = "complete.obs")
cor_audience <- cor(Movie_Total_Unique$audience_score, Movie_Total_Unique$revenue_millions, use = "complete.obs")
cat("Correlation with Critics Score:", cor_critics, "\n")
## Correlation with Critics Score: 0.392774
cat("Correlation with Audience Score:", cor_audience, "\n")
## Correlation with Audience Score: 0.4428168

Therefore, Audience score has a stronger linear relationshio with revenue (millions)

Multiple Linear Regression

lm_model <- lm(revenue_millions ~ critics_score + audience_score, data = Movie_Total_Unique)
summary(lm_model)
## 
## Call:
## lm(formula = revenue_millions ~ critics_score + audience_score, 
##     data = Movie_Total_Unique)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -279.37  -88.33  -25.64   29.13 1835.76 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -119.6096    21.4587  -5.574 3.67e-08 ***
## critics_score     1.1282     0.5019   2.248   0.0249 *  
## audience_score    3.4444     0.5540   6.218 9.10e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 186.3 on 641 degrees of freedom
## Multiple R-squared:  0.2024, Adjusted R-squared:  0.1999 
## F-statistic: 81.32 on 2 and 641 DF,  p-value: < 2.2e-16

Critics Score (1.1282): For every 1- unit increase in critics score, the revenue is expected to increase by 1.13 million dollars. The p-value (0.0249) is less than 0.05 meaning it is statistically significant

Audience Score (3.4444): For every 1- unit increase in audience score, the revenue is expected to increase by 3.44 million dollars. The p-value (9.10e - 10) is less than 0.05 meaning it is highly statistically significant

Audience score has a stronger impact on revenue compared to Critics score

Interaction Effects

# Add an interaction term
lm_interaction <- lm(revenue_millions ~ critics_score * audience_score, data = Movie_Total_Unique)
summary(lm_interaction)
## 
## Call:
## lm(formula = revenue_millions ~ critics_score * audience_score, 
##     data = Movie_Total_Unique)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -299.02  -84.21  -25.48   20.35 1834.23 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                  -59.2411    51.9350  -1.141   0.2544  
## critics_score                 -0.1927     1.1501  -0.168   0.8670  
## audience_score                 2.3249     1.0373   2.241   0.0254 *
## critics_score:audience_score   0.0217     0.0170   1.276   0.2023  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 186.2 on 640 degrees of freedom
## Multiple R-squared:  0.2044, Adjusted R-squared:  0.2007 
## F-statistic: 54.81 on 3 and 640 DF,  p-value: < 2.2e-16

The p- value (0.2023) is greater than 0.05 menaing the interaction between Critics score and Audience score is not statistically significant, meaning their combimed influeunce does not significally impact revneue beyond their individual effects

Bar Plot of Coefficients

#Coefficients from the Regression Model
coefficients <- data.frame(
  Variable = c("Critics Score", "Audience Score"),
  Coefficient = c(1.1282, 3.4444)
)
# Coefficients from the regression model
coefficients <- data.frame(
  Variable = c("Critics Score", "Audience Score"),
  Coefficient = c(1.1282, 3.4444)
)

# Bar plot
library(ggplot2)
ggplot(coefficients, aes(x = Variable, y = Coefficient, fill = Variable)) +
  geom_bar(stat = "identity", width = 0.6) +
  labs(
    title = "Impact of Critics and Audience Scores on Revenue",
    y = "Coefficient Value",
    x = "Variable"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("Critics Score" = "pink", "Audience Score" = "blue")) +
  geom_text(aes(label = round(Coefficient, 2)), vjust = -0.2)

### Audience Score is more important than Critics Score for predicting film revenue. It has a larger coefficient, meaning its influence on revenue is approximately 3 times greater than that of Critics Score.

Graph Audience Score vs Critics Score in Relation to Genre

options(repr.plot.res = 200, repr.plot.height = 5, repr.plot.width = 8)
ggplot(Movie_Total_Unique, aes(x = critics_score, y = audience_score)) +
  geom_point(aes(colour = genre), alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "black") + 
  labs(
    title = " Audience's Score vs Critics' Score",
    x = "Critics Score",
    y = "Audience Score",
    colour = "Genre"
  )
## `geom_smooth()` using formula = 'y ~ x'

### Movies with higher critics’ scores tend to have higher audience scores and vice versa, as seen from the upward-sloping trendline.

Visualize Correlation Between Audience Score and Critics Score and Revenue

correlation_matrix <- cor(Movie_Total_Unique[, c("critics_score", "audience_score", "revenue_millions")], use = "complete.obs")
print(correlation_matrix)
##                  critics_score audience_score revenue_millions
## critics_score        1.0000000      0.7734838        0.3927740
## audience_score       0.7734838      1.0000000        0.4428168
## revenue_millions     0.3927740      0.4428168        1.0000000

Plot Correlation Matrix

library(reshape2)
melted_correlation <- melt(correlation_matrix)
ggplot(melted_correlation, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1)) +
  labs(title = "Correlation Matrix", x = "", y = "") +
  theme_minimal()

### There is a strong positive correlation between Critics’ score and Audience score, as indicated by the darker red ### The correlation between revenue and Audience score is stronger (darker red) than the correlation between revenue and Critics’ score. ### While Critics’ score shows a positive correlation with revenue, the relationship appears weaker compared to Audience score, as suggested by the lighter shade of red.

Graph Each Predictor

# Critics' Score vs Revenue
ggplot(Movie_Total_Unique, aes(x = critics_score, y = log10(revenue_millions))) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", colour = "red") +
  labs(title = "Critics' Score vs Revenue (Log Scale)", x = "Critics Score", y = "Log10 Revenue")
## `geom_smooth()` using formula = 'y ~ x'

The graph shows a weak to moderate positive correlation between Critics’ Score and Revenue, indicating that higher Critics’ scores slightly increase movie revenue, but other factors likely play a more significant role.

# Audience Score vs Revenue
ggplot(Movie_Total_Unique, aes(x = audience_score, y = log10(revenue_millions))) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", colour = "blue") +
  labs(title = "Audience Score vs Revenue (Log Scale)", x = "Audience Score", y = "Log10 Revenue")
## `geom_smooth()` using formula = 'y ~ x'

Moderate positive correlation between Audience Score and Revenue, indicating that higher Audience scores are more strongly associated with increased movie revenue compared to Critics’ scores

Comparing R- Squared Values for Individual Predictors

model_critics <- lm(revenue_millions ~ critics_score, data = Movie_Total_Unique)
model_audience <- lm(revenue_millions ~ audience_score, data = Movie_Total_Unique)

summary_critics <- summary(model_critics)
summary_audience <- summary(model_audience)

cat("R-squared for Critics Score Model:", summary_critics$r.squared, "\n")
## R-squared for Critics Score Model: 0.1542714
cat("R-squared for Audience Score Model:", summary_audience$r.squared, "\n")
## R-squared for Audience Score Model: 0.1960867

About 15.4% of the variance in movie revenue is explained by Critics’ scores alone.

About 19.6% of the variance in movie revenue is explained by Audience scores alone.

Neither variable explains the majority of the variance in revenue (both R-squared values are below 0.2)

QUESTION 4: Audience Score vs Critics’ Score by Movie Budget

Find Max and Min Budget Value

max_budget <- max(Movie_Total_Unique$budget_millions, na.rm = TRUE) # Remove NA values
min_budget <- min(Movie_Total_Unique$budget_millions, na.rm = TRUE)
cat("Maximum budget:", max_budget, "\n")
## Maximum budget: 280
cat("Minimum budget:", min_budget, "\n")
## Minimum budget: 0.32

Break Budget Down into 4 Catagories

# Define 4 range groups
Movie_Total_Unique$budget_category <- cut(
  Movie_Total_Unique$budget_millions,
  breaks = c(0.32, 70, 140, 210, 280),
  labels = c("Low", "Moderate", "High", "Very High"),
  include.lowest = TRUE
)

Graph

options(repr.plot.res = 100, repr.plot.height = 5, repr.plot.width = 8)
ggplot(Movie_Total_Unique, aes(x = critics_score, y = audience_score)) +
  geom_point(aes(colour = budget_category)) +
  facet_wrap(~genre) +
  labs(
    title = "Audience Score vs Critics' Score by Budget Category",
    x = "Critics' Score",
    y = "Audience Score",
    colour = "Budget Category"
  )

The connection between Critics’ Scores and Audience Scores changes depending on the genre. Higher-budget genres like Drama and Action & Adventure tend to have more agreement, while lower-budget genres like Horror and Sci-Fi often show bigger differences. Documentaries have the most overlap between what critics and audiences like.

Look at Log regression compared to unmodified regression

ggplot(MT, aes(x = budget_millions, y = revenue_millions)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE, color = 'red')
## `geom_smooth()` using formula = 'y ~ x'

ggplot(MT, aes(x = budget_log, y = revenue_log)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE, color = 'red')
## `geom_smooth()` using formula = 'y ~ x'

Visualize Relationship of variables as they relate to Revenue

ggplot(MT) + aes(y = revenue_log, x = runtime, colour = audience_score, size = critics_score) +
         geom_point() +
         scale_color_gradientn(colors = c("red", "yellow", "green")) +
         labs(
           y = "Generated Revenue (Millions)",
           x = "Runtime (Min)",
           color = "Audience Score",
           size = "Critic Score"
         ) +
         theme_minimal() +
         facet_wrap(~ genre)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Another Visualization of Variables as they relate to Revenue

ggplot(MT) + aes(y = revenue_log, x = budget_log, colour = imdb_rating, size = runtime) +
         geom_point() +
         scale_color_gradientn(colors = c("red", "yellow", "green")) +
         labs(
           y = "Generated Revenue (Millions)",
           x = "Budget (Millions)",
           color = "IMDB Rating (1-100)",
           size = "Runtime (Min)"
         ) +
         theme_minimal() +
         facet_wrap(~ genre)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## Develop Multiple Linear Regression to see Pr(>|t|) ## Multiple Linear regression of all variables

MTM1<- lm(revenue_log ~ budget_log+facebook_likes+critics_score+audience_score+imdb_rating+runtime+genre+mpaa_rating+imdb_num_votes, data = MT)
summary(MTM1)
## 
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score + 
##     audience_score + imdb_rating + runtime + genre + mpaa_rating + 
##     imdb_num_votes, data = MT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1261 -0.5296  0.0487  0.5997  3.1415 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     2.073e-01  3.616e-01   0.573  0.56659    
## budget_log                      4.415e-01  3.542e-02  12.465  < 2e-16 ***
## facebook_likes                  5.001e-06  4.621e-07  10.822  < 2e-16 ***
## critics_score                   8.549e-03  2.726e-03   3.136  0.00179 ** 
## audience_score                  2.980e-03  4.160e-03   0.716  0.47411    
## imdb_rating                     2.924e-02  4.378e-03   6.679 5.35e-11 ***
## runtime                        -2.178e-03  2.208e-03  -0.986  0.32443    
## genreAnimation                  1.841e-02  3.840e-01   0.048  0.96178    
## genreArt House & International -6.032e-02  2.983e-01  -0.202  0.83983    
## genreComedy                    -9.428e-04  1.640e-01  -0.006  0.99542    
## genreDocumentary               -3.953e-01  2.172e-01  -1.820  0.06926 .  
## genreDrama                     -2.658e-01  1.401e-01  -1.898  0.05817 .  
## genreHorror                    -1.507e-01  2.455e-01  -0.614  0.53963    
## genreMusical & Performing Arts -4.941e-01  3.131e-01  -1.578  0.11506    
## genreMystery & Suspense        -1.871e-01  1.833e-01  -1.021  0.30785    
## genreOther                     -4.603e-02  2.929e-01  -0.157  0.87520    
## genreScience Fiction & Fantasy -1.782e-01  3.518e-01  -0.507  0.61260    
## mpaa_ratingNC-17                7.878e-01  7.451e-01   1.057  0.29074    
## mpaa_ratingPG                   1.860e-01  2.701e-01   0.689  0.49120    
## mpaa_ratingPG-13                3.529e-01  2.762e-01   1.278  0.20184    
## mpaa_ratingR                    2.308e-01  2.673e-01   0.864  0.38816    
## mpaa_ratingUnrated              8.311e-02  3.071e-01   0.271  0.78673    
## imdb_num_votes                 -9.223e-07  4.300e-07  -2.145  0.03234 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9816 on 620 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6522, Adjusted R-squared:  0.6399 
## F-statistic: 52.85 on 22 and 620 DF,  p-value: < 2.2e-16

Remove highest p-value and run model again

MTM2<- lm(revenue_log ~ budget_log+facebook_likes+critics_score+audience_score+imdb_rating+runtime+mpaa_rating+imdb_num_votes, data = MT)
summary(MTM2)
## 
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score + 
##     audience_score + imdb_rating + runtime + mpaa_rating + imdb_num_votes, 
##     data = MT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0938 -0.5563  0.0415  0.6230  3.1233 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.477e-01  3.200e-01   0.774  0.43918    
## budget_log          4.462e-01  3.488e-02  12.792  < 2e-16 ***
## facebook_likes      5.052e-06  4.582e-07  11.025  < 2e-16 ***
## critics_score       7.476e-03  2.684e-03   2.785  0.00552 ** 
## audience_score      3.074e-03  4.139e-03   0.743  0.45800    
## imdb_rating         2.904e-02  4.345e-03   6.684 5.14e-11 ***
## runtime            -3.305e-03  2.089e-03  -1.582  0.11416    
## mpaa_ratingNC-17    6.642e-01  7.322e-01   0.907  0.36470    
## mpaa_ratingPG       1.464e-01  2.453e-01   0.597  0.55083    
## mpaa_ratingPG-13    3.071e-01  2.462e-01   1.248  0.21267    
## mpaa_ratingR        1.471e-01  2.349e-01   0.626  0.53157    
## mpaa_ratingUnrated -1.111e-01  2.690e-01  -0.413  0.67977    
## imdb_num_votes     -7.603e-07  4.236e-07  -1.795  0.07316 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9813 on 630 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6468, Adjusted R-squared:   0.64 
## F-statistic: 96.13 on 12 and 630 DF,  p-value: < 2.2e-16
MTM3<- lm(revenue_log ~ budget_log+facebook_likes+critics_score+audience_score+imdb_rating+runtime+imdb_num_votes, data = MT)
summary(MTM3)
## 
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score + 
##     audience_score + imdb_rating + runtime + imdb_num_votes, 
##     data = MT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3932 -0.5484  0.0351  0.6552  3.1412 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.620e-01  2.536e-01   1.427   0.1540    
## budget_log      4.477e-01  3.490e-02  12.829  < 2e-16 ***
## facebook_likes  5.070e-06  4.572e-07  11.089  < 2e-16 ***
## critics_score   6.800e-03  2.664e-03   2.552   0.0109 *  
## audience_score  2.032e-03  4.125e-03   0.493   0.6224    
## imdb_rating     3.002e-02  4.333e-03   6.929 1.05e-11 ***
## runtime        -2.617e-03  2.051e-03  -1.276   0.2024    
## imdb_num_votes -6.548e-07  4.203e-07  -1.558   0.1198    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9831 on 635 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6427, Adjusted R-squared:  0.6388 
## F-statistic: 163.2 on 7 and 635 DF,  p-value: < 2.2e-16
MTM4<- lm(revenue_log ~ budget_log+facebook_likes+critics_score+imdb_rating+runtime+imdb_num_votes, data = MT)
summary(MTM4)
## 
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score + 
##     imdb_rating + runtime + imdb_num_votes, data = MT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3821 -0.5511  0.0382  0.6527  3.1495 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.584e-01  2.534e-01   1.414  0.15774    
## budget_log      4.481e-01  3.487e-02  12.852  < 2e-16 ***
## facebook_likes  5.094e-06  4.545e-07  11.208  < 2e-16 ***
## critics_score   7.458e-03  2.304e-03   3.238  0.00127 ** 
## imdb_rating     3.142e-02  3.277e-03   9.588  < 2e-16 ***
## runtime        -2.625e-03  2.050e-03  -1.281  0.20079    
## imdb_num_votes -6.303e-07  4.171e-07  -1.511  0.13128    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9825 on 636 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6426, Adjusted R-squared:  0.6392 
## F-statistic: 190.6 on 6 and 636 DF,  p-value: < 2.2e-16
MTM5<- lm(revenue_log ~ budget_log+facebook_likes+critics_score+imdb_rating+imdb_num_votes, data = MT)
summary(MTM5)
## 
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score + 
##     imdb_rating + imdb_num_votes, data = MT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4224 -0.5445  0.0182  0.6441  3.1447 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     9.122e-02  1.420e-01   0.642  0.52087    
## budget_log      4.492e-01  3.485e-02  12.889  < 2e-16 ***
## facebook_likes  5.087e-06  4.544e-07  11.196  < 2e-16 ***
## critics_score   7.316e-03  2.299e-03   3.182  0.00153 ** 
## imdb_rating     3.143e-02  3.274e-03   9.601  < 2e-16 ***
## imdb_num_votes -7.521e-07  4.063e-07  -1.851  0.06465 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9823 on 638 degrees of freedom
## Multiple R-squared:  0.6416, Adjusted R-squared:  0.6388 
## F-statistic: 228.4 on 5 and 638 DF,  p-value: < 2.2e-16
MTM6<- lm(revenue_log ~ budget_log+facebook_likes+critics_score+imdb_rating, data = MT)
summary(MTM6)
## 
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score + 
##     imdb_rating, data = MT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4278 -0.5570  0.0164  0.6507  3.2044 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.117e-01  1.419e-01   0.787  0.43132    
## budget_log     4.511e-01  3.490e-02  12.923  < 2e-16 ***
## facebook_likes 5.090e-06  4.552e-07  11.183  < 2e-16 ***
## critics_score  7.211e-03  2.303e-03   3.132  0.00182 ** 
## imdb_rating    3.041e-02  3.233e-03   9.406  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9842 on 639 degrees of freedom
## Multiple R-squared:  0.6397, Adjusted R-squared:  0.6374 
## F-statistic: 283.6 on 4 and 639 DF,  p-value: < 2.2e-16

With final model we have all the variables that reject the null hypothesis

MTModelFinal <- lm(revenue_log ~ budget_log+facebook_likes+critics_score+imdb_rating, data = MT)
summary(MTModelFinal)
## 
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score + 
##     imdb_rating, data = MT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4278 -0.5570  0.0164  0.6507  3.2044 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.117e-01  1.419e-01   0.787  0.43132    
## budget_log     4.511e-01  3.490e-02  12.923  < 2e-16 ***
## facebook_likes 5.090e-06  4.552e-07  11.183  < 2e-16 ***
## critics_score  7.211e-03  2.303e-03   3.132  0.00182 ** 
## imdb_rating    3.041e-02  3.233e-03   9.406  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9842 on 639 degrees of freedom
## Multiple R-squared:  0.6397, Adjusted R-squared:  0.6374 
## F-statistic: 283.6 on 4 and 639 DF,  p-value: < 2.2e-16

From summary Greatest Factors contributing to Revenue is Budget, Facebook likes, critic score and IMDB rating

Multiple R-squared: 0.64 indicates that 64% of the variation is explained by audience, critic, budget etc.

Reject null hypothesis for Budget, Facebook likes, critic score and IMDB rating as they show a relationship with revenue

Checking Correlation between variables to prevent Homoscedasticity

check_model(MTModelFinal)

View in table format for collinearity

performance::check_collinearity(MTModelFinal)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##            Term  VIF   VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##      budget_log 1.34 [1.23, 1.49]         1.16      0.75     [0.67, 0.81]
##  facebook_likes 1.21 [1.12, 1.35]         1.10      0.83     [0.74, 0.89]
##   critics_score 1.88 [1.69, 2.12]         1.37      0.53     [0.47, 0.59]
##     imdb_rating 2.10 [1.88, 2.38]         1.45      0.48     [0.42, 0.53]
performance_accuracy(MTModelFinal)
## # Accuracy of Model Predictions
## 
## Accuracy (95% CI): 79.48% [76.04%, 83.84%]
## Method: Correlation between observed and predicted

Look at Correlation

head(MT %>% 
       select(budget_log,revenue_log,critics_score, runtime, facebook_likes, imdb_rating),10)
##    budget_log revenue_log critics_score runtime facebook_likes imdb_rating
## 1    2.533697   2.9263822            60     134          23343          68
## 2    3.806662   4.0980045             5     108          84182          49
## 3    2.772589   3.4518905            40      97          35296          63
## 4    1.252763  -0.2876821            44      98            445          63
## 5    2.772589   2.9796029            51     111          21583          60
## 6    3.912023   5.4821378            75     106          20965          78
## 7    3.912023   3.5931942            35      87          12952          54
## 8    3.258097   3.7686144            50      83          52827          76
## 9    4.174387   5.4624325            70     100          48878          70
## 10   2.140066   1.7526721            53      86           5481          41
MT_sub<-MT%>%select(budget_log,revenue_log,critics_score, facebook_likes, imdb_rating) 
cor(MT_sub) # create the correlation matrix 
##                budget_log revenue_log critics_score facebook_likes imdb_rating
## budget_log      1.0000000   0.6319071     0.2844238      0.3738611   0.4292595
## revenue_log     0.6319071   1.0000000     0.5005811      0.5474557   0.6320976
## critics_score   0.2844238   0.5005811     1.0000000      0.2656832   0.6798900
## facebook_likes  0.3738611   0.5474557     0.2656832      1.0000000   0.2966853
## imdb_rating     0.4292595   0.6320976     0.6798900      0.2966853   1.0000000
#anything over o.5 is highly correlated
# if - then negatively correlated

Company screen a movie premiere that had a critic score of 55 and 1,250 likes on facebook

What is predicted box office sales?

head(predict(MTModelFinal),10)
##        1        2        3        4        5        6        7        8 
## 3.873931 3.783411 3.746259 2.912163 3.664545 4.895802 3.836724 4.521936 
##        9       10 
## 4.876898 2.733903

Predict Revenue based on critic score and facebook likes

# Define new data for prediction (Critics score = 55, Facebook likes = 1,250)
Question_a <- data.frame(
  budget_log = mean(MT$budget_log, na.rm = TRUE), # Default value
  facebook_likes = 1250,
  critics_score = 55,
  audience_score = mean(MT$audience_score, na.rm = TRUE),   # Default value
  imdb_rating = mean(MT$imdb_rating, na.rm = TRUE),         # Default value
  runtime = mean(MT$runtime, na.rm = TRUE),                 # Default value
  genre = levels(MT$genre)[1]                               # Default category
)

Part B. if the producers conduct an advertsing campaign leadings to an overall movie budget of $20 million and is expected to increase Facebook likes be 10,000%. how is that campaign expected to improve overall revenue?

# Define new data for the campaign
Question_b <- data.frame(
  budget_log = log(20),
  facebook_likes = 126250,          # Updated likes
  critics_score = 55,               # Same critics score
  audience_score = mean(MT$audience_score, na.rm = TRUE),   # Default value
  imdb_rating = mean(MT$imdb_rating, na.rm = TRUE),         # Default value
  runtime = mean(MT$runtime, na.rm = TRUE),                 # Default value
  genre = levels(MT$genre)[1]                               # Default category
)

Based on the predictive model we can expect an increase of

comp = adjusted_prediction_b - adjusted_prediction_a 
print(comp)
##        1 
## 50.54937
# or percentage increase of
percent_increase = ((adjusted_prediction_b - adjusted_prediction_a)/ adjusted_prediction_a) * 100 
print(percent_increase)
##        1 
## 82.55087