getwd()
## [1] "/Users/markyuhasz/FallWinter 2024 Classes/MKTG3P98-Business analytics and intell/Assignment 3"
setwd("/Users/markyuhasz/FallWinter 2024 Classes/MKTG3P98-Business analytics and intell/Assignment 3")
library(readxl)
library(ggplot2)
library(readr) # To Read CSV Files
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(performance)
library(patchwork)
library(ggplotify)
library(reshape2)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
Movie_General<-read.csv("Movie Dataset_General Audience.csv")
Movie_Financial<-read.csv("Movie Dataset_Financials.csv")
View(Movie_General)
View(Movie_Financial)
Movie_Total<-merge(Movie_Financial,Movie_General,by= "original_title")
Movie_Total_Clean<-clean_names(Movie_Total)
Movie_Total_Clean%>% get_dupes(original_title)
## original_title dupe_count budget_millions
## 1 Fantastic Four\xca 4 17.0
## 2 Fantastic Four\xca 4 17.0
## 3 Fantastic Four\xca 4 13.0
## 4 Fantastic Four\xca 4 13.0
## 5 Godzilla Resurgence\xca 4 8.9
## 6 Godzilla Resurgence\xca 4 8.9
## 7 Godzilla Resurgence\xca 4 1.9
## 8 Godzilla Resurgence\xca 4 1.9
## 9 Hercules\xca 4 20.0
## 10 Hercules\xca 4 20.0
## 11 Hercules\xca 4 6.0
## 12 Hercules\xca 4 6.0
## 13 Pan\xca 4 4.0
## 14 Pan\xca 4 4.0
## 15 Pan\xca 4 3.0
## 16 Pan\xca 4 3.0
## 17 The Fast and the Furious\xca 4 35.0
## 18 The Fast and the Furious\xca 4 35.0
## 19 The Fast and the Furious\xca 4 20.0
## 20 The Fast and the Furious\xca 4 20.0
## 21 The Legend of Tarzan\xca 4 35.0
## 22 The Legend of Tarzan\xca 4 35.0
## 23 The Legend of Tarzan\xca 4 6.0
## 24 The Legend of Tarzan\xca 4 6.0
## 25 The Twilight Saga: Breaking Dawn - Part 2\xca 4 1.0
## 26 The Twilight Saga: Breaking Dawn - Part 2\xca 4 1.0
## 27 The Twilight Saga: Breaking Dawn - Part 2\xca 4 0.5
## 28 The Twilight Saga: Breaking Dawn - Part 2\xca 4 0.5
## revenue_millions language country type genre runtime
## 1 136.62 English USA Feature Film Mystery & Suspense 106
## 2 136.62 English USA Feature Film Drama 101
## 3 45.30 English USA Feature Film Mystery & Suspense 106
## 4 45.30 English USA Feature Film Drama 101
## 5 4.16 Japanese Japan Feature Film Comedy 93
## 6 4.16 Japanese Japan Feature Film Drama 98
## 7 157.11 Japanese Japan Feature Film Comedy 93
## 8 157.11 Japanese Japan Feature Film Drama 98
## 9 34.08 English USA Feature Film Drama 105
## 10 34.08 English USA Documentary Documentary 107
## 11 1.27 English USA Feature Film Drama 105
## 12 1.27 English USA Documentary Documentary 107
## 13 1.60 English USA Feature Film Drama 130
## 14 1.60 English USA Feature Film Other 90
## 15 0.70 English USA Feature Film Drama 130
## 16 0.70 English USA Feature Film Other 90
## 17 77.48 English USA Feature Film Drama 104
## 18 77.48 English USA Feature Film Drama 95
## 19 18.22 English USA Feature Film Drama 104
## 20 18.22 English USA Feature Film Drama 95
## 21 82.35 English USA Documentary Documentary 92
## 22 82.35 English USA Feature Film Drama 94
## 23 11.12 English USA Documentary Documentary 92
## 24 11.12 English USA Feature Film Drama 94
## 25 3.39 English USA Documentary Documentary 98
## 26 3.39 English USA Feature Film Comedy 95
## 27 1025.47 English USA Documentary Documentary 98
## 28 1025.47 English USA Feature Film Comedy 95
## mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score
## 1 R 6.3 25054 Rotten 43
## 2 R 2.1 9904 Fresh 35
## 3 R 6.3 25054 Rotten 43
## 4 R 2.1 9904 Fresh 35
## 5 PG 4.2 182983 Certified Fresh 34
## 6 PG-13 6.3 50340 Fresh 60
## 7 PG 4.2 182983 Certified Fresh 34
## 8 PG-13 6.3 50340 Fresh 60
## 9 R 3.0 9216 Rotten 50
## 10 Unrated 3.8 10522 Fresh 21
## 11 R 3.0 9216 Rotten 50
## 12 Unrated 3.8 10522 Fresh 21
## 13 R 2.0 9216 Rotten 34
## 14 PG 3.6 1010 Fresh 33
## 15 R 2.0 9216 Rotten 34
## 16 PG 3.6 1010 Fresh 33
## 17 R 7.1 128361 Fresh 68
## 18 R 7.2 35635 Certified Fresh 93
## 19 R 7.1 128361 Fresh 68
## 20 R 7.2 35635 Certified Fresh 93
## 21 Unrated 6.8 1942 Certified Fresh 66
## 22 R 3.4 57933 Rotten 25
## 23 Unrated 6.8 1942 Certified Fresh 66
## 24 R 3.4 57933 Rotten 25
## 25 Unrated 3.1 1010 Fresh 45
## 26 PG 7.5 880 Fresh 90
## 27 Unrated 3.1 1010 Fresh 45
## 28 PG 7.5 880 Fresh 90
## audience_rating audience_score best_pic_nom facebook_likes
## 1 Spilled 49 no 1261
## 2 Upright 26 no 51261
## 3 Spilled 49 no 1261
## 4 Upright 26 no 51261
## 5 Upright 33 yes 5699
## 6 Spilled 67 no 699
## 7 Upright 33 yes 5699
## 8 Spilled 67 no 699
## 9 Upright 40 no 16235
## 10 Upright 20 no 235
## 11 Upright 40 no 16235
## 12 Upright 20 no 235
## 13 Upright 24 no 393
## 14 Upright 24 no 393
## 15 Upright 24 no 393
## 16 Upright 24 no 393
## 17 Upright 78 no 125327
## 18 Upright 80 no 45327
## 19 Upright 78 no 125327
## 20 Upright 80 no 45327
## 21 Upright 68 no 121175
## 22 Spilled 39 no 11175
## 23 Upright 68 no 121175
## 24 Spilled 39 no 11175
## 25 Upright 46 no 1177
## 26 Upright 89 no 359177
## 27 Upright 46 no 1177
## 28 Upright 89 no 359177
Movie_Total_Unique <- Movie_Total_Clean %>% distinct(original_title, .keep_all = TRUE)
print(head(Movie_Total_Unique, 10))
## original_title budget_millions revenue_millions language
## 1 10,000 B.C.\xca 12.6 18.66
## 2 102 Dalmatians\xca 45.0 60.22 English
## 3 2 Fast 2 Furious\xca 16.0 31.56 English
## 4 2012\xca 3.5 0.75 English
## 5 300: Rise of an Empire\xca 16.0 19.68 English
## 6 47 Ronin\xca 50.0 240.36 English
## 7 50 First Dates\xca 50.0 36.35 English
## 8 A Beautiful Mind\xca 26.0 43.32 English
## 9 A Christmas Carol\xca 65.0 235.67 English
## 10 A Good Day to Die Hard\xca 8.5 5.77 English
## country type genre runtime mpaa_rating imdb_rating
## 1 Feature Film Drama 134 R 6.8
## 2 USA Feature Film Drama 108 PG 4.9
## 3 USA Feature Film Mystery & Suspense 97 PG-13 6.3
## 4 USA Feature Film Comedy 98 PG-13 6.3
## 5 USA Feature Film Action & Adventure 111 PG-13 6.0
## 6 USA Feature Film Drama 106 PG 7.8
## 7 USA Feature Film Comedy 87 R 5.4
## 8 USA Feature Film Action & Adventure 83 G 7.6
## 9 USA Feature Film Drama 100 R 7.0
## 10 USA Feature Film Action & Adventure 86 R 4.1
## imdb_num_votes critics_rating critics_score audience_rating audience_score
## 1 9025 Fresh 60 Upright 76
## 2 5136 Rotten 5 Spilled 13
## 3 54771 Rotten 40 Spilled 49
## 4 8646 Certified Fresh 44 Spilled 54
## 5 103789 Rotten 51 Spilled 51
## 6 12450 Fresh 75 Spilled 85
## 7 6811 Rotten 35 Spilled 31
## 8 78862 Rotten 50 Upright 81
## 9 8320 Rotten 70 Spilled 74
## 10 739 Fresh 53 Upright 42
## best_pic_nom facebook_likes
## 1 no 23343
## 2 no 84182
## 3 no 35296
## 4 no 445
## 5 no 21583
## 6 no 20965
## 7 no 12952
## 8 no 52827
## 9 no 48878
## 10 no 5481
Movie_Total_Unique <- Movie_Total_Unique %>% #Call Dataframe and Create New Column with New Groupings
mutate(Genre1 = case_when(genre == "Science Fiction & Fantasy" ~ "SciFi",
genre == "Mystery & Suspense" ~ "Mystery",
genre == "Drama" ~ "Drama",
genre == "Documentary" ~ "Documentary",
genre == "Comedy" ~ "Comedy",
genre == "Art House & International" ~ "Arts",
genre == "Other" ~ "Other",
genre == "Action & Adventure" ~ "Action",
genre == "Animation" ~ "Animation",
genre == "Horror" ~ "Horror",
genre == "Musical & Performing Arts" ~ "Arts",
TRUE ~ "Check"))
MT <- Movie_Total_Unique %>%
mutate(
budget_log = log(budget_millions),
revenue_log = log(revenue_millions)
)
#Changes the scale of the imdb rating to a scale of 100
MT$imdb_rating <- MT$imdb_rating * 10
MT$genre <- as.factor(MT$genre)
# replace outliers of IMDB score that are greater than 100
MT$imdb_rating[MT$imdb_rating > 100]
## [1] 550 440
# which rows are responsible for the over 100
outlier <- MT[MT$imdb_rating>100,]
print(outlier)
## original_title budget_millions revenue_millions language
## 447 The Amazing Spider-Man\xca 30 91.71 English
## 638 X-Men: The Last Stand\xca 6 35.40 English
## country type genre runtime mpaa_rating imdb_rating
## 447 USA Documentary Documentary 82 Unrated 550
## 638 Canada Feature Film Documentary 112 R 440
## imdb_num_votes critics_rating critics_score audience_rating audience_score
## 447 3459 Fresh 55 Upright 56
## 638 4908 Rotten 40 Spilled 47
## best_pic_nom facebook_likes Genre1 budget_log revenue_log
## 447 no 2489 Documentary 3.401197 4.518631
## 638 no 21714 Documentary 1.791759 3.566712
# change the outliers numbers to the actual numbers based on the website
MT$imdb_rating[c(447, 638)] <- c(69, 66)
Movie_Total <- clean_names(Movie_Total)
colnames(Movie_Total)
## [1] "original_title" "budget_millions" "revenue_millions" "language"
## [5] "country" "type" "genre" "runtime"
## [9] "mpaa_rating" "imdb_rating" "imdb_num_votes" "critics_rating"
## [13] "critics_score" "audience_rating" "audience_score" "best_pic_nom"
## [17] "facebook_likes"
Movie_Total <- Movie_Total %>%
rename(revenue_millions = revenue_millions)
Movie_Data <- Movie_Total %>%
select(revenue_millions, critics_score, audience_score, runtime, imdb_rating, genre)
Movie_Data <- na.omit(Movie_Data)
Factors_by_Genre <- Movie_Data %>%
group_by(genre) %>%
summarise(
Avg_Critics_Score = mean(critics_score, na.rm = TRUE),
Avg_Audience_Score = mean(audience_score, na.rm = TRUE),
Avg_Runtime = mean(runtime, na.rm = TRUE),
Avg_IMDb_Rating = mean(imdb_rating, na.rm = TRUE),
Avg_Revenue = mean(revenue_millions, na.rm = TRUE)
)
ggplot(Factors_by_Genre, aes(x = genre, y = Avg_Revenue, fill = genre)) +
geom_bar(stat = "identity") +
labs(title = "Average Revenue by Genre", x = "Genre", y = "Average Revenue (Millions)") +
theme_minimal()
### Interpretation: #### Animation leads as the highest
revenue-generating genre, followed by Science Fiction & Fantasy and
Mystery & Suspense, showcasing their strong audience appeal.
Conversely, Art House & International and Documentaries have lower
revenues, reflecting their niche markets. Most other genres, like Comedy
and Drama, exhibit stable mid-range performance, suggesting balanced but
less exceptional financial returns.
ggplot(Movie_Data, aes(x = critics_score, y = revenue_millions, color = genre)) +
geom_point(alpha = 0.7) +
labs(title = "Critics Score vs Revenue by Genre", x = "Critics Score", y = "Revenue (Millions)") +
theme_minimal()
Top_Factors_By_Genre <- Movie_Data %>%
group_by(genre) %>%
summarise(
Cor_Critics = cor(revenue_millions, critics_score, use = "complete.obs"),
Cor_Audience = cor(revenue_millions, audience_score, use = "complete.obs"),
Cor_Runtime = cor(revenue_millions, runtime, use = "complete.obs"),
Cor_IMDb = cor(revenue_millions, imdb_rating, use = "complete.obs")
) %>%
pivot_longer(cols = starts_with("Cor"), names_to = "Factor", values_to = "Correlation") %>%
arrange(genre, desc(Correlation))
print(Top_Factors_By_Genre)
## # A tibble: 44 × 3
## genre Factor Correlation
## <chr> <chr> <dbl>
## 1 Action & Adventure Cor_Audience 0.466
## 2 Action & Adventure Cor_IMDb 0.464
## 3 Action & Adventure Cor_Runtime 0.354
## 4 Action & Adventure Cor_Critics 0.337
## 5 Animation Cor_Runtime 0.772
## 6 Animation Cor_Critics 0.525
## 7 Animation Cor_Audience 0.431
## 8 Animation Cor_IMDb 0.415
## 9 Art House & International Cor_Runtime 0.751
## 10 Art House & International Cor_Audience 0.421
## # ℹ 34 more rows
ggplot(Top_Factors_By_Genre, aes(x = genre, y = Correlation, fill = Factor)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Top Factors Driving Revenue by Genre", x = "Genre", y = "Correlation") +
theme_minimal()
#### To maximize box office revenue: Prioritize audience engagement
through targeted marketing and viewer-centric storytelling. Tailor
runtimes by genre, investing in longer narratives for Animation and
Sci-Fi, while keeping Horror and Musicals short. Leverage IMDb Ratings
as a quality benchmark to ensure broad public appeal.
#Question 1: Do Some Genres Generate More Revenue Than Others ## Create Table
Avg_Revenue <- Movie_Total_Unique %>%
group_by(Genre1) %>%
summarise(mean_revenue = mean(revenue_millions, na.rm = TRUE))
print(Avg_Revenue)
## # A tibble: 10 × 2
## Genre1 mean_revenue
## <chr> <dbl>
## 1 Action 129.
## 2 Animation 237.
## 3 Arts 132.
## 4 Comedy 128.
## 5 Documentary 120.
## 6 Drama 140.
## 7 Horror 189.
## 8 Mystery 133.
## 9 Other 180.
## 10 SciFi 67.2
ggplot(Avg_Revenue, aes(x = Genre1, y = mean_revenue)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Average Revenue by Genre", x = "Genre", y = "Average Revenue (Millions)")
### Therefore, Animation films tend to generate the most average
revenue.
Avg_Critic_Score <- Movie_Total_Unique %>%
group_by(Genre1) %>%
summarise(mean_critics_score = mean(critics_score, na.rm = TRUE))
ggplot(Avg_Critic_Score, aes(x = Genre1, y = mean_critics_score)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Average Critics Score by Genre", x = "Genre", y = "Average Critics Score")
### Therefore, Critics tend to have a preference for Documentary
Films
cor_critics <- cor(Movie_Total_Unique$critics_score, Movie_Total_Unique$revenue_millions, use = "complete.obs")
cor_audience <- cor(Movie_Total_Unique$audience_score, Movie_Total_Unique$revenue_millions, use = "complete.obs")
cat("Correlation with Critics Score:", cor_critics, "\n")
## Correlation with Critics Score: 0.392774
cat("Correlation with Audience Score:", cor_audience, "\n")
## Correlation with Audience Score: 0.4428168
lm_model <- lm(revenue_millions ~ critics_score + audience_score, data = Movie_Total_Unique)
summary(lm_model)
##
## Call:
## lm(formula = revenue_millions ~ critics_score + audience_score,
## data = Movie_Total_Unique)
##
## Residuals:
## Min 1Q Median 3Q Max
## -279.37 -88.33 -25.64 29.13 1835.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -119.6096 21.4587 -5.574 3.67e-08 ***
## critics_score 1.1282 0.5019 2.248 0.0249 *
## audience_score 3.4444 0.5540 6.218 9.10e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 186.3 on 641 degrees of freedom
## Multiple R-squared: 0.2024, Adjusted R-squared: 0.1999
## F-statistic: 81.32 on 2 and 641 DF, p-value: < 2.2e-16
# Add an interaction term
lm_interaction <- lm(revenue_millions ~ critics_score * audience_score, data = Movie_Total_Unique)
summary(lm_interaction)
##
## Call:
## lm(formula = revenue_millions ~ critics_score * audience_score,
## data = Movie_Total_Unique)
##
## Residuals:
## Min 1Q Median 3Q Max
## -299.02 -84.21 -25.48 20.35 1834.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -59.2411 51.9350 -1.141 0.2544
## critics_score -0.1927 1.1501 -0.168 0.8670
## audience_score 2.3249 1.0373 2.241 0.0254 *
## critics_score:audience_score 0.0217 0.0170 1.276 0.2023
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 186.2 on 640 degrees of freedom
## Multiple R-squared: 0.2044, Adjusted R-squared: 0.2007
## F-statistic: 54.81 on 3 and 640 DF, p-value: < 2.2e-16
#Coefficients from the Regression Model
coefficients <- data.frame(
Variable = c("Critics Score", "Audience Score"),
Coefficient = c(1.1282, 3.4444)
)
# Coefficients from the regression model
coefficients <- data.frame(
Variable = c("Critics Score", "Audience Score"),
Coefficient = c(1.1282, 3.4444)
)
# Bar plot
library(ggplot2)
ggplot(coefficients, aes(x = Variable, y = Coefficient, fill = Variable)) +
geom_bar(stat = "identity", width = 0.6) +
labs(
title = "Impact of Critics and Audience Scores on Revenue",
y = "Coefficient Value",
x = "Variable"
) +
theme_minimal() +
scale_fill_manual(values = c("Critics Score" = "pink", "Audience Score" = "blue")) +
geom_text(aes(label = round(Coefficient, 2)), vjust = -0.2)
### Audience Score is more important than Critics Score for predicting
film revenue. It has a larger coefficient, meaning its influence on
revenue is approximately 3 times greater than that of Critics Score.
options(repr.plot.res = 200, repr.plot.height = 5, repr.plot.width = 8)
ggplot(Movie_Total_Unique, aes(x = critics_score, y = audience_score)) +
geom_point(aes(colour = genre), alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "black") +
labs(
title = " Audience's Score vs Critics' Score",
x = "Critics Score",
y = "Audience Score",
colour = "Genre"
)
## `geom_smooth()` using formula = 'y ~ x'
### Movies with higher critics’ scores tend to have higher audience
scores and vice versa, as seen from the upward-sloping trendline.
correlation_matrix <- cor(Movie_Total_Unique[, c("critics_score", "audience_score", "revenue_millions")], use = "complete.obs")
print(correlation_matrix)
## critics_score audience_score revenue_millions
## critics_score 1.0000000 0.7734838 0.3927740
## audience_score 0.7734838 1.0000000 0.4428168
## revenue_millions 0.3927740 0.4428168 1.0000000
library(reshape2)
melted_correlation <- melt(correlation_matrix)
ggplot(melted_correlation, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1)) +
labs(title = "Correlation Matrix", x = "", y = "") +
theme_minimal()
### There is a strong positive correlation between Critics’ score and
Audience score, as indicated by the darker red ### The correlation
between revenue and Audience score is stronger (darker red) than the
correlation between revenue and Critics’ score. ### While Critics’ score
shows a positive correlation with revenue, the relationship appears
weaker compared to Audience score, as suggested by the lighter shade of
red.
# Critics' Score vs Revenue
ggplot(Movie_Total_Unique, aes(x = critics_score, y = log10(revenue_millions))) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", colour = "red") +
labs(title = "Critics' Score vs Revenue (Log Scale)", x = "Critics Score", y = "Log10 Revenue")
## `geom_smooth()` using formula = 'y ~ x'
# Audience Score vs Revenue
ggplot(Movie_Total_Unique, aes(x = audience_score, y = log10(revenue_millions))) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", colour = "blue") +
labs(title = "Audience Score vs Revenue (Log Scale)", x = "Audience Score", y = "Log10 Revenue")
## `geom_smooth()` using formula = 'y ~ x'
model_critics <- lm(revenue_millions ~ critics_score, data = Movie_Total_Unique)
model_audience <- lm(revenue_millions ~ audience_score, data = Movie_Total_Unique)
summary_critics <- summary(model_critics)
summary_audience <- summary(model_audience)
cat("R-squared for Critics Score Model:", summary_critics$r.squared, "\n")
## R-squared for Critics Score Model: 0.1542714
cat("R-squared for Audience Score Model:", summary_audience$r.squared, "\n")
## R-squared for Audience Score Model: 0.1960867
max_budget <- max(Movie_Total_Unique$budget_millions, na.rm = TRUE) # Remove NA values
min_budget <- min(Movie_Total_Unique$budget_millions, na.rm = TRUE)
cat("Maximum budget:", max_budget, "\n")
## Maximum budget: 280
cat("Minimum budget:", min_budget, "\n")
## Minimum budget: 0.32
# Define 4 range groups
Movie_Total_Unique$budget_category <- cut(
Movie_Total_Unique$budget_millions,
breaks = c(0.32, 70, 140, 210, 280),
labels = c("Low", "Moderate", "High", "Very High"),
include.lowest = TRUE
)
options(repr.plot.res = 100, repr.plot.height = 5, repr.plot.width = 8)
ggplot(Movie_Total_Unique, aes(x = critics_score, y = audience_score)) +
geom_point(aes(colour = budget_category)) +
facet_wrap(~genre) +
labs(
title = "Audience Score vs Critics' Score by Budget Category",
x = "Critics' Score",
y = "Audience Score",
colour = "Budget Category"
)
ggplot(MT, aes(x = budget_millions, y = revenue_millions)) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE, color = 'red')
## `geom_smooth()` using formula = 'y ~ x'
ggplot(MT, aes(x = budget_log, y = revenue_log)) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE, color = 'red')
## `geom_smooth()` using formula = 'y ~ x'
ggplot(MT) + aes(y = revenue_log, x = runtime, colour = audience_score, size = critics_score) +
geom_point() +
scale_color_gradientn(colors = c("red", "yellow", "green")) +
labs(
y = "Generated Revenue (Millions)",
x = "Runtime (Min)",
color = "Audience Score",
size = "Critic Score"
) +
theme_minimal() +
facet_wrap(~ genre)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(MT) + aes(y = revenue_log, x = budget_log, colour = imdb_rating, size = runtime) +
geom_point() +
scale_color_gradientn(colors = c("red", "yellow", "green")) +
labs(
y = "Generated Revenue (Millions)",
x = "Budget (Millions)",
color = "IMDB Rating (1-100)",
size = "Runtime (Min)"
) +
theme_minimal() +
facet_wrap(~ genre)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Develop Multiple Linear Regression to see Pr(>|t|) ## Multiple
Linear regression of all variables
MTM1<- lm(revenue_log ~ budget_log+facebook_likes+critics_score+audience_score+imdb_rating+runtime+genre+mpaa_rating+imdb_num_votes, data = MT)
summary(MTM1)
##
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score +
## audience_score + imdb_rating + runtime + genre + mpaa_rating +
## imdb_num_votes, data = MT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1261 -0.5296 0.0487 0.5997 3.1415
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.073e-01 3.616e-01 0.573 0.56659
## budget_log 4.415e-01 3.542e-02 12.465 < 2e-16 ***
## facebook_likes 5.001e-06 4.621e-07 10.822 < 2e-16 ***
## critics_score 8.549e-03 2.726e-03 3.136 0.00179 **
## audience_score 2.980e-03 4.160e-03 0.716 0.47411
## imdb_rating 2.924e-02 4.378e-03 6.679 5.35e-11 ***
## runtime -2.178e-03 2.208e-03 -0.986 0.32443
## genreAnimation 1.841e-02 3.840e-01 0.048 0.96178
## genreArt House & International -6.032e-02 2.983e-01 -0.202 0.83983
## genreComedy -9.428e-04 1.640e-01 -0.006 0.99542
## genreDocumentary -3.953e-01 2.172e-01 -1.820 0.06926 .
## genreDrama -2.658e-01 1.401e-01 -1.898 0.05817 .
## genreHorror -1.507e-01 2.455e-01 -0.614 0.53963
## genreMusical & Performing Arts -4.941e-01 3.131e-01 -1.578 0.11506
## genreMystery & Suspense -1.871e-01 1.833e-01 -1.021 0.30785
## genreOther -4.603e-02 2.929e-01 -0.157 0.87520
## genreScience Fiction & Fantasy -1.782e-01 3.518e-01 -0.507 0.61260
## mpaa_ratingNC-17 7.878e-01 7.451e-01 1.057 0.29074
## mpaa_ratingPG 1.860e-01 2.701e-01 0.689 0.49120
## mpaa_ratingPG-13 3.529e-01 2.762e-01 1.278 0.20184
## mpaa_ratingR 2.308e-01 2.673e-01 0.864 0.38816
## mpaa_ratingUnrated 8.311e-02 3.071e-01 0.271 0.78673
## imdb_num_votes -9.223e-07 4.300e-07 -2.145 0.03234 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9816 on 620 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6522, Adjusted R-squared: 0.6399
## F-statistic: 52.85 on 22 and 620 DF, p-value: < 2.2e-16
MTM2<- lm(revenue_log ~ budget_log+facebook_likes+critics_score+audience_score+imdb_rating+runtime+mpaa_rating+imdb_num_votes, data = MT)
summary(MTM2)
##
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score +
## audience_score + imdb_rating + runtime + mpaa_rating + imdb_num_votes,
## data = MT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0938 -0.5563 0.0415 0.6230 3.1233
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.477e-01 3.200e-01 0.774 0.43918
## budget_log 4.462e-01 3.488e-02 12.792 < 2e-16 ***
## facebook_likes 5.052e-06 4.582e-07 11.025 < 2e-16 ***
## critics_score 7.476e-03 2.684e-03 2.785 0.00552 **
## audience_score 3.074e-03 4.139e-03 0.743 0.45800
## imdb_rating 2.904e-02 4.345e-03 6.684 5.14e-11 ***
## runtime -3.305e-03 2.089e-03 -1.582 0.11416
## mpaa_ratingNC-17 6.642e-01 7.322e-01 0.907 0.36470
## mpaa_ratingPG 1.464e-01 2.453e-01 0.597 0.55083
## mpaa_ratingPG-13 3.071e-01 2.462e-01 1.248 0.21267
## mpaa_ratingR 1.471e-01 2.349e-01 0.626 0.53157
## mpaa_ratingUnrated -1.111e-01 2.690e-01 -0.413 0.67977
## imdb_num_votes -7.603e-07 4.236e-07 -1.795 0.07316 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9813 on 630 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6468, Adjusted R-squared: 0.64
## F-statistic: 96.13 on 12 and 630 DF, p-value: < 2.2e-16
MTM3<- lm(revenue_log ~ budget_log+facebook_likes+critics_score+audience_score+imdb_rating+runtime+imdb_num_votes, data = MT)
summary(MTM3)
##
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score +
## audience_score + imdb_rating + runtime + imdb_num_votes,
## data = MT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3932 -0.5484 0.0351 0.6552 3.1412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.620e-01 2.536e-01 1.427 0.1540
## budget_log 4.477e-01 3.490e-02 12.829 < 2e-16 ***
## facebook_likes 5.070e-06 4.572e-07 11.089 < 2e-16 ***
## critics_score 6.800e-03 2.664e-03 2.552 0.0109 *
## audience_score 2.032e-03 4.125e-03 0.493 0.6224
## imdb_rating 3.002e-02 4.333e-03 6.929 1.05e-11 ***
## runtime -2.617e-03 2.051e-03 -1.276 0.2024
## imdb_num_votes -6.548e-07 4.203e-07 -1.558 0.1198
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9831 on 635 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6427, Adjusted R-squared: 0.6388
## F-statistic: 163.2 on 7 and 635 DF, p-value: < 2.2e-16
MTM4<- lm(revenue_log ~ budget_log+facebook_likes+critics_score+imdb_rating+runtime+imdb_num_votes, data = MT)
summary(MTM4)
##
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score +
## imdb_rating + runtime + imdb_num_votes, data = MT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3821 -0.5511 0.0382 0.6527 3.1495
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.584e-01 2.534e-01 1.414 0.15774
## budget_log 4.481e-01 3.487e-02 12.852 < 2e-16 ***
## facebook_likes 5.094e-06 4.545e-07 11.208 < 2e-16 ***
## critics_score 7.458e-03 2.304e-03 3.238 0.00127 **
## imdb_rating 3.142e-02 3.277e-03 9.588 < 2e-16 ***
## runtime -2.625e-03 2.050e-03 -1.281 0.20079
## imdb_num_votes -6.303e-07 4.171e-07 -1.511 0.13128
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9825 on 636 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6426, Adjusted R-squared: 0.6392
## F-statistic: 190.6 on 6 and 636 DF, p-value: < 2.2e-16
MTM5<- lm(revenue_log ~ budget_log+facebook_likes+critics_score+imdb_rating+imdb_num_votes, data = MT)
summary(MTM5)
##
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score +
## imdb_rating + imdb_num_votes, data = MT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4224 -0.5445 0.0182 0.6441 3.1447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.122e-02 1.420e-01 0.642 0.52087
## budget_log 4.492e-01 3.485e-02 12.889 < 2e-16 ***
## facebook_likes 5.087e-06 4.544e-07 11.196 < 2e-16 ***
## critics_score 7.316e-03 2.299e-03 3.182 0.00153 **
## imdb_rating 3.143e-02 3.274e-03 9.601 < 2e-16 ***
## imdb_num_votes -7.521e-07 4.063e-07 -1.851 0.06465 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9823 on 638 degrees of freedom
## Multiple R-squared: 0.6416, Adjusted R-squared: 0.6388
## F-statistic: 228.4 on 5 and 638 DF, p-value: < 2.2e-16
MTM6<- lm(revenue_log ~ budget_log+facebook_likes+critics_score+imdb_rating, data = MT)
summary(MTM6)
##
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score +
## imdb_rating, data = MT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4278 -0.5570 0.0164 0.6507 3.2044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.117e-01 1.419e-01 0.787 0.43132
## budget_log 4.511e-01 3.490e-02 12.923 < 2e-16 ***
## facebook_likes 5.090e-06 4.552e-07 11.183 < 2e-16 ***
## critics_score 7.211e-03 2.303e-03 3.132 0.00182 **
## imdb_rating 3.041e-02 3.233e-03 9.406 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9842 on 639 degrees of freedom
## Multiple R-squared: 0.6397, Adjusted R-squared: 0.6374
## F-statistic: 283.6 on 4 and 639 DF, p-value: < 2.2e-16
MTModelFinal <- lm(revenue_log ~ budget_log+facebook_likes+critics_score+imdb_rating, data = MT)
summary(MTModelFinal)
##
## Call:
## lm(formula = revenue_log ~ budget_log + facebook_likes + critics_score +
## imdb_rating, data = MT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4278 -0.5570 0.0164 0.6507 3.2044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.117e-01 1.419e-01 0.787 0.43132
## budget_log 4.511e-01 3.490e-02 12.923 < 2e-16 ***
## facebook_likes 5.090e-06 4.552e-07 11.183 < 2e-16 ***
## critics_score 7.211e-03 2.303e-03 3.132 0.00182 **
## imdb_rating 3.041e-02 3.233e-03 9.406 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9842 on 639 degrees of freedom
## Multiple R-squared: 0.6397, Adjusted R-squared: 0.6374
## F-statistic: 283.6 on 4 and 639 DF, p-value: < 2.2e-16
check_model(MTModelFinal)
performance::check_collinearity(MTModelFinal)
## # Check for Multicollinearity
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## budget_log 1.34 [1.23, 1.49] 1.16 0.75 [0.67, 0.81]
## facebook_likes 1.21 [1.12, 1.35] 1.10 0.83 [0.74, 0.89]
## critics_score 1.88 [1.69, 2.12] 1.37 0.53 [0.47, 0.59]
## imdb_rating 2.10 [1.88, 2.38] 1.45 0.48 [0.42, 0.53]
performance_accuracy(MTModelFinal)
## # Accuracy of Model Predictions
##
## Accuracy (95% CI): 79.48% [76.04%, 83.84%]
## Method: Correlation between observed and predicted
head(MT %>%
select(budget_log,revenue_log,critics_score, runtime, facebook_likes, imdb_rating),10)
## budget_log revenue_log critics_score runtime facebook_likes imdb_rating
## 1 2.533697 2.9263822 60 134 23343 68
## 2 3.806662 4.0980045 5 108 84182 49
## 3 2.772589 3.4518905 40 97 35296 63
## 4 1.252763 -0.2876821 44 98 445 63
## 5 2.772589 2.9796029 51 111 21583 60
## 6 3.912023 5.4821378 75 106 20965 78
## 7 3.912023 3.5931942 35 87 12952 54
## 8 3.258097 3.7686144 50 83 52827 76
## 9 4.174387 5.4624325 70 100 48878 70
## 10 2.140066 1.7526721 53 86 5481 41
MT_sub<-MT%>%select(budget_log,revenue_log,critics_score, facebook_likes, imdb_rating)
cor(MT_sub) # create the correlation matrix
## budget_log revenue_log critics_score facebook_likes imdb_rating
## budget_log 1.0000000 0.6319071 0.2844238 0.3738611 0.4292595
## revenue_log 0.6319071 1.0000000 0.5005811 0.5474557 0.6320976
## critics_score 0.2844238 0.5005811 1.0000000 0.2656832 0.6798900
## facebook_likes 0.3738611 0.5474557 0.2656832 1.0000000 0.2966853
## imdb_rating 0.4292595 0.6320976 0.6798900 0.2966853 1.0000000
#anything over o.5 is highly correlated
# if - then negatively correlated
head(predict(MTModelFinal),10)
## 1 2 3 4 5 6 7 8
## 3.873931 3.783411 3.746259 2.912163 3.664545 4.895802 3.836724 4.521936
## 9 10
## 4.876898 2.733903
# Define new data for prediction (Critics score = 55, Facebook likes = 1,250)
Question_a <- data.frame(
budget_log = mean(MT$budget_log, na.rm = TRUE), # Default value
facebook_likes = 1250,
critics_score = 55,
audience_score = mean(MT$audience_score, na.rm = TRUE), # Default value
imdb_rating = mean(MT$imdb_rating, na.rm = TRUE), # Default value
runtime = mean(MT$runtime, na.rm = TRUE), # Default value
genre = levels(MT$genre)[1] # Default category
)
pre_rev_a <- predict(MTModelFinal, newdata = Question_a)
# Residual variance from the log-transformed model
sigma_squared <- summary(MTModelFinal)$sigma^2
# Adjusted prediction back to the original scale
adjusted_prediction_a <- exp(pre_rev_a + sigma_squared / 2)
print(adjusted_prediction_a)
## 1
## 61.23421
# Define new data for the campaign
Question_b <- data.frame(
budget_log = log(20),
facebook_likes = 126250, # Updated likes
critics_score = 55, # Same critics score
audience_score = mean(MT$audience_score, na.rm = TRUE), # Default value
imdb_rating = mean(MT$imdb_rating, na.rm = TRUE), # Default value
runtime = mean(MT$runtime, na.rm = TRUE), # Default value
genre = levels(MT$genre)[1] # Default category
)
pre_rev_b <- predict(MTModelFinal, newdata = Question_b)
# Residual variance from the log-transformed model
sigma_squared <- summary(MTModelFinal)$sigma^2
# Adjusted prediction back to the original scale
adjusted_prediction_b<- exp(pre_rev_b + sigma_squared / 2)
print(adjusted_prediction_b)
## 1
## 111.7836
comp = adjusted_prediction_b - adjusted_prediction_a
print(comp)
## 1
## 50.54937
# or percentage increase of
percent_increase = ((adjusted_prediction_b - adjusted_prediction_a)/ adjusted_prediction_a) * 100
print(percent_increase)
## 1
## 82.55087