# Set CRAN mirror
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Function to install packages if not already installed
install_if_missing <- function(packages) {
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
}
# Install required packages
required_packages <- c("tidyverse", "ggplot2", "dplyr", "scales",
"corrplot", "caret", "randomForest", "readxl")
install_if_missing(required_packages)
# Load the libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(corrplot)
## corrplot 0.95 loaded
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(readxl)
# Create a custom theme with solid white background
custom_theme <- theme_minimal() +
theme(
plot.background = element_rect(fill = "white", color = "black"),
panel.background = element_rect(fill = "white", color = "black"),
panel.grid.major = element_line(color = "grey90"),
panel.grid.minor = element_line(color = "grey95"),
legend.background = element_rect(fill = "white", color = "black"),
legend.box.background = element_rect(fill = "white", color = "black"),
axis.text = element_text(color = "black"),
axis.title = element_text(color = "black"),
plot.title = element_text(color = "black")
)
# Set the custom theme as default
theme_set(custom_theme)
setwd("/Users/ethanbalogh/Desktop/mktg3p98-proj3")
# 1. Data Import and Cleaning
audience_data <- read_excel("Movie_Dataset_General_Audience.xlsx")
financial_data <- read_excel("Movie_Dataset_Financials.xlsx")
# Print dimensions of imported data
cat("\nDimensions of datasets:")
##
## Dimensions of datasets:
cat("\nAudience data:", dim(audience_data))
##
## Audience data: 651 13
cat("\nFinancial data:", dim(financial_data))
##
## Financial data: 651 5
# Remove trailing spaces from original_title in both datasets
audience_data$original_title <- str_trim(audience_data$original_title)
financial_data$original_title <- str_trim(financial_data$original_title)
# Print column names
cat("\n\nColumn names:")
##
##
## Column names:
cat("\nAudience data:", names(audience_data))
##
## Audience data: original_title type genre runtime mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score audience_rating audience_score best_pic_nom Facebook_Likes
cat("\nFinancial data:", names(financial_data))
##
## Financial data: original_title budget (Millions) revenue (Millions) language country
# Merge datasets
movies_df <- merge(audience_data, financial_data, by = "original_title", all = FALSE)
# Clean up column names for modeling
names(movies_df) <- make.names(names(movies_df))
# Print dimensions after merge
cat("\n\nDimensions after merge:", dim(movies_df))
##
##
## Dimensions after merge: 665 17
# Print first few rows of financial columns
cat("\n\nFirst few rows of financial columns:")
##
##
## First few rows of financial columns:
print(head(select(movies_df, original_title, budget..Millions., revenue..Millions.)))
## original_title budget..Millions. revenue..Millions.
## 1 10,000 B.C. 12.6 18.658381
## 2 102 Dalmatians 45.0 60.222298
## 3 2 Fast 2 Furious 16.0 31.556061
## 4 2012 3.5 0.754249
## 5 300: Rise of an Empire 16.0 19.682924
## 6 47 Ronin 50.0 240.360392
# Create profit column with error checking
movies_df <- movies_df %>%
mutate(profit = as.numeric(revenue..Millions.) - as.numeric(budget..Millions.))
# Print summary of financial metrics
cat("\n\nSummary of financial metrics:")
##
##
## Summary of financial metrics:
print(summary(select(movies_df, revenue..Millions., budget..Millions., profit)))
## revenue..Millions. budget..Millions. profit
## Min. : 0.0093 Min. : 0.318 Min. :-109.3480
## 1st Qu.: 19.6829 1st Qu.: 10.000 1st Qu.: 0.6707
## Median : 61.5487 Median : 25.000 Median : 33.0000
## Mean : 137.4735 Mean : 40.319 Mean : 97.1540
## 3rd Qu.: 153.9630 3rd Qu.: 51.000 3rd Qu.: 105.3465
## Max. :2068.1782 Max. :280.000 Max. :1868.1782
# 2. Genre Categorization
# Simplify genres into main categories
movies_df$main_genre <- str_extract(movies_df$genre, "^[^,]+")
# Print genre distribution
cat("\n\nGenre distribution:")
##
##
## Genre distribution:
print(table(movies_df$main_genre))
##
## Action & Adventure Animation Art House & International
## 65 9 14
## Comedy Documentary Drama
## 89 56 312
## Horror Musical & Performing Arts Mystery & Suspense
## 23 12 60
## Other Science Fiction & Fantasy
## 16 9
# 3. Convert critics_score to 10-point scale
movies_df$critics_score_10 <- movies_df$critics_score / 10
# 4. Basic Statistics and Visualizations
# Genre Revenue Analysis
genre_revenue <- movies_df %>%
group_by(main_genre) %>%
summarise(
avg_revenue = mean(as.numeric(revenue..Millions.), na.rm = TRUE),
avg_profit = mean(profit, na.rm = TRUE),
count = n()
) %>%
arrange(desc(avg_revenue))
# Print genre analysis
cat("\n\nGenre Revenue Analysis:")
##
##
## Genre Revenue Analysis:
print(genre_revenue)
## # A tibble: 11 × 4
## main_genre avg_revenue avg_profit count
## <chr> <dbl> <dbl> <int>
## 1 Animation 237. 171. 9
## 2 Horror 189. 145. 23
## 3 Other 158. 115. 16
## 4 Musical & Performing Arts 137. 94.7 12
## 5 Comedy 137. 101. 89
## 6 Drama 137. 96.2 312
## 7 Mystery & Suspense 131. 82.7 60
## 8 Documentary 130. 98.6 56
## 9 Action & Adventure 129. 88.1 65
## 10 Art House & International 127. 92.1 14
## 11 Science Fiction & Fantasy 67.2 28.8 9
# Create visualization for genre revenue
p1 <- ggplot(genre_revenue, aes(x = reorder(main_genre, avg_revenue), y = avg_revenue)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() +
labs(title = "Average Revenue by Genre",
x = "Genre",
y = "Average Revenue (Millions)") +
theme_minimal()
# Critics vs Viewer Scores Analysis
score_correlation <- cor.test(movies_df$critics_score_10, movies_df$imdb_rating)
# Print correlation results
cat("\n\nCritics vs Viewer Scores Correlation:")
##
##
## Critics vs Viewer Scores Correlation:
print(score_correlation)
##
## Pearson's product-moment correlation
##
## data: movies_df$critics_score_10 and movies_df$imdb_rating
## t = 10.926, df = 663, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3242253 0.4531951
## sample estimates:
## cor
## 0.3906253
# Visualization of Critics vs Viewer Scores
p2 <- ggplot(movies_df, aes(x = critics_score_10, y = imdb_rating)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
labs(title = "Critics Score vs IMDB Rating",
x = "Critics Score (10-point scale)",
y = "IMDB Rating")
# 5. Predictive Analysis
# Prepare data for modeling
model_data <- movies_df %>%
select(
revenue..Millions.,
budget..Millions.,
imdb_rating,
critics_score,
audience_score,
runtime,
Facebook_Likes,
main_genre
) %>%
na.omit()
# Split data into training and testing sets
set.seed(123)
train_index <- createDataPartition(model_data$revenue..Millions., p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
# Train Random Forest model
rf_model <- randomForest(
revenue..Millions. ~ .,
data = train_data,
importance = TRUE
)
# Get variable importance
importance_df <- as.data.frame(importance(rf_model))
importance_df$variable <- rownames(importance_df)
# Plot variable importance
p3 <- ggplot(importance_df, aes(x = reorder(variable, IncNodePurity), y = IncNodePurity)) +
geom_bar(stat = "identity", fill = "coral") +
coord_flip() +
labs(title = "Variable Importance in Predicting Revenue",
x = "Variables",
y = "Importance (Node Purity)")
# 6. Genre-Specific Analysis
# Get list of genres with sufficient data
major_genres <- names(table(movies_df$main_genre))[table(movies_df$main_genre) >= 30]
# Initialize list to store results
genre_results <- list()
# Analyze each major genre
for(genre in major_genres) {
# Filter data for this genre and remove NA values
genre_data <- movies_df[movies_df$main_genre == genre, ] %>%
select(
revenue..Millions.,
budget..Millions.,
imdb_rating,
critics_score,
audience_score,
runtime,
Facebook_Likes
) %>%
na.omit()
# Only proceed if we have enough data after removing NAs
if(nrow(genre_data) >= 30) {
# Train model for this genre
genre_model <- randomForest(
revenue..Millions. ~ .,
data = genre_data,
importance = TRUE
)
# Get importance scores
imp_scores <- importance(genre_model)
# Store results
genre_results[[genre]] <- data.frame(
Genre = genre,
Variable = rownames(imp_scores),
Importance = imp_scores[,1]
)
}
}
# Combine results
genre_analysis <- do.call(rbind, genre_results)
# 7. Prediction for the new movie
new_movie <- data.frame(
critics_score = 55,
Facebook_Likes = 1250,
budget..Millions. = 20,
imdb_rating = 7.0, # Using an average value
audience_score = 70, # Using an average value
runtime = 120, # Using an average value
main_genre = "Drama" # Using a common genre
)
# Make prediction
predicted_revenue <- predict(rf_model, newdata = new_movie)
# Calculate impact of advertising campaign
new_movie_with_campaign <- new_movie
new_movie_with_campaign$Facebook_Likes <- new_movie$Facebook_Likes * 100 # 10,000% increase
new_movie_with_campaign$budget..Millions. <- 20
predicted_revenue_with_campaign <- predict(rf_model, newdata = new_movie_with_campaign)
# Print results
cat("\nAnalysis Results:\n")
##
## Analysis Results:
cat("\n1. Genre Revenue Analysis:\n")
##
## 1. Genre Revenue Analysis:
print(genre_revenue)
## # A tibble: 11 × 4
## main_genre avg_revenue avg_profit count
## <chr> <dbl> <dbl> <int>
## 1 Animation 237. 171. 9
## 2 Horror 189. 145. 23
## 3 Other 158. 115. 16
## 4 Musical & Performing Arts 137. 94.7 12
## 5 Comedy 137. 101. 89
## 6 Drama 137. 96.2 312
## 7 Mystery & Suspense 131. 82.7 60
## 8 Documentary 130. 98.6 56
## 9 Action & Adventure 129. 88.1 65
## 10 Art House & International 127. 92.1 14
## 11 Science Fiction & Fantasy 67.2 28.8 9
cat("\n2. Critics vs Viewer Scores Correlation:\n")
##
## 2. Critics vs Viewer Scores Correlation:
print(score_correlation)
##
## Pearson's product-moment correlation
##
## data: movies_df$critics_score_10 and movies_df$imdb_rating
## t = 10.926, df = 663, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3242253 0.4531951
## sample estimates:
## cor
## 0.3906253
cat("\n3. Top Revenue Driving Factors:\n")
##
## 3. Top Revenue Driving Factors:
print(importance_df %>% arrange(desc(IncNodePurity)) %>% head())
## %IncMSE IncNodePurity variable
## Facebook_Likes 36.268891 8933463 Facebook_Likes
## budget..Millions. 14.443764 5410243 budget..Millions.
## audience_score 8.577364 2283083 audience_score
## imdb_rating 5.941354 1840413 imdb_rating
## critics_score 3.575106 1818529 critics_score
## runtime -3.412131 1097365 runtime
cat("\n4. Predicted Revenue for New Movie:\n")
##
## 4. Predicted Revenue for New Movie:
cat("Base prediction:", round(predicted_revenue, 2), "million\n")
## Base prediction: 113.87 million
cat("With campaign prediction:", round(predicted_revenue_with_campaign, 2), "million\n")
## With campaign prediction: 150.6 million
# Print additional insights
cat("\n\nAdditional Insights:")
##
##
## Additional Insights:
cat("\n1. Genre-specific analysis (for genres with 30+ movies):\n")
##
## 1. Genre-specific analysis (for genres with 30+ movies):
# Print top 2 driving factors for each genre
genre_analysis %>%
group_by(Genre) %>%
arrange(desc(Importance)) %>%
slice_head(n = 2) %>%
print()
## # A tibble: 10 × 3
## # Groups: Genre [5]
## Genre Variable Importance
## <chr> <chr> <dbl>
## 1 Action & Adventure Facebook_Likes 18.6
## 2 Action & Adventure budget..Millions. 13.0
## 3 Comedy budget..Millions. 10.7
## 4 Comedy Facebook_Likes 6.77
## 5 Documentary Facebook_Likes 10.4
## 6 Documentary audience_score 6.67
## 7 Drama Facebook_Likes 31.6
## 8 Drama budget..Millions. 12.7
## 9 Mystery & Suspense Facebook_Likes 17.0
## 10 Mystery & Suspense audience_score 8.39
cat("\n2. Model Performance on Test Set:\n")
##
## 2. Model Performance on Test Set:
test_predictions <- predict(rf_model, newdata = test_data)
test_rmse <- sqrt(mean((test_data$revenue..Millions. - test_predictions)^2))
test_r2 <- cor(test_data$revenue..Millions., test_predictions)^2
cat("RMSE:", round(test_rmse, 2), "million\n")
## RMSE: 116.77 million
cat("R-squared:", round(test_r2, 3), "\n")
## R-squared: 0.756
# Save plots
ggsave("genre_revenue.png", plot = p1, width = 10, height = 6)
ggsave("critics_vs_imdb.png", plot = p2, width = 8, height = 6)
## `geom_smooth()` using formula = 'y ~ x'
ggsave("variable_importance.png", plot = p3, width = 10, height = 6)
# Create and save genre-specific importance plots
for(genre in names(genre_results)) {
genre_data <- genre_analysis[genre_analysis$Genre == genre,]
p <- ggplot(genre_data, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "coral") +
coord_flip() +
labs(title = paste("Variable Importance for", genre, "Movies"),
x = "Variables",
y = "Importance") +
theme_minimal()
ggsave(paste0("importance_", make.names(genre), ".png"), plot = p, width = 8, height = 6)
}
# 8. Additional Analyses
# Calculate ROI
movies_df <- movies_df %>%
mutate(ROI = (revenue..Millions. - budget..Millions.) / budget..Millions. * 100)
# ROI Analysis
cat("\n\n=== ROI Analysis ===\n")
##
##
## === ROI Analysis ===
roi_by_genre <- movies_df %>%
group_by(main_genre) %>%
summarise(
avg_roi = mean(ROI, na.rm = TRUE),
median_roi = median(ROI, na.rm = TRUE),
roi_sd = sd(ROI, na.rm = TRUE),
count = n()
) %>%
arrange(desc(avg_roi))
print(roi_by_genre)
## # A tibble: 11 × 5
## main_genre avg_roi median_roi roi_sd count
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Documentary 4131. 157. 27359. 56
## 2 Comedy 2798. 173. 21711. 89
## 3 Action & Adventure 1802. 146. 11026. 65
## 4 Drama 988. 125. 6236. 312
## 5 Art House & International 447. 254. 456. 14
## 6 Other 420. 216. 896. 16
## 7 Mystery & Suspense 334. 111. 800. 60
## 8 Horror 288. 109. 423. 23
## 9 Animation 278. 59.3 377. 9
## 10 Musical & Performing Arts 169. 96.7 203. 12
## 11 Science Fiction & Fantasy 72.9 40.6 131. 9
# Plot ROI by Genre
p_roi <- ggplot(roi_by_genre, aes(x = reorder(main_genre, avg_roi), y = avg_roi)) +
geom_bar(stat = "identity", fill = "lightgreen") +
coord_flip() +
labs(title = "Average ROI by Genre",
x = "Genre",
y = "Average ROI (%)")
ggsave("roi_by_genre.png", plot = p_roi, width = 10, height = 6)
# Top 10 Most Profitable Movies
cat("\n\nTop 10 Most Profitable Movies (by ROI):\n")
##
##
## Top 10 Most Profitable Movies (by ROI):
top_roi <- movies_df %>%
select(original_title, main_genre, budget..Millions., revenue..Millions., ROI) %>%
arrange(desc(ROI)) %>%
head(10)
print(top_roi)
## original_title main_genre
## 1 The Twilight Saga: Breaking Dawn - Part 2 Documentary
## 2 The Twilight Saga: Breaking Dawn - Part 2 Comedy
## 3 War of the Worlds Drama
## 4 The Hunger Games: Catching Fire Action & Adventure
## 5 Puss in Boots Drama
## 6 Ant-Man Drama
## 7 Home Drama
## 8 Miami Vice Drama
## 9 R.I.P.D. Documentary
## 10 Godzilla Resurgence Comedy
## budget..Millions. revenue..Millions. ROI
## 1 0.5 1025.4671 204993.422
## 2 0.5 1025.4671 204993.422
## 3 1.0 954.3059 95330.587
## 4 0.7 623.9333 89033.333
## 5 0.6 274.4704 45645.066
## 6 3.5 825.5000 23485.714
## 7 2.0 327.8037 16290.187
## 8 1.0 97.3826 9638.260
## 9 3.0 271.4302 8947.673
## 10 1.9 157.1078 8168.829
# Correlation Matrix of Audience Metrics
audience_metrics <- movies_df %>%
select(imdb_rating, critics_score, audience_score, Facebook_Likes,
revenue..Millions., budget..Millions., ROI)
correlation_matrix <- cor(audience_metrics, use = "complete.obs")
png("correlation_matrix.png", width = 800, height = 800)
corrplot(correlation_matrix, method = "color", type = "upper",
addCoef.col = "black", number.cex = 0.7,
tl.col = "black", tl.srt = 45)
dev.off()
## quartz_off_screen
## 2
# MPAA Rating Analysis
mpaa_analysis <- movies_df %>%
group_by(mpaa_rating) %>%
summarise(
avg_revenue = mean(revenue..Millions., na.rm = TRUE),
avg_roi = mean(ROI, na.rm = TRUE),
count = n()
) %>%
arrange(desc(avg_revenue))
cat("\n\nMPAA Rating Analysis:\n")
##
##
## MPAA Rating Analysis:
print(mpaa_analysis)
## # A tibble: 6 × 4
## mpaa_rating avg_revenue avg_roi count
## <chr> <dbl> <dbl> <int>
## 1 PG 182. 2982. 121
## 2 Unrated 160. 4327. 53
## 3 NC-17 146. 351. 2
## 4 R 128. 616. 336
## 5 G 122. 330. 19
## 6 PG-13 115. 1094. 134
# Plot MPAA Rating Revenue
p_mpaa <- ggplot(mpaa_analysis, aes(x = reorder(mpaa_rating, avg_revenue), y = avg_revenue)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(title = "Average Revenue by MPAA Rating",
x = "MPAA Rating",
y = "Average Revenue (Millions)")
ggsave("mpaa_revenue.png", plot = p_mpaa, width = 8, height = 6)
# Budget Bracket Analysis
movies_df <- movies_df %>%
mutate(budget_bracket = case_when(
budget..Millions. < 10 ~ "Low Budget (<$10M)",
budget..Millions. < 50 ~ "Medium Budget ($10M-$50M)",
budget..Millions. < 100 ~ "High Budget ($50M-$100M)",
TRUE ~ "Blockbuster (>$100M)"
))
budget_analysis <- movies_df %>%
group_by(budget_bracket) %>%
summarise(
avg_revenue = mean(revenue..Millions., na.rm = TRUE),
avg_roi = mean(ROI, na.rm = TRUE),
success_rate = mean(ROI > 0, na.rm = TRUE) * 100,
count = n()
) %>%
arrange(desc(avg_revenue))
cat("\n\nBudget Bracket Analysis:\n")
##
##
## Budget Bracket Analysis:
print(budget_analysis)
## # A tibble: 4 × 5
## budget_bracket avg_revenue avg_roi success_rate count
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Blockbuster (>$100M) 458. 201. 90.7 75
## 2 High Budget ($50M-$100M) 190. 182. 80.9 110
## 3 Medium Budget ($10M-$50M) 81.7 259. 76.1 322
## 4 Low Budget (<$10M) 62.4 5267. 67.7 158
# Plot Budget Bracket Analysis
p_budget <- ggplot(budget_analysis, aes(x = budget_bracket, y = avg_revenue)) +
geom_bar(stat = "identity", fill = "coral") +
geom_text(aes(label = sprintf("%.1f%%", success_rate), y = avg_revenue + 10)) +
labs(title = "Average Revenue by Budget Bracket\n(with Success Rate %)",
x = "Budget Bracket",
y = "Average Revenue (Millions)")
ggsave("budget_analysis.png", plot = p_budget, width = 10, height = 6)
# Risk Analysis
risk_analysis <- movies_df %>%
group_by(main_genre) %>%
summarise(
avg_roi = mean(ROI, na.rm = TRUE),
roi_volatility = sd(ROI, na.rm = TRUE),
success_rate = mean(ROI > 0, na.rm = TRUE) * 100,
count = n()
) %>%
arrange(desc(avg_roi))
cat("\n\nRisk Analysis by Genre:\n")
##
##
## Risk Analysis by Genre:
print(risk_analysis)
## # A tibble: 11 × 5
## main_genre avg_roi roi_volatility success_rate count
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Documentary 4131. 27359. 83.9 56
## 2 Comedy 2798. 21711. 80.9 89
## 3 Action & Adventure 1802. 11026. 73.8 65
## 4 Drama 988. 6236. 75.3 312
## 5 Art House & International 447. 456. 100 14
## 6 Other 420. 896. 75 16
## 7 Mystery & Suspense 334. 800. 73.3 60
## 8 Horror 288. 423. 69.6 23
## 9 Animation 278. 377. 88.9 9
## 10 Musical & Performing Arts 169. 203. 66.7 12
## 11 Science Fiction & Fantasy 72.9 131. 55.6 9
# Plot Risk vs Return
p_risk <- ggplot(risk_analysis, aes(x = roi_volatility, y = avg_roi, label = main_genre)) +
geom_point(aes(size = count, color = success_rate)) +
geom_text(hjust = -0.1, vjust = 0) +
scale_color_gradient(low = "red", high = "green") +
labs(title = "Risk vs Return by Genre",
x = "ROI Volatility (Risk)",
y = "Average ROI (Return)",
size = "Number of Movies",
color = "Success Rate (%)") +
theme(legend.position = "right")
ggsave("risk_return.png", plot = p_risk, width = 12, height = 8)
# Language and Country Analysis
market_analysis <- movies_df %>%
group_by(country) %>%
summarise(
avg_revenue = mean(revenue..Millions., na.rm = TRUE),
avg_roi = mean(ROI, na.rm = TRUE),
count = n()
) %>%
filter(count >= 5) %>% # Only countries with at least 5 movies
arrange(desc(avg_revenue))
cat("\n\nMarket Analysis by Country (min 5 movies):\n")
##
##
## Market Analysis by Country (min 5 movies):
print(market_analysis)
## # A tibble: 7 × 4
## country avg_revenue avg_roi count
## <chr> <dbl> <dbl> <int>
## 1 France 279. 161. 10
## 2 Japan 153. 3344. 5
## 3 USA 142. 1618. 550
## 4 Germany 124. 408. 11
## 5 UK 112. 712. 36
## 6 Canada 104. 112. 19
## 7 Australia 25.5 193. 24
# Summary Statistics
cat("\n\nKey Findings:\n")
##
##
## Key Findings:
cat("1. Most Profitable Genre (by ROI):", roi_by_genre$main_genre[1], "\n")
## 1. Most Profitable Genre (by ROI): Documentary
cat("2. Safest Investment (Highest Success Rate):",
risk_analysis$main_genre[which.max(risk_analysis$success_rate)], "\n")
## 2. Safest Investment (Highest Success Rate): Art House & International
cat("3. Best Budget Bracket (by ROI):",
budget_analysis$budget_bracket[which.max(budget_analysis$avg_roi)], "\n")
## 3. Best Budget Bracket (by ROI): Low Budget (<$10M)
cat("4. Most Profitable MPAA Rating:", mpaa_analysis$mpaa_rating[1], "\n")
## 4. Most Profitable MPAA Rating: PG
cat("5. Top Market:", market_analysis$country[1], "\n")
## 5. Top Market: France
# Save all analysis results to a file
sink("detailed_analysis_results.txt")
cat("=== Complete Movie Market Analysis ===\n\n")
cat("1. ROI Analysis by Genre:\n")
print(roi_by_genre)
cat("\n2. Top 10 Most Profitable Movies:\n")
print(top_roi)
cat("\n3. MPAA Rating Analysis:\n")
print(mpaa_analysis)
cat("\n4. Budget Bracket Analysis:\n")
print(budget_analysis)
cat("\n5. Risk Analysis:\n")
print(risk_analysis)
cat("\n6. Market Analysis by Country:\n")
print(market_analysis)
sink()
# Add Critic vs Viewer Score Analysis
cat("\n\n=== Critic vs Viewer Score Impact Analysis ===\n")
##
##
## === Critic vs Viewer Score Impact Analysis ===
# Create score impact analysis
score_impact <- movies_df %>%
select(revenue..Millions., critics_score, audience_score, imdb_rating) %>%
gather(score_type, score, -revenue..Millions.) %>%
mutate(
score_type = case_when(
score_type == "critics_score" ~ "Critics Score",
score_type == "audience_score" ~ "Audience Score",
score_type == "imdb_rating" ~ "IMDB Rating"
)
)
# Create boxplot of revenue by score ranges
p_scores <- ggplot(score_impact, aes(x = cut(score, breaks = seq(0, 100, by = 20)), y = revenue..Millions.)) +
geom_boxplot(fill = "lightblue") +
facet_wrap(~score_type, scales = "free_x") +
labs(title = "Revenue Distribution by Score Ranges",
x = "Score Range",
y = "Revenue (Millions)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave("score_impact.png", plot = p_scores, width = 12, height = 6)
# Calculate correlation with revenue
score_correlations <- movies_df %>%
summarise(
critics_correlation = cor(critics_score, revenue..Millions., use = "complete.obs"),
audience_correlation = cor(audience_score, revenue..Millions., use = "complete.obs"),
imdb_correlation = cor(imdb_rating * 10, revenue..Millions., use = "complete.obs")
)
# Create scatter plots
p_critics <- ggplot(movies_df, aes(x = critics_score, y = revenue..Millions.)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "red") +
labs(title = "Critics Score vs Revenue",
x = "Critics Score",
y = "Revenue (Millions)")
p_audience <- ggplot(movies_df, aes(x = audience_score, y = revenue..Millions.)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Audience Score vs Revenue",
x = "Audience Score",
y = "Revenue (Millions)")
# Combine plots
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:randomForest':
##
## combine
##
## The following object is masked from 'package:dplyr':
##
## combine
p_combined <- grid.arrange(p_critics, p_audience, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

ggsave("score_comparison.png", plot = p_combined, width = 12, height = 6)
# Print correlations
cat("\nCorrelations with Revenue:")
##
## Correlations with Revenue:
print(score_correlations)
## critics_correlation audience_correlation imdb_correlation
## 1 0.3839412 0.4316298 0.2176629
# Calculate average revenue by score brackets
score_brackets <- movies_df %>%
mutate(
critic_bracket = cut(critics_score, breaks = seq(0, 100, by = 20)),
audience_bracket = cut(audience_score, breaks = seq(0, 100, by = 20))
) %>%
summarise(
avg_rev_by_critic = mean(revenue..Millions.[critics_score > 80], na.rm = TRUE),
avg_rev_by_audience = mean(revenue..Millions.[audience_score > 80], na.rm = TRUE)
)
cat("\nAverage Revenue for High Scores (>80):")
##
## Average Revenue for High Scores (>80):
print(score_brackets)
## avg_rev_by_critic avg_rev_by_audience
## 1 249.2747 200.8896
# Create Top Driving Factors by Genre Analysis
top_factors_by_genre <- genre_analysis %>%
group_by(Genre) %>%
arrange(desc(Importance)) %>%
slice_head(n = 2) %>%
mutate(
Variable = factor(Variable),
Genre = factor(Genre, levels = unique(Genre[order(Importance, decreasing = TRUE)]))
)
# Create a visualization for top factors by genre
p_top_factors <- ggplot(top_factors_by_genre,
aes(x = Genre, y = Importance, fill = Variable)) +
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
scale_fill_brewer(palette = "Set2") +
labs(title = "Top 2 Driving Factors by Genre",
x = "Genre",
y = "Importance Score",
fill = "Factor") +
theme(
legend.position = "right",
axis.text.y = element_text(size = 10),
plot.title = element_text(size = 14, face = "bold")
)
# Save the plot
ggsave("top_factors_by_genre.png", plot = p_top_factors, width = 12, height = 8)
# Create a summary table of top factors
top_factors_summary <- top_factors_by_genre %>%
arrange(Genre, desc(Importance)) %>%
mutate(Importance = round(Importance, 2)) %>%
pivot_wider(
id_cols = Genre,
names_from = c(Variable),
values_from = Importance,
names_prefix = "Factor_"
)
# Print the summary
cat("\n\nTop Driving Factors by Genre:\n")
##
##
## Top Driving Factors by Genre:
print(top_factors_summary)
## # A tibble: 5 × 4
## # Groups: Genre [5]
## Genre Factor_Facebook_Likes Factor_budget..Milli…¹ Factor_audience_score
## <fct> <dbl> <dbl> <dbl>
## 1 Action & A… 18.6 13.0 NA
## 2 Comedy 6.77 10.7 NA
## 3 Documentary 10.4 NA 6.67
## 4 Drama 31.6 12.7 NA
## 5 Mystery & … 17.0 NA 8.39
## # ℹ abbreviated name: ¹Factor_budget..Millions.
# Create Genre Groups Analysis
movies_df <- movies_df %>%
mutate(genre_group = case_when(
main_genre %in% c("Action & Adventure", "Science Fiction & Fantasy","Animation") ~ "Action/Adventure/SciFi/Animation",
main_genre %in% c("Drama", "Art House & International") ~ "Drama/Art House",
main_genre %in% c("Comedy") ~ "Comedy",
main_genre %in% c("Horror", "Mystery & Suspense") ~ "Horror/Thriller",
main_genre %in% c("Documentary", "Musical & Performing Arts", "Other") ~ "Documentary/Musical/Other",
))
# Analyze driving factors for genre groups
genre_group_data <- movies_df %>%
group_by(genre_group) %>%
summarise(count = n()) %>%
filter(count >= 20) # Only analyze groups with sufficient data
# Initialize list to store results
genre_group_results <- list()
# Analyze each genre group
for(group in genre_group_data$genre_group) {
# Filter data for this genre group and remove NA values
group_data <- movies_df[movies_df$genre_group == group, ] %>%
select(
revenue..Millions.,
budget..Millions.,
imdb_rating,
critics_score,
audience_score,
runtime,
Facebook_Likes
) %>%
na.omit()
# Train model for this genre group
group_model <- randomForest(
revenue..Millions. ~ .,
data = group_data,
importance = TRUE
)
# Get importance scores
imp_scores <- importance(group_model)
# Store results
genre_group_results[[group]] <- data.frame(
Genre_Group = group,
Variable = rownames(imp_scores),
Importance = imp_scores[,1]
)
}
# Combine results
genre_group_analysis <- do.call(rbind, genre_group_results)
# Create visualization for genre groups
p_group_factors <- ggplot(
genre_group_analysis %>%
group_by(Genre_Group) %>%
arrange(desc(Importance)) %>%
slice_head(n = 3), # Show top 3 factors
aes(x = reorder(Genre_Group, Importance), y = Importance, fill = Variable)
) +
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
scale_fill_brewer(palette = "Set3") +
labs(title = "Top 3 Driving Factors by Genre Group",
x = "Genre Group",
y = "Importance Score",
fill = "Factor") +
theme(
legend.position = "right",
axis.text.y = element_text(size = 10),
plot.title = element_text(size = 14, face = "bold")
)
# Save the plot
ggsave("genre_group_factors.png", plot = p_group_factors, width = 12, height = 8)
# Create summary table
genre_group_summary <- genre_group_analysis %>%
group_by(Genre_Group) %>%
arrange(desc(Importance)) %>%
slice_head(n = 3) %>% # Top 3 factors
mutate(Importance = round(Importance, 2)) %>%
arrange(Genre_Group, desc(Importance))
# Print summary
cat("\n\nTop Driving Factors by Genre Group:\n")
##
##
## Top Driving Factors by Genre Group:
print(genre_group_summary)
## # A tibble: 15 × 3
## # Groups: Genre_Group [5]
## Genre_Group Variable Importance
## <chr> <chr> <dbl>
## 1 Action/Adventure/SciFi/Animation Facebook_Likes 20.1
## 2 Action/Adventure/SciFi/Animation budget..Millions. 9.68
## 3 Action/Adventure/SciFi/Animation audience_score 9.15
## 4 Comedy budget..Millions. 7.67
## 5 Comedy Facebook_Likes 7.07
## 6 Comedy critics_score 3.47
## 7 Documentary/Musical/Other Facebook_Likes 20.0
## 8 Documentary/Musical/Other budget..Millions. 6.69
## 9 Documentary/Musical/Other audience_score 6.02
## 10 Drama/Art House Facebook_Likes 27.2
## 11 Drama/Art House budget..Millions. 13.0
## 12 Drama/Art House imdb_rating 9.28
## 13 Horror/Thriller Facebook_Likes 21.1
## 14 Horror/Thriller budget..Millions. 8.98
## 15 Horror/Thriller critics_score 1.74
# Calculate success metrics for genre groups
genre_group_metrics <- movies_df %>%
group_by(genre_group) %>%
summarise(
avg_revenue = mean(revenue..Millions., na.rm = TRUE),
avg_roi = mean(ROI, na.rm = TRUE),
success_rate = mean(ROI > 0, na.rm = TRUE) * 100,
count = n()
) %>%
arrange(desc(avg_revenue))
cat("\n\nGenre Group Performance Metrics:\n")
##
##
## Genre Group Performance Metrics:
print(genre_group_metrics)
## # A tibble: 5 × 5
## genre_group avg_revenue avg_roi success_rate count
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Horror/Thriller 147. 321. 72.3 83
## 2 Comedy 137. 2798. 80.9 89
## 3 Documentary/Musical/Other 137. 2858. 79.8 84
## 4 Drama/Art House 136. 965. 76.4 326
## 5 Action/Adventure/SciFi/Animation 134. 1449. 73.5 83
# Create Investment Strategy Pie Chart
investment_data <- data.frame(
Category = c("Core Investment", "Growth Investment", "Speculative Investment"),
Percentage = c(60, 30, 10),
Description = c("Medium Budget ($10M-$50M)\nDrama & Comedy\nPG/PG-13",
"High Budget ($50M-$100M)\nAction & Animation\nStrong Social",
"Low Budget (<$10M)\nDoc & Horror\nHigh ROI")
)
# Create pie chart with custom colors and clean design
p_investment <- ggplot(investment_data, aes(x = "", y = Percentage, fill = Category)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
scale_fill_manual(values = c("#2E86C1", "#28B463", "#E67E22")) +
theme_minimal() +
theme(
axis.title = element_blank(),
axis.text = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
legend.title = element_blank(),
legend.position = "right"
) +
geom_text(aes(label = paste0(Percentage, "%\n", Description)),
position = position_stack(vjust = 0.5),
size = 3.5, color = "white", lineheight = 0.8) +
labs(title = "Investment Strategy Portfolio")
# Save the plot
ggsave("investment_strategy_pie.png", plot = p_investment, width = 10, height = 7, dpi = 300)