# Set CRAN mirror
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Function to install packages if not already installed
install_if_missing <- function(packages) {
  new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
  if(length(new_packages)) install.packages(new_packages)
}

# Install required packages
required_packages <- c("tidyverse", "ggplot2", "dplyr", "scales", 
                      "corrplot", "caret", "randomForest", "readxl")
install_if_missing(required_packages)

# Load the libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(corrplot)
## corrplot 0.95 loaded
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(readxl)

# Create a custom theme with solid white background
custom_theme <- theme_minimal() +
  theme(
    plot.background = element_rect(fill = "white", color = "black"),
    panel.background = element_rect(fill = "white", color = "black"),
    panel.grid.major = element_line(color = "grey90"),
    panel.grid.minor = element_line(color = "grey95"),
    legend.background = element_rect(fill = "white", color = "black"),
    legend.box.background = element_rect(fill = "white", color = "black"),
    axis.text = element_text(color = "black"),
    axis.title = element_text(color = "black"),
    plot.title = element_text(color = "black")
  )

# Set the custom theme as default
theme_set(custom_theme)
setwd("/Users/ethanbalogh/Desktop/mktg3p98-proj3")
# 1. Data Import and Cleaning
audience_data <- read_excel("Movie_Dataset_General_Audience.xlsx")
financial_data <- read_excel("Movie_Dataset_Financials.xlsx")

# Print dimensions of imported data
cat("\nDimensions of datasets:")
## 
## Dimensions of datasets:
cat("\nAudience data:", dim(audience_data))
## 
## Audience data: 651 13
cat("\nFinancial data:", dim(financial_data))
## 
## Financial data: 651 5
# Remove trailing spaces from original_title in both datasets
audience_data$original_title <- str_trim(audience_data$original_title)
financial_data$original_title <- str_trim(financial_data$original_title)

# Print column names
cat("\n\nColumn names:")
## 
## 
## Column names:
cat("\nAudience data:", names(audience_data))
## 
## Audience data: original_title type genre runtime mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score audience_rating audience_score best_pic_nom Facebook_Likes
cat("\nFinancial data:", names(financial_data))
## 
## Financial data: original_title budget (Millions) revenue (Millions) language country
# Merge datasets
movies_df <- merge(audience_data, financial_data, by = "original_title", all = FALSE)

# Clean up column names for modeling
names(movies_df) <- make.names(names(movies_df))

# Print dimensions after merge
cat("\n\nDimensions after merge:", dim(movies_df))
## 
## 
## Dimensions after merge: 665 17
# Print first few rows of financial columns
cat("\n\nFirst few rows of financial columns:")
## 
## 
## First few rows of financial columns:
print(head(select(movies_df, original_title, budget..Millions., revenue..Millions.)))
##           original_title budget..Millions. revenue..Millions.
## 1            10,000 B.C.              12.6          18.658381
## 2         102 Dalmatians              45.0          60.222298
## 3       2 Fast 2 Furious              16.0          31.556061
## 4                   2012               3.5           0.754249
## 5 300: Rise of an Empire              16.0          19.682924
## 6               47 Ronin              50.0         240.360392
# Create profit column with error checking
movies_df <- movies_df %>%
  mutate(profit = as.numeric(revenue..Millions.) - as.numeric(budget..Millions.))

# Print summary of financial metrics
cat("\n\nSummary of financial metrics:")
## 
## 
## Summary of financial metrics:
print(summary(select(movies_df, revenue..Millions., budget..Millions., profit)))
##  revenue..Millions.  budget..Millions.     profit         
##  Min.   :   0.0093   Min.   :  0.318   Min.   :-109.3480  
##  1st Qu.:  19.6829   1st Qu.: 10.000   1st Qu.:   0.6707  
##  Median :  61.5487   Median : 25.000   Median :  33.0000  
##  Mean   : 137.4735   Mean   : 40.319   Mean   :  97.1540  
##  3rd Qu.: 153.9630   3rd Qu.: 51.000   3rd Qu.: 105.3465  
##  Max.   :2068.1782   Max.   :280.000   Max.   :1868.1782
# 2. Genre Categorization
# Simplify genres into main categories
movies_df$main_genre <- str_extract(movies_df$genre, "^[^,]+")

# Print genre distribution
cat("\n\nGenre distribution:")
## 
## 
## Genre distribution:
print(table(movies_df$main_genre))
## 
##        Action & Adventure                 Animation Art House & International 
##                        65                         9                        14 
##                    Comedy               Documentary                     Drama 
##                        89                        56                       312 
##                    Horror Musical & Performing Arts        Mystery & Suspense 
##                        23                        12                        60 
##                     Other Science Fiction & Fantasy 
##                        16                         9
# 3. Convert critics_score to 10-point scale
movies_df$critics_score_10 <- movies_df$critics_score / 10

# 4. Basic Statistics and Visualizations

# Genre Revenue Analysis
genre_revenue <- movies_df %>%
  group_by(main_genre) %>%
  summarise(
    avg_revenue = mean(as.numeric(revenue..Millions.), na.rm = TRUE),
    avg_profit = mean(profit, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(desc(avg_revenue))

# Print genre analysis
cat("\n\nGenre Revenue Analysis:")
## 
## 
## Genre Revenue Analysis:
print(genre_revenue)
## # A tibble: 11 × 4
##    main_genre                avg_revenue avg_profit count
##    <chr>                           <dbl>      <dbl> <int>
##  1 Animation                       237.       171.      9
##  2 Horror                          189.       145.     23
##  3 Other                           158.       115.     16
##  4 Musical & Performing Arts       137.        94.7    12
##  5 Comedy                          137.       101.     89
##  6 Drama                           137.        96.2   312
##  7 Mystery & Suspense              131.        82.7    60
##  8 Documentary                     130.        98.6    56
##  9 Action & Adventure              129.        88.1    65
## 10 Art House & International       127.        92.1    14
## 11 Science Fiction & Fantasy        67.2       28.8     9
# Create visualization for genre revenue
p1 <- ggplot(genre_revenue, aes(x = reorder(main_genre, avg_revenue), y = avg_revenue)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  labs(title = "Average Revenue by Genre",
       x = "Genre",
       y = "Average Revenue (Millions)") +
  theme_minimal()

# Critics vs Viewer Scores Analysis
score_correlation <- cor.test(movies_df$critics_score_10, movies_df$imdb_rating)

# Print correlation results
cat("\n\nCritics vs Viewer Scores Correlation:")
## 
## 
## Critics vs Viewer Scores Correlation:
print(score_correlation)
## 
##  Pearson's product-moment correlation
## 
## data:  movies_df$critics_score_10 and movies_df$imdb_rating
## t = 10.926, df = 663, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3242253 0.4531951
## sample estimates:
##       cor 
## 0.3906253
# Visualization of Critics vs Viewer Scores
p2 <- ggplot(movies_df, aes(x = critics_score_10, y = imdb_rating)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  labs(title = "Critics Score vs IMDB Rating",
       x = "Critics Score (10-point scale)",
       y = "IMDB Rating")

# 5. Predictive Analysis
# Prepare data for modeling
model_data <- movies_df %>%
  select(
    revenue..Millions.,
    budget..Millions.,
    imdb_rating,
    critics_score,
    audience_score,
    runtime,
    Facebook_Likes,
    main_genre
  ) %>%
  na.omit()

# Split data into training and testing sets
set.seed(123)
train_index <- createDataPartition(model_data$revenue..Millions., p = 0.8, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]

# Train Random Forest model
rf_model <- randomForest(
  revenue..Millions. ~ .,
  data = train_data,
  importance = TRUE
)

# Get variable importance
importance_df <- as.data.frame(importance(rf_model))
importance_df$variable <- rownames(importance_df)

# Plot variable importance
p3 <- ggplot(importance_df, aes(x = reorder(variable, IncNodePurity), y = IncNodePurity)) +
  geom_bar(stat = "identity", fill = "coral") +
  coord_flip() +
  labs(title = "Variable Importance in Predicting Revenue",
       x = "Variables",
       y = "Importance (Node Purity)")

# 6. Genre-Specific Analysis
# Get list of genres with sufficient data
major_genres <- names(table(movies_df$main_genre))[table(movies_df$main_genre) >= 30]

# Initialize list to store results
genre_results <- list()

# Analyze each major genre
for(genre in major_genres) {
  # Filter data for this genre and remove NA values
  genre_data <- movies_df[movies_df$main_genre == genre, ] %>%
    select(
      revenue..Millions.,
      budget..Millions.,
      imdb_rating,
      critics_score,
      audience_score,
      runtime,
      Facebook_Likes
    ) %>%
    na.omit()
  
  # Only proceed if we have enough data after removing NAs
  if(nrow(genre_data) >= 30) {
    # Train model for this genre
    genre_model <- randomForest(
      revenue..Millions. ~ .,
      data = genre_data,
      importance = TRUE
    )
    
    # Get importance scores
    imp_scores <- importance(genre_model)
    
    # Store results
    genre_results[[genre]] <- data.frame(
      Genre = genre,
      Variable = rownames(imp_scores),
      Importance = imp_scores[,1]
    )
  }
}

# Combine results
genre_analysis <- do.call(rbind, genre_results)

# 7. Prediction for the new movie
new_movie <- data.frame(
  critics_score = 55,
  Facebook_Likes = 1250,
  budget..Millions. = 20,
  imdb_rating = 7.0,  # Using an average value
  audience_score = 70,  # Using an average value
  runtime = 120,  # Using an average value
  main_genre = "Drama"  # Using a common genre
)

# Make prediction
predicted_revenue <- predict(rf_model, newdata = new_movie)

# Calculate impact of advertising campaign
new_movie_with_campaign <- new_movie
new_movie_with_campaign$Facebook_Likes <- new_movie$Facebook_Likes * 100  # 10,000% increase
new_movie_with_campaign$budget..Millions. <- 20

predicted_revenue_with_campaign <- predict(rf_model, newdata = new_movie_with_campaign)

# Print results
cat("\nAnalysis Results:\n")
## 
## Analysis Results:
cat("\n1. Genre Revenue Analysis:\n")
## 
## 1. Genre Revenue Analysis:
print(genre_revenue)
## # A tibble: 11 × 4
##    main_genre                avg_revenue avg_profit count
##    <chr>                           <dbl>      <dbl> <int>
##  1 Animation                       237.       171.      9
##  2 Horror                          189.       145.     23
##  3 Other                           158.       115.     16
##  4 Musical & Performing Arts       137.        94.7    12
##  5 Comedy                          137.       101.     89
##  6 Drama                           137.        96.2   312
##  7 Mystery & Suspense              131.        82.7    60
##  8 Documentary                     130.        98.6    56
##  9 Action & Adventure              129.        88.1    65
## 10 Art House & International       127.        92.1    14
## 11 Science Fiction & Fantasy        67.2       28.8     9
cat("\n2. Critics vs Viewer Scores Correlation:\n")
## 
## 2. Critics vs Viewer Scores Correlation:
print(score_correlation)
## 
##  Pearson's product-moment correlation
## 
## data:  movies_df$critics_score_10 and movies_df$imdb_rating
## t = 10.926, df = 663, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3242253 0.4531951
## sample estimates:
##       cor 
## 0.3906253
cat("\n3. Top Revenue Driving Factors:\n")
## 
## 3. Top Revenue Driving Factors:
print(importance_df %>% arrange(desc(IncNodePurity)) %>% head())
##                     %IncMSE IncNodePurity          variable
## Facebook_Likes    36.268891       8933463    Facebook_Likes
## budget..Millions. 14.443764       5410243 budget..Millions.
## audience_score     8.577364       2283083    audience_score
## imdb_rating        5.941354       1840413       imdb_rating
## critics_score      3.575106       1818529     critics_score
## runtime           -3.412131       1097365           runtime
cat("\n4. Predicted Revenue for New Movie:\n")
## 
## 4. Predicted Revenue for New Movie:
cat("Base prediction:", round(predicted_revenue, 2), "million\n")
## Base prediction: 113.87 million
cat("With campaign prediction:", round(predicted_revenue_with_campaign, 2), "million\n")
## With campaign prediction: 150.6 million
# Print additional insights
cat("\n\nAdditional Insights:")
## 
## 
## Additional Insights:
cat("\n1. Genre-specific analysis (for genres with 30+ movies):\n")
## 
## 1. Genre-specific analysis (for genres with 30+ movies):
# Print top 2 driving factors for each genre
genre_analysis %>%
  group_by(Genre) %>%
  arrange(desc(Importance)) %>%
  slice_head(n = 2) %>%
  print()
## # A tibble: 10 × 3
## # Groups:   Genre [5]
##    Genre              Variable          Importance
##    <chr>              <chr>                  <dbl>
##  1 Action & Adventure Facebook_Likes         18.6 
##  2 Action & Adventure budget..Millions.      13.0 
##  3 Comedy             budget..Millions.      10.7 
##  4 Comedy             Facebook_Likes          6.77
##  5 Documentary        Facebook_Likes         10.4 
##  6 Documentary        audience_score          6.67
##  7 Drama              Facebook_Likes         31.6 
##  8 Drama              budget..Millions.      12.7 
##  9 Mystery & Suspense Facebook_Likes         17.0 
## 10 Mystery & Suspense audience_score          8.39
cat("\n2. Model Performance on Test Set:\n")
## 
## 2. Model Performance on Test Set:
test_predictions <- predict(rf_model, newdata = test_data)
test_rmse <- sqrt(mean((test_data$revenue..Millions. - test_predictions)^2))
test_r2 <- cor(test_data$revenue..Millions., test_predictions)^2
cat("RMSE:", round(test_rmse, 2), "million\n")
## RMSE: 116.77 million
cat("R-squared:", round(test_r2, 3), "\n")
## R-squared: 0.756
# Save plots
ggsave("genre_revenue.png", plot = p1, width = 10, height = 6)
ggsave("critics_vs_imdb.png", plot = p2, width = 8, height = 6)
## `geom_smooth()` using formula = 'y ~ x'
ggsave("variable_importance.png", plot = p3, width = 10, height = 6)

# Create and save genre-specific importance plots
for(genre in names(genre_results)) {
  genre_data <- genre_analysis[genre_analysis$Genre == genre,]
  p <- ggplot(genre_data, aes(x = reorder(Variable, Importance), y = Importance)) +
    geom_bar(stat = "identity", fill = "coral") +
    coord_flip() +
    labs(title = paste("Variable Importance for", genre, "Movies"),
         x = "Variables",
         y = "Importance") +
    theme_minimal()
  ggsave(paste0("importance_", make.names(genre), ".png"), plot = p, width = 8, height = 6)
}

# 8. Additional Analyses

# Calculate ROI
movies_df <- movies_df %>%
  mutate(ROI = (revenue..Millions. - budget..Millions.) / budget..Millions. * 100)

# ROI Analysis
cat("\n\n=== ROI Analysis ===\n")
## 
## 
## === ROI Analysis ===
roi_by_genre <- movies_df %>%
  group_by(main_genre) %>%
  summarise(
    avg_roi = mean(ROI, na.rm = TRUE),
    median_roi = median(ROI, na.rm = TRUE),
    roi_sd = sd(ROI, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(desc(avg_roi))

print(roi_by_genre)
## # A tibble: 11 × 5
##    main_genre                avg_roi median_roi roi_sd count
##    <chr>                       <dbl>      <dbl>  <dbl> <int>
##  1 Documentary                4131.       157.  27359.    56
##  2 Comedy                     2798.       173.  21711.    89
##  3 Action & Adventure         1802.       146.  11026.    65
##  4 Drama                       988.       125.   6236.   312
##  5 Art House & International   447.       254.    456.    14
##  6 Other                       420.       216.    896.    16
##  7 Mystery & Suspense          334.       111.    800.    60
##  8 Horror                      288.       109.    423.    23
##  9 Animation                   278.        59.3   377.     9
## 10 Musical & Performing Arts   169.        96.7   203.    12
## 11 Science Fiction & Fantasy    72.9       40.6   131.     9
# Plot ROI by Genre
p_roi <- ggplot(roi_by_genre, aes(x = reorder(main_genre, avg_roi), y = avg_roi)) +
  geom_bar(stat = "identity", fill = "lightgreen") +
  coord_flip() +
  labs(title = "Average ROI by Genre",
       x = "Genre",
       y = "Average ROI (%)")
ggsave("roi_by_genre.png", plot = p_roi, width = 10, height = 6)

# Top 10 Most Profitable Movies
cat("\n\nTop 10 Most Profitable Movies (by ROI):\n")
## 
## 
## Top 10 Most Profitable Movies (by ROI):
top_roi <- movies_df %>%
  select(original_title, main_genre, budget..Millions., revenue..Millions., ROI) %>%
  arrange(desc(ROI)) %>%
  head(10)
print(top_roi)
##                               original_title         main_genre
## 1  The Twilight Saga: Breaking Dawn - Part 2        Documentary
## 2  The Twilight Saga: Breaking Dawn - Part 2             Comedy
## 3                          War of the Worlds              Drama
## 4            The Hunger Games: Catching Fire Action & Adventure
## 5                              Puss in Boots              Drama
## 6                                    Ant-Man              Drama
## 7                                       Home              Drama
## 8                                 Miami Vice              Drama
## 9                                   R.I.P.D.        Documentary
## 10                       Godzilla Resurgence             Comedy
##    budget..Millions. revenue..Millions.        ROI
## 1                0.5          1025.4671 204993.422
## 2                0.5          1025.4671 204993.422
## 3                1.0           954.3059  95330.587
## 4                0.7           623.9333  89033.333
## 5                0.6           274.4704  45645.066
## 6                3.5           825.5000  23485.714
## 7                2.0           327.8037  16290.187
## 8                1.0            97.3826   9638.260
## 9                3.0           271.4302   8947.673
## 10               1.9           157.1078   8168.829
# Correlation Matrix of Audience Metrics
audience_metrics <- movies_df %>%
  select(imdb_rating, critics_score, audience_score, Facebook_Likes, 
         revenue..Millions., budget..Millions., ROI)

correlation_matrix <- cor(audience_metrics, use = "complete.obs")
png("correlation_matrix.png", width = 800, height = 800)
corrplot(correlation_matrix, method = "color", type = "upper", 
         addCoef.col = "black", number.cex = 0.7,
         tl.col = "black", tl.srt = 45)
dev.off()
## quartz_off_screen 
##                 2
# MPAA Rating Analysis
mpaa_analysis <- movies_df %>%
  group_by(mpaa_rating) %>%
  summarise(
    avg_revenue = mean(revenue..Millions., na.rm = TRUE),
    avg_roi = mean(ROI, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(desc(avg_revenue))

cat("\n\nMPAA Rating Analysis:\n")
## 
## 
## MPAA Rating Analysis:
print(mpaa_analysis)
## # A tibble: 6 × 4
##   mpaa_rating avg_revenue avg_roi count
##   <chr>             <dbl>   <dbl> <int>
## 1 PG                 182.   2982.   121
## 2 Unrated            160.   4327.    53
## 3 NC-17              146.    351.     2
## 4 R                  128.    616.   336
## 5 G                  122.    330.    19
## 6 PG-13              115.   1094.   134
# Plot MPAA Rating Revenue
p_mpaa <- ggplot(mpaa_analysis, aes(x = reorder(mpaa_rating, avg_revenue), y = avg_revenue)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Average Revenue by MPAA Rating",
       x = "MPAA Rating",
       y = "Average Revenue (Millions)")
ggsave("mpaa_revenue.png", plot = p_mpaa, width = 8, height = 6)

# Budget Bracket Analysis
movies_df <- movies_df %>%
  mutate(budget_bracket = case_when(
    budget..Millions. < 10 ~ "Low Budget (<$10M)",
    budget..Millions. < 50 ~ "Medium Budget ($10M-$50M)",
    budget..Millions. < 100 ~ "High Budget ($50M-$100M)",
    TRUE ~ "Blockbuster (>$100M)"
  ))

budget_analysis <- movies_df %>%
  group_by(budget_bracket) %>%
  summarise(
    avg_revenue = mean(revenue..Millions., na.rm = TRUE),
    avg_roi = mean(ROI, na.rm = TRUE),
    success_rate = mean(ROI > 0, na.rm = TRUE) * 100,
    count = n()
  ) %>%
  arrange(desc(avg_revenue))

cat("\n\nBudget Bracket Analysis:\n")
## 
## 
## Budget Bracket Analysis:
print(budget_analysis)
## # A tibble: 4 × 5
##   budget_bracket            avg_revenue avg_roi success_rate count
##   <chr>                           <dbl>   <dbl>        <dbl> <int>
## 1 Blockbuster (>$100M)            458.     201.         90.7    75
## 2 High Budget ($50M-$100M)        190.     182.         80.9   110
## 3 Medium Budget ($10M-$50M)        81.7    259.         76.1   322
## 4 Low Budget (<$10M)               62.4   5267.         67.7   158
# Plot Budget Bracket Analysis
p_budget <- ggplot(budget_analysis, aes(x = budget_bracket, y = avg_revenue)) +
  geom_bar(stat = "identity", fill = "coral") +
  geom_text(aes(label = sprintf("%.1f%%", success_rate), y = avg_revenue + 10)) +
  labs(title = "Average Revenue by Budget Bracket\n(with Success Rate %)",
       x = "Budget Bracket",
       y = "Average Revenue (Millions)")
ggsave("budget_analysis.png", plot = p_budget, width = 10, height = 6)

# Risk Analysis
risk_analysis <- movies_df %>%
  group_by(main_genre) %>%
  summarise(
    avg_roi = mean(ROI, na.rm = TRUE),
    roi_volatility = sd(ROI, na.rm = TRUE),
    success_rate = mean(ROI > 0, na.rm = TRUE) * 100,
    count = n()
  ) %>%
  arrange(desc(avg_roi))

cat("\n\nRisk Analysis by Genre:\n")
## 
## 
## Risk Analysis by Genre:
print(risk_analysis)
## # A tibble: 11 × 5
##    main_genre                avg_roi roi_volatility success_rate count
##    <chr>                       <dbl>          <dbl>        <dbl> <int>
##  1 Documentary                4131.          27359.         83.9    56
##  2 Comedy                     2798.          21711.         80.9    89
##  3 Action & Adventure         1802.          11026.         73.8    65
##  4 Drama                       988.           6236.         75.3   312
##  5 Art House & International   447.            456.        100      14
##  6 Other                       420.            896.         75      16
##  7 Mystery & Suspense          334.            800.         73.3    60
##  8 Horror                      288.            423.         69.6    23
##  9 Animation                   278.            377.         88.9     9
## 10 Musical & Performing Arts   169.            203.         66.7    12
## 11 Science Fiction & Fantasy    72.9           131.         55.6     9
# Plot Risk vs Return
p_risk <- ggplot(risk_analysis, aes(x = roi_volatility, y = avg_roi, label = main_genre)) +
  geom_point(aes(size = count, color = success_rate)) +
  geom_text(hjust = -0.1, vjust = 0) +
  scale_color_gradient(low = "red", high = "green") +
  labs(title = "Risk vs Return by Genre",
       x = "ROI Volatility (Risk)",
       y = "Average ROI (Return)",
       size = "Number of Movies",
       color = "Success Rate (%)") +
  theme(legend.position = "right")
ggsave("risk_return.png", plot = p_risk, width = 12, height = 8)

# Language and Country Analysis
market_analysis <- movies_df %>%
  group_by(country) %>%
  summarise(
    avg_revenue = mean(revenue..Millions., na.rm = TRUE),
    avg_roi = mean(ROI, na.rm = TRUE),
    count = n()
  ) %>%
  filter(count >= 5) %>%  # Only countries with at least 5 movies
  arrange(desc(avg_revenue))

cat("\n\nMarket Analysis by Country (min 5 movies):\n")
## 
## 
## Market Analysis by Country (min 5 movies):
print(market_analysis)
## # A tibble: 7 × 4
##   country   avg_revenue avg_roi count
##   <chr>           <dbl>   <dbl> <int>
## 1 France          279.     161.    10
## 2 Japan           153.    3344.     5
## 3 USA             142.    1618.   550
## 4 Germany         124.     408.    11
## 5 UK              112.     712.    36
## 6 Canada          104.     112.    19
## 7 Australia        25.5    193.    24
# Summary Statistics
cat("\n\nKey Findings:\n")
## 
## 
## Key Findings:
cat("1. Most Profitable Genre (by ROI):", roi_by_genre$main_genre[1], "\n")
## 1. Most Profitable Genre (by ROI): Documentary
cat("2. Safest Investment (Highest Success Rate):", 
    risk_analysis$main_genre[which.max(risk_analysis$success_rate)], "\n")
## 2. Safest Investment (Highest Success Rate): Art House & International
cat("3. Best Budget Bracket (by ROI):", 
    budget_analysis$budget_bracket[which.max(budget_analysis$avg_roi)], "\n")
## 3. Best Budget Bracket (by ROI): Low Budget (<$10M)
cat("4. Most Profitable MPAA Rating:", mpaa_analysis$mpaa_rating[1], "\n")
## 4. Most Profitable MPAA Rating: PG
cat("5. Top Market:", market_analysis$country[1], "\n")
## 5. Top Market: France
# Save all analysis results to a file
sink("detailed_analysis_results.txt")
cat("=== Complete Movie Market Analysis ===\n\n")
cat("1. ROI Analysis by Genre:\n")
print(roi_by_genre)
cat("\n2. Top 10 Most Profitable Movies:\n")
print(top_roi)
cat("\n3. MPAA Rating Analysis:\n")
print(mpaa_analysis)
cat("\n4. Budget Bracket Analysis:\n")
print(budget_analysis)
cat("\n5. Risk Analysis:\n")
print(risk_analysis)
cat("\n6. Market Analysis by Country:\n")
print(market_analysis)
sink()

# Add Critic vs Viewer Score Analysis
cat("\n\n=== Critic vs Viewer Score Impact Analysis ===\n")
## 
## 
## === Critic vs Viewer Score Impact Analysis ===
# Create score impact analysis
score_impact <- movies_df %>%
  select(revenue..Millions., critics_score, audience_score, imdb_rating) %>%
  gather(score_type, score, -revenue..Millions.) %>%
  mutate(
    score_type = case_when(
      score_type == "critics_score" ~ "Critics Score",
      score_type == "audience_score" ~ "Audience Score",
      score_type == "imdb_rating" ~ "IMDB Rating"
    )
  )

# Create boxplot of revenue by score ranges
p_scores <- ggplot(score_impact, aes(x = cut(score, breaks = seq(0, 100, by = 20)), y = revenue..Millions.)) +
  geom_boxplot(fill = "lightblue") +
  facet_wrap(~score_type, scales = "free_x") +
  labs(title = "Revenue Distribution by Score Ranges",
       x = "Score Range",
       y = "Revenue (Millions)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave("score_impact.png", plot = p_scores, width = 12, height = 6)

# Calculate correlation with revenue
score_correlations <- movies_df %>%
  summarise(
    critics_correlation = cor(critics_score, revenue..Millions., use = "complete.obs"),
    audience_correlation = cor(audience_score, revenue..Millions., use = "complete.obs"),
    imdb_correlation = cor(imdb_rating * 10, revenue..Millions., use = "complete.obs")
  )

# Create scatter plots
p_critics <- ggplot(movies_df, aes(x = critics_score, y = revenue..Millions.)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Critics Score vs Revenue",
       x = "Critics Score",
       y = "Revenue (Millions)")

p_audience <- ggplot(movies_df, aes(x = audience_score, y = revenue..Millions.)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Audience Score vs Revenue",
       x = "Audience Score",
       y = "Revenue (Millions)")

# Combine plots
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:randomForest':
## 
##     combine
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
p_combined <- grid.arrange(p_critics, p_audience, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

ggsave("score_comparison.png", plot = p_combined, width = 12, height = 6)

# Print correlations
cat("\nCorrelations with Revenue:")
## 
## Correlations with Revenue:
print(score_correlations)
##   critics_correlation audience_correlation imdb_correlation
## 1           0.3839412            0.4316298        0.2176629
# Calculate average revenue by score brackets
score_brackets <- movies_df %>%
  mutate(
    critic_bracket = cut(critics_score, breaks = seq(0, 100, by = 20)),
    audience_bracket = cut(audience_score, breaks = seq(0, 100, by = 20))
  ) %>%
  summarise(
    avg_rev_by_critic = mean(revenue..Millions.[critics_score > 80], na.rm = TRUE),
    avg_rev_by_audience = mean(revenue..Millions.[audience_score > 80], na.rm = TRUE)
  )

cat("\nAverage Revenue for High Scores (>80):")
## 
## Average Revenue for High Scores (>80):
print(score_brackets)
##   avg_rev_by_critic avg_rev_by_audience
## 1          249.2747            200.8896
# Create Top Driving Factors by Genre Analysis
top_factors_by_genre <- genre_analysis %>%
  group_by(Genre) %>%
  arrange(desc(Importance)) %>%
  slice_head(n = 2) %>%
  mutate(
    Variable = factor(Variable),
    Genre = factor(Genre, levels = unique(Genre[order(Importance, decreasing = TRUE)]))
  )

# Create a visualization for top factors by genre
p_top_factors <- ggplot(top_factors_by_genre, 
                       aes(x = Genre, y = Importance, fill = Variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Top 2 Driving Factors by Genre",
       x = "Genre",
       y = "Importance Score",
       fill = "Factor") +
  theme(
    legend.position = "right",
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 14, face = "bold")
  )

# Save the plot
ggsave("top_factors_by_genre.png", plot = p_top_factors, width = 12, height = 8)

# Create a summary table of top factors
top_factors_summary <- top_factors_by_genre %>%
  arrange(Genre, desc(Importance)) %>%
  mutate(Importance = round(Importance, 2)) %>%
  pivot_wider(
    id_cols = Genre,
    names_from = c(Variable),
    values_from = Importance,
    names_prefix = "Factor_"
  )

# Print the summary
cat("\n\nTop Driving Factors by Genre:\n")
## 
## 
## Top Driving Factors by Genre:
print(top_factors_summary)
## # A tibble: 5 × 4
## # Groups:   Genre [5]
##   Genre       Factor_Facebook_Likes Factor_budget..Milli…¹ Factor_audience_score
##   <fct>                       <dbl>                  <dbl>                 <dbl>
## 1 Action & A…                 18.6                    13.0                 NA   
## 2 Comedy                       6.77                   10.7                 NA   
## 3 Documentary                 10.4                    NA                    6.67
## 4 Drama                       31.6                    12.7                 NA   
## 5 Mystery & …                 17.0                    NA                    8.39
## # ℹ abbreviated name: ¹​Factor_budget..Millions.
# Create Genre Groups Analysis
movies_df <- movies_df %>%
  mutate(genre_group = case_when(
    main_genre %in% c("Action & Adventure", "Science Fiction & Fantasy","Animation") ~ "Action/Adventure/SciFi/Animation",
    main_genre %in% c("Drama", "Art House & International") ~ "Drama/Art House",
    main_genre %in% c("Comedy") ~ "Comedy",
    main_genre %in% c("Horror", "Mystery & Suspense") ~ "Horror/Thriller",
    main_genre %in% c("Documentary", "Musical & Performing Arts", "Other") ~ "Documentary/Musical/Other",

  ))

# Analyze driving factors for genre groups
genre_group_data <- movies_df %>%
  group_by(genre_group) %>%
  summarise(count = n()) %>%
  filter(count >= 20)  # Only analyze groups with sufficient data

# Initialize list to store results
genre_group_results <- list()

# Analyze each genre group
for(group in genre_group_data$genre_group) {
  # Filter data for this genre group and remove NA values
  group_data <- movies_df[movies_df$genre_group == group, ] %>%
    select(
      revenue..Millions.,
      budget..Millions.,
      imdb_rating,
      critics_score,
      audience_score,
      runtime,
      Facebook_Likes
    ) %>%
    na.omit()
  
  # Train model for this genre group
  group_model <- randomForest(
    revenue..Millions. ~ .,
    data = group_data,
    importance = TRUE
  )
  
  # Get importance scores
  imp_scores <- importance(group_model)
  
  # Store results
  genre_group_results[[group]] <- data.frame(
    Genre_Group = group,
    Variable = rownames(imp_scores),
    Importance = imp_scores[,1]
  )
}

# Combine results
genre_group_analysis <- do.call(rbind, genre_group_results)

# Create visualization for genre groups
p_group_factors <- ggplot(
  genre_group_analysis %>% 
    group_by(Genre_Group) %>% 
    arrange(desc(Importance)) %>% 
    slice_head(n = 3),  # Show top 3 factors
  aes(x = reorder(Genre_Group, Importance), y = Importance, fill = Variable)
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  scale_fill_brewer(palette = "Set3") +
  labs(title = "Top 3 Driving Factors by Genre Group",
       x = "Genre Group",
       y = "Importance Score",
       fill = "Factor") +
  theme(
    legend.position = "right",
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 14, face = "bold")
  )

# Save the plot
ggsave("genre_group_factors.png", plot = p_group_factors, width = 12, height = 8)

# Create summary table
genre_group_summary <- genre_group_analysis %>%
  group_by(Genre_Group) %>%
  arrange(desc(Importance)) %>%
  slice_head(n = 3) %>%  # Top 3 factors
  mutate(Importance = round(Importance, 2)) %>%
  arrange(Genre_Group, desc(Importance))

# Print summary
cat("\n\nTop Driving Factors by Genre Group:\n")
## 
## 
## Top Driving Factors by Genre Group:
print(genre_group_summary)
## # A tibble: 15 × 3
## # Groups:   Genre_Group [5]
##    Genre_Group                      Variable          Importance
##    <chr>                            <chr>                  <dbl>
##  1 Action/Adventure/SciFi/Animation Facebook_Likes         20.1 
##  2 Action/Adventure/SciFi/Animation budget..Millions.       9.68
##  3 Action/Adventure/SciFi/Animation audience_score          9.15
##  4 Comedy                           budget..Millions.       7.67
##  5 Comedy                           Facebook_Likes          7.07
##  6 Comedy                           critics_score           3.47
##  7 Documentary/Musical/Other        Facebook_Likes         20.0 
##  8 Documentary/Musical/Other        budget..Millions.       6.69
##  9 Documentary/Musical/Other        audience_score          6.02
## 10 Drama/Art House                  Facebook_Likes         27.2 
## 11 Drama/Art House                  budget..Millions.      13.0 
## 12 Drama/Art House                  imdb_rating             9.28
## 13 Horror/Thriller                  Facebook_Likes         21.1 
## 14 Horror/Thriller                  budget..Millions.       8.98
## 15 Horror/Thriller                  critics_score           1.74
# Calculate success metrics for genre groups
genre_group_metrics <- movies_df %>%
  group_by(genre_group) %>%
  summarise(
    avg_revenue = mean(revenue..Millions., na.rm = TRUE),
    avg_roi = mean(ROI, na.rm = TRUE),
    success_rate = mean(ROI > 0, na.rm = TRUE) * 100,
    count = n()
  ) %>%
  arrange(desc(avg_revenue))

cat("\n\nGenre Group Performance Metrics:\n")
## 
## 
## Genre Group Performance Metrics:
print(genre_group_metrics)
## # A tibble: 5 × 5
##   genre_group                      avg_revenue avg_roi success_rate count
##   <chr>                                  <dbl>   <dbl>        <dbl> <int>
## 1 Horror/Thriller                         147.    321.         72.3    83
## 2 Comedy                                  137.   2798.         80.9    89
## 3 Documentary/Musical/Other               137.   2858.         79.8    84
## 4 Drama/Art House                         136.    965.         76.4   326
## 5 Action/Adventure/SciFi/Animation        134.   1449.         73.5    83
# Create Investment Strategy Pie Chart
investment_data <- data.frame(
  Category = c("Core Investment", "Growth Investment", "Speculative Investment"),
  Percentage = c(60, 30, 10),
  Description = c("Medium Budget ($10M-$50M)\nDrama & Comedy\nPG/PG-13",
                 "High Budget ($50M-$100M)\nAction & Animation\nStrong Social",
                 "Low Budget (<$10M)\nDoc & Horror\nHigh ROI")
)

# Create pie chart with custom colors and clean design
p_investment <- ggplot(investment_data, aes(x = "", y = Percentage, fill = Category)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  scale_fill_manual(values = c("#2E86C1", "#28B463", "#E67E22")) +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    panel.grid = element_blank(),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    legend.title = element_blank(),
    legend.position = "right"
  ) +
  geom_text(aes(label = paste0(Percentage, "%\n", Description)), 
            position = position_stack(vjust = 0.5),
            size = 3.5, color = "white", lineheight = 0.8) +
  labs(title = "Investment Strategy Portfolio")

# Save the plot
ggsave("investment_strategy_pie.png", plot = p_investment, width = 10, height = 7, dpi = 300)