R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks. You can embed an R code chunk like this:

# Template example: print per-column summary statistics (min, quartiles,
# median, mean, max) for `cars`, which is not defined in this file
# (presumably the data frame shipped with R's datasets package).
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Set up working directory
# NOTE(review): this is a machine-specific absolute path. It is applied only
# when the folder exists, so the script no longer aborts on other machines and
# can run from a directory that already contains the CSV files.
data_dir <- "C:/Users/blake/OneDrive/R MKTG"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  warning(
    "Data directory not found: ", data_dir,
    "; reading input files from ", getwd(),
    call. = FALSE
  )
}

# Load necessary libraries
library(tidyverse)   # Comprehensive data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)       # Data manipulation
library(stringr)     # String manipulation
library(readxl)      # Reading Excel files
## Warning: package 'readxl' was built under R version 4.4.3
library(ggplot2)     # Data visualization
library(car)         # Checking multicollinearity
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
# Step 1: Read the two raw CSV exports (audience metrics and financials)
audience <- read.csv("Movie Dataset_General Audience.csv")
financials <- read.csv("Movie Dataset_Financials (1).csv")

# Steps 2-4 (financials): shorten the column names, normalise the title text
# (trim both ends, then collapse repeated internal whitespace), and keep only
# the first row seen for each title so the join key is unique.
financials <- financials %>%
  rename(
    title = original_title,
    budget = budget..Millions.,
    revenue = revenue..Millions.
  ) %>%
  mutate(title = str_squish(str_trim(title))) %>%
  filter(!duplicated(title))

# Steps 3-4 (audience): same title normalisation and de-duplication.
audience <- audience %>%
  rename(title = original_title) %>%
  mutate(title = str_squish(str_trim(title))) %>%
  filter(!duplicated(title))

# Steps 5-7: attach the financial columns to each audience row by title,
# drop every row that has a missing value in any column, and derive
# profit as revenue minus budget.
df <- audience %>%
  left_join(financials, by = "title") %>%
  filter(complete.cases(.)) %>%
  mutate(profit = revenue - budget)

# Step 8: Fit regression model for part a (without budget)
model_a <- lm(revenue ~ critics_score + Facebook_Likes, data = df)

# Step 9: Extract coefficients for part a
coefficients_a <- coef(model_a)
Intercept_a <- coefficients_a["(Intercept)"]
coef_critics_score_a <- coefficients_a["critics_score"]
coef_Facebook_Likes_a <- coefficients_a["Facebook_Likes"]

# Step 10: Given values for part a
# `critics_score` is also read by the part b prediction further down.
critics_score <- 55  # Critics score of 55
Facebook_Likes_a <- 1250  # Original Facebook likes

# Step 11: Calculate predicted revenue for part a
# Linear predictor: intercept + sum(coefficient * input value).
revenue_a <- Intercept_a +
  (critics_score * coef_critics_score_a) +
  (Facebook_Likes_a * coef_Facebook_Likes_a)

# Step 12: Print predicted revenue for part a
# paste0() is used because paste()'s default " " separator put a space after
# the "$" and a double space before "million".
print(paste0(
  "Predicted Revenue (Part a): $",
  format(round(revenue_a, 2), nsmall = 2),
  " million"
))
## [1] "Predicted Revenue (Part a): $50.24 million"
# Step 13: Fit regression model for part b (with budget)
model_b <- lm(revenue ~ budget + critics_score + Facebook_Likes, data = df)

# Step 14: Extract coefficients for part b
coefficients_b <- coef(model_b)
Intercept_b <- coefficients_b["(Intercept)"]
coef_budget_b <- coefficients_b["budget"]
coef_critics_score_b <- coefficients_b["critics_score"]
coef_Facebook_Likes_b <- coefficients_b["Facebook_Likes"]

# Step 15: Given values for part b
budget_b <- 20  # $20 million
# 1250 * 100 = 125,000 likes, i.e. 100 times the part a value.
# NOTE(review): that is 10,000% *of* the original (a 9,900% increase). If the
# brief means an increase *by* 10,000%, this should be 1250 * 101 - confirm.
Facebook_Likes_b <- 1250 * 100

# Step 16: Calculate predicted revenue for part b
# Reuses `critics_score` (55) assigned in the part a section.
revenue_b <- Intercept_b +
  (budget_b * coef_budget_b) +
  (critics_score * coef_critics_score_b) +
  (Facebook_Likes_b * coef_Facebook_Likes_b)

# Step 17: Print predicted revenue for part b
# paste0() is used because paste()'s default " " separator put a space after
# the "$" and a double space before "million".
print(paste0(
  "Predicted Revenue (Part b): $",
  format(round(revenue_b, 2), nsmall = 2),
  " million"
))
## [1] "Predicted Revenue (Part b): $156.93 million"
# Step 18: Calculate improvement in revenue
# Difference between the part b and part a predictions. Both the model
# (budget added) and the inputs (Facebook likes, budget) differ between them.
improvement <- revenue_b - revenue_a
# paste0() is used because paste()'s default " " separator put a space after
# the "$" and a double space before "million".
print(paste0(
  "Improvement in Revenue: $",
  format(round(improvement, 2), nsmall = 2),
  " million"
))
## [1] "Improvement in Revenue: $106.69 million"
# R-squared comparison of the two revenue models

# Part (a): critics score and Facebook likes only
model_a <- lm(revenue ~ critics_score + Facebook_Likes, data = df)
r_squared_a <- summary(model_a)[["r.squared"]]
print(paste("R-squared for Part (a):", round(r_squared_a, 4)))
## [1] "R-squared for Part (a): 0.5149"

# Part (b): the same predictors plus budget
model_b <- lm(revenue ~ budget + critics_score + Facebook_Likes, data = df)
r_squared_b <- summary(model_b)[["r.squared"]]
print(paste("R-squared for Part (b):", round(r_squared_b, 4)))
## [1] "R-squared for Part (b): 0.639"
# Question 5

# Load necessary libraries
library(tidyverse)
library(ggplot2)
library(dplyr)

# Step 1: Analyze Revenue by Genre
# One row per genre: mean revenue and number of rows, highest mean first.
genre_revenue <- df %>%
  group_by(genre) %>%
  summarise(avg_revenue = mean(revenue, na.rm = TRUE), count = n()) %>%
  arrange(desc(avg_revenue))

# Bar chart of mean revenue per genre, bars ordered from highest to lowest;
# x labels are rotated 45 degrees.
genre_revenue %>%
  ggplot(aes(x = reorder(genre, -avg_revenue), y = avg_revenue)) +
  geom_col(fill = "skyblue") +
  ggtitle("Average Revenue by Genre") +
  xlab("Genre") +
  ylab("Average Revenue (Millions)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Step 2: Compare Critics' Scores vs. Audience Scores
# Correlation of each score with revenue, using cor()'s default method on
# complete observations.
cor_critics <- cor(
  df[["critics_score"]], df[["revenue"]],
  use = "complete.obs"
)
cor_audience <- cor(
  df[["audience_score"]], df[["revenue"]],
  use = "complete.obs"
)

# Report both correlations rounded to two decimals
print(paste("Correlation between Critics' Scores and Revenue:", round(cor_critics, 2)))
## [1] "Correlation between Critics' Scores and Revenue: 0.39"
print(paste("Correlation between Audience Scores and Revenue:", round(cor_audience, 2)))
## [1] "Correlation between Audience Scores and Revenue: 0.44"
# Step 3: Identify Top Factors Driving Revenue
# Linear model of revenue on budget, both review scores and Facebook likes.
model <- lm(
  revenue ~ budget + critics_score + audience_score + Facebook_Likes,
  data = df
)

# `coefficients` holds the point estimates; `importance` holds the full
# coefficient table (estimate, standard error, t value, p value).
coefficients <- coef(model)
importance <- summary(model)[["coefficients"]]

# Show the point estimates
print("Coefficients for Revenue Drivers:")
## [1] "Coefficients for Revenue Drivers:"
print(coefficients)
##    (Intercept)         budget  critics_score audience_score Facebook_Likes 
##  -8.782079e+01   1.837945e+00   7.822971e-01   7.889459e-01   9.632029e-04
# Step 4: Visualize Top Factors Driving Revenue
# Bar plot of the slope estimates (everything except the intercept), ordered
# from the largest to the smallest value.
# NOTE(review): raw coefficients depend on each predictor's units, so bar
# height is not a scale-free measure of importance - consider standardized
# coefficients or t statistics.
coef_data <- data.frame(
  factor = names(coefficients)[-1],
  value = tail(coefficients, -1)
)

coef_data %>%
  ggplot(aes(x = reorder(factor, -value), y = value)) +
  geom_col(fill = "orange") +
  ggtitle("Top Factors Driving Revenue") +
  xlab("Factor") +
  ylab("Coefficient Value") +
  theme_minimal()

# Step 5: Analyze Revenue Drivers by Genre
# Fit the same revenue model separately on each genre's rows and keep the
# model summaries in a list keyed by genre name.
genres <- unique(df$genre)
results_list <- list()

# The loop variable must not be called `genre`: inside filter() the bare name
# `genre` resolves to the data column first, so `genre == genre` compared the
# column with itself, kept every row, and fitted the full data set for every
# genre. `.env$` makes the lookup of the loop variable explicit.
# NOTE(review): genres with very few rows will give saturated or undefined
# fits; consider skipping genres below a minimum row count.
for (current_genre in genres) {
  genre_data <- df %>% filter(genre == .env$current_genre)
  model_genre <- lm(
    revenue ~ budget + critics_score + audience_score + Facebook_Likes,
    data = genre_data
  )
  results_list[[current_genre]] <- summary(model_genre)
}

# Print R-squared values for each genre
print("R-squared Values by Genre:")
## [1] "R-squared Values by Genre:"
for (current_genre in genres) {
  r_squared <- results_list[[current_genre]]$r.squared
  print(paste(current_genre, "R-squared:", round(r_squared, 2)))
}
# NOTE: the output previously echoed here reported 0.64 for all 11 genres.
# Those values came from the column-masking bug described above and were
# removed; re-knit the document to regenerate the per-genre R-squared values.
# Slope estimates and their coefficient-table rows, intercept dropped
coefficients <- tail(coef(model), -1)
importance <- summary(model)[["coefficients"]][-1, ]

# Rank predictors by the absolute size of their estimate, largest first.
# NOTE(review): raw estimates depend on each predictor's units, so this
# ranking is not scale-free - consider standardized coefficients or t values.
top_factors <- importance[order(-abs(importance[, "Estimate"])), ]

# Show the two highest-ranked rows
print("Top 2 Driving Factors:")
## [1] "Top 2 Driving Factors:"
print(top_factors[1:2, ])
##                 Estimate Std. Error   t value     Pr(>|t|)
## budget         1.8379452  0.1282948 14.325949 1.461070e-40
## audience_score 0.7889459  0.3853044  2.047591 4.100901e-02
# Fit the full revenue model
model <- lm(
  revenue ~ budget + critics_score + audience_score + Facebook_Likes,
  data = df
)

# Slope estimates and their coefficient-table rows, intercept dropped
coefficients <- tail(coef(model), -1)
importance <- summary(model)[["coefficients"]][-1, ]

# One row per predictor, ordered by absolute estimate (largest first)
top_factors_df <- data.frame(
  factor = names(coefficients),
  estimate = coefficients
)
top_factors_df <- top_factors_df[order(-abs(top_factors_df$estimate)), ]

# Bar chart of the estimates; bars are ordered by signed estimate, descending.
# NOTE(review): raw estimates depend on each predictor's units - confirm this
# is the intended notion of "top driving factors".
top_factors_df %>%
  ggplot(aes(x = reorder(factor, -estimate), y = estimate)) +
  geom_col(fill = "skyblue") +
  ggtitle("Top Driving Factors of Movie Revenue") +
  xlab("Factor") +
  ylab("Coefficient Estimate") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Fit the full revenue model
model <- lm(
  revenue ~ budget + critics_score + audience_score + Facebook_Likes,
  data = df
)

# Point estimates and the full coefficient table (intercept included)
coefficients <- coef(model)
importance <- summary(model)[["coefficients"]]

# Rebuild the coefficient table as a data frame with syntactic column names
# and the term name repeated in a `Variable` column.
coefficients_df <- data.frame(
  Variable = rownames(importance),
  setNames(
    as.data.frame(importance),
    c("Estimate", "Std_Error", "t_value", "p_value")
  )
)

# Show the table
print("Coefficients for All Variables:")
## [1] "Coefficients for All Variables:"
print(coefficients_df)
##                      Variable      Estimate    Std_Error   t_value      p_value
## (Intercept)       (Intercept) -8.782079e+01 1.447092e+01 -6.068777 2.210967e-09
## budget                 budget  1.837945e+00 1.282948e-01 14.325949 1.461070e-40
## critics_score   critics_score  7.822971e-01 3.373127e-01  2.319204 2.069889e-02
## audience_score audience_score  7.889459e-01 3.853044e-01  2.047591 4.100901e-02
## Facebook_Likes Facebook_Likes  9.632029e-04 6.167680e-05 15.616940 8.462428e-47