This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Print per-column summary statistics for the `cars` data set (speed, dist)
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
# Set up working directory
# The CSV files are read further down with relative paths, so the session has
# to run from the folder that contains them. The author's folder is used when
# it exists; on any other machine the current working directory is kept
# instead of aborting with "cannot change working directory".
data_dir <- "C:/Users/blake/OneDrive/R MKTG"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Data folder not found: ", data_dir, "; staying in ", getwd())
}
# Load necessary libraries
library(tidyverse) # Comprehensive data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr) # Data manipulation
library(stringr) # String manipulation
library(readxl) # Reading Excel files
## Warning: package 'readxl' was built under R version 4.4.3
library(ggplot2) # Data visualization
library(car) # Checking multicollinearity
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
# Step 1: Read the audience data (path is relative to the working directory)
audience <- read.csv(file = "Movie Dataset_General Audience.csv")

# Steps 1-2: Read the financials data and give its columns short names.
# The dotted names are what read.csv() produced for the original headers
# (presumably "budget (Millions)" / "revenue (Millions)" -- confirm in the CSV).
financials <- read.csv(file = "Movie Dataset_Financials (1).csv") %>%
  rename(
    title = original_title,
    budget = budget..Millions.,
    revenue = revenue..Millions.
  )
# Step 3: Normalise the join key `title` in both datasets.
# str_squish() strips leading/trailing whitespace and collapses internal runs
# of whitespace to a single space, so it covers the separate trim step too.
# Step 4: Keep only the first row seen for each title, all columns retained.
audience <- audience %>%
  rename(title = original_title) %>%
  mutate(title = str_squish(title)) %>%
  distinct(title, .keep_all = TRUE)

financials <- financials %>%
  mutate(title = str_squish(title)) %>%
  distinct(title, .keep_all = TRUE)
# Steps 5-7: Build the analysis table in one pipeline.
#   5. Left join: every audience row, with financial columns where the title matches
#   6. Drop any row that has a missing value in any column
#   7. Add profit = revenue - budget
df <- audience %>%
  left_join(financials, by = "title") %>%
  filter(complete.cases(.)) %>%
  mutate(profit = revenue - budget)
# Part (a): revenue explained by critics score and Facebook likes (no budget).
# Steps 8-9: Fit the model and pull out each estimate by name
model_a <- lm(revenue ~ critics_score + Facebook_Likes, data = df)
coefficients_a <- coef(model_a)
Intercept_a <- coefficients_a["(Intercept)"]
coef_critics_score_a <- coefficients_a["critics_score"]
coef_Facebook_Likes_a <- coefficients_a["Facebook_Likes"]

# Step 10: Scenario inputs for part (a)
critics_score <- 55
Facebook_Likes_a <- 1250

# Step 11: Predicted revenue = intercept + sum of (input * slope)
critics_term_a <- critics_score * coef_critics_score_a
likes_term_a <- Facebook_Likes_a * coef_Facebook_Likes_a
revenue_a <- Intercept_a + critics_term_a + likes_term_a

# Step 12: Report the prediction rounded to two decimals
print(paste("Predicted Revenue (Part a): $", round(revenue_a, 2), " million"))
## [1] "Predicted Revenue (Part a): $ 50.24 million"
# Part (b): same scenario with budget added as a predictor.
# Steps 13-14: Fit the model and pull out each estimate by name
model_b <- lm(revenue ~ budget + critics_score + Facebook_Likes, data = df)
coefficients_b <- coef(model_b)
Intercept_b <- coefficients_b["(Intercept)"]
coef_budget_b <- coefficients_b["budget"]
coef_critics_score_b <- coefficients_b["critics_score"]
coef_Facebook_Likes_b <- coefficients_b["Facebook_Likes"]

# Step 15: Scenario inputs for part (b); `critics_score` is the scalar set in
# part (a). Budget is in millions, likes are 100x the part (a) value.
# NOTE(review): 1,250 -> 125,000 is a 9,900% increase; a literal 10,000%
# increase would be 126,250 -- confirm against the assignment wording.
budget_b <- 20
Facebook_Likes_b <- 1250 * 100

# Step 16: Predicted revenue = intercept + sum of (input * slope)
budget_term_b <- budget_b * coef_budget_b
critics_term_b <- critics_score * coef_critics_score_b
likes_term_b <- Facebook_Likes_b * coef_Facebook_Likes_b
revenue_b <- Intercept_b + budget_term_b + critics_term_b + likes_term_b

# Step 17: Report the prediction rounded to two decimals
print(paste("Predicted Revenue (Part b): $", round(revenue_b, 2), " million"))
## [1] "Predicted Revenue (Part b): $ 156.93 million"
# Step 18: Difference between the part (b) and part (a) predictions
improvement <- revenue_b - revenue_a
print(paste("Improvement in Revenue: $", round(improvement, 2), " million"))
## [1] "Improvement in Revenue: $ 106.69 million"
# R^2 for the two models
# Steps 1-2: Part (a) model (no budget term) and its R-squared
model_a <- lm(revenue ~ critics_score + Facebook_Likes, data = df)
r_squared_a <- summary(model_a)[["r.squared"]]
print(paste("R-squared for Part (a):", round(r_squared_a, 4)))
## [1] "R-squared for Part (a): 0.5149"
# Steps 3-4: Part (b) model (with budget term) and its R-squared
model_b <- lm(revenue ~ budget + critics_score + Facebook_Likes, data = df)
r_squared_b <- summary(model_b)[["r.squared"]]
print(paste("R-squared for Part (b):", round(r_squared_b, 4)))
## [1] "R-squared for Part (b): 0.639"
# Question 5
# Load necessary libraries
library(tidyverse)
library(ggplot2)
library(dplyr)
# Step 1: Analyze Revenue by Genre
# One row per genre with its mean revenue and film count, highest mean first
genre_revenue <- df %>%
  group_by(genre) %>%
  summarise(avg_revenue = mean(revenue, na.rm = TRUE), count = n()) %>%
  arrange(desc(avg_revenue))

# Bar chart of mean revenue per genre, bars ordered from highest to lowest;
# x labels are rotated 45 degrees so long genre names stay readable
ggplot(genre_revenue, aes(x = reorder(genre, -avg_revenue), y = avg_revenue)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Average Revenue by Genre",
    x = "Genre",
    y = "Average Revenue (Millions)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Step 2: Compare Critics' Scores vs. Audience Scores
# Pearson correlation of each score with revenue, using only complete pairs
cor_critics <- with(df, cor(critics_score, revenue, use = "complete.obs"))
cor_audience <- with(df, cor(audience_score, revenue, use = "complete.obs"))
# Report both correlations rounded to two decimals
print(paste("Correlation between Critics' Scores and Revenue:", round(cor_critics, 2)))
## [1] "Correlation between Critics' Scores and Revenue: 0.39"
print(paste("Correlation between Audience Scores and Revenue:", round(cor_audience, 2)))
## [1] "Correlation between Audience Scores and Revenue: 0.44"
# Step 3: Identify Top Factors Driving Revenue
# Regress revenue on budget, both scores and Facebook likes
model <- lm(revenue ~ budget + critics_score + audience_score + Facebook_Likes, data = df)
# Point estimates, plus the full table (estimate, std. error, t value, p-value)
coefficients <- coef(model)
importance <- coef(summary(model))
# Show the point estimates
print("Coefficients for Revenue Drivers:")
## [1] "Coefficients for Revenue Drivers:"
print(coefficients)
## (Intercept) budget critics_score audience_score Facebook_Likes
## -8.782079e+01 1.837945e+00 7.822971e-01 7.889459e-01 9.632029e-04
# Step 4: Visualize Top Factors Driving Revenue
# One bar per predictor (intercept dropped), tallest coefficient first.
# NOTE(review): the bars are unstandardized slopes, so their heights depend on
# each predictor's units (e.g. per like vs. per budget unit) -- confirm that a
# raw-coefficient comparison is what is intended here.
coef_data <- data.frame(
  factor = names(coefficients[-1]),
  value = coefficients[-1]
)
ggplot(coef_data, aes(x = reorder(factor, -value), y = value)) +
  geom_col(fill = "orange") +
  labs(
    title = "Top Factors Driving Revenue",
    x = "Factor",
    y = "Coefficient Value"
  ) +
  theme_minimal()
# Step 5: Analyze Revenue Drivers by Genre
# Fit the same four-predictor revenue model separately on each genre's rows
# and store each model summary in a list keyed by genre name.
genres <- unique(df$genre)
results_list <- list()
# The loop variable must not be named `genre`: inside filter() the data frame
# column `genre` masks it, so `genre == genre` compares the column with itself,
# keeps every row, and each "per-genre" fit is just the full-data model (the
# symptom is an identical R-squared for every genre).
# NOTE(review): genres with very few rows can give rank-deficient or
# perfect-fit models; check group sizes before interpreting the results.
for (current_genre in genres) {
  genre_data <- df %>% filter(genre == current_genre)
  model_genre <- lm(
    revenue ~ budget + critics_score + audience_score + Facebook_Likes,
    data = genre_data
  )
  results_list[[current_genre]] <- summary(model_genre)
}
# Report the R-squared stored for each genre, rounded to two decimals
print("R-squared Values by Genre:")
## [1] "R-squared Values by Genre:"
for (genre in genres) {
  # Look up this genre's model summary by name and read its R-squared
  r_squared <- results_list[[genre]][["r.squared"]]
  print(paste(genre, "R-squared:", round(r_squared, digits = 2)))
}
## [1] "Drama R-squared: 0.64"
## [1] "Comedy R-squared: 0.64"
## [1] "Horror R-squared: 0.64"
## [1] "Documentary R-squared: 0.64"
## [1] "Action & Adventure R-squared: 0.64"
## [1] "Art House & International R-squared: 0.64"
## [1] "Musical & Performing Arts R-squared: 0.64"
## [1] "Mystery & Suspense R-squared: 0.64"
## [1] "Animation R-squared: 0.64"
## [1] "Science Fiction & Fantasy R-squared: 0.64"
## [1] "Other R-squared: 0.64"
# Extract coefficients and exclude the intercept
coefficients <- coef(model)[-1] # Remove the intercept
importance <- summary(model)$coefficients[-1, ] # Remove the intercept row
# Rank predictors by standardized coefficient (estimate * sd(x) / sd(y)).
# Raw estimates are per-unit slopes, so their magnitudes depend on each
# predictor's scale; sorting on them ranks a per-like slope below a per-point
# score slope regardless of how much revenue variation each one accounts for.
# The standard deviations come from the model frame, i.e. exactly the rows the
# model was fitted on (first column is the response).
predictor_sd <- vapply(model$model[rownames(importance)], sd, numeric(1))
response_sd <- sd(model$model[[1]])
standardized <- importance[, "Estimate"] * predictor_sd / response_sd
# Same table as before (Estimate, Std. Error, t value, p-value), reordered
top_factors <- importance[order(abs(standardized), decreasing = TRUE), ]
# Print the top 2 factors
print("Top 2 Driving Factors:")
## [1] "Top 2 Driving Factors:"
print(top_factors[1:2, ])
## Estimate Std. Error t value Pr(>|t|)
## budget 1.8379452 0.1282948 14.325949 1.461070e-40
## audience_score 0.7889459 0.3853044 2.047591 4.100901e-02
# Refit the full revenue model so this section can run on its own
model <- lm(revenue ~ budget + critics_score + audience_score + Facebook_Likes, data = df)
# Slopes only (intercept dropped), and the matching rows of the summary table
coefficients <- coef(model)[-1]
importance <- coef(summary(model))[-1, ]
# One row per predictor, largest absolute estimate first
top_factors_df <- data.frame(
  factor = names(coefficients),
  estimate = coefficients
)
top_factors_df <- top_factors_df[
  with(top_factors_df, order(abs(estimate), decreasing = TRUE)),
]
# Bar chart of the estimates; bar order is set by reorder() on the signed
# estimate, not by the row order above.
# NOTE(review): estimates are unstandardized, so bar heights depend on each
# predictor's units -- confirm this is the intended comparison.
ggplot(top_factors_df, aes(x = reorder(factor, -estimate), y = estimate)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Top Driving Factors of Movie Revenue",
    x = "Factor",
    y = "Coefficient Estimate"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Refit the full revenue model so this section can run on its own
model <- lm(revenue ~ budget + critics_score + audience_score + Facebook_Likes, data = df)
# Point estimates and the full coefficient table, intercept included
coefficients <- coef(model)
importance <- coef(summary(model))
# Copy the table into a data frame with syntactic column names; row names are
# carried over from the named estimate vector
coefficients_df <- data.frame(
  Variable = rownames(importance),
  Estimate = importance[, "Estimate"],
  Std_Error = importance[, "Std. Error"],
  t_value = importance[, "t value"],
  p_value = importance[, "Pr(>|t|)"]
)
# Show the table
print("Coefficients for All Variables:")
## [1] "Coefficients for All Variables:"
print(coefficients_df)
## Variable Estimate Std_Error t_value p_value
## (Intercept) (Intercept) -8.782079e+01 1.447092e+01 -6.068777 2.210967e-09
## budget budget 1.837945e+00 1.282948e-01 14.325949 1.461070e-40
## critics_score critics_score 7.822971e-01 3.373127e-01 2.319204 2.069889e-02
## audience_score audience_score 7.889459e-01 3.853044e-01 2.047591 4.100901e-02
## Facebook_Likes Facebook_Likes 9.632029e-04 6.167680e-05 15.616940 8.462428e-47