library(ggplot2)
# Histogram of movie scores in 5-point-wide bins, with a title and axis labels.
ggplot(data = data, mapping = aes(x = score)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black") +
  labs(
    title = "Distribution of Movie Scores",
    x = "IMDb Score",
    y = "Frequency"
  )
# TOP GENRES TO FOCUS ON
# Mean score for each distinct `genre` string, keeping the five highest means.
# top_n() retains every row tied at the cut-off, so the result can hold more
# than five rows.
top_genres <- data %>%
  group_by(genre) %>%
  summarise(avg_score = mean(score, na.rm = TRUE)) %>%
  top_n(n = 5, wt = avg_score)

# Show the selected genres
print(top_genres)
## # A tibble: 13 × 2
## genre avg_score
## <chr> <dbl>
## 1 Adventure, Animation, Comedy, Fantasy, Mystery 85
## 2 Adventure, Fantasy, Action, Family 83
## 3 Adventure, Fantasy, Animation 83
## 4 Animation, Action, Adventure, Fantasy, Thriller 83
## 5 Animation, Action, Comedy, Mystery, Crime, Fantasy 83
## 6 Animation, Action, Science Fiction, Drama 83
## 7 Animation, Comedy, Romance 84.7
## 8 Animation, Family, Fantasy, Adventure, Comedy 83
## 9 Animation, Thriller 83
## 10 Family, Animation, Drama 83
## 11 Fantasy, Drama, Crime 85
## 12 Romance, Animation, Drama 85
## 13 TV Movie, Animation, Science Fiction, Action, Adventure, Comedy, D… 83
# Total revenue for each distinct `genre` string; missing revenues are skipped.
genre_revenue <- data %>%
  group_by(genre) %>%
  summarise(total_revenue = sum(revenue, na.rm = TRUE))

# Sort from the largest total downwards and keep the leading row.
top_genre_revenue <- genre_revenue %>%
  arrange(desc(total_revenue)) %>%
  slice_head(n = 1)

# Show the genre with the highest total revenue
print(top_genre_revenue)
## # A tibble: 1 × 2
## genre total_revenue
## <chr> <dbl>
## 1 Drama 138768214182.
# Compare the genres with the highest and the lowest total revenue.
# Total revenue per distinct `genre` string; missing revenues are skipped.
genre_revenue <- data %>%
  group_by(genre) %>%
  summarise(total_revenue = sum(revenue, na.rm = TRUE))

highest_revenue_genre <- genre_revenue %>%
  arrange(desc(total_revenue)) %>%
  slice(1)

lowest_revenue_genre <- genre_revenue %>%
  arrange(total_revenue) %>%
  slice(1)

selected_genres <- c(highest_revenue_genre$genre, lowest_revenue_genre$genre)

# Per-movie rows of the two selected genres (not used by the plot below).
selected_genre_data <- data %>%
  filter(genre %in% selected_genres)

# One row per selected genre. Plotting these totals draws a single bar per
# genre instead of stacking one rectangle for every movie to reach the same
# height.
selected_genre_totals <- genre_revenue %>%
  filter(genre %in% selected_genres)

# Plotting
ggplot(
  selected_genre_totals,
  aes(x = genre, y = total_revenue, fill = genre)
) +
  geom_col() +
  labs(
    title = "Comparison of Highest and Lowest Revenue Genres",
    x = "Genre",
    y = "Total Revenue"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(ggplot2)
# Time-series view of the response variable against the release date.
response_variable <- "score"

# Drop rows that contain a missing value in any column. The data-frame method
# of na.omit() has no `cols` argument (that is a data.table feature), so the
# argument is left out to make the actual behaviour explicit.
# NOTE(review): assumes `data` is a plain data frame, as its printed output
# suggests - confirm.
data <- na.omit(data)
data$date_x <- as.Date(data$date_x, format = "%m/%d/%Y")
data$ID <- seq_len(nrow(data))

# The release date is the time index; the row ID is the key, which keeps
# movies released on the same day distinct. Rows whose date did not parse
# cannot be indexed, so they are excluded from the tsibble only (`data` itself
# is left unchanged). Release dates are unevenly spaced, hence regular = FALSE.
my_tsibble <- as_tsibble(
  data[!is.na(data$date_x), ],
  key = ID,
  index = date_x,
  regular = FALSE
)

# `.data[[...]]` looks the column up by the name stored in response_variable.
ggplot(my_tsibble, aes(x = date_x, y = .data[[response_variable]])) +
  geom_line() +
  labs(
    title = paste("Time Series Plot of", response_variable),
    x = "Date",
    y = response_variable
  )
# Mean score per country; missing scores are skipped.
country_scores <- data %>%
  group_by(country) %>%
  summarise(avg_score = mean(score, na.rm = TRUE))

# Keep the five highest averages (top_n() would also keep ties at the cut-off).
top_countries <- country_scores %>%
  top_n(n = 5, wt = avg_score)

# Bars are ordered from the highest to the lowest average score.
ggplot(top_countries, aes(x = reorder(country, -avg_score), y = avg_score)) +
  geom_col(fill = "green") +
  labs(
    title = "Top 5 Countries with Highest Average Movie Scores",
    x = "Country",
    y = "Average IMDb Score"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(top_countries)
## # A tibble: 5 × 2
## country avg_score
## <chr> <dbl>
## 1 CZ 72.5
## 2 DO 72
## 3 PR 76
## 4 SU 79.8
## 5 XC 76
# Mean budget per country, sorted from the largest mean downwards; the first
# five rows are kept.
top_countries_budget <- data %>%
  group_by(country) %>%
  summarise(avg_budget = mean(budget_x, na.rm = TRUE)) %>%
  arrange(desc(avg_budget)) %>%
  slice_head(n = 5)

# Heading line followed by the table itself.
cat("Top 5 Countries with Highest Average Budgets:\n")
## Top 5 Countries with Highest Average Budgets:
print(top_countries_budget)
## # A tibble: 5 × 2
## country avg_budget
## <chr> <dbl>
## 1 KH 195000000
## 2 SK 174600000
## 3 BY 167540000
## 4 UY 163370000
## 5 PY 153000000
# Simple linear regression of revenue on budget_x, fitted on `data`.
lm_result <- lm(revenue ~ budget_x, data = data)
# Print the coefficient table, residual summary, and fit statistics.
summary(lm_result)
##
## Call:
## lm(formula = revenue ~ budget_x, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.155e+09 -9.555e+07 -4.019e+07 8.152e+07 2.106e+09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.036e+07 3.081e+06 13.10 <2e-16 ***
## budget_x 3.280e+00 3.565e-02 91.99 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 205300000 on 10176 degrees of freedom
## Multiple R-squared: 0.454, Adjusted R-squared: 0.454
## F-statistic: 8463 on 1 and 10176 DF, p-value: < 2.2e-16
The linear regression analysis showed that the p-value associated with the budget_x coefficient is much less than the chosen significance level (alpha). We can reject the null hypothesis (H0) that budget has no linear association with revenue and conclude that budget is a statistically significant predictor of movie revenue. The coefficient estimate for budget_x is 3.280e+00, indicating that, on average, each additional dollar of budget is associated with an increase of approximately $3.28 in revenue.
# Performing ANOVA test (Null Hypothesis)
# One-way ANOVA of score across the distinct values of `genre`; H0 is that the
# mean score is the same for every genre value.
anova_result <- aov(score ~ genre, data = data)
# Summary of ANOVA results
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## genre 2303 571557 248.2 1.511 <2e-16 ***
## Residuals 7874 1293385 164.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value obtained from the ANOVA test is extremely small (close to 0), indicating that there is a significant difference in IMDb scores among different movie genres. In other words, we can reject the null hypothesis (H0) and conclude that there is a statistically significant difference in IMDb scores between movie genres.
# Neyman-Pearson Tests
# Test 1: t statistic for mean revenue against mean budget, with the mean
# budget treated as a fixed reference value and the standard error taken from
# revenue alone.
alpha <- 0.05
power <- 0.80
effect_size <- 0.30

# Two-sided critical value, consistent with the abs(t_stat) decision rule and
# with the two-sided p-value below.
critical_value <- qnorm(1 - alpha / 2)

# Required sample size for a standardized effect d:
#   n = (z_{1 - alpha/2} + z_{power})^2 / d^2
lambda <- (critical_value + qnorm(power))^2
required_sample_size <- lambda / effect_size^2

observed_sample_size <- length(data$budget_x)
t_stat <- (mean(data$revenue) - mean(data$budget_x)) /
  (sd(data$revenue) / sqrt(observed_sample_size))

# Two-sided p-value. pt(t_stat, df) on its own is the lower-tail probability,
# which is close to 1 for a large positive statistic and would contradict the
# rejection decision.
p_value <- 2 * pt(-abs(t_stat), df = observed_sample_size - 1)

if (observed_sample_size >= required_sample_size) {
  cat("Sample size is sufficient for Neyman-Pearson test.\n")
  cat("Observed t-statistic:", t_stat, "\n")
  cat("Observed p-value:", p_value, "\n")
  if (abs(t_stat) > critical_value) {
    cat("Reject the null hypothesis (H0).\n")
    cat("There is a significant difference in movie revenue between different budget levels.\n")
  } else {
    cat("Fail to reject the null hypothesis (H0).\n")
    cat("There is no significant difference in movie revenue between different budget levels.\n")
  }
} else {
  cat("Sample size is insufficient for Neyman-Pearson test.\n")
  cat("Consider increasing the sample size to achieve the desired power.\n")
}
## Sample size is sufficient for Neyman-Pearson test.
## Observed t-statistic: 68.37077
## Observed p-value: 0
## Reject the null hypothesis (H0).
## There is a significant difference in movie revenue between different budget levels.
# Test 2: scores of movies whose `genre` is exactly "Action" versus all other
# movies.
alpha_level <- 0.05
# NOTE(review): power_level is assigned but not used in the calculation below.
power_level <- 0.80
effect_size <- 0.30

current_sample_size <- length(data$score)

# Standard error of the overall mean score.
se_mean_difference <- sd(data$score) / sqrt(current_sample_size)
critical_value <- qnorm(1 - alpha_level)

# NOTE(review): this uses the standard error (already divided by sqrt(n))
# where a sample-size formula would normally use the standard deviation -
# confirm the intended formula.
required_sample_size <- (critical_value * se_mean_difference / effect_size)^2

if (current_sample_size < required_sample_size) {
  cat("Sample size is not sufficient for Neyman-Pearson test.\n")
} else {
  cat("Sample size is sufficient for Neyman-Pearson test.\n")

  # NOTE(review): the difference in group means is scaled by the standard
  # error of the overall mean, not by a standard error of the difference
  # between the two groups - confirm this is intended.
  t_statistic <- (
    mean(data$score[data$genre == "Action"]) -
      mean(data$score[data$genre != "Action"])
  ) / se_mean_difference

  # Two-sided p-value from the t distribution.
  p_value <- 2 * pt(-abs(t_statistic), df = current_sample_size - 2)

  cat("Observed t-statistic:", t_statistic, "\n")
  cat("Observed p-value:", p_value, "\n")

  if (p_value > alpha_level) {
    cat("Fail to reject the null hypothesis (H0).\n")
    cat("There is no significant difference in IMDb scores between different movie genres.\n")
  } else {
    cat("Reject the null hypothesis (H0).\n")
    cat("There is a significant difference in IMDb scores between different movie genres.\n")
  }
}
## Sample size is sufficient for Neyman-Pearson test.
## Observed t-statistic: -27.77
## Observed p-value: 1.141364e-163
## Reject the null hypothesis (H0).
## There is a significant difference in IMDb scores between different movie genres.
# Load necessary libraries (if not already loaded)
library(ggplot2)
# Create a bar plot for Hypothesis 1
# Budget bins delimited by the quartiles of budget_x. include.lowest = TRUE
# closes the first interval on the left; without it cut() returns NA for the
# minimum budget, which adds an NA bar to the plot and drops those movies from
# the ANOVA.
budget_level <- cut(
  data$budget_x,
  breaks = quantile(data$budget_x),
  include.lowest = TRUE
)

# Mean revenue within each budget bin.
ggplot(data, aes(x = budget_level, y = revenue)) +
  geom_bar(stat = "summary", fun = "mean", fill = "blue") +
  labs(
    title = "Mean Movie Revenue by Budget Level",
    x = "Budget Level",
    y = "Mean Revenue"
  ) +
  theme_minimal()

# One-way ANOVA of revenue across the same budget bins; extract and print the
# p-value of the F test.
anova_result_hypothesis_1 <- aov(revenue ~ budget_level, data = data)
p_value_hypothesis_1 <- summary(anova_result_hypothesis_1)[[1]]$`Pr(>F)`[1]
print(p_value_hypothesis_1)
## [1] 0
The reported p-value is 0 (too small for R to represent), which is less than 0.05. We therefore reject the null hypothesis and conclude that there is a significant difference in movie revenue between budget levels.
library(dplyr)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
# Add a binary 'success' flag unless the data frame already has that column:
# 1 when revenue is above the median revenue, 0 otherwise.
if (!("success" %in% names(data))) {
  data$success <- ifelse(data$revenue > median(data$revenue), 1, 0)
}

# Inspect the type and leading values of 'success'
str(data$success) # Checking the structure
## num [1:10178] 1 1 1 0 1 0 1 1 1 1 ...

# Logistic regression (binomial family, logit link) of 'success' on 'budget_x'
model <- glm(
  success ~ budget_x,
  data = data,
  family = binomial(link = "logit")
)

# Coefficient table and deviance statistics
summary(model)
##
## Call:
## glm(formula = success ~ budget_x, family = binomial(link = "logit"),
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.739e+00 5.353e-02 -51.16 <2e-16 ***
## budget_x 4.854e-08 8.928e-10 54.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14109.7 on 10177 degrees of freedom
## Residual deviance: 7309.8 on 10176 degrees of freedom
## AIC: 7313.8
##
## Number of Fisher Scoring iterations: 6
Intercept: The intercept represents the log-odds of success when the budget is zero. In our context, this value is not practically meaningful, as budgets are typically positive values.
budget_x Coefficient: The coefficient for “budget_x” is approximately 4.854e-08. This coefficient signifies that for every one-unit increase in the movie budget (e.g., increasing the budget by $1), the log-odds of a movie being successful increase by 4.854e-08.
The results of the logistic regression model indicate that there is a statistically significant positive relationship between the movie budget and the likelihood of a movie’s success. As the budget increases, the log-odds of success also increase.
It’s important to note that the model assumes a linear relationship between the budget and the log-odds of success.
# Loading necessary libraries
library(ggplot2)
# Scatter plot of the 'success' flag against 'budget_x'
ggplot(data = data, mapping = aes(x = budget_x, y = success)) +
  geom_point() +
  labs(
    x = "Budget (X)",
    y = "Success (0 or 1)"
  )
The scatter plot reveals a linear relationship between “budget_x” and
“success.” As the budget increases, there is a positive trend in the
likelihood of success. While the relationship appears linear, it’s
important to consider the potential impact of outliers or influential
data points. Based on our initial scatter plot analysis, the linear
relationship between “budget_x” and “success” suggests that a
transformation may not be necessary. The trend of increasing success
with higher budgets is well-captured by the linear model. In this
analysis, we find that a transformation for “budget_x” may not be
necessary. The linear relationship adequately represents the observed
trend, and no nonlinear patterns or issues are evident in the scatter
plot.
# Return On Investment
# Question 11: Return on Investment (ROI) Analysis
# Keep only rows with a positive budget and a positive revenue, then express
# profit as a percentage of the budget. This overwrites `data`.
data <- data %>%
  filter(budget_x > 0 & revenue > 0) %>%
  mutate(roi = (revenue - budget_x) / budget_x * 100)

# Five-number summary and mean of ROI
summary(data$roi)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.000e+02 4.400e+01 2.320e+02 2.986e+06 4.880e+02 2.577e+10
The Return on Investment (ROI) analysis reveals a diverse landscape in movie financial performance:
Negative Returns: A subset of movies experienced losses, with the lowest ROI recorded at -100%, indicating financial underperformance.
Median Performance: The middle point of the ROI distribution is 232%, suggesting that half of the movies achieved a return below 232%, while the other half exceeded this benchmark.
Mean ROI Influence: The average ROI is notably high at 2,986,000%, but caution is warranted due to extreme values influencing the mean, particularly very high ROIs.
Performance Benchmarks: The 25th percentile (1st Quartile) is at 44%, and the 75th percentile (3rd Quartile) at 488%, providing benchmarks for movies with varying degrees of financial success.
Extreme Outliers: The highest ROI reached an extraordinary 25,770,000,000%, underscoring the potential impact of outliers on overall dataset metrics.
# Budget And Revenue
# One point per movie: budget on the x axis, revenue on the y axis.
ggplot(data = data, mapping = aes(x = budget_x, y = revenue)) +
  geom_point() +
  labs(
    title = "Budget vs. Revenue",
    x = "Budget",
    y = "Revenue"
  )
# Mean budget and mean revenue for each distinct `genre` string, using only
# rows with a positive budget and a positive revenue.
genre_budget_revenue <- data %>%
  filter(budget_x > 0 & revenue > 0) %>%
  group_by(genre) %>%
  summarise(
    avg_budget = mean(budget_x),
    avg_revenue = mean(revenue)
  )

# Show the genre-wise averages
print(genre_budget_revenue)
## # A tibble: 2,295 × 3
## genre avg_budget avg_revenue
## <chr> <dbl> <dbl>
## 1 "" 128386094. 373993991.
## 2 "Action" 47453206. 198565484.
## 3 "Action, Adventure" 72861648. 229956313.
## 4 "Action, Adventure, Animation" 94533333. 548015646.
## 5 "Action, Adventure, Animation, Comedy" 126300000 401345912
## 6 "Action, Adventure, Animation, Comedy, Family" 119500000 356813558.
## 7 "Action, Adventure, Animation, Comedy, Family, Scienc… 112400000 251358333.
## 8 "Action, Adventure, Animation, Comedy, Romance, Famil… 90498000 404097025.
## 9 "Action, Adventure, Animation, Crime, Mystery" 52948531. 63147576
## 10 "Action, Adventure, Animation, Drama" 142815800 645239302.
## # ℹ 2,285 more rows
# Count movies per original language, largest count first, and keep the
# leading row.
most_common_language <- data %>%
  count(orig_lang, sort = TRUE) %>%
  slice_head(n = 1)

# Show the most frequent language
print(most_common_language)
## orig_lang n
## 1 English 7350
# Mean score and total revenue for each original language.
language_performance <- data %>%
  group_by(orig_lang) %>%
  summarise(
    avg_score = mean(score, na.rm = TRUE),
    total_revenue = sum(revenue, na.rm = TRUE)
  )

# Rank languages by average score; total revenue is only used to break ties
# between languages with the same average score.
top_language_performance <- language_performance %>%
  arrange(desc(avg_score), desc(total_revenue)) %>%
  slice_head(n = 1)

# Show the top-ranked language
cat("Language with Highest Revenue and Score:\n")
## Language with Highest Revenue and Score:
print(top_language_performance)
## # A tibble: 1 × 3
## orig_lang avg_score total_revenue
## <chr> <dbl> <dbl>
## 1 " Irish" 76 1756887
Conclusions:
Genre Insights:
Identified top-performing movie genres based on average scores. Filmmakers can focus on these genres for potentially higher audience satisfaction.
Revenue Generation:
Explored genres contributing the most to total revenue, aiding investment decisions. Drama emerged as the genre with the highest total revenue.
International Variances:
Discovered regional variations in movie scores, highlighting audience preferences across different countries. Filmmakers can tailor content for specific markets.
Financial Dynamics:
Established a positive relationship between movie budgets and revenues using regression analysis. Higher budgets are associated with increased revenue, guiding budgeting decisions.
Language Impact:
Explored the influence of movie language on both revenue and score. English-language movies dominate, but other languages can also contribute significantly.
Return on Investment (ROI):
Analyzed ROI to gauge the financial success of movies. Identified extreme cases of exceptionally high and low returns, offering insights for investors and producers.