Distribution Of Movie Scores

# Load necessary library
library(ggplot2)

# Histogram of movie scores, using bins that are 5 score points wide.
ggplot(data = data, mapping = aes(x = score)) +
  geom_histogram(
    binwidth = 5,
    fill = "blue",
    color = "black"
  ) +
  labs(
    title = "Distribution of Movie Scores",
    x = "IMDb Score",
    y = "Frequency"
  )

TOP 5 GENRES

# Question: Top 5 Genres
# Average the score within each distinct `genre` label, then keep exactly the
# five labels with the highest average.
#
# `top_n()` is superseded and keeps every row tied at the cut-off, so it can
# return more than five rows, and it leaves them in group order. `slice_max()`
# with `with_ties = FALSE` returns at most five rows, sorted from the highest
# average score downwards (ties at the cut-off are broken by row order).
top_genres <- data %>%
  group_by(genre) %>%
  summarise(avg_score = mean(score, na.rm = TRUE)) %>%
  slice_max(avg_score, n = 5, with_ties = FALSE)

# Display the top genres
print(top_genres)

## # A tibble: 13 × 2
##    genre                                                               avg_score
##    <chr>                                                                   <dbl>
##  1 Adventure, Animation, Comedy, Fantasy, Mystery                           85  
##  2 Adventure, Fantasy, Action, Family                                       83  
##  3 Adventure, Fantasy, Animation                                            83  
##  4 Animation, Action, Adventure, Fantasy, Thriller                          83  
##  5 Animation, Action, Comedy, Mystery, Crime, Fantasy                       83  
##  6 Animation, Action, Science Fiction, Drama                                83  
##  7 Animation, Comedy, Romance                                               84.7
##  8 Animation, Family, Fantasy, Adventure, Comedy                            83  
##  9 Animation, Thriller                                                      83  
## 10 Family, Animation, Drama                                                 83  
## 11 Fantasy, Drama, Crime                                                    85  
## 12 Romance, Animation, Drama                                                85  
## 13 TV Movie, Animation, Science Fiction, Action, Adventure, Comedy, D…      83

# Question 7: Genre and Revenue
# Sum revenue within each distinct `genre` label (missing revenues ignored).
genre_revenue <- data %>%
  group_by(genre) %>%
  summarise(
    total_revenue = sum(revenue, na.rm = TRUE)
  )

# Sort from largest to smallest total and keep only the first row,
# i.e. the genre label with the highest total revenue.
top_genre_revenue <- genre_revenue %>%
  arrange(desc(total_revenue)) %>%
  slice_head(n = 1)

# Show the winning genre label together with its total revenue.
print(top_genre_revenue)

## # A tibble: 1 × 2
##   genre total_revenue
##   <chr>         <dbl>
## 1 Drama 138768214182.

# Question 8: Compare Highest and Lowest Revenue Genres
# Total revenue per distinct `genre` label (missing revenues ignored).
genre_revenue <- data %>%
  group_by(genre) %>%
  summarise(
    total_revenue = sum(revenue, na.rm = TRUE)
  )

# First row after sorting descending = genre label with the largest total.
highest_revenue_genre <- genre_revenue %>%
  arrange(desc(total_revenue)) %>%
  slice_head(n = 1)

# First row after sorting ascending = genre label with the smallest total.
lowest_revenue_genre <- genre_revenue %>%
  arrange(total_revenue) %>%
  slice_head(n = 1)

# Keep only the movies that belong to one of the two extreme genre labels.
selected_genres <- c(
  highest_revenue_genre$genre,
  lowest_revenue_genre$genre
)

selected_genre_data <- data %>%
  filter(genre %in% selected_genres)

# One bar per genre label; the individual movie revenues are stacked, so the
# bar height equals that label's total revenue.
ggplot(
  selected_genre_data,
  aes(x = genre, y = revenue, fill = genre)
) +
  geom_col() +
  labs(
    title = "Comparison of Highest and Lowest Revenue Genres",
    x = "Genre",
    y = "Total Revenue"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Time Series

library(tsibble)

## Warning: package 'tsibble' was built under R version 4.3.2

## 
## Attaching package: 'tsibble'

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union

library(ggplot2)


# Name of the column plotted against time.
response_variable <- "score"

# Drop rows that lack a date or a response value. `na.omit()` on a data frame
# has no `cols` argument (it is silently ignored, so rows with an NA in *any*
# column were being removed); restrict the completeness check explicitly.
required_cols <- c("date_x", response_variable)
data <- data[complete.cases(data[, required_cols]), ]

# Parse the release date; values that do not match month/day/year become NA.
data$date_x <- as.Date(data$date_x, format = "%m/%d/%Y")

# Unique row identifier, used as the tsibble key.
data$ID <- seq_len(nrow(data))

# In a tsibble the index is the time variable and the key identifies each
# series. The original call had the two swapped (date as key, row id as index).
# Rows whose date failed to parse are excluded because the index cannot be NA.
my_tsibble <- as_tsibble(
  data[!is.na(data$date_x), ],
  key = ID,
  index = date_x
)

# `.data[[...]]` looks the response column up by its string name.
ggplot(my_tsibble, aes(x = date_x, y = .data[[response_variable]])) +
  geom_line() +
  labs(title = paste("Time Series Plot of", response_variable),
       x = "Date",
       y = response_variable)

Countries With Highest AVG Score

# Average score for every value of `country` (missing scores ignored).
country_scores <- data %>%
  group_by(country) %>%
  summarise(
    avg_score = mean(score, na.rm = TRUE)
  )

# Keep the countries whose average score ranks within the best five.
# Ties at the cut-off are all retained and the rows stay in their
# original (grouped) order.
top_countries <- country_scores %>%
  filter(min_rank(desc(avg_score)) <= 5)

# Bar chart of the selected countries, ordered from highest to lowest average.
ggplot(
  top_countries,
  aes(x = reorder(country, -avg_score), y = avg_score)
) +
  geom_col(fill = "green") +
  labs(
    title = "Top 5 Countries with Highest Average Movie Scores",
    x = "Country",
    y = "Average IMDb Score"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the selected countries with their average scores.
print(top_countries)

## # A tibble: 5 × 2
##   country avg_score
##   <chr>       <dbl>
## 1 CZ           72.5
## 2 DO           72  
## 3 PR           76  
## 4 SU           79.8
## 5 XC           76

# Question 9: Top 5 Countries with Highest Average Budgets
# Mean budget per country (missing budgets ignored), sorted from the largest
# mean downwards; only the first five rows are kept.
top_countries_budget <- data %>%
  group_by(country) %>%
  summarise(
    avg_budget = mean(budget_x, na.rm = TRUE)
  ) %>%
  arrange(desc(avg_budget)) %>%
  head(n = 5)

# Heading line for the table printed below.
cat("Top 5 Countries with Highest Average Budgets:\n")

## Top 5 Countries with Highest Average Budgets:

print(top_countries_budget)

## # A tibble: 5 × 2
##   country avg_budget
##   <chr>        <dbl>
## 1 KH       195000000
## 2 SK       174600000
## 3 BY       167540000
## 4 UY       163370000
## 5 PY       153000000

Null Hypothesis

# Ordinary least-squares fit of revenue on budget.
lm_result <- lm(
  formula = revenue ~ budget_x,
  data = data
)

# Coefficient table, residual summary, R-squared and overall F-test.
summary(lm_result)

## 
## Call:
## lm(formula = revenue ~ budget_x, data = data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.155e+09 -9.555e+07 -4.019e+07  8.152e+07  2.106e+09 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.036e+07  3.081e+06   13.10   <2e-16 ***
## budget_x    3.280e+00  3.565e-02   91.99   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 205300000 on 10176 degrees of freedom
## Multiple R-squared:  0.454,  Adjusted R-squared:  0.454 
## F-statistic:  8463 on 1 and 10176 DF,  p-value: < 2.2e-16

The linear regression analysis shows that the p-value associated with the budget_x coefficient (< 2e-16) is far below the conventional significance level (alpha = 0.05). We can therefore reject the null hypothesis (H0) that budget has no linear relationship with revenue and conclude that budget is a statistically significant predictor of movie revenue. The coefficient estimate for budget_x is 3.280e+00, indicating that, on average, each additional dollar of budget is associated with an increase of approximately $3.28 in revenue. The model explains about 45% of the variance in revenue (R-squared = 0.454).

# Load necessary libraries (if not already loaded)
library(ggplot2)

# Hypothesis 1: does mean revenue differ between budget levels?
#
# Bin the budgets into quartile-based levels once, so that the plot and the
# ANOVA use exactly the same grouping. `include.lowest = TRUE` is required:
# without it the interval (min, q1] excludes the minimum itself, so the
# lowest-budget movies fall into an NA bin and are dropped from the ANOVA.
# `na.rm = TRUE` keeps `quantile()` from failing on missing budgets.
budget_breaks <- quantile(data$budget_x, na.rm = TRUE)
budget_levels <- data.frame(
  budget_level = cut(
    data$budget_x,
    breaks = budget_breaks,
    include.lowest = TRUE
  ),
  revenue = data$revenue
)

# Create a bar plot for Hypothesis 1: mean revenue within each budget level.
ggplot(budget_levels, aes(x = budget_level, y = revenue)) +
  geom_bar(stat = "summary", fun = "mean", fill = "blue") +
  labs(
    title = "Mean Movie Revenue by Budget Level",
    x = "Budget Level",
    y = "Mean Revenue"
  ) +
  theme_minimal()

# One-way ANOVA of revenue across the budget levels.
anova_result_hypothesis_1 <- aov(revenue ~ budget_level, data = budget_levels)

# First row of the ANOVA table is the budget-level term; take its p-value.
p_value_hypothesis_1 <- summary(anova_result_hypothesis_1)[[1]]$`Pr(>F)`[1]

print(p_value_hypothesis_1)

## [1] 0

The p-value is printed as 0 because it is too small for R to represent (numerical underflow); it is therefore far below 0.05. We reject the null hypothesis: there is a significant difference in mean movie revenue between budget levels.

Logistic Regression Model

library(dplyr)
library(glmnet)

## Loading required package: Matrix

## Loaded glmnet 4.1-8

# Create the binary 'success' outcome only if the data frame does not already
# have one: 1 when a movie's revenue is strictly above the median revenue,
# 0 otherwise.
if (!"success" %in% colnames(data)) {
  # na.rm = TRUE: without it a single missing revenue makes the median NA and
  # therefore every 'success' value NA. Rows with a missing revenue still get
  # NA, which glm() drops when fitting.
  revenue_cutoff <- median(data$revenue, na.rm = TRUE)
  data$success <- as.numeric(data$revenue > revenue_cutoff)
}

# Verifying the structure of 'success' variable
str(data$success)  # Checking the structure

##  num [1:10178] 1 1 1 0 1 0 1 1 1 1 ...

# Logistic regression of the binary 'success' outcome on budget alone
# (binomial family, logit link).
model <- glm(
  formula = success ~ budget_x,
  family = binomial(link = "logit"),
  data = data
)

# Coefficients, standard errors, deviances and AIC of the fitted model.
summary(model)

## 
## Call:
## glm(formula = success ~ budget_x, family = binomial(link = "logit"), 
##     data = data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.739e+00  5.353e-02  -51.16   <2e-16 ***
## budget_x     4.854e-08  8.928e-10   54.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14109.7  on 10177  degrees of freedom
## Residual deviance:  7309.8  on 10176  degrees of freedom
## AIC: 7313.8
## 
## Number of Fisher Scoring iterations: 6

Intercept: The intercept represents the log-odds of success when the budget is zero. In our context, this value is not practically meaningful, as budgets are typically positive values.

budget_x Coefficient: The coefficient for “budget_x” is approximately 4.854e-08. This coefficient signifies that for every one-unit increase in the movie budget (e.g., increasing the budget by $1), the log-odds of a movie being successful increase by 4.854e-08.

The results of the logistic regression model indicate that there is a statistically significant positive relationship between the movie budget and the likelihood of a movie’s success. As the budget increases, the log-odds of success also increase.

It’s important to note that the model assumes a linear relationship between the budget and the log-odds of success.

# Question 11: Return on Investment (ROI) Analysis
# Keep only rows with a strictly positive budget and revenue, then express
# the profit as a percentage of the budget.
# NOTE: this overwrites `data`, so everything that runs afterwards sees the
# filtered rows and the new `roi` column.
data <- data %>%
  filter(budget_x > 0 & revenue > 0) %>%
  mutate(
    roi = (revenue - budget_x) / budget_x * 100
  )

# Five-number summary plus mean of the ROI percentages.
summary(data$roi)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -1.000e+02  4.400e+01  2.320e+02  2.986e+06  4.880e+02  2.577e+10

Minimum (Min.): -100%: The lowest ROI in your dataset is approximately -100%, meaning that the movie recovered almost none of its budget (a near-total loss on the investment). This could be due to movies that didn’t perform well financially.

1st Quartile (1st Qu.): 44%: The 25th percentile (or the first quartile) ROI is 44%. This means that 25% of the movies in your dataset have an ROI less than or equal to 44%.

Median: 232%: The median ROI is 232%, which is the middle value of the ROI distribution. Half of the movies have an ROI below 232%, and half have an ROI above this value.

Mean: 2,986,000%: The mean ROI is the average ROI across all movies in your dataset. However, it’s worth noting that the mean can be influenced by extreme values, and in this case, it seems there are some very high ROIs that are pulling the mean upward.

3rd Quartile (3rd Qu.): 488%: The 75th percentile (or the third quartile) ROI is 488%. This gives you an idea of the ROI benchmark for movies that performed relatively well.

Maximum (Max.): 25,770,000,000%: The highest ROI in your dataset is an extremely large value, indicating a movie that had an exceptionally high return. However, such extreme values can significantly impact the mean.

# Question 12: Budget vs. Revenue Scatter Plot
# One point per movie: budget on the x-axis, revenue on the y-axis.
ggplot(data = data, mapping = aes(x = budget_x, y = revenue)) +
  geom_point() +
  labs(
    title = "Budget vs. Revenue",
    x = "Budget",
    y = "Revenue"
  )

# Question 13: Genre-wise Budget and Revenue Analysis
# Restrict to rows with a strictly positive budget and revenue, then take the
# mean budget and mean revenue within each distinct `genre` label.
genre_budget_revenue <- data %>%
  filter(budget_x > 0 & revenue > 0) %>%
  group_by(genre) %>%
  summarise(
    avg_budget = mean(budget_x),
    avg_revenue = mean(revenue)
  )

# Print the per-genre averages.
print(genre_budget_revenue)

## # A tibble: 2,295 × 3
##    genre                                                  avg_budget avg_revenue
##    <chr>                                                       <dbl>       <dbl>
##  1 ""                                                     128386094.  373993991.
##  2 "Action"                                                47453206.  198565484.
##  3 "Action, Adventure"                                     72861648.  229956313.
##  4 "Action, Adventure, Animation"                          94533333.  548015646.
##  5 "Action, Adventure, Animation, Comedy"                 126300000   401345912 
##  6 "Action, Adventure, Animation, Comedy, Family"         119500000   356813558.
##  7 "Action, Adventure, Animation, Comedy, Family, Scienc… 112400000   251358333.
##  8 "Action, Adventure, Animation, Comedy, Romance, Famil…  90498000   404097025.
##  9 "Action, Adventure, Animation, Crime, Mystery"          52948531.   63147576 
## 10 "Action, Adventure, Animation, Drama"                  142815800   645239302.
## # ℹ 2,285 more rows

# Question 6: Most Common Language
# Count the rows per original language, sorted from most to least frequent,
# and keep the first row.
most_common_language <- data %>%
  count(orig_lang, sort = TRUE) %>%
  slice_head(n = 1)

# Show the most frequent language and its row count.
print(most_common_language)

##   orig_lang    n
## 1   English 7350

# Question 8: Language, Revenue, and Score
# Per original language: mean score and summed revenue (missing values ignored).
language_performance <- data %>%
  group_by(orig_lang) %>%
  summarise(
    avg_score = mean(score, na.rm = TRUE),
    total_revenue = sum(revenue, na.rm = TRUE)
  )

# Order by average score first (highest on top); total revenue is used only to
# break ties between equal averages. The first row is kept.
top_language_performance <- language_performance %>%
  arrange(desc(avg_score), desc(total_revenue)) %>%
  slice_head(n = 1)

# Heading line for the row printed below.
cat("Language with Highest Revenue and Score:\n")

## Language with Highest Revenue and Score:

print(top_language_performance)

## # A tibble: 1 × 3
##   orig_lang avg_score total_revenue
##   <chr>         <dbl>         <dbl>
## 1 " Irish"         76       1756887

IMDB FINAL PROJECT

2023-12-02