# Set the CRAN mirror so package installation works in non-interactive runs
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Install only the packages that are not yet present in the library paths.
# system.file(package = ) returns "" for a package that cannot be found, so
# re-running the script does not re-download packages that are already
# installed.
required_packages <- c("dplyr", "ggplot2", "car", "tidyverse", "stringr")
is_installed <- vapply(
  required_packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ car::recode()   masks dplyr::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the two raw CSV exports (paths are relative to the working directory):
# one table with financial figures, one with audience/critic information
financials <- read.csv(file = "Movie Dataset_Financials .csv")
audience <- read.csv(file = "Movie Dataset_General Audience.csv")

# Preview the first six rows of the financial table
head(financials, n = 6L)
##                                            original_title budget..Millions.
## 1                                           The Departed                1.0
## 2                                 Exodus: Gods and Kings                2.0
## 3                                                Spectre              110.0
## 4                                            The X Files                2.9
## 5 Star Wars: Episode VII - The Force Awakens                          190.0
## 6                                            John Carter              135.0
##   revenue..Millions. language country
## 1               0.01  English     USA
## 2               0.05  English      UK
## 3             295.24  English      UK
## 4               0.30  English     USA
## 5            1506.25   French  France
## 6             532.95  English     USA
# Preview the first six rows of the audience table
head(audience, n = 6L)
##                                            original_title         type
## 1                                           The Departed  Feature Film
## 2                                 Exodus: Gods and Kings  Feature Film
## 3                                                Spectre  Feature Film
## 4                                            The X Files  Feature Film
## 5 Star Wars: Episode VII - The Force Awakens              Feature Film
## 6                                            John Carter   Documentary
##         genre runtime mpaa_rating imdb_rating imdb_num_votes  critics_rating
## 1       Drama     118     Unrated         2.1           9904           Fresh
## 2       Drama     131       PG-13         3.3           1010           Fresh
## 3      Comedy      84           R         7.6          22381 Certified Fresh
## 4       Drama      97          PG         2.5          54363          Rotten
## 5      Horror      90           R         7.2          35096 Certified Fresh
## 6 Documentary      78     Unrated         7.8            333           Fresh
##   critics_score audience_rating audience_score best_pic_nom Facebook_Likes
## 1            10         Upright             21           no             48
## 2            43         Upright             34           no            490
## 3            91         Upright             91           no          11700
## 4            27         Upright             23           no            230
## 5            81         Upright             77           no         172221
## 6            91         Upright             86           no         211873
# Data cleaning ----

# Financial table: give the key and money columns short names, normalise the
# title text and keep the first row for every title.
# str_squish() already strips leading/trailing whitespace and collapses inner
# runs of whitespace, so a separate str_trim() step is not needed.
financials <- financials |>
  rename(
    title = original_title,
    budget = `budget..Millions.`,
    revenue = `revenue..Millions.`
  ) |>
  mutate(title = str_squish(title)) |>
  distinct(title, .keep_all = TRUE)

# Audience table: same key name, same title normalisation, same de-duplication
audience <- audience |>
  rename(title = original_title) |>
  mutate(title = str_squish(title)) |>
  distinct(title, .keep_all = TRUE)

# Attach the financial columns to each audience row by title, then drop the
# rows that ended up without a budget value. Columns are referenced by bare
# name (data masking) rather than through `df$...`, so the filter always acts
# on the data flowing through the pipe.
df <- audience |>
  left_join(financials, by = "title") |>
  filter(!is.na(budget))

head(df)
##                                        title         type       genre runtime
## 1                               The Departed Feature Film       Drama     118
## 2                     Exodus: Gods and Kings Feature Film       Drama     131
## 3                                    Spectre Feature Film      Comedy      84
## 4                                The X Files Feature Film       Drama      97
## 5 Star Wars: Episode VII - The Force Awakens Feature Film      Horror      90
## 6                                John Carter  Documentary Documentary      78
##   mpaa_rating imdb_rating imdb_num_votes  critics_rating critics_score
## 1     Unrated         2.1           9904           Fresh            10
## 2       PG-13         3.3           1010           Fresh            43
## 3           R         7.6          22381 Certified Fresh            91
## 4          PG         2.5          54363          Rotten            27
## 5           R         7.2          35096 Certified Fresh            81
## 6     Unrated         7.8            333           Fresh            91
##   audience_rating audience_score best_pic_nom Facebook_Likes budget revenue
## 1         Upright             21           no             48    1.0    0.01
## 2         Upright             34           no            490    2.0    0.05
## 3         Upright             91           no          11700  110.0  295.24
## 4         Upright             23           no            230    2.9    0.30
## 5         Upright             77           no         172221  190.0 1506.25
## 6         Upright             86           no         211873  135.0  532.95
##   language country
## 1  English     USA
## 2  English      UK
## 3  English      UK
## 4  English     USA
## 5   French  France
## 6  English     USA
# Histogram of IMDb ratings (bins 0.5 wide) on the joined data, drawn before
# the rating filter further down is applied
ggplot(data = df, mapping = aes(x = imdb_rating)) +
  geom_histogram(binwidth = 0.5, colour = "black", fill = "blue") +
  ggtitle("Distribution of IMDb Ratings") +
  xlab("IMDb Rating") +
  ylab("Frequency")

# Keep only rows whose IMDb rating does not exceed 10; filter() also drops
# rows where the comparison is NA.
# NOTE(review): only an upper bound is enforced here - confirm that no lower
# bound is needed for this dataset.
df <- filter(df, imdb_rating <= 10)

# Same histogram as before, now on the filtered data
ggplot(data = df, mapping = aes(x = imdb_rating)) +
  geom_histogram(binwidth = 0.5, colour = "black", fill = "blue") +
  ggtitle("Distribution of IMDb Ratings") +
  xlab("IMDb Rating") +
  ylab("Frequency")

# Scatter plot of critics' score against IMDb rating, points coloured by
# genre, with a single overall least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = critics_score, y = imdb_rating)) +
  geom_point(mapping = aes(colour = genre), alpha = 0.7) +
  geom_smooth(method = "lm", colour = "red", se = FALSE) +
  labs(
    title = "IMDb Ratings vs Critic Scores",
    x = "Critics' Score",
    y = "IMDb Rating",
    colour = "Genre"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plot of budget against revenue with a least-squares line
# (no confidence band)
ggplot(data = df, mapping = aes(x = budget, y = revenue)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red", se = FALSE) +
  ggtitle("Revenue vs Budget") +
  xlab("Budget (in million $)") +
  ylab("Revenue (in million $)")
## `geom_smooth()` using formula = 'y ~ x'

# Horizontal bar chart of summed Facebook likes per genre (missing likes are
# ignored in the sum); bars are ordered by their total
df |>
  group_by(genre) |>
  summarise(total_likes = sum(Facebook_Likes, na.rm = TRUE)) |>
  ggplot(mapping = aes(x = reorder(genre, total_likes), y = total_likes)) +
  geom_col(fill = "blue") +
  coord_flip() +
  labs(
    title = "Total Facebook Likes by Genre",
    x = "Genre",
    y = "Total Likes"
  )

# Critics score against audience score, one panel per MPAA rating; points are
# additionally coloured by that rating
ggplot(data = df, mapping = aes(x = critics_score, y = audience_score)) +
  geom_point(mapping = aes(colour = mpaa_rating)) +
  facet_wrap(vars(mpaa_rating)) +
  labs(
    title = "Audience Score vs Critics Score by MPAA Rating",
    x = "Critics Score",
    y = "Audience Score"
  )

# Add Genre_c, a short label for every genre. The named vector maps the full
# genre name to its short form; a genre without an entry (or a missing genre)
# looks up as NA and is then replaced by the marker value "Check".
# Both arts-related genres intentionally share the label "Arts".
df <- df |>
  mutate(
    Genre_c = coalesce(
      unname(
        c(
          "Science Fiction & Fantasy" = "SciFi",
          "Mystery & Suspense" = "Mystery",
          "Drama" = "Drama",
          "Documentary" = "Documentary",
          "Comedy" = "Comedy",
          "Art House & International" = "Arts",
          "Other" = "Other",
          "Action & Adventure" = "Action",
          "Animation" = "Animation",
          "Horror" = "Horror",
          "Musical & Performing Arts" = "Arts"
        )[as.character(genre)]
      ),
      "Check"
    )
  )

# Linear model 1: regress revenue on the two review scores
# (critics score and audience score), then print the fit summary
lm_model <- lm(
  revenue ~ critics_score + audience_score,
  data = df
)
lm_model |> summary()
## 
## Call:
## lm(formula = revenue ~ critics_score + audience_score, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -278.29  -88.44  -25.73   29.12 1836.30 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -118.7365    21.5138  -5.519 4.96e-08 ***
## critics_score     1.0885     0.5025   2.166   0.0307 *  
## audience_score    3.4676     0.5556   6.241 7.91e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 186.7 on 639 degrees of freedom
## Multiple R-squared:  0.2012, Adjusted R-squared:  0.1987 
## F-statistic: 80.46 on 2 and 639 DF,  p-value: < 2.2e-16
# Linear model 2: model 1 plus IMDb rating and Facebook likes as predictors.
# The fit is stored under the same name, so lm_model refers to this model
# from here on.
lm_model <- lm(
  revenue ~ critics_score + audience_score + imdb_rating + Facebook_Likes,
  data = df
)
lm_model |> summary()
## 
## Call:
## lm(formula = revenue ~ critics_score + audience_score + imdb_rating + 
##     Facebook_Likes, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -426.78  -69.31  -10.29   39.79 1737.09 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -1.101e+02  1.955e+01  -5.630  2.7e-08 ***
## critics_score   8.553e-01  3.880e-01   2.204   0.0279 *  
## audience_score  1.052e+00  6.001e-01   1.753   0.0801 .  
## imdb_rating     9.707e+00  6.211e+00   1.563   0.1186    
## Facebook_Likes  1.337e-03  6.413e-05  20.841  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 143.8 on 637 degrees of freedom
## Multiple R-squared:  0.5277, Adjusted R-squared:  0.5248 
## F-statistic:   178 on 4 and 637 DF,  p-value: < 2.2e-16
# Variance inflation factor of each predictor in the current model (car)
car::vif(lm_model)
##  critics_score audience_score    imdb_rating Facebook_Likes 
##       2.505711       4.902358       3.632303       1.119099
# Standard diagnostic plots for the fitted linear model
plot(x = lm_model)

# Shapiro-Wilk normality test on the residuals of the current model (the
# second fit stored in lm_model). The recorded output below reports
# p < 2.2e-16, i.e. the residuals deviate strongly from a normal distribution.
shapiro.test(lm_model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_model$residuals
## W = 0.71597, p-value < 2.2e-16
# Histogram of the same residuals as a visual check of their distribution
hist(lm_model$residuals)

# Copy of df with natural-log versions of budget and revenue.
# NOTE(review): plain log() is used here, whereas the plotting section further
# down adds 1 before taking logs - confirm which definition is intended.
# budget_log is created but not used by the model below.
df_log <- mutate(
  df,
  budget_log = log(budget),
  revenue_log = log(revenue)
)

# Same predictors as linear model 2, with log revenue as the response
lm_model_log <- lm(
  revenue_log ~ critics_score + audience_score + imdb_rating + Facebook_Likes,
  data = df_log
)
lm_model_log |> summary()
## 
## Call:
## lm(formula = revenue_log ~ critics_score + audience_score + imdb_rating + 
##     Facebook_Likes, data = df_log)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3200 -0.6656  0.1558  0.7756  2.5277 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.441e-01  1.507e-01   4.938 1.01e-06 ***
## critics_score  5.052e-03  2.991e-03   1.689   0.0917 .  
## audience_score 2.242e-03  4.625e-03   0.485   0.6280    
## imdb_rating    4.156e-01  4.787e-02   8.682  < 2e-16 ***
## Facebook_Likes 6.786e-06  4.943e-07  13.728  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.109 on 637 degrees of freedom
## Multiple R-squared:  0.5444, Adjusted R-squared:  0.5415 
## F-statistic: 190.3 on 4 and 637 DF,  p-value: < 2.2e-16
# Line chart: mean critics score and mean audience score per genre ----

# Average both score columns within each genre first, so that every genre
# contributes exactly one point per score type (plotting the raw per-movie
# rows would draw a zig-zag through every movie instead of an average).
# The result is then reshaped to long form: one row per genre and score type.
long_data <- df |>
  group_by(genre) |>
  summarise(
    across(c(critics_score, audience_score), \(x) mean(x, na.rm = TRUE))
  ) |>
  pivot_longer(
    cols = c(critics_score, audience_score),
    names_to = "Score_Type",
    values_to = "Average_Score"
  )

# One line per score type; `linewidth` is the line-thickness aesthetic that
# replaced `size` for lines in ggplot2 3.4.0.
ggplot(long_data, aes(x = genre, y = Average_Score, group = Score_Type, color = Score_Type)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    title = "Average Critics Score and Audience Score by Genre",
    x = "Genre",
    y = "Average Score",
    color = "Score Type"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels for readability
    plot.title = element_text(hjust = 0.5)
  )

# Visualizations ----

# Number of movies per genre, with the count printed at the top of each bar.
# after_stat(count) reads the value computed by stat = "count"; it replaces
# the dot-dot notation (..count..) that was deprecated in ggplot2 3.4.0.
ggplot(df, aes(x = genre, fill = genre)) +
  geom_bar(position = "dodge") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = 0, size = 5, color = "black") +
  labs(title = "Distribution of the number of movies by genre", x = "Genre", y = "Quantity") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Number of movies per critics rating, one panel per genre, with the count
# printed at the top of each bar. The x axis shows critics_rating (genre is
# the facet variable), so the axis is labelled accordingly.
# after_stat(count) is the current replacement for the deprecated ..count..
ggplot(df, aes(x = critics_rating, fill = critics_rating)) +
  geom_bar(position = "dodge") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = 0, size = 5, color = "black") +
  facet_wrap(~ genre) +
  labs(title = "Distribution of the number of Critics Rating by genre", x = "Critics Rating", y = "Quantity") +
  theme_minimal()

# Critics score against revenue, one panel per genre, each with its own
# least-squares line (no confidence band). The x axis plots the numeric
# critics_score column, not the categorical critics_rating column, so the
# axis label says "Critics Score".
ggplot(df, aes(x = critics_score, y = revenue, color = genre)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "Black") +
  labs(
    title = "Critic Score vs Revenue",
    x = "Critics Score",
    y = "Revenue"
  ) +
  facet_wrap(~ genre) +
  theme_minimal() +
  theme(legend.position = "right") +
  scale_color_discrete(name = "Genre")
## `geom_smooth()` using formula = 'y ~ x'

# Audience score against revenue, one panel per genre, each with its own
# least-squares line (no confidence band); points are coloured by genre
ggplot(data = df, mapping = aes(x = audience_score, y = revenue, colour = genre)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", colour = "Black", se = FALSE) +
  ggtitle("Audience Score vs Revenue") +
  xlab("Audience Score") +
  ylab("Revenue") +
  facet_wrap(vars(genre)) +
  theme_minimal() +
  theme(legend.position = "right") +
  scale_color_discrete(name = "Genre")
## `geom_smooth()` using formula = 'y ~ x'

# Create log-transformed columns ----
# log1p(x) computes log(1 + x): it stays finite for x == 0 and, unlike
# log(x + 1), keeps full precision for values close to zero.
df <- df |>
  mutate(
    budget_log = log1p(budget),
    revenue_log = log1p(revenue)
  )

# Log budget against log revenue, one panel per genre, each with its own
# least-squares line (no confidence band)
ggplot(df, aes(x = budget_log, y = revenue_log, color = genre)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "Black") +
  labs(
    title = "Log(Budget) vs Log(Revenue)",
    x = "Log(Budget) (Millions)",
    y = "Log(Revenue) (Millions)"
  ) +
  facet_wrap(~ genre) +
  theme_minimal() +
  theme(legend.position = "right") +
  scale_color_discrete(name = "Genre")
## `geom_smooth()` using formula = 'y ~ x'