# Set CRAN mirror
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Install only the packages that are not yet available, so re-running the
# script does not download and reinstall everything each time.
# Each package is listed exactly once.
required_pkgs <- c("dplyr", "ggplot2", "car", "tidyverse", "stringr")
is_installed <- vapply(
  required_pkgs,
  # system.file() returns "" when the package cannot be found
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
##
## The downloaded binary packages are in
## /var/folders/8w/l_qbjg8n5v3_5v1lkl2ww_z80000gn/T//RtmpjqswUl/downloaded_packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the two raw movie datasets from the working directory.
# NOTE: the financials file name, as written, has a space before ".csv".
financials_path <- "Movie Dataset_Financials .csv"
audience_path <- "Movie Dataset_General Audience.csv"

financials <- read.csv(file = financials_path)
audience <- read.csv(file = audience_path)

# Preview the first rows of the financials table
head(financials)
## original_title budget..Millions.
## 1 The Departed 1.0
## 2 Exodus: Gods and Kings 2.0
## 3 Spectre 110.0
## 4 The X Files 2.9
## 5 Star Wars: Episode VII - The Force Awakens 190.0
## 6 John Carter 135.0
## revenue..Millions. language country
## 1 0.01 English USA
## 2 0.05 English UK
## 3 295.24 English UK
## 4 0.30 English USA
## 5 1506.25 French France
## 6 532.95 English USA
# Preview the first rows of the audience table
head(audience)
## original_title type
## 1 The Departed Feature Film
## 2 Exodus: Gods and Kings Feature Film
## 3 Spectre Feature Film
## 4 The X Files Feature Film
## 5 Star Wars: Episode VII - The Force Awakens Feature Film
## 6 John Carter Documentary
## genre runtime mpaa_rating imdb_rating imdb_num_votes critics_rating
## 1 Drama 118 Unrated 2.1 9904 Fresh
## 2 Drama 131 PG-13 3.3 1010 Fresh
## 3 Comedy 84 R 7.6 22381 Certified Fresh
## 4 Drama 97 PG 2.5 54363 Rotten
## 5 Horror 90 R 7.2 35096 Certified Fresh
## 6 Documentary 78 Unrated 7.8 333 Fresh
## critics_score audience_rating audience_score best_pic_nom Facebook_Likes
## 1 10 Upright 21 no 48
## 2 43 Upright 34 no 490
## 3 91 Upright 91 no 11700
## 4 27 Upright 23 no 230
## 5 81 Upright 77 no 172221
## 6 91 Upright 86 no 211873
# Data Cleaning ----
# Both tables get the same treatment so that `title` can serve as join key:
#   1. rename the key (and the money columns) to short names,
#   2. normalise whitespace in the title (str_squish() already trims both
#      ends, so a separate str_trim() is not needed),
#   3. keep only the first row for each title.
# Columns are referenced by bare name inside the pipeline rather than through
# `financials$...` / `audience$...`, so each step always sees the data it is
# actually operating on.
financials <- financials |>
  rename(
    title = original_title,
    budget = `budget..Millions.`,
    revenue = `revenue..Millions.`
  ) |>
  mutate(title = str_squish(title)) |>
  filter(!duplicated(title))

audience <- audience |>
  rename(title = original_title) |>
  mutate(title = str_squish(title)) |>
  filter(!duplicated(title))

# Attach the financial columns to every audience row, then drop the rows that
# ended up without a budget (no match in `financials`, or a missing budget).
df <- audience |>
  left_join(financials, by = "title") |>
  filter(!is.na(budget))

head(df)
## title type genre runtime
## 1 The Departed Feature Film Drama 118
## 2 Exodus: Gods and Kings Feature Film Drama 131
## 3 Spectre Feature Film Comedy 84
## 4 The X Files Feature Film Drama 97
## 5 Star Wars: Episode VII - The Force Awakens Feature Film Horror 90
## 6 John Carter Documentary Documentary 78
## mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score
## 1 Unrated 2.1 9904 Fresh 10
## 2 PG-13 3.3 1010 Fresh 43
## 3 R 7.6 22381 Certified Fresh 91
## 4 PG 2.5 54363 Rotten 27
## 5 R 7.2 35096 Certified Fresh 81
## 6 Unrated 7.8 333 Fresh 91
## audience_rating audience_score best_pic_nom Facebook_Likes budget revenue
## 1 Upright 21 no 48 1.0 0.01
## 2 Upright 34 no 490 2.0 0.05
## 3 Upright 91 no 11700 110.0 295.24
## 4 Upright 23 no 230 2.9 0.30
## 5 Upright 77 no 172221 190.0 1506.25
## 6 Upright 86 no 211873 135.0 532.95
## language country
## 1 English USA
## 2 English UK
## 3 English UK
## 4 English USA
## 5 French France
## 6 English USA
# Histogram layers shared by the "before" and "after" IMDb rating plots
imdb_hist_layers <- list(
  geom_histogram(binwidth = 0.5, fill = "blue", color = "black"),
  labs(
    title = "Distribution of IMDb Ratings",
    x = "IMDb Rating",
    y = "Frequency"
  )
)

# Distribution of IMDb ratings before removing invalid values
ggplot(data = df, mapping = aes(x = imdb_rating)) +
  imdb_hist_layers

# Drop ratings above 10 (filter() also drops rows whose rating is NA)
df <- filter(df, imdb_rating <= 10)

# Distribution of IMDb ratings after the filter
ggplot(data = df, mapping = aes(x = imdb_rating)) +
  imdb_hist_layers

# IMDb rating against critics' score. Colour is mapped on the points only, so
# the red linear trend line is fitted once across all genres.
ggplot(data = df, mapping = aes(x = critics_score, y = imdb_rating)) +
  geom_point(mapping = aes(color = genre), alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("IMDb Ratings vs Critic Scores") +
  xlab("Critics' Score") +
  ylab("IMDb Rating") +
  labs(color = "Genre")
## `geom_smooth()` using formula = 'y ~ x'

# Revenue against budget with a single linear trend line (no confidence band)
ggplot(data = df, mapping = aes(x = budget, y = revenue)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Revenue vs Budget") +
  xlab("Budget (in million $)") +
  ylab("Revenue (in million $)")
## `geom_smooth()` using formula = 'y ~ x'

# Sum Facebook likes within each genre (missing values ignored)
likes_by_genre <- df %>%
  group_by(genre) %>%
  summarize(total_likes = sum(Facebook_Likes, na.rm = TRUE))

# Horizontal bars, ordered by total likes; geom_col() draws bar heights
# directly from the y values.
ggplot(
  data = likes_by_genre,
  mapping = aes(x = reorder(genre, total_likes), y = total_likes)
) +
  geom_col(fill = "blue") +
  coord_flip() +
  labs(
    title = "Total Facebook Likes by Genre",
    x = "Genre",
    y = "Total Likes"
  )

# Audience score against critics score, one panel (and one colour) per
# MPAA rating
ggplot(data = df, mapping = aes(x = critics_score, y = audience_score)) +
  geom_point(mapping = aes(color = mpaa_rating)) +
  facet_wrap(facets = vars(mpaa_rating)) +
  ggtitle("Audience Score vs Critics Score by MPAA Rating") +
  xlab("Critics Score") +
  ylab("Audience Score")

# Short label for every known genre. "Art House & International" and
# "Musical & Performing Arts" both map to "Arts".
genre_short_labels <- c(
  "Science Fiction & Fantasy" = "SciFi",
  "Mystery & Suspense" = "Mystery",
  "Drama" = "Drama",
  "Documentary" = "Documentary",
  "Comedy" = "Comedy",
  "Art House & International" = "Arts",
  "Other" = "Other",
  "Action & Adventure" = "Action",
  "Animation" = "Animation",
  "Horror" = "Horror",
  "Musical & Performing Arts" = "Arts"
)

# Look each genre up by name; anything not in the table (including NA) is
# flagged as "Check" so unexpected values are easy to spot.
df <- df %>%
  mutate(
    Genre_c = coalesce(
      unname(genre_short_labels[as.character(genre)]),
      "Check"
    )
  )
# Linear Model 1: Revenue ~ Critics Score + Audience Score
# Linear regression of revenue on the two score columns of `df`.
lm_model <- lm(revenue ~ critics_score + audience_score, data = df)
# Print the coefficient table and overall fit statistics for the model
summary(lm_model)
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -278.29 -88.44 -25.73 29.12 1836.30
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -118.7365 21.5138 -5.519 4.96e-08 ***
## critics_score 1.0885 0.5025 2.166 0.0307 *
## audience_score 3.4676 0.5556 6.241 7.91e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 186.7 on 639 degrees of freedom
## Multiple R-squared: 0.2012, Adjusted R-squared: 0.1987
## F-statistic: 80.46 on 2 and 639 DF, p-value: < 2.2e-16
# Linear Model 2: Revenue ~ Critics Score + Audience Score + IMDb Rating + Facebook Likes
# Extends Model 1 with imdb_rating and Facebook_Likes as extra predictors.
# NOTE: this reuses the name `lm_model`, so the Model 1 fit is overwritten
# and everything below that refers to `lm_model` uses this four-predictor fit.
lm_model <- lm(revenue ~ critics_score + audience_score + imdb_rating + Facebook_Likes, data = df)
# Print the coefficient table and overall fit statistics for the model
summary(lm_model)
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + imdb_rating +
## Facebook_Likes, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -426.78 -69.31 -10.29 39.79 1737.09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.101e+02 1.955e+01 -5.630 2.7e-08 ***
## critics_score 8.553e-01 3.880e-01 2.204 0.0279 *
## audience_score 1.052e+00 6.001e-01 1.753 0.0801 .
## imdb_rating 9.707e+00 6.211e+00 1.563 0.1186
## Facebook_Likes 1.337e-03 6.413e-05 20.841 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 143.8 on 637 degrees of freedom
## Multiple R-squared: 0.5277, Adjusted R-squared: 0.5248
## F-statistic: 178 on 4 and 637 DF, p-value: < 2.2e-16
# Diagnostics for the four-predictor model stored in `lm_model`.
# Variance inflation factors (car::vif) to check the predictors for
# multicollinearity.
vif(lm_model)
## critics_score audience_score imdb_rating Facebook_Likes
## 2.505711 4.902358 3.632303 1.119099
# Base R diagnostic plots for the fitted linear model
plot(lm_model)




# Shapiro-Wilk test of the residuals; the very small p-value reported below
# indicates the residuals are not normally distributed.
shapiro.test(lm_model$residuals)
##
## Shapiro-Wilk normality test
##
## data: lm_model$residuals
## W = 0.71597, p-value < 2.2e-16
# Histogram of the residuals as a visual check of their distribution
hist(lm_model$residuals)

# Copy of `df` with natural-log versions of budget and revenue appended as
# `budget_log` and `revenue_log`
df_log <- df %>%
  mutate(across(c(budget, revenue), log, .names = "{.col}_log"))

# Regress log revenue on critics_score, audience_score, imdb_rating and
# Facebook_Likes
lm_model_log <- lm(
  revenue_log ~ critics_score + audience_score + imdb_rating +
    Facebook_Likes,
  data = df_log
)
summary(lm_model_log)
##
## Call:
## lm(formula = revenue_log ~ critics_score + audience_score + imdb_rating +
## Facebook_Likes, data = df_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3200 -0.6656 0.1558 0.7756 2.5277
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.441e-01 1.507e-01 4.938 1.01e-06 ***
## critics_score 5.052e-03 2.991e-03 1.689 0.0917 .
## audience_score 2.242e-03 4.625e-03 0.485 0.6280
## imdb_rating 4.156e-01 4.787e-02 8.682 < 2e-16 ***
## Facebook_Likes 6.786e-06 4.943e-07 13.728 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.109 on 637 degrees of freedom
## Multiple R-squared: 0.5444, Adjusted R-squared: 0.5415
## F-statistic: 190.3 on 4 and 637 DF, p-value: < 2.2e-16
# Line chart of the average critics and audience score per genre.
# The scores are averaged within each genre BEFORE reshaping, so that every
# genre contributes exactly one point per score type. Plotting the raw
# per-movie scores would draw a line through every movie instead of through
# the averages that the title and the `Average_Score` column promise.
long_data <- df %>%
  group_by(genre) %>%
  summarise(
    critics_score = mean(critics_score, na.rm = TRUE),
    audience_score = mean(audience_score, na.rm = TRUE)
  ) %>%
  pivot_longer(
    cols = c(critics_score, audience_score),
    names_to = "Score_Type",
    values_to = "Average_Score"
  )

ggplot(
  long_data,
  aes(x = genre, y = Average_Score, group = Score_Type, color = Score_Type)
) +
  # `linewidth` replaces the `size` aesthetic for lines (ggplot2 >= 3.4.0)
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    title = "Average Critics Score and Audience Score by Genre",
    x = "Genre",
    y = "Average Score",
    color = "Score Type"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels for readability
    plot.title = element_text(hjust = 0.5)
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Visualizations
# Number of movies per genre, with the count printed above each bar.
ggplot(df, aes(x = genre, fill = genre)) +
  geom_bar(position = "dodge") +
  # after_stat(count) is the supported replacement for the deprecated
  # `..count..` notation (ggplot2 >= 3.4.0)
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = 0,
    size = 5,
    color = "black"
  ) +
  labs(
    title = "Distribution of the number of movies by genre",
    x = "Genre",
    y = "Quantity"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Number of movies per critics rating, one panel per genre, with the count
# printed above each bar.
ggplot(df, aes(x = critics_rating, fill = critics_rating)) +
  geom_bar(position = "dodge") +
  # after_stat(count) is the supported replacement for the deprecated
  # `..count..` notation (ggplot2 >= 3.4.0)
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = 0,
    size = 5,
    color = "black"
  ) +
  facet_wrap(~ genre) +
  labs(
    title = "Distribution of the number of Critics Rating by genre",
    # The x-axis shows critics_rating; genre is the facet variable
    x = "Critics Rating",
    y = "Quantity"
  ) +
  theme_minimal()

# Scatter plot of revenue against a score column, faceted by genre, with a
# black linear trend line in each panel.
#
# Arguments:
#   data       data frame containing `revenue`, `genre` and the score column
#   score_col  unquoted name of the column to put on the x-axis
#   plot_title plot title
#   x_label    x-axis label
# Returns the ggplot object (printed automatically when called at top level).
plot_revenue_by_genre <- function(data, score_col, plot_title, x_label) {
  ggplot(data, aes(x = {{ score_col }}, y = revenue, color = genre)) +
    geom_point(alpha = 0.7) +
    geom_smooth(method = "lm", se = FALSE, color = "Black") +
    labs(title = plot_title, x = x_label, y = "Revenue") +
    facet_wrap(~ genre) +
    theme_minimal() +
    theme(legend.position = "right") +
    scale_color_discrete(name = "Genre")
}

# The x-axis is critics_score (numeric), not critics_rating, so it is
# labelled as a score.
plot_revenue_by_genre(
  df,
  critics_score,
  plot_title = "Critic Score vs Revenue",
  x_label = "Critics Score"
)
## `geom_smooth()` using formula = 'y ~ x'

plot_revenue_by_genre(
  df,
  audience_score,
  plot_title = "Audience Score vs Revenue",
  x_label = "Audience Score"
)
## `geom_smooth()` using formula = 'y ~ x'

# Add log-transformed budget and revenue to `df`.
# 1 is added before taking the log so that a value of 0 does not produce -Inf.
df <- df |>
  mutate(
    budget_log = log(budget + 1),
    revenue_log = log(revenue + 1)
  )

# Log budget against log revenue, one panel per genre, with a black linear
# trend line in each panel
ggplot(
  data = df,
  mapping = aes(x = budget_log, y = revenue_log, color = genre)
) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "Black") +
  ggtitle("Log(Budget) vs Log(Revenue)") +
  xlab("Log(Budget) (Millions)") +
  ylab("Log(Revenue) (Millions)") +
  facet_wrap(facets = vars(genre)) +
  theme_minimal() +
  theme(legend.position = "right") +
  scale_color_discrete(name = "Genre")
## `geom_smooth()` using formula = 'y ~ x'
