library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(stringr)
library(corrgram)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:ggplot2':
##
## margin
##
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
##
## The following object is masked from 'package:corrgram':
##
## panel.fill
##
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
Audience<-read.csv("Movie Dataset_General Audience.csv")
View(Audience)
summary(Audience)
## original_title type genre runtime
## Length:651 Length:651 Length:651 Min. : 39.0
## Class :character Class :character Class :character 1st Qu.: 92.0
## Mode :character Mode :character Mode :character Median :103.0
## Mean :105.8
## 3rd Qu.:115.8
## Max. :267.0
## NA's :1
## mpaa_rating imdb_rating imdb_num_votes critics_rating
## Length:651 Min. : 1.200 Min. : 180 Length:651
## Class :character 1st Qu.: 4.300 1st Qu.: 4035 Class :character
## Mode :character Median : 6.100 Median : 10651 Mode :character
## Mean : 5.811 Mean : 49378
## 3rd Qu.: 7.100 3rd Qu.: 56156
## Max. :55.000 Max. :893008
##
## critics_score audience_rating audience_score best_pic_nom
## Min. : 5.00 Length:651 Min. :13.00 Length:651
## 1st Qu.: 36.50 Class :character 1st Qu.:38.00 Class :character
## Median : 55.00 Mode :character Median :55.00 Mode :character
## Mean : 54.95 Mean :56.56
## 3rd Qu.: 73.50 3rd Qu.:76.00
## Max. :100.00 Max. :96.00
##
## Facebook_Likes
## Min. : 2
## 1st Qu.: 7014
## Median : 26088
## Mean : 64210
## 3rd Qu.: 82706
## Max. :555609
##
Audience <- Audience %>%
mutate(across(everything(),~ ifelse(is.na(.), mean(., na.rm=TRUE), .)))
summary(Audience)
## original_title type genre runtime
## Length:651 Length:651 Length:651 Min. : 39.0
## Class :character Class :character Class :character 1st Qu.: 92.0
## Mode :character Mode :character Mode :character Median :103.0
## Mean :105.8
## 3rd Qu.:115.5
## Max. :267.0
## mpaa_rating imdb_rating imdb_num_votes critics_rating
## Length:651 Min. : 1.200 Min. : 180 Length:651
## Class :character 1st Qu.: 4.300 1st Qu.: 4035 Class :character
## Mode :character Median : 6.100 Median : 10651 Mode :character
## Mean : 5.811 Mean : 49378
## 3rd Qu.: 7.100 3rd Qu.: 56156
## Max. :55.000 Max. :893008
## critics_score audience_rating audience_score best_pic_nom
## Min. : 5.00 Length:651 Min. :13.00 Length:651
## 1st Qu.: 36.50 Class :character 1st Qu.:38.00 Class :character
## Median : 55.00 Mode :character Median :55.00 Mode :character
## Mean : 54.95 Mean :56.56
## 3rd Qu.: 73.50 3rd Qu.:76.00
## Max. :100.00 Max. :96.00
## Facebook_Likes
## Min. : 2
## 1st Qu.: 7014
## Median : 26088
## Mean : 64210
## 3rd Qu.: 82706
## Max. :555609
Audience<- Audience |>
rename(
title = 'original_title')
Financials<-read.csv("Movie Dataset_Financials.csv")
View(Financials)
summary(Financials)
## original_title budget..Millions. revenue..Millions. language
## Length:651 Min. : 0.32 Min. : 0.01 Length:651
## Class :character 1st Qu.: 10.00 1st Qu.: 20.75 Class :character
## Mode :character Median : 25.00 Median : 61.81 Mode :character
## Mean : 40.92 Mean : 137.97
## 3rd Qu.: 52.75 3rd Qu.: 154.49
## Max. :280.00 Max. :2068.18
## country
## Length:651
## Class :character
## Mode :character
##
##
##
Financials<- Financials |>
rename(
title = 'original_title',
budget = 'budget..Millions.',
revenue = 'revenue..Millions.')
Audience<- Audience |>
mutate(
title = str_trim(title),
title = str_squish(title))
Financials<- Financials |>
mutate(
title = str_trim(title),
title = str_squish(title))
Audience |>
filter(duplicated(Audience$title))
## title type genre runtime
## 1 The Legend of Tarzan Feature Film Drama 94
## 2 Pan Feature Film Other 90
## 3 The Twilight Saga: Breaking Dawn - Part 2 Feature Film Comedy 95
## 4 Fantastic Four Feature Film Drama 101
## 5 The Fast and the Furious Feature Film Drama 104
## 6 Godzilla Resurgence Feature Film Drama 98
## 7 Hercules Documentary Documentary 107
## mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score
## 1 R 3.4 57933 Rotten 25
## 2 PG 3.6 1010 Fresh 33
## 3 PG 7.5 880 Fresh 90
## 4 R 2.1 9904 Fresh 35
## 5 R 7.1 128361 Fresh 68
## 6 PG-13 6.3 50340 Fresh 60
## 7 Unrated 3.8 10522 Fresh 21
## audience_rating audience_score best_pic_nom Facebook_Likes
## 1 Spilled 39 no 11175
## 2 Upright 24 no 393
## 3 Upright 89 no 359177
## 4 Upright 26 no 51261
## 5 Upright 78 no 125327
## 6 Spilled 67 no 699
## 7 Upright 20 no 235
Audience |>
filter(title == "The Legend of Tarzan")
## title type genre runtime mpaa_rating imdb_rating
## 1 The Legend of Tarzan Documentary Documentary 92 Unrated 6.8
## 2 The Legend of Tarzan Feature Film Drama 94 R 3.4
## imdb_num_votes critics_rating critics_score audience_rating audience_score
## 1 1942 Certified Fresh 66 Upright 68
## 2 57933 Rotten 25 Spilled 39
## best_pic_nom Facebook_Likes
## 1 no 121175
## 2 no 11175
Audience<- Audience |>
filter(!duplicated(Audience$title))
Financials |>
filter(duplicated(Financials$title))
## title budget revenue language country
## 1 The Legend of Tarzan 6.0 11.12 English USA
## 2 Pan 3.0 0.70 English USA
## 3 The Twilight Saga: Breaking Dawn - Part 2 0.5 1025.47 English USA
## 4 Fantastic Four 13.0 45.30 English USA
## 5 The Fast and the Furious 35.0 77.48 English USA
## 6 Godzilla Resurgence 1.9 157.11 Japanese Japan
## 7 Hercules 6.0 1.27 English USA
Financials |>
filter(title == "The Legend of Tarzan")
## title budget revenue language country
## 1 The Legend of Tarzan 35 82.35 English USA
## 2 The Legend of Tarzan 6 11.12 English USA
Financials<- Financials |>
filter(!duplicated(Financials$title))
Movie_Data<-merge(Audience, Financials, by = "title")
View(Movie_Data)
summary(Movie_Data)
## title type genre runtime
## Length:644 Length:644 Length:644 Min. : 39.0
## Class :character Class :character Class :character 1st Qu.: 92.0
## Mode :character Mode :character Mode :character Median :103.0
## Mean :105.9
## 3rd Qu.:116.0
## Max. :267.0
## mpaa_rating imdb_rating imdb_num_votes critics_rating
## Length:644 Min. : 1.200 Min. : 180 Length:644
## Class :character 1st Qu.: 4.300 1st Qu.: 4054 Class :character
## Mode :character Median : 6.150 Median : 10718 Mode :character
## Mean : 5.822 Mean : 49513
## 3rd Qu.: 7.100 3rd Qu.: 56142
## Max. :55.000 Max. :893008
## critics_score audience_rating audience_score best_pic_nom
## Min. : 5.00 Length:644 Min. :13.00 Length:644
## 1st Qu.: 37.00 Class :character 1st Qu.:38.00 Class :character
## Median : 55.00 Mode :character Median :55.50 Mode :character
## Mean : 55.03 Mean :56.64
## 3rd Qu.: 74.00 3rd Qu.:76.00
## Max. :100.00 Max. :96.00
## Facebook_Likes budget revenue language
## Min. : 2 Min. : 0.32 Min. : 0.01 Length:644
## 1st Qu.: 7309 1st Qu.: 10.00 1st Qu.: 21.39 Class :character
## Median : 26124 Median : 25.00 Median : 61.95 Mode :character
## Mean : 64057 Mean : 41.27 Mean : 137.43
## 3rd Qu.: 82664 3rd Qu.: 53.00 3rd Qu.: 154.25
## Max. :555609 Max. :280.00 Max. :2068.18
## country
## Length:644
## Class :character
## Mode :character
##
##
##
corrgram(Movie_Data, order=NULL,
panel=panel.shade,
text.panel=panel.txt,
col = colorRampPalette(c("white","red3")),
main="Correlogram")
Movie_Data<- Movie_Data %>%
mutate(mpaa_rating = ifelse(mpaa_rating== "Unrated", "G", mpaa_rating))
View(Movie_Data)
Movie_Data<- Movie_Data %>%
mutate(mpaa_rating = ifelse(mpaa_rating== "NC-17", "R", mpaa_rating))
View(Movie_Data)
Movie_Data$mpaa_rating <- factor(Movie_Data$mpaa_rating,
levels = c("G", "PG", "PG-13", "R"))
ggplot(Movie_Data, aes(x = mpaa_rating, fill = mpaa_rating)) +
geom_bar() +
labs(title = "Movie Distribution by Customer Segmentation",
x = "MPAA Rating",
y = "Count of Movies") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("G" = "goldenrod1", "PG" = "lightgoldenrod", "PG-13" = "red3", "R" = "salmon1"))
Movie_Data <- Movie_Data %>%
mutate(genre = ifelse(genre == "Action & Adventure", "Action", genre))
Movie_Data <- Movie_Data %>%
mutate(genre = ifelse(genre == "Art House & International", "Art House", genre))
Movie_Data <- Movie_Data %>%
mutate(genre = ifelse(genre == "Musical & Performing Arts", "Musical", genre))
Movie_Data <- Movie_Data %>%
mutate(genre = ifelse(genre == "Mystery & Suspense", "Mystery", genre))
Movie_Data <- Movie_Data %>%
mutate(genre = ifelse(genre == "Science Fiction & Fantasy", "Sci-Fi & Fantasy", genre))
ggplot(Movie_Data, aes(x = genre, fill = genre)) +
geom_bar() +
labs(title = "Movie Distribution by Genre",
x = "Genre",
y = "Count of Movies") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("Action" = "steelblue1", "Animation" = "royalblue1", "Art House" = "blue2", "Comedy" = "slateblue2", "Documentary" = "orchid3", "Drama" = "hotpink2", "Horror" = "brown2", "Musical" = "red3", "Mystery" = "salmon1", "Other" = "peachpuff", "Sci-Fi & Fantasy" = "navajowhite3"))
ggplot(Movie_Data, aes(x = genre, fill = mpaa_rating)) +
geom_bar(position = "stack") +
labs(title = "MPAA Rating Distribution Across Genres",
x = "Genre",
y = "Count of Movies") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("G" = "goldenrod1", "PG" = "lightgoldenrod",
"PG-13" = "red3", "R" = "salmon1"))
Genre_Revenue <- Movie_Data %>%
group_by(genre) %>%
summarise(total_revenue = sum(revenue, na.rm = TRUE)) %>%
arrange(desc(total_revenue))
ggplot(Genre_Revenue, aes(x = reorder(genre, -total_revenue), y = total_revenue, fill = genre)) +
geom_bar(stat = "identity") +
labs(title = "Total Revenue by Genre",
x = "Genre",
y = "Total Revenue (Millions)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("Action" = "steelblue1", "Animation" = "royalblue1", "Art House" = "blue2",
"Comedy" = "slateblue2", "Documentary" = "orchid3", "Drama" = "hotpink2",
"Horror" = "brown2", "Musical" = "red3", "Mystery" = "salmon1",
"Other" = "peachpuff", "Sci-Fi & Fantasy" = "navajowhite3"))
Movie_Data <- Movie_Data %>%
mutate(profit = revenue - budget)
Genre_Financials <- Movie_Data %>%
group_by(genre) %>%
summarise(
total_revenue = sum(revenue, na.rm = TRUE),
total_budget = sum(budget, na.rm = TRUE),
total_profit = sum(profit, na.rm = TRUE)
) %>%
pivot_longer(cols = c(total_revenue, total_budget, total_profit),
names_to = "Financial_Metric",
values_to = "Amount")
ggplot(Genre_Financials, aes(x = genre, y = Amount, fill = Financial_Metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Revenue, Budget, and Profit by Genre",
x = "Genre",
y = "Amount (Millions)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("total_revenue" = "steelblue",
"total_budget" = "red3",
"total_profit" = "goldenrod1"))
Genre_Scores <- Movie_Data %>%
group_by(genre) %>%
summarise(
avg_critics_score = mean(critics_score, na.rm = TRUE),
avg_audience_score = mean(audience_score, na.rm = TRUE)
) %>%
pivot_longer(cols = c(avg_critics_score, avg_audience_score),
names_to = "Score_Type",
values_to = "Average_Score")
ggplot(Genre_Scores, aes(x = genre, y = Average_Score, fill = Score_Type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Critics' Scores and Audience Scores by Genre",
x = "Genre",
y = "Average Score") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("avg_critics_score" = "goldenrod1",
"avg_audience_score" = "red3"))
##### Simmarizing the average scores per genre and calculating the
overall average
Genre_Scores <- Movie_Data %>%
group_by(genre) %>%
summarise(
avg_critics_score = mean(critics_score, na.rm = TRUE),
avg_audience_score = mean(audience_score, na.rm = TRUE)
) %>%
mutate(avg_total_score = (avg_critics_score + avg_audience_score) / 2) %>%
pivot_longer(cols = c(avg_critics_score, avg_audience_score),
names_to = "Score_Type",
values_to = "Average_Score")
ggplot(Genre_Scores, aes(x = genre)) +
geom_bar(aes(y = Average_Score, fill = Score_Type), stat = "identity", position = "dodge") +
geom_point(aes(y = avg_total_score), color = "black", size = 2) + # Smaller dots
geom_text(aes(y = avg_total_score + 5, label = round(avg_total_score, 1)), color = "black", size = 4) + # Labels above each genre
labs(title = "Comparison of Critics' Scores and Audience Scores by Genre",
x = "Genre",
y = "Average Score") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("avg_critics_score" = "goldenrod1",
"avg_audience_score" = "red3"))
Movie_Data <- Movie_Data %>%
mutate(genre = as.factor(genre),
country = as.factor(country))
predictors <- setdiff(names(Movie_Data), c("title", "profit", "revenue"))
models <- list()
for (var in predictors) {
formula <- as.formula(paste("revenue ~", var))
model <- lm(formula, data = Movie_Data)
models[[var]] <- summary(model)
cat("\n\n### Regression Analysis for:", var, "###\n")
print(models[[var]])
}
##
##
## ### Regression Analysis for: type ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -209.85 -114.96 -73.93 17.29 1930.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 123.93 28.36 4.369 1.45e-05 ***
## typeFeature Film 13.88 29.64 0.468 0.640
## typeTV Movie 114.13 97.43 1.171 0.242
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 208.4 on 641 degrees of freedom
## Multiple R-squared: 0.00217, Adjusted R-squared: -0.0009437
## F-statistic: 0.6969 on 2 and 641 DF, p-value: 0.4985
##
##
##
## ### Regression Analysis for: genre ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -207.01 -114.68 -71.36 16.52 1928.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 129.1185 25.9265 4.980 8.2e-07 ***
## genreAnimation 107.4338 74.3427 1.445 0.149
## genreArt House -2.3792 61.5876 -0.039 0.969
## genreComedy -0.8756 34.3545 -0.025 0.980
## genreDocumentary -9.4294 38.8898 -0.242 0.808
## genreDrama 10.4665 28.5891 0.366 0.714
## genreHorror 59.6289 50.7132 1.176 0.240
## genreMusical 8.2890 65.6748 0.126 0.900
## genreMystery 3.4965 37.5862 0.093 0.926
## genreOther 50.8565 61.5876 0.826 0.409
## genreSci-Fi & Fantasy -61.9151 74.3427 -0.833 0.405
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 209 on 633 degrees of freedom
## Multiple R-squared: 0.009001, Adjusted R-squared: -0.006654
## F-statistic: 0.5749 on 10 and 633 DF, p-value: 0.835
##
##
##
## ### Regression Analysis for: runtime ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -144.86 -117.09 -75.09 19.05 1922.14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109.8575 45.3640 2.422 0.0157 *
## runtime 0.2603 0.4213 0.618 0.5368
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 208.4 on 642 degrees of freedom
## Multiple R-squared: 0.0005944, Adjusted R-squared: -0.0009623
## F-statistic: 0.3819 on 1 and 642 DF, p-value: 0.5368
##
##
##
## ### Regression Analysis for: mpaa_rating ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -178.99 -112.26 -72.14 21.36 1888.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 143.04 25.19 5.678 2.07e-08 ***
## mpaa_ratingPG 36.25 31.73 1.142 0.254
## mpaa_ratingPG-13 -27.06 31.01 -0.872 0.383
## mpaa_ratingR -12.96 27.68 -0.468 0.640
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 207.8 on 640 degrees of freedom
## Multiple R-squared: 0.01017, Adjusted R-squared: 0.00553
## F-statistic: 2.192 on 3 and 640 DF, p-value: 0.08781
##
##
##
## ### Regression Analysis for: imdb_rating ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -807.01 -95.35 -66.52 4.50 1909.42
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.30 17.43 2.713 0.00684 **
## imdb_rating 15.48 2.66 5.819 9.32e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 203.2 on 642 degrees of freedom
## Multiple R-squared: 0.05011, Adjusted R-squared: 0.04863
## F-statistic: 33.87 on 1 and 642 DF, p-value: 9.325e-09
##
##
##
## ### Regression Analysis for: imdb_num_votes ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -163.42 -117.07 -74.35 17.45 1931.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.341e+02 9.194e+00 14.59 <2e-16 ***
## imdb_num_votes 6.679e-05 8.351e-05 0.80 0.424
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 208.4 on 642 degrees of freedom
## Multiple R-squared: 0.0009953, Adjusted R-squared: -0.0005608
## F-statistic: 0.6396 on 1 and 642 DF, p-value: 0.4241
##
##
##
## ### Regression Analysis for: critics_rating ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -174.23 -113.08 -77.42 18.45 1893.38
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 174.80 19.30 9.056 <2e-16 ***
## critics_ratingFresh -49.44 23.08 -2.142 0.0325 *
## critics_ratingRotten -41.56 23.24 -1.788 0.0742 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 207.9 on 641 degrees of freedom
## Multiple R-squared: 0.007376, Adjusted R-squared: 0.004279
## F-statistic: 2.382 on 2 and 641 DF, p-value: 0.09321
##
##
##
## ### Regression Analysis for: critics_score ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -282.11 -92.93 -45.45 31.74 1842.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -56.165 19.522 -2.877 0.00415 **
## critics_score 3.518 0.327 10.756 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 191.9 on 642 degrees of freedom
## Multiple R-squared: 0.1527, Adjusted R-squared: 0.1514
## F-statistic: 115.7 on 1 and 642 DF, p-value: < 2.2e-16
##
##
##
## ### Regression Analysis for: audience_rating ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -147.90 -116.95 -72.55 14.33 1936.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 148.09 14.05 10.543 <2e-16 ***
## audience_ratingUpright -16.20 17.31 -0.936 0.35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 208.4 on 642 degrees of freedom
## Multiple R-squared: 0.001363, Adjusted R-squared: -0.0001929
## F-statistic: 0.876 on 1 and 642 DF, p-value: 0.3497
##
##
##
## ### Regression Analysis for: audience_score ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -276.38 -93.77 -27.03 25.10 1845.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -111.8470 21.2757 -5.257 2e-07 ***
## audience_score 4.4009 0.3524 12.489 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 187 on 642 degrees of freedom
## Multiple R-squared: 0.1955, Adjusted R-squared: 0.1942
## F-statistic: 156 on 1 and 642 DF, p-value: < 2.2e-16
##
##
##
## ### Regression Analysis for: best_pic_nom ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -137.85 -116.29 -75.38 16.70 1930.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 137.859 8.359 16.49 <2e-16 ***
## best_pic_nomyes -12.650 45.228 -0.28 0.78
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 208.5 on 642 degrees of freedom
## Multiple R-squared: 0.0001218, Adjusted R-squared: -0.001436
## F-statistic: 0.07823 on 1 and 642 DF, p-value: 0.7798
##
##
##
## ### Regression Analysis for: Facebook_Likes ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -434.73 -70.63 -42.44 54.57 1774.47
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.995e+01 7.267e+00 5.497 5.58e-08 ***
## Facebook_Likes 1.522e-03 6.411e-05 23.736 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 152.2 on 642 degrees of freedom
## Multiple R-squared: 0.4674, Adjusted R-squared: 0.4666
## F-statistic: 563.4 on 1 and 642 DF, p-value: < 2.2e-16
##
##
##
## ### Regression Analysis for: budget ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -410.22 -57.02 -22.27 18.26 1443.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.6674 8.0886 1.319 0.188
## budget 3.0717 0.1308 23.489 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 152.9 on 642 degrees of freedom
## Multiple R-squared: 0.4622, Adjusted R-squared: 0.4614
## F-statistic: 551.7 on 1 and 642 DF, p-value: < 2.2e-16
##
##
##
## ### Regression Analysis for: language ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -528.90 -113.12 -72.57 16.93 1933.67
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.66 205.19 0.091 0.92757
## languageEnglish 115.85 205.35 0.564 0.57283
## languageFrench 618.57 229.40 2.696 0.00719 **
## languageJapanese -14.50 290.18 -0.050 0.96016
## languageMandarin 81.87 290.18 0.282 0.77793
## languageSpanish 188.76 251.30 0.751 0.45286
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 205.2 on 638 degrees of freedom
## Multiple R-squared: 0.03754, Adjusted R-squared: 0.03
## F-statistic: 4.977 on 5 and 638 DF, p-value: 0.0001765
##
##
##
## ### Regression Analysis for: country ###
##
## Call:
## lm(formula = formula, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -275.29 -112.87 -70.26 18.33 1925.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.660 207.749 0.090 0.928
## countryAustralia 6.808 212.032 0.032 0.974
## countryBelgium 40.320 293.801 0.137 0.891
## countryCanada 84.850 213.145 0.398 0.691
## countryChina 44.055 254.439 0.173 0.863
## countryFrance 260.622 217.888 1.196 0.232
## countryGermany 105.081 216.986 0.484 0.628
## countryJapan 204.990 254.439 0.806 0.421
## countryMexico 153.180 293.801 0.521 0.602
## countryNew Line 84.380 293.801 0.287 0.774
## countryNew Zealand 304.753 239.887 1.270 0.204
## countrySpain 66.840 293.801 0.228 0.820
## countryUK 93.249 210.614 0.443 0.658
## countryUSA 123.748 207.944 0.595 0.552
##
## Residual standard error: 207.7 on 630 degrees of freedom
## Multiple R-squared: 0.02572, Adjusted R-squared: 0.005614
## F-statistic: 1.279 on 13 and 630 DF, p-value: 0.2203
multi_reg_model <- lm(revenue ~ critics_score + audience_score + budget + Facebook_Likes, data = Movie_Data)
summary(multi_reg_model)
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + budget +
## Facebook_Likes, data = Movie_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -302.80 -54.62 -5.03 29.55 1505.24
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.792e+01 1.445e+01 -6.084 2.02e-09 ***
## critics_score 7.803e-01 3.369e-01 2.316 0.0209 *
## audience_score 7.919e-01 3.847e-01 2.058 0.0400 *
## budget 1.838e+00 1.282e-01 14.339 < 2e-16 ***
## Facebook_Likes 9.631e-04 6.163e-05 15.628 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 125.1 on 639 degrees of freedom
## Multiple R-squared: 0.6415, Adjusted R-squared: 0.6392
## F-statistic: 285.8 on 4 and 639 DF, p-value: < 2.2e-16
r_squared <- summary(multi_reg_model)$r.squared
adj_r_squared <- summary(multi_reg_model)$adj.r.squared
vif_values <- vif(multi_reg_model)
r_value <- sqrt(r_squared)
cat("\n### Multiple Regression Analysis ###\n")
##
## ### Multiple Regression Analysis ###
cat("R (Multiple Correlation Coefficient):", round(r_value, 4), "\n")
## R (Multiple Correlation Coefficient): 0.8009
cat("R-Squared:", round(r_squared, 4), "\n")
## R-Squared: 0.6415
cat("Adjusted R-Squared:", round(adj_r_squared, 4), "\n")
## Adjusted R-Squared: 0.6392
cat("\n### Variance Inflation Factors (VIF) ###\n")
##
## ### Variance Inflation Factors (VIF) ###
print(vif_values)
## critics_score audience_score budget Facebook_Likes
## 2.496051 2.662210 1.434645 1.366296
Movie_Data$predicted_revenue <- predict(multi_reg_model, newdata = Movie_Data)
ggplot(Movie_Data, aes(x = predicted_revenue, y = revenue)) +
geom_point(color = "red3") + # Red dots for actual revenue
geom_smooth(method = "lm", color = "black", se = FALSE) + # Black regression line
labs(title = "Actual vs Predicted Revenue",
x = "Predicted Revenue",
y = "Actual Revenue") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Movie_Data$residuals <- residuals(multi_reg_model)
ggplot(Movie_Data, aes(x = predicted_revenue, y = residuals)) +
geom_point(color = "red3") + # Red dots for residuals
geom_hline(yintercept = 0, linetype = "solid", color = "black") + # Zero line
labs(title = "Residual Plot",
x = "Predicted Revenue",
y = "Residuals") +
theme_minimal()
genre_models <- list()
for (g in unique(Movie_Data$genre)) {
genre_data <- Movie_Data %>% filter(genre == g)
if (nrow(genre_data) > 5) {
model <- lm(revenue ~ critics_score + audience_score + budget + Facebook_Likes, data = genre_data)
genre_models[[g]] <- model
cat("\n### Regression Analysis for Genre:", g, "###\n")
print(summary(model))
}
}
##
## ### Regression Analysis for Genre: Drama ###
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + budget +
## Facebook_Likes, data = genre_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -304.64 -57.11 -7.19 32.02 1506.05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.219e+01 2.554e+01 -3.610 0.00036 ***
## critics_score 9.414e-01 5.751e-01 1.637 0.10272
## audience_score 7.351e-01 6.335e-01 1.160 0.24680
## budget 1.803e+00 2.104e-01 8.567 5.93e-16 ***
## Facebook_Likes 9.751e-04 1.048e-04 9.308 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 143.4 on 296 degrees of freedom
## Multiple R-squared: 0.5723, Adjusted R-squared: 0.5666
## F-statistic: 99.03 on 4 and 296 DF, p-value: < 2.2e-16
##
##
## ### Regression Analysis for Genre: Mystery ###
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + budget +
## Facebook_Likes, data = genre_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -201.302 -54.812 4.024 46.818 260.410
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.196e+02 3.229e+01 -3.705 0.000499 ***
## critics_score 1.215e+00 7.945e-01 1.529 0.132041
## audience_score 1.183e+00 9.659e-01 1.225 0.226019
## budget 8.550e-01 2.872e-01 2.977 0.004346 **
## Facebook_Likes 1.224e-03 1.295e-04 9.451 4.86e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 83.88 on 54 degrees of freedom
## Multiple R-squared: 0.8193, Adjusted R-squared: 0.8059
## F-statistic: 61.21 on 4 and 54 DF, p-value: < 2.2e-16
##
##
## ### Regression Analysis for Genre: Comedy ###
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + budget +
## Facebook_Likes, data = genre_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -222.725 -29.912 6.187 31.010 304.637
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.828e+01 2.322e+01 -3.802 0.000277 ***
## critics_score 7.361e-01 5.453e-01 1.350 0.180806
## audience_score 7.733e-01 6.429e-01 1.203 0.232484
## budget 2.717e+00 2.521e-01 10.776 < 2e-16 ***
## Facebook_Likes 6.500e-04 1.199e-04 5.421 5.96e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.9 on 81 degrees of freedom
## Multiple R-squared: 0.8527, Adjusted R-squared: 0.8454
## F-statistic: 117.2 on 4 and 81 DF, p-value: < 2.2e-16
##
##
## ### Regression Analysis for Genre: Action ###
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + budget +
## Facebook_Likes, data = genre_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -183.63 -45.21 -10.25 24.78 398.10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.025e+02 3.673e+01 -2.789 0.007069 **
## critics_score 7.980e-01 6.473e-01 1.233 0.222507
## audience_score 1.543e+00 8.198e-01 1.882 0.064666 .
## budget 1.785e+00 2.765e-01 6.453 2.12e-08 ***
## Facebook_Likes 5.825e-04 1.582e-04 3.681 0.000499 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 92.47 on 60 degrees of freedom
## Multiple R-squared: 0.6928, Adjusted R-squared: 0.6723
## F-statistic: 33.82 on 4 and 60 DF, p-value: 9.172e-15
##
##
## ### Regression Analysis for Genre: Musical ###
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + budget +
## Facebook_Likes, data = genre_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.54 -30.96 -10.47 17.33 103.04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.068e+01 7.050e+01 -0.435 0.67656
## critics_score 1.672e+00 1.807e+00 0.925 0.38556
## audience_score -1.418e+00 2.366e+00 -0.599 0.56779
## budget 1.198e+00 1.076e+00 1.114 0.30214
## Facebook_Likes 9.781e-04 2.380e-04 4.109 0.00452 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70.65 on 7 degrees of freedom
## Multiple R-squared: 0.8775, Adjusted R-squared: 0.8075
## F-statistic: 12.54 on 4 and 7 DF, p-value: 0.00262
##
##
## ### Regression Analysis for Genre: Documentary ###
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + budget +
## Facebook_Likes, data = genre_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -235.25 -54.36 -8.73 24.57 667.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.363e+01 5.256e+01 -1.211 0.2321
## critics_score -2.029e+00 2.146e+00 -0.945 0.3493
## audience_score 3.447e+00 2.177e+00 1.584 0.1200
## budget 1.136e+00 5.093e-01 2.230 0.0305 *
## Facebook_Likes 1.181e-03 2.237e-04 5.277 3.28e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 126.6 on 47 degrees of freedom
## Multiple R-squared: 0.603, Adjusted R-squared: 0.5692
## F-statistic: 17.84 on 4 and 47 DF, p-value: 5.669e-09
##
##
## ### Regression Analysis for Genre: Other ###
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + budget +
## Facebook_Likes, data = genre_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -122.83 -27.15 10.52 28.90 140.88
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.549e+01 7.246e+01 -1.318 0.2201
## critics_score 3.445e+00 3.382e+00 1.019 0.3350
## audience_score -2.168e+00 3.297e+00 -0.657 0.5273
## budget 1.875e+00 7.250e-01 2.586 0.0294 *
## Facebook_Likes 9.942e-04 3.964e-04 2.508 0.0334 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 87.4 on 9 degrees of freedom
## Multiple R-squared: 0.9243, Adjusted R-squared: 0.8906
## F-statistic: 27.47 on 4 and 9 DF, p-value: 4.663e-05
##
##
## ### Regression Analysis for Genre: Sci-Fi & Fantasy ###
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + budget +
## Facebook_Likes, data = genre_data)
##
## Residuals:
## 1 2 3 4 5 6 7 8 9
## -11.257 -48.980 21.789 37.591 -22.903 8.863 57.863 -39.486 -3.481
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.289215 107.553515 0.254 0.812
## critics_score -4.485788 4.217079 -1.064 0.347
## audience_score 5.826905 3.451658 1.688 0.167
## budget 0.434039 0.600002 0.723 0.509
## Facebook_Likes -0.001476 0.002227 -0.663 0.544
##
## Residual standard error: 49.84 on 4 degrees of freedom
## Multiple R-squared: 0.8073, Adjusted R-squared: 0.6146
## F-statistic: 4.189 on 4 and 4 DF, p-value: 0.0971
##
##
## ### Regression Analysis for Genre: Horror ###
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + budget +
## Facebook_Likes, data = genre_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -370.63 -63.16 13.64 41.01 419.68
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.279e+00 1.005e+02 0.053 0.959
## critics_score 3.561e+00 2.540e+00 1.402 0.178
## audience_score -4.835e+00 3.265e+00 -1.481 0.156
## budget 6.410e+00 1.116e+00 5.745 1.91e-05 ***
## Facebook_Likes -3.066e-04 6.155e-04 -0.498 0.624
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 154.5 on 18 degrees of freedom
## Multiple R-squared: 0.8303, Adjusted R-squared: 0.7926
## F-statistic: 22.02 on 4 and 18 DF, p-value: 9.872e-07
##
##
## ### Regression Analysis for Genre: Art House ###
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + budget +
## Facebook_Likes, data = genre_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -107.88 -24.02 -19.11 -10.46 201.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.0671704 86.0991271 0.129 0.9005
## critics_score -0.2932747 1.3387724 -0.219 0.8315
## audience_score 0.4643615 1.9218791 0.242 0.8145
## budget 1.7541955 0.7315859 2.398 0.0400 *
## Facebook_Likes 0.0006518 0.0003148 2.070 0.0683 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 88.56 on 9 degrees of freedom
## Multiple R-squared: 0.5835, Adjusted R-squared: 0.3983
## F-statistic: 3.152 on 4 and 9 DF, p-value: 0.07044
##
##
## ### Regression Analysis for Genre: Animation ###
##
## Call:
## lm(formula = revenue ~ critics_score + audience_score + budget +
## Facebook_Likes, data = genre_data)
##
## Residuals:
## 1 2 3 4 5 6 7 8 9
## 29.48 18.48 49.67 20.80 -75.24 30.96 -43.04 26.81 -57.93
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.338e+02 7.156e+01 -3.267 0.03087 *
## critics_score 2.370e+00 2.697e+00 0.879 0.42921
## audience_score 4.257e+00 1.828e+00 2.329 0.08031 .
## budget -1.740e+00 1.105e+00 -1.575 0.19049
## Facebook_Likes 1.483e-03 1.475e-04 10.057 0.00055 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64.53 on 4 degrees of freedom
## Multiple R-squared: 0.9739, Adjusted R-squared: 0.9479
## F-statistic: 37.36 on 4 and 4 DF, p-value: 0.002003
genre_factors <- data.frame(Genre = character(), Factor = character(), Coefficient = numeric(), p_value = numeric())
for (g in names(genre_models)) {
model <- genre_models[[g]]
if (!inherits(model, "lm")) {
next
}
coef_summary <- summary(model)$coefficients
if (nrow(coef_summary) > 1) {
coefficients <- coef_summary[-1, 1]
p_values <- coef_summary[-1, 4]
df <- data.frame(
Genre = g,
Factor = rownames(coef_summary)[-1],
Coefficient = as.numeric(coefficients),
p_value = as.numeric(p_values)
)
df <- df %>% filter(p_value < 0.05) %>% arrange(desc(abs(Coefficient))) %>% head(2)
if (nrow(df) > 0) {
genre_factors <- rbind(genre_factors, df)
}
}
}
print(genre_factors)
## Genre Factor Coefficient p_value
## 1 Drama budget 1.8026849718 5.932222e-16
## 2 Drama Facebook_Likes 0.0009750676 3.041403e-18
## 3 Mystery budget 0.8550309531 4.345513e-03
## 4 Mystery Facebook_Likes 0.0012238953 4.857759e-13
## 5 Comedy budget 2.7165068523 2.594605e-17
## 6 Comedy Facebook_Likes 0.0006499796 5.955437e-07
## 7 Action budget 1.7845083534 2.123393e-08
## 8 Action Facebook_Likes 0.0005824697 4.988575e-04
## 9 Musical Facebook_Likes 0.0009781188 4.520852e-03
## 10 Documentary budget 1.1359108139 3.053536e-02
## 11 Documentary Facebook_Likes 0.0011807621 3.276095e-06
## 12 Other budget 1.8747488437 2.942088e-02
## 13 Other Facebook_Likes 0.0009942402 3.340487e-02
## 14 Horror budget 6.4103721361 1.908042e-05
## 15 Art House budget 1.7541954945 4.004217e-02
## 16 Animation Facebook_Likes 0.0014831791 5.497405e-04
for (g in unique(genre_factors$Genre)) {
genre_data <- Movie_Data %>% filter(genre == g)
factors <- genre_factors %>% filter(Genre == g) %>% pull(Factor)
if (length(factors) >= 2) {
factor1 <- factors[1]
factor2 <- factors[2]
p1 <- ggplot(genre_data, aes_string(x = factor1, y = "revenue")) +
geom_point(color = "red3") +
geom_smooth(method = "lm", color = "black", se = FALSE) +
labs(title = paste("Revenue vs", factor1, "in", g), x = factor1, y = "Revenue") +
theme_minimal()
p2 <- ggplot(genre_data, aes_string(x = factor2, y = "revenue")) +
geom_point(color = "blue3") +
geom_smooth(method = "lm", color = "black", se = FALSE) +
labs(title = paste("Revenue vs", factor2, "in", g), x = factor2, y = "Revenue") +
theme_minimal()
print(p1)
print(p2)
}
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Movie_Data$genre <- as.factor(Movie_Data$genre)
Movie_Data$language <- as.factor(Movie_Data$language)
Movie_Data$country <- as.factor(Movie_Data$country)
dummies <- dummyVars(~ genre + language + country, data = Movie_Data)
Movie_Data_encoded <- predict(dummies, newdata = Movie_Data) %>%
as.data.frame()
Movie_Data_encoded$budget <- Movie_Data$budget
Movie_Data_encoded$runtime <- Movie_Data$runtime
Movie_Data_encoded$revenue <- Movie_Data$revenue
Movie_Data_encoded$budget <- scale(Movie_Data_encoded$budget)
Movie_Data_encoded$runtime <- scale(Movie_Data_encoded$runtime)
str(Movie_Data_encoded)
## 'data.frame': 644 obs. of 34 variables:
## $ genre.Action : num 0 0 0 0 1 0 0 1 0 1 ...
## $ genre.Animation : num 0 0 0 0 0 0 0 0 0 0 ...
## $ genre.Art House : num 0 0 0 0 0 0 0 0 0 0 ...
## $ genre.Comedy : num 0 0 0 1 0 0 1 0 0 0 ...
## $ genre.Documentary : num 0 0 0 0 0 0 0 0 0 0 ...
## $ genre.Drama : num 1 1 0 0 0 1 0 0 1 0 ...
## $ genre.Horror : num 0 0 0 0 0 0 0 0 0 0 ...
## $ genre.Musical : num 0 0 0 0 0 0 0 0 0 0 ...
## $ genre.Mystery : num 0 0 1 0 0 0 0 0 0 0 ...
## $ genre.Other : num 0 0 0 0 0 0 0 0 0 0 ...
## $ genre.Sci-Fi & Fantasy: num 0 0 0 0 0 0 0 0 0 0 ...
## $ language. : num 1 0 0 0 0 0 0 0 0 0 ...
## $ language.English : num 0 1 1 1 1 1 1 1 1 1 ...
## $ language.French : num 0 0 0 0 0 0 0 0 0 0 ...
## $ language.Japanese : num 0 0 0 0 0 0 0 0 0 0 ...
## $ language.Mandarin : num 0 0 0 0 0 0 0 0 0 0 ...
## $ language.Spanish : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country. : num 1 0 0 0 0 0 0 0 0 0 ...
## $ country.Australia : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country.Belgium : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country.Canada : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country.China : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country.France : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country.Germany : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country.Japan : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country.Mexico : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country.New Line : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country.New Zealand : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country.Spain : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country.UK : num 0 0 0 0 0 0 0 0 0 0 ...
## $ country.USA : num 0 1 1 1 1 1 1 1 1 1 ...
## $ budget : num [1:644, 1] -0.622 0.081 -0.548 -0.819 -0.548 ...
## ..- attr(*, "scaled:center")= num 41.3
## ..- attr(*, "scaled:scale")= num 46.1
## $ runtime : num [1:644, 1] 1.44 0.108 -0.456 -0.405 0.261 ...
## ..- attr(*, "scaled:center")= num 106
## ..- attr(*, "scaled:scale")= num 19.5
## $ revenue : num 18.66 60.22 31.56 0.75 19.68 ...
colSums(is.na(Movie_Data_encoded))
## genre.Action genre.Animation genre.Art House
## 0 0 0
## genre.Comedy genre.Documentary genre.Drama
## 0 0 0
## genre.Horror genre.Musical genre.Mystery
## 0 0 0
## genre.Other genre.Sci-Fi & Fantasy language.
## 0 0 0
## language.English language.French language.Japanese
## 0 0 0
## language.Mandarin language.Spanish country.
## 0 0 0
## country.Australia country.Belgium country.Canada
## 0 0 0
## country.China country.France country.Germany
## 0 0 0
## country.Japan country.Mexico country.New Line
## 0 0 0
## country.New Zealand country.Spain country.UK
## 0 0 0
## country.USA
## 0 0 0
## revenue
## 0
Movie_Data_encoded <- na.omit(Movie_Data_encoded)
colnames(Movie_Data_encoded) <- make.names(colnames(Movie_Data_encoded))
colnames(Movie_Data_encoded)
## [1] "genre.Action" "genre.Animation" "genre.Art.House"
## [4] "genre.Comedy" "genre.Documentary" "genre.Drama"
## [7] "genre.Horror" "genre.Musical" "genre.Mystery"
## [10] "genre.Other" "genre.Sci.Fi...Fantasy" "language."
## [13] "language.English" "language.French" "language.Japanese"
## [16] "language.Mandarin" "language.Spanish" "country."
## [19] "country.Australia" "country.Belgium" "country.Canada"
## [22] "country.China" "country.France" "country.Germany"
## [25] "country.Japan" "country.Mexico" "country.New.Line"
## [28] "country.New.Zealand" "country.Spain" "country.UK"
## [31] "country.USA" "budget" "runtime"
## [34] "revenue"
set.seed(123)
model <- randomForest(revenue ~ ., data = Movie_Data_encoded, ntree = 500, importance = TRUE)
importance(model)
## %IncMSE IncNodePurity
## genre.Action -2.0629982 1.285042e+05
## genre.Animation 0.8712342 2.167023e+05
## genre.Art.House 1.8466578 5.991725e+04
## genre.Comedy 1.4165968 2.097757e+05
## genre.Documentary -3.9351418 2.195749e+05
## genre.Drama 4.1107679 3.759883e+05
## genre.Horror 2.2360427 2.345085e+05
## genre.Musical -2.7764470 8.044547e+04
## genre.Mystery -3.4439895 2.052570e+05
## genre.Other 0.2049438 1.498764e+05
## genre.Sci.Fi...Fantasy 1.4957504 5.558148e+04
## language. 0.0000000 1.679182e+02
## language.English -2.2226092 2.344806e+05
## language.French 1.6651317 4.675654e+05
## language.Japanese 0.0000000 1.485003e+03
## language.Mandarin 0.0000000 1.360147e+03
## language.Spanish 1.2965258 4.560342e+03
## country. 0.0000000 4.672061e+02
## country.Australia 5.9409368 1.139381e+05
## country.Belgium 0.0000000 2.097353e+03
## country.Canada -2.3234044 6.494410e+04
## country.China -1.2078405 1.494641e+03
## country.France -2.4240310 1.463674e+05
## country.Germany 0.7570995 3.222123e+04
## country.Japan -2.0851743 4.062564e+04
## country.Mexico 0.0000000 1.057997e+03
## country.New.Line 0.0000000 9.782750e+03
## country.New.Zealand 2.1551053 1.255662e+05
## country.Spain 0.0000000 1.258903e+03
## country.UK -4.4383242 1.209512e+05
## country.USA 2.7834660 2.397310e+05
## budget 61.5999583 1.465527e+07
## runtime -1.0363890 3.131929e+06
varImpPlot(model)
##### Finding the best movie configuration and creating the ideal movie,
then predicting revenue for this movie
print(colnames(Movie_Data_encoded))
## [1] "genre.Action" "genre.Animation" "genre.Art.House"
## [4] "genre.Comedy" "genre.Documentary" "genre.Drama"
## [7] "genre.Horror" "genre.Musical" "genre.Mystery"
## [10] "genre.Other" "genre.Sci.Fi...Fantasy" "language."
## [13] "language.English" "language.French" "language.Japanese"
## [16] "language.Mandarin" "language.Spanish" "country."
## [19] "country.Australia" "country.Belgium" "country.Canada"
## [22] "country.China" "country.France" "country.Germany"
## [25] "country.Japan" "country.Mexico" "country.New.Line"
## [28] "country.New.Zealand" "country.Spain" "country.UK"
## [31] "country.USA" "budget" "runtime"
## [34] "revenue"
best_movie <- as.data.frame(t(rep(0, length(colnames(Movie_Data_encoded)))))
colnames(best_movie) <- colnames(Movie_Data_encoded)
best_movie$runtime <- scale(120, center = attr(scale(Movie_Data_encoded$runtime), "scaled:center"),
scale = attr(scale(Movie_Data_encoded$runtime), "scaled:scale"))
best_movie$budget <- scale(200000000, center = attr(scale(Movie_Data_encoded$budget), "scaled:center"),
scale = attr(scale(Movie_Data_encoded$budget), "scaled:scale"))
best_movie$genre.Action...Adventure <- 1 # Set exact name found in Step 1
best_movie$language.English <- 1
best_movie$country.USA <- 1
predicted_revenue <- predict(model, newdata = best_movie)
print(predicted_revenue)
## 1
## 1008.441
revenue_df <- data.frame(revenue = Movie_Data_encoded$revenue)
# Create a density plot
ggplot(revenue_df, aes(x = revenue)) +
geom_density(fill = "blue3", alpha = 0.3) + # Density curve of actual revenues
geom_vline(aes(xintercept = predicted_revenue), color = "red3", linetype = "dashed", size = 1) + # Mark predicted revenue
labs(title = "Predicted Revenue vs. Movie Revenue Distribution",
x = "Revenue",
y = "Density") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
intercept <- -87.92
critics_coef <- 0.7803
facebook_coef <- 0.0009631
critics_score <- 55
facebook_likes_initial <- 1250
predicted_revenue_initial <- intercept + (critics_coef * critics_score) +
(facebook_coef * facebook_likes_initial)
cat("Predicted Revenue for Premiere: $", round(predicted_revenue_initial, 2), "\n")
## Predicted Revenue for Premiere: $ -43.8
facebook_likes_new <- facebook_likes_initial * 100 # Increase by 10,000%
predicted_revenue_new <- intercept + (critics_coef * critics_score) +
(facebook_coef * facebook_likes_new)
cat("Predicted Revenue After Advertising Campaign: $", round(predicted_revenue_new, 2), "\n")
## Predicted Revenue After Advertising Campaign: $ 75.38
increase <- predicted_revenue_new - predicted_revenue_initial
cat("Expected Revenue Increase: $", round(increase, 2), "\n")
## Expected Revenue Increase: $ 119.18
revenue_data <- data.frame(
Scenario = c("Before Campaign", "After Campaign"),
Revenue = c(predicted_revenue_initial, predicted_revenue_new)
)
revenue_data$Scenario <- factor(revenue_data$Scenario, levels = c("Before Campaign", "After Campaign"))
ggplot(revenue_data, aes(x = Scenario, y = Revenue, fill = Scenario)) +
geom_bar(stat = "identity", width = 0.6, show.legend = FALSE) +
scale_fill_manual(values = c("goldenrod1", "red3")) + # Custom colors
geom_text(aes(label = round(Revenue, 2)), vjust = -0.5, size = 5) +
labs(title = "Predicted Revenue Before and After Advertising Campaign",
x = "Scenario",
y = "Predicted Revenue ($)") +
theme_minimal()