Set up working directory
# Show the current working directory.
getwd()
## [1] "D:/34884/Documents"
# NOTE(review): hard-coded absolute path. The read.csv() calls further down use
# relative file names, so the script only runs on a machine where this
# directory exists and contains the two CSV files.
setwd("D:/34884/Documents")
Load Package
library(ggplot2)
library(readxl)
library(dplyr)
##
## 载入程序包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: 程序包'tidyr'是用R版本4.4.2 来建造的
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(visreg)
library(tidyr)
library(caret)
## Warning: 程序包'caret'是用R版本4.4.2 来建造的
## 载入需要的程序包:lattice
##
## 载入程序包:'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## Warning: 程序包'randomForest'是用R版本4.4.2 来建造的
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## 载入程序包:'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
Import Data File Movie Dataset_General Audience
Audience <- read.csv("Movie Dataset_General Audience.csv")
View variables names and types in the file
str(Audience)
## 'data.frame': 651 obs. of 13 variables:
## $ original_title : chr "The Departed " "Exodus: Gods and Kings " "Spectre " "The X Files " ...
## $ type : chr "Feature Film" "Feature Film" "Feature Film" "Feature Film" ...
## $ genre : chr "Drama" "Drama" "Comedy" "Drama" ...
## $ runtime : int 118 131 84 97 90 78 107 130 88 100 ...
## $ mpaa_rating : chr "Unrated" "PG-13" "R" "PG" ...
## $ imdb_rating : num 2.1 3.3 7.6 2.5 7.2 7.8 2.1 2.1 7.5 4.1 ...
## $ imdb_num_votes : int 9904 1010 22381 54363 35096 333 9904 122980 880 739 ...
## $ critics_rating : chr "Fresh" "Fresh" "Certified Fresh" "Rotten" ...
## $ critics_score : int 10 43 91 27 81 91 30 33 90 56 ...
## $ audience_rating: chr "Upright" "Upright" "Upright" "Upright" ...
## $ audience_score : int 21 34 91 23 77 86 21 31 89 45 ...
## $ best_pic_nom : chr "no" "no" "no" "no" ...
## $ Facebook_Likes : int 48 490 11700 230 172221 211873 171 355 292000 3611 ...
To preview top 10 rows
head(Audience, n = 10)
## original_title type
## 1 The Departed Feature Film
## 2 Exodus: Gods and Kings Feature Film
## 3 Spectre Feature Film
## 4 The X Files Feature Film
## 5 Star Wars: Episode VII - The Force Awakens Feature Film
## 6 John Carter Documentary
## 7 Rio Feature Film
## 8 The Expendables Feature Film
## 9 Avengers: Age of Ultron Documentary
## 10 Cutthroat Island Feature Film
## genre runtime mpaa_rating imdb_rating imdb_num_votes critics_rating
## 1 Drama 118 Unrated 2.1 9904 Fresh
## 2 Drama 131 PG-13 3.3 1010 Fresh
## 3 Comedy 84 R 7.6 22381 Certified Fresh
## 4 Drama 97 PG 2.5 54363 Rotten
## 5 Horror 90 R 7.2 35096 Certified Fresh
## 6 Documentary 78 Unrated 7.8 333 Fresh
## 7 Drama 107 R 2.1 9904 Fresh
## 8 Drama 130 R 2.1 122980 Certified Fresh
## 9 Documentary 88 Unrated 7.5 880 Fresh
## 10 Drama 100 R 4.1 739 Fresh
## critics_score audience_rating audience_score best_pic_nom Facebook_Likes
## 1 10 Upright 21 no 48
## 2 43 Upright 34 no 490
## 3 91 Upright 91 no 11700
## 4 27 Upright 23 no 230
## 5 81 Upright 77 no 172221
## 6 91 Upright 86 no 211873
## 7 30 Upright 21 no 171
## 8 33 Upright 31 no 355
## 9 90 Upright 89 no 292000
## 10 56 Upright 45 no 3611
Import Data File Movie Dataset_Financials
Financial <- read.csv("Movie Dataset_Financials.csv")
View variables names and types in the file
str(Financial)
## 'data.frame': 651 obs. of 5 variables:
## $ original_title : chr "The Departed " "Exodus: Gods and Kings " "Spectre " "The X Files " ...
## $ budget..Millions. : num 1 2 110 2.9 190 135 1 2 74 2 ...
## $ revenue..Millions.: num 0.01 0.05 295.24 0.3 1506.25 ...
## $ language : chr "English" "English" "English" "English" ...
## $ country : chr "USA" "UK" "UK" "USA" ...
To preview top 10 rows
head(Financial, n = 10)
## original_title budget..Millions.
## 1 The Departed 1.0
## 2 Exodus: Gods and Kings 2.0
## 3 Spectre 110.0
## 4 The X Files 2.9
## 5 Star Wars: Episode VII - The Force Awakens 190.0
## 6 John Carter 135.0
## 7 Rio 1.0
## 8 The Expendables 2.0
## 9 Avengers: Age of Ultron 74.0
## 10 Cutthroat Island 2.0
## revenue..Millions. language country
## 1 0.01 English USA
## 2 0.05 English UK
## 3 295.24 English UK
## 4 0.30 English USA
## 5 1506.25 French France
## 6 532.95 English USA
## 7 0.40 English USA
## 8 0.57 English USA
## 9 1156.73 English USA
## 10 1.05 English USA
Merge Audience and Financial into Movie
# Join the audience and financial tables on the movie title (merge() keeps only
# titles present in both tables by default).
# NOTE(review): original_title is not unique in either table (651 rows but only
# 644 distinct titles in each, per the n_distinct() checks), so every repeated
# title is cross-joined here and produces extra rows. Those rows are later
# dropped with duplicated(), which keeps whichever audience/financial pairing
# happens to come first - confirm that pairing is the intended one.
Movie <- merge(Audience, Financial, by = "original_title")
Check the number of unique value in Audience
n_distinct(Audience$original_title)
## [1] 644
Check the number of unique value in Financial
n_distinct(Financial$original_title)
## [1] 644
Find duplicated rows in Movie
duplicate_row <- Movie[duplicated(Movie$original_title),]
Print duplicated rows
print(duplicate_row)
## original_title type genre
## 147 Fantastic Four Feature Film Mystery & Suspense
## 148 Fantastic Four Feature Film Drama
## 149 Fantastic Four Feature Film Drama
## 174 Godzilla Resurgence Feature Film Comedy
## 175 Godzilla Resurgence Feature Film Drama
## 176 Godzilla Resurgence Feature Film Drama
## 203 Hercules Feature Film Drama
## 204 Hercules Documentary Documentary
## 205 Hercules Documentary Documentary
## 339 Pan Feature Film Drama
## 340 Pan Feature Film Other
## 341 Pan Feature Film Other
## 492 The Fast and the Furious Feature Film Drama
## 493 The Fast and the Furious Feature Film Drama
## 494 The Fast and the Furious Feature Film Drama
## 532 The Legend of Tarzan Documentary Documentary
## 533 The Legend of Tarzan Feature Film Drama
## 534 The Legend of Tarzan Feature Film Drama
## 588 The Twilight Saga: Breaking Dawn - Part 2 Documentary Documentary
## 589 The Twilight Saga: Breaking Dawn - Part 2 Feature Film Comedy
## 590 The Twilight Saga: Breaking Dawn - Part 2 Feature Film Comedy
## runtime mpaa_rating imdb_rating imdb_num_votes critics_rating
## 147 106 R 6.3 25054 Rotten
## 148 101 R 2.1 9904 Fresh
## 149 101 R 2.1 9904 Fresh
## 174 93 PG 4.2 182983 Certified Fresh
## 175 98 PG-13 6.3 50340 Fresh
## 176 98 PG-13 6.3 50340 Fresh
## 203 105 R 3.0 9216 Rotten
## 204 107 Unrated 3.8 10522 Fresh
## 205 107 Unrated 3.8 10522 Fresh
## 339 130 R 2.0 9216 Rotten
## 340 90 PG 3.6 1010 Fresh
## 341 90 PG 3.6 1010 Fresh
## 492 104 R 7.1 128361 Fresh
## 493 95 R 7.2 35635 Certified Fresh
## 494 95 R 7.2 35635 Certified Fresh
## 532 92 Unrated 6.8 1942 Certified Fresh
## 533 94 R 3.4 57933 Rotten
## 534 94 R 3.4 57933 Rotten
## 588 98 Unrated 3.1 1010 Fresh
## 589 95 PG 7.5 880 Fresh
## 590 95 PG 7.5 880 Fresh
## critics_score audience_rating audience_score best_pic_nom Facebook_Likes
## 147 43 Spilled 49 no 1261
## 148 35 Upright 26 no 51261
## 149 35 Upright 26 no 51261
## 174 34 Upright 33 yes 5699
## 175 60 Spilled 67 no 699
## 176 60 Spilled 67 no 699
## 203 50 Upright 40 no 16235
## 204 21 Upright 20 no 235
## 205 21 Upright 20 no 235
## 339 34 Upright 24 no 393
## 340 33 Upright 24 no 393
## 341 33 Upright 24 no 393
## 492 68 Upright 78 no 125327
## 493 93 Upright 80 no 45327
## 494 93 Upright 80 no 45327
## 532 66 Upright 68 no 121175
## 533 25 Spilled 39 no 11175
## 534 25 Spilled 39 no 11175
## 588 45 Upright 46 no 1177
## 589 90 Upright 89 no 359177
## 590 90 Upright 89 no 359177
## budget..Millions. revenue..Millions. language country
## 147 13.0 45.30 English USA
## 148 17.0 136.62 English USA
## 149 13.0 45.30 English USA
## 174 1.9 157.11 Japanese Japan
## 175 8.9 4.16 Japanese Japan
## 176 1.9 157.11 Japanese Japan
## 203 6.0 1.27 English USA
## 204 20.0 34.08 English USA
## 205 6.0 1.27 English USA
## 339 3.0 0.70 English USA
## 340 4.0 1.60 English USA
## 341 3.0 0.70 English USA
## 492 20.0 18.22 English USA
## 493 35.0 77.48 English USA
## 494 20.0 18.22 English USA
## 532 6.0 11.12 English USA
## 533 35.0 82.35 English USA
## 534 6.0 11.12 English USA
## 588 0.5 1025.47 English USA
## 589 1.0 3.39 English USA
## 590 0.5 1025.47 English USA
Remove duplicated rows from Movie
# Keep only the first row encountered for each title; row names from Movie are
# carried over unchanged.
Movie_unique <- Movie[!duplicated(Movie[["original_title"]]), ]
Print the unique data set
print(head(Movie_unique, 10))
## original_title type genre runtime mpaa_rating
## 1 10,000 B.C. Feature Film Drama 134 R
## 2 102 Dalmatians Feature Film Drama 108 PG
## 3 2 Fast 2 Furious Feature Film Mystery & Suspense 97 PG-13
## 4 2012 Feature Film Comedy 98 PG-13
## 5 300: Rise of an Empire Feature Film Action & Adventure 111 PG-13
## 6 47 Ronin Feature Film Drama 106 PG
## 7 50 First Dates Feature Film Comedy 87 R
## 8 A Beautiful Mind Feature Film Action & Adventure 83 G
## 9 A Christmas Carol Feature Film Drama 100 R
## 10 A Good Day to Die Hard Feature Film Action & Adventure 86 R
## imdb_rating imdb_num_votes critics_rating critics_score audience_rating
## 1 6.8 9025 Fresh 60 Upright
## 2 4.9 5136 Rotten 5 Spilled
## 3 6.3 54771 Rotten 40 Spilled
## 4 6.3 8646 Certified Fresh 44 Spilled
## 5 6.0 103789 Rotten 51 Spilled
## 6 7.8 12450 Fresh 75 Spilled
## 7 5.4 6811 Rotten 35 Spilled
## 8 7.6 78862 Rotten 50 Upright
## 9 7.0 8320 Rotten 70 Spilled
## 10 4.1 739 Fresh 53 Upright
## audience_score best_pic_nom Facebook_Likes budget..Millions.
## 1 76 no 23343 12.6
## 2 13 no 84182 45.0
## 3 49 no 35296 16.0
## 4 54 no 445 3.5
## 5 51 no 21583 16.0
## 6 85 no 20965 50.0
## 7 31 no 12952 50.0
## 8 81 no 52827 26.0
## 9 74 no 48878 65.0
## 10 42 no 5481 8.5
## revenue..Millions. language country
## 1 18.66 English New Zealand
## 2 60.22 English USA
## 3 31.56 English USA
## 4 0.75 English USA
## 5 19.68 English USA
## 6 240.36 English USA
## 7 36.35 English USA
## 8 43.32 English USA
## 9 235.67 English USA
## 10 5.77 English USA
Check for rows with NA values in Movie_unique
# List every row of Movie_unique that has at least one missing value.
filter(Movie_unique, !complete.cases(Movie_unique))
## original_title type genre runtime mpaa_rating imdb_rating
## 1 Sex and the City 2 Documentary Documentary NA Unrated 4.3
## imdb_num_votes critics_rating critics_score audience_rating audience_score
## 1 739 Fresh 53 Upright 41
## best_pic_nom Facebook_Likes budget..Millions. revenue..Millions. language
## 1 no 44555 19.8 41.3 English
## country
## 1 USA
Replace the NA Value
# Fill the missing runtime with 146.
# NOTE(review): this overwrites *every* NA in runtime with the same value. It is
# only appropriate while "Sex and the City 2" is the sole incomplete row, as the
# complete.cases() check shows for the current data.
Movie_unique$runtime[is.na(Movie_unique$runtime)] <- 146
Show the result of Replace NA value
# Show the row whose runtime was just filled in.
# NOTE(review): 380 is a positional index (the row's name is 392, inherited from
# Movie). It will point at a different movie if the data or its order changes.
print(Movie_unique[380, ])
## original_title type genre runtime mpaa_rating imdb_rating
## 392 Sex and the City 2 Documentary Documentary 146 Unrated 4.3
## imdb_num_votes critics_rating critics_score audience_rating audience_score
## 392 739 Fresh 53 Upright 41
## best_pic_nom Facebook_Likes budget..Millions. revenue..Millions. language
## 392 no 44555 19.8 41.3 English
## country
## 392 USA
Summaries of runtime
summary(Movie_unique$runtime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 39 92 103 106 116 267
Summaries of imdb rating
summary(Movie_unique$imdb_rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.200 4.300 6.150 5.822 7.100 55.000
Summaries of critics score
summary(Movie_unique$critics_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 37.00 55.00 54.99 73.25 100.00
Summaries of audience score
summary(Movie_unique$audience_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 38.00 55.50 56.64 76.00 96.00
Summaries of Facebook Likes
summary(Movie_unique$Facebook_Likes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 7309 26125 64181 82797 555609
Summaries of budget
summary(Movie_unique$budget..Millions.)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.32 10.00 25.00 41.29 53.00 280.00
Summaries of revenue
summary(Movie_unique$revenue..Millions.)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01 21.57 62.48 137.52 154.25 2068.18
Convert the genre to new groupings
# Collapse the listed source genres into six broader groups (Adventure, Comedy,
# Drama, Thriller, Theater/International, Other). Any genre that is not listed
# is mapped to "Check" so unexpected values are easy to spot.
Movie_unique <- Movie_unique %>%
  mutate(
    New_Genre = case_when(
      genre == "Action & Adventure" ~ "Adventure",
      genre == "Animation" ~ "Other",
      genre == "Art House & International" ~ "Theater/International",
      genre == "Comedy" ~ "Comedy",
      genre == "Documentary" ~ "Other",
      genre == "Drama" ~ "Drama", # fixed: was misspelled "Drame"
      genre == "Horror" ~ "Thriller",
      genre == "Musical & Performing Arts" ~ "Theater/International",
      genre == "Mystery & Suspense" ~ "Thriller",
      genre == "Other" ~ "Other",
      genre == "Science Fiction & Fantasy" ~ "Adventure",
      TRUE ~ "Check"
    )
  )
Convert runtime to Different Group
# Bin runtime into four named groups: [39,60), [60,90), [90,120), [120,267].
# - The argument is `labels` (lower-case). The previous `Labels` did not match
#   any parameter of cut(), was swallowed by `...` and ignored, so the factor
#   levels came out as raw interval strings instead of the names below.
# - With right = FALSE the last interval is open on the right, so the longest
#   movie (runtime 267) fell outside every bin and became NA.
#   include.lowest = TRUE closes that last interval.
Movie_unique$runtimeGrp <- cut(
  Movie_unique$runtime,
  breaks = c(39, 60, 90, 120, 267),
  labels = c("Short Movie", "Special Movie", "General Movie", "Extremely Long Movie"),
  right = FALSE,
  include.lowest = TRUE
)
names(Movie_unique)
## [1] "original_title" "type" "genre"
## [4] "runtime" "mpaa_rating" "imdb_rating"
## [7] "imdb_num_votes" "critics_rating" "critics_score"
## [10] "audience_rating" "audience_score" "best_pic_nom"
## [13] "Facebook_Likes" "budget..Millions." "revenue..Millions."
## [16] "language" "country" "New_Genre"
## [19] "runtimeGrp"
head(Movie_unique$runtimeGrp)
## [1] [120,267) [90,120) [90,120) [90,120) [90,120) [90,120)
## Levels: [39,60) [60,90) [90,120) [120,267)
Amount of each runtime group
table(Movie_unique$runtimeGrp)
##
## [39,60) [60,90) [90,120) [120,267)
## 2 100 412 129
Visualization
Amount of each genre
# Bar chart of the number of movies per genre, with the count printed at the
# top of each bar and the x-axis labels rotated 45 degrees.
ggplot(Movie_unique, aes(x = genre, fill = genre)) +
  geom_bar(position = "dodge") +
  # after_stat(count) replaces the `..count..` notation deprecated in ggplot2 3.4.0.
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = 0,
    size = 5,
    color = "black"
  ) +
  labs(
    title = "Distribution of the number of movies by genre",
    x = "Genre",
    y = "Quantity"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Average IMDB Voting Number by Genre
# Mean IMDB vote count per genre, drawn as horizontal bars ordered by that mean
# and annotated with the value rounded to one decimal place.
Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_vote = mean(imdb_num_votes, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = reorder(genre, Average_vote), y = Average_vote)) +
  geom_col(fill = "lightblue") +
  geom_text(
    mapping = aes(label = round(Average_vote, 1)),
    size = 3,
    color = "black",
    hjust = 1
  ) +
  labs(
    title = "Average IMDB Voting Number by Genre",
    x = "Genre",
    y = "Average IMDB Voting Number"
  ) +
  coord_flip()

Average IMDB_rating by Genre
# Mean IMDB rating per genre, drawn as horizontal bars ordered by that mean and
# annotated with the value rounded to one decimal place.
Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_imdb = mean(imdb_rating, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = reorder(genre, Average_imdb), y = Average_imdb)) +
  geom_col(fill = "lightblue") +
  geom_text(
    mapping = aes(label = round(Average_imdb, 1)),
    size = 3,
    color = "black",
    hjust = 1
  ) +
  labs(
    title = "Average IMDB Rating by Genre",
    x = "Genre",
    y = "Average IMDB Rating"
  ) +
  coord_flip()

Total Number of Critics Rating by Genre
# Number of movies in each critics rating category, one panel per genre, with
# the count printed at the top of each bar.
ggplot(Movie_unique, aes(x = critics_rating, fill = critics_rating)) +
  geom_bar(position = "dodge") +
  # after_stat(count) replaces the `..count..` notation deprecated in ggplot2 3.4.0.
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = 0,
    size = 5,
    color = "black"
  ) +
  facet_wrap(~ genre) +
  labs(
    title = "Distribution of the number of Critics Rating by genre",
    # The x axis shows critics_rating (genre is the facet), so it is labelled as such.
    x = "Critics Rating",
    y = "Quantity"
  ) +
  theme_minimal()

Average Critic Score by Genre
# Mean critics score per genre, drawn as horizontal bars ordered by that mean
# and annotated with the value rounded to one decimal place.
Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_crit = mean(critics_score, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(genre, Average_crit), y = Average_crit)) +
  geom_col(fill = "lightblue") +
  geom_text(
    aes(label = round(Average_crit, 1)),
    hjust = 1,
    color = "black",
    size = 3
  ) +
  coord_flip() +
  labs(
    title = "Average Critics Score by Genre", # fixed typo: was "Crtics"
    x = "Genre",
    y = "Average Critics Score"
  )

Scatter plot: Critic Score vs Revenue
# Revenue against critics score, one panel per genre. Points are coloured by
# genre and each panel gets a black linear-model fit line without a
# confidence band.
ggplot(
  data = Movie_unique,
  mapping = aes(x = critics_score, y = revenue..Millions., color = genre)
) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "Black") +
  facet_wrap(~ genre) +
  scale_color_discrete(name = "Genre") +
  labs(title = "Critic Score vs Revenue", x = "Critic Score", y = "revenue..Millions.") +
  theme_minimal() +
  theme(legend.position = "right")
## `geom_smooth()` using formula = 'y ~ x'

Total Number of Audience Rating
# Number of movies in each audience rating category, one panel per genre, with
# the count printed at the top of each bar.
ggplot(Movie_unique, aes(x = audience_rating, fill = audience_rating)) +
  geom_bar(position = "dodge") +
  # after_stat(count) replaces the `..count..` notation deprecated in ggplot2 3.4.0.
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = 0,
    size = 5,
    color = "black"
  ) +
  # The `+` after facet_wrap() was missing, so labs() + theme_minimal() ran as
  # a separate expression (which printed NULL) and never reached the plot.
  facet_wrap(~ genre) +
  labs(
    title = "Distribution of the number of Audience Rating by genre",
    # The x axis shows audience_rating (genre is the facet), so it is labelled as such.
    x = "Audience Rating",
    y = "Quantity"
  ) +
  theme_minimal()
Average Audience Score by Genre
# Mean audience score per genre, drawn as horizontal bars ordered by that mean
# and annotated with the value rounded to one decimal place.
Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_aud = mean(audience_score, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = reorder(genre, Average_aud), y = Average_aud)) +
  geom_col(fill = "lightblue") +
  geom_text(
    mapping = aes(label = round(Average_aud, 1)),
    size = 3,
    color = "black",
    hjust = 1
  ) +
  labs(
    title = "Average Audience Score by Genre",
    x = "Genre",
    y = "Average Audience Score"
  ) +
  coord_flip()

Scatter plot: Audience Score vs Revenue
# Revenue against audience score, one panel per genre. Points are coloured by
# genre and each panel gets a black linear-model fit line without a
# confidence band.
ggplot(
  data = Movie_unique,
  mapping = aes(x = audience_score, y = revenue..Millions., color = genre)
) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "Black") +
  facet_wrap(~ genre) +
  scale_color_discrete(name = "Genre") +
  labs(title = "Audience Score vs Revenue", x = "Audience Score", y = "revenue..Millions.") +
  theme_minimal() +
  theme(legend.position = "right")
## `geom_smooth()` using formula = 'y ~ x'

Calculate average scores
# Mean critics score and mean audience score for each genre (one row per genre).
average_scores_by_genre <- Movie_unique %>%
  group_by(genre) %>%
  summarise(
    average_critics_score = mean(critics_score, na.rm = TRUE),
    average_audience_score = mean(audience_score, na.rm = TRUE)
  )

# Reshape to one row per genre and score type. The line chart that follows
# plots `long_data` with the columns Score_Type and Average_Score, but no
# visible code created that object.
# NOTE(review): if long_data is also built in a hidden chunk, confirm the two
# definitions agree (the Score_Type values here are the summary column names).
long_data <- average_scores_by_genre %>%
  pivot_longer(
    cols = c(average_critics_score, average_audience_score),
    names_to = "Score_Type",
    values_to = "Average_Score"
  )
Draw the line chart
# Line chart comparing the average score per genre, one line per score type.
# long_data must provide the columns genre, Score_Type and Average_Score.
ggplot(long_data, aes(x = genre, y = Average_Score, group = Score_Type, color = Score_Type)) +
  # `linewidth` replaces `size` for lines, which was deprecated in ggplot2 3.4.0.
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    title = "Average Critics Score and Audience Score by Genre",
    x = "Genre",
    y = "Average Score",
    color = "Score Type"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels for readability
    plot.title = element_text(hjust = 0.5) # Centre the title
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Average Facebook like by Genre
# Mean Facebook likes per genre, drawn as horizontal bars ordered by that mean
# and annotated with the value rounded to one decimal place.
Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_face = mean(Facebook_Likes, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = reorder(genre, Average_face), y = Average_face)) +
  geom_col(fill = "lightblue") +
  geom_text(
    mapping = aes(label = round(Average_face, 1)),
    size = 3,
    color = "black",
    hjust = 1
  ) +
  labs(
    title = "Average Facebook Likes by Genre",
    x = "Genre",
    y = "Average Facebook Likes"
  ) +
  coord_flip()

Average Budget…Millions by Genre
# Mean budget per genre, drawn as horizontal bars ordered by that mean and
# annotated with the value rounded to one decimal place.
Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_budget = mean(budget..Millions., na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = reorder(genre, Average_budget), y = Average_budget)) +
  geom_col(fill = "lightblue") +
  geom_text(
    mapping = aes(label = round(Average_budget, 1)),
    size = 3,
    color = "black",
    hjust = 1
  ) +
  labs(
    title = "Average Budget(Millions) by Genre",
    x = "Genre",
    y = "Average Budget(Millions)"
  ) +
  coord_flip()

Average Revenue…Millions by Genre
# Mean revenue per genre, drawn as horizontal bars ordered by that mean and
# annotated with the value rounded to one decimal place.
Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_revenue = mean(revenue..Millions., na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = reorder(genre, Average_revenue), y = Average_revenue)) +
  geom_col(fill = "lightblue") +
  geom_text(
    mapping = aes(label = round(Average_revenue, 1)),
    size = 3,
    color = "black",
    hjust = 1
  ) +
  labs(
    title = "Average Revenue(Millions) by Genre",
    x = "Genre",
    y = "Average Revenue(Millions)"
  ) +
  coord_flip()

Scatter plot: Budget vs Revenue
# Add natural-log versions of budget and revenue as new columns.
Movie_unique <- Movie_unique %>%
  mutate(
    budget_log = log(budget..Millions.),
    revenue_log = log(revenue..Millions.)
  )

# Log revenue against log budget, one panel per genre, each with a black
# linear-model fit line without a confidence band.
ggplot(Movie_unique, aes(x = budget_log, y = revenue_log, color = genre)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "Black") +
  labs(
    title = "Budget vs Revenue",
    # The plotted values are logs, so the axes say so; the old labels
    # presented them as raw millions.
    x = "log(Budget, Millions)",
    y = "log(Revenue, Millions)"
  ) +
  facet_wrap(~ genre) +
  theme_minimal() +
  theme(legend.position = "right") +
  scale_color_discrete(name = "Genre")
## `geom_smooth()` using formula = 'y ~ x'

Create Revenue Ratio by Revenue Divided By Budget
# Revenue divided by budget for each movie, stored as a new column.
Movie_unique[["Revenue_Ratio"]] <-
  Movie_unique[["revenue..Millions."]] / Movie_unique[["budget..Millions."]]
Average Revenue Ratio by Genre
# Mean revenue-to-budget ratio per genre, drawn as horizontal bars ordered by
# that mean and annotated with the value rounded to one decimal place.
Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_ratio = mean(Revenue_Ratio, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = reorder(genre, Average_ratio), y = Average_ratio)) +
  geom_col(fill = "lightblue") +
  geom_text(
    mapping = aes(label = round(Average_ratio, 1)),
    size = 3,
    color = "black",
    hjust = 1
  ) +
  labs(
    title = "Average Revenue Ratio by Genre",
    x = "Genre",
    y = "Average Revenue Ratio"
  ) +
  coord_flip()

t.test
table(Movie_unique$audience_rating)
##
## Spilled Upright
## 220 424
Independent sample t-test
Visually see the mean difference.
boxplot(Movie_unique$audience_score ~ Movie_unique$audience_rating, col=c(3, 6)) # visually see the mean difference.

Normality assumption on the dependent variable
# Shapiro-Wilk normality test on audience_score, pooled over all movies.
# NOTE(review): for an independent-samples t-test the normality assumption is
# usually checked within each audience_rating group rather than on the pooled
# values - confirm this is the intended check.
shapiro.test(Movie_unique$audience_score)
##
## Shapiro-Wilk normality test
##
## data: Movie_unique$audience_score
## W = 0.95384, p-value = 2.542e-13
Select relevant columns for analysis
# Columns carried into the analysis data set, in the order they will appear.
columns_of_interest <- c(
  "revenue..Millions.",
  "runtime",
  "mpaa_rating",
  "imdb_rating",
  "critics_score",
  "audience_score",
  "budget..Millions.",
  "Facebook_Likes",
  "genre",
  "type"
)
# Subset Movie_unique to just those columns.
analysis_data <- Movie_unique[columns_of_interest]
Rename columns for convenience
# Give the analysis columns short names. The order must match the order of
# the columns selected into analysis_data.
names(analysis_data) <- c(
  "revenue",
  "runtime",
  "mpaa_rating",
  "imdb_rating",
  "critics_score",
  "audience_score",
  "budget",
  "facebook_likes",
  "genre",
  "type"
)
Remove missing values
analysis_data <- na.omit(analysis_data)
Print ANOVA results
print(anova_results)
## $mpaa_rating
## Df Sum Sq Mean Sq F value Pr(>F)
## get(category) 5 295340 59068 1.365 0.235
## Residuals 638 27602236 43264
##
## $genre
## Df Sum Sq Mean Sq F value Pr(>F)
## get(category) 10 251467 25147 0.576 0.834
## Residuals 633 27646108 43675
##
## $type
## Df Sum Sq Mean Sq F value Pr(>F)
## get(category) 2 60599 30299 0.698 0.498
## Residuals 641 27836977 43427
Print simple regression results
print(simple_models)
## $runtime
##
## Call:
## lm(formula = formula, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.85 -117.27 -75.04 18.69 1922.79
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 112.2705 45.2562 2.481 0.0134 *
## runtime 0.2382 0.4199 0.567 0.5707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 208.4 on 642 degrees of freedom
## Multiple R-squared: 0.0005011, Adjusted R-squared: -0.001056
## F-statistic: 0.3218 on 1 and 642 DF, p-value: 0.5707
##
##
## $imdb_rating
##
## Call:
## lm(formula = formula, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -807.87 -95.36 -66.60 4.41 1909.30
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.305 17.430 2.714 0.00683 **
## imdb_rating 15.496 2.659 5.827 8.95e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 203.2 on 642 degrees of freedom
## Multiple R-squared: 0.05023, Adjusted R-squared: 0.04875
## F-statistic: 33.95 on 1 and 642 DF, p-value: 8.948e-09
##
##
## $critics_score
##
## Call:
## lm(formula = formula, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -283.42 -93.01 -45.42 31.03 1842.10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -57.2559 19.5196 -2.933 0.00347 **
## critics_score 3.5418 0.3273 10.822 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 191.7 on 642 degrees of freedom
## Multiple R-squared: 0.1543, Adjusted R-squared: 0.153
## F-statistic: 117.1 on 1 and 642 DF, p-value: < 2.2e-16
##
##
## $audience_score
##
## Call:
## lm(formula = formula, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -276.71 -93.98 -27.05 25.24 1845.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -112.1230 21.2655 -5.273 1.84e-07 ***
## audience_score 4.4077 0.3522 12.514 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 186.9 on 642 degrees of freedom
## Multiple R-squared: 0.1961, Adjusted R-squared: 0.1948
## F-statistic: 156.6 on 1 and 642 DF, p-value: < 2.2e-16
##
##
## $budget
##
## Call:
## lm(formula = formula, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -410.14 -57.05 -22.30 18.22 1443.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.7148 8.0908 1.324 0.186
## budget 3.0710 0.1308 23.481 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 152.9 on 642 degrees of freedom
## Multiple R-squared: 0.462, Adjusted R-squared: 0.4612
## F-statistic: 551.4 on 1 and 642 DF, p-value: < 2.2e-16
##
##
## $facebook_likes
##
## Call:
## lm(formula = formula, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -433.66 -70.58 -42.46 54.54 1774.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.998e+01 7.275e+00 5.495 5.63e-08 ***
## facebook_likes 1.520e-03 6.412e-05 23.701 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 152.2 on 642 degrees of freedom
## Multiple R-squared: 0.4667, Adjusted R-squared: 0.4658
## F-statistic: 561.7 on 1 and 642 DF, p-value: < 2.2e-16
Print summary of the multiple regression model
summary(multi_model)
##
## Call:
## lm(formula = revenue ~ runtime + imdb_rating + critics_score +
## audience_score + budget + facebook_likes, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -301.95 -52.72 -6.12 29.52 1511.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.803e+01 2.967e+01 -2.293 0.0222 *
## runtime -2.139e-01 2.539e-01 -0.843 0.3997
## imdb_rating 8.073e-01 1.872e+00 0.431 0.6665
## critics_score 8.109e-01 3.381e-01 2.399 0.0167 *
## audience_score 7.302e-01 4.048e-01 1.804 0.0717 .
## budget 1.837e+00 1.284e-01 14.311 <2e-16 ***
## facebook_likes 9.616e-04 6.169e-05 15.586 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 125.3 on 637 degrees of freedom
## Multiple R-squared: 0.6416, Adjusted R-squared: 0.6383
## F-statistic: 190.1 on 6 and 637 DF, p-value: < 2.2e-16
select the values with “Adventure”
stringr::str_detect(Movie_unique$New_Genre, "Adventure") # select the values with "Adventure"
## [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] TRUE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE
## [85] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [97] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [205] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [301] FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [325] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [517] FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [541] FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
head(Movie_unique[str_detect(Movie_unique$New_Genre, "Adventure"),], 10) # just see the dataframe
## original_title type genre
## 5 300: Rise of an Empire Feature Film Action & Adventure
## 8 A Beautiful Mind Feature Film Action & Adventure
## 10 A Good Day to Die Hard Feature Film Action & Adventure
## 26 American Gangster Feature Film Science Fiction & Fantasy
## 39 AVP: Alien vs. Predator Feature Film Action & Adventure
## 49 Batman Returns Feature Film Science Fiction & Fantasy
## 50 Batman v Superman: Dawn of Justice Feature Film Action & Adventure
## 52 Battlefield Earth Feature Film Action & Adventure
## 56 Bee Movie Feature Film Action & Adventure
## 70 Bruce Almighty Feature Film Action & Adventure
## runtime mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score
## 5 111 PG-13 6.0 103789 Rotten 51
## 8 83 G 7.6 78862 Rotten 50
## 10 86 R 4.1 739 Fresh 53
## 26 113 R 2.0 9216 Rotten 34
## 39 81 R 7.6 123769 Fresh 72
## 49 85 R 2.1 122980 Certified Fresh 33
## 50 127 PG 6.8 71979 Certified Fresh 89
## 52 93 R 6.3 42295 Rotten 25
## 56 92 PG 5.6 205065 Rotten 37
## 70 85 PG 6.0 1680 Rotten 33
## audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 5 Spilled 51 no 21583 16.0
## 8 Upright 81 no 52827 26.0
## 10 Upright 42 no 5481 8.5
## 26 Upright 24 no 1354 4.9
## 39 Upright 87 no 335227 140.0
## 49 Upright 31 no 22899 90.0
## 50 Upright 75 no 124450 245.0
## 52 Spilled 59 no 971 20.0
## 56 Spilled 51 no 106576 10.0
## 70 Upright 65 no 121276 15.0
## revenue..Millions. language country New_Genre runtimeGrp budget_log
## 5 19.68 English USA Adventure [90,120) 2.772589
## 8 43.32 English USA Adventure [60,90) 3.258097
## 10 5.77 English USA Adventure [60,90) 2.140066
## 26 3.49 English USA Adventure [90,120) 1.589235
## 39 449.22 English USA Adventure [60,90) 4.941642
## 49 16.18 English USA Adventure [60,90) 4.499810
## 50 880.67 English USA Adventure [120,267) 5.501258
## 52 34.56 English USA Adventure [90,120) 2.995732
## 56 85.98 English USA Adventure [90,120) 2.302585
## 70 75.70 English USA Adventure [60,90) 2.708050
## revenue_log Revenue_Ratio
## 5 2.979603 1.2300000
## 8 3.768614 1.6661538
## 10 1.752672 0.6788235
## 26 1.249902 0.7122449
## 39 6.107513 3.2087143
## 49 2.783776 0.1797778
## 50 6.780683 3.5945714
## 52 3.542697 1.7280000
## 56 4.454115 8.5980000
## 70 4.326778 5.0466667
# Logical mask over the rows of Movie_unique: TRUE where New_Genre matches
# "Adventure". This is a logical vector, not a data frame.
Movie_Genre_Adventure <- str_detect(
  string = Movie_unique$New_Genre,
  pattern = "Adventure"
)
print(Movie_Genre_Adventure) # show the full mask
## [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] TRUE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE
## [85] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [97] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [205] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [301] FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [325] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [517] FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [541] FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Keep only the rows flagged TRUE by the Adventure mask, then preview the
# first 10 rows of the resulting subset.
subadventure <- Movie_unique[Movie_Genre_Adventure, ]
head(x = subadventure, n = 10)
## original_title type genre
## 5 300: Rise of an Empire Feature Film Action & Adventure
## 8 A Beautiful Mind Feature Film Action & Adventure
## 10 A Good Day to Die Hard Feature Film Action & Adventure
## 26 American Gangster Feature Film Science Fiction & Fantasy
## 39 AVP: Alien vs. Predator Feature Film Action & Adventure
## 49 Batman Returns Feature Film Science Fiction & Fantasy
## 50 Batman v Superman: Dawn of Justice Feature Film Action & Adventure
## 52 Battlefield Earth Feature Film Action & Adventure
## 56 Bee Movie Feature Film Action & Adventure
## 70 Bruce Almighty Feature Film Action & Adventure
## runtime mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score
## 5 111 PG-13 6.0 103789 Rotten 51
## 8 83 G 7.6 78862 Rotten 50
## 10 86 R 4.1 739 Fresh 53
## 26 113 R 2.0 9216 Rotten 34
## 39 81 R 7.6 123769 Fresh 72
## 49 85 R 2.1 122980 Certified Fresh 33
## 50 127 PG 6.8 71979 Certified Fresh 89
## 52 93 R 6.3 42295 Rotten 25
## 56 92 PG 5.6 205065 Rotten 37
## 70 85 PG 6.0 1680 Rotten 33
## audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 5 Spilled 51 no 21583 16.0
## 8 Upright 81 no 52827 26.0
## 10 Upright 42 no 5481 8.5
## 26 Upright 24 no 1354 4.9
## 39 Upright 87 no 335227 140.0
## 49 Upright 31 no 22899 90.0
## 50 Upright 75 no 124450 245.0
## 52 Spilled 59 no 971 20.0
## 56 Spilled 51 no 106576 10.0
## 70 Upright 65 no 121276 15.0
## revenue..Millions. language country New_Genre runtimeGrp budget_log
## 5 19.68 English USA Adventure [90,120) 2.772589
## 8 43.32 English USA Adventure [60,90) 3.258097
## 10 5.77 English USA Adventure [60,90) 2.140066
## 26 3.49 English USA Adventure [90,120) 1.589235
## 39 449.22 English USA Adventure [60,90) 4.941642
## 49 16.18 English USA Adventure [60,90) 4.499810
## 50 880.67 English USA Adventure [120,267) 5.501258
## 52 34.56 English USA Adventure [90,120) 2.995732
## 56 85.98 English USA Adventure [90,120) 2.302585
## 70 75.70 English USA Adventure [60,90) 2.708050
## revenue_log Revenue_Ratio
## 5 2.979603 1.2300000
## 8 3.768614 1.6661538
## 10 1.752672 0.6788235
## 26 1.249902 0.7122449
## 39 6.107513 3.2087143
## 49 2.783776 0.1797778
## 50 6.780683 3.5945714
## 52 3.542697 1.7280000
## 56 4.454115 8.5980000
## 70 4.326778 5.0466667
Keep only numeric columns for correlation and analysis
# Numeric columns of the Adventure subset; this is the input to cor() below.
# select_if() is a superseded scoped verb in dplyr; select(where(...)) is the
# current form and picks exactly the same columns.
numeric_data_adv <- subadventure %>%
  select(where(is.numeric))
Correlation Analysis
# Pairwise correlations between the numeric Adventure columns, computed only
# on rows with no missing values.
correlation_matrix <- numeric_data_adv %>%
  cor(use = "complete.obs")
# Take the revenue row of the matrix and rank it from the largest correlation
# down.
# NOTE(review): the ranking includes revenue itself (always 1), and judging by
# the row previews earlier in this report, revenue_log and Revenue_Ratio look
# derived from revenue -- confirm whether they belong in this ranking.
revenue_correlation_adv <- correlation_matrix["revenue..Millions.", ]
sorted_correlation_adv <- revenue_correlation_adv %>%
  sort(decreasing = TRUE)
print("Top correlated variables with Revenue across Genre Adventure:")
## [1] "Top correlated variables with Revenue across Genre Adventure:"
print(sorted_correlation_adv)
## revenue..Millions. budget..Millions. revenue_log Facebook_Likes
## 1.0000000 0.7148536 0.7094601 0.6194620
## audience_score imdb_rating budget_log Revenue_Ratio
## 0.4825824 0.4743989 0.4100942 0.3841060
## runtime critics_score imdb_num_votes
## 0.3536316 0.3447954 0.1652981
Variable Importance with Random Forest
Prepare data for random forest
# Modelling table for the Adventure random forest: revenue (the response) plus
# five candidate predictors, with any incomplete rows removed.
rf_data_adv <- na.omit(
  select(
    subadventure,
    revenue..Millions., imdb_rating, critics_score, audience_score,
    budget..Millions., Facebook_Likes
  )
)
Fit random forest model
# Fix the RNG seed so the fitted forest, and the importance ranking read from
# it, is the same on every run. Without a seed the numbers change between
# runs, so previously printed output cannot be reproduced.
set.seed(123)
# Regress revenue on every other column of rf_data_adv; importance = TRUE asks
# randomForest to record the predictor importance measures used below.
rf_model_adv <- randomForest(
  revenue..Millions. ~ .,
  data = rf_data_adv,
  importance = TRUE
)
Get variable importance
# Pull the importance matrix out of the fitted forest and sort its rows by the
# first column, %IncMSE, largest first.
importance_scores_adv <- importance(rf_model_adv)
importance_scores_adv <- importance_scores_adv[
  order(-importance_scores_adv[, "%IncMSE"]),
]
print("Variable Importance from Random Forest:")
## [1] "Variable Importance from Random Forest:"
print(importance_scores_adv)
## %IncMSE IncNodePurity
## Facebook_Likes 15.4186666 507911.9
## budget..Millions. 12.6820126 394258.3
## audience_score 8.6969703 270776.1
## imdb_rating 6.8233853 226472.2
## critics_score 0.3957676 250839.8
Visualization 1: Facebook_Likes across Adventure
# Box plot of Facebook likes for the Adventure subset; outliers drawn in red.
ggplot(
  data = subadventure,
  mapping = aes(x = New_Genre, y = Facebook_Likes)
) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  labs(
    title = "Distribution of Facebook Likes Across Adventure",
    x = "Adventure",
    y = "Facebook Likes"
  ) +
  theme_minimal() +
  # theme() must come after theme_minimal(), which would otherwise reset it.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visualization 2: Budget across Adventure
# Box plot of budget (in millions) for the Adventure subset; outliers in red.
ggplot(
  data = subadventure,
  mapping = aes(x = New_Genre, y = budget..Millions.)
) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  labs(
    title = "Distribution of Budget (Millions) Across Adventure",
    x = "Adventure",
    y = "budget..Millions."
  ) +
  theme_minimal() +
  # theme() must come after theme_minimal(), which would otherwise reset it.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Select the rows whose New_Genre contains “Comedy”
# Logical vector with one entry per row of Movie_unique: TRUE where New_Genre
# matches "Comedy". Printed in full; nothing is selected or stored here.
stringr::str_detect(
  string = Movie_unique$New_Genre,
  pattern = "Comedy"
)
## [1] FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [85] TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [157] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [169] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [217] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## [289] TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [349] FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [397] FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [433] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [541] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [553] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [577] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [589] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [625] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Preview the first 10 rows of Movie_unique whose New_Genre matches "Comedy".
head(
  x = Movie_unique[
    str_detect(string = Movie_unique$New_Genre, pattern = "Comedy"),
  ],
  n = 10
)
## original_title type genre runtime
## 4 2012 Feature Film Comedy 98
## 7 50 First Dates Feature Film Comedy 87
## 17 Alexander Feature Film Comedy 89
## 24 Alvin and the Chipmunks: The Road Chip Feature Film Comedy 86
## 31 Around the World in 80 Days Feature Film Comedy 97
## 32 Arthur and the Invisibles Feature Film Comedy 94
## 36 Australia Feature Film Comedy 90
## 43 Ballistic: Ecks vs. Sever Feature Film Comedy 96
## 62 Big Hero 6 Feature Film Comedy 100
## 79 Cats & Dogs: The Revenge of Kitty Galore Feature Film Comedy 119
## mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score
## 4 PG-13 6.3 8646 Certified Fresh 44
## 7 R 5.4 6811 Rotten 35
## 17 PG-13 8.5 893008 Certified Fresh 88
## 24 PG-13 2.1 9904 Fresh 11
## 31 PG-13 6.9 12606 Certified Fresh 85
## 32 PG 5.1 1674 Rotten 50
## 36 PG 7.2 44741 Fresh 76
## 43 PG 6.2 12402 Fresh 63
## 62 R 5.9 82737 Rotten 46
## 79 PG-13 6.3 124250 Certified Fresh 75
## audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 4 Spilled 54 no 445 3.5
## 7 Spilled 31 no 12952 50.0
## 17 Upright 91 no 224598 74.0
## 24 Upright 22 no 3450 6.0
## 31 Upright 69 no 103175 45.0
## 32 Spilled 38 no 683 18.5
## 36 Upright 80 no 921 2.5
## 43 Spilled 53 no 1846 68.0
## 62 Spilled 59 no 62963 8.5
## 79 Spilled 52 no 3326 38.0
## revenue..Millions. language country New_Genre runtimeGrp budget_log
## 4 0.75 English USA Comedy [90,120) 1.2527630
## 7 36.35 English USA Comedy [60,90) 3.9120230
## 17 311.59 English Germany Comedy [60,90) 4.3040651
## 24 3.02 English USA Comedy [60,90) 1.7917595
## 31 86.17 English USA Comedy [90,120) 3.8066625
## 32 31.91 English France Comedy [90,120) 2.9177707
## 36 91.38 English Australia Comedy [90,120) 0.9162907
## 43 203.39 English USA Comedy [90,120) 4.2195077
## 62 43.53 English USA Comedy [90,120) 2.1400662
## 79 105.61 English USA Comedy [90,120) 3.6375862
## revenue_log Revenue_Ratio
## 4 -0.2876821 0.2142857
## 7 3.5931942 0.7270000
## 17 5.7416882 4.2106757
## 24 1.1052568 0.5033333
## 31 4.4563221 1.9148889
## 32 3.4629194 1.7248649
## 36 4.5150266 36.5520000
## 43 5.3151253 2.9910294
## 62 3.7734504 5.1211765
## 79 4.6597531 2.7792105
# Logical mask over the rows of Movie_unique: TRUE where New_Genre matches
# "Comedy". This is a logical vector, not a data frame.
Movie_Genre_Comedy <- str_detect(
  string = Movie_unique$New_Genre,
  pattern = "Comedy"
)
print(Movie_Genre_Comedy) # show the full mask
## [1] FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [85] TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [157] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [169] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [217] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## [289] TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [349] FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [397] FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [433] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [541] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [553] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [577] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [589] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [625] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Keep only the rows flagged TRUE by the Comedy mask, then preview the first
# 10 rows of the resulting subset.
subcomedy <- Movie_unique[Movie_Genre_Comedy, ]
head(x = subcomedy, n = 10)
## original_title type genre runtime
## 4 2012 Feature Film Comedy 98
## 7 50 First Dates Feature Film Comedy 87
## 17 Alexander Feature Film Comedy 89
## 24 Alvin and the Chipmunks: The Road Chip Feature Film Comedy 86
## 31 Around the World in 80 Days Feature Film Comedy 97
## 32 Arthur and the Invisibles Feature Film Comedy 94
## 36 Australia Feature Film Comedy 90
## 43 Ballistic: Ecks vs. Sever Feature Film Comedy 96
## 62 Big Hero 6 Feature Film Comedy 100
## 79 Cats & Dogs: The Revenge of Kitty Galore Feature Film Comedy 119
## mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score
## 4 PG-13 6.3 8646 Certified Fresh 44
## 7 R 5.4 6811 Rotten 35
## 17 PG-13 8.5 893008 Certified Fresh 88
## 24 PG-13 2.1 9904 Fresh 11
## 31 PG-13 6.9 12606 Certified Fresh 85
## 32 PG 5.1 1674 Rotten 50
## 36 PG 7.2 44741 Fresh 76
## 43 PG 6.2 12402 Fresh 63
## 62 R 5.9 82737 Rotten 46
## 79 PG-13 6.3 124250 Certified Fresh 75
## audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 4 Spilled 54 no 445 3.5
## 7 Spilled 31 no 12952 50.0
## 17 Upright 91 no 224598 74.0
## 24 Upright 22 no 3450 6.0
## 31 Upright 69 no 103175 45.0
## 32 Spilled 38 no 683 18.5
## 36 Upright 80 no 921 2.5
## 43 Spilled 53 no 1846 68.0
## 62 Spilled 59 no 62963 8.5
## 79 Spilled 52 no 3326 38.0
## revenue..Millions. language country New_Genre runtimeGrp budget_log
## 4 0.75 English USA Comedy [90,120) 1.2527630
## 7 36.35 English USA Comedy [60,90) 3.9120230
## 17 311.59 English Germany Comedy [60,90) 4.3040651
## 24 3.02 English USA Comedy [60,90) 1.7917595
## 31 86.17 English USA Comedy [90,120) 3.8066625
## 32 31.91 English France Comedy [90,120) 2.9177707
## 36 91.38 English Australia Comedy [90,120) 0.9162907
## 43 203.39 English USA Comedy [90,120) 4.2195077
## 62 43.53 English USA Comedy [90,120) 2.1400662
## 79 105.61 English USA Comedy [90,120) 3.6375862
## revenue_log Revenue_Ratio
## 4 -0.2876821 0.2142857
## 7 3.5931942 0.7270000
## 17 5.7416882 4.2106757
## 24 1.1052568 0.5033333
## 31 4.4563221 1.9148889
## 32 3.4629194 1.7248649
## 36 4.5150266 36.5520000
## 43 5.3151253 2.9910294
## 62 3.7734504 5.1211765
## 79 4.6597531 2.7792105
Keep only numeric columns for correlation and analysis
# Numeric columns of the Comedy subset; this is the input to cor() below.
# select_if() is a superseded scoped verb in dplyr; select(where(...)) is the
# current form and picks exactly the same columns.
numeric_data_com <- subcomedy %>%
  select(where(is.numeric))
Correlation Analysis
# Pairwise correlations between the numeric Comedy columns, computed only on
# rows with no missing values.
correlation_matrix_com <- numeric_data_com %>%
  cor(use = "complete.obs")
# Take the revenue row of the matrix and rank it from the largest correlation
# down.
# NOTE(review): the ranking includes revenue itself (always 1), and judging by
# the row previews earlier in this report, revenue_log and Revenue_Ratio look
# derived from revenue -- confirm whether they belong in this ranking.
revenue_correlation_com <- correlation_matrix_com["revenue..Millions.", ]
sorted_correlation_com <- revenue_correlation_com %>%
  sort(decreasing = TRUE)
print("Top correlated variables with Revenue Across Genre Comedy:")
## [1] "Top correlated variables with Revenue Across Genre Comedy:"
print(sorted_correlation_com)
## revenue..Millions. budget..Millions. Facebook_Likes revenue_log
## 1.00000000 0.87312360 0.74741797 0.69955478
## budget_log audience_score critics_score imdb_rating
## 0.58395817 0.57127640 0.53402017 0.50153462
## imdb_num_votes runtime Revenue_Ratio
## 0.12463494 0.09867118 0.03288816
Variable Importance with Random Forest
Prepare data for random forest
# Modelling table for the Comedy random forest: revenue (the response) plus
# five candidate predictors, with any incomplete rows removed.
rf_data_com <- na.omit(
  select(
    subcomedy,
    revenue..Millions., imdb_rating, critics_score, audience_score,
    budget..Millions., Facebook_Likes
  )
)
Fit random forest model
# Fix the RNG seed so the fitted forest, and the importance ranking read from
# it, is the same on every run. Without a seed the numbers change between
# runs, so previously printed output cannot be reproduced.
set.seed(123)
# Regress revenue on every other column of rf_data_com; importance = TRUE asks
# randomForest to record the predictor importance measures used below.
rf_model_com <- randomForest(
  revenue..Millions. ~ .,
  data = rf_data_com,
  importance = TRUE
)
Get variable importance
# Pull the importance matrix out of the fitted forest and sort its rows by the
# first column, %IncMSE, largest first.
importance_scores_com <- importance(rf_model_com)
importance_scores_com <- importance_scores_com[
  order(-importance_scores_com[, "%IncMSE"]),
]
print("Variable Importance from Random Forest:")
## [1] "Variable Importance from Random Forest:"
print(importance_scores_com)
## %IncMSE IncNodePurity
## budget..Millions. 13.292518 684750.1
## Facebook_Likes 10.406806 631198.5
## imdb_rating 7.218773 488452.7
## critics_score 4.747263 467247.7
## audience_score 4.382078 412382.3
Visualization 1: Facebook_Likes across Comedy
# Box plot of Facebook likes for the Comedy subset; outliers drawn in red.
ggplot(
  data = subcomedy,
  mapping = aes(x = New_Genre, y = Facebook_Likes)
) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  labs(
    title = "Distribution of Facebook Likes Across Comedy",
    x = "Comedy",
    y = "Facebook Likes"
  ) +
  theme_minimal() +
  # theme() must come after theme_minimal(), which would otherwise reset it.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visualization 2: Budget across Comedy
# Box plot of budget (in millions) for the Comedy subset; outliers in red.
ggplot(
  data = subcomedy,
  mapping = aes(x = New_Genre, y = budget..Millions.)
) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  labs(
    title = "Distribution of Budget (Millions) Across Comedy",
    x = "Comedy",
    y = "budget..Millions."
  ) +
  theme_minimal() +
  # theme() must come after theme_minimal(), which would otherwise reset it.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Select the rows whose New_Genre contains “Drame” (the spelling used in the New_Genre column for Drama titles)
# Logical vector with one entry per row of Movie_unique: TRUE where New_Genre
# matches "Drame". "Drame" is the literal value stored in New_Genre, so the
# pattern must keep that spelling. Printed in full; nothing is stored here.
stringr::str_detect(
  string = Movie_unique$New_Genre,
  pattern = "Drame"
)
## [1] TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE
## [13] FALSE FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE
## [25] FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [37] TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE
## [61] FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE TRUE
## [73] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [85] FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE
## [109] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE
## [121] TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE
## [133] FALSE TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE
## [157] FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [169] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## [181] FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE
## [193] FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [205] TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
## [229] TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [241] FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## [253] TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE
## [265] TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE
## [277] FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
## [289] FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE
## [301] TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [313] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [325] TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE
## [337] TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE
## [349] TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [361] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [373] FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE
## [385] TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [397] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
## [409] TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE
## [421] FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE
## [433] FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
## [445] FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE
## [457] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## [469] FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE
## [481] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## [493] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE TRUE
## [505] FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [517] TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
## [529] FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE
## [553] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [565] TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE
## [577] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [589] TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE
## [601] TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE
## [613] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [625] FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [637] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
# Preview the first 10 rows of Movie_unique whose New_Genre matches "Drame"
# (the spelling stored in that column).
head(
  x = Movie_unique[
    str_detect(string = Movie_unique$New_Genre, pattern = "Drame"),
  ],
  n = 10
)
## original_title type genre runtime mpaa_rating
## 1 10,000 B.C. Feature Film Drama 134 R
## 2 102 Dalmatians Feature Film Drama 108 PG
## 6 47 Ronin Feature Film Drama 106 PG
## 9 A Christmas Carol Feature Film Drama 100 R
## 11 A Sound of Thunder Feature Film Drama 95 R
## 15 Agora Feature Film Drama 100 R
## 16 Air Force One Feature Film Drama 93 R
## 18 Ali Feature Film Drama 112 R
## 20 Alice Through the Looking Glass Feature Film Drama 90 R
## 22 Allegiant Feature Film Drama 85 PG
## imdb_rating imdb_num_votes critics_rating critics_score audience_rating
## 1 6.8 9025 Fresh 60 Upright
## 2 4.9 5136 Rotten 5 Spilled
## 6 7.8 12450 Fresh 75 Spilled
## 9 7.0 8320 Rotten 70 Spilled
## 11 6.2 1935 Rotten 35 Upright
## 15 6.1 13682 Rotten 54 Spilled
## 16 7.0 36909 Fresh 78 Upright
## 18 7.8 246587 Certified Fresh 93 Upright
## 20 5.7 390 Rotten 29 Spilled
## 22 7.8 26628 Fresh 76 Upright
## audience_score best_pic_nom Facebook_Likes budget..Millions.
## 1 76 no 23343 12.6
## 2 13 no 84182 45.0
## 6 85 no 20965 50.0
## 9 74 no 48878 65.0
## 11 70 no 41890 50.0
## 15 37 no 101829 48.0
## 16 71 no 23603 52.0
## 18 87 no 14196 12.0
## 20 25 no 80806 12.0
## 22 80 no 12452 100.0
## revenue..Millions. language country New_Genre runtimeGrp budget_log
## 1 18.66 English New Zealand Drame [120,267) 2.533697
## 2 60.22 English USA Drame [90,120) 3.806662
## 6 240.36 English USA Drame [90,120) 3.912023
## 9 235.67 English USA Drame [90,120) 4.174387
## 11 40.83 English UK Drame [90,120) 3.912023
## 15 85.50 English Spain Drame [90,120) 3.871201
## 16 216.49 English USA Drame [90,120) 3.951244
## 18 305.15 English USA Drame [90,120) 2.484907
## 20 32.25 English USA Drame [90,120) 2.484907
## 22 243.40 English USA Drame [60,90) 4.605170
## revenue_log Revenue_Ratio
## 1 2.926382 1.480952
## 2 4.098005 1.338222
## 6 5.482138 4.807200
## 9 5.462433 3.625692
## 11 3.709417 0.816600
## 15 4.448516 1.781250
## 16 5.377544 4.163269
## 18 5.720803 25.429167
## 20 3.473518 2.687500
## 22 5.494706 2.434000
# Logical mask with one entry per row of Movie_unique: TRUE where New_Genre
# matches "Drame". This is a logical vector, not a data frame.
Movie_Genre_Drame <- str_detect(
  string = Movie_unique$New_Genre,
  pattern = "Drame"
)
# Display the mask.
print(Movie_Genre_Drame)
## [1] TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE
## [13] FALSE FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE
## [25] FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [37] TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE
## [61] FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE TRUE
## [73] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [85] FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE
## [109] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE
## [121] TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE
## [133] FALSE TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE
## [157] FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [169] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## [181] FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE
## [193] FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [205] TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
## [229] TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [241] FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## [253] TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE
## [265] TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE
## [277] FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
## [289] FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE
## [301] TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [313] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [325] TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE
## [337] TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE
## [349] TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [361] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [373] FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE
## [385] TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [397] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
## [409] TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE
## [421] FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE
## [433] FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
## [445] FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE
## [457] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## [469] FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE
## [481] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## [493] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE TRUE
## [505] FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [517] TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
## [529] FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE
## [553] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [565] TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE
## [577] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [589] TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE
## [601] TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE
## [613] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [625] FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [637] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
# Keep only the rows flagged TRUE in Movie_Genre_Drame (all columns retained).
subdrama <- Movie_unique[Movie_Genre_Drame, ]
# Show the first 10 rows of the subset.
head(subdrama, n = 10)
## original_title type genre runtime mpaa_rating
## 1 10,000 B.C. Feature Film Drama 134 R
## 2 102 Dalmatians Feature Film Drama 108 PG
## 6 47 Ronin Feature Film Drama 106 PG
## 9 A Christmas Carol Feature Film Drama 100 R
## 11 A Sound of Thunder Feature Film Drama 95 R
## 15 Agora Feature Film Drama 100 R
## 16 Air Force One Feature Film Drama 93 R
## 18 Ali Feature Film Drama 112 R
## 20 Alice Through the Looking Glass Feature Film Drama 90 R
## 22 Allegiant Feature Film Drama 85 PG
## imdb_rating imdb_num_votes critics_rating critics_score audience_rating
## 1 6.8 9025 Fresh 60 Upright
## 2 4.9 5136 Rotten 5 Spilled
## 6 7.8 12450 Fresh 75 Spilled
## 9 7.0 8320 Rotten 70 Spilled
## 11 6.2 1935 Rotten 35 Upright
## 15 6.1 13682 Rotten 54 Spilled
## 16 7.0 36909 Fresh 78 Upright
## 18 7.8 246587 Certified Fresh 93 Upright
## 20 5.7 390 Rotten 29 Spilled
## 22 7.8 26628 Fresh 76 Upright
## audience_score best_pic_nom Facebook_Likes budget..Millions.
## 1 76 no 23343 12.6
## 2 13 no 84182 45.0
## 6 85 no 20965 50.0
## 9 74 no 48878 65.0
## 11 70 no 41890 50.0
## 15 37 no 101829 48.0
## 16 71 no 23603 52.0
## 18 87 no 14196 12.0
## 20 25 no 80806 12.0
## 22 80 no 12452 100.0
## revenue..Millions. language country New_Genre runtimeGrp budget_log
## 1 18.66 English New Zealand Drame [120,267) 2.533697
## 2 60.22 English USA Drame [90,120) 3.806662
## 6 240.36 English USA Drame [90,120) 3.912023
## 9 235.67 English USA Drame [90,120) 4.174387
## 11 40.83 English UK Drame [90,120) 3.912023
## 15 85.50 English Spain Drame [90,120) 3.871201
## 16 216.49 English USA Drame [90,120) 3.951244
## 18 305.15 English USA Drame [90,120) 2.484907
## 20 32.25 English USA Drame [90,120) 2.484907
## 22 243.40 English USA Drame [60,90) 4.605170
## revenue_log Revenue_Ratio
## 1 2.926382 1.480952
## 2 4.098005 1.338222
## 6 5.482138 4.807200
## 9 5.462433 3.625692
## 11 3.709417 0.816600
## 15 4.448516 1.781250
## 16 5.377544 4.163269
## 18 5.720803 25.429167
## 20 3.473518 2.687500
## 22 5.494706 2.434000
Keep only numeric columns for correlation and analysis
# Keep only the numeric columns of the Drama subset; this is the input to
# cor() below. select(where(...)) replaces the superseded select_if().
numeric_data_dra <- subdrama %>%
  select(where(is.numeric))
Correlation Analysis
# Pairwise correlation matrix of every numeric column, computed on rows with
# no missing values in any of those columns.
correlation_matrix_dra <- cor(x = numeric_data_dra, use = "complete.obs")
# Row of the matrix holding revenue's correlation with each numeric column.
revenue_correlation_dra <- correlation_matrix_dra["revenue..Millions.", ]
# Rank from most positive to most negative correlation.
# NOTE(review): the ranking includes revenue itself (1.0) and revenue_log /
# Revenue_Ratio, which look derived from revenue -- confirm whether they
# should be excluded before interpreting the order.
sorted_correlation_dra <- sort(x = revenue_correlation_dra, decreasing = TRUE)
print("Top correlated variables with Revenue Across Genre Drama:")
## [1] "Top correlated variables with Revenue Across Genre Drama:"
print(sorted_correlation_dra)
## revenue..Millions. revenue_log Facebook_Likes budget..Millions.
## 1.00000000 0.68212356 0.63988394 0.63688476
## budget_log audience_score imdb_rating critics_score
## 0.41967386 0.41449242 0.40998623 0.37639065
## Revenue_Ratio runtime imdb_num_votes
## 0.27041718 -0.01904314 -0.03713131
Variable Importance with Random Forest
Prepare data for random forest
# Modelling frame for the Drama random forest: the response
# (revenue..Millions.) plus five predictors, with incomplete rows removed.
rf_data_dra <- na.omit(
  select(
    subdrama,
    revenue..Millions., imdb_rating, critics_score,
    audience_score, budget..Millions., Facebook_Likes
  )
)
Fit random forest model
# Random forests are fitted with random resampling, so fix the RNG seed to
# make the importance ranking reported below reproducible between runs.
set.seed(123)
# Regress revenue on all other columns of rf_data_dra; importance = TRUE
# makes the fit store the permutation-based importance measure as well.
rf_model_dra <- randomForest(
  revenue..Millions. ~ .,
  data = rf_data_dra,
  importance = TRUE
)
Get variable importance
# Importance matrix of the Drama forest (one row per predictor).
importance_scores_dra <- importance(rf_model_dra)
# Reorder rows by the first column, %IncMSE, from largest to smallest.
importance_scores_dra <- importance_scores_dra[
  order(-importance_scores_dra[, "%IncMSE"]),
]
print("Variable Importance from Random Forest:")
## [1] "Variable Importance from Random Forest:"
print(importance_scores_dra)
## %IncMSE IncNodePurity
## Facebook_Likes 24.362910 4568131
## budget..Millions. 12.370583 3361037
## audience_score 6.620963 1666807
## imdb_rating 6.102911 1509813
## critics_score 5.181925 1624568
Visualization 1: Facebook_Likes across Drama
# Box plot of Facebook likes for the Drama subset; outliers are drawn in red.
ggplot(data = subdrama, mapping = aes(x = New_Genre, y = Facebook_Likes)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red") +
  labs(
    title = "Distribution of Facebook Likes Across Drama",
    x = "Drama",
    y = "Facebook Likes"
  ) +
  theme_minimal() +
  # Tilt the x-axis tick labels; must come after theme_minimal() so the
  # complete theme does not overwrite it.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visualization 2: Budget across Drama
# Box plot of budget (in millions) for the Drama subset; outliers in red.
ggplot(data = subdrama, mapping = aes(x = New_Genre, y = budget..Millions.)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red") +
  labs(
    title = "Distribution of Budget (Millions) Across Drama",
    x = "Drama",
    y = "budget..Millions."
  ) +
  theme_minimal() +
  # Tilt the x-axis tick labels; must come after theme_minimal() so the
  # complete theme does not overwrite it.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Select the rows whose New_Genre contains “Other”
# Print a logical vector marking the rows of Movie_unique whose New_Genre
# matches "Other".
stringr::str_detect(string = Movie_unique$New_Genre, pattern = "Other")
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE
## [25] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [121] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [145] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [181] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [241] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [313] FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [325] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE
## [349] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [397] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [469] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE TRUE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [541] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [577] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## [637] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
# Preview the first 10 rows of Movie_unique whose New_Genre matches "Other".
head(
  Movie_unique[str_detect(string = Movie_unique$New_Genre, pattern = "Other"), ],
  n = 10
)
## original_title type genre runtime
## 14 After Earth Documentary Documentary 74
## 21 All That Jazz Documentary Documentary 122
## 23 Alvin and the Chipmunks: Chipwrecked Documentary Documentary 86
## 25 Alvin and the Chipmunks: The Squeakquel Feature Film Other 127
## 38 Avengers: Age of Ultron Documentary Documentary 88
## 44 Bandits Documentary Documentary 90
## 51 Battle Los Angeles Documentary Documentary 93
## 60 Bicentennial Man Documentary Documentary 86
## 69 Braveheart Documentary Documentary 40
## 92 Cloudy with a Chance of Meatballs 2 Documentary Documentary 83
## mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score
## 14 Unrated 7.3 285 Fresh 78
## 21 PG 7.9 1346 Fresh 94
## 23 Unrated 2.1 122980 Certified Fresh 37
## 25 PG-13 7.0 79866 Fresh 67
## 38 Unrated 7.5 880 Fresh 90
## 44 Unrated 7.8 180 Fresh 84
## 51 PG 8.4 390 Fresh 95
## 60 Unrated 2.1 9904 Fresh 31
## 69 G 7.0 723 Fresh 100
## 92 PG 3.8 10522 Fresh 33
## audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 14 Upright 89 no 14168 9.50
## 21 Upright 85 no 71476 21.80
## 23 Upright 35 no 34073 14.00
## 25 Upright 71 no 345227 98.00
## 38 Upright 89 no 292000 74.00
## 44 Upright 79 no 13934 80.00
## 51 Upright 92 no 4001 30.00
## 60 Upright 22 no 254 0.32
## 69 Upright 68 no 81976 30.00
## 92 Upright 32 no 848 3.00
## revenue..Millions. language country New_Genre runtimeGrp budget_log
## 14 293.33 English USA Other [60,90) 2.251292
## 21 64.57 English USA Other [120,267) 3.081910
## 23 30.75 English USA Other [60,90) 2.639057
## 25 433.01 English USA Other [120,267) 4.584967
## 38 1156.73 English USA Other [60,90) 4.304065
## 44 90.87 English USA Other [90,120) 4.382027
## 51 287.55 English USA Other [90,120) 3.401197
## 60 0.12 English USA Other [60,90) -1.139434
## 69 61.81 English USA Other [39,60) 3.401197
## 92 2.36 English USA Other [60,90) 1.098612
## revenue_log Revenue_Ratio
## 14 5.6812983 30.8768421
## 21 4.1677499 2.9619266
## 23 3.4258900 2.1964286
## 25 6.0707608 4.4184694
## 38 7.0533523 15.6314865
## 44 4.5094299 1.1358750
## 51 5.6613968 9.5850000
## 60 -2.1202635 0.3750000
## 69 4.1240652 2.0603333
## 92 0.8586616 0.7866667
# Logical mask with one entry per row of Movie_unique: TRUE where New_Genre
# matches "Other". This is a logical vector, not a data frame.
Movie_Genre_Other <- str_detect(
  string = Movie_unique$New_Genre,
  pattern = "Other"
)
# Display the mask.
print(Movie_Genre_Other)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE
## [25] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [121] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [145] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [181] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [241] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [313] FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [325] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE
## [349] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [397] FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [469] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE TRUE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [541] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [577] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
## [637] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
# Keep only the rows flagged TRUE in Movie_Genre_Other (all columns retained).
subother <- Movie_unique[Movie_Genre_Other, ]
# Show the first 10 rows of the subset.
head(subother, n = 10)
## original_title type genre runtime
## 14 After Earth Documentary Documentary 74
## 21 All That Jazz Documentary Documentary 122
## 23 Alvin and the Chipmunks: Chipwrecked Documentary Documentary 86
## 25 Alvin and the Chipmunks: The Squeakquel Feature Film Other 127
## 38 Avengers: Age of Ultron Documentary Documentary 88
## 44 Bandits Documentary Documentary 90
## 51 Battle Los Angeles Documentary Documentary 93
## 60 Bicentennial Man Documentary Documentary 86
## 69 Braveheart Documentary Documentary 40
## 92 Cloudy with a Chance of Meatballs 2 Documentary Documentary 83
## mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score
## 14 Unrated 7.3 285 Fresh 78
## 21 PG 7.9 1346 Fresh 94
## 23 Unrated 2.1 122980 Certified Fresh 37
## 25 PG-13 7.0 79866 Fresh 67
## 38 Unrated 7.5 880 Fresh 90
## 44 Unrated 7.8 180 Fresh 84
## 51 PG 8.4 390 Fresh 95
## 60 Unrated 2.1 9904 Fresh 31
## 69 G 7.0 723 Fresh 100
## 92 PG 3.8 10522 Fresh 33
## audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 14 Upright 89 no 14168 9.50
## 21 Upright 85 no 71476 21.80
## 23 Upright 35 no 34073 14.00
## 25 Upright 71 no 345227 98.00
## 38 Upright 89 no 292000 74.00
## 44 Upright 79 no 13934 80.00
## 51 Upright 92 no 4001 30.00
## 60 Upright 22 no 254 0.32
## 69 Upright 68 no 81976 30.00
## 92 Upright 32 no 848 3.00
## revenue..Millions. language country New_Genre runtimeGrp budget_log
## 14 293.33 English USA Other [60,90) 2.251292
## 21 64.57 English USA Other [120,267) 3.081910
## 23 30.75 English USA Other [60,90) 2.639057
## 25 433.01 English USA Other [120,267) 4.584967
## 38 1156.73 English USA Other [60,90) 4.304065
## 44 90.87 English USA Other [90,120) 4.382027
## 51 287.55 English USA Other [90,120) 3.401197
## 60 0.12 English USA Other [60,90) -1.139434
## 69 61.81 English USA Other [39,60) 3.401197
## 92 2.36 English USA Other [60,90) 1.098612
## revenue_log Revenue_Ratio
## 14 5.6812983 30.8768421
## 21 4.1677499 2.9619266
## 23 3.4258900 2.1964286
## 25 6.0707608 4.4184694
## 38 7.0533523 15.6314865
## 44 4.5094299 1.1358750
## 51 5.6613968 9.5850000
## 60 -2.1202635 0.3750000
## 69 4.1240652 2.0603333
## 92 0.8586616 0.7866667
Keep only numeric columns for correlation and analysis
# Keep only the numeric columns of the Other subset; this is the input to
# cor() below. select(where(...)) replaces the superseded select_if().
numeric_data_oth <- subother %>%
  select(where(is.numeric))
Correlation Analysis
# Pairwise correlation matrix of every numeric column, computed on rows with
# no missing values in any of those columns.
correlation_matrix_oth <- cor(x = numeric_data_oth, use = "complete.obs")
# Row of the matrix holding revenue's correlation with each numeric column.
revenue_correlation_oth <- correlation_matrix_oth["revenue..Millions.", ]
# Rank from most positive to most negative correlation.
# NOTE(review): the ranking includes revenue itself (1.0) and revenue_log /
# Revenue_Ratio, which look derived from revenue -- confirm whether they
# should be excluded before interpreting the order.
sorted_correlation_oth <- sort(x = revenue_correlation_oth, decreasing = TRUE)
print("Top correlated variables with Revenue Across Genre Other:")
## [1] "Top correlated variables with Revenue Across Genre Other:"
print(sorted_correlation_oth)
## revenue..Millions. Facebook_Likes revenue_log budget..Millions.
## 1.000000000 0.803872148 0.701037895 0.636080741
## budget_log audience_score critics_score Revenue_Ratio
## 0.532496939 0.428356388 0.401585423 0.193179429
## imdb_rating imdb_num_votes runtime
## 0.054244960 0.016576978 -0.007252438
Variable Importance with Random Forest
Prepare data for random forest
# Modelling frame for the Other random forest: the response
# (revenue..Millions.) plus five predictors, with incomplete rows removed.
rf_data_oth <- na.omit(
  select(
    subother,
    revenue..Millions., imdb_rating, critics_score,
    audience_score, budget..Millions., Facebook_Likes
  )
)
Fit random forest model
# Random forests are fitted with random resampling, so fix the RNG seed to
# make the importance ranking reported below reproducible between runs.
set.seed(123)
# Regress revenue on all other columns of rf_data_oth; importance = TRUE
# makes the fit store the permutation-based importance measure as well.
rf_model_oth <- randomForest(
  revenue..Millions. ~ .,
  data = rf_data_oth,
  importance = TRUE
)
Get variable importance
# Importance matrix of the Other forest (one row per predictor).
importance_scores_oth <- importance(rf_model_oth)
# Reorder rows by the first column, %IncMSE, from largest to smallest.
importance_scores_oth <- importance_scores_oth[
  order(-importance_scores_oth[, "%IncMSE"]),
]
print("Variable Importance from Random Forest:")
## [1] "Variable Importance from Random Forest:"
print(importance_scores_oth)
## %IncMSE IncNodePurity
## Facebook_Likes 17.216725 1167903.9
## budget..Millions. 10.046584 817287.0
## imdb_rating 8.102096 387999.4
## critics_score 5.202174 415418.3
## audience_score 5.164914 381827.1
Visualization 1: Facebook_Likes across Other
# Box plot of Facebook likes for the Other subset; outliers are drawn in red.
ggplot(data = subother, mapping = aes(x = New_Genre, y = Facebook_Likes)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red") +
  labs(
    title = "Distribution of Facebook Likes Across Other",
    x = "Other",
    y = "Facebook Likes"
  ) +
  theme_minimal() +
  # Tilt the x-axis tick labels; must come after theme_minimal() so the
  # complete theme does not overwrite it.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visualization 2: Budget across Other
# Box plot of budget (in millions) for the Other subset; outliers in red.
ggplot(data = subother, mapping = aes(x = New_Genre, y = budget..Millions.)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red") +
  labs(
    title = "Distribution of Budget (Millions) Across Other",
    x = "Other",
    y = "budget..Millions."
  ) +
  theme_minimal() +
  # Tilt the x-axis tick labels; must come after theme_minimal() so the
  # complete theme does not overwrite it.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Select the rows whose New_Genre contains “Theater/International”
# Print a logical vector marking the rows of Movie_unique whose New_Genre
# matches "Theater/International".
stringr::str_detect(
  string = Movie_unique$New_Genre,
  pattern = "Theater/International"
)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [217] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [493] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [625] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Preview the first 10 rows of Movie_unique whose New_Genre matches
# "Theater/International".
head(
  Movie_unique[
    str_detect(
      string = Movie_unique$New_Genre,
      pattern = "Theater/International"
    ),
  ],
  n = 10
)
## original_title type genre
## 12 A.I. Artificial Intelligence Feature Film Musical & Performing Arts
## 30 Armageddon Feature Film Musical & Performing Arts
## 53 Battleship Feature Film Art House & International
## 66 Body of Lies Documentary Musical & Performing Arts
## 94 Cold Mountain Feature Film Musical & Performing Arts
## 108 Daylight Feature Film Art House & International
## 128 Elysium Feature Film Art House & International
## 171 Gladiator Feature Film Art House & International
## 187 Hancock Documentary Musical & Performing Arts
## 217 How the Grinch Stole Christmas Feature Film Musical & Performing Arts
## runtime mpaa_rating imdb_rating imdb_num_votes critics_rating
## 12 121 R 5.2 275125 Fresh
## 30 117 PG 3.6 1010 Fresh
## 53 115 R 6.4 3688 Fresh
## 66 103 PG 7.7 11197 Certified Fresh
## 94 115 PG 6.7 134031 Rotten
## 108 115 Unrated 7.5 9990 Fresh
## 128 86 R 6.5 5762 Fresh
## 171 107 R 6.8 9025 Fresh
## 187 96 Unrated 5.5 32751 Rotten
## 217 85 R 6.9 87215 Certified Fresh
## critics_score audience_rating audience_score best_pic_nom Facebook_Likes
## 12 59 Upright 63 no 86217
## 30 39 Upright 30 no 17029
## 53 64 Upright 54 no 126679
## 66 95 Upright 81 no 59824
## 94 64 Spilled 78 no 461110
## 108 71 Upright 86 no 25126
## 128 64 Spilled 48 no 2689
## 171 61 Upright 77 no 6521
## 187 15 Spilled 36 no 11584
## 217 63 Upright 71 no 224146
## budget..Millions. revenue..Millions. language country New_Genre
## 12 70.0 61.28 English USA Theater/International
## 30 13.0 11.78 English USA Theater/International
## 53 20.0 88.35 English USA Theater/International
## 66 22.0 40.27 English USA Theater/International
## 94 90.0 485.02 English USA Theater/International
## 108 53.0 126.69 English USA Theater/International
## 128 3.3 3.99 English USA Theater/International
## 171 145.0 272.91 English USA Theater/International
## 187 40.0 10.66 English USA Theater/International
## 217 61.0 369.33 English USA Theater/International
## runtimeGrp budget_log revenue_log Revenue_Ratio
## 12 [120,267) 4.248495 4.115454 0.8754286
## 30 [90,120) 2.564949 2.466403 0.9061538
## 53 [90,120) 2.995732 4.481306 4.4175000
## 66 [90,120) 3.091042 3.695607 1.8304545
## 94 [90,120) 4.499810 6.184190 5.3891111
## 108 [90,120) 3.970292 4.841743 2.3903774
## 128 [60,90) 1.193922 1.383791 1.2090909
## 171 [90,120) 4.976734 5.609142 1.8821379
## 187 [90,120) 3.688879 2.366498 0.2665000
## 217 [60,90) 4.110874 5.911691 6.0545902
# Logical mask with one entry per row of Movie_unique: TRUE where New_Genre
# matches "Theater/International". This is a logical vector, not a data frame.
Movie_Genre_Thea <- str_detect(
  string = Movie_unique$New_Genre,
  pattern = "Theater/International"
)
# Display the mask.
print(Movie_Genre_Thea)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [217] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [493] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [625] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Keep only the rows flagged by the mask (base indexing, so the original row
# names are carried over), then preview the first ten rows.
subtheater <- Movie_unique[Movie_Genre_Thea, ]
subtheater %>%
  head(n = 10)
## original_title type genre
## 12 A.I. Artificial Intelligence Feature Film Musical & Performing Arts
## 30 Armageddon Feature Film Musical & Performing Arts
## 53 Battleship Feature Film Art House & International
## 66 Body of Lies Documentary Musical & Performing Arts
## 94 Cold Mountain Feature Film Musical & Performing Arts
## 108 Daylight Feature Film Art House & International
## 128 Elysium Feature Film Art House & International
## 171 Gladiator Feature Film Art House & International
## 187 Hancock Documentary Musical & Performing Arts
## 217 How the Grinch Stole Christmas Feature Film Musical & Performing Arts
## runtime mpaa_rating imdb_rating imdb_num_votes critics_rating
## 12 121 R 5.2 275125 Fresh
## 30 117 PG 3.6 1010 Fresh
## 53 115 R 6.4 3688 Fresh
## 66 103 PG 7.7 11197 Certified Fresh
## 94 115 PG 6.7 134031 Rotten
## 108 115 Unrated 7.5 9990 Fresh
## 128 86 R 6.5 5762 Fresh
## 171 107 R 6.8 9025 Fresh
## 187 96 Unrated 5.5 32751 Rotten
## 217 85 R 6.9 87215 Certified Fresh
## critics_score audience_rating audience_score best_pic_nom Facebook_Likes
## 12 59 Upright 63 no 86217
## 30 39 Upright 30 no 17029
## 53 64 Upright 54 no 126679
## 66 95 Upright 81 no 59824
## 94 64 Spilled 78 no 461110
## 108 71 Upright 86 no 25126
## 128 64 Spilled 48 no 2689
## 171 61 Upright 77 no 6521
## 187 15 Spilled 36 no 11584
## 217 63 Upright 71 no 224146
## budget..Millions. revenue..Millions. language country New_Genre
## 12 70.0 61.28 English USA Theater/International
## 30 13.0 11.78 English USA Theater/International
## 53 20.0 88.35 English USA Theater/International
## 66 22.0 40.27 English USA Theater/International
## 94 90.0 485.02 English USA Theater/International
## 108 53.0 126.69 English USA Theater/International
## 128 3.3 3.99 English USA Theater/International
## 171 145.0 272.91 English USA Theater/International
## 187 40.0 10.66 English USA Theater/International
## 217 61.0 369.33 English USA Theater/International
## runtimeGrp budget_log revenue_log Revenue_Ratio
## 12 [120,267) 4.248495 4.115454 0.8754286
## 30 [90,120) 2.564949 2.466403 0.9061538
## 53 [90,120) 2.995732 4.481306 4.4175000
## 66 [90,120) 3.091042 3.695607 1.8304545
## 94 [90,120) 4.499810 6.184190 5.3891111
## 108 [90,120) 3.970292 4.841743 2.3903774
## 128 [60,90) 1.193922 1.383791 1.2090909
## 171 [90,120) 4.976734 5.609142 1.8821379
## 187 [90,120) 3.688879 2.366498 0.2665000
## 217 [60,90) 4.110874 5.911691 6.0545902
Keep only numeric columns for correlation and analysis
# Keep only the numeric columns of the Theater/International subset for the
# correlation analysis. `select_if()` is superseded in dplyr >= 1.0;
# `select(where(...))` is the current idiom and selects the same columns.
numeric_data_the <- subtheater %>%
  select(where(is.numeric))
Correlation Analysis
# Pairwise correlations (cor() default: Pearson) among the numeric columns,
# computed only on rows with no missing value in any of them.
correlation_matrix_the <- numeric_data_the %>%
  cor(use = "complete.obs")
# Take the revenue row and rank every variable from the most positive to the
# most negative correlation with revenue (revenue itself comes first at 1).
revenue_correlation_the <- correlation_matrix_the["revenue..Millions.", ]
sorted_correlation_the <- revenue_correlation_the %>%
  sort(decreasing = TRUE)
print("Top correlated variables with Revenue:")
## [1] "Top correlated variables with Revenue:"
print(sorted_correlation_the)
## revenue..Millions. revenue_log Facebook_Likes budget..Millions.
## 1.00000000 0.86291892 0.74707354 0.61623664
## budget_log audience_score imdb_rating Revenue_Ratio
## 0.60558008 0.48768175 0.44474441 0.37907761
## critics_score runtime imdb_num_votes
## 0.25949880 0.13359410 0.08893409
Variable Importance with Random Forest
Prepare data for random forest
# Modelling frame for the random forest: the response (revenue) plus the five
# candidate predictors, with every row containing a missing value removed.
rf_data_the <- na.omit(
  select(
    subtheater,
    revenue..Millions., imdb_rating, critics_score, audience_score,
    budget..Millions., Facebook_Likes
  )
)
Fit random forest model
# Random forests draw bootstrap samples and random feature subsets, so fix the
# RNG seed first; otherwise the importance scores change on every run and the
# reported table cannot be reproduced.
set.seed(123)
# Regress revenue on all remaining columns of rf_data_the; importance = TRUE
# makes the fit record the permutation-based importance (%IncMSE) as well.
rf_model_the <- randomForest(
  revenue..Millions. ~ .,
  data = rf_data_the,
  importance = TRUE
)
Get variable importance
# importance() yields one row per predictor; reorder the rows by the first
# column (%IncMSE in the printed table), largest value first.
importance_scores_the <- importance(rf_model_the)
importance_scores_the <- importance_scores_the[
  order(importance_scores_the[, 1], decreasing = TRUE),
]
print("Variable Importance from Random Forest:")
## [1] "Variable Importance from Random Forest:"
print(importance_scores_the)
## %IncMSE IncNodePurity
## Facebook_Likes 10.470511 97065.42
## audience_score 9.722142 78563.02
## imdb_rating 9.096318 72758.04
## budget..Millions. 7.619482 81755.04
## critics_score 2.722505 63684.03
Visualization 1: Facebook_Likes across Theater/International
# Box plot of Facebook likes for the Theater/International subset. Outliers are
# drawn in red; theme() comes after theme_minimal() so the rotated x tick
# labels are not reset by the complete theme.
ggplot(data = subtheater, mapping = aes(x = New_Genre, y = Facebook_Likes)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  labs(
    x = "Theater/International",
    y = "Facebook Likes",
    title = "Distribution of Facebook Likes Across Theater/International"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visualization 2: Budget across Theater/International
# Box plot of budget for the Theater/International subset. Outliers are drawn
# in red; theme() comes after theme_minimal() so the rotated x tick labels are
# not reset by the complete theme.
ggplot(data = subtheater, mapping = aes(x = New_Genre, y = budget..Millions.)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  labs(
    x = "Theater/International",
    y = "budget..Millions.",
    title = "Distribution of Budget (Millions) Across Theater/International"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Select the rows whose New_Genre contains “Thriller”
stringr::str_detect(Movie_unique$New_Genre, "Thriller") # print a logical vector: TRUE where New_Genre matches "Thriller"
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [61] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [121] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## [193] FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE
## [289] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [313] TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [385] FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [421] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [445] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [505] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [589] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
head(Movie_unique[str_detect(Movie_unique$New_Genre, "Thriller"),], 10) # preview the first 10 rows whose New_Genre matches "Thriller"
## original_title type genre runtime
## 3 2 Fast 2 Furious Feature Film Mystery & Suspense 97
## 13 Abraham Lincoln: Vampire Hunter Feature Film Mystery & Suspense 88
## 19 Alice in Wonderland Feature Film Mystery & Suspense 124
## 33 Arthur Christmas Feature Film Horror 100
## 34 Asterix at the Olympic Games Feature Film Horror 97
## 45 Basic Instinct 2 Feature Film Mystery & Suspense 121
## 48 Batman Forever Feature Film Horror 108
## 57 Ben-Hur Feature Film Mystery & Suspense 122
## 61 Big Fish Feature Film Horror 91
## 71 Captain America: Civil War Feature Film Horror 84
## mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score
## 3 PG-13 6.3 54771 Rotten 40
## 13 PG 6.8 16366 Fresh 77
## 19 R 7.1 259822 Fresh 69
## 33 R 4.5 16824 Rotten 10
## 34 R 7.1 25264 Rotten 69
## 45 R 6.7 58907 Fresh 67
## 48 R 3.0 9216 Rotten 45
## 57 R 2.1 9904 Fresh 29
## 61 R 6.9 19539 Rotten 59
## 71 R 5.6 19285 Certified Fresh 77
## audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 3 Spilled 49 no 35296 16.0
## 13 Upright 73 no 211234 150.0
## 19 Upright 70 no 79957 58.0
## 33 Spilled 31 no 2707 20.0
## 34 Spilled 78 no 224355 145.0
## 45 Upright 70 no 101899 137.0
## 48 Upright 35 no 26057 1.0
## 57 Upright 20 no 379 2.8
## 61 Upright 70 no 16138 37.0
## 71 Spilled 43 no 264798 81.0
## revenue..Millions. language country New_Genre runtimeGrp budget_log
## 3 31.56 English USA Thriller [90,120) 2.772589
## 13 341.13 English USA Thriller [60,90) 5.010635
## 19 150.17 English USA Thriller [120,267) 4.060443
## 33 32.01 English UK Thriller [90,120) 2.995732
## 34 609.12 French France Thriller [90,120) 4.976734
## 45 85.13 English UK Thriller [120,267) 4.919981
## 48 17.51 English USA Thriller [90,120) 0.000000
## 57 1.83 English USA Thriller [120,267) 1.029619
## 61 165.34 English USA Thriller [90,120) 3.610918
## 71 403.80 English USA Thriller [60,90) 4.394449
## revenue_log Revenue_Ratio
## 3 3.451890 1.9725000
## 13 5.832264 2.2742000
## 19 5.011768 2.5891379
## 33 3.466048 1.6005000
## 34 6.412015 4.2008276
## 45 4.444179 0.6213869
## 48 2.862772 17.5100000
## 57 0.604316 0.6535714
## 61 5.108004 4.4686486
## 71 6.000920 4.9851852
# Logical mask with one value per element of Movie_unique$New_Genre: TRUE where
# the genre matches "Thriller". This is a vector, not a data frame.
Movie_Genre_Thriller <- Movie_unique$New_Genre %>%
  str_detect("Thriller")
Movie_Genre_Thriller # auto-print the mask to inspect it
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [61] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [121] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## [193] FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE
## [289] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [313] TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [385] FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [421] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [445] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [505] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [589] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Keep only the rows flagged by the mask (base indexing, so the original row
# names are carried over), then preview the first ten rows.
subthriller <- Movie_unique[Movie_Genre_Thriller, ]
subthriller %>%
  head(n = 10)
## original_title type genre runtime
## 3 2 Fast 2 Furious Feature Film Mystery & Suspense 97
## 13 Abraham Lincoln: Vampire Hunter Feature Film Mystery & Suspense 88
## 19 Alice in Wonderland Feature Film Mystery & Suspense 124
## 33 Arthur Christmas Feature Film Horror 100
## 34 Asterix at the Olympic Games Feature Film Horror 97
## 45 Basic Instinct 2 Feature Film Mystery & Suspense 121
## 48 Batman Forever Feature Film Horror 108
## 57 Ben-Hur Feature Film Mystery & Suspense 122
## 61 Big Fish Feature Film Horror 91
## 71 Captain America: Civil War Feature Film Horror 84
## mpaa_rating imdb_rating imdb_num_votes critics_rating critics_score
## 3 PG-13 6.3 54771 Rotten 40
## 13 PG 6.8 16366 Fresh 77
## 19 R 7.1 259822 Fresh 69
## 33 R 4.5 16824 Rotten 10
## 34 R 7.1 25264 Rotten 69
## 45 R 6.7 58907 Fresh 67
## 48 R 3.0 9216 Rotten 45
## 57 R 2.1 9904 Fresh 29
## 61 R 6.9 19539 Rotten 59
## 71 R 5.6 19285 Certified Fresh 77
## audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 3 Spilled 49 no 35296 16.0
## 13 Upright 73 no 211234 150.0
## 19 Upright 70 no 79957 58.0
## 33 Spilled 31 no 2707 20.0
## 34 Spilled 78 no 224355 145.0
## 45 Upright 70 no 101899 137.0
## 48 Upright 35 no 26057 1.0
## 57 Upright 20 no 379 2.8
## 61 Upright 70 no 16138 37.0
## 71 Spilled 43 no 264798 81.0
## revenue..Millions. language country New_Genre runtimeGrp budget_log
## 3 31.56 English USA Thriller [90,120) 2.772589
## 13 341.13 English USA Thriller [60,90) 5.010635
## 19 150.17 English USA Thriller [120,267) 4.060443
## 33 32.01 English UK Thriller [90,120) 2.995732
## 34 609.12 French France Thriller [90,120) 4.976734
## 45 85.13 English UK Thriller [120,267) 4.919981
## 48 17.51 English USA Thriller [90,120) 0.000000
## 57 1.83 English USA Thriller [120,267) 1.029619
## 61 165.34 English USA Thriller [90,120) 3.610918
## 71 403.80 English USA Thriller [60,90) 4.394449
## revenue_log Revenue_Ratio
## 3 3.451890 1.9725000
## 13 5.832264 2.2742000
## 19 5.011768 2.5891379
## 33 3.466048 1.6005000
## 34 6.412015 4.2008276
## 45 4.444179 0.6213869
## 48 2.862772 17.5100000
## 57 0.604316 0.6535714
## 61 5.108004 4.4686486
## 71 6.000920 4.9851852
Keep only numeric columns for correlation and analysis
# Keep only the numeric columns of the Thriller subset for the correlation
# analysis. `select_if()` is superseded in dplyr >= 1.0; `select(where(...))`
# is the current idiom and selects the same columns.
numeric_data_thr <- subthriller %>%
  select(where(is.numeric))
Correlation Analysis
# Pairwise correlations (cor() default: Pearson) among the numeric columns,
# computed only on rows with no missing value in any of them.
correlation_matrix_thr <- numeric_data_thr %>%
  cor(use = "complete.obs")
# Take the revenue row and rank every variable from the most positive to the
# most negative correlation with revenue (revenue itself comes first at 1).
revenue_correlation_thr <- correlation_matrix_thr["revenue..Millions.", ]
sorted_correlation_thr <- revenue_correlation_thr %>%
  sort(decreasing = TRUE)
print("Top correlated variables with Revenue Across Genre Thriller:")
## [1] "Top correlated variables with Revenue Across Genre Thriller:"
print(sorted_correlation_thr)
## revenue..Millions. Facebook_Likes budget..Millions. revenue_log
## 1.00000000 0.72189241 0.71882979 0.71692781
## budget_log audience_score critics_score imdb_rating
## 0.53761242 0.49225106 0.44500410 0.42730362
## imdb_num_votes Revenue_Ratio runtime
## 0.22602141 0.18640937 -0.05849818
Variable Importance with Random Forest
Prepare data for random forest
# Modelling frame for the random forest: the response (revenue) plus the five
# candidate predictors, with every row containing a missing value removed.
rf_data_thr <- na.omit(
  select(
    subthriller,
    revenue..Millions., imdb_rating, critics_score, audience_score,
    budget..Millions., Facebook_Likes
  )
)
Fit random forest model
# Random forests draw bootstrap samples and random feature subsets, so fix the
# RNG seed first; otherwise the importance scores change on every run and the
# reported table cannot be reproduced.
set.seed(123)
# Regress revenue on all remaining columns of rf_data_thr; importance = TRUE
# makes the fit record the permutation-based importance (%IncMSE) as well.
rf_model_thr <- randomForest(
  revenue..Millions. ~ .,
  data = rf_data_thr,
  importance = TRUE
)
Get variable importance
# importance() yields one row per predictor; reorder the rows by the first
# column (%IncMSE in the printed table), largest value first.
importance_scores_thr <- importance(rf_model_thr)
importance_scores_thr <- importance_scores_thr[
  order(importance_scores_thr[, 1], decreasing = TRUE),
]
print("Variable Importance from Random Forest:")
## [1] "Variable Importance from Random Forest:"
print(importance_scores_thr)
## %IncMSE IncNodePurity
## Facebook_Likes 16.202045 1100794.3
## budget..Millions. 10.820270 1029941.9
## critics_score 4.544067 640753.3
## audience_score 4.038513 699512.6
## imdb_rating 1.516156 600087.5
Visualization 1: Facebook_Likes across Thriller
# Box plot of Facebook likes for the Thriller subset. Outliers are drawn in
# red; theme() comes after theme_minimal() so the rotated x tick labels are not
# reset by the complete theme.
ggplot(data = subthriller, mapping = aes(x = New_Genre, y = Facebook_Likes)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  labs(
    x = "Thriller",
    y = "Facebook Likes",
    title = "Distribution of Facebook Likes Across Thriller"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visualization 2: Budget across Thriller
# Box plot of budget for the Thriller subset. Outliers are drawn in red;
# theme() comes after theme_minimal() so the rotated x tick labels are not
# reset by the complete theme.
ggplot(data = subthriller, mapping = aes(x = New_Genre, y = budget..Millions.)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  labs(
    x = "Thriller",
    y = "budget..Millions.",
    title = "Distribution of Budget (Millions) Across Thriller"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Build the multiple linear regression model
# Least-squares linear model of revenue on critics score, Facebook likes and
# budget, fitted on analysis_data.
model <- lm(
  formula = revenue ~ critics_score + facebook_likes + budget,
  data = analysis_data
)
Summary of the model
# Print the residual quantiles, coefficient table and overall fit statistics.
summary(model)
##
## Call:
## lm(formula = revenue ~ critics_score + facebook_likes + budget,
## data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -304.01 -52.48 -7.94 27.93 1499.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.435e+01 1.280e+01 -5.809 9.88e-09 ***
## critics_score 1.304e+00 2.278e-01 5.722 1.62e-08 ***
## facebook_likes 9.739e-04 6.144e-05 15.851 < 2e-16 ***
## budget 1.881e+00 1.268e-01 14.831 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 125.5 on 640 degrees of freedom
## Multiple R-squared: 0.6388, Adjusted R-squared: 0.6371
## F-statistic: 377.3 on 3 and 640 DF, p-value: < 2.2e-16
Predict revenue
# Score the fitted linear model on scenario (a) and report the prediction.
predicted_revenue_a <- predict(object = model, newdata = input_data_a)
cat(
  "Predicted revenue (a):",
  predicted_revenue_a,
  "million dollars\n"
)
## Predicted revenue (a): 76.23573 million dollars
Campaign increases Facebook likes by 10,000% and raises the budget to $20 million
# A 10,000% increase adds 100 times the baseline of 1250 likes on top of the
# baseline itself, i.e. 101 times the baseline overall.
new_facebook_likes <- 1250 + 1250 * (10000 / 100)
# Campaign budget for the scenario (the heading states $20 million).
new_budget <- 20
Create a data frame with the updated values
# One-row data frame for the post-campaign scenario; the column names match the
# predictor names in the model formula so predict() can use them.
input_data_b <- data.frame(critics_score = 55,
                           facebook_likes = new_facebook_likes,
                           budget = new_budget)
Predict revenue after campaign
# Score the fitted linear model on the post-campaign scenario (b) and report
# the prediction.
predicted_revenue_b <- predict(object = model, newdata = input_data_b)
cat(
  "Predicted revenue after campaign (b):",
  predicted_revenue_b,
  "million dollars\n"
)
## Predicted revenue after campaign (b): 157.9315 million dollars
Calculate the improvement in revenue
# Revenue gain attributed to the campaign: scenario (b) prediction minus the
# scenario (a) prediction.
improvement <- predicted_revenue_b - predicted_revenue_a
cat(
  "Revenue improvement due to campaign:",
  improvement,
  "million dollars\n"
)
## Revenue improvement due to campaign: 81.69575 million dollars