Set up working directory

getwd()
## [1] "D:/34884/Documents"
setwd("D:/34884/Documents")

Load Package

library(ggplot2)
library(readxl)
library(dplyr)
## 
## 载入程序包:'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: 程序包'tidyr'是用R版本4.4.2 来建造的
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(visreg)
library(tidyr)
library(caret)
## Warning: 程序包'caret'是用R版本4.4.2 来建造的
## 载入需要的程序包:lattice
## 
## 载入程序包:'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(randomForest)
## Warning: 程序包'randomForest'是用R版本4.4.2 来建造的
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## 载入程序包:'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

Import Data File Moive Dataset_General Audience

Audience <- read.csv("Movie Dataset_General Audience.csv")

View variables names and types in the file

str(Audience)
## 'data.frame':    651 obs. of  13 variables:
##  $ original_title : chr  "The Departed " "Exodus: Gods and Kings " "Spectre " "The X Files " ...
##  $ type           : chr  "Feature Film" "Feature Film" "Feature Film" "Feature Film" ...
##  $ genre          : chr  "Drama" "Drama" "Comedy" "Drama" ...
##  $ runtime        : int  118 131 84 97 90 78 107 130 88 100 ...
##  $ mpaa_rating    : chr  "Unrated" "PG-13" "R" "PG" ...
##  $ imdb_rating    : num  2.1 3.3 7.6 2.5 7.2 7.8 2.1 2.1 7.5 4.1 ...
##  $ imdb_num_votes : int  9904 1010 22381 54363 35096 333 9904 122980 880 739 ...
##  $ critics_rating : chr  "Fresh" "Fresh" "Certified Fresh" "Rotten" ...
##  $ critics_score  : int  10 43 91 27 81 91 30 33 90 56 ...
##  $ audience_rating: chr  "Upright" "Upright" "Upright" "Upright" ...
##  $ audience_score : int  21 34 91 23 77 86 21 31 89 45 ...
##  $ best_pic_nom   : chr  "no" "no" "no" "no" ...
##  $ Facebook_Likes : int  48 490 11700 230 172221 211873 171 355 292000 3611 ...

To preview top 10 rows

head(Audience, n = 10)
##                                             original_title         type
## 1                                            The Departed  Feature Film
## 2                                  Exodus: Gods and Kings  Feature Film
## 3                                                 Spectre  Feature Film
## 4                                             The X Files  Feature Film
## 5  Star Wars: Episode VII - The Force Awakens              Feature Film
## 6                                             John Carter   Documentary
## 7                                                     Rio  Feature Film
## 8                                         The Expendables  Feature Film
## 9                                 Avengers: Age of Ultron   Documentary
## 10                                       Cutthroat Island  Feature Film
##          genre runtime mpaa_rating imdb_rating imdb_num_votes  critics_rating
## 1        Drama     118     Unrated         2.1           9904           Fresh
## 2        Drama     131       PG-13         3.3           1010           Fresh
## 3       Comedy      84           R         7.6          22381 Certified Fresh
## 4        Drama      97          PG         2.5          54363          Rotten
## 5       Horror      90           R         7.2          35096 Certified Fresh
## 6  Documentary      78     Unrated         7.8            333           Fresh
## 7        Drama     107           R         2.1           9904           Fresh
## 8        Drama     130           R         2.1         122980 Certified Fresh
## 9  Documentary      88     Unrated         7.5            880           Fresh
## 10       Drama     100           R         4.1            739           Fresh
##    critics_score audience_rating audience_score best_pic_nom Facebook_Likes
## 1             10         Upright             21           no             48
## 2             43         Upright             34           no            490
## 3             91         Upright             91           no          11700
## 4             27         Upright             23           no            230
## 5             81         Upright             77           no         172221
## 6             91         Upright             86           no         211873
## 7             30         Upright             21           no            171
## 8             33         Upright             31           no            355
## 9             90         Upright             89           no         292000
## 10            56         Upright             45           no           3611

Import Data File Movie Dataset_Financials

Financial <- read.csv("Movie Dataset_Financials.csv")

View variables names and types in the file

str(Financial)
## 'data.frame':    651 obs. of  5 variables:
##  $ original_title    : chr  "The Departed " "Exodus: Gods and Kings " "Spectre " "The X Files " ...
##  $ budget..Millions. : num  1 2 110 2.9 190 135 1 2 74 2 ...
##  $ revenue..Millions.: num  0.01 0.05 295.24 0.3 1506.25 ...
##  $ language          : chr  "English" "English" "English" "English" ...
##  $ country           : chr  "USA" "UK" "UK" "USA" ...

To preview top 10 rows

head(Financial, n = 10)
##                                             original_title budget..Millions.
## 1                                            The Departed                1.0
## 2                                  Exodus: Gods and Kings                2.0
## 3                                                 Spectre              110.0
## 4                                             The X Files                2.9
## 5  Star Wars: Episode VII - The Force Awakens                          190.0
## 6                                             John Carter              135.0
## 7                                                     Rio                1.0
## 8                                         The Expendables                2.0
## 9                                 Avengers: Age of Ultron               74.0
## 10                                       Cutthroat Island                2.0
##    revenue..Millions. language country
## 1                0.01  English     USA
## 2                0.05  English      UK
## 3              295.24  English      UK
## 4                0.30  English     USA
## 5             1506.25   French  France
## 6              532.95  English     USA
## 7                0.40  English     USA
## 8                0.57  English     USA
## 9             1156.73  English     USA
## 10               1.05  English     USA

Merge Audience and Financial into Movie

Movie <- merge(Audience, Financial, by = "original_title")

Check the number of unique value in Audience

n_distinct(Audience$original_title)
## [1] 644

Check the number of unique value in Financial

n_distinct(Financial$original_title)
## [1] 644

Find duplicated rows in Movie

duplicate_row <- Movie[duplicated(Movie$original_title),]

Remove duplicated rows from Movie

Movie_unique <- Movie[!duplicated(Movie$original_title),]

Check NA value in Movie Unique (Need help)

Movie_unique %>%
  filter(!complete.cases(Movie_unique))
##        original_title        type       genre runtime mpaa_rating imdb_rating
## 1 Sex and the City 2  Documentary Documentary      NA     Unrated         4.3
##   imdb_num_votes critics_rating critics_score audience_rating audience_score
## 1            739          Fresh            53         Upright             41
##   best_pic_nom Facebook_Likes budget..Millions. revenue..Millions. language
## 1           no          44555              19.8               41.3  English
##   country
## 1     USA

Replace the NA Value

Movie_unique$runtime[is.na(Movie_unique$runtime)] <- 146

Show the result of Replace NA value

print(Movie_unique[380, ])
##          original_title        type       genre runtime mpaa_rating imdb_rating
## 392 Sex and the City 2  Documentary Documentary     146     Unrated         4.3
##     imdb_num_votes critics_rating critics_score audience_rating audience_score
## 392            739          Fresh            53         Upright             41
##     best_pic_nom Facebook_Likes budget..Millions. revenue..Millions. language
## 392           no          44555              19.8               41.3  English
##     country
## 392     USA

Summaries of runtime

summary(Movie_unique$runtime)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      39      92     103     106     116     267

Summaries of imdb rating

summary(Movie_unique$imdb_rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.200   4.300   6.150   5.822   7.100  55.000

Summaries of critics score

summary(Movie_unique$critics_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   37.00   55.00   54.99   73.25  100.00

Summaries of audience score

summary(Movie_unique$audience_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   38.00   55.50   56.64   76.00   96.00

Summaries of Facebook Likes

summary(Movie_unique$Facebook_Likes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2    7309   26125   64181   82797  555609

Summaries of budget

summary(Movie_unique$budget..Millions.)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.32   10.00   25.00   41.29   53.00  280.00

Summaries of revenue

summary(Movie_unique$revenue..Millions.)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.01   21.57   62.48  137.52  154.25 2068.18

Convert the genre to new groupings

Movie_unique <- Movie_unique %>% 
mutate(New_Genre = case_when(genre == "Action & Adventure" ~ "Adventure",
                             genre == "Animation" ~ "Other",
                             genre == "Art House & International" ~ "Theatre/International",
                             genre == "Comedy" ~ "Comedy",
                             genre == "Documentary" ~ "Other",
                             genre == "Drama" ~ "Drame",
                             genre == "Horror" ~ "Thriller",
                             genre == "Musical & Performing Arts" ~ "Theatre/International",
                             genre == "Mystery & Suspense" ~ "Thriller",
                             genre == "Other" ~ "Other",
                             genre == "Science Fiction & Fantasy" ~ "Adventure",
                             TRUE ~ "Check"))

Convert runtime to Different Group

Movie_unique$runtimeGrp <- cut(Movie_unique$runtime,
                            breaks = c(39, 60, 90, 120, 267),
                            Labels = c("Short Movie", "Special Movie", "General Movie", "Extremely Long Movie"),
                            right=FALSE)
names(Movie_unique)
##  [1] "original_title"     "type"               "genre"             
##  [4] "runtime"            "mpaa_rating"        "imdb_rating"       
##  [7] "imdb_num_votes"     "critics_rating"     "critics_score"     
## [10] "audience_rating"    "audience_score"     "best_pic_nom"      
## [13] "Facebook_Likes"     "budget..Millions."  "revenue..Millions."
## [16] "language"           "country"            "New_Genre"         
## [19] "runtimeGrp"
head(Movie_unique$runtimeGrp)
## [1] [120,267) [90,120)  [90,120)  [90,120)  [90,120)  [90,120) 
## Levels: [39,60) [60,90) [90,120) [120,267)

Amount of each runtime group

table(Movie_unique$runtimeGrp)
## 
##   [39,60)   [60,90)  [90,120) [120,267) 
##         2       100       412       129

Visualization

Amount of each genre

ggplot(Movie_unique,aes(x = genre,fill = genre)) +
  geom_bar(position = "dodge") +
  geom_text(stat="count", aes(label=..count..), vjust=0, size=5, color="black") +
  labs(title = "Distribution of the number of movies by genre", x = "Genre", y = "Quantity",) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Average IMDB Voting Number by Genre

Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_vote = mean(imdb_num_votes, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(genre, Average_vote), y = Average_vote)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  geom_text(aes(label = round(Average_vote, 1)), 
            hjust = 1, 
            color = "black", 
            size = 3) +
  coord_flip() +
  labs(title = "Average IMDB Voting Number by Genre", x = "Genre", y = "Average IMDB Voting Number")

Average IMDB_rating by Genre

Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_imdb = mean(imdb_rating, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(genre, Average_imdb), y = Average_imdb)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  geom_text(aes(label = round(Average_imdb, 1)), 
            hjust = 1, 
            color = "black", 
            size = 3) +
  coord_flip() +
  labs(title = "Average IMDB Rating by Genre", x = "Genre", y = "Average IMDB Rating")

Total Number of Critics Rating by Genre

ggplot(Movie_unique,aes(x = critics_rating,fill = critics_rating)) +
  geom_bar(position = "dodge") +
  geom_text(stat="count", aes(label=..count..), vjust=0, size=5, color="black") +
  facet_wrap(~ genre) +
  labs(title = "Distribution of the number of Critics Rating by genre", x = "Genre", y = "Quantity",) +
  theme_minimal()

Average Critic Score by Genre

Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_crit = mean(critics_score, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(genre, Average_crit), y = Average_crit)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  geom_text(aes(label = round(Average_crit, 1)), 
            hjust = 1, 
            color = "black", 
            size = 3) +
  coord_flip() +
  labs(title = "Average Crtics Score by Genre", x = "Genre", y = "Average Critics Score")

Scatter plot: Critic Score vs Revenue

ggplot(Movie_unique, aes(x = critics_score, y = revenue..Millions., color = genre)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "Black") +
  labs(
    title = "Critic Score vs Revenue",
    x = "Critic Score",
    y = "revenue..Millions."
  ) +
  facet_wrap(~ genre) +
  theme_minimal() +
  theme(legend.position = "right") +
  scale_color_discrete(name = "Genre")
## `geom_smooth()` using formula = 'y ~ x'

Total Number of Audience Rating

ggplot(Movie_unique,aes(x = audience_rating,fill = audience_rating)) +
  geom_bar(position = "dodge") +
  geom_text(stat="count", aes(label=..count..), vjust=0, size=5, color="black") +
  facet_wrap(~ genre)

  labs(title = "Distribution of the number of Audience Rating by genre", x = "Genre", y = "Quantity",) +
  theme_minimal()
## NULL

Average Audience Score by Genre

Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_aud = mean(audience_score, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(genre, Average_aud), y = Average_aud)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  geom_text(aes(label = round(Average_aud, 1)), 
            hjust = 1, 
            color = "black", 
            size = 3) +
  coord_flip() +
  labs(title = "Average Audience Score by Genre", x = "Genre", y = "Average Audience Score")

Scatter plot: Audience Score vs Revenue

ggplot(Movie_unique, aes(x = audience_score, y = revenue..Millions., color = genre)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "Black") +
  labs(
    title = "Audience Score vs Revenue",
    x = "Audience Score",
    y = "revenue..Millions."
  ) +
  facet_wrap(~ genre) +
  theme_minimal() +
  theme(legend.position = "right") +
  scale_color_discrete(name = "Genre")
## `geom_smooth()` using formula = 'y ~ x'

Calculate average scores

average_scores_by_genre <- Movie_unique %>%
  group_by(genre) %>%
  summarise(
    average_critics_score = mean(critics_score, na.rm = TRUE),
    average_audience_score = mean(audience_score, na.rm = TRUE)
  )

Reshape data into long format for plotting

long_data <- average_scores_by_genre %>%
  pivot_longer(
    cols = c(average_critics_score, average_audience_score),
    names_to = "Score_Type",
    values_to = "Average_Score"
  )

Draw the line chart

ggplot(long_data, aes(x = genre, y = Average_Score, group = Score_Type, color = Score_Type)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  labs(
    title = "Average Critics Score and Audience Score by Genre",
    x = "Genre",
    y = "Average Score",
    color = "Score Type"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels for readability
    plot.title = element_text(hjust = 0.5)
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Average Facebook like by Genre

Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_face = mean(Facebook_Likes, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(genre, Average_face), y = Average_face)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  geom_text(aes(label = round(Average_face, 1)), 
            hjust = 1, 
            color = "black", 
            size = 3) +
  coord_flip() +
  labs(title = "Average Facebook Likes by Genre", x = "Genre", y = "Average Facebook Likes")

Average Budget…Millions by Genre

Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_budget = mean(budget..Millions., na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(genre, Average_budget), y = Average_budget)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  geom_text(aes(label = round(Average_budget, 1)), 
            hjust = 1, 
            color = "black", 
            size = 3) +
  coord_flip() +
  labs(title = "Average Budget(Millions) by Genre", x = "Genre", y = "Average Budget(Millions)")

Average Revenue…Millions by Genre

Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_revenue = mean(revenue..Millions., na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(genre, Average_revenue), y = Average_revenue)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  geom_text(aes(label = round(Average_revenue, 1)), 
            hjust = 1, 
            color = "black", 
            size = 3) +
  coord_flip() +
  labs(title = "Average Revenue(Millions) by Genre", x = "Genre", y = "Average Revenue(Millions)")

Scatter plot: Budget vs Revenue

Movie_unique <- Movie_unique %>%
  mutate(
    budget_log = log(budget..Millions.),
    revenue_log = log(revenue..Millions.)
  )

ggplot(Movie_unique, aes(x = budget_log, y = revenue_log, color = genre)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "Black") +
  labs(
    title = "Budget vs Revenue",
    x = "budget..Millions.",
    y = "revenue..Millions."
  ) +
  facet_wrap(~ genre) +
  theme_minimal() +
  theme(legend.position = "right") +
  scale_color_discrete(name = "Genre")
## `geom_smooth()` using formula = 'y ~ x'

Create Revenue Ratio by Revenue Divided By Budget

Movie_unique$Revenue_Ratio = Movie_unique$revenue..Millions. / Movie_unique$budget..Millions.

Average Revenue Ration by Genre

Movie_unique %>%
  group_by(genre) %>%
  summarise(Average_ratio = mean(Revenue_Ratio, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(genre, Average_ratio), y = Average_ratio)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  geom_text(aes(label = round(Average_ratio, 1)), 
            hjust = 1, 
            color = "black", 
            size = 3) +
  coord_flip() +
  labs(title = "Average Revenue Ratio by Genre", x = "Genre", y = "Average Revenue Ratio")

t.test

table(Movie_unique$audience_rating)
## 
## Spilled Upright 
##     220     424

Independent sample t-test

Visually see the mean difference.

boxplot(Movie_unique$audience_score ~ Movie_unique$audience_rating, col=c(3, 6)) # visually see the mean difference.

Normality assumption on the dependent variable

shapiro.test(Movie_unique$audience_score)
## 
##  Shapiro-Wilk normality test
## 
## data:  Movie_unique$audience_score
## W = 0.95384, p-value = 2.542e-13

Select relevant columns for analysis

columns_of_interest <- c("revenue..Millions.", "runtime", "mpaa_rating", 
                         "imdb_rating", "critics_score", "audience_score", 
                         "budget..Millions.", "Facebook_Likes", "genre", "type")
analysis_data <- Movie_unique[, columns_of_interest]

Rename columns for convenience

colnames(analysis_data) <- c("revenue", "runtime", "mpaa_rating", "imdb_rating", 
                             "critics_score", "audience_score", "budget", 
                             "facebook_likes", "genre", "type")

Remove missing values

analysis_data <- na.omit(analysis_data)

Perform one-way ANOVA for categorical predictors

anova_results <- list()
categorical_columns <- c("mpaa_rating", "genre", "type")
for (category in categorical_columns) {
  anova_model <- aov(revenue ~ get(category), data = analysis_data)
  anova_results[[category]] <- summary(anova_model)
}

Perform simple linear regression for numerical predictors

numerical_columns <- c("runtime", "imdb_rating", "critics_score", "audience_score", "budget", "facebook_likes")
simple_models <- list()
for (column in numerical_columns) {
  formula <- as.formula(paste("revenue ~", column))
  model <- lm(formula, data = analysis_data)
  simple_models[[column]] <- summary(model)
}

Perform multiple linear regression

multi_model <- lm(revenue ~ runtime + imdb_rating + critics_score + audience_score + budget + facebook_likes, 
                  data = analysis_data)

select the values with “Adventure”

stringr::str_detect(Movie_unique$New_Genre, "Adventure") # select the values with "Adventure"
##   [1] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49]  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE
##  [85] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
##  [97] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133]  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
## [205] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [301] FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [325] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469]  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [517] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [541] FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
head(Movie_unique[str_detect(Movie_unique$New_Genre, "Adventure"),], 10) # just see the dataframe
##                         original_title         type                     genre
## 5              300: Rise of an Empire  Feature Film        Action & Adventure
## 8                    A Beautiful Mind  Feature Film        Action & Adventure
## 10             A Good Day to Die Hard  Feature Film        Action & Adventure
## 26                  American Gangster  Feature Film Science Fiction & Fantasy
## 39            AVP: Alien vs. Predator  Feature Film        Action & Adventure
## 49                     Batman Returns  Feature Film Science Fiction & Fantasy
## 50 Batman v Superman: Dawn of Justice  Feature Film        Action & Adventure
## 52                  Battlefield Earth  Feature Film        Action & Adventure
## 56                          Bee Movie  Feature Film        Action & Adventure
## 70                     Bruce Almighty  Feature Film        Action & Adventure
##    runtime mpaa_rating imdb_rating imdb_num_votes  critics_rating critics_score
## 5      111       PG-13         6.0         103789          Rotten            51
## 8       83           G         7.6          78862          Rotten            50
## 10      86           R         4.1            739           Fresh            53
## 26     113           R         2.0           9216          Rotten            34
## 39      81           R         7.6         123769           Fresh            72
## 49      85           R         2.1         122980 Certified Fresh            33
## 50     127          PG         6.8          71979 Certified Fresh            89
## 52      93           R         6.3          42295          Rotten            25
## 56      92          PG         5.6         205065          Rotten            37
## 70      85          PG         6.0           1680          Rotten            33
##    audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 5          Spilled             51           no          21583              16.0
## 8          Upright             81           no          52827              26.0
## 10         Upright             42           no           5481               8.5
## 26         Upright             24           no           1354               4.9
## 39         Upright             87           no         335227             140.0
## 49         Upright             31           no          22899              90.0
## 50         Upright             75           no         124450             245.0
## 52         Spilled             59           no            971              20.0
## 56         Spilled             51           no         106576              10.0
## 70         Upright             65           no         121276              15.0
##    revenue..Millions. language country New_Genre runtimeGrp budget_log
## 5               19.68  English     USA Adventure   [90,120)   2.772589
## 8               43.32  English     USA Adventure    [60,90)   3.258097
## 10               5.77  English     USA Adventure    [60,90)   2.140066
## 26               3.49  English     USA Adventure   [90,120)   1.589235
## 39             449.22  English     USA Adventure    [60,90)   4.941642
## 49              16.18  English     USA Adventure    [60,90)   4.499810
## 50             880.67  English     USA Adventure  [120,267)   5.501258
## 52              34.56  English     USA Adventure   [90,120)   2.995732
## 56              85.98  English     USA Adventure   [90,120)   2.302585
## 70              75.70  English     USA Adventure    [60,90)   2.708050
##    revenue_log Revenue_Ratio
## 5     2.979603     1.2300000
## 8     3.768614     1.6661538
## 10    1.752672     0.6788235
## 26    1.249902     0.7122449
## 39    6.107513     3.2087143
## 49    2.783776     0.1797778
## 50    6.780683     3.5945714
## 52    3.542697     1.7280000
## 56    4.454115     8.5980000
## 70    4.326778     5.0466667
Movie_Genre_Adventure<-str_detect(Movie_unique$New_Genre, "Adventure") # assign into a new dataframe
Movie_Genre_Adventure # see the dataframe
##   [1] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49]  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE
##  [85] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
##  [97] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133]  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
## [205] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [301] FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [325] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469]  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [517] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [541] FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
subadventure<-Movie_unique[Movie_Genre_Adventure,] ## assign subset data into a dataframe
head(subadventure, 10)
##                         original_title         type                     genre
## 5              300: Rise of an Empire  Feature Film        Action & Adventure
## 8                    A Beautiful Mind  Feature Film        Action & Adventure
## 10             A Good Day to Die Hard  Feature Film        Action & Adventure
## 26                  American Gangster  Feature Film Science Fiction & Fantasy
## 39            AVP: Alien vs. Predator  Feature Film        Action & Adventure
## 49                     Batman Returns  Feature Film Science Fiction & Fantasy
## 50 Batman v Superman: Dawn of Justice  Feature Film        Action & Adventure
## 52                  Battlefield Earth  Feature Film        Action & Adventure
## 56                          Bee Movie  Feature Film        Action & Adventure
## 70                     Bruce Almighty  Feature Film        Action & Adventure
##    runtime mpaa_rating imdb_rating imdb_num_votes  critics_rating critics_score
## 5      111       PG-13         6.0         103789          Rotten            51
## 8       83           G         7.6          78862          Rotten            50
## 10      86           R         4.1            739           Fresh            53
## 26     113           R         2.0           9216          Rotten            34
## 39      81           R         7.6         123769           Fresh            72
## 49      85           R         2.1         122980 Certified Fresh            33
## 50     127          PG         6.8          71979 Certified Fresh            89
## 52      93           R         6.3          42295          Rotten            25
## 56      92          PG         5.6         205065          Rotten            37
## 70      85          PG         6.0           1680          Rotten            33
##    audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 5          Spilled             51           no          21583              16.0
## 8          Upright             81           no          52827              26.0
## 10         Upright             42           no           5481               8.5
## 26         Upright             24           no           1354               4.9
## 39         Upright             87           no         335227             140.0
## 49         Upright             31           no          22899              90.0
## 50         Upright             75           no         124450             245.0
## 52         Spilled             59           no            971              20.0
## 56         Spilled             51           no         106576              10.0
## 70         Upright             65           no         121276              15.0
##    revenue..Millions. language country New_Genre runtimeGrp budget_log
## 5               19.68  English     USA Adventure   [90,120)   2.772589
## 8               43.32  English     USA Adventure    [60,90)   3.258097
## 10               5.77  English     USA Adventure    [60,90)   2.140066
## 26               3.49  English     USA Adventure   [90,120)   1.589235
## 39             449.22  English     USA Adventure    [60,90)   4.941642
## 49              16.18  English     USA Adventure    [60,90)   4.499810
## 50             880.67  English     USA Adventure  [120,267)   5.501258
## 52              34.56  English     USA Adventure   [90,120)   2.995732
## 56              85.98  English     USA Adventure   [90,120)   2.302585
## 70              75.70  English     USA Adventure    [60,90)   2.708050
##    revenue_log Revenue_Ratio
## 5     2.979603     1.2300000
## 8     3.768614     1.6661538
## 10    1.752672     0.6788235
## 26    1.249902     0.7122449
## 39    6.107513     3.2087143
## 49    2.783776     0.1797778
## 50    6.780683     3.5945714
## 52    3.542697     1.7280000
## 56    4.454115     8.5980000
## 70    4.326778     5.0466667

Keep only numeric columns for correlation and analysis

numeric_data <- subadventure %>%
  select_if(is.numeric)

Correlation Analysis

correlation_matrix <- cor(numeric_data, use = "complete.obs")
revenue_correlation <- correlation_matrix["revenue..Millions.", ]
sorted_correlation <- sort(revenue_correlation, decreasing = TRUE)
print("Top correlated variables with Revenue across Genre Adventure:")
## [1] "Top correlated variables with Revenue across Genre Adventure:"
print(sorted_correlation)
## revenue..Millions.  budget..Millions.        revenue_log     Facebook_Likes 
##          1.0000000          0.7148536          0.7094601          0.6194620 
##     audience_score        imdb_rating         budget_log      Revenue_Ratio 
##          0.4825824          0.4743989          0.4100942          0.3841060 
##            runtime      critics_score     imdb_num_votes 
##          0.3536316          0.3447954          0.1652981

Linear Regression

lm_model <- lm(revenue..Millions. ~ imdb_rating + critics_score + audience_score + budget..Millions. + Facebook_Likes, data = subadventure)

Variable Importance with Random Forest

Prepare data for random forest

rf_data <- subadventure %>%
  select(revenue..Millions., imdb_rating, critics_score, audience_score, budget..Millions., Facebook_Likes) %>%
  na.omit()

Fit random forest model

rf_model <- randomForest(revenue..Millions. ~ ., data = rf_data, importance = TRUE)

Get variable importance

importance_scores <- importance(rf_model)
importance_scores <- importance_scores[order(-importance_scores[, 1]), ]
print("Variable Importance from Random Forest:")
## [1] "Variable Importance from Random Forest:"
print(importance_scores)
##                     %IncMSE IncNodePurity
## budget..Millions. 15.088286      383011.2
## Facebook_Likes    14.480836      446790.1
## audience_score     7.896802      294501.9
## imdb_rating        6.665376      216396.6
## critics_score      1.377827      233494.4

Visualization 1: Facebook_Likes across Adventure

ggplot(subadventure, aes(x = New_Genre, y = Facebook_Likes)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  theme_minimal() +
  labs(title = "Distribution of Facebook Likes Across Adventure",
     x = "Adventure", y = "Facebook Likes") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visualization 2: Budget across Adventure

ggplot(subadventure, aes(x = New_Genre, y = budget..Millions.)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  theme_minimal() +
  labs(title = paste("Distribution of Budget (Millions) Across Adventure"),
       x = "Adventure", y = "budget..Millions.") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

select the text values with “Comedy”

stringr::str_detect(Movie_unique$New_Genre, "Comedy") # select the values with "Comedy"
##   [1] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [85]  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [157]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [169] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193]  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
## [217]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [277]  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
## [289]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [349] FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE
## [397] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [433]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [541]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [553] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [577] FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [589] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [625] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
head(Movie_unique[str_detect(Movie_unique$New_Genre, "Comedy"),], 10) # just see the dataframe
##                               original_title         type  genre runtime
## 4                                      2012  Feature Film Comedy      98
## 7                            50 First Dates  Feature Film Comedy      87
## 17                                Alexander  Feature Film Comedy      89
## 24   Alvin and the Chipmunks: The Road Chip  Feature Film Comedy      86
## 31              Around the World in 80 Days  Feature Film Comedy      97
## 32                Arthur and the Invisibles  Feature Film Comedy      94
## 36                                Australia  Feature Film Comedy      90
## 43                Ballistic: Ecks vs. Sever  Feature Film Comedy      96
## 62                               Big Hero 6  Feature Film Comedy     100
## 79 Cats & Dogs: The Revenge of Kitty Galore  Feature Film Comedy     119
##    mpaa_rating imdb_rating imdb_num_votes  critics_rating critics_score
## 4        PG-13         6.3           8646 Certified Fresh            44
## 7            R         5.4           6811          Rotten            35
## 17       PG-13         8.5         893008 Certified Fresh            88
## 24       PG-13         2.1           9904           Fresh            11
## 31       PG-13         6.9          12606 Certified Fresh            85
## 32          PG         5.1           1674          Rotten            50
## 36          PG         7.2          44741           Fresh            76
## 43          PG         6.2          12402           Fresh            63
## 62           R         5.9          82737          Rotten            46
## 79       PG-13         6.3         124250 Certified Fresh            75
##    audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 4          Spilled             54           no            445               3.5
## 7          Spilled             31           no          12952              50.0
## 17         Upright             91           no         224598              74.0
## 24         Upright             22           no           3450               6.0
## 31         Upright             69           no         103175              45.0
## 32         Spilled             38           no            683              18.5
## 36         Upright             80           no            921               2.5
## 43         Spilled             53           no           1846              68.0
## 62         Spilled             59           no          62963               8.5
## 79         Spilled             52           no           3326              38.0
##    revenue..Millions. language   country New_Genre runtimeGrp budget_log
## 4                0.75  English       USA    Comedy   [90,120)  1.2527630
## 7               36.35  English       USA    Comedy    [60,90)  3.9120230
## 17             311.59  English   Germany    Comedy    [60,90)  4.3040651
## 24               3.02  English       USA    Comedy    [60,90)  1.7917595
## 31              86.17  English       USA    Comedy   [90,120)  3.8066625
## 32              31.91  English    France    Comedy   [90,120)  2.9177707
## 36              91.38  English Australia    Comedy   [90,120)  0.9162907
## 43             203.39  English       USA    Comedy   [90,120)  4.2195077
## 62              43.53  English       USA    Comedy   [90,120)  2.1400662
## 79             105.61  English       USA    Comedy   [90,120)  3.6375862
##    revenue_log Revenue_Ratio
## 4   -0.2876821     0.2142857
## 7    3.5931942     0.7270000
## 17   5.7416882     4.2106757
## 24   1.1052568     0.5033333
## 31   4.4563221     1.9148889
## 32   3.4629194     1.7248649
## 36   4.5150266    36.5520000
## 43   5.3151253     2.9910294
## 62   3.7734504     5.1211765
## 79   4.6597531     2.7792105
Movie_Genre_Comedy<-str_detect(Movie_unique$New_Genre, "Comedy") # assign into a new dataframe
Movie_Genre_Comedy # see the dataframe
##   [1] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [85]  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [157]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [169] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193]  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
## [217]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [277]  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
## [289]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [349] FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE
## [397] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [433]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [541]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [553] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [577] FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [589] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [625] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
subcomedy<-Movie_unique[Movie_Genre_Comedy,] ## assign subset data into a dataframe
head(subcomedy, 10)
##                               original_title         type  genre runtime
## 4                                      2012  Feature Film Comedy      98
## 7                            50 First Dates  Feature Film Comedy      87
## 17                                Alexander  Feature Film Comedy      89
## 24   Alvin and the Chipmunks: The Road Chip  Feature Film Comedy      86
## 31              Around the World in 80 Days  Feature Film Comedy      97
## 32                Arthur and the Invisibles  Feature Film Comedy      94
## 36                                Australia  Feature Film Comedy      90
## 43                Ballistic: Ecks vs. Sever  Feature Film Comedy      96
## 62                               Big Hero 6  Feature Film Comedy     100
## 79 Cats & Dogs: The Revenge of Kitty Galore  Feature Film Comedy     119
##    mpaa_rating imdb_rating imdb_num_votes  critics_rating critics_score
## 4        PG-13         6.3           8646 Certified Fresh            44
## 7            R         5.4           6811          Rotten            35
## 17       PG-13         8.5         893008 Certified Fresh            88
## 24       PG-13         2.1           9904           Fresh            11
## 31       PG-13         6.9          12606 Certified Fresh            85
## 32          PG         5.1           1674          Rotten            50
## 36          PG         7.2          44741           Fresh            76
## 43          PG         6.2          12402           Fresh            63
## 62           R         5.9          82737          Rotten            46
## 79       PG-13         6.3         124250 Certified Fresh            75
##    audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 4          Spilled             54           no            445               3.5
## 7          Spilled             31           no          12952              50.0
## 17         Upright             91           no         224598              74.0
## 24         Upright             22           no           3450               6.0
## 31         Upright             69           no         103175              45.0
## 32         Spilled             38           no            683              18.5
## 36         Upright             80           no            921               2.5
## 43         Spilled             53           no           1846              68.0
## 62         Spilled             59           no          62963               8.5
## 79         Spilled             52           no           3326              38.0
##    revenue..Millions. language   country New_Genre runtimeGrp budget_log
## 4                0.75  English       USA    Comedy   [90,120)  1.2527630
## 7               36.35  English       USA    Comedy    [60,90)  3.9120230
## 17             311.59  English   Germany    Comedy    [60,90)  4.3040651
## 24               3.02  English       USA    Comedy    [60,90)  1.7917595
## 31              86.17  English       USA    Comedy   [90,120)  3.8066625
## 32              31.91  English    France    Comedy   [90,120)  2.9177707
## 36              91.38  English Australia    Comedy   [90,120)  0.9162907
## 43             203.39  English       USA    Comedy   [90,120)  4.2195077
## 62              43.53  English       USA    Comedy   [90,120)  2.1400662
## 79             105.61  English       USA    Comedy   [90,120)  3.6375862
##    revenue_log Revenue_Ratio
## 4   -0.2876821     0.2142857
## 7    3.5931942     0.7270000
## 17   5.7416882     4.2106757
## 24   1.1052568     0.5033333
## 31   4.4563221     1.9148889
## 32   3.4629194     1.7248649
## 36   4.5150266    36.5520000
## 43   5.3151253     2.9910294
## 62   3.7734504     5.1211765
## 79   4.6597531     2.7792105

Keep only numeric columns for correlation and analysis

numeric_data <- subadventure %>%
  select_if(is.numeric)

Correlation Analysis

correlation_matrix <- cor(numeric_data, use = "complete.obs")
revenue_correlation <- correlation_matrix["revenue..Millions.", ]
sorted_correlation <- sort(revenue_correlation, decreasing = TRUE)
print("Top correlated variables with Revenue Across Genre Comedy:")
## [1] "Top correlated variables with Revenue Across Genre Comedy:"
print(sorted_correlation)
## revenue..Millions.  budget..Millions.        revenue_log     Facebook_Likes 
##          1.0000000          0.7148536          0.7094601          0.6194620 
##     audience_score        imdb_rating         budget_log      Revenue_Ratio 
##          0.4825824          0.4743989          0.4100942          0.3841060 
##            runtime      critics_score     imdb_num_votes 
##          0.3536316          0.3447954          0.1652981

Linear Regression

lm_model <- lm(revenue..Millions. ~ imdb_rating + critics_score + audience_score + budget..Millions. + Facebook_Likes, data = subadventure)

Variable Importance with Random Forest

Prepare data for random forest

rf_data <- subadventure %>%
  select(revenue..Millions., imdb_rating, critics_score, audience_score, budget..Millions., Facebook_Likes) %>%
  na.omit()

Fit random forest model

rf_model <- randomForest(revenue..Millions. ~ ., data = rf_data, importance = TRUE)

Get variable importance

importance_scores <- importance(rf_model)
importance_scores <- importance_scores[order(-importance_scores[, 1]), ]
print("Variable Importance from Random Forest:")
## [1] "Variable Importance from Random Forest:"
print(importance_scores)
##                     %IncMSE IncNodePurity
## Facebook_Likes    13.854769      457815.4
## budget..Millions. 13.644952      375615.7
## audience_score     7.768896      233046.9
## imdb_rating        6.783975      245426.8
## critics_score      1.753180      234366.1

Visualization 1: Facebook_Likes across Comedy

ggplot(subcomedy, aes(x = New_Genre, y = Facebook_Likes)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  theme_minimal() +
  labs(title = "Distribution of Facebook Likes Across Comedy",
     x = "Comedy", y = "Facebook Likes") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visualization 2: Budget across Comedy

ggplot(subcomedy, aes(x = New_Genre, y = budget..Millions.)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  theme_minimal() +
  labs(title = paste("Distribution of Budget (Millions) Across Comedy"),
       x = "Comedy", y = "budget..Millions.") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

select the text values with “Drame”

stringr::str_detect(Movie_unique$New_Genre, "Drame") # select the values with "Drame"
##   [1]  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE
##  [13] FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE
##  [25] FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [37]  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE
##  [61] FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE
##  [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
##  [85] FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97]  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE
## [109]  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
## [121]  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE
## [133] FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE
## [157] FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [169]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE
## [181] FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE
## [193] FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [205]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE
## [229]  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
## [241] FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## [253]  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
## [265]  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
## [277] FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE
## [289] FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE
## [301]  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [313] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [325]  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE
## [337]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE
## [349]  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
## [361]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [373] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE
## [385]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [397]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE
## [409]  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
## [421] FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE
## [433] FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [445] FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE
## [457]  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## [469] FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
## [481] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
## [493]  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE
## [505] FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [517]  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [529] FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE
## [553]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [565]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE
## [577] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [589]  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE
## [601]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE
## [613]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [625] FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
## [637]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
head(Movie_unique[str_detect(Movie_unique$New_Genre, "Drame"),], 10) # just see the dataframe
##                      original_title         type genre runtime mpaa_rating
## 1          10,000 B.C.              Feature Film Drama     134           R
## 2                   102 Dalmatians  Feature Film Drama     108          PG
## 6                         47 Ronin  Feature Film Drama     106          PG
## 9                A Christmas Carol  Feature Film Drama     100           R
## 11              A Sound of Thunder  Feature Film Drama      95           R
## 15                           Agora  Feature Film Drama     100           R
## 16                   Air Force One  Feature Film Drama      93           R
## 18                             Ali  Feature Film Drama     112           R
## 20 Alice Through the Looking Glass  Feature Film Drama      90           R
## 22                       Allegiant  Feature Film Drama      85          PG
##    imdb_rating imdb_num_votes  critics_rating critics_score audience_rating
## 1          6.8           9025           Fresh            60         Upright
## 2          4.9           5136          Rotten             5         Spilled
## 6          7.8          12450           Fresh            75         Spilled
## 9          7.0           8320          Rotten            70         Spilled
## 11         6.2           1935          Rotten            35         Upright
## 15         6.1          13682          Rotten            54         Spilled
## 16         7.0          36909           Fresh            78         Upright
## 18         7.8         246587 Certified Fresh            93         Upright
## 20         5.7            390          Rotten            29         Spilled
## 22         7.8          26628           Fresh            76         Upright
##    audience_score best_pic_nom Facebook_Likes budget..Millions.
## 1              76           no          23343              12.6
## 2              13           no          84182              45.0
## 6              85           no          20965              50.0
## 9              74           no          48878              65.0
## 11             70           no          41890              50.0
## 15             37           no         101829              48.0
## 16             71           no          23603              52.0
## 18             87           no          14196              12.0
## 20             25           no          80806              12.0
## 22             80           no          12452             100.0
##    revenue..Millions. language     country New_Genre runtimeGrp budget_log
## 1               18.66  English New Zealand     Drame  [120,267)   2.533697
## 2               60.22  English         USA     Drame   [90,120)   3.806662
## 6              240.36  English         USA     Drame   [90,120)   3.912023
## 9              235.67  English         USA     Drame   [90,120)   4.174387
## 11              40.83  English          UK     Drame   [90,120)   3.912023
## 15              85.50  English       Spain     Drame   [90,120)   3.871201
## 16             216.49  English         USA     Drame   [90,120)   3.951244
## 18             305.15  English         USA     Drame   [90,120)   2.484907
## 20              32.25  English         USA     Drame   [90,120)   2.484907
## 22             243.40  English         USA     Drame    [60,90)   4.605170
##    revenue_log Revenue_Ratio
## 1     2.926382      1.480952
## 2     4.098005      1.338222
## 6     5.482138      4.807200
## 9     5.462433      3.625692
## 11    3.709417      0.816600
## 15    4.448516      1.781250
## 16    5.377544      4.163269
## 18    5.720803     25.429167
## 20    3.473518      2.687500
## 22    5.494706      2.434000
Movie_Genre_Drame<-str_detect(Movie_unique$New_Genre, "Drame") # assign into a new dataframe
Movie_Genre_Drame # see the dataframe
##   [1]  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE
##  [13] FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE
##  [25] FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [37]  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE
##  [61] FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE
##  [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
##  [85] FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97]  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE
## [109]  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
## [121]  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE
## [133] FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE
## [157] FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [169]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE
## [181] FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE
## [193] FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [205]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE
## [229]  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
## [241] FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## [253]  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
## [265]  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
## [277] FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE
## [289] FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE
## [301]  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [313] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [325]  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE
## [337]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE
## [349]  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
## [361]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [373] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE
## [385]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [397]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE
## [409]  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
## [421] FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE
## [433] FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [445] FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE
## [457]  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## [469] FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
## [481] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
## [493]  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE
## [505] FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [517]  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [529] FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE
## [553]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [565]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE
## [577] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [589]  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE
## [601]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE
## [613]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [625] FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
## [637]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
subdrama<-Movie_unique[Movie_Genre_Drame,] ## assign subset data into a dataframe
head(subdrama, 10)
##                      original_title         type genre runtime mpaa_rating
## 1          10,000 B.C.              Feature Film Drama     134           R
## 2                   102 Dalmatians  Feature Film Drama     108          PG
## 6                         47 Ronin  Feature Film Drama     106          PG
## 9                A Christmas Carol  Feature Film Drama     100           R
## 11              A Sound of Thunder  Feature Film Drama      95           R
## 15                           Agora  Feature Film Drama     100           R
## 16                   Air Force One  Feature Film Drama      93           R
## 18                             Ali  Feature Film Drama     112           R
## 20 Alice Through the Looking Glass  Feature Film Drama      90           R
## 22                       Allegiant  Feature Film Drama      85          PG
##    imdb_rating imdb_num_votes  critics_rating critics_score audience_rating
## 1          6.8           9025           Fresh            60         Upright
## 2          4.9           5136          Rotten             5         Spilled
## 6          7.8          12450           Fresh            75         Spilled
## 9          7.0           8320          Rotten            70         Spilled
## 11         6.2           1935          Rotten            35         Upright
## 15         6.1          13682          Rotten            54         Spilled
## 16         7.0          36909           Fresh            78         Upright
## 18         7.8         246587 Certified Fresh            93         Upright
## 20         5.7            390          Rotten            29         Spilled
## 22         7.8          26628           Fresh            76         Upright
##    audience_score best_pic_nom Facebook_Likes budget..Millions.
## 1              76           no          23343              12.6
## 2              13           no          84182              45.0
## 6              85           no          20965              50.0
## 9              74           no          48878              65.0
## 11             70           no          41890              50.0
## 15             37           no         101829              48.0
## 16             71           no          23603              52.0
## 18             87           no          14196              12.0
## 20             25           no          80806              12.0
## 22             80           no          12452             100.0
##    revenue..Millions. language     country New_Genre runtimeGrp budget_log
## 1               18.66  English New Zealand     Drame  [120,267)   2.533697
## 2               60.22  English         USA     Drame   [90,120)   3.806662
## 6              240.36  English         USA     Drame   [90,120)   3.912023
## 9              235.67  English         USA     Drame   [90,120)   4.174387
## 11              40.83  English          UK     Drame   [90,120)   3.912023
## 15              85.50  English       Spain     Drame   [90,120)   3.871201
## 16             216.49  English         USA     Drame   [90,120)   3.951244
## 18             305.15  English         USA     Drame   [90,120)   2.484907
## 20              32.25  English         USA     Drame   [90,120)   2.484907
## 22             243.40  English         USA     Drame    [60,90)   4.605170
##    revenue_log Revenue_Ratio
## 1     2.926382      1.480952
## 2     4.098005      1.338222
## 6     5.482138      4.807200
## 9     5.462433      3.625692
## 11    3.709417      0.816600
## 15    4.448516      1.781250
## 16    5.377544      4.163269
## 18    5.720803     25.429167
## 20    3.473518      2.687500
## 22    5.494706      2.434000

Correlation Analysis

correlation_matrix <- cor(numeric_data, use = "complete.obs")
revenue_correlation <- correlation_matrix["revenue..Millions.", ]
sorted_correlation <- sort(revenue_correlation, decreasing = TRUE)
print("Top correlated variables with Revenue Across Genre Drama:")
## [1] "Top correlated variables with Revenue Across Genre Drama:"
print(sorted_correlation)
## revenue..Millions.  budget..Millions.        revenue_log     Facebook_Likes 
##          1.0000000          0.7148536          0.7094601          0.6194620 
##     audience_score        imdb_rating         budget_log      Revenue_Ratio 
##          0.4825824          0.4743989          0.4100942          0.3841060 
##            runtime      critics_score     imdb_num_votes 
##          0.3536316          0.3447954          0.1652981

Linear Regression

lm_model <- lm(revenue..Millions. ~ imdb_rating + critics_score + audience_score + budget..Millions. + Facebook_Likes, data = subadventure)

Variable Importance with Random Forest

Prepare data for random forest

rf_data <- subadventure %>%
  select(revenue..Millions., imdb_rating, critics_score, audience_score, budget..Millions., Facebook_Likes) %>%
  na.omit()

Fit random forest model

rf_model <- randomForest(revenue..Millions. ~ ., data = rf_data, importance = TRUE)

Get variable importance

importance_scores <- importance(rf_model)
importance_scores <- importance_scores[order(-importance_scores[, 1]), ]
print("Variable Importance from Random Forest:")
## [1] "Variable Importance from Random Forest:"
print(importance_scores)
##                     %IncMSE IncNodePurity
## Facebook_Likes    14.173612      438727.0
## budget..Millions. 12.064587      375150.1
## audience_score     6.642722      262499.1
## imdb_rating        6.214101      212782.4
## critics_score      1.119114      262605.9

Visualization 1: Facebook_Likes across Drama

ggplot(subdrama, aes(x = New_Genre, y = Facebook_Likes)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  theme_minimal() +
  labs(title = "Distribution of Facebook Likes Across Drama",
     x = "Drama", y = "Facebook Likes") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visualization 2: Budget across Drama

ggplot(subdrama, aes(x = New_Genre, y = budget..Millions.)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  theme_minimal() +
  labs(title = paste("Distribution of Budget (Millions) Across Drama"),
       x = "Drama", y = "budget..Millions.") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

select the text values with “Other”

stringr::str_detect(Movie_unique$New_Genre, "Other") # select the values with "Other"
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE
##  [25]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [121] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [145]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
## [181] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [241]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [313] FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [325] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE
## [349] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [397] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [469] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [541] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [577]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
## [637] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
head(Movie_unique[str_detect(Movie_unique$New_Genre, "Other"),], 10) # just see the dataframe
##                              original_title         type       genre runtime
## 14                             After Earth   Documentary Documentary      74
## 21                           All That Jazz   Documentary Documentary     122
## 23    Alvin and the Chipmunks: Chipwrecked   Documentary Documentary      86
## 25 Alvin and the Chipmunks: The Squeakquel  Feature Film       Other     127
## 38                 Avengers: Age of Ultron   Documentary Documentary      88
## 44                                 Bandits   Documentary Documentary      90
## 51                      Battle Los Angeles   Documentary Documentary      93
## 60                        Bicentennial Man   Documentary Documentary      86
## 69                              Braveheart   Documentary Documentary      40
## 92     Cloudy with a Chance of Meatballs 2   Documentary Documentary      83
##    mpaa_rating imdb_rating imdb_num_votes  critics_rating critics_score
## 14     Unrated         7.3            285           Fresh            78
## 21          PG         7.9           1346           Fresh            94
## 23     Unrated         2.1         122980 Certified Fresh            37
## 25       PG-13         7.0          79866           Fresh            67
## 38     Unrated         7.5            880           Fresh            90
## 44     Unrated         7.8            180           Fresh            84
## 51          PG         8.4            390           Fresh            95
## 60     Unrated         2.1           9904           Fresh            31
## 69           G         7.0            723           Fresh           100
## 92          PG         3.8          10522           Fresh            33
##    audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 14         Upright             89           no          14168              9.50
## 21         Upright             85           no          71476             21.80
## 23         Upright             35           no          34073             14.00
## 25         Upright             71           no         345227             98.00
## 38         Upright             89           no         292000             74.00
## 44         Upright             79           no          13934             80.00
## 51         Upright             92           no           4001             30.00
## 60         Upright             22           no            254              0.32
## 69         Upright             68           no          81976             30.00
## 92         Upright             32           no            848              3.00
##    revenue..Millions. language country New_Genre runtimeGrp budget_log
## 14             293.33  English     USA     Other    [60,90)   2.251292
## 21              64.57  English     USA     Other  [120,267)   3.081910
## 23              30.75  English     USA     Other    [60,90)   2.639057
## 25             433.01  English     USA     Other  [120,267)   4.584967
## 38            1156.73  English     USA     Other    [60,90)   4.304065
## 44              90.87  English     USA     Other   [90,120)   4.382027
## 51             287.55  English     USA     Other   [90,120)   3.401197
## 60               0.12  English     USA     Other    [60,90)  -1.139434
## 69              61.81  English     USA     Other    [39,60)   3.401197
## 92               2.36  English     USA     Other    [60,90)   1.098612
##    revenue_log Revenue_Ratio
## 14   5.6812983    30.8768421
## 21   4.1677499     2.9619266
## 23   3.4258900     2.1964286
## 25   6.0707608     4.4184694
## 38   7.0533523    15.6314865
## 44   4.5094299     1.1358750
## 51   5.6613968     9.5850000
## 60  -2.1202635     0.3750000
## 69   4.1240652     2.0603333
## 92   0.8586616     0.7866667
Movie_Genre_Other<-str_detect(Movie_unique$New_Genre, "Other") # assign into a new dataframe
Movie_Genre_Other # see the dataframe
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE
##  [25]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [121] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [145]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
## [181] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [241]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [313] FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [325] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE
## [349] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [397] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [469] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [541] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [577]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
## [637] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
subother<-Movie_unique[Movie_Genre_Other,] ## assign subset data into a dataframe
head(subother, 10)
##                              original_title         type       genre runtime
## 14                             After Earth   Documentary Documentary      74
## 21                           All That Jazz   Documentary Documentary     122
## 23    Alvin and the Chipmunks: Chipwrecked   Documentary Documentary      86
## 25 Alvin and the Chipmunks: The Squeakquel  Feature Film       Other     127
## 38                 Avengers: Age of Ultron   Documentary Documentary      88
## 44                                 Bandits   Documentary Documentary      90
## 51                      Battle Los Angeles   Documentary Documentary      93
## 60                        Bicentennial Man   Documentary Documentary      86
## 69                              Braveheart   Documentary Documentary      40
## 92     Cloudy with a Chance of Meatballs 2   Documentary Documentary      83
##    mpaa_rating imdb_rating imdb_num_votes  critics_rating critics_score
## 14     Unrated         7.3            285           Fresh            78
## 21          PG         7.9           1346           Fresh            94
## 23     Unrated         2.1         122980 Certified Fresh            37
## 25       PG-13         7.0          79866           Fresh            67
## 38     Unrated         7.5            880           Fresh            90
## 44     Unrated         7.8            180           Fresh            84
## 51          PG         8.4            390           Fresh            95
## 60     Unrated         2.1           9904           Fresh            31
## 69           G         7.0            723           Fresh           100
## 92          PG         3.8          10522           Fresh            33
##    audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 14         Upright             89           no          14168              9.50
## 21         Upright             85           no          71476             21.80
## 23         Upright             35           no          34073             14.00
## 25         Upright             71           no         345227             98.00
## 38         Upright             89           no         292000             74.00
## 44         Upright             79           no          13934             80.00
## 51         Upright             92           no           4001             30.00
## 60         Upright             22           no            254              0.32
## 69         Upright             68           no          81976             30.00
## 92         Upright             32           no            848              3.00
##    revenue..Millions. language country New_Genre runtimeGrp budget_log
## 14             293.33  English     USA     Other    [60,90)   2.251292
## 21              64.57  English     USA     Other  [120,267)   3.081910
## 23              30.75  English     USA     Other    [60,90)   2.639057
## 25             433.01  English     USA     Other  [120,267)   4.584967
## 38            1156.73  English     USA     Other    [60,90)   4.304065
## 44              90.87  English     USA     Other   [90,120)   4.382027
## 51             287.55  English     USA     Other   [90,120)   3.401197
## 60               0.12  English     USA     Other    [60,90)  -1.139434
## 69              61.81  English     USA     Other    [39,60)   3.401197
## 92               2.36  English     USA     Other    [60,90)   1.098612
##    revenue_log Revenue_Ratio
## 14   5.6812983    30.8768421
## 21   4.1677499     2.9619266
## 23   3.4258900     2.1964286
## 25   6.0707608     4.4184694
## 38   7.0533523    15.6314865
## 44   4.5094299     1.1358750
## 51   5.6613968     9.5850000
## 60  -2.1202635     0.3750000
## 69   4.1240652     2.0603333
## 92   0.8586616     0.7866667

Correlation Analysis

correlation_matrix <- cor(numeric_data, use = "complete.obs")
revenue_correlation <- correlation_matrix["revenue..Millions.", ]
sorted_correlation <- sort(revenue_correlation, decreasing = TRUE)
print("Top correlated variables with Revenue Across Genre Other:")
## [1] "Top correlated variables with Revenue Across Genre Other:"
print(sorted_correlation)
## revenue..Millions.  budget..Millions.        revenue_log     Facebook_Likes 
##          1.0000000          0.7148536          0.7094601          0.6194620 
##     audience_score        imdb_rating         budget_log      Revenue_Ratio 
##          0.4825824          0.4743989          0.4100942          0.3841060 
##            runtime      critics_score     imdb_num_votes 
##          0.3536316          0.3447954          0.1652981

Linear Regression

lm_model <- lm(revenue..Millions. ~ imdb_rating + critics_score + audience_score + budget..Millions. + Facebook_Likes, data = subadventure)

Variable Importance with Random Forest

Prepare data for random forest

rf_data <- subadventure %>%
  select(revenue..Millions., imdb_rating, critics_score, audience_score, budget..Millions., Facebook_Likes) %>%
  na.omit()

Fit random forest model

rf_model <- randomForest(revenue..Millions. ~ ., data = rf_data, importance = TRUE)

Get variable importance

importance_scores <- importance(rf_model)
importance_scores <- importance_scores[order(-importance_scores[, 1]), ]
print("Variable Importance from Random Forest:")
## [1] "Variable Importance from Random Forest:"
print(importance_scores)
##                     %IncMSE IncNodePurity
## Facebook_Likes    13.943548      470165.6
## budget..Millions. 13.533702      353914.7
## audience_score     9.615821      266087.3
## imdb_rating        8.644461      246375.8
## critics_score      2.042307      229212.9

Visualization 1: Facebook_Likes across Other

ggplot(subother, aes(x = New_Genre, y = Facebook_Likes)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  theme_minimal() +
  labs(title = "Distribution of Facebook Likes Across Other",
     x = "Other", y = "Facebook Likes") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visualization 2: Budget across Other

ggplot(subother, aes(x = New_Genre, y = budget..Millions.)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  theme_minimal() +
  labs(title = paste("Distribution of Budget (Millions) Across Other"),
       x = "Other", y = "budget..Millions.") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

select the text values with “Theatre/International”

stringr::str_detect(Movie_unique$New_Genre, "Theatre/International") # select the values with "Theatre/International"
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [217] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
## [493] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [625]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
head(Movie_unique[str_detect(Movie_unique$New_Genre, "Theatre/International"),], 10) # just see the dataframe
##                      original_title         type                     genre
## 12    A.I. Artificial Intelligence  Feature Film Musical & Performing Arts
## 30                      Armageddon  Feature Film Musical & Performing Arts
## 53                      Battleship  Feature Film Art House & International
## 66                    Body of Lies   Documentary Musical & Performing Arts
## 94                   Cold Mountain  Feature Film Musical & Performing Arts
## 108                       Daylight  Feature Film Art House & International
## 128                        Elysium  Feature Film Art House & International
## 171                      Gladiator  Feature Film Art House & International
## 187                        Hancock   Documentary Musical & Performing Arts
## 217 How the Grinch Stole Christmas  Feature Film Musical & Performing Arts
##     runtime mpaa_rating imdb_rating imdb_num_votes  critics_rating
## 12      121           R         5.2         275125           Fresh
## 30      117          PG         3.6           1010           Fresh
## 53      115           R         6.4           3688           Fresh
## 66      103          PG         7.7          11197 Certified Fresh
## 94      115          PG         6.7         134031          Rotten
## 108     115     Unrated         7.5           9990           Fresh
## 128      86           R         6.5           5762           Fresh
## 171     107           R         6.8           9025           Fresh
## 187      96     Unrated         5.5          32751          Rotten
## 217      85           R         6.9          87215 Certified Fresh
##     critics_score audience_rating audience_score best_pic_nom Facebook_Likes
## 12             59         Upright             63           no          86217
## 30             39         Upright             30           no          17029
## 53             64         Upright             54           no         126679
## 66             95         Upright             81           no          59824
## 94             64         Spilled             78           no         461110
## 108            71         Upright             86           no          25126
## 128            64         Spilled             48           no           2689
## 171            61         Upright             77           no           6521
## 187            15         Spilled             36           no          11584
## 217            63         Upright             71           no         224146
##     budget..Millions. revenue..Millions. language country             New_Genre
## 12               70.0              61.28  English     USA Theatre/International
## 30               13.0              11.78  English     USA Theatre/International
## 53               20.0              88.35  English     USA Theatre/International
## 66               22.0              40.27  English     USA Theatre/International
## 94               90.0             485.02  English     USA Theatre/International
## 108              53.0             126.69  English     USA Theatre/International
## 128               3.3               3.99  English     USA Theatre/International
## 171             145.0             272.91  English     USA Theatre/International
## 187              40.0              10.66  English     USA Theatre/International
## 217              61.0             369.33  English     USA Theatre/International
##     runtimeGrp budget_log revenue_log Revenue_Ratio
## 12   [120,267)   4.248495    4.115454     0.8754286
## 30    [90,120)   2.564949    2.466403     0.9061538
## 53    [90,120)   2.995732    4.481306     4.4175000
## 66    [90,120)   3.091042    3.695607     1.8304545
## 94    [90,120)   4.499810    6.184190     5.3891111
## 108   [90,120)   3.970292    4.841743     2.3903774
## 128    [60,90)   1.193922    1.383791     1.2090909
## 171   [90,120)   4.976734    5.609142     1.8821379
## 187   [90,120)   3.688879    2.366498     0.2665000
## 217    [60,90)   4.110874    5.911691     6.0545902
Movie_Genre_Thea<-str_detect(Movie_unique$New_Genre, "Theatre/International") # assign into a new dataframe
Movie_Genre_Thea # see the dataframe
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [217] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
## [493] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [505] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [589] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [625]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
subtheatre<-Movie_unique[Movie_Genre_Thea,] ## assign subset data into a dataframe
head(subtheatre, 10)
##                      original_title         type                     genre
## 12    A.I. Artificial Intelligence  Feature Film Musical & Performing Arts
## 30                      Armageddon  Feature Film Musical & Performing Arts
## 53                      Battleship  Feature Film Art House & International
## 66                    Body of Lies   Documentary Musical & Performing Arts
## 94                   Cold Mountain  Feature Film Musical & Performing Arts
## 108                       Daylight  Feature Film Art House & International
## 128                        Elysium  Feature Film Art House & International
## 171                      Gladiator  Feature Film Art House & International
## 187                        Hancock   Documentary Musical & Performing Arts
## 217 How the Grinch Stole Christmas  Feature Film Musical & Performing Arts
##     runtime mpaa_rating imdb_rating imdb_num_votes  critics_rating
## 12      121           R         5.2         275125           Fresh
## 30      117          PG         3.6           1010           Fresh
## 53      115           R         6.4           3688           Fresh
## 66      103          PG         7.7          11197 Certified Fresh
## 94      115          PG         6.7         134031          Rotten
## 108     115     Unrated         7.5           9990           Fresh
## 128      86           R         6.5           5762           Fresh
## 171     107           R         6.8           9025           Fresh
## 187      96     Unrated         5.5          32751          Rotten
## 217      85           R         6.9          87215 Certified Fresh
##     critics_score audience_rating audience_score best_pic_nom Facebook_Likes
## 12             59         Upright             63           no          86217
## 30             39         Upright             30           no          17029
## 53             64         Upright             54           no         126679
## 66             95         Upright             81           no          59824
## 94             64         Spilled             78           no         461110
## 108            71         Upright             86           no          25126
## 128            64         Spilled             48           no           2689
## 171            61         Upright             77           no           6521
## 187            15         Spilled             36           no          11584
## 217            63         Upright             71           no         224146
##     budget..Millions. revenue..Millions. language country             New_Genre
## 12               70.0              61.28  English     USA Theatre/International
## 30               13.0              11.78  English     USA Theatre/International
## 53               20.0              88.35  English     USA Theatre/International
## 66               22.0              40.27  English     USA Theatre/International
## 94               90.0             485.02  English     USA Theatre/International
## 108              53.0             126.69  English     USA Theatre/International
## 128               3.3               3.99  English     USA Theatre/International
## 171             145.0             272.91  English     USA Theatre/International
## 187              40.0              10.66  English     USA Theatre/International
## 217              61.0             369.33  English     USA Theatre/International
##     runtimeGrp budget_log revenue_log Revenue_Ratio
## 12   [120,267)   4.248495    4.115454     0.8754286
## 30    [90,120)   2.564949    2.466403     0.9061538
## 53    [90,120)   2.995732    4.481306     4.4175000
## 66    [90,120)   3.091042    3.695607     1.8304545
## 94    [90,120)   4.499810    6.184190     5.3891111
## 108   [90,120)   3.970292    4.841743     2.3903774
## 128    [60,90)   1.193922    1.383791     1.2090909
## 171   [90,120)   4.976734    5.609142     1.8821379
## 187   [90,120)   3.688879    2.366498     0.2665000
## 217    [60,90)   4.110874    5.911691     6.0545902

Correlation Analysis

correlation_matrix <- cor(numeric_data, use = "complete.obs")
revenue_correlation <- correlation_matrix["revenue..Millions.", ]
sorted_correlation <- sort(revenue_correlation, decreasing = TRUE)
print("Top correlated variables with Revenue:")
## [1] "Top correlated variables with Revenue:"
print(sorted_correlation)
## revenue..Millions.  budget..Millions.        revenue_log     Facebook_Likes 
##          1.0000000          0.7148536          0.7094601          0.6194620 
##     audience_score        imdb_rating         budget_log      Revenue_Ratio 
##          0.4825824          0.4743989          0.4100942          0.3841060 
##            runtime      critics_score     imdb_num_votes 
##          0.3536316          0.3447954          0.1652981

Linear Regression

lm_model <- lm(revenue..Millions. ~ imdb_rating + critics_score + audience_score + budget..Millions. + Facebook_Likes, data = subadventure)

Variable Importance with Random Forest

Prepare data for random forest

rf_data <- subadventure %>%
  select(revenue..Millions., imdb_rating, critics_score, audience_score, budget..Millions., Facebook_Likes) %>%
  na.omit()

Fit random forest model

rf_model <- randomForest(revenue..Millions. ~ ., data = rf_data, importance = TRUE)

Get variable importance

importance_scores <- importance(rf_model)
importance_scores <- importance_scores[order(-importance_scores[, 1]), ]
print("Variable Importance from Random Forest:")
## [1] "Variable Importance from Random Forest:"
print(importance_scores)
##                     %IncMSE IncNodePurity
## Facebook_Likes    15.325593      471319.7
## budget..Millions. 12.545457      395365.0
## audience_score     7.701132      243939.3
## imdb_rating        6.945438      238325.8
## critics_score      2.031686      223318.6

Visualization 1: Facebook_Likes across Theatre/International

ggplot(subtheatre, aes(x = New_Genre, y = Facebook_Likes)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  theme_minimal() +
  labs(title = "Distribution of Facebook Likes Across Theatre/International",
     x = "Theatre/International", y = "Facebook Likes") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visualization 2: Budget across Theatre/International

ggplot(subtheatre, aes(x = New_Genre, y = budget..Millions.)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  theme_minimal() +
  labs(title = paste("Distribution of Budget (Millions) Across Theatre/International"),
       x = "Theatre/International", y = "budget..Millions.") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

select the text values with “Thriller”

stringr::str_detect(Movie_unique$New_Genre, "Thriller") # select the values with "Thriller"
##   [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13]  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
##  [61]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [121] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
## [193] FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE
## [289] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [313]  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [385] FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [421]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [445]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [505] FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE
## [589] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
head(Movie_unique[str_detect(Movie_unique$New_Genre, "Thriller"),], 10) # just see the dataframe
##                      original_title         type              genre runtime
## 3                 2 Fast 2 Furious  Feature Film Mystery & Suspense      97
## 13 Abraham Lincoln: Vampire Hunter  Feature Film Mystery & Suspense      88
## 19             Alice in Wonderland  Feature Film Mystery & Suspense     124
## 33                Arthur Christmas  Feature Film             Horror     100
## 34    Asterix at the Olympic Games  Feature Film             Horror      97
## 45                Basic Instinct 2  Feature Film Mystery & Suspense     121
## 48                  Batman Forever  Feature Film             Horror     108
## 57                         Ben-Hur  Feature Film Mystery & Suspense     122
## 61                        Big Fish  Feature Film             Horror      91
## 71      Captain America: Civil War  Feature Film             Horror      84
##    mpaa_rating imdb_rating imdb_num_votes  critics_rating critics_score
## 3        PG-13         6.3          54771          Rotten            40
## 13          PG         6.8          16366           Fresh            77
## 19           R         7.1         259822           Fresh            69
## 33           R         4.5          16824          Rotten            10
## 34           R         7.1          25264          Rotten            69
## 45           R         6.7          58907           Fresh            67
## 48           R         3.0           9216          Rotten            45
## 57           R         2.1           9904           Fresh            29
## 61           R         6.9          19539          Rotten            59
## 71           R         5.6          19285 Certified Fresh            77
##    audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 3          Spilled             49           no          35296              16.0
## 13         Upright             73           no         211234             150.0
## 19         Upright             70           no          79957              58.0
## 33         Spilled             31           no           2707              20.0
## 34         Spilled             78           no         224355             145.0
## 45         Upright             70           no         101899             137.0
## 48         Upright             35           no          26057               1.0
## 57         Upright             20           no            379               2.8
## 61         Upright             70           no          16138              37.0
## 71         Spilled             43           no         264798              81.0
##    revenue..Millions. language country New_Genre runtimeGrp budget_log
## 3               31.56  English     USA  Thriller   [90,120)   2.772589
## 13             341.13  English     USA  Thriller    [60,90)   5.010635
## 19             150.17  English     USA  Thriller  [120,267)   4.060443
## 33              32.01  English      UK  Thriller   [90,120)   2.995732
## 34             609.12   French  France  Thriller   [90,120)   4.976734
## 45              85.13  English      UK  Thriller  [120,267)   4.919981
## 48              17.51  English     USA  Thriller   [90,120)   0.000000
## 57               1.83  English     USA  Thriller  [120,267)   1.029619
## 61             165.34  English     USA  Thriller   [90,120)   3.610918
## 71             403.80  English     USA  Thriller    [60,90)   4.394449
##    revenue_log Revenue_Ratio
## 3     3.451890     1.9725000
## 13    5.832264     2.2742000
## 19    5.011768     2.5891379
## 33    3.466048     1.6005000
## 34    6.412015     4.2008276
## 45    4.444179     0.6213869
## 48    2.862772    17.5100000
## 57    0.604316     0.6535714
## 61    5.108004     4.4686486
## 71    6.000920     4.9851852
Movie_Genre_Thriller<-str_detect(Movie_unique$New_Genre, "Thriller") # assign into a new dataframe
Movie_Genre_Thriller # see the dataframe
##   [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13]  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
##  [61]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [121] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
## [193] FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE
## [289] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [313]  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [385] FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [421]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [445]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [505] FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [517] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [529] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [541] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [553] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE
## [589] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [601] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [613] FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [625] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [637] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
subthriller<-Movie_unique[Movie_Genre_Thriller,] ## assign subset data into a dataframe
head(subthriller, 10)
##                      original_title         type              genre runtime
## 3                 2 Fast 2 Furious  Feature Film Mystery & Suspense      97
## 13 Abraham Lincoln: Vampire Hunter  Feature Film Mystery & Suspense      88
## 19             Alice in Wonderland  Feature Film Mystery & Suspense     124
## 33                Arthur Christmas  Feature Film             Horror     100
## 34    Asterix at the Olympic Games  Feature Film             Horror      97
## 45                Basic Instinct 2  Feature Film Mystery & Suspense     121
## 48                  Batman Forever  Feature Film             Horror     108
## 57                         Ben-Hur  Feature Film Mystery & Suspense     122
## 61                        Big Fish  Feature Film             Horror      91
## 71      Captain America: Civil War  Feature Film             Horror      84
##    mpaa_rating imdb_rating imdb_num_votes  critics_rating critics_score
## 3        PG-13         6.3          54771          Rotten            40
## 13          PG         6.8          16366           Fresh            77
## 19           R         7.1         259822           Fresh            69
## 33           R         4.5          16824          Rotten            10
## 34           R         7.1          25264          Rotten            69
## 45           R         6.7          58907           Fresh            67
## 48           R         3.0           9216          Rotten            45
## 57           R         2.1           9904           Fresh            29
## 61           R         6.9          19539          Rotten            59
## 71           R         5.6          19285 Certified Fresh            77
##    audience_rating audience_score best_pic_nom Facebook_Likes budget..Millions.
## 3          Spilled             49           no          35296              16.0
## 13         Upright             73           no         211234             150.0
## 19         Upright             70           no          79957              58.0
## 33         Spilled             31           no           2707              20.0
## 34         Spilled             78           no         224355             145.0
## 45         Upright             70           no         101899             137.0
## 48         Upright             35           no          26057               1.0
## 57         Upright             20           no            379               2.8
## 61         Upright             70           no          16138              37.0
## 71         Spilled             43           no         264798              81.0
##    revenue..Millions. language country New_Genre runtimeGrp budget_log
## 3               31.56  English     USA  Thriller   [90,120)   2.772589
## 13             341.13  English     USA  Thriller    [60,90)   5.010635
## 19             150.17  English     USA  Thriller  [120,267)   4.060443
## 33              32.01  English      UK  Thriller   [90,120)   2.995732
## 34             609.12   French  France  Thriller   [90,120)   4.976734
## 45              85.13  English      UK  Thriller  [120,267)   4.919981
## 48              17.51  English     USA  Thriller   [90,120)   0.000000
## 57               1.83  English     USA  Thriller  [120,267)   1.029619
## 61             165.34  English     USA  Thriller   [90,120)   3.610918
## 71             403.80  English     USA  Thriller    [60,90)   4.394449
##    revenue_log Revenue_Ratio
## 3     3.451890     1.9725000
## 13    5.832264     2.2742000
## 19    5.011768     2.5891379
## 33    3.466048     1.6005000
## 34    6.412015     4.2008276
## 45    4.444179     0.6213869
## 48    2.862772    17.5100000
## 57    0.604316     0.6535714
## 61    5.108004     4.4686486
## 71    6.000920     4.9851852

Correlation Analysis

correlation_matrix <- cor(numeric_data, use = "complete.obs")
revenue_correlation <- correlation_matrix["revenue..Millions.", ]
sorted_correlation <- sort(revenue_correlation, decreasing = TRUE)
print("Top correlated variables with Revenue Across Genre Thriller:")
## [1] "Top correlated variables with Revenue Across Genre Thriller:"
print(sorted_correlation)
## revenue..Millions.  budget..Millions.        revenue_log     Facebook_Likes 
##          1.0000000          0.7148536          0.7094601          0.6194620 
##     audience_score        imdb_rating         budget_log      Revenue_Ratio 
##          0.4825824          0.4743989          0.4100942          0.3841060 
##            runtime      critics_score     imdb_num_votes 
##          0.3536316          0.3447954          0.1652981

Linear Regression

lm_model <- lm(revenue..Millions. ~ imdb_rating + critics_score + audience_score + budget..Millions. + Facebook_Likes, data = subadventure)

Variable Importance with Random Forest

Prepare data for random forest

rf_data <- subadventure %>%
  select(revenue..Millions., imdb_rating, critics_score, audience_score, budget..Millions., Facebook_Likes) %>%
  na.omit()

Fit random forest model

rf_model <- randomForest(revenue..Millions. ~ ., data = rf_data, importance = TRUE)

Get variable importance

importance_scores <- importance(rf_model)
importance_scores <- importance_scores[order(-importance_scores[, 1]), ]
print("Variable Importance from Random Forest:")
## [1] "Variable Importance from Random Forest:"
print(importance_scores)
##                     %IncMSE IncNodePurity
## Facebook_Likes    15.033957      461158.3
## budget..Millions. 14.011969      414605.7
## audience_score     7.754531      258653.1
## imdb_rating        7.661333      222085.4
## critics_score      1.862758      233495.1

Visualization 1: Facebook_Likes across Thriller

ggplot(subthriller, aes(x = New_Genre, y = Facebook_Likes)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  theme_minimal() +
  labs(title = "Distribution of Facebook Likes Across Thriller",
     x = "Thriller", y = "Facebook Likes") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Visualization 2: Budget across Thriller

ggplot(subthriller, aes(x = New_Genre, y = budget..Millions.)) +
  geom_boxplot(fill = "lightblue", outlier.color = "red") +
  theme_minimal() +
  labs(title = paste("Distribution of Budget (Millions) Across Thriller"),
       x = "Thriller", y = "budget..Millions.") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Build the multiple linear regression model

model <- lm(revenue ~ critics_score + facebook_likes + budget, data = analysis_data)

Summary of the model

summary(model)
## 
## Call:
## lm(formula = revenue ~ critics_score + facebook_likes + budget, 
##     data = analysis_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -304.01  -52.48   -7.94   27.93 1499.63 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -7.435e+01  1.280e+01  -5.809 9.88e-09 ***
## critics_score   1.304e+00  2.278e-01   5.722 1.62e-08 ***
## facebook_likes  9.739e-04  6.144e-05  15.851  < 2e-16 ***
## budget          1.881e+00  1.268e-01  14.831  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 125.5 on 640 degrees of freedom
## Multiple R-squared:  0.6388, Adjusted R-squared:  0.6371 
## F-statistic: 377.3 on 3 and 640 DF,  p-value: < 2.2e-16

Create a data frame with the input values

input_data_a <- data.frame(
  critics_score = 55,
  facebook_likes = 1250,
  budget = mean(Movie_unique$budget..Millions., na.rm = TRUE))

Predict revenue

predicted_revenue_a <- predict(model, newdata = input_data_a)
cat("Predicted revenue (a):", predicted_revenue_a, "million dollars\n")
## Predicted revenue (a): 76.23573 million dollars

Campaign increases Facebook likes by 10,000% and budget to $20 million

new_facebook_likes <- 1250 * (1 + 10000 / 100) # Increase by 10,000%
new_budget <- 20

Create a data frame with the updated values

input_data_b <- data.frame(
  critics_score = 55,
  facebook_likes = new_facebook_likes,
  budget = new_budget
)

Predict revenue after campaign

predicted_revenue_b <- predict(model, newdata = input_data_b)
cat("Predicted revenue after campaign (b):", predicted_revenue_b, "million dollars\n")
## Predicted revenue after campaign (b): 157.9315 million dollars

Calculate the improvement in revenue

improvement <- predicted_revenue_b - predicted_revenue_a
cat("Revenue improvement due to campaign:", improvement, "million dollars\n")
## Revenue improvement due to campaign: 81.69575 million dollars