Dataset

library(dplyr)
library(readr)

df <- read_csv("C:/Users/Anhuynh/Desktop/Data Science Project/IMDB_top_movies/imdb_top_1000.csv")
##Find missing values
colSums(is.na(df))
##   Poster_Link  Series_Title Released_Year   Certificate       Runtime 
##             0             0             0           101             0 
##         Genre   IMDB_Rating      Overview    Meta_score      Director 
##             0             0             0           157             0 
##         Star1         Star2         Star3         Star4   No_of_Votes 
##             0             0             0             0             0 
##         Gross 
##           169
# Calculate percentage of missing values in data set for choosing target variances

Certificate <- paste(101/ 1000 * 100, "%")
Meta_score <- paste(157/ 1000 * 100, "%")
Gross <- paste(169/ 1000 * 100, "%")

NA_data <- rbind(Certificate, Meta_score, Gross )

colnames(NA_data) <- c("Missing_Value_Percentage")

NA_data
##             Missing_Value_Percentage
## Certificate "10.1 %"                
## Meta_score  "15.7 %"                
## Gross       "16.9 %"
# replace NAs = 0
df[["Meta_score"]][is.na(df[["Meta_score"]])] <- 0
df[["Gross"]][is.na(df[["Gross"]])] <- 0

colSums(is.na(df))
##   Poster_Link  Series_Title Released_Year   Certificate       Runtime 
##             0             0             0           101             0 
##         Genre   IMDB_Rating      Overview    Meta_score      Director 
##             0             0             0             0             0 
##         Star1         Star2         Star3         Star4   No_of_Votes 
##             0             0             0             0             0 
##         Gross 
##             0
# Split numbers and characters
df$Runtime = as.numeric(gsub(".*?([0-9]+).*", "\\1", df$Runtime)) 
#Correlation Matrix
library(corrplot)
library(Hmisc)

## Mark the insignificant coefficients according to the specified p-value significance level
cor_9 <- rcorr(as.matrix(df[-c(1,2,3,4,6,8,10,11,12,13,14)]))
df_cor <- cor_9$r
p_mat <- cor_9$P

col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

corrplot(df_cor, method = "color", col = col(200),  
         type = "upper", order = "hclust", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col = "darkblue", tl.srt = 45, #Text label color and rotation
         # Combine with significance level
         p.mat = p_mat, sig.level = 0.01,  
         # hide correlation coefficient on the principal diagonal
         diag = FALSE 
         )

library(reshape2)
# create new dataset, convert column values into rows
df2<- melt(df[-c(1, 4, 8)], na.rm = FALSE, value.name = "Star", id = c("Series_Title","Released_Year","Runtime","Genre","IMDB_Rating","Meta_score","Director","No_of_Votes", "Gross"))

Q4: A big studio wants to make a box office hit this year. What genre(s) or star(s) would you recommend they use?

Test 1: Quantify correlation of regressors/predictors by estimating coefficient in variance inflation (VIF) analysis.

library(car)
## Testing Hypotheses
# Run linear regression (03 regressors/predtictors: No_of_Votes, IMDB_Rating and Meta_score)
gross_model1 <- lm(Gross ~ No_of_Votes+IMDB_Rating + Meta_score, df2)
data.frame(vif(gross_model1))
##             vif.gross_model1.
## No_of_Votes          1.471678
## IMDB_Rating          1.380027
## Meta_score           1.113203
gross_model2 <- lm(Gross ~ No_of_Votes + IMDB_Rating, df2)
data.frame(vif(gross_model2))
##             vif.gross_model2.
## No_of_Votes           1.32451
## IMDB_Rating           1.32451
gross_model3 <- lm(Gross ~ No_of_Votes + Meta_score, df2)
data.frame(vif(gross_model3))
##             vif.gross_model3.
## No_of_Votes           1.06842
## Meta_score            1.06842

Observation: In the model of 03 regressors/predictors, correlated predictors are No_of_Votes (1st), and IMDB_Rating (2nd) and Meta_score (3rd).

Test 2: Measure affection of regressors towards changes in revenue (“Gross”)

We considered the interaction between IMDB_rating and Meta_score with No_of_Votes to see how that affects changes in revenue (Gross).

fit1 <- lm(Gross ~ No_of_Votes + IMDB_Rating -1, df2) # model 01
fit2 <- lm(Gross ~ No_of_Votes + Meta_score -1, df2) # model 02
summary(fit1)$coef
##                Estimate   Std. Error   t value   Pr(>|t|)
## No_of_Votes    191.7181      4.04094 47.443943 0.00000000
## IMDB_Rating 412607.4632 216719.10690  1.903881 0.05699737
summary(fit2)$coef
##               Estimate   Std. Error   t value     Pr(>|t|)
## No_of_Votes   185.8383     4.078405 45.566423 0.000000e+00
## Meta_score  96787.1253 24002.029687  4.032456 5.622635e-05

Observation:

  • Model 01: For one unit change in IMDB_Rating, the expected change in revenue increases $412,607. Under this model, for one unit change in No_of_votes, the expected change in revenue increases $192.

  • Model 02: For one unit change in Meta_score, the expected change in revenue increases $96,787. Under this model, for one unit change in No_of_votes, the expected change in revenue increases $186.

Test 3: Quantify the significance of regressors

# Test residuals for normality
## model 01
shapiro.test(fit1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  fit1$residuals
## W = 0.70592, p-value < 2.2e-16
## model 02
shapiro.test(fit2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  fit2$residuals
## W = 0.70534, p-value < 2.2e-16

Observation: p-value < 0.05 (statistical significance level), sample size (dataset) is not following normal distribution.

# create subset of data with revenue >$500mio 
df2$rev_rank <- ifelse(df2$Gross >500000000, 1 ,0)
df2$rev_rank <- as.factor(df2$rev_rank)

head(df2)
##                                    Series_Title Released_Year Runtime
## 1                      The Shawshank Redemption          1994     142
## 2                                 The Godfather          1972     175
## 3                               The Dark Knight          2008     152
## 4                        The Godfather: Part II          1974     202
## 5                                  12 Angry Men          1957      96
## 6 The Lord of the Rings: The Return of the King          2003     201
##                      Genre IMDB_Rating Meta_score             Director
## 1                    Drama         9.3         80       Frank Darabont
## 2             Crime, Drama         9.2        100 Francis Ford Coppola
## 3     Action, Crime, Drama         9.0         84    Christopher Nolan
## 4             Crime, Drama         9.0         90 Francis Ford Coppola
## 5             Crime, Drama         9.0         96         Sidney Lumet
## 6 Action, Adventure, Drama         8.9         94        Peter Jackson
##   No_of_Votes     Gross variable           Star rev_rank
## 1     2343110  28341469    Star1    Tim Robbins        0
## 2     1620367 134966411    Star1  Marlon Brando        0
## 3     2303232 534858444    Star1 Christian Bale        1
## 4     1129952  57300000    Star1      Al Pacino        0
## 5      689845   4360000    Star1    Henry Fonda        0
## 6     1642758 377845905    Star1    Elijah Wood        0
# Use Kruskal-Wallis rank sum test (non-parametric test) to quantify the significance of regressors
kruskal.test(No_of_Votes + IMDB_Rating + Meta_score ~rev_rank, data = df2)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  No_of_Votes + IMDB_Rating + Meta_score by rev_rank
## Kruskal-Wallis chi-squared = 73.957, df = 1, p-value < 2.2e-16
kruskal.test(No_of_Votes ~rev_rank, data = df2)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  No_of_Votes by rev_rank
## Kruskal-Wallis chi-squared = 73.957, df = 1, p-value < 2.2e-16
kruskal.test(No_of_Votes + IMDB_Rating  ~rev_rank, data = df2)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  No_of_Votes + IMDB_Rating by rev_rank
## Kruskal-Wallis chi-squared = 73.957, df = 1, p-value < 2.2e-16
kruskal.test(No_of_Votes + Meta_score ~rev_rank, data = df2)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  No_of_Votes + Meta_score by rev_rank
## Kruskal-Wallis chi-squared = 73.957, df = 1, p-value < 2.2e-16
kruskal.test(IMDB_Rating  ~rev_rank, data = df2)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  IMDB_Rating by rev_rank
## Kruskal-Wallis chi-squared = 1.8324, df = 1, p-value = 0.1758
kruskal.test(Meta_score ~rev_rank, data = df2)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Meta_score by rev_rank
## Kruskal-Wallis chi-squared = 0.033298, df = 1, p-value = 0.8552

Observation: No_of_Votes is the most important regressor to changes of Revenue (Gross).

Q5: Are there any major differences in what types of movies score highly on the IMDB rating vs meta score?

sub_df <- df2[which(df2$IMDB_Rating > 8 & df2$Meta_score > 80), ]

Drama <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Drama", sub_df$Genre, fixed = TRUE)]))
Drama$Category <- "Drama"

Action <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Action", sub_df$Genre, fixed = TRUE)]))
Action$Category <- "Action"

Adventure <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Adventure", sub_df$Genre, fixed = TRUE)]))
Adventure$Category <- "Adventure"

Fantasy <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Fantasy", sub_df$Genre, fixed = TRUE)]))
Fantasy$Category <- "Fantasy"

Biography <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Biography", sub_df$Genre, fixed = TRUE)]))
Biography$Category <- "Biography"

Crime <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Crime", sub_df$Genre, fixed = TRUE)]))
Crime$Category <- "Crime"

War <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("War", sub_df$Genre, fixed = TRUE)]))
War$Category <- "War"

Comedy <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Comedy", sub_df$Genre, fixed = TRUE)]))
Comedy$Category <- "Comedy"

Music <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Music", sub_df$Genre, fixed = TRUE)]))
Music$Category <- "Music"

Musical <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Musical", sub_df$Genre, fixed = TRUE)]))
Musical$Category <- "Music"

Mystery <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Mystery", sub_df$Genre, fixed = TRUE)]))
Mystery$Category <- "Mystery"

Thriller <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Thriller", sub_df$Genre, fixed = TRUE)]))
Thriller$Category <- "Thriller"

Animation <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Animation", sub_df$Genre, fixed = TRUE)]))
Animation$Category <- "Animation"

Family <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Family", sub_df$Genre, fixed = TRUE)]))
Family$Category <- "Family"

History <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("History", sub_df$Genre, fixed = TRUE)]))
History$Category <- "History"

Sport <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Sport", sub_df$Genre, fixed = TRUE)]))
Sport$Category <- "Sport"

Romance <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Romance", sub_df$Genre, fixed = TRUE)]))
Romance$Category <- "Romance"

Film_Noir <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Film-Noir", sub_df$Genre, fixed = TRUE)]))
Film_Noir$Category <- "Film_Noir"

Sci_Fi <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Sci-Fi", sub_df$Genre, fixed = TRUE)]))
Sci_Fi$Category <- "Sci-Fi"

Western <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Western", sub_df$Genre, fixed = TRUE)]))
Western$Category <- "Western"

Horror <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Horror", sub_df$Genre, fixed = TRUE)]))
Horror$Category <- "Horror"

newdf <- rbind(Drama, Action, Adventure, Fantasy, Biography, Crime, War, Comedy, Musical, Music, Mystery, Thriller, Animation, Family, History, Sport, Romance, Film_Noir, Sci_Fi, Western, Horror)
newdf$Category <- as.factor(newdf$Category)
head(newdf[ ,c("IMDB_Rating", "Meta_score", "Category")])
##   IMDB_Rating Meta_score Category
## 2         9.2        100    Drama
## 3         9.0         84    Drama
## 4         9.0         90    Drama
## 5         9.0         96    Drama
## 6         8.9         94    Drama
## 7         8.9         94    Drama
library(tidyverse)
library(rstatix)
stat.test.IMDB <- newdf[ ,c("IMDB_Rating", "Category")] %>% 
  wilcox_test(IMDB_Rating ~ Category) %>%
  add_significance()

stat.test.Meta <- newdf[ ,c("Meta_score", "Category")] %>% 
  wilcox_test(Meta_score ~ Category) %>%
  add_significance()

stat.test.IMDB[which(stat.test.IMDB$p.adj.signif != "ns"), ]
## # A tibble: 18 × 9
##    .y.         group1  group2    n1    n2 statistic       p   p.adj p.adj.signif
##    <chr>       <chr>   <chr>  <int> <int>     <dbl>   <dbl>   <dbl> <chr>       
##  1 IMDB_Rating Action  Adven…    64   148      6320 8.41e-5 1.5 e-2 *           
##  2 IMDB_Rating Action  Anima…    64    60      2696 8.35e-5 1.5 e-2 *           
##  3 IMDB_Rating Action  Comedy    64    96      4504 3.81e-7 7.12e-5 ****        
##  4 IMDB_Rating Action  Drama     64   392     16832 6.29e-6 1   e-3 ***         
##  5 IMDB_Rating Action  Myste…    64    64      2944 1.44e-5 3   e-3 **          
##  6 IMDB_Rating Action  Roman…    64    76      3672 9.33e-8 1.77e-5 ****        
##  7 IMDB_Rating Action  Thril…    64    84      4000 1.64e-7 3.1 e-5 ****        
##  8 IMDB_Rating Action  War       64    48      2152 2.42e-4 4.2 e-2 *           
##  9 IMDB_Rating Animat… Family    60    24       328 7.63e-5 1.4 e-2 *           
## 10 IMDB_Rating Comedy  Family    96    24       400 4.55e-7 8.46e-5 ****        
## 11 IMDB_Rating Drama   Family   392    24      2608 1.57e-4 2.8 e-2 *           
## 12 IMDB_Rating Family  Histo…    24    40       744 1.91e-4 3.3 e-2 *           
## 13 IMDB_Rating Family  Myste…    24    64      1240 6.51e-6 1   e-3 ***         
## 14 IMDB_Rating Family  Roman…    24    76      1536 1.74e-7 3.27e-5 ****        
## 15 IMDB_Rating Family  Sci-Fi    24    36       712 1.68e-5 3   e-3 **          
## 16 IMDB_Rating Family  Sport     24     8       184 9.36e-5 1.7 e-2 *           
## 17 IMDB_Rating Family  Thril…    24    84      1624 2.19e-6 4.05e-4 ***         
## 18 IMDB_Rating Horror  Roman…     8    76       528 2.86e-4 4.9 e-2 *
stat.test.Meta[which(stat.test.Meta$p.adj.signif != "ns"), ]
## # A tibble: 18 × 9
##    .y.        group1   group2    n1    n2 statistic       p   p.adj p.adj.signif
##    <chr>      <chr>    <chr>  <int> <int>     <dbl>   <dbl>   <dbl> <chr>       
##  1 Meta_score Action   Comedy    64    96      1856 2.17e-5 4   e-3 **          
##  2 Meta_score Action   Film_…    64     8        48 1.86e-4 3.2 e-2 *           
##  3 Meta_score Action   Myste…    64    64       944 1.29e-7 2.45e-5 ****        
##  4 Meta_score Action   Roman…    64    76      1360 6.93e-6 1   e-3 ***         
##  5 Meta_score Action   War       64    48       752 3.75e-6 7.05e-4 ***         
##  6 Meta_score Adventu… Comedy   148    96      5088 1.75e-4 3   e-2 *           
##  7 Meta_score Adventu… Myste…   148    64      2680 4.97e-7 9.39e-5 ****        
##  8 Meta_score Adventu… Roman…   148    76      3776 5.5 e-5 1   e-2 **          
##  9 Meta_score Adventu… War      148    48      2184 5.91e-5 1.1 e-2 *           
## 10 Meta_score Animati… Film_…    60     8        32 7.09e-5 1.2 e-2 *           
## 11 Meta_score Animati… Myste…    60    64      1072 2.12e-5 4   e-3 **          
## 12 Meta_score Biograp… Myste…    48    64       896 1.56e-4 2.7 e-2 *           
## 13 Meta_score Crime    Myste…   100    64      2016 6.25e-5 1.1 e-2 *           
## 14 Meta_score Drama    Myste…   392    64      8232 9.8 e-6 2   e-3 **          
## 15 Meta_score Fantasy  Myste…    36    64       568 2.5 e-5 5   e-3 **          
## 16 Meta_score Fantasy  War       36    48       424 6.43e-5 1.1 e-2 *           
## 17 Meta_score Mystery  Sci-Fi    64    36      1760 1.11e-5 2   e-3 **          
## 18 Meta_score Sci-Fi   War       36    48       416 4.75e-5 9   e-3 **
eff_size_IMDB <- newdf[ ,c("IMDB_Rating", "Category")]  %>%
  wilcox_effsize(IMDB_Rating ~ Category)
eff_size_Meta <- newdf[ ,c("Meta_score", "Category")]  %>%
  wilcox_effsize(Meta_score ~ Category)

#which is max value of effect size
eff_size_IMDB[which(eff_size_IMDB$effsize >= 0.5), ]
## # A tibble: 7 × 7
##   .y.         group1    group2    effsize    n1    n2 magnitude
##   <chr>       <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 IMDB_Rating Family    Film_Noir   0.628    24     8 large    
## 2 IMDB_Rating Family    Romance     0.523    24    76 large    
## 3 IMDB_Rating Family    Sci-Fi      0.557    24    36 large    
## 4 IMDB_Rating Family    Sport       0.695    24     8 large    
## 5 IMDB_Rating Film_Noir Horror      0.866     8     8 large    
## 6 IMDB_Rating Horror    Sci-Fi      0.536     8    36 large    
## 7 IMDB_Rating Horror    Sport       0.866     8     8 large
eff_size_Meta[which(eff_size_Meta$effsize >= 0.5), ]
## # A tibble: 4 × 7
##   .y.        group1    group2    effsize    n1    n2 magnitude
##   <chr>      <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 Meta_score Fantasy   Film_Noir   0.517    36     8 large    
## 2 Meta_score Film_Noir Sci-Fi      0.523     8    36 large    
## 3 Meta_score Film_Noir Sport       0.866     8     8 large    
## 4 Meta_score Horror    Sport       0.685     8     8 large