library(dplyr)
library(readr)
df <- read_csv("C:/Users/Anhuynh/Desktop/Data Science Project/IMDB_top_movies/imdb_top_1000.csv")
##Find missing values
colSums(is.na(df))
## Poster_Link Series_Title Released_Year Certificate Runtime
## 0 0 0 101 0
## Genre IMDB_Rating Overview Meta_score Director
## 0 0 0 157 0
## Star1 Star2 Star3 Star4 No_of_Votes
## 0 0 0 0 0
## Gross
## 169
# Calculate percentage of missing values in data set for choosing target variances
Certificate <- paste(101/ 1000 * 100, "%")
Meta_score <- paste(157/ 1000 * 100, "%")
Gross <- paste(169/ 1000 * 100, "%")
NA_data <- rbind(Certificate, Meta_score, Gross )
colnames(NA_data) <- c("Missing_Value_Percentage")
NA_data
## Missing_Value_Percentage
## Certificate "10.1 %"
## Meta_score "15.7 %"
## Gross "16.9 %"
# replace NAs = 0
df[["Meta_score"]][is.na(df[["Meta_score"]])] <- 0
df[["Gross"]][is.na(df[["Gross"]])] <- 0
colSums(is.na(df))
## Poster_Link Series_Title Released_Year Certificate Runtime
## 0 0 0 101 0
## Genre IMDB_Rating Overview Meta_score Director
## 0 0 0 0 0
## Star1 Star2 Star3 Star4 No_of_Votes
## 0 0 0 0 0
## Gross
## 0
# Split numbers and characters
df$Runtime = as.numeric(gsub(".*?([0-9]+).*", "\\1", df$Runtime))
#Correlation Matrix
library(corrplot)
library(Hmisc)
## Mark the insignificant coefficients according to the specified p-value significance level
cor_9 <- rcorr(as.matrix(df[-c(1,2,3,4,6,8,10,11,12,13,14)]))
df_cor <- cor_9$r
p_mat <- cor_9$P
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(df_cor, method = "color", col = col(200),
type = "upper", order = "hclust",
addCoef.col = "black", # Add coefficient of correlation
tl.col = "darkblue", tl.srt = 45, #Text label color and rotation
# Combine with significance level
p.mat = p_mat, sig.level = 0.01,
# hide correlation coefficient on the principal diagonal
diag = FALSE
)
library(reshape2)
# create new dataset, convert column values into rows
df2<- melt(df[-c(1, 4, 8)], na.rm = FALSE, value.name = "Star", id = c("Series_Title","Released_Year","Runtime","Genre","IMDB_Rating","Meta_score","Director","No_of_Votes", "Gross"))
library(car)
## Testing Hypotheses
# Run linear regression (03 regressors/predtictors: No_of_Votes, IMDB_Rating and Meta_score)
gross_model1 <- lm(Gross ~ No_of_Votes+IMDB_Rating + Meta_score, df2)
data.frame(vif(gross_model1))
## vif.gross_model1.
## No_of_Votes 1.471678
## IMDB_Rating 1.380027
## Meta_score 1.113203
gross_model2 <- lm(Gross ~ No_of_Votes + IMDB_Rating, df2)
data.frame(vif(gross_model2))
## vif.gross_model2.
## No_of_Votes 1.32451
## IMDB_Rating 1.32451
gross_model3 <- lm(Gross ~ No_of_Votes + Meta_score, df2)
data.frame(vif(gross_model3))
## vif.gross_model3.
## No_of_Votes 1.06842
## Meta_score 1.06842
Observation: In the model of 03 regressors/predictors, correlated predictors are No_of_Votes (1st), and IMDB_Rating (2nd) and Meta_score (3rd).
We considered the interaction between IMDB_rating and Meta_score with No_of_Votes to see how that affects changes in revenue (Gross).
fit1 <- lm(Gross ~ No_of_Votes + IMDB_Rating -1, df2) # model 01
fit2 <- lm(Gross ~ No_of_Votes + Meta_score -1, df2) # model 02
summary(fit1)$coef
## Estimate Std. Error t value Pr(>|t|)
## No_of_Votes 191.7181 4.04094 47.443943 0.00000000
## IMDB_Rating 412607.4632 216719.10690 1.903881 0.05699737
summary(fit2)$coef
## Estimate Std. Error t value Pr(>|t|)
## No_of_Votes 185.8383 4.078405 45.566423 0.000000e+00
## Meta_score 96787.1253 24002.029687 4.032456 5.622635e-05
Observation:
Model 01: For one unit change in IMDB_Rating, the expected change in revenue increases $412,607. Under this model, for one unit change in No_of_votes, the expected change in revenue increases $192.
Model 02: For one unit change in Meta_score, the expected change in revenue increases $96,787. Under this model, for one unit change in No_of_votes, the expected change in revenue increases $186.
# Test residuals for normality
## model 01
shapiro.test(fit1$residuals)
##
## Shapiro-Wilk normality test
##
## data: fit1$residuals
## W = 0.70592, p-value < 2.2e-16
## model 02
shapiro.test(fit2$residuals)
##
## Shapiro-Wilk normality test
##
## data: fit2$residuals
## W = 0.70534, p-value < 2.2e-16
Observation: p-value < 0.05 (statistical significance level), sample size (dataset) is not following normal distribution.
# create subset of data with revenue >$500mio
df2$rev_rank <- ifelse(df2$Gross >500000000, 1 ,0)
df2$rev_rank <- as.factor(df2$rev_rank)
head(df2)
## Series_Title Released_Year Runtime
## 1 The Shawshank Redemption 1994 142
## 2 The Godfather 1972 175
## 3 The Dark Knight 2008 152
## 4 The Godfather: Part II 1974 202
## 5 12 Angry Men 1957 96
## 6 The Lord of the Rings: The Return of the King 2003 201
## Genre IMDB_Rating Meta_score Director
## 1 Drama 9.3 80 Frank Darabont
## 2 Crime, Drama 9.2 100 Francis Ford Coppola
## 3 Action, Crime, Drama 9.0 84 Christopher Nolan
## 4 Crime, Drama 9.0 90 Francis Ford Coppola
## 5 Crime, Drama 9.0 96 Sidney Lumet
## 6 Action, Adventure, Drama 8.9 94 Peter Jackson
## No_of_Votes Gross variable Star rev_rank
## 1 2343110 28341469 Star1 Tim Robbins 0
## 2 1620367 134966411 Star1 Marlon Brando 0
## 3 2303232 534858444 Star1 Christian Bale 1
## 4 1129952 57300000 Star1 Al Pacino 0
## 5 689845 4360000 Star1 Henry Fonda 0
## 6 1642758 377845905 Star1 Elijah Wood 0
# Use Kruskal-Wallis rank sum test (non-parametric test) to quantify the significance of regressors
kruskal.test(No_of_Votes + IMDB_Rating + Meta_score ~rev_rank, data = df2)
##
## Kruskal-Wallis rank sum test
##
## data: No_of_Votes + IMDB_Rating + Meta_score by rev_rank
## Kruskal-Wallis chi-squared = 73.957, df = 1, p-value < 2.2e-16
kruskal.test(No_of_Votes ~rev_rank, data = df2)
##
## Kruskal-Wallis rank sum test
##
## data: No_of_Votes by rev_rank
## Kruskal-Wallis chi-squared = 73.957, df = 1, p-value < 2.2e-16
kruskal.test(No_of_Votes + IMDB_Rating ~rev_rank, data = df2)
##
## Kruskal-Wallis rank sum test
##
## data: No_of_Votes + IMDB_Rating by rev_rank
## Kruskal-Wallis chi-squared = 73.957, df = 1, p-value < 2.2e-16
kruskal.test(No_of_Votes + Meta_score ~rev_rank, data = df2)
##
## Kruskal-Wallis rank sum test
##
## data: No_of_Votes + Meta_score by rev_rank
## Kruskal-Wallis chi-squared = 73.957, df = 1, p-value < 2.2e-16
kruskal.test(IMDB_Rating ~rev_rank, data = df2)
##
## Kruskal-Wallis rank sum test
##
## data: IMDB_Rating by rev_rank
## Kruskal-Wallis chi-squared = 1.8324, df = 1, p-value = 0.1758
kruskal.test(Meta_score ~rev_rank, data = df2)
##
## Kruskal-Wallis rank sum test
##
## data: Meta_score by rev_rank
## Kruskal-Wallis chi-squared = 0.033298, df = 1, p-value = 0.8552
Observation: No_of_Votes is the most important regressor to changes of Revenue (Gross).
sub_df <- df2[which(df2$IMDB_Rating > 8 & df2$Meta_score > 80), ]
Drama <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Drama", sub_df$Genre, fixed = TRUE)]))
Drama$Category <- "Drama"
Action <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Action", sub_df$Genre, fixed = TRUE)]))
Action$Category <- "Action"
Adventure <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Adventure", sub_df$Genre, fixed = TRUE)]))
Adventure$Category <- "Adventure"
Fantasy <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Fantasy", sub_df$Genre, fixed = TRUE)]))
Fantasy$Category <- "Fantasy"
Biography <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Biography", sub_df$Genre, fixed = TRUE)]))
Biography$Category <- "Biography"
Crime <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Crime", sub_df$Genre, fixed = TRUE)]))
Crime$Category <- "Crime"
War <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("War", sub_df$Genre, fixed = TRUE)]))
War$Category <- "War"
Comedy <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Comedy", sub_df$Genre, fixed = TRUE)]))
Comedy$Category <- "Comedy"
Music <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Music", sub_df$Genre, fixed = TRUE)]))
Music$Category <- "Music"
Musical <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Musical", sub_df$Genre, fixed = TRUE)]))
Musical$Category <- "Music"
Mystery <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Mystery", sub_df$Genre, fixed = TRUE)]))
Mystery$Category <- "Mystery"
Thriller <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Thriller", sub_df$Genre, fixed = TRUE)]))
Thriller$Category <- "Thriller"
Animation <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Animation", sub_df$Genre, fixed = TRUE)]))
Animation$Category <- "Animation"
Family <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Family", sub_df$Genre, fixed = TRUE)]))
Family$Category <- "Family"
History <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("History", sub_df$Genre, fixed = TRUE)]))
History$Category <- "History"
Sport <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Sport", sub_df$Genre, fixed = TRUE)]))
Sport$Category <- "Sport"
Romance <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Romance", sub_df$Genre, fixed = TRUE)]))
Romance$Category <- "Romance"
Film_Noir <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Film-Noir", sub_df$Genre, fixed = TRUE)]))
Film_Noir$Category <- "Film_Noir"
Sci_Fi <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Sci-Fi", sub_df$Genre, fixed = TRUE)]))
Sci_Fi$Category <- "Sci-Fi"
Western <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Western", sub_df$Genre, fixed = TRUE)]))
Western$Category <- "Western"
Horror <- subset(sub_df, sub_df$ Genre %in% (sub_df$Genre[grepl("Horror", sub_df$Genre, fixed = TRUE)]))
Horror$Category <- "Horror"
newdf <- rbind(Drama, Action, Adventure, Fantasy, Biography, Crime, War, Comedy, Musical, Music, Mystery, Thriller, Animation, Family, History, Sport, Romance, Film_Noir, Sci_Fi, Western, Horror)
newdf$Category <- as.factor(newdf$Category)
head(newdf[ ,c("IMDB_Rating", "Meta_score", "Category")])
## IMDB_Rating Meta_score Category
## 2 9.2 100 Drama
## 3 9.0 84 Drama
## 4 9.0 90 Drama
## 5 9.0 96 Drama
## 6 8.9 94 Drama
## 7 8.9 94 Drama
library(tidyverse)
library(rstatix)
stat.test.IMDB <- newdf[ ,c("IMDB_Rating", "Category")] %>%
wilcox_test(IMDB_Rating ~ Category) %>%
add_significance()
stat.test.Meta <- newdf[ ,c("Meta_score", "Category")] %>%
wilcox_test(Meta_score ~ Category) %>%
add_significance()
stat.test.IMDB[which(stat.test.IMDB$p.adj.signif != "ns"), ]
## # A tibble: 18 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 IMDB_Rating Action Adven… 64 148 6320 8.41e-5 1.5 e-2 *
## 2 IMDB_Rating Action Anima… 64 60 2696 8.35e-5 1.5 e-2 *
## 3 IMDB_Rating Action Comedy 64 96 4504 3.81e-7 7.12e-5 ****
## 4 IMDB_Rating Action Drama 64 392 16832 6.29e-6 1 e-3 ***
## 5 IMDB_Rating Action Myste… 64 64 2944 1.44e-5 3 e-3 **
## 6 IMDB_Rating Action Roman… 64 76 3672 9.33e-8 1.77e-5 ****
## 7 IMDB_Rating Action Thril… 64 84 4000 1.64e-7 3.1 e-5 ****
## 8 IMDB_Rating Action War 64 48 2152 2.42e-4 4.2 e-2 *
## 9 IMDB_Rating Animat… Family 60 24 328 7.63e-5 1.4 e-2 *
## 10 IMDB_Rating Comedy Family 96 24 400 4.55e-7 8.46e-5 ****
## 11 IMDB_Rating Drama Family 392 24 2608 1.57e-4 2.8 e-2 *
## 12 IMDB_Rating Family Histo… 24 40 744 1.91e-4 3.3 e-2 *
## 13 IMDB_Rating Family Myste… 24 64 1240 6.51e-6 1 e-3 ***
## 14 IMDB_Rating Family Roman… 24 76 1536 1.74e-7 3.27e-5 ****
## 15 IMDB_Rating Family Sci-Fi 24 36 712 1.68e-5 3 e-3 **
## 16 IMDB_Rating Family Sport 24 8 184 9.36e-5 1.7 e-2 *
## 17 IMDB_Rating Family Thril… 24 84 1624 2.19e-6 4.05e-4 ***
## 18 IMDB_Rating Horror Roman… 8 76 528 2.86e-4 4.9 e-2 *
stat.test.Meta[which(stat.test.Meta$p.adj.signif != "ns"), ]
## # A tibble: 18 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Meta_score Action Comedy 64 96 1856 2.17e-5 4 e-3 **
## 2 Meta_score Action Film_… 64 8 48 1.86e-4 3.2 e-2 *
## 3 Meta_score Action Myste… 64 64 944 1.29e-7 2.45e-5 ****
## 4 Meta_score Action Roman… 64 76 1360 6.93e-6 1 e-3 ***
## 5 Meta_score Action War 64 48 752 3.75e-6 7.05e-4 ***
## 6 Meta_score Adventu… Comedy 148 96 5088 1.75e-4 3 e-2 *
## 7 Meta_score Adventu… Myste… 148 64 2680 4.97e-7 9.39e-5 ****
## 8 Meta_score Adventu… Roman… 148 76 3776 5.5 e-5 1 e-2 **
## 9 Meta_score Adventu… War 148 48 2184 5.91e-5 1.1 e-2 *
## 10 Meta_score Animati… Film_… 60 8 32 7.09e-5 1.2 e-2 *
## 11 Meta_score Animati… Myste… 60 64 1072 2.12e-5 4 e-3 **
## 12 Meta_score Biograp… Myste… 48 64 896 1.56e-4 2.7 e-2 *
## 13 Meta_score Crime Myste… 100 64 2016 6.25e-5 1.1 e-2 *
## 14 Meta_score Drama Myste… 392 64 8232 9.8 e-6 2 e-3 **
## 15 Meta_score Fantasy Myste… 36 64 568 2.5 e-5 5 e-3 **
## 16 Meta_score Fantasy War 36 48 424 6.43e-5 1.1 e-2 *
## 17 Meta_score Mystery Sci-Fi 64 36 1760 1.11e-5 2 e-3 **
## 18 Meta_score Sci-Fi War 36 48 416 4.75e-5 9 e-3 **
eff_size_IMDB <- newdf[ ,c("IMDB_Rating", "Category")] %>%
wilcox_effsize(IMDB_Rating ~ Category)
eff_size_Meta <- newdf[ ,c("Meta_score", "Category")] %>%
wilcox_effsize(Meta_score ~ Category)
#which is max value of effect size
eff_size_IMDB[which(eff_size_IMDB$effsize >= 0.5), ]
## # A tibble: 7 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 IMDB_Rating Family Film_Noir 0.628 24 8 large
## 2 IMDB_Rating Family Romance 0.523 24 76 large
## 3 IMDB_Rating Family Sci-Fi 0.557 24 36 large
## 4 IMDB_Rating Family Sport 0.695 24 8 large
## 5 IMDB_Rating Film_Noir Horror 0.866 8 8 large
## 6 IMDB_Rating Horror Sci-Fi 0.536 8 36 large
## 7 IMDB_Rating Horror Sport 0.866 8 8 large
eff_size_Meta[which(eff_size_Meta$effsize >= 0.5), ]
## # A tibble: 4 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 Meta_score Fantasy Film_Noir 0.517 36 8 large
## 2 Meta_score Film_Noir Sci-Fi 0.523 8 36 large
## 3 Meta_score Film_Noir Sport 0.866 8 8 large
## 4 Meta_score Horror Sport 0.685 8 8 large