IMDB
#the distribution
describe2(d_feature$averageRating)
quantile(d_feature$averageRating, probs = c(.01, seq(.05, .95, by =.05), .96, .97, .98, .99))
## 1% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70% 75% 80% 85% 90% 95%
## 2.8 4.0 4.7 5.1 5.4 5.6 5.8 6.0 6.2 6.3 6.4 6.6 6.7 6.8 6.9 7.1 7.2 7.4 7.6 7.9
## 96% 97% 98% 99%
## 8.0 8.1 8.2 8.4
GG_denhist(d_feature$averageRating) +
scale_x_continuous("Average rating") +
ggtitle("IMDB movies",
subtitle = str_glue("Feature films, non-pornography, 1000+ votes, n = {nrow(d_feature)}"))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/IMDB ratings dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
genre_dummies %>%
as.data.frame() %>%
pivot_longer(cols = everything()) %>%
filter(
value == T
) %>%
pull(name) %>%
table2(include_NA = F) %>%
mutate(
genre = fct_reorder(Group, Count)
) %>%
ggplot(aes(genre, Count)) +
geom_bar(stat = "identity") +
coord_flip()

GG_save("figs/genre counts.png")
#movies correlates of average rating
movie_cors = d_feature %>%
select(averageRating,
year,
runtime,
Adventure:News
) %>%
map_df(as.numeric) %>%
psych::mixedCor()
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was
## done
movie_cors$rho %>%
GG_heatmap(font_size = 2)

movie_cors_df = tibble(
name = names(movie_cors$rho[-1, 1]),
value = movie_cors$rho[-1, 1]
)
movie_cors_df %>%
mutate(
name = fct_reorder(name, value)
) %>%
ggplot(aes(name, value)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_y_continuous("Correlation with average rating") +
scale_x_discrete("Predictor") +
ggtitle("Predictors of average IMDB movie rating",
"Individual effects (singular regression)")

GG_save("figs/single effects.png")
#ols
d_feature_z = d_feature %>% df_standardize()
## Skipped tconst because it is a character (string)
## Skipped titleType because it is a character (string)
## Skipped primaryTitle because it is a character (string)
## Skipped originalTitle because it is a character (string)
## Skipped isAdult because it is ranged from 0 to 1 (a proportion, maybe)
## Skipped endYear because it is a character (string)
## Skipped genres because it is a character (string)
## Skipped averageRating_quantile because it is ranged from 0 to 1 (a proportion, maybe)
## Skipped year2024 because it is a logical (boolean)
## Skipped year2023 because it is a logical (boolean)
## Skipped year2022 because it is a logical (boolean)
## Skipped year2021 because it is a logical (boolean)
## Skipped year2020 because it is a logical (boolean)
## Skipped Adventure because it is class factor
## Skipped Drama because it is class factor
## Skipped Fantasy because it is class factor
## Skipped Crime because it is class factor
## Skipped Mystery because it is class factor
## Skipped Horror because it is class factor
## Skipped History because it is class factor
## Skipped Action because it is class factor
## Skipped Romance because it is class factor
## Skipped Comedy because it is class factor
## Skipped War because it is class factor
## Skipped Biography because it is class factor
## Skipped Sci_Fi because it is class factor
## Skipped Western because it is class factor
## Skipped Family because it is class factor
## Skipped Thriller because it is class factor
## Skipped Sport because it is class factor
## Skipped Documentary because it is class factor
## Skipped Animation because it is class factor
## Skipped Musical because it is class factor
## Skipped Music because it is class factor
## Skipped Film_Noir because it is class factor
## Skipped News because it is class factor
ols_fit = lm(as.formula(build_formula("averageRating", preds = movie_cors_df$name)), data = d_feature_z)
ols_fit %>% summary()
##
## Call:
## lm(formula = as.formula(build_formula("averageRating", preds = movie_cors_df$name)),
## data = d_feature_z)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.626 -0.422 0.092 0.533 3.556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.10680 0.01487 -7.18 6.9e-13 ***
## year -0.23622 0.00442 -53.44 < 2e-16 ***
## runtime 0.25413 0.00439 57.94 < 2e-16 ***
## AdventureTRUE -0.10573 0.01489 -7.10 1.3e-12 ***
## DramaTRUE 0.35878 0.01067 33.63 < 2e-16 ***
## FantasyTRUE -0.06366 0.01844 -3.45 0.00056 ***
## CrimeTRUE 0.04536 0.01200 3.78 0.00016 ***
## MysteryTRUE 0.04745 0.01544 3.07 0.00212 **
## HorrorTRUE -0.60745 0.01437 -42.28 < 2e-16 ***
## HistoryTRUE 0.01431 0.02226 0.64 0.52040
## ActionTRUE -0.26175 0.01206 -21.70 < 2e-16 ***
## RomanceTRUE -0.05598 0.01191 -4.70 2.6e-06 ***
## ComedyTRUE -0.01604 0.01085 -1.48 0.13911
## WarTRUE 0.01762 0.02620 0.67 0.50116
## BiographyTRUE 0.15142 0.01927 7.86 4.0e-15 ***
## Sci_FiTRUE -0.28631 0.01933 -14.81 < 2e-16 ***
## WesternTRUE -0.15248 0.03466 -4.40 1.1e-05 ***
## FamilyTRUE -0.09810 0.02095 -4.68 2.9e-06 ***
## ThrillerTRUE -0.06954 0.01299 -5.35 8.6e-08 ***
## SportTRUE -0.02625 0.03001 -0.87 0.38168
## DocumentaryTRUE 1.06715 0.02225 47.96 < 2e-16 ***
## AnimationTRUE 0.79176 0.02482 31.90 < 2e-16 ***
## MusicalTRUE -0.14850 0.03925 -3.78 0.00015 ***
## MusicTRUE 0.02505 0.02444 1.02 0.30538
## Film_NoirTRUE -0.00826 0.04307 -0.19 0.84795
## NewsTRUE 0.36200 0.15266 2.37 0.01773 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.83 on 42136 degrees of freedom
## (73 observations deleted due to missingness)
## Multiple R-squared: 0.311, Adjusted R-squared: 0.311
## F-statistic: 760 on 25 and 42136 DF, p-value: <2e-16
ols_fit %>%
summ() %>%
{
# browser()
# rownames(.$coeftable) = rownames(.$coeftable) %>% str_replace("TRUE", "")
names(.$model$coefficients) = names(.$model$coefficients) %>% str_replace("TRUE", "")
# .$model$coefficients = .$model$coefficients %>% sort(decreasing = T)
# .$coeftable = .$coeftable %>% as.data.frame() %>% arrange(-Est.) %>% as.matrix()
.
} %>%
plot_coefs() +
ggtitle("Multiple regression predictors of average IMDB rating")
## Loading required namespace: broom.mixed

GG_save("figs/ols linear.png")
#effect of year of release
full_model = as.formula(str_glue("averageRating ~ rcs(year) + rcs(runtime) + {str_c(names(genre_dummies), collapse = ' + ')}"))
year_dummies_model = as.formula(str_glue("averageRating ~ year2020 + year2021 + year2022 + year2023 + year2024 + year + rcs(runtime) + {str_c(names(genre_dummies), collapse = ' + ')}"))
ols(full_model, data = d_feature) %>%
ggpredict(terms = "year") %>%
plot() +
theme_bw()

GG_save("figs/full ols year effect.png")
ols(full_model, data = d_feature) %>%
ggpredict(terms = "runtime") %>%
plot()

#year dummies?
(dummies_model_fit = ols(year_dummies_model, data = d_feature))
## Frequencies of Missing Values Due to Each Variable
## averageRating year2020 year2021 year2022 year2023
## 0 0 0 0 0
## year2024 year runtime Adventure Drama
## 0 0 73 0 0
## Fantasy Crime Mystery Horror History
## 0 0 0 0 0
## Action Romance Comedy War Biography
## 0 0 0 0 0
## Sci_Fi Western Family Thriller Sport
## 0 0 0 0 0
## Documentary Animation Musical Music Film_Noir
## 0 0 0 0 0
## News
## 0
##
## Linear Regression Model
##
## ols(formula = year_dummies_model, data = d_feature)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 42162 LR chi2 16949.75 R2 0.331
## sigma0.9617 d.f. 33 R2 adj 0.331
## d.f. 42128 Pr(> chi2) 0.0000 g 0.764
##
## Residuals
##
## Min 1Q Median 3Q Max
## -5.9107 -0.4985 0.1013 0.6255 3.9826
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 31.9184 0.4856 65.73 <0.0001
## year2020 -0.0913 0.0286 -3.19 0.0014
## year2021 0.0003 0.0275 0.01 0.9918
## year2022 0.0317 0.0263 1.21 0.2275
## year2023 0.2374 0.0295 8.04 <0.0001
## year2024 0.6759 0.0986 6.85 <0.0001
## year -0.0134 0.0002 -54.01 <0.0001
## runtime 0.0063 0.0013 4.87 <0.0001
## runtime' 0.1547 0.0175 8.82 <0.0001
## runtime'' -0.4391 0.0741 -5.92 <0.0001
## runtime''' 0.2312 0.0801 2.89 0.0039
## Adventure=TRUE -0.1411 0.0173 -8.17 <0.0001
## Drama=TRUE 0.3955 0.0124 31.84 <0.0001
## Fantasy=TRUE -0.0858 0.0214 -4.01 <0.0001
## Crime=TRUE 0.0446 0.0139 3.20 0.0014
## Mystery=TRUE 0.0424 0.0179 2.37 0.0179
## Horror=TRUE -0.6474 0.0168 -38.55 <0.0001
## History=TRUE 0.0121 0.0258 0.47 0.6398
## Action=TRUE -0.3148 0.0140 -22.49 <0.0001
## Romance=TRUE -0.0811 0.0138 -5.87 <0.0001
## Comedy=TRUE -0.0037 0.0126 -0.30 0.7671
## War=TRUE 0.0056 0.0304 0.18 0.8549
## Biography=TRUE 0.1294 0.0224 5.78 <0.0001
## Sci_Fi=TRUE -0.3314 0.0224 -14.78 <0.0001
## Western=TRUE -0.1947 0.0402 -4.84 <0.0001
## Family=TRUE -0.1013 0.0243 -4.17 <0.0001
## Thriller=TRUE -0.0879 0.0151 -5.83 <0.0001
## Sport=TRUE -0.0541 0.0348 -1.55 0.1202
## Documentary=TRUE 1.3354 0.0260 51.35 <0.0001
## Animation=TRUE 0.9861 0.0291 33.94 <0.0001
## Musical=TRUE -0.1654 0.0455 -3.63 0.0003
## Music=TRUE 0.0074 0.0283 0.26 0.7934
## Film_Noir=TRUE 0.0265 0.0500 0.53 0.5961
## News=TRUE 0.4164 0.1770 2.35 0.0186
#adjusted ratings
d_feature %<>% mutate(
averageRating_adj = averageRating - case_when(
d_feature$year == 2024 ~ dummies_model_fit$coefficients["year2024"],
d_feature$year == 2023 ~ dummies_model_fit$coefficients["year2023"],
.default = 0
)
)
#ecdf without recency bias
ratings_ecdf_nobias = d_feature %>% filter(year <= 2022) %>% pull(averageRating) %>% ecdf()
#add adjusted centiles
d_feature %<>% mutate(
averageRating_adj_quantile = ratings_ecdf_nobias(averageRating_adj)
)
#Dune 2
d_feature %>%
filter(str_detect(primaryTitle, "Dune")) %>%
select(
primaryTitle, year, runtime, averageRating, averageRating_adj, averageRating_quantile, averageRating_adj_quantile
) %>%
arrange(year)
Emil data
Plots
#over time
ggplot(emil_ratings, aes(year, Your_Rating)) +
geom_point() +
geom_smooth() +
scale_y_continuous("Emil's rating", limits = c(1, 10), breaks = 1:10) +
scale_x_continuous("Year of release")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

GG_save("figs/emil_release_rating.png")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Removed 1 rows containing missing values (`geom_point()`).
#Emil vs. mean
GG_scatter(emil_ratings, "IMDb_Rating", "Your_Rating", case_names = "Title") +
scale_y_continuous("Emil's rating", limits = c(1, 10), breaks = 1:10) +
scale_x_continuous("Average IMDB rating", breaks = 1:10, limits = c(1, 10)) +
geom_abline(intercept = 0, slope = 1, linetype = "dotted")
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/emil_IMDB.png")
## `geom_smooth()` using formula = 'y ~ x'
Models
#merge emil Rating with main set
d_feature = left_join(
d_feature,
emil_ratings %>% select(Const, Your_Rating),
by = c("tconst" = "Const")
)
#predicted rating from linear model, minus bias
d_feature$predicted_rating = predict(dummies_model_fit) - case_when(
d_feature$year == 2024 ~ dummies_model_fit$coefficients["year2024"],
d_feature$year == 2023 ~ dummies_model_fit$coefficients["year2023"],
.default = 0
)
#predict Rating for Emil
emil_model = ols(Your_Rating ~ rcs(year, 3) + rcs(runtime, 3) + Adventure + Drama +
Fantasy + Crime + Mystery + Horror + History + Action + Romance +
Comedy + War + Biography + Sci_Fi + Western + Family + Thriller +
Sport + Documentary + Animation + Musical + Music + Film_Noir + averageRating, data = d_feature)
emil_model
## Frequencies of Missing Values Due to Each Variable
## Your_Rating year runtime Adventure Drama
## 41406 0 73 0 0
## Fantasy Crime Mystery Horror History
## 0 0 0 0 0
## Action Romance Comedy War Biography
## 0 0 0 0 0
## Sci_Fi Western Family Thriller Sport
## 0 0 0 0 0
## Documentary Animation Musical Music Film_Noir
## 0 0 0 0 0
## averageRating
## 0
##
## Linear Regression Model
##
## ols(formula = Your_Rating ~ rcs(year, 3) + rcs(runtime, 3) +
## Adventure + Drama + Fantasy + Crime + Mystery + Horror +
## History + Action + Romance + Comedy + War + Biography + Sci_Fi +
## Western + Family + Thriller + Sport + Documentary + Animation +
## Musical + Music + Film_Noir + averageRating, data = d_feature)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 829 LR chi2 475.73 R2 0.437
## sigma1.0080 d.f. 27 R2 adj 0.418
## d.f. 801 Pr(> chi2) 0.0000 g 0.886
##
## Residuals
##
## Min 1Q Median 3Q Max
## -3.73378 -0.67283 -0.09337 0.56933 4.38535
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -27.6864 8.2522 -3.36 0.0008
## year 0.0153 0.0041 3.69 0.0002
## year' -0.0230 0.0048 -4.83 <0.0001
## runtime -0.0105 0.0073 -1.44 0.1504
## runtime' 0.0089 0.0086 1.04 0.3004
## Adventure=TRUE 0.0528 0.1101 0.48 0.6316
## Drama=TRUE 0.3471 0.1083 3.21 0.0014
## Fantasy=TRUE -0.0976 0.1465 -0.67 0.5055
## Crime=TRUE 0.0567 0.1044 0.54 0.5870
## Mystery=TRUE 0.3222 0.1217 2.65 0.0083
## Horror=TRUE -0.3742 0.1930 -1.94 0.0529
## History=TRUE -0.3872 0.2276 -1.70 0.0893
## Action=TRUE -0.1783 0.1022 -1.74 0.0814
## Romance=TRUE 0.4456 0.1228 3.63 0.0003
## Comedy=TRUE 0.0580 0.1188 0.49 0.6257
## War=TRUE -0.1254 0.1949 -0.64 0.5202
## Biography=TRUE -0.0920 0.1538 -0.60 0.5500
## Sci_Fi=TRUE -0.0034 0.1265 -0.03 0.9789
## Western=TRUE 0.3367 0.2812 1.20 0.2315
## Family=TRUE -0.1024 0.1966 -0.52 0.6028
## Thriller=TRUE 0.0216 0.1158 0.19 0.8519
## Sport=TRUE 0.0408 0.3740 0.11 0.9131
## Documentary=TRUE 1.4237 0.5352 2.66 0.0080
## Animation=TRUE -0.1932 0.1735 -1.11 0.2656
## Musical=TRUE 0.2116 0.5987 0.35 0.7239
## Music=TRUE -0.1727 0.3026 -0.57 0.5683
## Film_Noir=TRUE -0.1303 0.3733 -0.35 0.7271
## averageRating 0.7632 0.0448 17.04 <0.0001
d_feature$predicted_rating_Emil = predict(emil_model, newdata = as.data.frame(d_feature))
#highest predicted movies based on my tastes that I haven't seen
d_feature %>%
filter(is.na(Your_Rating), Documentary == F) %>%
arrange(-predicted_rating_Emil) %>%
select(
primaryTitle, year, runtime, averageRating, averageRating_adj, predicted_rating_Emil
) %>%
print(n=20)
## # A tibble: 39,263 × 6
## primaryTitle year runtime averageRating averageRating_adj
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 The Chaos Class 1975 87 9.2 9.2
## 2 The Elusive Summer of '68 1984 91 8.5 8.5
## 3 Yesterday 1988 84 8.9 8.9
## 4 The Girl with the Red Scarf 1977 90 8.5 8.5
## 5 Godfather 1991 150 8.6 8.6
## 6 Poove Unakkaga 1996 150 8.6 8.6
## 7 Thoovanathumbikal 1987 151 8.6 8.6
## 8 Pesvebi 1987 87 8.8 8.8
## 9 Who's Singin' Over There? 1980 86 8.7 8.7
## 10 Moondram Pirai 1982 143 8.6 8.6
## 11 Love Today 1997 151 8.6 8.6
## 12 The Marathon Family 1982 92 8.8 8.8
## 13 Srabon Megher Din 1999 150 8.6 8.6
## 14 Balkan Spy 1984 95 8.8 8.8
## 15 Sandesham 1991 138 9 9
## 16 Kibar Feyzo 1978 83 8.7 8.7
## 17 Sankarabharanam 1980 143 8.8 8.8
## 18 Thevar Magan 1992 145 8.7 8.7
## 19 The King of the Street Cleaners 1977 80 8.5 8.5
## 20 The Phantom of the Opera at th… 2011 137 8.8 8.8
## # ℹ 39,243 more rows
## # ℹ 1 more variable: predicted_rating_Emil <dbl>
GG_scatter(d_feature, "predicted_rating_Emil", "Your_Rating", case_names = "primaryTitle")
## `geom_smooth()` using formula = 'y ~ x'
