Init

options(digits = 3)
library(kirkegaard)

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: Hmisc
## 
## 
## Attaching package: 'Hmisc'
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +

load_packages(
  googlesheets4, 
  rms, 
  lubridate,
  jtools,
  ggeffects
  )

## 
## Attaching package: 'jtools'
## 
## The following object is masked from 'package:kirkegaard':
## 
##     standardize
## 
## The following object is masked from 'package:Hmisc':
## 
##     %nin%

theme_set(theme_bw())

Functions

#split a string with , separation into dummies
split_into_dummies = function(x) {
  genres = x %>% 
    str_split(pattern = ",") %>% 
    unlist() %>% 
    unique() %>% 
    str_subset("\\\\", negate = T)

  dummies_mat = matrix(F, nrow = length(x), ncol = length(genres))
  colnames(dummies_mat) = genres
 
  #loop across and fill in T when detected
  for (i in seq_along(genres)) {
    dummies_mat[, i] = str_detect(x, pattern = genres[i])
  }
  
  dummies_mat %>% 
    as.data.frame() %>% 
    map_df(as.factor)
}

#build formula
build_formula = function(y, preds, rcs = NULL, rcs_knots = 3) {
  #add any rcs()
  if (!is.null(rcs)) {
    preds[preds %in% rcs] = str_glue("rcs({preds[preds %in% rcs]})")
  }
  
  form = str_glue("{y} ~ {str_c(preds, collapse = ' + ')}")
  
  form
}

Load data

Emil data

#Emil
emil_ratings_url = "https://www.imdb.com/user/ur11227647/ratings"

#download
#TODO: need to use something fancy

#use manually expoerted data
# emil_ratings = read_sheet("https://docs.google.com/spreadsheets/d/13y3enP_GhrAu1ai-1ob3Uvh-ELKX2pQMkCDNGrCD3YE/edit#gid=1733559149") %>% 
#   df_legalize_names()
emil_ratings = read_csv("data/ratings.csv") %>% 
  df_legalize_names() %>% 
  mutate(
    year = year(Release_Date)
  )

## Rows: 847 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): Const, Title, URL, Title Type, Genres, Directors
## dbl  (5): Your Rating, IMDb Rating, Runtime (mins), Year, Num Votes
## date (2): Date Rated, Release Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#director collapse
emil_ratings$Directors %>% table2()

emil_ratings$Directors5 = emil_ratings$Directors %>% fct_lump(prop = 5/nrow(emil_ratings))

IMDB data

https://datasets.imdbws.com/

titles = read_tsv("data/title.basics.tsv.gz")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 10594587 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): tconst, titleType, primaryTitle, originalTitle, startYear, endYear,...
## dbl (1): isAdult
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ratings = read_tsv("data/title.ratings.tsv.gz")

## Rows: 1408974 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): tconst
## dbl (2): averageRating, numVotes
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

d = inner_join(
  titles,
  ratings,
  by = "tconst"
)

#subset
d_feature = d %>% 
  filter(
    titleType == "movie",
    numVotes >= 1000,
    isAdult == 0,
    !str_detect(genres, "Adult")
  )

#add quantiles
ratings_ecdf = ecdf(d_feature$averageRating)

d_feature %<>% 
    rename(
    year = startYear,
    runtime = runtimeMinutes
  ) %>% 
  mutate(
    averageRating_quantile = ratings_ecdf(d_feature$averageRating),
    rank = rank(-d_feature$averageRating),
    year = as.numeric(year),
    runtime = as.numeric(runtime),
    year2024 = year == 2024,
    year2023 = year == 2023,
    year2022 = year == 2022,
    year2021 = year == 2021,
    year2020 = year == 2020,
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `runtime = as.numeric(runtime)`.
## Caused by warning:
## ! NAs introduced by coercion

#split factor to dummies
genre_dummies = split_into_dummies(d_feature$genres) %>% 
  as.data.frame() %>% 
  df_legalize_names()

d_feature = bind_cols(
  d_feature,
  genre_dummies
)

Analysis

IMDB

#the distribution
describe2(d_feature$averageRating)

quantile(d_feature$averageRating, probs = c(.01, seq(.05, .95, by =.05), .96, .97, .98, .99))

##  1%  5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70% 75% 80% 85% 90% 95% 
## 2.8 4.0 4.7 5.1 5.4 5.6 5.8 6.0 6.2 6.3 6.4 6.6 6.7 6.8 6.9 7.1 7.2 7.4 7.6 7.9 
## 96% 97% 98% 99% 
## 8.0 8.1 8.2 8.4

GG_denhist(d_feature$averageRating) +
  scale_x_continuous("Average rating") +
  ggtitle("IMDB movies",
          subtitle = str_glue("Feature films, non-pornography, 1000+ votes, n = {nrow(d_feature)}"))

## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/IMDB ratings dist.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

genre_dummies %>% 
  as.data.frame() %>% 
  pivot_longer(cols = everything()) %>% 
  filter(
    value == T
  ) %>% 
  pull(name) %>% 
  table2(include_NA = F) %>% 
  mutate(
    genre = fct_reorder(Group, Count)
  ) %>% 
  ggplot(aes(genre, Count)) + 
  geom_bar(stat = "identity") +
  coord_flip()

GG_save("figs/genre counts.png")

#movies correlates of average rating
movie_cors = d_feature %>% 
  select(averageRating,
         year,
         runtime,
         Adventure:News
         ) %>% 
  map_df(as.numeric) %>% 
  psych::mixedCor()

## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was
## done

movie_cors$rho %>% 
  GG_heatmap(font_size = 2)

movie_cors_df = tibble(
  name = names(movie_cors$rho[-1, 1]),
  value = movie_cors$rho[-1, 1]
)

movie_cors_df %>% 
  mutate(
    name = fct_reorder(name, value)
  ) %>% 
  ggplot(aes(name, value)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_y_continuous("Correlation with average rating") +
  scale_x_discrete("Predictor") +
  ggtitle("Predictors of average IMDB movie rating",
          "Individual effects (singular regression)")

GG_save("figs/single effects.png") 

#ols
d_feature_z = d_feature %>% df_standardize()

## Skipped tconst because it is a character (string)
## Skipped titleType because it is a character (string)
## Skipped primaryTitle because it is a character (string)
## Skipped originalTitle because it is a character (string)
## Skipped isAdult because it is ranged from 0 to 1 (a proportion, maybe)
## Skipped endYear because it is a character (string)
## Skipped genres because it is a character (string)
## Skipped averageRating_quantile because it is ranged from 0 to 1 (a proportion, maybe)
## Skipped year2024 because it is a logical (boolean)
## Skipped year2023 because it is a logical (boolean)
## Skipped year2022 because it is a logical (boolean)
## Skipped year2021 because it is a logical (boolean)
## Skipped year2020 because it is a logical (boolean)
## Skipped Adventure because it is class factor
## Skipped Drama because it is class factor
## Skipped Fantasy because it is class factor
## Skipped Crime because it is class factor
## Skipped Mystery because it is class factor
## Skipped Horror because it is class factor
## Skipped History because it is class factor
## Skipped Action because it is class factor
## Skipped Romance because it is class factor
## Skipped Comedy because it is class factor
## Skipped War because it is class factor
## Skipped Biography because it is class factor
## Skipped Sci_Fi because it is class factor
## Skipped Western because it is class factor
## Skipped Family because it is class factor
## Skipped Thriller because it is class factor
## Skipped Sport because it is class factor
## Skipped Documentary because it is class factor
## Skipped Animation because it is class factor
## Skipped Musical because it is class factor
## Skipped Music because it is class factor
## Skipped Film_Noir because it is class factor
## Skipped News because it is class factor

ols_fit = lm(as.formula(build_formula("averageRating", preds = movie_cors_df$name)), data = d_feature_z)
ols_fit %>% summary()

## 
## Call:
## lm(formula = as.formula(build_formula("averageRating", preds = movie_cors_df$name)), 
##     data = d_feature_z)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.626 -0.422  0.092  0.533  3.556 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.10680    0.01487   -7.18  6.9e-13 ***
## year            -0.23622    0.00442  -53.44  < 2e-16 ***
## runtime          0.25413    0.00439   57.94  < 2e-16 ***
## AdventureTRUE   -0.10573    0.01489   -7.10  1.3e-12 ***
## DramaTRUE        0.35878    0.01067   33.63  < 2e-16 ***
## FantasyTRUE     -0.06366    0.01844   -3.45  0.00056 ***
## CrimeTRUE        0.04536    0.01200    3.78  0.00016 ***
## MysteryTRUE      0.04745    0.01544    3.07  0.00212 ** 
## HorrorTRUE      -0.60745    0.01437  -42.28  < 2e-16 ***
## HistoryTRUE      0.01431    0.02226    0.64  0.52040    
## ActionTRUE      -0.26175    0.01206  -21.70  < 2e-16 ***
## RomanceTRUE     -0.05598    0.01191   -4.70  2.6e-06 ***
## ComedyTRUE      -0.01604    0.01085   -1.48  0.13911    
## WarTRUE          0.01762    0.02620    0.67  0.50116    
## BiographyTRUE    0.15142    0.01927    7.86  4.0e-15 ***
## Sci_FiTRUE      -0.28631    0.01933  -14.81  < 2e-16 ***
## WesternTRUE     -0.15248    0.03466   -4.40  1.1e-05 ***
## FamilyTRUE      -0.09810    0.02095   -4.68  2.9e-06 ***
## ThrillerTRUE    -0.06954    0.01299   -5.35  8.6e-08 ***
## SportTRUE       -0.02625    0.03001   -0.87  0.38168    
## DocumentaryTRUE  1.06715    0.02225   47.96  < 2e-16 ***
## AnimationTRUE    0.79176    0.02482   31.90  < 2e-16 ***
## MusicalTRUE     -0.14850    0.03925   -3.78  0.00015 ***
## MusicTRUE        0.02505    0.02444    1.02  0.30538    
## Film_NoirTRUE   -0.00826    0.04307   -0.19  0.84795    
## NewsTRUE         0.36200    0.15266    2.37  0.01773 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.83 on 42136 degrees of freedom
##   (73 observations deleted due to missingness)
## Multiple R-squared:  0.311,  Adjusted R-squared:  0.311 
## F-statistic:  760 on 25 and 42136 DF,  p-value: <2e-16

ols_fit %>% 
  summ() %>% 
  {
    # browser()
    # rownames(.$coeftable) = rownames(.$coeftable) %>% str_replace("TRUE", "")
    names(.$model$coefficients) = names(.$model$coefficients) %>% str_replace("TRUE", "")
    # .$model$coefficients = .$model$coefficients %>% sort(decreasing = T)
    # .$coeftable = .$coeftable %>% as.data.frame() %>% arrange(-Est.) %>% as.matrix()
    .
  } %>% 
  plot_coefs() +
  ggtitle("Multiple regression predictors of average IMDB rating")

## Loading required namespace: broom.mixed

GG_save("figs/ols linear.png")

#effect of year of release
full_model = as.formula(str_glue("averageRating ~ rcs(year) + rcs(runtime) + {str_c(names(genre_dummies), collapse = ' + ')}"))
year_dummies_model = as.formula(str_glue("averageRating ~ year2020 + year2021 + year2022 + year2023 + year2024 + year + rcs(runtime) + {str_c(names(genre_dummies), collapse = ' + ')}"))

ols(full_model, data = d_feature) %>% 
  ggpredict(terms = "year") %>% 
  plot() +
  theme_bw()

GG_save("figs/full ols year effect.png")

ols(full_model, data = d_feature) %>% 
  ggpredict(terms = "runtime") %>% 
  plot()

#year dummies?
(dummies_model_fit = ols(year_dummies_model, data = d_feature))

## Frequencies of Missing Values Due to Each Variable
## averageRating      year2020      year2021      year2022      year2023 
##             0             0             0             0             0 
##      year2024          year       runtime     Adventure         Drama 
##             0             0            73             0             0 
##       Fantasy         Crime       Mystery        Horror       History 
##             0             0             0             0             0 
##        Action       Romance        Comedy           War     Biography 
##             0             0             0             0             0 
##        Sci_Fi       Western        Family      Thriller         Sport 
##             0             0             0             0             0 
##   Documentary     Animation       Musical         Music     Film_Noir 
##             0             0             0             0             0 
##          News 
##             0 
## 
## Linear Regression Model
## 
## ols(formula = year_dummies_model, data = d_feature)
## 
## 
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
## Obs   42162    LR chi2    16949.75    R2       0.331    
## sigma0.9617    d.f.             33    R2 adj   0.331    
## d.f.  42128    Pr(> chi2)   0.0000    g        0.764    
## 
## Residuals
## 
##     Min      1Q  Median      3Q     Max 
## -5.9107 -0.4985  0.1013  0.6255  3.9826 
## 
## 
##                  Coef    S.E.   t      Pr(>|t|)
## Intercept        31.9184 0.4856  65.73 <0.0001 
## year2020         -0.0913 0.0286  -3.19 0.0014  
## year2021          0.0003 0.0275   0.01 0.9918  
## year2022          0.0317 0.0263   1.21 0.2275  
## year2023          0.2374 0.0295   8.04 <0.0001 
## year2024          0.6759 0.0986   6.85 <0.0001 
## year             -0.0134 0.0002 -54.01 <0.0001 
## runtime           0.0063 0.0013   4.87 <0.0001 
## runtime'          0.1547 0.0175   8.82 <0.0001 
## runtime''        -0.4391 0.0741  -5.92 <0.0001 
## runtime'''        0.2312 0.0801   2.89 0.0039  
## Adventure=TRUE   -0.1411 0.0173  -8.17 <0.0001 
## Drama=TRUE        0.3955 0.0124  31.84 <0.0001 
## Fantasy=TRUE     -0.0858 0.0214  -4.01 <0.0001 
## Crime=TRUE        0.0446 0.0139   3.20 0.0014  
## Mystery=TRUE      0.0424 0.0179   2.37 0.0179  
## Horror=TRUE      -0.6474 0.0168 -38.55 <0.0001 
## History=TRUE      0.0121 0.0258   0.47 0.6398  
## Action=TRUE      -0.3148 0.0140 -22.49 <0.0001 
## Romance=TRUE     -0.0811 0.0138  -5.87 <0.0001 
## Comedy=TRUE      -0.0037 0.0126  -0.30 0.7671  
## War=TRUE          0.0056 0.0304   0.18 0.8549  
## Biography=TRUE    0.1294 0.0224   5.78 <0.0001 
## Sci_Fi=TRUE      -0.3314 0.0224 -14.78 <0.0001 
## Western=TRUE     -0.1947 0.0402  -4.84 <0.0001 
## Family=TRUE      -0.1013 0.0243  -4.17 <0.0001 
## Thriller=TRUE    -0.0879 0.0151  -5.83 <0.0001 
## Sport=TRUE       -0.0541 0.0348  -1.55 0.1202  
## Documentary=TRUE  1.3354 0.0260  51.35 <0.0001 
## Animation=TRUE    0.9861 0.0291  33.94 <0.0001 
## Musical=TRUE     -0.1654 0.0455  -3.63 0.0003  
## Music=TRUE        0.0074 0.0283   0.26 0.7934  
## Film_Noir=TRUE    0.0265 0.0500   0.53 0.5961  
## News=TRUE         0.4164 0.1770   2.35 0.0186

#adjusted ratings
d_feature %<>% mutate(
  averageRating_adj = averageRating - case_when(
    d_feature$year == 2024 ~ dummies_model_fit$coefficients["year2024"],
    d_feature$year == 2023 ~ dummies_model_fit$coefficients["year2023"],
    .default = 0
  )
)

#ecdf without recency bias
ratings_ecdf_nobias = d_feature %>% filter(year <= 2022) %>% pull(averageRating) %>% ecdf()

#add adjusted centiles
d_feature %<>% mutate(
  averageRating_adj_quantile = ratings_ecdf_nobias(averageRating_adj)
)

#Dune 2
d_feature %>% 
  filter(str_detect(primaryTitle, "Dune")) %>% 
  select(
    primaryTitle, year, runtime, averageRating, averageRating_adj, averageRating_quantile, averageRating_adj_quantile
  ) %>% 
  arrange(year)

Emil data

Plots

#over time
ggplot(emil_ratings, aes(year, Your_Rating)) +
  geom_point() +
  geom_smooth() +
  scale_y_continuous("Emil's rating", limits = c(1, 10), breaks = 1:10) +
  scale_x_continuous("Year of release")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 1 rows containing missing values (`geom_point()`).

GG_save("figs/emil_release_rating.png")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Removed 1 rows containing missing values (`geom_point()`).

#Emil vs. mean
GG_scatter(emil_ratings, "IMDb_Rating", "Your_Rating", case_names = "Title") +
  scale_y_continuous("Emil's rating", limits = c(1, 10), breaks = 1:10) +
  scale_x_continuous("Average IMDB rating", breaks = 1:10, limits = c(1, 10)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dotted")

## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/emil_IMDB.png")

## `geom_smooth()` using formula = 'y ~ x'

Models

#merge emil Rating with main set
d_feature = left_join(
  d_feature,
  emil_ratings %>% select(Const, Your_Rating),
  by = c("tconst" = "Const")
)

#predicted rating from linear model, minus bias
d_feature$predicted_rating = predict(dummies_model_fit) - case_when(
    d_feature$year == 2024 ~ dummies_model_fit$coefficients["year2024"],
    d_feature$year == 2023 ~ dummies_model_fit$coefficients["year2023"],
    .default = 0
  )

#predict Rating for Emil
emil_model = ols(Your_Rating ~ rcs(year, 3) + rcs(runtime, 3) + Adventure + Drama + 
    Fantasy + Crime + Mystery + Horror + History + Action + Romance + 
    Comedy + War + Biography + Sci_Fi + Western + Family + Thriller + 
    Sport + Documentary + Animation + Musical + Music + Film_Noir + averageRating, data = d_feature)
emil_model

## Frequencies of Missing Values Due to Each Variable
##   Your_Rating          year       runtime     Adventure         Drama 
##         41406             0            73             0             0 
##       Fantasy         Crime       Mystery        Horror       History 
##             0             0             0             0             0 
##        Action       Romance        Comedy           War     Biography 
##             0             0             0             0             0 
##        Sci_Fi       Western        Family      Thriller         Sport 
##             0             0             0             0             0 
##   Documentary     Animation       Musical         Music     Film_Noir 
##             0             0             0             0             0 
## averageRating 
##             0 
## 
## Linear Regression Model
## 
## ols(formula = Your_Rating ~ rcs(year, 3) + rcs(runtime, 3) + 
##     Adventure + Drama + Fantasy + Crime + Mystery + Horror + 
##     History + Action + Romance + Comedy + War + Biography + Sci_Fi + 
##     Western + Family + Thriller + Sport + Documentary + Animation + 
##     Musical + Music + Film_Noir + averageRating, data = d_feature)
## 
## 
##                 Model Likelihood    Discrimination    
##                       Ratio Test           Indexes    
## Obs     829    LR chi2    475.73    R2       0.437    
## sigma1.0080    d.f.           27    R2 adj   0.418    
## d.f.    801    Pr(> chi2) 0.0000    g        0.886    
## 
## Residuals
## 
##      Min       1Q   Median       3Q      Max 
## -3.73378 -0.67283 -0.09337  0.56933  4.38535 
## 
## 
##                  Coef     S.E.   t     Pr(>|t|)
## Intercept        -27.6864 8.2522 -3.36 0.0008  
## year               0.0153 0.0041  3.69 0.0002  
## year'             -0.0230 0.0048 -4.83 <0.0001 
## runtime           -0.0105 0.0073 -1.44 0.1504  
## runtime'           0.0089 0.0086  1.04 0.3004  
## Adventure=TRUE     0.0528 0.1101  0.48 0.6316  
## Drama=TRUE         0.3471 0.1083  3.21 0.0014  
## Fantasy=TRUE      -0.0976 0.1465 -0.67 0.5055  
## Crime=TRUE         0.0567 0.1044  0.54 0.5870  
## Mystery=TRUE       0.3222 0.1217  2.65 0.0083  
## Horror=TRUE       -0.3742 0.1930 -1.94 0.0529  
## History=TRUE      -0.3872 0.2276 -1.70 0.0893  
## Action=TRUE       -0.1783 0.1022 -1.74 0.0814  
## Romance=TRUE       0.4456 0.1228  3.63 0.0003  
## Comedy=TRUE        0.0580 0.1188  0.49 0.6257  
## War=TRUE          -0.1254 0.1949 -0.64 0.5202  
## Biography=TRUE    -0.0920 0.1538 -0.60 0.5500  
## Sci_Fi=TRUE       -0.0034 0.1265 -0.03 0.9789  
## Western=TRUE       0.3367 0.2812  1.20 0.2315  
## Family=TRUE       -0.1024 0.1966 -0.52 0.6028  
## Thriller=TRUE      0.0216 0.1158  0.19 0.8519  
## Sport=TRUE         0.0408 0.3740  0.11 0.9131  
## Documentary=TRUE   1.4237 0.5352  2.66 0.0080  
## Animation=TRUE    -0.1932 0.1735 -1.11 0.2656  
## Musical=TRUE       0.2116 0.5987  0.35 0.7239  
## Music=TRUE        -0.1727 0.3026 -0.57 0.5683  
## Film_Noir=TRUE    -0.1303 0.3733 -0.35 0.7271  
## averageRating      0.7632 0.0448 17.04 <0.0001

d_feature$predicted_rating_Emil = predict(emil_model, newdata = as.data.frame(d_feature))

#highest predicted movies based on my tastes that I haven't seen
d_feature %>% 
  filter(is.na(Your_Rating), Documentary == F) %>% 
  arrange(-predicted_rating_Emil) %>% 
  select(
    primaryTitle, year, runtime, averageRating, averageRating_adj, predicted_rating_Emil
  ) %>% 
  print(n=20)

## # A tibble: 39,263 × 6
##    primaryTitle                     year runtime averageRating averageRating_adj
##    <chr>                           <dbl>   <dbl>         <dbl>             <dbl>
##  1 The Chaos Class                  1975      87           9.2               9.2
##  2 The Elusive Summer of '68        1984      91           8.5               8.5
##  3 Yesterday                        1988      84           8.9               8.9
##  4 The Girl with the Red Scarf      1977      90           8.5               8.5
##  5 Godfather                        1991     150           8.6               8.6
##  6 Poove Unakkaga                   1996     150           8.6               8.6
##  7 Thoovanathumbikal                1987     151           8.6               8.6
##  8 Pesvebi                          1987      87           8.8               8.8
##  9 Who's Singin' Over There?        1980      86           8.7               8.7
## 10 Moondram Pirai                   1982     143           8.6               8.6
## 11 Love Today                       1997     151           8.6               8.6
## 12 The Marathon Family              1982      92           8.8               8.8
## 13 Srabon Megher Din                1999     150           8.6               8.6
## 14 Balkan Spy                       1984      95           8.8               8.8
## 15 Sandesham                        1991     138           9                 9  
## 16 Kibar Feyzo                      1978      83           8.7               8.7
## 17 Sankarabharanam                  1980     143           8.8               8.8
## 18 Thevar Magan                     1992     145           8.7               8.7
## 19 The King of the Street Cleaners  1977      80           8.5               8.5
## 20 The Phantom of the Opera at th…  2011     137           8.8               8.8
## # ℹ 39,243 more rows
## # ℹ 1 more variable: predicted_rating_Emil <dbl>

GG_scatter(d_feature, "predicted_rating_Emil", "Your_Rating", case_names = "primaryTitle")

## `geom_smooth()` using formula = 'y ~ x'

IMDB ratings

Init

Functions

Load data

Emil data

IMDB data

Analysis

IMDB

Emil data

Plots

Models

Meta