SQL + R: Movie Ratings (6 films × ≥5 raters)

Author

Sachi Kapoor

Setup

Code

library(DBI)
library(RSQLite)
library(tidyverse)
library(janitor)
library(glue)
db_path <- "movies.db"
if (file.exists(db_path)) file.remove(db_path)

[1] TRUE

Code

con <- dbConnect(RSQLite::SQLite(), db_path)
DBI::dbExecute(con, "PRAGMA foreign_keys = ON;")

[1] 0

Define data

Code

movies <- c("Dune: Part Two","Barbie","Oppenheimer","Inside Out 2","Top Gun: Maverick","Spider-Verse")
raters <- c("Sachi","Ava","Noah","Liam","Mia")
ratings_long <- tribble(
  ~rater, ~movie,               ~rating,
  "Sachi","Dune: Part Two",        5,
  "Sachi","Barbie",                4,
  "Sachi","Oppenheimer",           4,
"Sachi","Inside Out 2",          5,
"Sachi","Top Gun: Maverick",     4,
"Ava",  "Dune: Part Two",        4,
"Ava",  "Barbie",                5,
"Ava",  "Oppenheimer",           4,
"Ava",  "Inside Out 2",          4,
"Noah", "Barbie",                3,
"Noah", "Oppenheimer",           5,
"Noah", "Inside Out 2",          4,
"Liam", "Dune: Part Two",        4,
"Liam", "Spider-Verse",          5,
"Liam", "Top Gun: Maverick",     3,
"Mia",  "Spider-Verse",          4,
"Mia",  "Top Gun: Maverick",     4,
"Mia",  "Inside Out 2",          5
) |> mutate(rater=as.character(rater), movie=as.character(movie), rating=as.integer(rating))
stopifnot(length(unique(movies)) >= 6, length(unique(raters)) >= 5)
stopifnot(all(ratings_long$rating %in% 1:5))

SQL schema + load

Code

dbExecute(con, "CREATE TABLE movies (movie_id INTEGER PRIMARY KEY, title TEXT NOT NULL UNIQUE);")

[1] 0

Code

dbExecute(con, "CREATE TABLE raters (rater_id INTEGER PRIMARY KEY, name TEXT NOT NULL UNIQUE);")

[1] 0

Code

dbExecute(con, "CREATE TABLE ratings (
rater_id INTEGER NOT NULL,
movie_id INTEGER NOT NULL,
rating   INTEGER CHECK(rating BETWEEN 1 AND 5),
rated_at TEXT DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (rater_id, movie_id),
FOREIGN KEY (rater_id) REFERENCES raters(rater_id),
FOREIGN KEY (movie_id) REFERENCES movies(movie_id));")

[1] 0

Code

dbWriteTable(con, "movies", tibble(title=movies), append=TRUE)
dbWriteTable(con, "raters", tibble(name =raters), append=TRUE)
dim_movies <- dbReadTable(con, "movies")
dim_raters <- dbReadTable(con, "raters")
ratings_sql <- ratings_long |>
  left_join(dim_movies, by = join_by(movie == title)) |>
  left_join(dim_raters, by = join_by(rater == name)) |>
  transmute(rater_id, movie_id, rating)
dbWriteTable(con, "ratings", ratings_sql, append=TRUE)
dbListTables(con)

[1] "movies"  "raters"  "ratings"

Read from SQL (full grid)

Code

ratings_full <- 
DBI::dbGetQuery(con, "
SELECT r.name  AS rater,
       m.title AS movie,
       rt.rating
FROM raters AS r
CROSS JOIN movies AS m
LEFT JOIN ratings AS rt
  ON rt.rater_id = r.rater_id
 AND rt.movie_id = m.movie_id
ORDER BY r.name, m.title;
 ") |> tibble::as_tibble()
glimpse(ratings_full)

Rows: 30
Columns: 3
$ rater  <chr> "Ava", "Ava", "Ava", "Ava", "Ava", "Ava", "Liam", "Liam", "Liam…
$ movie  <chr> "Barbie", "Dune: Part Two", "Inside Out 2", "Oppenheimer", "Spi…
$ rating <int> 5, 4, 4, 4, NA, NA, NA, 4, NA, NA, 5, 3, NA, NA, 5, NA, 4, 4, 3…

Summaries

Code

movie_summary <- ratings_full |>
  group_by(movie) |>
  summarize(n_ratings=sum(!is.na(rating)),
            mean_rating=mean(rating, na.rm=TRUE),
            sd_rating=sd(rating, na.rm=TRUE), .groups="drop") |>
  arrange(desc(mean_rating))
movie_summary

# A tibble: 6 × 4
  movie             n_ratings mean_rating sd_rating
  <chr>                 <int>       <dbl>     <dbl>
1 Inside Out 2              4        4.5      0.577
2 Spider-Verse              2        4.5      0.707
3 Dune: Part Two            3        4.33     0.577
4 Oppenheimer               3        4.33     0.577
5 Barbie                    3        4        1    
6 Top Gun: Maverick         3        3.67     0.577

Code

movie_means <- ratings_full |>
  summarize(movie_mean=mean(rating, na.rm=TRUE), .by=movie)
ratings_imputed <- ratings_full |>
  left_join(movie_means, by="movie") |>
  mutate(rating_imp = coalesce(rating, movie_mean))
movie_summary_imp <- ratings_imputed |>
summarize(n_observed=sum(!is.na(rating)),
mean_rating=mean(rating_imp), .by=movie) |>
  arrange(desc(mean_rating))
movie_summary_imp

# A tibble: 6 × 3
  movie             n_observed mean_rating
  <chr>                  <int>       <dbl>
1 Inside Out 2               4        4.5 
2 Spider-Verse               2        4.5 
3 Dune: Part Two             3        4.33
4 Oppenheimer                3        4.33
5 Barbie                     3        4   
6 Top Gun: Maverick          3        3.67

Code

## Discussion: engaging with the data

Code

top_avail <- movie_summary |> slice_max(mean_rating, n = 2)
top_imp   <- movie_summary_imp |> slice_max(mean_rating, n = 2)
dplyr::bind_rows(Available = top_avail, Imputed = top_imp, .id = "version")

# A tibble: 4 × 6
  version   movie        n_ratings mean_rating sd_rating n_observed
  <chr>     <chr>            <int>       <dbl>     <dbl>      <int>
1 Available Inside Out 2         4         4.5     0.577         NA
2 Available Spider-Verse         2         4.5     0.707         NA
3 Imputed   Inside Out 2        NA         4.5    NA              4
4 Imputed   Spider-Verse        NA         4.5    NA              2

msg <- glue::glue( “Takeaways. By available-case averaging, the top films are”, “{top_avail\(movie[1]} (mean={round(top_avail\)mean_rating[1],2)}, n={top_avail\(n_ratings[1]}) ", "and {top_avail\)movie[2]} (mean={round(top_avail\(mean_rating[2],2)}, n={top_avail\)n_ratings[2]}).”, “After simple imputation by movie mean, the leaders are”, “{top_imp\(movie[1]} and {top_imp\)movie[2]}, with very similar ordering.”, “The heatmap shows most missingness clustered on a few rater–movie pairs,”, “so imputing with the movie mean didn’t materially shift rankings—our top choice looks robust.” ) cat(msg)


## Plots

::: {.cell}

```{.r .cell-code}
movie_summary |>
  ggplot(aes(x=reorder(movie, mean_rating), y=mean_rating)) +
  geom_col() +
  geom_text(aes(label=glue("n={n_ratings}")), vjust=-0.5, size=3) +
  coord_flip() +
labs(title="Average rating by movie (available-case)", x=NULL, y="Mean (1–5)") +
  ylim(0,5) + theme_minimal()

:::

Code

ratings_full |>
  mutate(rated=!is.na(rating)) |>
  ggplot(aes(x=movie, y=rater, fill=rated)) +
  geom_tile() +
  scale_fill_manual(values=c("grey85","steelblue"), name="Has rating") +
  labs(title="Who saw which movies") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle=35, hjust=1))

Save & close

Code

if (exists("con") && DBI::dbIsValid(con)) DBI::dbDisconnect(con)
write.csv(movie_summary, "movie_summary.csv", row.names = FALSE)
DBI::dbDisconnect(con)
list.files()

[1] "assignment_sql_r_files"     "assignment_sql_r.html"     
[3] "assignment_sql_r.qmd"       "assignment_sql_r.rmarkdown"
[5] "movie_summary.csv"          "movies.db"                 
[7] "project.Rproj"              "rsconnect"