##Objective Creating a normalized SQLite database for movie ratings, loading the data into R, handling missing data, and standardizing ratings.
##Populate
library(RSQLite)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
con <- dbConnect(RSQLite::SQLite(), "movie_ratings.db")
dbExecute(con, "
CREATE TABLE IF NOT EXISTS Movies (
movie_id INTEGER PRIMARY KEY,
title TEXT NOT NULL,
release_year INTEGER
)")
## [1] 0
dbExecute(con, "
CREATE TABLE IF NOT EXISTS Friends (
friend_id TEXT PRIMARY KEY,
friend_name TEXT NOT NULL
)")
## [1] 0
dbExecute(con, "
CREATE TABLE IF NOT EXISTS Ratings (
rating_id INTEGER PRIMARY KEY AUTOINCREMENT,
friend_id TEXT,
movie_id INTEGER,
rating INTEGER,
FOREIGN KEY (friend_id) REFERENCES Friends(friend_id),
FOREIGN KEY (movie_id) REFERENCES Movies(movie_id)
)")
## [1] 0
movies_data <- data.frame(
movie_id = 1:6,
title = c("Dune", "Spider-Man: No Way Home", "The Batman",
"Top Gun: Maverick", "Avatar: The Way of Water", "Oppenheimer"),
release_year = c(2021, 2021, 2022, 2022, 2022, 2023)
)
dbWriteTable(con, "Movies", movies_data, append = TRUE, row.names = FALSE)
friends_data <- data.frame(
friend_id = c("F1", "F2", "F3", "F4", "F5"),
friend_name = c("Friend A", "Friend B", "Friend C", "Friend D", "Friend E")
)
dbWriteTable(con, "Friends", friends_data, append = TRUE, row.names = FALSE)
ratings_data <- data.frame(
friend_id = c("F1", "F1", "F1", "F1", "F1", "F2", "F2", "F2", "F2", "F2", "F3", "F3", "F3", "F3", "F3", "F4", "F4", "F4", "F4", "F5", "F5", "F5", "F5", "F5"),
movie_id = c(1, 2, 3, 4, 6, 1, 2, 3, 5, 6, 1, 3, 4, 5, 6, 2, 3, 4, 5, 1, 2, 4, 5, 6),
rating = c(4, 5, 3, 4, 5, 5, 4, 4, 5, 4, 3, 5, 4, 4, 5, 5, 4, 5, 5, 4, 4, 3, 4, 5)
)
dbWriteTable(con, "Ratings", ratings_data, append = TRUE, row.names = FALSE)
##Load
ratings_df <- dbGetQuery(con, "SELECT * FROM Ratings")
movies_df <- dbGetQuery(con, "SELECT * FROM Movies")
head(ratings_df)
## rating_id friend_id movie_id rating
## 1 1 F1 1 4
## 2 2 F1 2 5
## 3 3 F1 3 3
## 4 4 F1 4 4
## 5 5 F1 6 5
## 6 6 F2 1 5
head(movies_df)
## movie_id title release_year
## 1 1 Dune 2021
## 2 2 Spider-Man: No Way Home 2021
## 3 3 The Batman 2022
## 4 4 Top Gun: Maverick 2022
## 5 5 Avatar: The Way of Water 2022
## 6 6 Oppenheimer 2023
##Matrix
ratings_wide <- dbGetQuery(con, "
SELECT f.friend_id, m.title,
CASE WHEN r.rating IS NULL THEN 'NA' ELSE CAST(r.rating AS TEXT) END as rating
FROM Friends f
CROSS JOIN Movies m
LEFT JOIN Ratings r ON f.friend_id = r.friend_id AND m.movie_id = r.movie_id
ORDER BY f.friend_id, m.movie_id
")
ratings_matrix <- ratings_wide %>%
pivot_wider(names_from = title, values_from = rating) %>%
as.data.frame()
ratings_matrix
## friend_id Dune Spider-Man: No Way Home The Batman Top Gun: Maverick
## 1 F1 4 5 3 4
## 2 F2 5 4 4 NA
## 3 F3 3 NA 5 4
## 4 F4 NA 5 4 5
## 5 F5 4 4 NA 3
## Avatar: The Way of Water Oppenheimer
## 1 NA 5
## 2 5 4
## 3 4 5
## 4 5 NA
## 5 4 5
##Standardize
ratings_standardized <- dbGetQuery(con, "
WITH MovieStats AS (
SELECT movie_id,
AVG(CAST(rating AS FLOAT)) as mean_rating,
SQRT(AVG(CAST(rating AS FLOAT) * CAST(rating AS FLOAT)) -
AVG(CAST(rating AS FLOAT)) * AVG(CAST(rating AS FLOAT))) as std_dev
FROM Ratings
GROUP BY movie_id
)
SELECT r.friend_id,
m.title,
r.rating as original_rating,
ROUND(CAST((r.rating - ms.mean_rating) / CASE WHEN ms.std_dev = 0 THEN 1 ELSE ms.std_dev END AS FLOAT), 2) as z_score
FROM Ratings r
JOIN Movies m ON r.movie_id = m.movie_id
JOIN MovieStats ms ON r.movie_id = ms.movie_id
ORDER BY r.friend_id, m.movie_id
")
ratings_standardized
## friend_id title original_rating z_score
## 1 F1 Dune 4 0.00
## 2 F1 Spider-Man: No Way Home 5 1.00
## 3 F1 The Batman 3 -1.41
## 4 F1 Top Gun: Maverick 4 0.00
## 5 F1 Oppenheimer 5 0.58
## 6 F2 Dune 5 1.41
## 7 F2 Spider-Man: No Way Home 4 -1.00
## 8 F2 The Batman 4 0.00
## 9 F2 Avatar: The Way of Water 5 1.00
## 10 F2 Oppenheimer 4 -1.73
## 11 F3 Dune 3 -1.41
## 12 F3 The Batman 5 1.41
## 13 F3 Top Gun: Maverick 4 0.00
## 14 F3 Avatar: The Way of Water 4 -1.00
## 15 F3 Oppenheimer 5 0.58
## 16 F4 Spider-Man: No Way Home 5 1.00
## 17 F4 The Batman 4 0.00
## 18 F4 Top Gun: Maverick 5 1.41
## 19 F4 Avatar: The Way of Water 5 1.00
## 20 F5 Dune 4 0.00
## 21 F5 Spider-Man: No Way Home 4 -1.00
## 22 F5 Top Gun: Maverick 3 -1.41
## 23 F5 Avatar: The Way of Water 4 -1.00
## 24 F5 Oppenheimer 5 0.58
##Summary
rating_summary <- dbGetQuery(con, "
SELECT m.title,
COUNT(r.rating) as num_ratings,
ROUND(AVG(CAST(r.rating AS FLOAT)), 2) as avg_rating,
MIN(r.rating) as min_rating,
MAX(r.rating) as max_rating
FROM Movies m
LEFT JOIN Ratings r ON m.movie_id = r.movie_id
GROUP BY m.title
ORDER BY m.movie_id
")
rating_summary
## title num_ratings avg_rating min_rating max_rating
## 1 Dune 4 4.00 3 5
## 2 Spider-Man: No Way Home 4 4.50 4 5
## 3 The Batman 4 4.00 3 5
## 4 Top Gun: Maverick 4 4.00 3 5
## 5 Avatar: The Way of Water 4 4.50 4 5
## 6 Oppenheimer 4 4.75 4 5
##Conclusion